aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/afs/addr_list.c101
-rw-r--r--fs/afs/cell.c17
-rw-r--r--fs/afs/dynroot.c2
-rw-r--r--fs/afs/internal.h12
-rw-r--r--fs/afs/main.c2
-rw-r--r--fs/afs/proc.c22
-rw-r--r--fs/afs/rxrpc.c2
-rw-r--r--fs/binfmt_elf.c6
-rw-r--r--fs/btrfs/backref.c39
-rw-r--r--fs/btrfs/btrfs_inode.h8
-rw-r--r--fs/btrfs/check-integrity.c6
-rw-r--r--fs/btrfs/compression.c2
-rw-r--r--fs/btrfs/ctree.c68
-rw-r--r--fs/btrfs/ctree.h68
-rw-r--r--fs/btrfs/delayed-inode.c41
-rw-r--r--fs/btrfs/delayed-inode.h4
-rw-r--r--fs/btrfs/delayed-ref.c69
-rw-r--r--fs/btrfs/delayed-ref.h10
-rw-r--r--fs/btrfs/dev-replace.c64
-rw-r--r--fs/btrfs/dev-replace.h8
-rw-r--r--fs/btrfs/dir-item.c8
-rw-r--r--fs/btrfs/disk-io.c25
-rw-r--r--fs/btrfs/export.c4
-rw-r--r--fs/btrfs/extent-tree.c441
-rw-r--r--fs/btrfs/extent_io.c33
-rw-r--r--fs/btrfs/extent_io.h4
-rw-r--r--fs/btrfs/extent_map.c32
-rw-r--r--fs/btrfs/extent_map.h4
-rw-r--r--fs/btrfs/file.c33
-rw-r--r--fs/btrfs/free-space-cache.c16
-rw-r--r--fs/btrfs/inode.c237
-rw-r--r--fs/btrfs/ioctl.c53
-rw-r--r--fs/btrfs/qgroup.c460
-rw-r--r--fs/btrfs/qgroup.h8
-rw-r--r--fs/btrfs/ref-verify.c8
-rw-r--r--fs/btrfs/relocation.c74
-rw-r--r--fs/btrfs/scrub.c34
-rw-r--r--fs/btrfs/send.c24
-rw-r--r--fs/btrfs/super.c6
-rw-r--r--fs/btrfs/tests/extent-io-tests.c10
-rw-r--r--fs/btrfs/tests/extent-map-tests.c4
-rw-r--r--fs/btrfs/transaction.c31
-rw-r--r--fs/btrfs/tree-checker.c14
-rw-r--r--fs/btrfs/tree-log.c134
-rw-r--r--fs/btrfs/tree-log.h12
-rw-r--r--fs/btrfs/volumes.c124
-rw-r--r--fs/btrfs/volumes.h9
-rw-r--r--fs/buffer.c10
-rw-r--r--fs/cachefiles/namei.c2
-rw-r--r--fs/ceph/super.c16
-rw-r--r--fs/cifs/Kconfig1
-rw-r--r--fs/cifs/cifs_unicode.c3
-rw-r--r--fs/cifs/cifsglob.h1
-rw-r--r--fs/cifs/cifssmb.c11
-rw-r--r--fs/cifs/connect.c15
-rw-r--r--fs/cifs/inode.c2
-rw-r--r--fs/cifs/misc.c8
-rw-r--r--fs/cifs/readdir.c11
-rw-r--r--fs/cifs/smb2misc.c14
-rw-r--r--fs/cifs/smb2ops.c37
-rw-r--r--fs/cifs/smb2pdu.c32
-rw-r--r--fs/cifs/transport.c21
-rw-r--r--fs/compat_ioctl.c169
-rw-r--r--fs/coredump.c2
-rw-r--r--fs/dax.c27
-rw-r--r--fs/ecryptfs/inode.c11
-rw-r--r--fs/ext2/inode.c2
-rw-r--r--fs/ext4/dir.c20
-rw-r--r--fs/ext4/ext4.h20
-rw-r--r--fs/ext4/inline.c4
-rw-r--r--fs/ext4/inode.c20
-rw-r--r--fs/ext4/mmp.c1
-rw-r--r--fs/ext4/namei.c6
-rw-r--r--fs/ext4/page-io.c2
-rw-r--r--fs/ext4/resize.c23
-rw-r--r--fs/ext4/super.c4
-rw-r--r--fs/fat/fatent.c1
-rw-r--r--fs/fcntl.c2
-rw-r--r--fs/fscache/cookie.c31
-rw-r--r--fs/fscache/internal.h1
-rw-r--r--fs/fscache/main.c4
-rw-r--r--fs/gfs2/bmap.c6
-rw-r--r--fs/gfs2/dir.c28
-rw-r--r--fs/gfs2/file.c18
-rw-r--r--fs/gfs2/glock.c17
-rw-r--r--fs/gfs2/incore.h9
-rw-r--r--fs/gfs2/lock_dlm.c10
-rw-r--r--fs/gfs2/log.c11
-rw-r--r--fs/gfs2/lops.c2
-rw-r--r--fs/gfs2/main.c2
-rw-r--r--fs/gfs2/ops_fstype.c5
-rw-r--r--fs/gfs2/quota.c2
-rw-r--r--fs/gfs2/rgrp.c201
-rw-r--r--fs/gfs2/rgrp.h11
-rw-r--r--fs/gfs2/super.c4
-rw-r--r--fs/gfs2/trans.c15
-rw-r--r--fs/gfs2/util.c16
-rw-r--r--fs/gfs2/util.h2
-rw-r--r--fs/gfs2/xattr.c18
-rw-r--r--fs/ioctl.c2
-rw-r--r--fs/iomap.c2
-rw-r--r--fs/jffs2/background.c2
-rw-r--r--fs/jffs2/super.c4
-rw-r--r--fs/jfs/acl.c4
-rw-r--r--fs/jfs/inode.c1
-rw-r--r--fs/jfs/super.c2
-rw-r--r--fs/namespace.c2
-rw-r--r--fs/nfs/nfs4proc.c31
-rw-r--r--fs/nfs/nfs4state.c2
-rw-r--r--fs/nfs/nfs4trace.h4
-rw-r--r--fs/nfs/pnfs.c26
-rw-r--r--fs/nfsd/vfs.c3
-rw-r--r--fs/notify/fsnotify.c13
-rw-r--r--fs/ocfs2/buffer_head_io.c1
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c4
-rw-r--r--fs/ocfs2/dlmglue.c2
-rw-r--r--fs/ocfs2/refcounttree.c16
-rw-r--r--fs/orangefs/acl.c4
-rw-r--r--fs/orangefs/inode.c8
-rw-r--r--fs/orangefs/namei.c8
-rw-r--r--fs/orangefs/orangefs-sysfs.c2
-rw-r--r--fs/overlayfs/copy_up.c2
-rw-r--r--fs/overlayfs/file.c25
-rw-r--r--fs/overlayfs/inode.c10
-rw-r--r--fs/overlayfs/namei.c2
-rw-r--r--fs/overlayfs/overlayfs.h4
-rw-r--r--fs/overlayfs/super.c26
-rw-r--r--fs/overlayfs/util.c3
-rw-r--r--fs/proc/base.c14
-rw-r--r--fs/proc/kcore.c1
-rw-r--r--fs/proc/vmcore.c34
-rw-r--r--fs/pstore/inode.c11
-rw-r--r--fs/pstore/internal.h5
-rw-r--r--fs/pstore/platform.c75
-rw-r--r--fs/pstore/ram.c47
-rw-r--r--fs/pstore/ram_core.c28
-rw-r--r--fs/read_write.c17
-rw-r--r--fs/signalfd.c6
-rw-r--r--fs/ubifs/super.c11
-rw-r--r--fs/ubifs/xattr.c24
-rw-r--r--fs/xattr.c24
-rw-r--r--fs/xfs/libxfs/xfs_attr.c264
-rw-r--r--fs/xfs/libxfs/xfs_attr.h (renamed from fs/xfs/xfs_attr.h)2
-rw-r--r--fs/xfs/libxfs/xfs_attr_remote.c10
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c94
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h1
-rw-r--r--fs/xfs/libxfs/xfs_format.h10
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c30
-rw-r--r--fs/xfs/libxfs/xfs_sb.c5
-rw-r--r--fs/xfs/scrub/alloc.c1
-rw-r--r--fs/xfs/scrub/inode.c4
-rw-r--r--fs/xfs/scrub/repair.c128
-rw-r--r--fs/xfs/scrub/scrub.c13
-rw-r--r--fs/xfs/xfs_aops.c4
-rw-r--r--fs/xfs/xfs_aops.h14
-rw-r--r--fs/xfs/xfs_bmap_util.c81
-rw-r--r--fs/xfs/xfs_buf.c109
-rw-r--r--fs/xfs/xfs_buf.h2
-rw-r--r--fs/xfs/xfs_buf_item.c119
-rw-r--r--fs/xfs/xfs_buf_item.h1
-rw-r--r--fs/xfs/xfs_fsops.c50
-rw-r--r--fs/xfs/xfs_inode.c10
-rw-r--r--fs/xfs/xfs_ioctl.c8
-rw-r--r--fs/xfs/xfs_iomap.c53
-rw-r--r--fs/xfs/xfs_iops.c12
-rw-r--r--fs/xfs/xfs_log_recover.c10
-rw-r--r--fs/xfs/xfs_reflink.c362
-rw-r--r--fs/xfs/xfs_reflink.h4
-rw-r--r--fs/xfs/xfs_stats.c52
-rw-r--r--fs/xfs/xfs_stats.h28
-rw-r--r--fs/xfs/xfs_super.c38
-rw-r--r--fs/xfs/xfs_trace.h1
-rw-r--r--fs/xfs/xfs_trans.c10
-rw-r--r--fs/xfs/xfs_trans.h1
-rw-r--r--fs/xfs/xfs_trans_ail.c28
-rw-r--r--fs/xfs/xfs_trans_buf.c141
176 files changed, 3357 insertions, 2159 deletions
diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c
index 025a9a5e1c32..55a756c60746 100644
--- a/fs/afs/addr_list.c
+++ b/fs/afs/addr_list.c
@@ -17,11 +17,6 @@
#include "internal.h"
#include "afs_fs.h"
-//#define AFS_MAX_ADDRESSES
-// ((unsigned int)((PAGE_SIZE - sizeof(struct afs_addr_list)) /
-// sizeof(struct sockaddr_rxrpc)))
-#define AFS_MAX_ADDRESSES ((unsigned int)(sizeof(unsigned long) * 8))
-
/*
* Release an address list.
*/
@@ -43,11 +38,15 @@ struct afs_addr_list *afs_alloc_addrlist(unsigned int nr,
_enter("%u,%u,%u", nr, service, port);
+ if (nr > AFS_MAX_ADDRESSES)
+ nr = AFS_MAX_ADDRESSES;
+
alist = kzalloc(struct_size(alist, addrs, nr), GFP_KERNEL);
if (!alist)
return NULL;
refcount_set(&alist->usage, 1);
+ alist->max_addrs = nr;
for (i = 0; i < nr; i++) {
struct sockaddr_rxrpc *srx = &alist->addrs[i];
@@ -109,8 +108,6 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len,
} while (p < end);
_debug("%u/%u addresses", nr, AFS_MAX_ADDRESSES);
- if (nr > AFS_MAX_ADDRESSES)
- nr = AFS_MAX_ADDRESSES;
alist = afs_alloc_addrlist(nr, service, port);
if (!alist)
@@ -119,8 +116,10 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len,
/* Extract the addresses */
p = text;
do {
- struct sockaddr_rxrpc *srx = &alist->addrs[alist->nr_addrs];
const char *q, *stop;
+ unsigned int xport = port;
+ __be32 x[4];
+ int family;
if (*p == delim) {
p++;
@@ -136,19 +135,12 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len,
break;
}
- if (in4_pton(p, q - p,
- (u8 *)&srx->transport.sin6.sin6_addr.s6_addr32[3],
- -1, &stop)) {
- srx->transport.sin6.sin6_addr.s6_addr32[0] = 0;
- srx->transport.sin6.sin6_addr.s6_addr32[1] = 0;
- srx->transport.sin6.sin6_addr.s6_addr32[2] = htonl(0xffff);
- } else if (in6_pton(p, q - p,
- srx->transport.sin6.sin6_addr.s6_addr,
- -1, &stop)) {
- /* Nothing to do */
- } else {
+ if (in4_pton(p, q - p, (u8 *)&x[0], -1, &stop))
+ family = AF_INET;
+ else if (in6_pton(p, q - p, (u8 *)x, -1, &stop))
+ family = AF_INET6;
+ else
goto bad_address;
- }
if (stop != q)
goto bad_address;
@@ -160,7 +152,7 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len,
if (p < end) {
if (*p == '+') {
/* Port number specification "+1234" */
- unsigned int xport = 0;
+ xport = 0;
p++;
if (p >= end || !isdigit(*p))
goto bad_address;
@@ -171,7 +163,6 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len,
goto bad_address;
p++;
} while (p < end && isdigit(*p));
- srx->transport.sin6.sin6_port = htons(xport);
} else if (*p == delim) {
p++;
} else {
@@ -179,8 +170,12 @@ struct afs_addr_list *afs_parse_text_addrs(const char *text, size_t len,
}
}
- alist->nr_addrs++;
- } while (p < end && alist->nr_addrs < AFS_MAX_ADDRESSES);
+ if (family == AF_INET)
+ afs_merge_fs_addr4(alist, x[0], xport);
+ else
+ afs_merge_fs_addr6(alist, x, xport);
+
+ } while (p < end);
_leave(" = [nr %u]", alist->nr_addrs);
return alist;
@@ -237,19 +232,23 @@ struct afs_addr_list *afs_dns_query(struct afs_cell *cell, time64_t *_expiry)
*/
void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port)
{
- struct sockaddr_in6 *a;
- __be16 xport = htons(port);
+ struct sockaddr_rxrpc *srx;
+ u32 addr = ntohl(xdr);
int i;
+ if (alist->nr_addrs >= alist->max_addrs)
+ return;
+
for (i = 0; i < alist->nr_ipv4; i++) {
- a = &alist->addrs[i].transport.sin6;
- if (xdr == a->sin6_addr.s6_addr32[3] &&
- xport == a->sin6_port)
+ struct sockaddr_in *a = &alist->addrs[i].transport.sin;
+ u32 a_addr = ntohl(a->sin_addr.s_addr);
+ u16 a_port = ntohs(a->sin_port);
+
+ if (addr == a_addr && port == a_port)
return;
- if (xdr == a->sin6_addr.s6_addr32[3] &&
- (u16 __force)xport < (u16 __force)a->sin6_port)
+ if (addr == a_addr && port < a_port)
break;
- if ((u32 __force)xdr < (u32 __force)a->sin6_addr.s6_addr32[3])
+ if (addr < a_addr)
break;
}
@@ -258,12 +257,11 @@ void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port)
alist->addrs + i,
sizeof(alist->addrs[0]) * (alist->nr_addrs - i));
- a = &alist->addrs[i].transport.sin6;
- a->sin6_port = xport;
- a->sin6_addr.s6_addr32[0] = 0;
- a->sin6_addr.s6_addr32[1] = 0;
- a->sin6_addr.s6_addr32[2] = htonl(0xffff);
- a->sin6_addr.s6_addr32[3] = xdr;
+ srx = &alist->addrs[i];
+ srx->transport_len = sizeof(srx->transport.sin);
+ srx->transport.sin.sin_family = AF_INET;
+ srx->transport.sin.sin_port = htons(port);
+ srx->transport.sin.sin_addr.s_addr = xdr;
alist->nr_ipv4++;
alist->nr_addrs++;
}
@@ -273,18 +271,20 @@ void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port)
*/
void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port)
{
- struct sockaddr_in6 *a;
- __be16 xport = htons(port);
+ struct sockaddr_rxrpc *srx;
int i, diff;
+ if (alist->nr_addrs >= alist->max_addrs)
+ return;
+
for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) {
- a = &alist->addrs[i].transport.sin6;
+ struct sockaddr_in6 *a = &alist->addrs[i].transport.sin6;
+ u16 a_port = ntohs(a->sin6_port);
+
diff = memcmp(xdr, &a->sin6_addr, 16);
- if (diff == 0 &&
- xport == a->sin6_port)
+ if (diff == 0 && port == a_port)
return;
- if (diff == 0 &&
- (u16 __force)xport < (u16 __force)a->sin6_port)
+ if (diff == 0 && port < a_port)
break;
if (diff < 0)
break;
@@ -295,12 +295,11 @@ void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port)
alist->addrs + i,
sizeof(alist->addrs[0]) * (alist->nr_addrs - i));
- a = &alist->addrs[i].transport.sin6;
- a->sin6_port = xport;
- a->sin6_addr.s6_addr32[0] = xdr[0];
- a->sin6_addr.s6_addr32[1] = xdr[1];
- a->sin6_addr.s6_addr32[2] = xdr[2];
- a->sin6_addr.s6_addr32[3] = xdr[3];
+ srx = &alist->addrs[i];
+ srx->transport_len = sizeof(srx->transport.sin6);
+ srx->transport.sin6.sin6_family = AF_INET6;
+ srx->transport.sin6.sin6_port = htons(port);
+ memcpy(&srx->transport.sin6.sin6_addr, xdr, 16);
alist->nr_addrs++;
}
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index f3d0bef16d78..6127f0fcd62c 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -514,6 +514,8 @@ static int afs_alloc_anon_key(struct afs_cell *cell)
*/
static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell)
{
+ struct hlist_node **p;
+ struct afs_cell *pcell;
int ret;
if (!cell->anonymous_key) {
@@ -534,7 +536,18 @@ static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell)
return ret;
mutex_lock(&net->proc_cells_lock);
- list_add_tail(&cell->proc_link, &net->proc_cells);
+ for (p = &net->proc_cells.first; *p; p = &(*p)->next) {
+ pcell = hlist_entry(*p, struct afs_cell, proc_link);
+ if (strcmp(cell->name, pcell->name) < 0)
+ break;
+ }
+
+ cell->proc_link.pprev = p;
+ cell->proc_link.next = *p;
+ rcu_assign_pointer(*p, &cell->proc_link.next);
+ if (cell->proc_link.next)
+ cell->proc_link.next->pprev = &cell->proc_link.next;
+
afs_dynroot_mkdir(net, cell);
mutex_unlock(&net->proc_cells_lock);
return 0;
@@ -550,7 +563,7 @@ static void afs_deactivate_cell(struct afs_net *net, struct afs_cell *cell)
afs_proc_cell_remove(cell);
mutex_lock(&net->proc_cells_lock);
- list_del_init(&cell->proc_link);
+ hlist_del_rcu(&cell->proc_link);
afs_dynroot_rmdir(net, cell);
mutex_unlock(&net->proc_cells_lock);
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index 1cde710a8013..f29c6dade7f6 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -265,7 +265,7 @@ int afs_dynroot_populate(struct super_block *sb)
return -ERESTARTSYS;
net->dynroot_sb = sb;
- list_for_each_entry(cell, &net->proc_cells, proc_link) {
+ hlist_for_each_entry(cell, &net->proc_cells, proc_link) {
ret = afs_dynroot_mkdir(net, cell);
if (ret < 0)
goto error;
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 871a228d7f37..72de1f157d20 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -73,12 +73,14 @@ struct afs_addr_list {
struct rcu_head rcu; /* Must be first */
refcount_t usage;
u32 version; /* Version */
- unsigned short nr_addrs;
- unsigned short index; /* Address currently in use */
- unsigned short nr_ipv4; /* Number of IPv4 addresses */
+ unsigned char max_addrs;
+ unsigned char nr_addrs;
+ unsigned char index; /* Address currently in use */
+ unsigned char nr_ipv4; /* Number of IPv4 addresses */
unsigned long probed; /* Mask of servers that have been probed */
unsigned long yfs; /* Mask of servers that are YFS */
struct sockaddr_rxrpc addrs[];
+#define AFS_MAX_ADDRESSES ((unsigned int)(sizeof(unsigned long) * 8))
};
/*
@@ -242,7 +244,7 @@ struct afs_net {
seqlock_t cells_lock;
struct mutex proc_cells_lock;
- struct list_head proc_cells;
+ struct hlist_head proc_cells;
/* Known servers. Theoretically each fileserver can only be in one
* cell, but in practice, people create aliases and subsets and there's
@@ -320,7 +322,7 @@ struct afs_cell {
struct afs_net *net;
struct key *anonymous_key; /* anonymous user key for this cell */
struct work_struct manager; /* Manager for init/deinit/dns */
- struct list_head proc_link; /* /proc cell list link */
+ struct hlist_node proc_link; /* /proc cell list link */
#ifdef CONFIG_AFS_FSCACHE
struct fscache_cookie *cache; /* caching cookie */
#endif
diff --git a/fs/afs/main.c b/fs/afs/main.c
index e84fe822a960..107427688edd 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -87,7 +87,7 @@ static int __net_init afs_net_init(struct net *net_ns)
timer_setup(&net->cells_timer, afs_cells_timer, 0);
mutex_init(&net->proc_cells_lock);
- INIT_LIST_HEAD(&net->proc_cells);
+ INIT_HLIST_HEAD(&net->proc_cells);
seqlock_init(&net->fs_lock);
net->fs_servers = RB_ROOT;
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 0c3285c8db95..9101f62707af 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -33,9 +33,8 @@ static inline struct afs_net *afs_seq2net_single(struct seq_file *m)
static int afs_proc_cells_show(struct seq_file *m, void *v)
{
struct afs_cell *cell = list_entry(v, struct afs_cell, proc_link);
- struct afs_net *net = afs_seq2net(m);
- if (v == &net->proc_cells) {
+ if (v == SEQ_START_TOKEN) {
/* display header on line 1 */
seq_puts(m, "USE NAME\n");
return 0;
@@ -50,12 +49,12 @@ static void *afs_proc_cells_start(struct seq_file *m, loff_t *_pos)
__acquires(rcu)
{
rcu_read_lock();
- return seq_list_start_head(&afs_seq2net(m)->proc_cells, *_pos);
+ return seq_hlist_start_head_rcu(&afs_seq2net(m)->proc_cells, *_pos);
}
static void *afs_proc_cells_next(struct seq_file *m, void *v, loff_t *pos)
{
- return seq_list_next(v, &afs_seq2net(m)->proc_cells, pos);
+ return seq_hlist_next_rcu(v, &afs_seq2net(m)->proc_cells, pos);
}
static void afs_proc_cells_stop(struct seq_file *m, void *v)
@@ -98,13 +97,13 @@ static int afs_proc_cells_write(struct file *file, char *buf, size_t size)
goto inval;
args = strchr(name, ' ');
- if (!args)
- goto inval;
- do {
- *args++ = 0;
- } while(*args == ' ');
- if (!*args)
- goto inval;
+ if (args) {
+ do {
+ *args++ = 0;
+ } while(*args == ' ');
+ if (!*args)
+ goto inval;
+ }
/* determine command to perform */
_debug("cmd=%s name=%s args=%s", buf, name, args);
@@ -120,7 +119,6 @@ static int afs_proc_cells_write(struct file *file, char *buf, size_t size)
if (test_and_set_bit(AFS_CELL_FL_NO_GC, &cell->flags))
afs_put_cell(net, cell);
- printk("kAFS: Added new cell '%s'\n", name);
} else {
goto inval;
}
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 35f2ae30f31f..77a83790a31f 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -690,8 +690,6 @@ static void afs_process_async_call(struct work_struct *work)
}
if (call->state == AFS_CALL_COMPLETE) {
- call->reply[0] = NULL;
-
/* We have two refs to release - one from the alloc and one
* queued with the work item - and we can't just deallocate the
* call because the work item may be queued again.
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index efae2fb0930a..54207327f98f 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1580,7 +1580,7 @@ static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm)
}
static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata,
- const siginfo_t *siginfo)
+ const kernel_siginfo_t *siginfo)
{
mm_segment_t old_fs = get_fs();
set_fs(KERNEL_DS);
@@ -1782,7 +1782,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
static int fill_note_info(struct elfhdr *elf, int phdrs,
struct elf_note_info *info,
- const siginfo_t *siginfo, struct pt_regs *regs)
+ const kernel_siginfo_t *siginfo, struct pt_regs *regs)
{
struct task_struct *dump_task = current;
const struct user_regset_view *view = task_user_regset_view(dump_task);
@@ -2031,7 +2031,7 @@ static int elf_note_info_init(struct elf_note_info *info)
static int fill_note_info(struct elfhdr *elf, int phdrs,
struct elf_note_info *info,
- const siginfo_t *siginfo, struct pt_regs *regs)
+ const kernel_siginfo_t *siginfo, struct pt_regs *regs)
{
struct list_head *t;
struct core_thread *ct;
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index ae750b1574a2..68ebe188446a 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -112,11 +112,11 @@ static int find_extent_in_eb(const struct extent_buffer *eb,
}
struct preftree {
- struct rb_root root;
+ struct rb_root_cached root;
unsigned int count;
};
-#define PREFTREE_INIT { .root = RB_ROOT, .count = 0 }
+#define PREFTREE_INIT { .root = RB_ROOT_CACHED, .count = 0 }
struct preftrees {
struct preftree direct; /* BTRFS_SHARED_[DATA|BLOCK]_REF_KEY */
@@ -225,14 +225,15 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info,
struct prelim_ref *newref,
struct share_check *sc)
{
- struct rb_root *root;
+ struct rb_root_cached *root;
struct rb_node **p;
struct rb_node *parent = NULL;
struct prelim_ref *ref;
int result;
+ bool leftmost = true;
root = &preftree->root;
- p = &root->rb_node;
+ p = &root->rb_root.rb_node;
while (*p) {
parent = *p;
@@ -242,6 +243,7 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info,
p = &(*p)->rb_left;
} else if (result > 0) {
p = &(*p)->rb_right;
+ leftmost = false;
} else {
/* Identical refs, merge them and free @newref */
struct extent_inode_elem *eie = ref->inode_list;
@@ -272,7 +274,7 @@ static void prelim_ref_insert(const struct btrfs_fs_info *fs_info,
preftree->count++;
trace_btrfs_prelim_ref_insert(fs_info, newref, NULL, preftree->count);
rb_link_node(&newref->rbnode, parent, p);
- rb_insert_color(&newref->rbnode, root);
+ rb_insert_color_cached(&newref->rbnode, root, leftmost);
}
/*
@@ -283,11 +285,11 @@ static void prelim_release(struct preftree *preftree)
{
struct prelim_ref *ref, *next_ref;
- rbtree_postorder_for_each_entry_safe(ref, next_ref, &preftree->root,
- rbnode)
+ rbtree_postorder_for_each_entry_safe(ref, next_ref,
+ &preftree->root.rb_root, rbnode)
free_pref(ref);
- preftree->root = RB_ROOT;
+ preftree->root = RB_ROOT_CACHED;
preftree->count = 0;
}
@@ -627,7 +629,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
* freeing the entire indirect tree when we're done. In some test
* cases, the tree can grow quite large (~200k objects).
*/
- while ((rnode = rb_first(&preftrees->indirect.root))) {
+ while ((rnode = rb_first_cached(&preftrees->indirect.root))) {
struct prelim_ref *ref;
ref = rb_entry(rnode, struct prelim_ref, rbnode);
@@ -637,7 +639,7 @@ static int resolve_indirect_refs(struct btrfs_fs_info *fs_info,
goto out;
}
- rb_erase(&ref->rbnode, &preftrees->indirect.root);
+ rb_erase_cached(&ref->rbnode, &preftrees->indirect.root);
preftrees->indirect.count--;
if (ref->count == 0) {
@@ -717,9 +719,9 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info,
struct preftree *tree = &preftrees->indirect_missing_keys;
struct rb_node *node;
- while ((node = rb_first(&tree->root))) {
+ while ((node = rb_first_cached(&tree->root))) {
ref = rb_entry(node, struct prelim_ref, rbnode);
- rb_erase(node, &tree->root);
+ rb_erase_cached(node, &tree->root);
BUG_ON(ref->parent); /* should not be a direct ref */
BUG_ON(ref->key_for_search.type);
@@ -769,7 +771,7 @@ static int add_delayed_refs(const struct btrfs_fs_info *fs_info,
btrfs_disk_key_to_cpu(&tmp_op_key, &extent_op->key);
spin_lock(&head->lock);
- for (n = rb_first(&head->ref_tree); n; n = rb_next(n)) {
+ for (n = rb_first_cached(&head->ref_tree); n; n = rb_next(n)) {
node = rb_entry(n, struct btrfs_delayed_ref_node,
ref_node);
if (node->seq > seq)
@@ -1229,14 +1231,14 @@ again:
if (ret)
goto out;
- WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root));
+ WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect_missing_keys.root.rb_root));
ret = resolve_indirect_refs(fs_info, path, time_seq, &preftrees,
extent_item_pos, total_refs, sc, ignore_offset);
if (ret)
goto out;
- WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect.root));
+ WARN_ON(!RB_EMPTY_ROOT(&preftrees.indirect.root.rb_root));
/*
* This walks the tree of merged and resolved refs. Tree blocks are
@@ -1245,7 +1247,7 @@ again:
*
* We release the entire tree in one go before returning.
*/
- node = rb_first(&preftrees.direct.root);
+ node = rb_first_cached(&preftrees.direct.root);
while (node) {
ref = rb_entry(node, struct prelim_ref, rbnode);
node = rb_next(&ref->rbnode);
@@ -1468,7 +1470,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr)
struct seq_list elem = SEQ_LIST_INIT(elem);
int ret = 0;
struct share_check shared = {
- .root_objectid = root->objectid,
+ .root_objectid = root->root_key.objectid,
.inum = inum,
.share_count = 0,
};
@@ -2031,7 +2033,8 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
/* path must be released before calling iterate()! */
btrfs_debug(fs_root->fs_info,
"following ref at offset %u for inode %llu in tree %llu",
- cur, found_key.objectid, fs_root->objectid);
+ cur, found_key.objectid,
+ fs_root->root_key.objectid);
ret = iterate(parent, name_len,
(unsigned long)(iref + 1), eb, ctx);
if (ret)
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 1343ac57b438..97d91e55b70a 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -206,7 +206,7 @@ static inline struct btrfs_inode *BTRFS_I(const struct inode *inode)
static inline unsigned long btrfs_inode_hash(u64 objectid,
const struct btrfs_root *root)
{
- u64 h = objectid ^ (root->objectid * GOLDEN_RATIO_PRIME);
+ u64 h = objectid ^ (root->root_key.objectid * GOLDEN_RATIO_PRIME);
#if BITS_PER_LONG == 32
h = (h >> 32) ^ (h & 0xffffffff);
@@ -339,15 +339,15 @@ static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode,
struct btrfs_root *root = inode->root;
/* Output minus objectid, which is more meaningful */
- if (root->objectid >= BTRFS_LAST_FREE_OBJECTID)
+ if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID)
btrfs_warn_rl(root->fs_info,
"csum failed root %lld ino %lld off %llu csum 0x%08x expected csum 0x%08x mirror %d",
- root->objectid, btrfs_ino(inode),
+ root->root_key.objectid, btrfs_ino(inode),
logical_start, csum, csum_expected, mirror_num);
else
btrfs_warn_rl(root->fs_info,
"csum failed root %llu ino %llu off %llu csum 0x%08x expected csum 0x%08x mirror %d",
- root->objectid, btrfs_ino(inode),
+ root->root_key.objectid, btrfs_ino(inode),
logical_start, csum, csum_expected, mirror_num);
}
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 833cf3c35b4d..2e43fba44035 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -1594,6 +1594,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
{
unsigned int num_pages;
unsigned int i;
+ size_t size;
u64 dev_bytenr;
int ret;
@@ -1608,9 +1609,8 @@ static int btrfsic_read_block(struct btrfsic_state *state,
num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >>
PAGE_SHIFT;
- block_ctx->mem_to_free = kcalloc(sizeof(*block_ctx->datav) +
- sizeof(*block_ctx->pagev),
- num_pages, GFP_NOFS);
+ size = sizeof(*block_ctx->datav) + sizeof(*block_ctx->pagev);
+ block_ctx->mem_to_free = kcalloc(num_pages, size, GFP_NOFS);
if (!block_ctx->mem_to_free)
return -ENOMEM;
block_ctx->datav = block_ctx->mem_to_free;
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 9bfa66592aa7..8703ce68fe9d 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -528,7 +528,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
- struct extent_io_tree *tree;
struct extent_map_tree *em_tree;
struct compressed_bio *cb;
unsigned long compressed_len;
@@ -545,7 +544,6 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
int faili = 0;
u32 *sums;
- tree = &BTRFS_I(inode)->io_tree;
em_tree = &BTRFS_I(inode)->extent_tree;
/* we need the actual starting offset of this extent in the file */
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index d436fb4c002e..2ee43b6a4f09 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -52,42 +52,6 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p)
}
}
-/*
- * reset all the locked nodes in the patch to spinning locks.
- *
- * held is used to keep lockdep happy, when lockdep is enabled
- * we set held to a blocking lock before we go around and
- * retake all the spinlocks in the path. You can safely use NULL
- * for held
- */
-noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
- struct extent_buffer *held, int held_rw)
-{
- int i;
-
- if (held) {
- btrfs_set_lock_blocking_rw(held, held_rw);
- if (held_rw == BTRFS_WRITE_LOCK)
- held_rw = BTRFS_WRITE_LOCK_BLOCKING;
- else if (held_rw == BTRFS_READ_LOCK)
- held_rw = BTRFS_READ_LOCK_BLOCKING;
- }
- btrfs_set_path_blocking(p);
-
- for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) {
- if (p->nodes[i] && p->locks[i]) {
- btrfs_clear_lock_blocking_rw(p->nodes[i], p->locks[i]);
- if (p->locks[i] == BTRFS_WRITE_LOCK_BLOCKING)
- p->locks[i] = BTRFS_WRITE_LOCK;
- else if (p->locks[i] == BTRFS_READ_LOCK_BLOCKING)
- p->locks[i] = BTRFS_READ_LOCK;
- }
- }
-
- if (held)
- btrfs_clear_lock_blocking_rw(held, held_rw);
-}
-
/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
@@ -207,7 +171,7 @@ static void add_root_to_dirty_list(struct btrfs_root *root)
spin_lock(&fs_info->trans_lock);
if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
/* Want the extent tree to be the last on the list */
- if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID)
+ if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
list_move_tail(&root->dirty_list,
&fs_info->dirty_cowonly_roots);
else
@@ -1306,7 +1270,6 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
}
}
- btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK);
btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
@@ -1815,8 +1778,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
int orig_slot = path->slots[level];
u64 orig_ptr;
- if (level == 0)
- return 0;
+ ASSERT(level > 0);
mid = path->nodes[level];
@@ -2483,7 +2445,6 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
btrfs_set_path_blocking(p);
reada_for_balance(fs_info, p, level);
sret = split_node(trans, root, p, level);
- btrfs_clear_path_blocking(p, NULL, 0);
BUG_ON(sret > 0);
if (sret) {
@@ -2504,7 +2465,6 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
btrfs_set_path_blocking(p);
reada_for_balance(fs_info, p, level);
sret = balance_level(trans, root, p, level);
- btrfs_clear_path_blocking(p, NULL, 0);
if (sret) {
ret = sret;
@@ -2789,7 +2749,10 @@ again:
}
cow_done:
p->nodes[level] = b;
- btrfs_clear_path_blocking(p, NULL, 0);
+ /*
+ * Leave path with blocking locks to avoid massive
+ * lock context switch, this is made on purpose.
+ */
/*
* we have a lock on b and as long as we aren't changing
@@ -2871,8 +2834,6 @@ cow_done:
if (!err) {
btrfs_set_path_blocking(p);
btrfs_tree_lock(b);
- btrfs_clear_path_blocking(p, b,
- BTRFS_WRITE_LOCK);
}
p->locks[level] = BTRFS_WRITE_LOCK;
} else {
@@ -2880,8 +2841,6 @@ cow_done:
if (!err) {
btrfs_set_path_blocking(p);
btrfs_tree_read_lock(b);
- btrfs_clear_path_blocking(p, b,
- BTRFS_READ_LOCK);
}
p->locks[level] = BTRFS_READ_LOCK;
}
@@ -2900,7 +2859,6 @@ cow_done:
btrfs_set_path_blocking(p);
err = split_leaf(trans, root, key,
p, ins_len, ret == 0);
- btrfs_clear_path_blocking(p, NULL, 0);
BUG_ON(err > 0);
if (err) {
@@ -2910,7 +2868,7 @@ cow_done:
}
if (!p->search_for_split)
unlock_up(p, level, lowest_unlock,
- min_write_lock_level, &write_lock_level);
+ min_write_lock_level, NULL);
goto done;
}
}
@@ -2961,13 +2919,16 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
again:
b = get_old_root(root, time_seq);
+ if (!b) {
+ ret = -EIO;
+ goto done;
+ }
level = btrfs_header_level(b);
p->locks[level] = BTRFS_READ_LOCK;
while (b) {
level = btrfs_header_level(b);
p->nodes[level] = b;
- btrfs_clear_path_blocking(p, NULL, 0);
/*
* we have a lock on b and as long as we aren't changing
@@ -3013,8 +2974,6 @@ again:
if (!err) {
btrfs_set_path_blocking(p);
btrfs_tree_read_lock(b);
- btrfs_clear_path_blocking(p, b,
- BTRFS_READ_LOCK);
}
b = tree_mod_log_rewind(fs_info, p, b, time_seq);
if (!b) {
@@ -5198,7 +5157,6 @@ find_next_key:
path->locks[level - 1] = BTRFS_READ_LOCK;
path->nodes[level - 1] = cur;
unlock_up(path, level, 1, 0, NULL);
- btrfs_clear_path_blocking(path, NULL, 0);
}
out:
path->keep_locks = keep_locks;
@@ -5783,8 +5741,6 @@ again:
if (!ret) {
btrfs_set_path_blocking(path);
btrfs_tree_read_lock(next);
- btrfs_clear_path_blocking(path, next,
- BTRFS_READ_LOCK);
}
next_rw_lock = BTRFS_READ_LOCK;
}
@@ -5820,8 +5776,6 @@ again:
if (!ret) {
btrfs_set_path_blocking(path);
btrfs_tree_read_lock(next);
- btrfs_clear_path_blocking(path, next,
- BTRFS_READ_LOCK);
}
next_rw_lock = BTRFS_READ_LOCK;
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 53af9f5253f4..68ca41dbbef3 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -41,12 +41,6 @@ extern struct kmem_cache *btrfs_path_cachep;
extern struct kmem_cache *btrfs_free_space_cachep;
struct btrfs_ordered_sum;
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-#define STATIC noinline
-#else
-#define STATIC static noinline
-#endif
-
#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */
#define BTRFS_MAX_MIRRORS 3
@@ -367,11 +361,13 @@ struct btrfs_dev_replace {
struct mutex lock_finishing_cancel_unmount;
rwlock_t lock;
- atomic_t read_locks;
atomic_t blocking_readers;
wait_queue_head_t read_lock_wq;
struct btrfs_scrub_progress scrub_progress;
+
+ struct percpu_counter bio_counter;
+ wait_queue_head_t replace_wait;
};
/* For raid type sysfs entries */
@@ -1094,9 +1090,6 @@ struct btrfs_fs_info {
/* device replace state */
struct btrfs_dev_replace dev_replace;
- struct percpu_counter bio_counter;
- wait_queue_head_t replace_wait;
-
struct semaphore uuid_tree_rescan_sem;
/* Used to reclaim the metadata space in the background. */
@@ -1202,18 +1195,12 @@ struct btrfs_root {
int last_log_commit;
pid_t log_start_pid;
- u64 objectid;
u64 last_trans;
u32 type;
u64 highest_objectid;
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- /* only used with CONFIG_BTRFS_FS_RUN_SANITY_TESTS is enabled */
- u64 alloc_bytenr;
-#endif
-
u64 defrag_trans_start;
struct btrfs_key defrag_progress;
struct btrfs_key defrag_max;
@@ -1280,11 +1267,16 @@ struct btrfs_root {
int send_in_progress;
struct btrfs_subvolume_writers *subv_writers;
atomic_t will_be_snapshotted;
+ atomic_t snapshot_force_cow;
/* For qgroup metadata reserved space */
spinlock_t qgroup_meta_rsv_lock;
u64 qgroup_meta_rsv_pertrans;
u64 qgroup_meta_rsv_prealloc;
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+ u64 alloc_bytenr;
+#endif
};
struct btrfs_file_private {
@@ -2606,10 +2598,8 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info,
return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items;
}
-int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
-int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info);
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans);
+int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans);
void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
const u64 start);
void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg);
@@ -2770,7 +2760,7 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
enum btrfs_reserve_flush_enum flush);
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
struct btrfs_block_rsv *dst_rsv, u64 num_bytes,
- int update_size);
+ bool update_size);
int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *dest, u64 num_bytes,
int min_factor);
@@ -2876,8 +2866,6 @@ void btrfs_release_path(struct btrfs_path *p);
struct btrfs_path *btrfs_alloc_path(void);
void btrfs_free_path(struct btrfs_path *p);
void btrfs_set_path_blocking(struct btrfs_path *p);
-void btrfs_clear_path_blocking(struct btrfs_path *p,
- struct extent_buffer *held, int held_rw);
void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -3020,8 +3008,7 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
/* dir-item.c */
int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
const char *name, int name_len);
-int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, const char *name,
+int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name,
int name_len, struct btrfs_inode *dir,
struct btrfs_key *location, u8 type, u64 index);
struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
@@ -3179,8 +3166,8 @@ void __cold btrfs_destroy_cachep(void);
struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
struct btrfs_root *root, int *was_new);
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
- struct page *page, size_t pg_offset,
- u64 start, u64 end, int create);
+ struct page *page, size_t pg_offset,
+ u64 start, u64 end, int create);
int btrfs_update_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode);
@@ -3200,9 +3187,6 @@ int btrfs_prealloc_file_range_trans(struct inode *inode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint);
extern const struct dentry_operations btrfs_dentry_operations;
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-void btrfs_test_inode_set_ops(struct inode *inode);
-#endif
/* ioctl.c */
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
@@ -3390,9 +3374,9 @@ do { \
#define btrfs_debug(fs_info, fmt, args...) \
btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args)
#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
- btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args)
+ btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
- btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args)
+ btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
#define btrfs_debug_rl(fs_info, fmt, args...) \
btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args)
#endif
@@ -3404,6 +3388,13 @@ do { \
rcu_read_unlock(); \
} while (0)
+#define btrfs_no_printk_in_rcu(fs_info, fmt, args...) \
+do { \
+ rcu_read_lock(); \
+ btrfs_no_printk(fs_info, fmt, ##args); \
+ rcu_read_unlock(); \
+} while (0)
+
#define btrfs_printk_ratelimited(fs_info, fmt, args...) \
do { \
static DEFINE_RATELIMIT_STATE(_rs, \
@@ -3708,18 +3699,19 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
/* Sanity test specific functions */
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+void btrfs_test_inode_set_ops(struct inode *inode);
void btrfs_test_destroy_inode(struct inode *inode);
-#endif
static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
{
-#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
- if (unlikely(test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO,
- &fs_info->fs_state)))
- return 1;
-#endif
+ return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
+}
+#else
+static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
+{
return 0;
}
+#endif
static inline void cond_wake_up(struct wait_queue_head *wq)
{
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index f51b509f2d9b..c669f250d4a0 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -42,8 +42,8 @@ static inline void btrfs_init_delayed_node(
delayed_node->root = root;
delayed_node->inode_id = inode_id;
refcount_set(&delayed_node->refs, 0);
- delayed_node->ins_root = RB_ROOT;
- delayed_node->del_root = RB_ROOT;
+ delayed_node->ins_root = RB_ROOT_CACHED;
+ delayed_node->del_root = RB_ROOT_CACHED;
mutex_init(&delayed_node->mutex);
INIT_LIST_HEAD(&delayed_node->n_list);
INIT_LIST_HEAD(&delayed_node->p_list);
@@ -390,7 +390,7 @@ static struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item(
struct btrfs_delayed_node *delayed_node,
struct btrfs_key *key)
{
- return __btrfs_lookup_delayed_item(&delayed_node->ins_root, key,
+ return __btrfs_lookup_delayed_item(&delayed_node->ins_root.rb_root, key,
NULL, NULL);
}
@@ -400,9 +400,10 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
{
struct rb_node **p, *node;
struct rb_node *parent_node = NULL;
- struct rb_root *root;
+ struct rb_root_cached *root;
struct btrfs_delayed_item *item;
int cmp;
+ bool leftmost = true;
if (action == BTRFS_DELAYED_INSERTION_ITEM)
root = &delayed_node->ins_root;
@@ -410,7 +411,7 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
root = &delayed_node->del_root;
else
BUG();
- p = &root->rb_node;
+ p = &root->rb_root.rb_node;
node = &ins->rb_node;
while (*p) {
@@ -419,16 +420,18 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
rb_node);
cmp = btrfs_comp_cpu_keys(&item->key, &ins->key);
- if (cmp < 0)
+ if (cmp < 0) {
p = &(*p)->rb_right;
- else if (cmp > 0)
+ leftmost = false;
+ } else if (cmp > 0) {
p = &(*p)->rb_left;
- else
+ } else {
return -EEXIST;
+ }
}
rb_link_node(node, parent_node, p);
- rb_insert_color(node, root);
+ rb_insert_color_cached(node, root, leftmost);
ins->delayed_node = delayed_node;
ins->ins_or_del = action;
@@ -468,7 +471,7 @@ static void finish_one_item(struct btrfs_delayed_root *delayed_root)
static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
{
- struct rb_root *root;
+ struct rb_root_cached *root;
struct btrfs_delayed_root *delayed_root;
delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root;
@@ -482,7 +485,7 @@ static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
else
root = &delayed_item->delayed_node->del_root;
- rb_erase(&delayed_item->rb_node, root);
+ rb_erase_cached(&delayed_item->rb_node, root);
delayed_item->delayed_node->count--;
finish_one_item(delayed_root);
@@ -503,7 +506,7 @@ static struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item(
struct rb_node *p;
struct btrfs_delayed_item *item = NULL;
- p = rb_first(&delayed_node->ins_root);
+ p = rb_first_cached(&delayed_node->ins_root);
if (p)
item = rb_entry(p, struct btrfs_delayed_item, rb_node);
@@ -516,7 +519,7 @@ static struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item(
struct rb_node *p;
struct btrfs_delayed_item *item = NULL;
- p = rb_first(&delayed_node->del_root);
+ p = rb_first_cached(&delayed_node->del_root);
if (p)
item = rb_entry(p, struct btrfs_delayed_item, rb_node);
@@ -559,7 +562,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
* reserved space when starting a transaction. So no need to reserve
* qgroup space here.
*/
- ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
+ ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
if (!ret) {
trace_btrfs_space_reservation(fs_info, "delayed_item",
item->key.objectid,
@@ -647,7 +650,7 @@ static int btrfs_delayed_inode_reserve_metadata(
return ret;
}
- ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
+ ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
if (!ret) {
trace_btrfs_space_reservation(fs_info, "delayed_inode",
btrfs_ino(inode), num_bytes, 1);
@@ -762,9 +765,6 @@ static int btrfs_batch_insert_items(struct btrfs_root *root,
i++;
}
- /* reset all the locked nodes in the patch to spinning locks. */
- btrfs_clear_path_blocking(path, NULL, 0);
-
/* insert the keys of the items */
setup_items_for_insert(root, path, keys, data_size,
total_data_size, total_size, nitems);
@@ -1462,7 +1462,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
if (unlikely(ret)) {
btrfs_err(trans->fs_info,
"err add delayed dir index item(name: %.*s) into the insertion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
- name_len, name, delayed_node->root->objectid,
+ name_len, name, delayed_node->root->root_key.objectid,
delayed_node->inode_id, ret);
BUG();
}
@@ -1533,7 +1533,8 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
if (unlikely(ret)) {
btrfs_err(trans->fs_info,
"err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)",
- index, node->root->objectid, node->inode_id, ret);
+ index, node->root->root_key.objectid,
+ node->inode_id, ret);
BUG();
}
mutex_unlock(&node->mutex);
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 33536cd681d4..74ae226ffaf0 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -50,8 +50,8 @@ struct btrfs_delayed_node {
* is waiting to be dealt with by the async worker.
*/
struct list_head p_list;
- struct rb_root ins_root;
- struct rb_root del_root;
+ struct rb_root_cached ins_root;
+ struct rb_root_cached del_root;
struct mutex mutex;
struct btrfs_inode_item inode_item;
refcount_t refs;
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 62ff545ba1f7..5149165b49a4 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -101,14 +101,15 @@ static int comp_refs(struct btrfs_delayed_ref_node *ref1,
}
/* insert a new ref to head ref rbtree */
-static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
+static struct btrfs_delayed_ref_head *htree_insert(struct rb_root_cached *root,
struct rb_node *node)
{
- struct rb_node **p = &root->rb_node;
+ struct rb_node **p = &root->rb_root.rb_node;
struct rb_node *parent_node = NULL;
struct btrfs_delayed_ref_head *entry;
struct btrfs_delayed_ref_head *ins;
u64 bytenr;
+ bool leftmost = true;
ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node);
bytenr = ins->bytenr;
@@ -117,26 +118,29 @@ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root,
entry = rb_entry(parent_node, struct btrfs_delayed_ref_head,
href_node);
- if (bytenr < entry->bytenr)
+ if (bytenr < entry->bytenr) {
p = &(*p)->rb_left;
- else if (bytenr > entry->bytenr)
+ } else if (bytenr > entry->bytenr) {
p = &(*p)->rb_right;
- else
+ leftmost = false;
+ } else {
return entry;
+ }
}
rb_link_node(node, parent_node, p);
- rb_insert_color(node, root);
+ rb_insert_color_cached(node, root, leftmost);
return NULL;
}
-static struct btrfs_delayed_ref_node* tree_insert(struct rb_root *root,
+static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root,
struct btrfs_delayed_ref_node *ins)
{
- struct rb_node **p = &root->rb_node;
+ struct rb_node **p = &root->rb_root.rb_node;
struct rb_node *node = &ins->ref_node;
struct rb_node *parent_node = NULL;
struct btrfs_delayed_ref_node *entry;
+ bool leftmost = true;
while (*p) {
int comp;
@@ -145,16 +149,18 @@ static struct btrfs_delayed_ref_node* tree_insert(struct rb_root *root,
entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
ref_node);
comp = comp_refs(ins, entry, true);
- if (comp < 0)
+ if (comp < 0) {
p = &(*p)->rb_left;
- else if (comp > 0)
+ } else if (comp > 0) {
p = &(*p)->rb_right;
- else
+ leftmost = false;
+ } else {
return entry;
+ }
}
rb_link_node(node, parent_node, p);
- rb_insert_color(node, root);
+ rb_insert_color_cached(node, root, leftmost);
return NULL;
}
@@ -162,12 +168,14 @@ static struct btrfs_delayed_ref_node* tree_insert(struct rb_root *root,
* find an head entry based on bytenr. This returns the delayed ref
* head if it was able to find one, or NULL if nothing was in that spot.
* If return_bigger is given, the next bigger entry is returned if no exact
- * match is found.
+ * match is found. But if no bigger one is found then the first node of the
+ * ref head tree will be returned.
*/
-static struct btrfs_delayed_ref_head *
-find_ref_head(struct rb_root *root, u64 bytenr,
- int return_bigger)
+static struct btrfs_delayed_ref_head* find_ref_head(
+ struct btrfs_delayed_ref_root *dr, u64 bytenr,
+ bool return_bigger)
{
+ struct rb_root *root = &dr->href_root.rb_root;
struct rb_node *n;
struct btrfs_delayed_ref_head *entry;
@@ -187,7 +195,7 @@ find_ref_head(struct rb_root *root, u64 bytenr,
if (bytenr > entry->bytenr) {
n = rb_next(&entry->href_node);
if (!n)
- n = rb_first(root);
+ n = rb_first_cached(&dr->href_root);
entry = rb_entry(n, struct btrfs_delayed_ref_head,
href_node);
return entry;
@@ -197,12 +205,9 @@ find_ref_head(struct rb_root *root, u64 bytenr,
return NULL;
}
-int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
+int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head)
{
- struct btrfs_delayed_ref_root *delayed_refs;
-
- delayed_refs = &trans->transaction->delayed_refs;
lockdep_assert_held(&delayed_refs->lock);
if (mutex_trylock(&head->mutex))
return 0;
@@ -227,7 +232,7 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_node *ref)
{
lockdep_assert_held(&head->lock);
- rb_erase(&ref->ref_node, &head->ref_tree);
+ rb_erase_cached(&ref->ref_node, &head->ref_tree);
RB_CLEAR_NODE(&ref->ref_node);
if (!list_empty(&ref->add_list))
list_del(&ref->add_list);
@@ -296,7 +301,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
lockdep_assert_held(&head->lock);
- if (RB_EMPTY_ROOT(&head->ref_tree))
+ if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
return;
/* We don't have too many refs to merge for data. */
@@ -314,7 +319,8 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
spin_unlock(&fs_info->tree_mod_seq_lock);
again:
- for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) {
+ for (node = rb_first_cached(&head->ref_tree); node;
+ node = rb_next(node)) {
ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
if (seq && ref->seq >= seq)
continue;
@@ -345,24 +351,21 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq)
return ret;
}
-struct btrfs_delayed_ref_head *
-btrfs_select_ref_head(struct btrfs_trans_handle *trans)
+struct btrfs_delayed_ref_head *btrfs_select_ref_head(
+ struct btrfs_delayed_ref_root *delayed_refs)
{
- struct btrfs_delayed_ref_root *delayed_refs;
struct btrfs_delayed_ref_head *head;
u64 start;
bool loop = false;
- delayed_refs = &trans->transaction->delayed_refs;
-
again:
start = delayed_refs->run_delayed_start;
- head = find_ref_head(&delayed_refs->href_root, start, 1);
+ head = find_ref_head(delayed_refs, start, true);
if (!head && !loop) {
delayed_refs->run_delayed_start = 0;
start = 0;
loop = true;
- head = find_ref_head(&delayed_refs->href_root, start, 1);
+ head = find_ref_head(delayed_refs, start, true);
if (!head)
return NULL;
} else if (!head && loop) {
@@ -569,7 +572,7 @@ static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
head_ref->must_insert_reserved = must_insert_reserved;
head_ref->is_data = is_data;
head_ref->is_system = is_system;
- head_ref->ref_tree = RB_ROOT;
+ head_ref->ref_tree = RB_ROOT_CACHED;
INIT_LIST_HEAD(&head_ref->ref_add_list);
RB_CLEAR_NODE(&head_ref->href_node);
head_ref->processing = 0;
@@ -903,7 +906,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_head *
btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr)
{
- return find_ref_head(&delayed_refs->href_root, bytenr, 0);
+ return find_ref_head(delayed_refs, bytenr, false);
}
void __cold btrfs_delayed_ref_exit(void)
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index d9f2a4ebd5db..8e20c5cb5404 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -79,7 +79,7 @@ struct btrfs_delayed_ref_head {
struct mutex mutex;
spinlock_t lock;
- struct rb_root ref_tree;
+ struct rb_root_cached ref_tree;
/* accumulate add BTRFS_ADD_DELAYED_REF nodes to this ref_add_list. */
struct list_head ref_add_list;
@@ -148,7 +148,7 @@ struct btrfs_delayed_data_ref {
struct btrfs_delayed_ref_root {
/* head ref rbtree */
- struct rb_root href_root;
+ struct rb_root_cached href_root;
/* dirty extent records */
struct rb_root dirty_extent_root;
@@ -255,7 +255,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
struct btrfs_delayed_ref_head *
btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
u64 bytenr);
-int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
+int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head);
static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
{
@@ -263,8 +263,8 @@ static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head)
}
-struct btrfs_delayed_ref_head *
-btrfs_select_ref_head(struct btrfs_trans_handle *trans);
+struct btrfs_delayed_ref_head *btrfs_select_ref_head(
+ struct btrfs_delayed_ref_root *delayed_refs);
int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index dec01970d8c5..2aa48aecc52b 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -382,14 +382,6 @@ out:
return ret;
}
-void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info)
-{
- struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
-
- dev_replace->committed_cursor_left =
- dev_replace->cursor_left_last_write_of_item;
-}
-
static char* btrfs_dev_name(struct btrfs_device *device)
{
if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
@@ -408,11 +400,12 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
int ret;
struct btrfs_device *tgt_device = NULL;
struct btrfs_device *src_device = NULL;
+ bool need_unlock;
- ret = btrfs_find_device_by_devspec(fs_info, srcdevid,
- srcdev_name, &src_device);
- if (ret)
- return ret;
+ src_device = btrfs_find_device_by_devspec(fs_info, srcdevid,
+ srcdev_name);
+ if (IS_ERR(src_device))
+ return PTR_ERR(src_device);
ret = btrfs_init_dev_replace_tgtdev(fs_info, tgtdev_name,
src_device, &tgt_device);
@@ -432,6 +425,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
return PTR_ERR(trans);
}
+ need_unlock = true;
btrfs_dev_replace_write_lock(dev_replace);
switch (dev_replace->replace_state) {
case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED:
@@ -440,6 +434,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
break;
case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED:
case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED:
+ ASSERT(0);
ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED;
goto leave;
}
@@ -470,6 +465,7 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
atomic64_set(&dev_replace->num_write_errors, 0);
atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0);
btrfs_dev_replace_write_unlock(dev_replace);
+ need_unlock = false;
ret = btrfs_sysfs_add_device_link(tgt_device->fs_devices, tgt_device);
if (ret)
@@ -481,7 +477,12 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
+ need_unlock = true;
btrfs_dev_replace_write_lock(dev_replace);
+ dev_replace->replace_state =
+ BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED;
+ dev_replace->srcdev = NULL;
+ dev_replace->tgtdev = NULL;
goto leave;
}
@@ -503,9 +504,8 @@ int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
return ret;
leave:
- dev_replace->srcdev = NULL;
- dev_replace->tgtdev = NULL;
- btrfs_dev_replace_write_unlock(dev_replace);
+ if (need_unlock)
+ btrfs_dev_replace_write_unlock(dev_replace);
btrfs_destroy_dev_replace_tgtdev(tgt_device);
return ret;
}
@@ -545,8 +545,8 @@ int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
{
set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
- wait_event(fs_info->replace_wait, !percpu_counter_sum(
- &fs_info->bio_counter));
+ wait_event(fs_info->dev_replace.replace_wait, !percpu_counter_sum(
+ &fs_info->dev_replace.bio_counter));
}
/*
@@ -555,7 +555,7 @@ static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
{
clear_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
- wake_up(&fs_info->replace_wait);
+ wake_up(&fs_info->dev_replace.replace_wait);
}
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
@@ -961,13 +961,10 @@ int btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace)
void btrfs_dev_replace_read_lock(struct btrfs_dev_replace *dev_replace)
{
read_lock(&dev_replace->lock);
- atomic_inc(&dev_replace->read_locks);
}
void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace)
{
- ASSERT(atomic_read(&dev_replace->read_locks) > 0);
- atomic_dec(&dev_replace->read_locks);
read_unlock(&dev_replace->lock);
}
@@ -985,7 +982,6 @@ again:
void btrfs_dev_replace_write_unlock(struct btrfs_dev_replace *dev_replace)
{
- ASSERT(atomic_read(&dev_replace->blocking_readers) == 0);
write_unlock(&dev_replace->lock);
}
@@ -994,45 +990,31 @@ void btrfs_dev_replace_set_lock_blocking(
struct btrfs_dev_replace *dev_replace)
{
/* only set blocking for read lock */
- ASSERT(atomic_read(&dev_replace->read_locks) > 0);
atomic_inc(&dev_replace->blocking_readers);
read_unlock(&dev_replace->lock);
}
-/* acquire read lock and dec blocking cnt */
-void btrfs_dev_replace_clear_lock_blocking(
- struct btrfs_dev_replace *dev_replace)
-{
- /* only set blocking for read lock */
- ASSERT(atomic_read(&dev_replace->read_locks) > 0);
- ASSERT(atomic_read(&dev_replace->blocking_readers) > 0);
- read_lock(&dev_replace->lock);
- /* Barrier implied by atomic_dec_and_test */
- if (atomic_dec_and_test(&dev_replace->blocking_readers))
- cond_wake_up_nomb(&dev_replace->read_lock_wq);
-}
-
void btrfs_bio_counter_inc_noblocked(struct btrfs_fs_info *fs_info)
{
- percpu_counter_inc(&fs_info->bio_counter);
+ percpu_counter_inc(&fs_info->dev_replace.bio_counter);
}
void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
{
- percpu_counter_sub(&fs_info->bio_counter, amount);
- cond_wake_up_nomb(&fs_info->replace_wait);
+ percpu_counter_sub(&fs_info->dev_replace.bio_counter, amount);
+ cond_wake_up_nomb(&fs_info->dev_replace.replace_wait);
}
void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
{
while (1) {
- percpu_counter_inc(&fs_info->bio_counter);
+ percpu_counter_inc(&fs_info->dev_replace.bio_counter);
if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
&fs_info->fs_state)))
break;
btrfs_bio_counter_dec(fs_info);
- wait_event(fs_info->replace_wait,
+ wait_event(fs_info->dev_replace.replace_wait,
!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
&fs_info->fs_state));
}
diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h
index b6d4206188bb..795c551f5b5e 100644
--- a/fs/btrfs/dev-replace.h
+++ b/fs/btrfs/dev-replace.h
@@ -11,7 +11,6 @@ struct btrfs_ioctl_dev_replace_args;
int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info);
int btrfs_run_dev_replace(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info);
-void btrfs_after_dev_replace_commit(struct btrfs_fs_info *fs_info);
int btrfs_dev_replace_by_ioctl(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_dev_replace_args *args);
int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
@@ -28,12 +27,5 @@ void btrfs_dev_replace_read_unlock(struct btrfs_dev_replace *dev_replace);
void btrfs_dev_replace_write_lock(struct btrfs_dev_replace *dev_replace);
void btrfs_dev_replace_write_unlock(struct btrfs_dev_replace *dev_replace);
void btrfs_dev_replace_set_lock_blocking(struct btrfs_dev_replace *dev_replace);
-void btrfs_dev_replace_clear_lock_blocking(
- struct btrfs_dev_replace *dev_replace);
-
-static inline void btrfs_dev_replace_stats_inc(atomic64_t *stat_value)
-{
- atomic64_inc(stat_value);
-}
#endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index a678b07fcf01..8de74d835dba 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -105,13 +105,13 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
* to use for the second index (if one is created).
* Will return 0 or -ENOMEM
*/
-int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
- *root, const char *name, int name_len,
- struct btrfs_inode *dir, struct btrfs_key *location,
- u8 type, u64 index)
+int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name,
+ int name_len, struct btrfs_inode *dir,
+ struct btrfs_key *location, u8 type, u64 index)
{
int ret = 0;
int ret2 = 0;
+ struct btrfs_root *root = dir->root;
struct btrfs_path *path;
struct btrfs_dir_item *dir_item;
struct extent_buffer *leaf;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5124c15705ce..b0ab41da91d1 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -125,8 +125,8 @@ struct async_submit_bio {
* Different roots are used for different purposes and may nest inside each
* other and they require separate keysets. As lockdep keys should be
* static, assign keysets according to the purpose of the root as indicated
- * by btrfs_root->objectid. This ensures that all special purpose roots
- * have separate keysets.
+ * by btrfs_root->root_key.objectid. This ensures that all special purpose
+ * roots have separate keysets.
*
* Lock-nesting across peer nodes is always done with the immediate parent
* node locked thus preventing deadlock. As lockdep doesn't know this, use
@@ -1148,7 +1148,6 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->state = 0;
root->orphan_cleanup_state = 0;
- root->objectid = objectid;
root->last_trans = 0;
root->highest_objectid = 0;
root->nr_delalloc_inodes = 0;
@@ -1187,6 +1186,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
atomic_set(&root->log_batch, 0);
refcount_set(&root->refs, 1);
atomic_set(&root->will_be_snapshotted, 0);
+ atomic_set(&root->snapshot_force_cow, 0);
root->log_transid = 0;
root->log_transid_committed = -1;
root->last_log_commit = 0;
@@ -2155,9 +2155,8 @@ static void btrfs_init_dev_replace_locks(struct btrfs_fs_info *fs_info)
{
mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
rwlock_init(&fs_info->dev_replace.lock);
- atomic_set(&fs_info->dev_replace.read_locks, 0);
atomic_set(&fs_info->dev_replace.blocking_readers, 0);
- init_waitqueue_head(&fs_info->replace_wait);
+ init_waitqueue_head(&fs_info->dev_replace.replace_wait);
init_waitqueue_head(&fs_info->dev_replace.read_lock_wq);
}
@@ -2647,7 +2646,8 @@ int open_ctree(struct super_block *sb,
goto fail_dirty_metadata_bytes;
}
- ret = percpu_counter_init(&fs_info->bio_counter, 0, GFP_KERNEL);
+ ret = percpu_counter_init(&fs_info->dev_replace.bio_counter, 0,
+ GFP_KERNEL);
if (ret) {
err = ret;
goto fail_delalloc_bytes;
@@ -3308,7 +3308,7 @@ fail_iput:
iput(fs_info->btree_inode);
fail_bio_counter:
- percpu_counter_destroy(&fs_info->bio_counter);
+ percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
fail_delalloc_bytes:
percpu_counter_destroy(&fs_info->delalloc_bytes);
fail_dirty_metadata_bytes:
@@ -3976,6 +3976,7 @@ void close_ctree(struct btrfs_fs_info *fs_info)
kthread_stop(fs_info->transaction_kthread);
kthread_stop(fs_info->cleaner_kthread);
+ ASSERT(list_empty(&fs_info->delayed_iputs));
set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags);
btrfs_free_qgroup_config(fs_info);
@@ -4017,7 +4018,7 @@ void close_ctree(struct btrfs_fs_info *fs_info)
percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
percpu_counter_destroy(&fs_info->delalloc_bytes);
- percpu_counter_destroy(&fs_info->bio_counter);
+ percpu_counter_destroy(&fs_info->dev_replace.bio_counter);
cleanup_srcu_struct(&fs_info->subvol_srcu);
btrfs_free_stripe_hash_table(fs_info);
@@ -4203,7 +4204,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
return ret;
}
- while ((node = rb_first(&delayed_refs->href_root)) != NULL) {
+ while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) {
struct btrfs_delayed_ref_head *head;
struct rb_node *n;
bool pin_bytes = false;
@@ -4221,11 +4222,11 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
continue;
}
spin_lock(&head->lock);
- while ((n = rb_first(&head->ref_tree)) != NULL) {
+ while ((n = rb_first_cached(&head->ref_tree)) != NULL) {
ref = rb_entry(n, struct btrfs_delayed_ref_node,
ref_node);
ref->in_tree = 0;
- rb_erase(&ref->ref_node, &head->ref_tree);
+ rb_erase_cached(&ref->ref_node, &head->ref_tree);
RB_CLEAR_NODE(&ref->ref_node);
if (!list_empty(&ref->add_list))
list_del(&ref->add_list);
@@ -4239,7 +4240,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
if (head->processing == 0)
delayed_refs->num_heads_ready--;
atomic_dec(&delayed_refs->num_entries);
- rb_erase(&head->href_node, &delayed_refs->href_root);
+ rb_erase_cached(&head->href_node, &delayed_refs->href_root);
RB_CLEAR_NODE(&head->href_node);
spin_unlock(&head->lock);
spin_unlock(&delayed_refs->lock);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 1f3755b3a37a..ddf28ecf17f9 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -33,7 +33,7 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
type = FILEID_BTRFS_WITHOUT_PARENT;
fid->objectid = btrfs_ino(BTRFS_I(inode));
- fid->root_objectid = BTRFS_I(inode)->root->objectid;
+ fid->root_objectid = BTRFS_I(inode)->root->root_key.objectid;
fid->gen = inode->i_generation;
if (parent) {
@@ -41,7 +41,7 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
fid->parent_objectid = BTRFS_I(parent)->location.objectid;
fid->parent_gen = parent->i_generation;
- parent_root_id = BTRFS_I(parent)->root->objectid;
+ parent_root_id = BTRFS_I(parent)->root->root_key.objectid;
if (parent_root_id != fid->root_objectid) {
fid->parent_root_objectid = parent_root_id;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index de6f75f5547b..a4cd0221bc8d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2374,7 +2374,7 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head)
{
struct btrfs_delayed_ref_node *ref;
- if (RB_EMPTY_ROOT(&head->ref_tree))
+ if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
return NULL;
/*
@@ -2387,7 +2387,7 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head)
return list_first_entry(&head->ref_add_list,
struct btrfs_delayed_ref_node, add_list);
- ref = rb_entry(rb_first(&head->ref_tree),
+ ref = rb_entry(rb_first_cached(&head->ref_tree),
struct btrfs_delayed_ref_node, ref_node);
ASSERT(list_empty(&ref->add_list));
return ref;
@@ -2448,13 +2448,13 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
spin_unlock(&head->lock);
spin_lock(&delayed_refs->lock);
spin_lock(&head->lock);
- if (!RB_EMPTY_ROOT(&head->ref_tree) || head->extent_op) {
+ if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
spin_unlock(&head->lock);
spin_unlock(&delayed_refs->lock);
return 1;
}
delayed_refs->num_heads--;
- rb_erase(&head->href_node, &delayed_refs->href_root);
+ rb_erase_cached(&head->href_node, &delayed_refs->href_root);
RB_CLEAR_NODE(&head->href_node);
spin_unlock(&head->lock);
spin_unlock(&delayed_refs->lock);
@@ -2502,102 +2502,66 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
return 0;
}
-/*
- * Returns 0 on success or if called with an already aborted transaction.
- * Returns -ENOMEM or -EIO on failure and will abort the transaction.
- */
-static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
- unsigned long nr)
+static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
+ struct btrfs_trans_handle *trans)
+{
+ struct btrfs_delayed_ref_root *delayed_refs =
+ &trans->transaction->delayed_refs;
+ struct btrfs_delayed_ref_head *head = NULL;
+ int ret;
+
+ spin_lock(&delayed_refs->lock);
+ head = btrfs_select_ref_head(delayed_refs);
+ if (!head) {
+ spin_unlock(&delayed_refs->lock);
+ return head;
+ }
+
+ /*
+ * Grab the lock that says we are going to process all the refs for
+ * this head
+ */
+ ret = btrfs_delayed_ref_lock(delayed_refs, head);
+ spin_unlock(&delayed_refs->lock);
+
+ /*
+ * We may have dropped the spin lock to get the head mutex lock, and
+ * that might have given someone else time to free the head. If that's
+ * true, it has been removed from our list and we can move on.
+ */
+ if (ret == -EAGAIN)
+ head = ERR_PTR(-EAGAIN);
+
+ return head;
+}
+
+static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *locked_ref,
+ unsigned long *run_refs)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_delayed_ref_root *delayed_refs;
- struct btrfs_delayed_ref_node *ref;
- struct btrfs_delayed_ref_head *locked_ref = NULL;
struct btrfs_delayed_extent_op *extent_op;
- ktime_t start = ktime_get();
- int ret;
- unsigned long count = 0;
- unsigned long actual_count = 0;
+ struct btrfs_delayed_ref_node *ref;
int must_insert_reserved = 0;
+ int ret;
delayed_refs = &trans->transaction->delayed_refs;
- while (1) {
- if (!locked_ref) {
- if (count >= nr)
- break;
-
- spin_lock(&delayed_refs->lock);
- locked_ref = btrfs_select_ref_head(trans);
- if (!locked_ref) {
- spin_unlock(&delayed_refs->lock);
- break;
- }
- /* grab the lock that says we are going to process
- * all the refs for this head */
- ret = btrfs_delayed_ref_lock(trans, locked_ref);
- spin_unlock(&delayed_refs->lock);
- /*
- * we may have dropped the spin lock to get the head
- * mutex lock, and that might have given someone else
- * time to free the head. If that's true, it has been
- * removed from our list and we can move on.
- */
- if (ret == -EAGAIN) {
- locked_ref = NULL;
- count++;
- continue;
- }
- }
+ lockdep_assert_held(&locked_ref->mutex);
+ lockdep_assert_held(&locked_ref->lock);
- /*
- * We need to try and merge add/drops of the same ref since we
- * can run into issues with relocate dropping the implicit ref
- * and then it being added back again before the drop can
- * finish. If we merged anything we need to re-loop so we can
- * get a good ref.
- * Or we can get node references of the same type that weren't
- * merged when created due to bumps in the tree mod seq, and
- * we need to merge them to prevent adding an inline extent
- * backref before dropping it (triggering a BUG_ON at
- * insert_inline_extent_backref()).
- */
- spin_lock(&locked_ref->lock);
- btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
-
- ref = select_delayed_ref(locked_ref);
-
- if (ref && ref->seq &&
+ while ((ref = select_delayed_ref(locked_ref))) {
+ if (ref->seq &&
btrfs_check_delayed_seq(fs_info, ref->seq)) {
spin_unlock(&locked_ref->lock);
unselect_delayed_ref_head(delayed_refs, locked_ref);
- locked_ref = NULL;
- cond_resched();
- count++;
- continue;
- }
-
- /*
- * We're done processing refs in this ref_head, clean everything
- * up and move on to the next ref_head.
- */
- if (!ref) {
- ret = cleanup_ref_head(trans, locked_ref);
- if (ret > 0 ) {
- /* We dropped our lock, we need to loop. */
- ret = 0;
- continue;
- } else if (ret) {
- return ret;
- }
- locked_ref = NULL;
- count++;
- continue;
+ return -EAGAIN;
}
- actual_count++;
+ (*run_refs)++;
ref->in_tree = 0;
- rb_erase(&ref->ref_node, &locked_ref->ref_tree);
+ rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
RB_CLEAR_NODE(&ref->ref_node);
if (!list_empty(&ref->add_list))
list_del(&ref->add_list);
@@ -2619,8 +2583,8 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
atomic_dec(&delayed_refs->num_entries);
/*
- * Record the must-insert_reserved flag before we drop the spin
- * lock.
+ * Record the must_insert_reserved flag before we drop the
+ * spin lock.
*/
must_insert_reserved = locked_ref->must_insert_reserved;
locked_ref->must_insert_reserved = 0;
@@ -2642,10 +2606,90 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
}
btrfs_put_delayed_ref(ref);
- count++;
cond_resched();
+
+ spin_lock(&locked_ref->lock);
+ btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
}
+ return 0;
+}
+
+/*
+ * Returns 0 on success or if called with an already aborted transaction.
+ * Returns -ENOMEM or -EIO on failure and will abort the transaction.
+ */
+static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+ unsigned long nr)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_head *locked_ref = NULL;
+ ktime_t start = ktime_get();
+ int ret;
+ unsigned long count = 0;
+ unsigned long actual_count = 0;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ do {
+ if (!locked_ref) {
+ locked_ref = btrfs_obtain_ref_head(trans);
+ if (IS_ERR_OR_NULL(locked_ref)) {
+ if (PTR_ERR(locked_ref) == -EAGAIN) {
+ continue;
+ } else {
+ break;
+ }
+ }
+ count++;
+ }
+ /*
+ * We need to try and merge add/drops of the same ref since we
+ * can run into issues with relocate dropping the implicit ref
+ * and then it being added back again before the drop can
+ * finish. If we merged anything we need to re-loop so we can
+ * get a good ref.
+ * Or we can get node references of the same type that weren't
+ * merged when created due to bumps in the tree mod seq, and
+ * we need to merge them to prevent adding an inline extent
+ * backref before dropping it (triggering a BUG_ON at
+ * insert_inline_extent_backref()).
+ */
+ spin_lock(&locked_ref->lock);
+ btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
+
+ ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
+ &actual_count);
+ if (ret < 0 && ret != -EAGAIN) {
+ /*
+ * Error, btrfs_run_delayed_refs_for_head already
+ * unlocked everything so just bail out
+ */
+ return ret;
+ } else if (!ret) {
+ /*
+ * Success, perform the usual cleanup of a processed
+ * head
+ */
+ ret = cleanup_ref_head(trans, locked_ref);
+ if (ret > 0 ) {
+ /* We dropped our lock, we need to loop. */
+ ret = 0;
+ continue;
+ } else if (ret) {
+ return ret;
+ }
+ }
+
+ /*
+ * Either success case or btrfs_run_delayed_refs_for_head
+ * returned -EAGAIN, meaning we need to select another head
+ */
+
+ locked_ref = NULL;
+ cond_resched();
+ } while ((nr != -1 && count < nr) || locked_ref);
+
/*
* We don't want to include ref heads since we can have empty ref heads
* and those will drastically skew our runtime down since we just do
@@ -2745,9 +2789,9 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
return num_csums;
}
-int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
+int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_block_rsv *global_rsv;
u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
@@ -2782,8 +2826,7 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
return ret;
}
-int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info)
+int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
{
u64 num_entries =
atomic_read(&trans->transaction->delayed_refs.num_entries);
@@ -2791,14 +2834,14 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
u64 val;
smp_mb();
- avg_runtime = fs_info->avg_delayed_ref_runtime;
+ avg_runtime = trans->fs_info->avg_delayed_ref_runtime;
val = num_entries * avg_runtime;
if (val >= NSEC_PER_SEC)
return 1;
if (val >= NSEC_PER_SEC / 2)
return 2;
- return btrfs_check_space_for_delayed_refs(trans, fs_info);
+ return btrfs_check_space_for_delayed_refs(trans);
}
struct async_delayed_refs {
@@ -2940,7 +2983,7 @@ again:
btrfs_create_pending_block_groups(trans);
spin_lock(&delayed_refs->lock);
- node = rb_first(&delayed_refs->href_root);
+ node = rb_first_cached(&delayed_refs->href_root);
if (!node) {
spin_unlock(&delayed_refs->lock);
goto out;
@@ -3040,7 +3083,8 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
* XXX: We should replace this with a proper search function in the
* future.
*/
- for (node = rb_first(&head->ref_tree); node; node = rb_next(node)) {
+ for (node = rb_first_cached(&head->ref_tree); node;
+ node = rb_next(node)) {
ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
/* If it's a shared ref we know a cross reference exists */
if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
@@ -3139,7 +3183,6 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
{
struct btrfs_path *path;
int ret;
- int ret2;
path = btrfs_alloc_path();
if (!path)
@@ -3151,17 +3194,9 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
if (ret && ret != -ENOENT)
goto out;
- ret2 = check_delayed_ref(root, path, objectid,
- offset, bytenr);
- } while (ret2 == -EAGAIN);
+ ret = check_delayed_ref(root, path, objectid, offset, bytenr);
+ } while (ret == -EAGAIN);
- if (ret2 && ret2 != -ENOENT) {
- ret = ret2;
- goto out;
- }
-
- if (ret != -ENOENT || ret2 != -ENOENT)
- ret = 0;
out:
btrfs_free_path(path);
if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
@@ -5284,7 +5319,7 @@ static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
}
static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
- u64 num_bytes, int update_size)
+ u64 num_bytes, bool update_size)
{
spin_lock(&block_rsv->lock);
block_rsv->reserved += num_bytes;
@@ -5316,7 +5351,7 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
global_rsv->full = 0;
spin_unlock(&global_rsv->lock);
- block_rsv_add_bytes(dest, num_bytes, 1);
+ block_rsv_add_bytes(dest, num_bytes, true);
return 0;
}
@@ -5479,7 +5514,7 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
struct btrfs_block_rsv *dst, u64 num_bytes,
- int update_size)
+ bool update_size)
{
int ret;
@@ -5539,10 +5574,8 @@ int btrfs_block_rsv_add(struct btrfs_root *root,
return 0;
ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
- if (!ret) {
- block_rsv_add_bytes(block_rsv, num_bytes, 1);
- return 0;
- }
+ if (!ret)
+ block_rsv_add_bytes(block_rsv, num_bytes, true);
return ret;
}
@@ -5587,7 +5620,7 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
if (!ret) {
- block_rsv_add_bytes(block_rsv, num_bytes, 0);
+ block_rsv_add_bytes(block_rsv, num_bytes, false);
return 0;
}
@@ -5629,7 +5662,7 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
return ret;
ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
if (!ret) {
- block_rsv_add_bytes(block_rsv, num_bytes, 0);
+ block_rsv_add_bytes(block_rsv, num_bytes, false);
trace_btrfs_space_reservation(root->fs_info, "delalloc",
btrfs_ino(inode), num_bytes, 1);
@@ -5800,7 +5833,7 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
* root: the root of the parent directory
* rsv: block reservation
* items: the number of items that we need do reservation
- * qgroup_reserved: used to return the reserved size in qgroup
+ * use_global_rsv: allow fallback to the global block reservation
*
* This function is used to reserve the space for snapshot/subvolume
* creation and deletion. Those operations are different with the
@@ -5810,10 +5843,10 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
* the space reservation mechanism in start_transaction().
*/
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
- struct btrfs_block_rsv *rsv,
- int items,
+ struct btrfs_block_rsv *rsv, int items,
bool use_global_rsv)
{
+ u64 qgroup_num_bytes = 0;
u64 num_bytes;
int ret;
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -5821,12 +5854,11 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
/* One for parent inode, two for dir entries */
- num_bytes = 3 * fs_info->nodesize;
- ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
+ qgroup_num_bytes = 3 * fs_info->nodesize;
+ ret = btrfs_qgroup_reserve_meta_prealloc(root,
+ qgroup_num_bytes, true);
if (ret)
return ret;
- } else {
- num_bytes = 0;
}
num_bytes = btrfs_calc_trans_metadata_size(fs_info, items);
@@ -5836,10 +5868,10 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
BTRFS_RESERVE_FLUSH_ALL);
if (ret == -ENOSPC && use_global_rsv)
- ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, 1);
+ ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes, true);
- if (ret && num_bytes)
- btrfs_qgroup_free_meta_prealloc(root, num_bytes);
+ if (ret && qgroup_num_bytes)
+ btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes);
return ret;
}
@@ -6400,10 +6432,6 @@ static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
} else {
cache->reserved += num_bytes;
space_info->bytes_reserved += num_bytes;
-
- trace_btrfs_space_reservation(cache->fs_info,
- "space_info", space_info->flags,
- ram_bytes, 0);
space_info->bytes_may_use -= ram_bytes;
if (delalloc)
cache->delalloc_bytes += num_bytes;
@@ -6425,11 +6453,10 @@ static int btrfs_add_reserved_bytes(struct btrfs_block_group_cache *cache,
* reserve set to 0 in order to clear the reservation.
*/
-static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
- u64 num_bytes, int delalloc)
+static void btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
+ u64 num_bytes, int delalloc)
{
struct btrfs_space_info *space_info = cache->space_info;
- int ret = 0;
spin_lock(&space_info->lock);
spin_lock(&cache->lock);
@@ -6442,7 +6469,6 @@ static int btrfs_free_reserved_bytes(struct btrfs_block_group_cache *cache,
cache->delalloc_bytes -= num_bytes;
spin_unlock(&cache->lock);
spin_unlock(&space_info->lock);
- return ret;
}
void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
{
@@ -6926,7 +6952,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
goto out_delayed_unlock;
spin_lock(&head->lock);
- if (!RB_EMPTY_ROOT(&head->ref_tree))
+ if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
goto out;
if (head->extent_op) {
@@ -6947,7 +6973,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
* at this point we have a head with no other entries. Go
* ahead and process it.
*/
- rb_erase(&head->href_node, &delayed_refs->href_root);
+ rb_erase_cached(&head->href_node, &delayed_refs->href_root);
RB_CLEAR_NODE(&head->href_node);
atomic_dec(&delayed_refs->num_entries);
@@ -8120,6 +8146,19 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
if (IS_ERR(buf))
return buf;
+ /*
+ * Extra safety check in case the extent tree is corrupted and extent
+ * allocator chooses to use a tree block which is already used and
+ * locked.
+ */
+ if (buf->lock_owner == current->pid) {
+ btrfs_err_rl(fs_info,
+"tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
+ buf->start, btrfs_header_owner(buf), current->pid);
+ free_extent_buffer(buf);
+ return ERR_PTR(-EUCLEAN);
+ }
+
btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
btrfs_tree_lock(buf);
clean_tree_block(fs_info, buf);
@@ -8216,7 +8255,7 @@ try_reserve:
static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv, u32 blocksize)
{
- block_rsv_add_bytes(block_rsv, blocksize, 0);
+ block_rsv_add_bytes(block_rsv, blocksize, false);
block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL);
}
@@ -8643,7 +8682,13 @@ skip:
parent = 0;
}
- if (need_account) {
+ /*
+ * Reloc tree doesn't contribute to qgroup numbers, and we have
+ * already accounted them at merge time (replace_path),
+ * thus we could skip expensive subtree trace here.
+ */
+ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
+ need_account) {
ret = btrfs_qgroup_trace_subtree(trans, next,
generation, level - 1);
if (ret) {
@@ -8764,15 +8809,14 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
if (eb == root->node) {
if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
parent = eb->start;
- else
- BUG_ON(root->root_key.objectid !=
- btrfs_header_owner(eb));
+ else if (root->root_key.objectid != btrfs_header_owner(eb))
+ goto owner_mismatch;
} else {
if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
parent = path->nodes[level + 1]->start;
- else
- BUG_ON(root->root_key.objectid !=
- btrfs_header_owner(path->nodes[level + 1]));
+ else if (root->root_key.objectid !=
+ btrfs_header_owner(path->nodes[level + 1]))
+ goto owner_mismatch;
}
btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
@@ -8780,6 +8824,11 @@ out:
wc->refs[level] = 0;
wc->flags[level] = 0;
return 0;
+
+owner_mismatch:
+ btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
+ btrfs_header_owner(eb), root->root_key.objectid);
+ return -EUCLEAN;
}
static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
@@ -8833,6 +8882,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
ret = walk_up_proc(trans, root, path, wc);
if (ret > 0)
return 0;
+ if (ret < 0)
+ return ret;
if (path->locks[level]) {
btrfs_tree_unlock_rw(path->nodes[level],
@@ -8876,7 +8927,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
int level;
bool root_dropped = false;
- btrfs_debug(fs_info, "Drop subvolume %llu", root->objectid);
+ btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
path = btrfs_alloc_path();
if (!path) {
@@ -9614,6 +9665,7 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
block_group = btrfs_lookup_first_block_group(info, last);
while (block_group) {
+ wait_block_group_cache_done(block_group);
spin_lock(&block_group->lock);
if (block_group->iref)
break;
@@ -10075,7 +10127,7 @@ error:
void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- struct btrfs_block_group_cache *block_group, *tmp;
+ struct btrfs_block_group_cache *block_group;
struct btrfs_root *extent_root = fs_info->extent_root;
struct btrfs_block_group_item item;
struct btrfs_key key;
@@ -10083,7 +10135,10 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
bool can_flush_pending_bgs = trans->can_flush_pending_bgs;
trans->can_flush_pending_bgs = false;
- list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) {
+ while (!list_empty(&trans->new_bgs)) {
+ block_group = list_first_entry(&trans->new_bgs,
+ struct btrfs_block_group_cache,
+ bg_list);
if (ret)
goto next;
@@ -10754,14 +10809,16 @@ int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
* We don't want a transaction for this since the discard may take a
* substantial amount of time. We don't require that a transaction be
* running, but we do need to take a running transaction into account
- * to ensure that we're not discarding chunks that were released in
- * the current transaction.
+ * to ensure that we're not discarding chunks that were released or
+ * allocated in the current transaction.
*
* Holding the chunks lock will prevent other threads from allocating
* or releasing chunks, but it won't prevent a running transaction
* from committing and releasing the memory that the pending chunks
* list head uses. For that, we need to take a reference to the
- * transaction.
+ * transaction and hold the commit root sem. We only need to hold
+ * it while performing the free space search since we have already
+ * held back allocations.
*/
static int btrfs_trim_free_extents(struct btrfs_device *device,
u64 minlen, u64 *trimmed)
@@ -10771,6 +10828,10 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
*trimmed = 0;
+ /* Discard not supported = nothing to do. */
+ if (!blk_queue_discard(bdev_get_queue(device->bdev)))
+ return 0;
+
/* Not writeable = nothing to do. */
if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
return 0;
@@ -10788,9 +10849,13 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
if (ret)
- return ret;
+ break;
- down_read(&fs_info->commit_root_sem);
+ ret = down_read_killable(&fs_info->commit_root_sem);
+ if (ret) {
+ mutex_unlock(&fs_info->chunk_mutex);
+ break;
+ }
spin_lock(&fs_info->trans_lock);
trans = fs_info->running_transaction;
@@ -10798,13 +10863,17 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
refcount_inc(&trans->use_count);
spin_unlock(&fs_info->trans_lock);
+ if (!trans)
+ up_read(&fs_info->commit_root_sem);
+
ret = find_free_dev_extent_start(trans, device, minlen, start,
&start, &len);
- if (trans)
+ if (trans) {
+ up_read(&fs_info->commit_root_sem);
btrfs_put_transaction(trans);
+ }
if (ret) {
- up_read(&fs_info->commit_root_sem);
mutex_unlock(&fs_info->chunk_mutex);
if (ret == -ENOSPC)
ret = 0;
@@ -10812,7 +10881,6 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
}
ret = btrfs_issue_discard(device->bdev, start, len, &bytes);
- up_read(&fs_info->commit_root_sem);
mutex_unlock(&fs_info->chunk_mutex);
if (ret)
@@ -10832,6 +10900,15 @@ static int btrfs_trim_free_extents(struct btrfs_device *device,
return ret;
}
+/*
+ * Trim the whole filesystem by:
+ * 1) trimming the free space in each block group
+ * 2) trimming the unallocated space on each device
+ *
+ * This will also continue trimming even if a block group or device encounters
+ * an error. The return value will be the last error, or 0 if nothing bad
+ * happens.
+ */
int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
{
struct btrfs_block_group_cache *cache = NULL;
@@ -10841,18 +10918,14 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
u64 start;
u64 end;
u64 trimmed = 0;
- u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
+ u64 bg_failed = 0;
+ u64 dev_failed = 0;
+ int bg_ret = 0;
+ int dev_ret = 0;
int ret = 0;
- /*
- * try to trim all FS space, our block group may start from non-zero.
- */
- if (range->len == total_bytes)
- cache = btrfs_lookup_first_block_group(fs_info, range->start);
- else
- cache = btrfs_lookup_block_group(fs_info, range->start);
-
- while (cache) {
+ cache = btrfs_lookup_first_block_group(fs_info, range->start);
+ for (; cache; cache = next_block_group(fs_info, cache)) {
if (cache->key.objectid >= (range->start + range->len)) {
btrfs_put_block_group(cache);
break;
@@ -10866,13 +10939,15 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
if (!block_group_cache_done(cache)) {
ret = cache_block_group(cache, 0);
if (ret) {
- btrfs_put_block_group(cache);
- break;
+ bg_failed++;
+ bg_ret = ret;
+ continue;
}
ret = wait_block_group_cache_done(cache);
if (ret) {
- btrfs_put_block_group(cache);
- break;
+ bg_failed++;
+ bg_ret = ret;
+ continue;
}
}
ret = btrfs_trim_block_group(cache,
@@ -10883,28 +10958,40 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
trimmed += group_trimmed;
if (ret) {
- btrfs_put_block_group(cache);
- break;
+ bg_failed++;
+ bg_ret = ret;
+ continue;
}
}
-
- cache = next_block_group(fs_info, cache);
}
+ if (bg_failed)
+ btrfs_warn(fs_info,
+ "failed to trim %llu block group(s), last error %d",
+ bg_failed, bg_ret);
mutex_lock(&fs_info->fs_devices->device_list_mutex);
- devices = &fs_info->fs_devices->alloc_list;
- list_for_each_entry(device, devices, dev_alloc_list) {
+ devices = &fs_info->fs_devices->devices;
+ list_for_each_entry(device, devices, dev_list) {
ret = btrfs_trim_free_extents(device, range->minlen,
&group_trimmed);
- if (ret)
+ if (ret) {
+ dev_failed++;
+ dev_ret = ret;
break;
+ }
trimmed += group_trimmed;
}
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+ if (dev_failed)
+ btrfs_warn(fs_info,
+ "failed to trim %llu device(s), last error %d",
+ dev_failed, dev_ret);
range->len = trimmed;
- return ret;
+ if (bg_ret)
+ return bg_ret;
+ return dev_ret;
}
/*
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4dd6faab02bb..6877a74c7469 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1424,20 +1424,15 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
struct extent_state **cached_state)
{
struct extent_state *state;
- struct rb_node *n;
int ret = 1;
spin_lock(&tree->lock);
if (cached_state && *cached_state) {
state = *cached_state;
if (state->end == start - 1 && extent_state_in_tree(state)) {
- n = rb_next(&state->rb_node);
- while (n) {
- state = rb_entry(n, struct extent_state,
- rb_node);
+ while ((state = next_state(state)) != NULL) {
if (state->state & bits)
goto got_it;
- n = rb_next(n);
}
free_extent_state(*cached_state);
*cached_state = NULL;
@@ -1568,7 +1563,7 @@ static noinline int lock_delalloc_pages(struct inode *inode,
*
* 1 is returned if we find something, 0 if nothing was in the tree
*/
-STATIC u64 find_lock_delalloc_range(struct inode *inode,
+static noinline_for_stack u64 find_lock_delalloc_range(struct inode *inode,
struct extent_io_tree *tree,
struct page *locked_page, u64 *start,
u64 *end, u64 max_bytes)
@@ -1648,6 +1643,17 @@ out_failed:
return found;
}
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+u64 btrfs_find_lock_delalloc_range(struct inode *inode,
+ struct extent_io_tree *tree,
+ struct page *locked_page, u64 *start,
+ u64 *end, u64 max_bytes)
+{
+ return find_lock_delalloc_range(inode, tree, locked_page, start, end,
+ max_bytes);
+}
+#endif
+
static int __process_pages_contig(struct address_space *mapping,
struct page *locked_page,
pgoff_t start_index, pgoff_t end_index,
@@ -5165,11 +5171,11 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb)
WARN_ON(atomic_read(&eb->refs) == 0);
}
-int set_extent_buffer_dirty(struct extent_buffer *eb)
+bool set_extent_buffer_dirty(struct extent_buffer *eb)
{
int i;
int num_pages;
- int was_dirty = 0;
+ bool was_dirty;
check_buffer_tree_ref(eb);
@@ -5179,8 +5185,15 @@ int set_extent_buffer_dirty(struct extent_buffer *eb)
WARN_ON(atomic_read(&eb->refs) == 0);
WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
+ if (!was_dirty)
+ for (i = 0; i < num_pages; i++)
+ set_page_dirty(eb->pages[i]);
+
+#ifdef CONFIG_BTRFS_DEBUG
for (i = 0; i < num_pages; i++)
- set_page_dirty(eb->pages[i]);
+ ASSERT(PageDirty(eb->pages[i]));
+#endif
+
return was_dirty;
}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index b4d03e677e1d..369daa5d4f73 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -479,7 +479,7 @@ void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
unsigned long pos, unsigned long len);
void clear_extent_buffer_dirty(struct extent_buffer *eb);
-int set_extent_buffer_dirty(struct extent_buffer *eb);
+bool set_extent_buffer_dirty(struct extent_buffer *eb);
void set_extent_buffer_uptodate(struct extent_buffer *eb);
void clear_extent_buffer_uptodate(struct extent_buffer *eb);
int extent_buffer_under_io(struct extent_buffer *eb);
@@ -546,7 +546,7 @@ int free_io_failure(struct extent_io_tree *failure_tree,
struct extent_io_tree *io_tree,
struct io_failure_record *rec);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-noinline u64 find_lock_delalloc_range(struct inode *inode,
+u64 btrfs_find_lock_delalloc_range(struct inode *inode,
struct extent_io_tree *tree,
struct page *locked_page, u64 *start,
u64 *end, u64 max_bytes);
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 6648d55e5339..7eea8b6e2cd3 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -34,7 +34,7 @@ void __cold extent_map_exit(void)
*/
void extent_map_tree_init(struct extent_map_tree *tree)
{
- tree->map = RB_ROOT;
+ tree->map = RB_ROOT_CACHED;
INIT_LIST_HEAD(&tree->modified_extents);
rwlock_init(&tree->lock);
}
@@ -90,24 +90,27 @@ static u64 range_end(u64 start, u64 len)
return start + len;
}
-static int tree_insert(struct rb_root *root, struct extent_map *em)
+static int tree_insert(struct rb_root_cached *root, struct extent_map *em)
{
- struct rb_node **p = &root->rb_node;
+ struct rb_node **p = &root->rb_root.rb_node;
struct rb_node *parent = NULL;
struct extent_map *entry = NULL;
struct rb_node *orig_parent = NULL;
u64 end = range_end(em->start, em->len);
+ bool leftmost = true;
while (*p) {
parent = *p;
entry = rb_entry(parent, struct extent_map, rb_node);
- if (em->start < entry->start)
+ if (em->start < entry->start) {
p = &(*p)->rb_left;
- else if (em->start >= extent_map_end(entry))
+ } else if (em->start >= extent_map_end(entry)) {
p = &(*p)->rb_right;
- else
+ leftmost = false;
+ } else {
return -EEXIST;
+ }
}
orig_parent = parent;
@@ -130,7 +133,7 @@ static int tree_insert(struct rb_root *root, struct extent_map *em)
return -EEXIST;
rb_link_node(&em->rb_node, orig_parent, p);
- rb_insert_color(&em->rb_node, root);
+ rb_insert_color_cached(&em->rb_node, root, leftmost);
return 0;
}
@@ -242,7 +245,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
em->mod_start = merge->mod_start;
em->generation = max(em->generation, merge->generation);
- rb_erase(&merge->rb_node, &tree->map);
+ rb_erase_cached(&merge->rb_node, &tree->map);
RB_CLEAR_NODE(&merge->rb_node);
free_extent_map(merge);
}
@@ -254,7 +257,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
if (rb && mergable_maps(em, merge)) {
em->len += merge->len;
em->block_len += merge->block_len;
- rb_erase(&merge->rb_node, &tree->map);
+ rb_erase_cached(&merge->rb_node, &tree->map);
RB_CLEAR_NODE(&merge->rb_node);
em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start;
em->generation = max(em->generation, merge->generation);
@@ -367,7 +370,7 @@ __lookup_extent_mapping(struct extent_map_tree *tree,
struct rb_node *next = NULL;
u64 end = range_end(start, len);
- rb_node = __tree_search(&tree->map, start, &prev, &next);
+ rb_node = __tree_search(&tree->map.rb_root, start, &prev, &next);
if (!rb_node) {
if (prev)
rb_node = prev;
@@ -428,16 +431,13 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
* Removes @em from @tree. No reference counts are dropped, and no checks
* are done to see if the range is in use
*/
-int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
+void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
{
- int ret = 0;
-
WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags));
- rb_erase(&em->rb_node, &tree->map);
+ rb_erase_cached(&em->rb_node, &tree->map);
if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
list_del_init(&em->list);
RB_CLEAR_NODE(&em->rb_node);
- return ret;
}
void replace_extent_mapping(struct extent_map_tree *tree,
@@ -449,7 +449,7 @@ void replace_extent_mapping(struct extent_map_tree *tree,
ASSERT(extent_map_in_tree(cur));
if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags))
list_del_init(&cur->list);
- rb_replace_node(&cur->rb_node, &new->rb_node, &tree->map);
+ rb_replace_node_cached(&cur->rb_node, &new->rb_node, &tree->map);
RB_CLEAR_NODE(&cur->rb_node);
setup_extent_mapping(tree, new, modified);
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 25d985e7532a..31977ffd6190 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -49,7 +49,7 @@ struct extent_map {
};
struct extent_map_tree {
- struct rb_root map;
+ struct rb_root_cached map;
struct list_head modified_extents;
rwlock_t lock;
};
@@ -78,7 +78,7 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
u64 start, u64 len);
int add_extent_mapping(struct extent_map_tree *tree,
struct extent_map *em, int modified);
-int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
+void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
void replace_extent_mapping(struct extent_map_tree *tree,
struct extent_map *cur,
struct extent_map *new,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 2be00e873e92..15b925142793 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -531,6 +531,14 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages,
end_of_last_block = start_pos + num_bytes - 1;
+ /*
+ * The pages may have already been dirty, clear out old accounting so
+ * we can set things up properly
+ */
+ clear_extent_bit(&BTRFS_I(inode)->io_tree, start_pos, end_of_last_block,
+ EXTENT_DIRTY | EXTENT_DELALLOC |
+ EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, cached);
+
if (!btrfs_is_free_space_inode(BTRFS_I(inode))) {
if (start_pos >= isize &&
!(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) {
@@ -1500,18 +1508,27 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
}
if (ordered)
btrfs_put_ordered_extent(ordered);
- clear_extent_bit(&inode->io_tree, start_pos, last_pos,
- EXTENT_DIRTY | EXTENT_DELALLOC |
- EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
- 0, 0, cached_state);
+
*lockstart = start_pos;
*lockend = last_pos;
ret = 1;
}
+ /*
+ * It's possible the pages are dirty right now, but we don't want
+ * to clean them yet because copy_from_user may catch a page fault
+ * and we might have to fall back to one page at a time. If that
+ * happens, we'll unlock these pages and we'd have a window where
+ * reclaim could sneak in and drop the once-dirty page on the floor
+ * without writing it.
+ *
+ * We have the pages locked and the extent range locked, so there's
+ * no way someone can start IO on any dirty pages in this range.
+ *
+ * We'll call btrfs_dirty_pages() later on, and that will flip around
+ * delalloc bits and dirty the pages as required.
+ */
for (i = 0; i < num_pages; i++) {
- if (clear_page_dirty_for_io(pages[i]))
- account_page_redirty(pages[i]);
set_page_extent_mapped(pages[i]);
WARN_ON(!PageLocked(pages[i]));
}
@@ -2544,7 +2561,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
}
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
- min_size, 0);
+ min_size, false);
BUG_ON(ret);
trans->block_rsv = rsv;
@@ -2594,7 +2611,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
}
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
- rsv, min_size, 0);
+ rsv, min_size, false);
BUG_ON(ret); /* shouldn't happen */
trans->block_rsv = rsv;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 0adf38b00fa0..67441219d6c9 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -10,6 +10,7 @@
#include <linux/math64.h>
#include <linux/ratelimit.h>
#include <linux/error-injection.h>
+#include <linux/sched/mm.h>
#include "ctree.h"
#include "free-space-cache.h"
#include "transaction.h"
@@ -47,6 +48,7 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
struct btrfs_free_space_header *header;
struct extent_buffer *leaf;
struct inode *inode = NULL;
+ unsigned nofs_flag;
int ret;
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
@@ -68,7 +70,13 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
btrfs_disk_key_to_cpu(&location, &disk_key);
btrfs_release_path(path);
+ /*
+ * We are often under a trans handle at this point, so we need to make
+ * sure NOFS is set to keep us from deadlocking.
+ */
+ nofs_flag = memalloc_nofs_save();
inode = btrfs_iget(fs_info->sb, &location, root, NULL);
+ memalloc_nofs_restore(nofs_flag);
if (IS_ERR(inode))
return inode;
@@ -1679,6 +1687,8 @@ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
bitmap_clear(info->bitmap, start, count);
info->bytes -= bytes;
+ if (info->max_extent_size > ctl->unit)
+ info->max_extent_size = 0;
}
static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
@@ -2110,8 +2120,7 @@ new_bitmap:
out:
if (info) {
- if (info->bitmap)
- kfree(info->bitmap);
+ kfree(info->bitmap);
kmem_cache_free(btrfs_free_space_cachep, info);
}
@@ -3601,8 +3610,7 @@ again:
if (info)
kmem_cache_free(btrfs_free_space_cachep, info);
- if (map)
- kfree(map);
+ kfree(map);
return 0;
}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9357a19d2bff..f22f77172c5f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -64,7 +64,6 @@ static const struct inode_operations btrfs_dir_ro_inode_operations;
static const struct inode_operations btrfs_special_inode_operations;
static const struct inode_operations btrfs_file_inode_operations;
static const struct address_space_operations btrfs_aops;
-static const struct address_space_operations btrfs_symlink_aops;
static const struct file_operations btrfs_dir_file_operations;
static const struct extent_io_ops btrfs_extent_io_ops;
@@ -1271,7 +1270,7 @@ static noinline int run_delalloc_nocow(struct inode *inode,
u64 disk_num_bytes;
u64 ram_bytes;
int extent_type;
- int ret, err;
+ int ret;
int type;
int nocow;
int check_prev = 1;
@@ -1403,11 +1402,8 @@ next_slot:
* if there are pending snapshots for this root,
* we fall into common COW way.
*/
- if (!nolock) {
- err = btrfs_start_write_no_snapshotting(root);
- if (!err)
- goto out_check;
- }
+ if (!nolock && atomic_read(&root->snapshot_force_cow))
+ goto out_check;
/*
* force cow if csum exists in the range.
* this ensure that csum for a given extent are
@@ -1416,9 +1412,6 @@ next_slot:
ret = csum_exist_in_range(fs_info, disk_bytenr,
num_bytes);
if (ret) {
- if (!nolock)
- btrfs_end_write_no_snapshotting(root);
-
/*
* ret could be -EIO if the above fails to read
* metadata.
@@ -1431,11 +1424,8 @@ next_slot:
WARN_ON_ONCE(nolock);
goto out_check;
}
- if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) {
- if (!nolock)
- btrfs_end_write_no_snapshotting(root);
+ if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr))
goto out_check;
- }
nocow = 1;
} else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
extent_end = found_key.offset +
@@ -1448,8 +1438,6 @@ next_slot:
out_check:
if (extent_end <= start) {
path->slots[0]++;
- if (!nolock && nocow)
- btrfs_end_write_no_snapshotting(root);
if (nocow)
btrfs_dec_nocow_writers(fs_info, disk_bytenr);
goto next_slot;
@@ -1471,8 +1459,6 @@ out_check:
end, page_started, nr_written, 1,
NULL);
if (ret) {
- if (!nolock && nocow)
- btrfs_end_write_no_snapshotting(root);
if (nocow)
btrfs_dec_nocow_writers(fs_info,
disk_bytenr);
@@ -1492,8 +1478,6 @@ out_check:
ram_bytes, BTRFS_COMPRESS_NONE,
BTRFS_ORDERED_PREALLOC);
if (IS_ERR(em)) {
- if (!nolock && nocow)
- btrfs_end_write_no_snapshotting(root);
if (nocow)
btrfs_dec_nocow_writers(fs_info,
disk_bytenr);
@@ -1532,8 +1516,6 @@ out_check:
EXTENT_CLEAR_DATA_RESV,
PAGE_UNLOCK | PAGE_SET_PRIVATE2);
- if (!nolock && nocow)
- btrfs_end_write_no_snapshotting(root);
cur_offset = extent_end;
/*
@@ -2767,12 +2749,9 @@ static void relink_file_extents(struct new_sa_defrag_extent *new)
struct btrfs_path *path;
struct sa_defrag_extent_backref *backref;
struct sa_defrag_extent_backref *prev = NULL;
- struct inode *inode;
struct rb_node *node;
int ret;
- inode = new->inode;
-
path = btrfs_alloc_path();
if (!path)
return;
@@ -3488,8 +3467,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
/* this will do delete_inode and everything for us */
iput(inode);
- if (ret)
- goto out;
}
/* release the path since we're done with it */
btrfs_release_path(path);
@@ -3755,7 +3732,7 @@ cache_acl:
case S_IFLNK:
inode->i_op = &btrfs_symlink_inode_operations;
inode_nohighmem(inode);
- inode->i_mapping->a_ops = &btrfs_symlink_aops;
+ inode->i_mapping->a_ops = &btrfs_aops;
break;
default:
inode->i_op = &btrfs_special_inode_operations;
@@ -3927,12 +3904,8 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
path->leave_spinning = 1;
di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
name, name_len, -1);
- if (IS_ERR(di)) {
- ret = PTR_ERR(di);
- goto err;
- }
- if (!di) {
- ret = -ENOENT;
+ if (IS_ERR_OR_NULL(di)) {
+ ret = di ? PTR_ERR(di) : -ENOENT;
goto err;
}
leaf = path->nodes[0];
@@ -4092,10 +4065,7 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
name, name_len, -1);
if (IS_ERR_OR_NULL(di)) {
- if (!di)
- ret = -ENOENT;
- else
- ret = PTR_ERR(di);
+ ret = di ? PTR_ERR(di) : -ENOENT;
goto out;
}
@@ -4287,18 +4257,17 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
* again is not run concurrently.
*/
spin_lock(&dest->root_item_lock);
- root_flags = btrfs_root_flags(&dest->root_item);
- if (dest->send_in_progress == 0) {
- btrfs_set_root_flags(&dest->root_item,
- root_flags | BTRFS_ROOT_SUBVOL_DEAD);
- spin_unlock(&dest->root_item_lock);
- } else {
+ if (dest->send_in_progress) {
spin_unlock(&dest->root_item_lock);
btrfs_warn(fs_info,
"attempt to delete subvolume %llu during send",
dest->root_key.objectid);
return -EPERM;
}
+ root_flags = btrfs_root_flags(&dest->root_item);
+ btrfs_set_root_flags(&dest->root_item,
+ root_flags | BTRFS_ROOT_SUBVOL_DEAD);
+ spin_unlock(&dest->root_item_lock);
down_write(&fs_info->subvol_sem);
@@ -4744,7 +4713,7 @@ delete:
btrfs_abort_transaction(trans, ret);
break;
}
- if (btrfs_should_throttle_delayed_refs(trans, fs_info))
+ if (btrfs_should_throttle_delayed_refs(trans))
btrfs_async_run_delayed_refs(fs_info,
trans->delayed_ref_updates * 2,
trans->transid, 0);
@@ -4753,8 +4722,7 @@ delete:
extent_num_bytes)) {
should_end = true;
}
- if (btrfs_should_throttle_delayed_refs(trans,
- fs_info))
+ if (btrfs_should_throttle_delayed_refs(trans))
should_throttle = true;
}
}
@@ -5252,10 +5220,10 @@ static void evict_inode_truncate_pages(struct inode *inode)
truncate_inode_pages_final(&inode->i_data);
write_lock(&map_tree->lock);
- while (!RB_EMPTY_ROOT(&map_tree->map)) {
+ while (!RB_EMPTY_ROOT(&map_tree->map.rb_root)) {
struct extent_map *em;
- node = rb_first(&map_tree->map);
+ node = rb_first_cached(&map_tree->map);
em = rb_entry(node, struct extent_map, rb_node);
clear_bit(EXTENT_FLAG_PINNED, &em->flags);
clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
@@ -5323,8 +5291,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
}
static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
- struct btrfs_block_rsv *rsv,
- u64 min_size)
+ struct btrfs_block_rsv *rsv)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
@@ -5334,7 +5301,7 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
struct btrfs_trans_handle *trans;
int ret;
- ret = btrfs_block_rsv_refill(root, rsv, min_size,
+ ret = btrfs_block_rsv_refill(root, rsv, rsv->size,
BTRFS_RESERVE_FLUSH_LIMIT);
if (ret && ++failures > 2) {
@@ -5351,8 +5318,8 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
* Try to steal from the global reserve if there is space for
* it.
*/
- if (!btrfs_check_space_for_delayed_refs(trans, fs_info) &&
- !btrfs_block_rsv_migrate(global_rsv, rsv, min_size, 0))
+ if (!btrfs_check_space_for_delayed_refs(trans) &&
+ !btrfs_block_rsv_migrate(global_rsv, rsv, rsv->size, false))
return trans;
/* If not, commit and try again. */
@@ -5368,7 +5335,6 @@ void btrfs_evict_inode(struct inode *inode)
struct btrfs_trans_handle *trans;
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_block_rsv *rsv;
- u64 min_size;
int ret;
trace_btrfs_inode_evict(inode);
@@ -5378,8 +5344,6 @@ void btrfs_evict_inode(struct inode *inode)
return;
}
- min_size = btrfs_calc_trunc_metadata_size(fs_info, 1);
-
evict_inode_truncate_pages(inode);
if (inode->i_nlink &&
@@ -5390,9 +5354,6 @@ void btrfs_evict_inode(struct inode *inode)
if (is_bad_inode(inode))
goto no_delete;
- /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */
- if (!special_file(inode->i_mode))
- btrfs_wait_ordered_range(inode, 0, (u64)-1);
btrfs_free_io_failure_record(BTRFS_I(inode), 0, (u64)-1);
@@ -5412,13 +5373,13 @@ void btrfs_evict_inode(struct inode *inode)
rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
if (!rsv)
goto no_delete;
- rsv->size = min_size;
+ rsv->size = btrfs_calc_trunc_metadata_size(fs_info, 1);
rsv->failfast = 1;
btrfs_i_size_write(BTRFS_I(inode), 0);
while (1) {
- trans = evict_refill_and_join(root, rsv, min_size);
+ trans = evict_refill_and_join(root, rsv);
if (IS_ERR(trans))
goto free_rsv;
@@ -5443,7 +5404,7 @@ void btrfs_evict_inode(struct inode *inode)
* If it turns out that we are dropping too many of these, we might want
* to add a mechanism for retrying these after a commit.
*/
- trans = evict_refill_and_join(root, rsv, min_size);
+ trans = evict_refill_and_join(root, rsv);
if (!IS_ERR(trans)) {
trans->block_rsv = rsv;
btrfs_orphan_del(trans, BTRFS_I(inode));
@@ -5488,12 +5449,8 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(BTRFS_I(dir)),
name, namelen, 0);
- if (!di) {
- ret = -ENOENT;
- goto out;
- }
- if (IS_ERR(di)) {
- ret = PTR_ERR(di);
+ if (IS_ERR_OR_NULL(di)) {
+ ret = di ? PTR_ERR(di) : -ENOENT;
goto out;
}
@@ -6407,8 +6364,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
if (ret)
return ret;
- ret = btrfs_insert_dir_item(trans, root, name, name_len,
- parent_inode, &key,
+ ret = btrfs_insert_dir_item(trans, name, name_len, parent_inode, &key,
btrfs_inode_type(&inode->vfs_inode), index);
if (ret == -EEXIST || ret == -EOVERFLOW)
goto fail_dir_item;
@@ -6601,7 +6557,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
int drop_inode = 0;
/* do not allow sys_link's with other subvols of the same device */
- if (root->objectid != BTRFS_I(inode)->root->objectid)
+ if (root->root_key.objectid != BTRFS_I(inode)->root->root_key.objectid)
return -EXDEV;
if (inode->i_nlink >= BTRFS_LINK_MAX)
@@ -6639,6 +6595,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
drop_inode = 1;
} else {
struct dentry *parent = dentry->d_parent;
+ int ret;
+
err = btrfs_update_inode(trans, root, inode);
if (err)
goto fail;
@@ -6652,7 +6610,12 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
goto fail;
}
d_instantiate(dentry, inode);
- btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent);
+ ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent,
+ true, NULL);
+ if (ret == BTRFS_NEED_TRANS_COMMIT) {
+ err = btrfs_commit_transaction(trans);
+ trans = NULL;
+ }
}
fail:
@@ -6787,9 +6750,9 @@ static noinline int uncompress_inline(struct btrfs_path *path,
* This also copies inline extents directly into the page.
*/
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
- struct page *page,
- size_t pg_offset, u64 start, u64 len,
- int create)
+ struct page *page,
+ size_t pg_offset, u64 start, u64 len,
+ int create)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
int ret;
@@ -6833,19 +6796,21 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
em->len = (u64)-1;
em->block_len = (u64)-1;
+ path = btrfs_alloc_path();
if (!path) {
- path = btrfs_alloc_path();
- if (!path) {
- err = -ENOMEM;
- goto out;
- }
- /*
- * Chances are we'll be called again, so go ahead and do
- * readahead
- */
- path->reada = READA_FORWARD;
+ err = -ENOMEM;
+ goto out;
}
+ /* Chances are we'll be called again, so go ahead and do readahead */
+ path->reada = READA_FORWARD;
+
+ /*
+ * Unless we're going to uncompress the inline extent, no sleep would
+ * happen.
+ */
+ path->leave_spinning = 1;
+
ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0);
if (ret < 0) {
err = ret;
@@ -6948,6 +6913,8 @@ next:
em->orig_block_len = em->len;
em->orig_start = em->start;
ptr = btrfs_file_extent_inline_start(item) + extent_offset;
+
+ btrfs_set_path_blocking(path);
if (!PageUptodate(page)) {
if (btrfs_file_extent_compression(leaf, item) !=
BTRFS_COMPRESS_NONE) {
@@ -6995,10 +6962,10 @@ insert:
err = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len);
write_unlock(&em_tree->lock);
out:
+ btrfs_free_path(path);
trace_btrfs_get_extent(root, inode, em);
- btrfs_free_path(path);
if (err) {
free_extent_map(em);
return ERR_PTR(err);
@@ -9031,7 +8998,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
/* Migrate the slack space for the truncate to our reserve */
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
- min_size, 0);
+ min_size, false);
BUG_ON(ret);
/*
@@ -9068,7 +9035,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
btrfs_block_rsv_release(fs_info, rsv, -1);
ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
- rsv, min_size, 0);
+ rsv, min_size, false);
BUG_ON(ret); /* shouldn't happen */
trans->block_rsv = rsv;
}
@@ -9388,14 +9355,21 @@ static int btrfs_rename_exchange(struct inode *old_dir,
u64 new_idx = 0;
u64 root_objectid;
int ret;
- int ret2;
bool root_log_pinned = false;
bool dest_log_pinned = false;
+ struct btrfs_log_ctx ctx_root;
+ struct btrfs_log_ctx ctx_dest;
+ bool sync_log_root = false;
+ bool sync_log_dest = false;
+ bool commit_transaction = false;
/* we only allow rename subvolume link between subvolumes */
if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
return -EXDEV;
+ btrfs_init_log_ctx(&ctx_root, old_inode);
+ btrfs_init_log_ctx(&ctx_dest, new_inode);
+
/* close the race window with snapshot create/destroy ioctl */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
down_read(&fs_info->subvol_sem);
@@ -9542,15 +9516,29 @@ static int btrfs_rename_exchange(struct inode *old_dir,
if (root_log_pinned) {
parent = new_dentry->d_parent;
- btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
- parent);
+ ret = btrfs_log_new_name(trans, BTRFS_I(old_inode),
+ BTRFS_I(old_dir), parent,
+ false, &ctx_root);
+ if (ret == BTRFS_NEED_LOG_SYNC)
+ sync_log_root = true;
+ else if (ret == BTRFS_NEED_TRANS_COMMIT)
+ commit_transaction = true;
+ ret = 0;
btrfs_end_log_trans(root);
root_log_pinned = false;
}
if (dest_log_pinned) {
- parent = old_dentry->d_parent;
- btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir),
- parent);
+ if (!commit_transaction) {
+ parent = old_dentry->d_parent;
+ ret = btrfs_log_new_name(trans, BTRFS_I(new_inode),
+ BTRFS_I(new_dir), parent,
+ false, &ctx_dest);
+ if (ret == BTRFS_NEED_LOG_SYNC)
+ sync_log_dest = true;
+ else if (ret == BTRFS_NEED_TRANS_COMMIT)
+ commit_transaction = true;
+ ret = 0;
+ }
btrfs_end_log_trans(dest);
dest_log_pinned = false;
}
@@ -9583,8 +9571,26 @@ out_fail:
dest_log_pinned = false;
}
}
- ret2 = btrfs_end_transaction(trans);
- ret = ret ? ret : ret2;
+ if (!ret && sync_log_root && !commit_transaction) {
+ ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root,
+ &ctx_root);
+ if (ret)
+ commit_transaction = true;
+ }
+ if (!ret && sync_log_dest && !commit_transaction) {
+ ret = btrfs_sync_log(trans, BTRFS_I(new_inode)->root,
+ &ctx_dest);
+ if (ret)
+ commit_transaction = true;
+ }
+ if (commit_transaction) {
+ ret = btrfs_commit_transaction(trans);
+ } else {
+ int ret2;
+
+ ret2 = btrfs_end_transaction(trans);
+ ret = ret ? ret : ret2;
+ }
out_notrans:
if (new_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&fs_info->subvol_sem);
@@ -9661,6 +9667,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
int ret;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
bool log_pinned = false;
+ struct btrfs_log_ctx ctx;
+ bool sync_log = false;
+ bool commit_transaction = false;
if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
return -EPERM;
@@ -9818,8 +9827,15 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (log_pinned) {
struct dentry *parent = new_dentry->d_parent;
- btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
- parent);
+ btrfs_init_log_ctx(&ctx, old_inode);
+ ret = btrfs_log_new_name(trans, BTRFS_I(old_inode),
+ BTRFS_I(old_dir), parent,
+ false, &ctx);
+ if (ret == BTRFS_NEED_LOG_SYNC)
+ sync_log = true;
+ else if (ret == BTRFS_NEED_TRANS_COMMIT)
+ commit_transaction = true;
+ ret = 0;
btrfs_end_log_trans(root);
log_pinned = false;
}
@@ -9856,7 +9872,19 @@ out_fail:
btrfs_end_log_trans(root);
log_pinned = false;
}
- btrfs_end_transaction(trans);
+ if (!ret && sync_log) {
+ ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, &ctx);
+ if (ret)
+ commit_transaction = true;
+ }
+ if (commit_transaction) {
+ ret = btrfs_commit_transaction(trans);
+ } else {
+ int ret2;
+
+ ret2 = btrfs_end_transaction(trans);
+ ret = ret ? ret : ret2;
+ }
out_notrans:
if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&fs_info->subvol_sem);
@@ -10140,7 +10168,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
inode->i_op = &btrfs_symlink_inode_operations;
inode_nohighmem(inode);
- inode->i_mapping->a_ops = &btrfs_symlink_aops;
+ inode->i_mapping->a_ops = &btrfs_aops;
inode_set_bytes(inode, name_len);
btrfs_i_size_write(BTRFS_I(inode), name_len);
err = btrfs_update_inode(trans, root, inode);
@@ -10516,13 +10544,6 @@ static const struct address_space_operations btrfs_aops = {
.error_remove_page = generic_error_remove_page,
};
-static const struct address_space_operations btrfs_symlink_aops = {
- .readpage = btrfs_readpage,
- .writepage = btrfs_writepage,
- .invalidatepage = btrfs_invalidatepage,
- .releasepage = btrfs_releasepage,
-};
-
static const struct inode_operations btrfs_file_inode_operations = {
.getattr = btrfs_getattr,
.setattr = btrfs_setattr,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 63600dc2ac4c..a990a9045139 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -491,7 +491,6 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
struct fstrim_range range;
u64 minlen = ULLONG_MAX;
u64 num_devices = 0;
- u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
int ret;
if (!capable(CAP_SYS_ADMIN))
@@ -515,11 +514,15 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
return -EOPNOTSUPP;
if (copy_from_user(&range, arg, sizeof(range)))
return -EFAULT;
- if (range.start > total_bytes ||
- range.len < fs_info->sb->s_blocksize)
+
+ /*
+ * NOTE: Don't truncate the range using super->total_bytes. Bytenr of
+ * block group is in the logical address space, which can be any
+ * sectorsize aligned bytenr in the range [0, U64_MAX].
+ */
+ if (range.len < fs_info->sb->s_blocksize)
return -EINVAL;
- range.len = min(range.len, total_bytes - range.start);
range.minlen = max(range.minlen, minlen);
ret = btrfs_trim_fs(fs_info, &range);
if (ret < 0)
@@ -686,8 +689,7 @@ static noinline int create_subvol(struct inode *dir,
goto fail;
}
- ret = btrfs_insert_dir_item(trans, root,
- name, namelen, BTRFS_I(dir), &key,
+ ret = btrfs_insert_dir_item(trans, name, namelen, BTRFS_I(dir), &key,
BTRFS_FT_DIR, index);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -747,6 +749,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
struct btrfs_pending_snapshot *pending_snapshot;
struct btrfs_trans_handle *trans;
int ret;
+ bool snapshot_force_cow = false;
if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
return -EINVAL;
@@ -763,6 +766,11 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
goto free_pending;
}
+ /*
+ * Force new buffered writes to reserve space even when NOCOW is
+ * possible. This is to avoid later writeback (running dealloc) to
+ * fallback to COW mode and unexpectedly fail with ENOSPC.
+ */
atomic_inc(&root->will_be_snapshotted);
smp_mb__after_atomic();
/* wait for no snapshot writes */
@@ -773,6 +781,14 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
if (ret)
goto dec_and_free;
+ /*
+ * All previous writes have started writeback in NOCOW mode, so now
+ * we force future writes to fallback to COW mode during snapshot
+ * creation.
+ */
+ atomic_inc(&root->snapshot_force_cow);
+ snapshot_force_cow = true;
+
btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
btrfs_init_block_rsv(&pending_snapshot->block_rsv,
@@ -837,6 +853,8 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
fail:
btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv);
dec_and_free:
+ if (snapshot_force_cow)
+ atomic_dec(&root->snapshot_force_cow);
if (atomic_dec_and_test(&root->will_be_snapshotted))
wake_up_var(&root->will_be_snapshotted);
free_pending:
@@ -1308,7 +1326,7 @@ again:
if (i_done != page_cnt) {
spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->outstanding_extents++;
+ btrfs_mod_outstanding_extents(BTRFS_I(inode), 1);
spin_unlock(&BTRFS_I(inode)->lock);
btrfs_delalloc_release_space(inode, data_reserved,
start_index << PAGE_SHIFT,
@@ -3453,6 +3471,25 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 olen,
same_lock_start = min_t(u64, loff, dst_loff);
same_lock_len = max_t(u64, loff, dst_loff) + len - same_lock_start;
+ } else {
+ /*
+ * If the source and destination inodes are different, the
+ * source's range end offset matches the source's i_size, that
+ * i_size is not a multiple of the sector size, and the
+ * destination range does not go past the destination's i_size,
+ * we must round down the length to the nearest sector size
+ * multiple. If we don't do this adjustment we end replacing
+ * with zeroes the bytes in the range that starts at the
+ * deduplication range's end offset and ends at the next sector
+ * size multiple.
+ */
+ if (loff + olen == i_size_read(src) &&
+ dst_loff + len < i_size_read(dst)) {
+ const u64 sz = BTRFS_I(src)->root->fs_info->sectorsize;
+
+ len = round_down(i_size_read(src), sz) - loff;
+ olen = len;
+ }
}
again:
@@ -4358,7 +4395,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
ret = PTR_ERR(new_root);
goto out;
}
- if (!is_fstree(new_root->objectid)) {
+ if (!is_fstree(new_root->root_key.objectid)) {
ret = -ENOENT;
goto out;
}
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 4353bb69bb86..45868fd76209 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1019,10 +1019,9 @@ out_add_root:
spin_unlock(&fs_info->qgroup_lock);
ret = btrfs_commit_transaction(trans);
- if (ret) {
- trans = NULL;
+ trans = NULL;
+ if (ret)
goto out_free_path;
- }
ret = qgroup_rescan_init(fs_info, 0, 1);
if (!ret) {
@@ -1417,13 +1416,14 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
if (!qgroup) {
ret = -ENOENT;
goto out;
- } else {
- /* check if there are no children of this qgroup */
- if (!list_empty(&qgroup->members)) {
- ret = -EBUSY;
- goto out;
- }
}
+
+ /* Check if there are no children of this qgroup */
+ if (!list_empty(&qgroup->members)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
ret = del_qgroup_item(trans, qgroupid);
if (ret && ret != -ENOENT)
goto out;
@@ -1713,6 +1713,416 @@ static int adjust_slots_upwards(struct btrfs_path *path, int root_level)
return 0;
}
+/*
+ * Helper function to trace a subtree tree block swap.
+ *
+ * The swap will happen in highest tree block, but there may be a lot of
+ * tree blocks involved.
+ *
+ * For example:
+ * OO = Old tree blocks
+ * NN = New tree blocks allocated during balance
+ *
+ * File tree (257) Reloc tree for 257
+ * L2 OO NN
+ * / \ / \
+ * L1 OO OO (a) OO NN (a)
+ * / \ / \ / \ / \
+ * L0 OO OO OO OO OO OO NN NN
+ * (b) (c) (b) (c)
+ *
+ * When calling qgroup_trace_extent_swap(), we will pass:
+ * @src_eb = OO(a)
+ * @dst_path = [ nodes[1] = NN(a), nodes[0] = NN(c) ]
+ * @dst_level = 0
+ * @root_level = 1
+ *
+ * In that case, qgroup_trace_extent_swap() will search from OO(a) to
+ * reach OO(c), then mark both OO(c) and NN(c) as qgroup dirty.
+ *
+ * The main work of qgroup_trace_extent_swap() can be split into 3 parts:
+ *
+ * 1) Tree search from @src_eb
+ * It should acts as a simplified btrfs_search_slot().
+ * The key for search can be extracted from @dst_path->nodes[dst_level]
+ * (first key).
+ *
+ * 2) Mark the final tree blocks in @src_path and @dst_path qgroup dirty
+ * NOTE: In above case, OO(a) and NN(a) won't be marked qgroup dirty.
+ * They should be marked during preivous (@dst_level = 1) iteration.
+ *
+ * 3) Mark file extents in leaves dirty
+ * We don't have good way to pick out new file extents only.
+ * So we still follow the old method by scanning all file extents in
+ * the leave.
+ *
+ * This function can free us from keeping two pathes, thus later we only need
+ * to care about how to iterate all new tree blocks in reloc tree.
+ */
+static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
+ struct extent_buffer *src_eb,
+ struct btrfs_path *dst_path,
+ int dst_level, int root_level,
+ bool trace_leaf)
+{
+ struct btrfs_key key;
+ struct btrfs_path *src_path;
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ u32 nodesize = fs_info->nodesize;
+ int cur_level = root_level;
+ int ret;
+
+ BUG_ON(dst_level > root_level);
+ /* Level mismatch */
+ if (btrfs_header_level(src_eb) != root_level)
+ return -EINVAL;
+
+ src_path = btrfs_alloc_path();
+ if (!src_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ if (dst_level)
+ btrfs_node_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
+ else
+ btrfs_item_key_to_cpu(dst_path->nodes[dst_level], &key, 0);
+
+ /* For src_path */
+ extent_buffer_get(src_eb);
+ src_path->nodes[root_level] = src_eb;
+ src_path->slots[root_level] = dst_path->slots[root_level];
+ src_path->locks[root_level] = 0;
+
+ /* A simplified version of btrfs_search_slot() */
+ while (cur_level >= dst_level) {
+ struct btrfs_key src_key;
+ struct btrfs_key dst_key;
+
+ if (src_path->nodes[cur_level] == NULL) {
+ struct btrfs_key first_key;
+ struct extent_buffer *eb;
+ int parent_slot;
+ u64 child_gen;
+ u64 child_bytenr;
+
+ eb = src_path->nodes[cur_level + 1];
+ parent_slot = src_path->slots[cur_level + 1];
+ child_bytenr = btrfs_node_blockptr(eb, parent_slot);
+ child_gen = btrfs_node_ptr_generation(eb, parent_slot);
+ btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
+
+ eb = read_tree_block(fs_info, child_bytenr, child_gen,
+ cur_level, &first_key);
+ if (IS_ERR(eb)) {
+ ret = PTR_ERR(eb);
+ goto out;
+ } else if (!extent_buffer_uptodate(eb)) {
+ free_extent_buffer(eb);
+ ret = -EIO;
+ goto out;
+ }
+
+ src_path->nodes[cur_level] = eb;
+
+ btrfs_tree_read_lock(eb);
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ src_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
+ }
+
+ src_path->slots[cur_level] = dst_path->slots[cur_level];
+ if (cur_level) {
+ btrfs_node_key_to_cpu(dst_path->nodes[cur_level],
+ &dst_key, dst_path->slots[cur_level]);
+ btrfs_node_key_to_cpu(src_path->nodes[cur_level],
+ &src_key, src_path->slots[cur_level]);
+ } else {
+ btrfs_item_key_to_cpu(dst_path->nodes[cur_level],
+ &dst_key, dst_path->slots[cur_level]);
+ btrfs_item_key_to_cpu(src_path->nodes[cur_level],
+ &src_key, src_path->slots[cur_level]);
+ }
+ /* Content mismatch, something went wrong */
+ if (btrfs_comp_cpu_keys(&dst_key, &src_key)) {
+ ret = -ENOENT;
+ goto out;
+ }
+ cur_level--;
+ }
+
+ /*
+ * Now both @dst_path and @src_path have been populated, record the tree
+ * blocks for qgroup accounting.
+ */
+ ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start,
+ nodesize, GFP_NOFS);
+ if (ret < 0)
+ goto out;
+ ret = btrfs_qgroup_trace_extent(trans,
+ dst_path->nodes[dst_level]->start,
+ nodesize, GFP_NOFS);
+ if (ret < 0)
+ goto out;
+
+ /* Record leaf file extents */
+ if (dst_level == 0 && trace_leaf) {
+ ret = btrfs_qgroup_trace_leaf_items(trans, src_path->nodes[0]);
+ if (ret < 0)
+ goto out;
+ ret = btrfs_qgroup_trace_leaf_items(trans, dst_path->nodes[0]);
+ }
+out:
+ btrfs_free_path(src_path);
+ return ret;
+}
+
+/*
+ * Helper function to do recursive generation-aware depth-first search, to
+ * locate all new tree blocks in a subtree of reloc tree.
+ *
+ * E.g. (OO = Old tree blocks, NN = New tree blocks, whose gen == last_snapshot)
+ * reloc tree
+ * L2 NN (a)
+ * / \
+ * L1 OO NN (b)
+ * / \ / \
+ * L0 OO OO OO NN
+ * (c) (d)
+ * If we pass:
+ * @dst_path = [ nodes[1] = NN(b), nodes[0] = NULL ],
+ * @cur_level = 1
+ * @root_level = 1
+ *
+ * We will iterate through tree blocks NN(b), NN(d) and info qgroup to trace
+ * above tree blocks along with their counter parts in file tree.
+ * While during search, old tree blocsk OO(c) will be skiped as tree block swap
+ * won't affect OO(c).
+ */
+static int qgroup_trace_new_subtree_blocks(struct btrfs_trans_handle* trans,
+ struct extent_buffer *src_eb,
+ struct btrfs_path *dst_path,
+ int cur_level, int root_level,
+ u64 last_snapshot, bool trace_leaf)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct extent_buffer *eb;
+ bool need_cleanup = false;
+ int ret = 0;
+ int i;
+
+ /* Level sanity check */
+ if (cur_level < 0 || cur_level >= BTRFS_MAX_LEVEL ||
+ root_level < 0 || root_level >= BTRFS_MAX_LEVEL ||
+ root_level < cur_level) {
+ btrfs_err_rl(fs_info,
+ "%s: bad levels, cur_level=%d root_level=%d",
+ __func__, cur_level, root_level);
+ return -EUCLEAN;
+ }
+
+ /* Read the tree block if needed */
+ if (dst_path->nodes[cur_level] == NULL) {
+ struct btrfs_key first_key;
+ int parent_slot;
+ u64 child_gen;
+ u64 child_bytenr;
+
+ /*
+ * dst_path->nodes[root_level] must be initialized before
+ * calling this function.
+ */
+ if (cur_level == root_level) {
+ btrfs_err_rl(fs_info,
+ "%s: dst_path->nodes[%d] not initialized, root_level=%d cur_level=%d",
+ __func__, root_level, root_level, cur_level);
+ return -EUCLEAN;
+ }
+
+ /*
+ * We need to get child blockptr/gen from parent before we can
+ * read it.
+ */
+ eb = dst_path->nodes[cur_level + 1];
+ parent_slot = dst_path->slots[cur_level + 1];
+ child_bytenr = btrfs_node_blockptr(eb, parent_slot);
+ child_gen = btrfs_node_ptr_generation(eb, parent_slot);
+ btrfs_node_key_to_cpu(eb, &first_key, parent_slot);
+
+ /* This node is old, no need to trace */
+ if (child_gen < last_snapshot)
+ goto out;
+
+ eb = read_tree_block(fs_info, child_bytenr, child_gen,
+ cur_level, &first_key);
+ if (IS_ERR(eb)) {
+ ret = PTR_ERR(eb);
+ goto out;
+ } else if (!extent_buffer_uptodate(eb)) {
+ free_extent_buffer(eb);
+ ret = -EIO;
+ goto out;
+ }
+
+ dst_path->nodes[cur_level] = eb;
+ dst_path->slots[cur_level] = 0;
+
+ btrfs_tree_read_lock(eb);
+ btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+ dst_path->locks[cur_level] = BTRFS_READ_LOCK_BLOCKING;
+ need_cleanup = true;
+ }
+
+ /* Now record this tree block and its counter part for qgroups */
+ ret = qgroup_trace_extent_swap(trans, src_eb, dst_path, cur_level,
+ root_level, trace_leaf);
+ if (ret < 0)
+ goto cleanup;
+
+ eb = dst_path->nodes[cur_level];
+
+ if (cur_level > 0) {
+ /* Iterate all child tree blocks */
+ for (i = 0; i < btrfs_header_nritems(eb); i++) {
+ /* Skip old tree blocks as they won't be swapped */
+ if (btrfs_node_ptr_generation(eb, i) < last_snapshot)
+ continue;
+ dst_path->slots[cur_level] = i;
+
+ /* Recursive call (at most 7 times) */
+ ret = qgroup_trace_new_subtree_blocks(trans, src_eb,
+ dst_path, cur_level - 1, root_level,
+ last_snapshot, trace_leaf);
+ if (ret < 0)
+ goto cleanup;
+ }
+ }
+
+cleanup:
+ if (need_cleanup) {
+ /* Clean up */
+ btrfs_tree_unlock_rw(dst_path->nodes[cur_level],
+ dst_path->locks[cur_level]);
+ free_extent_buffer(dst_path->nodes[cur_level]);
+ dst_path->nodes[cur_level] = NULL;
+ dst_path->slots[cur_level] = 0;
+ dst_path->locks[cur_level] = 0;
+ }
+out:
+ return ret;
+}
+
+/*
+ * Inform qgroup to trace subtree swap used in balance.
+ *
+ * Unlike btrfs_qgroup_trace_subtree(), this function will only trace
+ * new tree blocks whose generation is equal to (or larger than) @last_snapshot.
+ *
+ * Will go down the tree block pointed by @dst_eb (pointed by @dst_parent and
+ * @dst_slot), and find any tree blocks whose generation is at @last_snapshot,
+ * and then go down @src_eb (pointed by @src_parent and @src_slot) to find
+ * the conterpart of the tree block, then mark both tree blocks as qgroup dirty,
+ * and skip all tree blocks whose generation is smaller than last_snapshot.
+ *
+ * This would skip tons of tree blocks of original btrfs_qgroup_trace_subtree(),
+ * which could be the cause of very slow balance if the file tree is large.
+ *
+ * @src_parent, @src_slot: pointer to src (file tree) eb.
+ * @dst_parent, @dst_slot: pointer to dst (reloc tree) eb.
+ */
+int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group_cache *bg_cache,
+ struct extent_buffer *src_parent, int src_slot,
+ struct extent_buffer *dst_parent, int dst_slot,
+ u64 last_snapshot)
+{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_path *dst_path = NULL;
+ struct btrfs_key first_key;
+ struct extent_buffer *src_eb = NULL;
+ struct extent_buffer *dst_eb = NULL;
+ bool trace_leaf = false;
+ u64 child_gen;
+ u64 child_bytenr;
+ int level;
+ int ret;
+
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return 0;
+
+ /* Check parameter order */
+ if (btrfs_node_ptr_generation(src_parent, src_slot) >
+ btrfs_node_ptr_generation(dst_parent, dst_slot)) {
+ btrfs_err_rl(fs_info,
+ "%s: bad parameter order, src_gen=%llu dst_gen=%llu", __func__,
+ btrfs_node_ptr_generation(src_parent, src_slot),
+ btrfs_node_ptr_generation(dst_parent, dst_slot));
+ return -EUCLEAN;
+ }
+
+ /*
+ * Only trace leaf if we're relocating data block groups, this could
+ * reduce tons of data extents tracing for meta/sys bg relocation.
+ */
+ if (bg_cache->flags & BTRFS_BLOCK_GROUP_DATA)
+ trace_leaf = true;
+ /* Read out real @src_eb, pointed by @src_parent and @src_slot */
+ child_bytenr = btrfs_node_blockptr(src_parent, src_slot);
+ child_gen = btrfs_node_ptr_generation(src_parent, src_slot);
+ btrfs_node_key_to_cpu(src_parent, &first_key, src_slot);
+
+ src_eb = read_tree_block(fs_info, child_bytenr, child_gen,
+ btrfs_header_level(src_parent) - 1, &first_key);
+ if (IS_ERR(src_eb)) {
+ ret = PTR_ERR(src_eb);
+ goto out;
+ }
+
+ /* Read out real @dst_eb, pointed by @src_parent and @src_slot */
+ child_bytenr = btrfs_node_blockptr(dst_parent, dst_slot);
+ child_gen = btrfs_node_ptr_generation(dst_parent, dst_slot);
+ btrfs_node_key_to_cpu(dst_parent, &first_key, dst_slot);
+
+ dst_eb = read_tree_block(fs_info, child_bytenr, child_gen,
+ btrfs_header_level(dst_parent) - 1, &first_key);
+ if (IS_ERR(dst_eb)) {
+ ret = PTR_ERR(dst_eb);
+ goto out;
+ }
+
+ if (!extent_buffer_uptodate(src_eb) || !extent_buffer_uptodate(dst_eb)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ level = btrfs_header_level(dst_eb);
+ dst_path = btrfs_alloc_path();
+ if (!dst_path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /* For dst_path */
+ extent_buffer_get(dst_eb);
+ dst_path->nodes[level] = dst_eb;
+ dst_path->slots[level] = 0;
+ dst_path->locks[level] = 0;
+
+ /* Do the generation-aware breadth-first search */
+ ret = qgroup_trace_new_subtree_blocks(trans, src_eb, dst_path, level,
+ level, last_snapshot, trace_leaf);
+ if (ret < 0)
+ goto out;
+ ret = 0;
+
+out:
+ free_extent_buffer(src_eb);
+ free_extent_buffer(dst_eb);
+ btrfs_free_path(dst_path);
+ if (ret < 0)
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+ return ret;
+}
+
int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
struct extent_buffer *root_eb,
u64 root_gen, int root_level)
@@ -2133,6 +2543,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
struct btrfs_delayed_ref_root *delayed_refs;
struct ulist *new_roots = NULL;
struct rb_node *node;
+ u64 num_dirty_extents = 0;
u64 qgroup_to_skip;
int ret = 0;
@@ -2142,6 +2553,7 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
record = rb_entry(node, struct btrfs_qgroup_extent_record,
node);
+ num_dirty_extents++;
trace_btrfs_qgroup_account_extents(fs_info, record);
if (!ret) {
@@ -2187,6 +2599,8 @@ cleanup:
kfree(record);
}
+ trace_qgroup_num_dirty_extents(fs_info, trans->transid,
+ num_dirty_extents);
return ret;
}
@@ -2898,6 +3312,7 @@ qgroup_rescan_zero_tracking(struct btrfs_fs_info *fs_info)
qgroup->rfer_cmpr = 0;
qgroup->excl = 0;
qgroup->excl_cmpr = 0;
+ qgroup_dirty(fs_info, qgroup);
}
spin_unlock(&fs_info->qgroup_lock);
}
@@ -3005,7 +3420,7 @@ int btrfs_qgroup_reserve_data(struct inode *inode,
int ret;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &root->fs_info->flags) ||
- !is_fstree(root->objectid) || len == 0)
+ !is_fstree(root->root_key.objectid) || len == 0)
return 0;
/* @reserved parameter is mandatory for qgroup */
@@ -3091,7 +3506,7 @@ static int qgroup_free_reserved_data(struct inode *inode,
goto out;
freed += changeset.bytes_changed;
}
- btrfs_qgroup_free_refroot(root->fs_info, root->objectid, freed,
+ btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed,
BTRFS_QGROUP_RSV_DATA);
ret = freed;
out:
@@ -3107,6 +3522,10 @@ static int __btrfs_qgroup_release_data(struct inode *inode,
int trace_op = QGROUP_RELEASE;
int ret;
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED,
+ &BTRFS_I(inode)->root->fs_info->flags))
+ return 0;
+
/* In release case, we shouldn't have @reserved */
WARN_ON(!free && reserved);
if (free && reserved)
@@ -3123,7 +3542,7 @@ static int __btrfs_qgroup_release_data(struct inode *inode,
changeset.bytes_changed, trace_op);
if (free)
btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
- BTRFS_I(inode)->root->objectid,
+ BTRFS_I(inode)->root->root_key.objectid,
changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
ret = changeset.bytes_changed;
out:
@@ -3216,7 +3635,7 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
int ret;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
- !is_fstree(root->objectid) || num_bytes == 0)
+ !is_fstree(root->root_key.objectid) || num_bytes == 0)
return 0;
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
@@ -3241,13 +3660,13 @@ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
struct btrfs_fs_info *fs_info = root->fs_info;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
- !is_fstree(root->objectid))
+ !is_fstree(root->root_key.objectid))
return;
/* TODO: Update trace point to handle such free */
trace_qgroup_meta_free_all_pertrans(root);
/* Special value -1 means to free all reserved space */
- btrfs_qgroup_free_refroot(fs_info, root->objectid, (u64)-1,
+ btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid, (u64)-1,
BTRFS_QGROUP_RSV_META_PERTRANS);
}
@@ -3257,7 +3676,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
struct btrfs_fs_info *fs_info = root->fs_info;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
- !is_fstree(root->objectid))
+ !is_fstree(root->root_key.objectid))
return;
/*
@@ -3268,7 +3687,8 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,
num_bytes = sub_root_meta_rsv(root, num_bytes, type);
BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize));
trace_qgroup_meta_reserve(root, type, -(s64)num_bytes);
- btrfs_qgroup_free_refroot(fs_info, root->objectid, num_bytes, type);
+ btrfs_qgroup_free_refroot(fs_info, root->root_key.objectid,
+ num_bytes, type);
}
static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root,
@@ -3322,13 +3742,13 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
struct btrfs_fs_info *fs_info = root->fs_info;
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) ||
- !is_fstree(root->objectid))
+ !is_fstree(root->root_key.objectid))
return;
/* Same as btrfs_qgroup_free_meta_prealloc() */
num_bytes = sub_root_meta_rsv(root, num_bytes,
BTRFS_QGROUP_RSV_META_PREALLOC);
trace_qgroup_meta_convert(root, num_bytes);
- qgroup_convert_meta(fs_info, root->objectid, num_bytes);
+ qgroup_convert_meta(fs_info, root->root_key.objectid, num_bytes);
}
/*
@@ -3355,7 +3775,7 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode)
inode->i_ino, unode->val, unode->aux);
}
btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
- BTRFS_I(inode)->root->objectid,
+ BTRFS_I(inode)->root->root_key.objectid,
changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 54b8bb282c0e..d8f78f5ab854 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -236,6 +236,12 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
struct extent_buffer *root_eb,
u64 root_gen, int root_level);
+
+int btrfs_qgroup_trace_subtree_swap(struct btrfs_trans_handle *trans,
+ struct btrfs_block_group_cache *bg_cache,
+ struct extent_buffer *src_parent, int src_slot,
+ struct extent_buffer *dst_parent, int dst_slot,
+ u64 last_snapshot);
int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr,
u64 num_bytes, struct ulist *old_roots,
struct ulist *new_roots);
@@ -249,6 +255,8 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info,
static inline void btrfs_qgroup_free_delayed_ref(struct btrfs_fs_info *fs_info,
u64 ref_root, u64 num_bytes)
{
+ if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags))
+ return;
trace_btrfs_qgroup_free_delayed_ref(fs_info, ref_root, num_bytes);
btrfs_qgroup_free_refroot(fs_info, ref_root, num_bytes,
BTRFS_QGROUP_RSV_DATA);
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index e5b9e596bb92..d69fbfb30aa9 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -732,7 +732,7 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
INIT_LIST_HEAD(&ra->list);
ra->action = action;
- ra->root = root->objectid;
+ ra->root = root->root_key.objectid;
/*
* This is an allocation, preallocate the block_entry in case we haven't
@@ -787,8 +787,8 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
* one we want to lookup below when we modify the
* re->num_refs.
*/
- ref_root = root->objectid;
- re->root_objectid = root->objectid;
+ ref_root = root->root_key.objectid;
+ re->root_objectid = root->root_key.objectid;
re->num_refs = 0;
}
@@ -862,7 +862,7 @@ int btrfs_ref_tree_mod(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
* didn't thik of some other corner case.
*/
btrfs_err(fs_info, "failed to find root %llu for %llu",
- root->objectid, be->bytenr);
+ root->root_key.objectid, be->bytenr);
dump_block_entry(fs_info, be);
dump_ref_action(fs_info, ra);
kfree(ra);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 8783a1776540..924116f654a1 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -648,8 +648,8 @@ struct backref_node *build_backref_tree(struct reloc_control *rc,
int level, u64 bytenr)
{
struct backref_cache *cache = &rc->backref_cache;
- struct btrfs_path *path1;
- struct btrfs_path *path2;
+ struct btrfs_path *path1; /* For searching extent root */
+ struct btrfs_path *path2; /* For searching parent of TREE_BLOCK_REF */
struct extent_buffer *eb;
struct btrfs_root *root;
struct backref_node *cur;
@@ -662,7 +662,7 @@ struct backref_node *build_backref_tree(struct reloc_control *rc,
struct btrfs_key key;
unsigned long end;
unsigned long ptr;
- LIST_HEAD(list);
+ LIST_HEAD(list); /* Pending edge list, upper node needs to be checked */
LIST_HEAD(useless);
int cowonly;
int ret;
@@ -778,6 +778,10 @@ again:
key.type != BTRFS_SHARED_BLOCK_REF_KEY);
}
+ /*
+ * Parent node found and matches current inline ref, no need to
+ * rebuild this node for this inline ref.
+ */
if (exist &&
((key.type == BTRFS_TREE_BLOCK_REF_KEY &&
exist->owner == key.offset) ||
@@ -787,11 +791,12 @@ again:
goto next;
}
+ /* SHARED_BLOCK_REF means key.offset is the parent bytenr */
if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) {
if (key.objectid == key.offset) {
/*
- * only root blocks of reloc trees use
- * backref of this type.
+ * Only root blocks of reloc trees use backref
+ * pointing to itself.
*/
root = find_reloc_root(rc, cur->bytenr);
ASSERT(root);
@@ -840,7 +845,11 @@ again:
goto next;
}
- /* key.type == BTRFS_TREE_BLOCK_REF_KEY */
+ /*
+ * key.type == BTRFS_TREE_BLOCK_REF_KEY, inline ref offset
+ * means the root objectid. We need to search the tree to get
+ * its parent bytenr.
+ */
root = read_fs_root(rc->extent_root->fs_info, key.offset);
if (IS_ERR(root)) {
err = PTR_ERR(root);
@@ -863,10 +872,7 @@ again:
level = cur->level + 1;
- /*
- * searching the tree to find upper level blocks
- * reference the block.
- */
+ /* Search the tree to find parent blocks referring the block. */
path2->search_commit_root = 1;
path2->skip_locking = 1;
path2->lowest_level = level;
@@ -884,7 +890,8 @@ again:
cur->bytenr) {
btrfs_err(root->fs_info,
"couldn't find block (%llu) (level %d) in tree (%llu) with key (%llu %u %llu)",
- cur->bytenr, level - 1, root->objectid,
+ cur->bytenr, level - 1,
+ root->root_key.objectid,
node_key->objectid, node_key->type,
node_key->offset);
err = -ENOENT;
@@ -892,6 +899,8 @@ again:
}
lower = cur;
need_check = true;
+
+ /* Add all nodes and edges in the path */
for (; level < BTRFS_MAX_LEVEL; level++) {
if (!path2->nodes[level]) {
ASSERT(btrfs_root_bytenr(&root->root_item) ==
@@ -1281,7 +1290,7 @@ static void __del_reloc_root(struct btrfs_root *root)
struct mapping_node *node = NULL;
struct reloc_control *rc = fs_info->reloc_ctl;
- if (rc) {
+ if (rc && root->node) {
spin_lock(&rc->reloc_root_tree.lock);
rb_node = tree_search(&rc->reloc_root_tree.rb_root,
root->node->start);
@@ -1735,7 +1744,7 @@ int memcmp_node_keys(struct extent_buffer *eb, int slot,
* errors, a negative error number is returned.
*/
static noinline_for_stack
-int replace_path(struct btrfs_trans_handle *trans,
+int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
struct btrfs_root *dest, struct btrfs_root *src,
struct btrfs_path *path, struct btrfs_key *next_key,
int lowest_level, int max_level)
@@ -1879,14 +1888,9 @@ again:
* and tree block numbers, if current trans doesn't free
* data reloc tree inode.
*/
- ret = btrfs_qgroup_trace_subtree(trans, parent,
- btrfs_header_generation(parent),
- btrfs_header_level(parent));
- if (ret < 0)
- break;
- ret = btrfs_qgroup_trace_subtree(trans, path->nodes[level],
- btrfs_header_generation(path->nodes[level]),
- btrfs_header_level(path->nodes[level]));
+ ret = btrfs_qgroup_trace_subtree_swap(trans, rc->block_group,
+ parent, slot, path->nodes[level],
+ path->slots[level], last_snapshot);
if (ret < 0)
break;
@@ -2205,7 +2209,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
ret = 0;
} else {
- ret = replace_path(trans, root, reloc_root, path,
+ ret = replace_path(trans, rc, root, reloc_root, path,
&next_key, level, max_level);
}
if (ret < 0) {
@@ -2911,7 +2915,6 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info,
free_extent_buffer(eb);
return -EIO;
}
- WARN_ON(btrfs_header_level(eb) != block->level);
if (block->level == 0)
btrfs_item_key_to_cpu(eb, &block->key, 0);
else
@@ -2987,7 +2990,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
struct backref_node *node;
struct btrfs_path *path;
struct tree_block *block;
- struct rb_node *rb_node;
+ struct tree_block *next;
int ret;
int err = 0;
@@ -2997,29 +3000,23 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
goto out_free_blocks;
}
- rb_node = rb_first(blocks);
- while (rb_node) {
- block = rb_entry(rb_node, struct tree_block, rb_node);
+ /* Kick in readahead for tree blocks with missing keys */
+ rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
if (!block->key_ready)
readahead_tree_block(fs_info, block->bytenr);
- rb_node = rb_next(rb_node);
}
- rb_node = rb_first(blocks);
- while (rb_node) {
- block = rb_entry(rb_node, struct tree_block, rb_node);
+ /* Get first keys */
+ rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
if (!block->key_ready) {
err = get_tree_block_key(fs_info, block);
if (err)
goto out_free_path;
}
- rb_node = rb_next(rb_node);
}
- rb_node = rb_first(blocks);
- while (rb_node) {
- block = rb_entry(rb_node, struct tree_block, rb_node);
-
+ /* Do tree relocation */
+ rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
node = build_backref_tree(rc, &block->key,
block->level, block->bytenr);
if (IS_ERR(node)) {
@@ -3030,11 +3027,10 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
ret = relocate_tree_block(trans, rc, node, &block->key,
path);
if (ret < 0) {
- if (ret != -EAGAIN || rb_node == rb_first(blocks))
+ if (ret != -EAGAIN || &block->rb_node == rb_first(blocks))
err = ret;
goto out;
}
- rb_node = rb_next(rb_node);
}
out:
err = finish_pending_nodes(trans, rc, path, err);
@@ -4669,7 +4665,7 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
if (rc->merge_reloc_tree) {
ret = btrfs_block_rsv_migrate(&pending->block_rsv,
rc->block_rsv,
- rc->nodes_relocated, 1);
+ rc->nodes_relocated, true);
if (ret)
return ret;
}
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 3be1456b5116..902819d3cf41 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1124,7 +1124,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
if (scrub_write_page_to_dev_replace(sblock_other,
page_num) != 0) {
- btrfs_dev_replace_stats_inc(
+ atomic64_inc(
&fs_info->dev_replace.num_write_errors);
success = 0;
}
@@ -1564,8 +1564,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
if (btrfsic_submit_bio_wait(bio)) {
btrfs_dev_stat_inc_and_print(page_bad->dev,
BTRFS_DEV_STAT_WRITE_ERRS);
- btrfs_dev_replace_stats_inc(
- &fs_info->dev_replace.num_write_errors);
+ atomic64_inc(&fs_info->dev_replace.num_write_errors);
bio_put(bio);
return -EIO;
}
@@ -1592,8 +1591,7 @@ static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
ret = scrub_write_page_to_dev_replace(sblock, page_num);
if (ret)
- btrfs_dev_replace_stats_inc(
- &fs_info->dev_replace.num_write_errors);
+ atomic64_inc(&fs_info->dev_replace.num_write_errors);
}
}
@@ -1726,8 +1724,7 @@ static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
struct scrub_page *spage = sbio->pagev[i];
spage->io_error = 1;
- btrfs_dev_replace_stats_inc(&dev_replace->
- num_write_errors);
+ atomic64_inc(&dev_replace->num_write_errors);
}
}
@@ -3022,8 +3019,7 @@ out:
static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
struct map_lookup *map,
struct btrfs_device *scrub_dev,
- int num, u64 base, u64 length,
- int is_dev_replace)
+ int num, u64 base, u64 length)
{
struct btrfs_path *path, *ppath;
struct btrfs_fs_info *fs_info = sctx->fs_info;
@@ -3299,7 +3295,7 @@ again:
extent_physical = extent_logical - logical + physical;
extent_dev = scrub_dev;
extent_mirror_num = mirror_num;
- if (is_dev_replace)
+ if (sctx->is_dev_replace)
scrub_remap_extent(fs_info, extent_logical,
extent_len, &extent_physical,
&extent_dev,
@@ -3397,8 +3393,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
struct btrfs_device *scrub_dev,
u64 chunk_offset, u64 length,
u64 dev_offset,
- struct btrfs_block_group_cache *cache,
- int is_dev_replace)
+ struct btrfs_block_group_cache *cache)
{
struct btrfs_fs_info *fs_info = sctx->fs_info;
struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
@@ -3435,8 +3430,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
map->stripes[i].physical == dev_offset) {
ret = scrub_stripe(sctx, map, scrub_dev, i,
- chunk_offset, length,
- is_dev_replace);
+ chunk_offset, length);
if (ret)
goto out;
}
@@ -3449,8 +3443,7 @@ out:
static noinline_for_stack
int scrub_enumerate_chunks(struct scrub_ctx *sctx,
- struct btrfs_device *scrub_dev, u64 start, u64 end,
- int is_dev_replace)
+ struct btrfs_device *scrub_dev, u64 start, u64 end)
{
struct btrfs_dev_extent *dev_extent = NULL;
struct btrfs_path *path;
@@ -3544,7 +3537,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
*/
scrub_pause_on(fs_info);
ret = btrfs_inc_block_group_ro(cache);
- if (!ret && is_dev_replace) {
+ if (!ret && sctx->is_dev_replace) {
/*
* If we are doing a device replace wait for any tasks
* that started dellaloc right before we set the block
@@ -3609,7 +3602,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
dev_replace->item_needs_writeback = 1;
btrfs_dev_replace_write_unlock(&fs_info->dev_replace);
ret = scrub_chunk(sctx, scrub_dev, chunk_offset, length,
- found_key.offset, cache, is_dev_replace);
+ found_key.offset, cache);
/*
* flush, submit all pending read and write bios, afterwards
@@ -3670,7 +3663,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
btrfs_put_block_group(cache);
if (ret)
break;
- if (is_dev_replace &&
+ if (sctx->is_dev_replace &&
atomic64_read(&dev_replace->num_write_errors) > 0) {
ret = -EIO;
break;
@@ -3893,8 +3886,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
}
if (!ret)
- ret = scrub_enumerate_chunks(sctx, dev, start, end,
- is_dev_replace);
+ ret = scrub_enumerate_chunks(sctx, dev, start, end);
wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
atomic_dec(&fs_info->scrubs_running);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index ba8950bfd9c7..094cc1444a90 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1186,9 +1186,9 @@ static int __clone_root_cmp_bsearch(const void *key, const void *elt)
u64 root = (u64)(uintptr_t)key;
struct clone_root *cr = (struct clone_root *)elt;
- if (root < cr->root->objectid)
+ if (root < cr->root->root_key.objectid)
return -1;
- if (root > cr->root->objectid)
+ if (root > cr->root->root_key.objectid)
return 1;
return 0;
}
@@ -1198,9 +1198,9 @@ static int __clone_root_cmp_sort(const void *e1, const void *e2)
struct clone_root *cr1 = (struct clone_root *)e1;
struct clone_root *cr2 = (struct clone_root *)e2;
- if (cr1->root->objectid < cr2->root->objectid)
+ if (cr1->root->root_key.objectid < cr2->root->root_key.objectid)
return -1;
- if (cr1->root->objectid > cr2->root->objectid)
+ if (cr1->root->root_key.objectid > cr2->root->root_key.objectid)
return 1;
return 0;
}
@@ -1693,12 +1693,8 @@ static int lookup_dir_item_inode(struct btrfs_root *root,
di = btrfs_lookup_dir_item(NULL, root, path,
dir, name, name_len, 0);
- if (!di) {
- ret = -ENOENT;
- goto out;
- }
- if (IS_ERR(di)) {
- ret = PTR_ERR(di);
+ if (IS_ERR_OR_NULL(di)) {
+ ret = di ? PTR_ERR(di) : -ENOENT;
goto out;
}
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
@@ -2346,7 +2342,7 @@ static int send_subvol_begin(struct send_ctx *sctx)
return -ENOMEM;
}
- key.objectid = send_root->objectid;
+ key.objectid = send_root->root_key.objectid;
key.type = BTRFS_ROOT_BACKREF_KEY;
key.offset = 0;
@@ -2362,7 +2358,7 @@ static int send_subvol_begin(struct send_ctx *sctx)
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.type != BTRFS_ROOT_BACKREF_KEY ||
- key.objectid != send_root->objectid) {
+ key.objectid != send_root->root_key.objectid) {
ret = -ENOENT;
goto out;
}
@@ -4907,8 +4903,8 @@ static int send_clone(struct send_ctx *sctx,
btrfs_debug(sctx->send_root->fs_info,
"send_clone offset=%llu, len=%d, clone_root=%llu, clone_inode=%llu, clone_offset=%llu",
- offset, len, clone_root->root->objectid, clone_root->ino,
- clone_root->offset);
+ offset, len, clone_root->root->root_key.objectid,
+ clone_root->ino, clone_root->offset);
p = fs_path_alloc();
if (!p)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 6601c9aa5e35..b362b45dd757 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -2177,8 +2177,10 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]);
buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]);
/* Mask in the root object ID too, to disambiguate subvols */
- buf->f_fsid.val[0] ^= BTRFS_I(d_inode(dentry))->root->objectid >> 32;
- buf->f_fsid.val[1] ^= BTRFS_I(d_inode(dentry))->root->objectid;
+ buf->f_fsid.val[0] ^=
+ BTRFS_I(d_inode(dentry))->root->root_key.objectid >> 32;
+ buf->f_fsid.val[1] ^=
+ BTRFS_I(d_inode(dentry))->root->root_key.objectid;
return 0;
}
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index d9269a531a4d..9e0f4a01be14 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -106,7 +106,7 @@ static int test_find_delalloc(u32 sectorsize)
set_extent_delalloc(&tmp, 0, sectorsize - 1, 0, NULL);
start = 0;
end = 0;
- found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start,
&end, max_bytes);
if (!found) {
test_err("should have found at least one delalloc");
@@ -137,7 +137,7 @@ static int test_find_delalloc(u32 sectorsize)
set_extent_delalloc(&tmp, sectorsize, max_bytes - 1, 0, NULL);
start = test_start;
end = 0;
- found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start,
&end, max_bytes);
if (!found) {
test_err("couldn't find delalloc in our range");
@@ -171,7 +171,7 @@ static int test_find_delalloc(u32 sectorsize)
}
start = test_start;
end = 0;
- found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start,
&end, max_bytes);
if (found) {
test_err("found range when we shouldn't have");
@@ -192,7 +192,7 @@ static int test_find_delalloc(u32 sectorsize)
set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, 0, NULL);
start = test_start;
end = 0;
- found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start,
&end, max_bytes);
if (!found) {
test_err("didn't find our range");
@@ -233,7 +233,7 @@ static int test_find_delalloc(u32 sectorsize)
* this changes at any point in the future we will need to fix this
* tests expected behavior.
*/
- found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
+ found = btrfs_find_lock_delalloc_range(inode, &tmp, locked_page, &start,
&end, max_bytes);
if (!found) {
test_err("didn't find our range");
diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c
index 385a5316e4bf..bf15d3a7f20e 100644
--- a/fs/btrfs/tests/extent-map-tests.c
+++ b/fs/btrfs/tests/extent-map-tests.c
@@ -12,8 +12,8 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree)
struct extent_map *em;
struct rb_node *node;
- while (!RB_EMPTY_ROOT(&em_tree->map)) {
- node = rb_first(&em_tree->map);
+ while (!RB_EMPTY_ROOT(&em_tree->map.rb_root)) {
+ node = rb_first_cached(&em_tree->map);
em = rb_entry(node, struct extent_map, rb_node);
remove_extent_mapping(em_tree, em);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 3b84f5015029..5686290a50e1 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -44,7 +44,8 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
WARN_ON(refcount_read(&transaction->use_count) == 0);
if (refcount_dec_and_test(&transaction->use_count)) {
BUG_ON(!list_empty(&transaction->list));
- WARN_ON(!RB_EMPTY_ROOT(&transaction->delayed_refs.href_root));
+ WARN_ON(!RB_EMPTY_ROOT(
+ &transaction->delayed_refs.href_root.rb_root));
if (transaction->delayed_refs.pending_csums)
btrfs_err(transaction->fs_info,
"pending csums is %llu",
@@ -118,7 +119,7 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans)
list_del_init(&root->dirty_list);
free_extent_buffer(root->commit_root);
root->commit_root = btrfs_root_node(root);
- if (is_fstree(root->objectid))
+ if (is_fstree(root->root_key.objectid))
btrfs_unpin_free_ino(root);
clear_btree_io_tree(&root->dirty_log_pages);
}
@@ -245,7 +246,7 @@ loop:
memset(&cur_trans->delayed_refs, 0, sizeof(cur_trans->delayed_refs));
- cur_trans->delayed_refs.href_root = RB_ROOT;
+ cur_trans->delayed_refs.href_root = RB_ROOT_CACHED;
cur_trans->delayed_refs.dirty_extent_root = RB_ROOT;
atomic_set(&cur_trans->delayed_refs.num_entries, 0);
@@ -759,7 +760,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- if (btrfs_check_space_for_delayed_refs(trans, fs_info))
+ if (btrfs_check_space_for_delayed_refs(trans))
return 1;
return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5);
@@ -834,7 +835,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
trans->delayed_ref_updates = 0;
if (!trans->sync) {
must_run_delayed_refs =
- btrfs_should_throttle_delayed_refs(trans, info);
+ btrfs_should_throttle_delayed_refs(trans);
cur = max_t(unsigned long, cur, 32);
/*
@@ -1197,7 +1198,10 @@ again:
list_add_tail(&fs_info->extent_root->dirty_list,
&trans->transaction->switch_commits);
- btrfs_after_dev_replace_commit(fs_info);
+
+ /* Update dev-replace pointer once everything is committed */
+ fs_info->dev_replace.committed_cursor_left =
+ fs_info->dev_replace.cursor_left_last_write_of_item;
return 0;
}
@@ -1613,10 +1617,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
if (ret < 0)
goto fail;
- ret = btrfs_insert_dir_item(trans, parent_root,
- dentry->d_name.name, dentry->d_name.len,
- BTRFS_I(parent_inode), &key,
- BTRFS_FT_DIR, index);
+ ret = btrfs_insert_dir_item(trans, dentry->d_name.name,
+ dentry->d_name.len, BTRFS_I(parent_inode),
+ &key, BTRFS_FT_DIR, index);
/* We have check then name at the beginning, so it is impossible. */
BUG_ON(ret == -EEXIST || ret == -EOVERFLOW);
if (ret) {
@@ -1929,6 +1932,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
return ret;
}
+ btrfs_trans_release_metadata(trans);
+ trans->block_rsv = NULL;
+
/* make a pass through all the delayed refs we have so far
* any runnings procs may add more while we are here
*/
@@ -1938,9 +1944,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
return ret;
}
- btrfs_trans_release_metadata(trans);
- trans->block_rsv = NULL;
-
cur_trans = trans->transaction;
/*
@@ -2330,7 +2333,7 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
list_del_init(&root->root_list);
spin_unlock(&fs_info->trans_lock);
- btrfs_debug(fs_info, "cleaner removing %llu", root->objectid);
+ btrfs_debug(fs_info, "cleaner removing %llu", root->root_key.objectid);
btrfs_kill_all_delayed_nodes(root);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index db835635372f..cab0b1f1f741 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -487,6 +487,13 @@ static int check_leaf(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf,
u32 nritems = btrfs_header_nritems(leaf);
int slot;
+ if (btrfs_header_level(leaf) != 0) {
+ generic_err(fs_info, leaf, 0,
+ "invalid level for leaf, have %d expect 0",
+ btrfs_header_level(leaf));
+ return -EUCLEAN;
+ }
+
/*
* Extent buffers from a relocation tree have a owner field that
* corresponds to the subvolume tree they are based on. So just from an
@@ -645,9 +652,16 @@ int btrfs_check_node(struct btrfs_fs_info *fs_info, struct extent_buffer *node)
unsigned long nr = btrfs_header_nritems(node);
struct btrfs_key key, next_key;
int slot;
+ int level = btrfs_header_level(node);
u64 bytenr;
int ret = 0;
+ if (level <= 0 || level >= BTRFS_MAX_LEVEL) {
+ generic_err(fs_info, node, 0,
+ "invalid level for node, have %d expect [1, %d]",
+ level, BTRFS_MAX_LEVEL - 1);
+ return -EUCLEAN;
+ }
if (nr == 0 || nr > BTRFS_NODEPTRS_PER_BLOCK(fs_info)) {
btrfs_crit(fs_info,
"corrupt node: root=%llu block=%llu, nritems too %s, have %lu expect range [1,%u]",
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 1650dc44a5e3..0dba09334a16 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -205,14 +205,11 @@ static int join_running_log_trans(struct btrfs_root *root)
* until you call btrfs_end_log_trans() or it makes any future
* log transactions wait until you call btrfs_end_log_trans()
*/
-int btrfs_pin_log_trans(struct btrfs_root *root)
+void btrfs_pin_log_trans(struct btrfs_root *root)
{
- int ret = -ENOENT;
-
mutex_lock(&root->log_mutex);
atomic_inc(&root->log_writers);
mutex_unlock(&root->log_mutex);
- return ret;
}
/*
@@ -258,6 +255,13 @@ struct walk_control {
/* what stage of the replay code we're currently in */
int stage;
+ /*
+ * Ignore any items from the inode currently being processed. Needs
+ * to be set every time we find a BTRFS_INODE_ITEM_KEY and we are in
+ * the LOG_WALK_REPLAY_INODES stage.
+ */
+ bool ignore_cur_inode;
+
/* the root we are currently replaying */
struct btrfs_root *replay_dest;
@@ -2487,6 +2491,20 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
inode_item = btrfs_item_ptr(eb, i,
struct btrfs_inode_item);
+ /*
+ * If we have a tmpfile (O_TMPFILE) that got fsync'ed
+ * and never got linked before the fsync, skip it, as
+ * replaying it is pointless since it would be deleted
+ * later. We skip logging tmpfiles, but it's always
+ * possible we are replaying a log created with a kernel
+ * that used to log tmpfiles.
+ */
+ if (btrfs_inode_nlink(eb, inode_item) == 0) {
+ wc->ignore_cur_inode = true;
+ continue;
+ } else {
+ wc->ignore_cur_inode = false;
+ }
ret = replay_xattr_deletes(wc->trans, root, log,
path, key.objectid);
if (ret)
@@ -2524,16 +2542,8 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
root->fs_info->sectorsize);
ret = btrfs_drop_extents(wc->trans, root, inode,
from, (u64)-1, 1);
- /*
- * If the nlink count is zero here, the iput
- * will free the inode. We bump it to make
- * sure it doesn't get freed until the link
- * count fixup is done.
- */
if (!ret) {
- if (inode->i_nlink == 0)
- inc_nlink(inode);
- /* Update link count and nbytes. */
+ /* Update the inode's nbytes. */
ret = btrfs_update_inode(wc->trans,
root, inode);
}
@@ -2548,6 +2558,9 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
break;
}
+ if (wc->ignore_cur_inode)
+ continue;
+
if (key.type == BTRFS_DIR_INDEX_KEY &&
wc->stage == LOG_WALK_REPLAY_DIR_INDEX) {
ret = replay_one_dir_item(wc->trans, root, path,
@@ -3196,9 +3209,12 @@ static void free_log_tree(struct btrfs_trans_handle *trans,
};
ret = walk_log_tree(trans, log, &wc);
- /* I don't think this can happen but just in case */
- if (ret)
- btrfs_abort_transaction(trans, ret);
+ if (ret) {
+ if (trans)
+ btrfs_abort_transaction(trans, ret);
+ else
+ btrfs_handle_fs_error(log->fs_info, ret, NULL);
+ }
while (1) {
ret = find_first_extent_bit(&log->dirty_log_pages,
@@ -5564,9 +5580,33 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
dir_inode = btrfs_iget(fs_info->sb, &inode_key,
root, NULL);
- /* If parent inode was deleted, skip it. */
- if (IS_ERR(dir_inode))
- continue;
+ /*
+ * If the parent inode was deleted, return an error to
+ * fallback to a transaction commit. This is to prevent
+ * getting an inode that was moved from one parent A to
+ * a parent B, got its former parent A deleted and then
+ * it got fsync'ed, from existing at both parents after
+ * a log replay (and the old parent still existing).
+ * Example:
+ *
+ * mkdir /mnt/A
+ * mkdir /mnt/B
+ * touch /mnt/B/bar
+ * sync
+ * mv /mnt/B/bar /mnt/A/bar
+ * mv -T /mnt/A /mnt/B
+ * fsync /mnt/B/bar
+ * <power fail>
+ *
+ * If we ignore the old parent B which got deleted,
+ * after a log replay we would have file bar linked
+ * at both parents and the old parent B would still
+ * exist.
+ */
+ if (IS_ERR(dir_inode)) {
+ ret = PTR_ERR(dir_inode);
+ goto out;
+ }
if (ctx)
ctx->log_new_dentries = false;
@@ -5640,7 +5680,13 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
if (ret)
goto end_no_trans;
- if (btrfs_inode_in_log(inode, trans->transid)) {
+ /*
+ * Skip already logged inodes or inodes corresponding to tmpfiles
+ * (since logging them is pointless, a link count of 0 means they
+ * will never be accessible).
+ */
+ if (btrfs_inode_in_log(inode, trans->transid) ||
+ inode->vfs_inode.i_nlink == 0) {
ret = BTRFS_NO_LOG_SYNC;
goto end_no_trans;
}
@@ -6025,14 +6071,25 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
* Call this after adding a new name for a file and it will properly
* update the log to reflect the new name.
*
- * It will return zero if all goes well, and it will return 1 if a
- * full transaction commit is required.
+ * @ctx can not be NULL when @sync_log is false, and should be NULL when it's
+ * true (because it's not used).
+ *
+ * Return value depends on whether @sync_log is true or false.
+ * When true: returns BTRFS_NEED_TRANS_COMMIT if the transaction needs to be
+ * committed by the caller, and BTRFS_DONT_NEED_TRANS_COMMIT
+ * otherwise.
+ * When false: returns BTRFS_DONT_NEED_LOG_SYNC if the caller does not need to
+ * to sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the log,
+ * or BTRFS_NEED_TRANS_COMMIT if the transaction needs to be
+ * committed (without attempting to sync the log).
*/
int btrfs_log_new_name(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, struct btrfs_inode *old_dir,
- struct dentry *parent)
+ struct dentry *parent,
+ bool sync_log, struct btrfs_log_ctx *ctx)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
+ int ret;
/*
* this will force the logging code to walk the dentry chain
@@ -6047,9 +6104,34 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
*/
if (inode->logged_trans <= fs_info->last_trans_committed &&
(!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed))
- return 0;
+ return sync_log ? BTRFS_DONT_NEED_TRANS_COMMIT :
+ BTRFS_DONT_NEED_LOG_SYNC;
+
+ if (sync_log) {
+ struct btrfs_log_ctx ctx2;
+
+ btrfs_init_log_ctx(&ctx2, &inode->vfs_inode);
+ ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
+ LOG_INODE_EXISTS, &ctx2);
+ if (ret == BTRFS_NO_LOG_SYNC)
+ return BTRFS_DONT_NEED_TRANS_COMMIT;
+ else if (ret)
+ return BTRFS_NEED_TRANS_COMMIT;
+
+ ret = btrfs_sync_log(trans, inode->root, &ctx2);
+ if (ret)
+ return BTRFS_NEED_TRANS_COMMIT;
+ return BTRFS_DONT_NEED_TRANS_COMMIT;
+ }
+
+ ASSERT(ctx);
+ ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
+ LOG_INODE_EXISTS, ctx);
+ if (ret == BTRFS_NO_LOG_SYNC)
+ return BTRFS_DONT_NEED_LOG_SYNC;
+ else if (ret)
+ return BTRFS_NEED_TRANS_COMMIT;
- return btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
- LOG_INODE_EXISTS, NULL);
+ return BTRFS_NEED_LOG_SYNC;
}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 122e68b89a5a..767765031e59 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -65,14 +65,22 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
const char *name, int name_len,
struct btrfs_inode *inode, u64 dirid);
void btrfs_end_log_trans(struct btrfs_root *root);
-int btrfs_pin_log_trans(struct btrfs_root *root);
+void btrfs_pin_log_trans(struct btrfs_root *root);
void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir, struct btrfs_inode *inode,
int for_rename);
void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir);
+/* Return values for btrfs_log_new_name() */
+enum {
+ BTRFS_DONT_NEED_TRANS_COMMIT,
+ BTRFS_NEED_TRANS_COMMIT,
+ BTRFS_DONT_NEED_LOG_SYNC,
+ BTRFS_NEED_LOG_SYNC,
+};
int btrfs_log_new_name(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, struct btrfs_inode *old_dir,
- struct dentry *parent);
+ struct dentry *parent,
+ bool sync_log, struct btrfs_log_ctx *ctx);
#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index da86706123ff..f435d397019e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1613,7 +1613,7 @@ static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
em_tree = &fs_info->mapping_tree.map_tree;
read_lock(&em_tree->lock);
- n = rb_last(&em_tree->map);
+ n = rb_last(&em_tree->map.rb_root);
if (n) {
em = rb_entry(n, struct extent_map, rb_node);
ret = em->start + em->len;
@@ -1854,6 +1854,24 @@ void btrfs_assign_next_active_device(struct btrfs_device *device,
fs_info->fs_devices->latest_bdev = next_device->bdev;
}
+/*
+ * Return btrfs_fs_devices::num_devices excluding the device that's being
+ * currently replaced.
+ */
+static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
+{
+ u64 num_devices = fs_info->fs_devices->num_devices;
+
+ btrfs_dev_replace_read_lock(&fs_info->dev_replace);
+ if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
+ ASSERT(num_devices > 1);
+ num_devices--;
+ }
+ btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
+
+ return num_devices;
+}
+
int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
u64 devid)
{
@@ -1865,22 +1883,22 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
mutex_lock(&uuid_mutex);
- num_devices = fs_devices->num_devices;
- btrfs_dev_replace_read_lock(&fs_info->dev_replace);
- if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
- WARN_ON(num_devices < 1);
- num_devices--;
- }
- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
+ num_devices = btrfs_num_devices(fs_info);
ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
if (ret)
goto out;
- ret = btrfs_find_device_by_devspec(fs_info, devid, device_path,
- &device);
- if (ret)
+ device = btrfs_find_device_by_devspec(fs_info, devid, device_path);
+
+ if (IS_ERR(device)) {
+ if (PTR_ERR(device) == -ENOENT &&
+ strcmp(device_path, "missing") == 0)
+ ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
+ else
+ ret = PTR_ERR(device);
goto out;
+ }
if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
ret = BTRFS_ERROR_DEV_TGT_REPLACE;
@@ -2096,9 +2114,8 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
call_rcu(&tgtdev->rcu, free_device_rcu);
}
-static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info,
- const char *device_path,
- struct btrfs_device **device)
+static struct btrfs_device *btrfs_find_device_by_path(
+ struct btrfs_fs_info *fs_info, const char *device_path)
{
int ret = 0;
struct btrfs_super_block *disk_super;
@@ -2106,28 +2123,27 @@ static int btrfs_find_device_by_path(struct btrfs_fs_info *fs_info,
u8 *dev_uuid;
struct block_device *bdev;
struct buffer_head *bh;
+ struct btrfs_device *device;
- *device = NULL;
ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
fs_info->bdev_holder, 0, &bdev, &bh);
if (ret)
- return ret;
+ return ERR_PTR(ret);
disk_super = (struct btrfs_super_block *)bh->b_data;
devid = btrfs_stack_device_id(&disk_super->dev_item);
dev_uuid = disk_super->dev_item.uuid;
- *device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid);
+ device = btrfs_find_device(fs_info, devid, dev_uuid, disk_super->fsid);
brelse(bh);
- if (!*device)
- ret = -ENOENT;
+ if (!device)
+ device = ERR_PTR(-ENOENT);
blkdev_put(bdev, FMODE_READ);
- return ret;
+ return device;
}
-int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
- const char *device_path,
- struct btrfs_device **device)
+static struct btrfs_device *btrfs_find_device_missing_or_by_path(
+ struct btrfs_fs_info *fs_info, const char *device_path)
{
- *device = NULL;
+ struct btrfs_device *device = NULL;
if (strcmp(device_path, "missing") == 0) {
struct list_head *devices;
struct btrfs_device *tmp;
@@ -2136,42 +2152,38 @@ int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
list_for_each_entry(tmp, devices, dev_list) {
if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
&tmp->dev_state) && !tmp->bdev) {
- *device = tmp;
+ device = tmp;
break;
}
}
- if (!*device)
- return BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
-
- return 0;
+ if (!device)
+ return ERR_PTR(-ENOENT);
} else {
- return btrfs_find_device_by_path(fs_info, device_path, device);
+ device = btrfs_find_device_by_path(fs_info, device_path);
}
+
+ return device;
}
/*
* Lookup a device given by device id, or the path if the id is 0.
*/
-int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid,
- const char *devpath,
- struct btrfs_device **device)
+struct btrfs_device *btrfs_find_device_by_devspec(
+ struct btrfs_fs_info *fs_info, u64 devid, const char *devpath)
{
- int ret;
+ struct btrfs_device *device;
if (devid) {
- ret = 0;
- *device = btrfs_find_device(fs_info, devid, NULL, NULL);
- if (!*device)
- ret = -ENOENT;
+ device = btrfs_find_device(fs_info, devid, NULL, NULL);
+ if (!device)
+ return ERR_PTR(-ENOENT);
} else {
if (!devpath || !devpath[0])
- return -EINVAL;
-
- ret = btrfs_find_device_missing_or_by_path(fs_info, devpath,
- device);
+ return ERR_PTR(-EINVAL);
+ device = btrfs_find_device_missing_or_by_path(fs_info, devpath);
}
- return ret;
+ return device;
}
/*
@@ -3679,7 +3691,7 @@ static int alloc_profile_is_valid(u64 flags, int extended)
return !extended; /* "0" is valid for usual profiles */
/* true if exactly one bit set */
- return (flags & (flags - 1)) == 0;
+ return is_power_of_2(flags);
}
static inline int balance_need_close(struct btrfs_fs_info *fs_info)
@@ -3740,13 +3752,8 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
}
}
- num_devices = fs_info->fs_devices->num_devices;
- btrfs_dev_replace_read_lock(&fs_info->dev_replace);
- if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
- BUG_ON(num_devices < 1);
- num_devices--;
- }
- btrfs_dev_replace_read_unlock(&fs_info->dev_replace);
+ num_devices = btrfs_num_devices(fs_info);
+
allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE | BTRFS_BLOCK_GROUP_DUP;
if (num_devices > 1)
allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1);
@@ -4491,7 +4498,12 @@ again:
/* Now btrfs_update_device() will change the on-disk size. */
ret = btrfs_update_device(trans, device);
- btrfs_end_transaction(trans);
+ if (ret < 0) {
+ btrfs_abort_transaction(trans, ret);
+ btrfs_end_transaction(trans);
+ } else {
+ ret = btrfs_commit_transaction(trans);
+ }
done:
btrfs_free_path(path);
if (ret) {
@@ -5892,7 +5904,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
}
out:
if (dev_replace_is_ongoing) {
- btrfs_dev_replace_clear_lock_blocking(dev_replace);
+ ASSERT(atomic_read(&dev_replace->blocking_readers) > 0);
+ btrfs_dev_replace_read_lock(dev_replace);
+ /* Barrier implied by atomic_dec_and_test */
+ if (atomic_dec_and_test(&dev_replace->blocking_readers))
+ cond_wake_up_nomb(&dev_replace->read_lock_wq);
btrfs_dev_replace_read_unlock(dev_replace);
}
free_extent_map(em);
@@ -7433,7 +7449,7 @@ static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
int ret = 0;
read_lock(&em_tree->lock);
- for (node = rb_first(&em_tree->map); node; node = rb_next(node)) {
+ for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
em = rb_entry(node, struct extent_map, rb_node);
if (em->map_lookup->num_stripes !=
em->map_lookup->verified_stripes) {
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 23e9285d88de..aefce895e994 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -410,12 +410,9 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices);
void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step);
void btrfs_assign_next_active_device(struct btrfs_device *device,
struct btrfs_device *this_dev);
-int btrfs_find_device_missing_or_by_path(struct btrfs_fs_info *fs_info,
- const char *device_path,
- struct btrfs_device **device);
-int btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info, u64 devid,
- const char *devpath,
- struct btrfs_device **device);
+struct btrfs_device *btrfs_find_device_by_devspec(struct btrfs_fs_info *fs_info,
+ u64 devid,
+ const char *devpath);
struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
const u64 *devid,
const u8 *uuid);
diff --git a/fs/buffer.c b/fs/buffer.c
index 6f1ae3ac9789..109f55196866 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -3060,11 +3060,6 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
*/
bio = bio_alloc(GFP_NOIO, 1);
- if (wbc) {
- wbc_init_bio(wbc, bio);
- wbc_account_io(wbc, bh->b_page, bh->b_size);
- }
-
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio_set_dev(bio, bh->b_bdev);
bio->bi_write_hint = write_hint;
@@ -3084,6 +3079,11 @@ static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
op_flags |= REQ_PRIO;
bio_set_op_attrs(bio, op, op_flags);
+ if (wbc) {
+ wbc_init_bio(wbc, bio);
+ wbc_account_io(wbc, bh->b_page, bh->b_size);
+ }
+
submit_bio(bio);
return 0;
}
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index af2b17b21b94..95983c744164 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -343,7 +343,7 @@ try_again:
trap = lock_rename(cache->graveyard, dir);
/* do some checks before getting the grave dentry */
- if (rep->d_parent != dir) {
+ if (rep->d_parent != dir || IS_DEADDIR(d_inode(rep))) {
/* the entry was probably culled when we dropped the parent dir
* lock */
unlock_rename(cache->graveyard, dir);
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 43ca3b763875..eab1359d0553 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -602,6 +602,8 @@ static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
/*
* create a new fs client
+ *
+ * Success or not, this function consumes @fsopt and @opt.
*/
static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
struct ceph_options *opt)
@@ -609,17 +611,20 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
struct ceph_fs_client *fsc;
int page_count;
size_t size;
- int err = -ENOMEM;
+ int err;
fsc = kzalloc(sizeof(*fsc), GFP_KERNEL);
- if (!fsc)
- return ERR_PTR(-ENOMEM);
+ if (!fsc) {
+ err = -ENOMEM;
+ goto fail;
+ }
fsc->client = ceph_create_client(opt, fsc);
if (IS_ERR(fsc->client)) {
err = PTR_ERR(fsc->client);
goto fail;
}
+ opt = NULL; /* fsc->client now owns this */
fsc->client->extra_mon_dispatch = extra_mon_dispatch;
fsc->client->osdc.abort_on_full = true;
@@ -677,6 +682,9 @@ fail_client:
ceph_destroy_client(fsc->client);
fail:
kfree(fsc);
+ if (opt)
+ ceph_destroy_options(opt);
+ destroy_mount_options(fsopt);
return ERR_PTR(err);
}
@@ -1042,8 +1050,6 @@ static struct dentry *ceph_mount(struct file_system_type *fs_type,
fsc = create_fs_client(fsopt, opt);
if (IS_ERR(fsc)) {
res = ERR_CAST(fsc);
- destroy_mount_options(fsopt);
- ceph_destroy_options(opt);
goto out_final;
}
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 35c83fe7dba0..abcd78e332fe 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -6,6 +6,7 @@ config CIFS
select CRYPTO_MD4
select CRYPTO_MD5
select CRYPTO_SHA256
+ select CRYPTO_SHA512
select CRYPTO_CMAC
select CRYPTO_HMAC
select CRYPTO_ARC4
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c
index b380e0871372..a2b2355e7f01 100644
--- a/fs/cifs/cifs_unicode.c
+++ b/fs/cifs/cifs_unicode.c
@@ -105,9 +105,6 @@ convert_sfm_char(const __u16 src_char, char *target)
case SFM_LESSTHAN:
*target = '<';
break;
- case SFM_SLASH:
- *target = '\\';
- break;
case SFM_SPACE:
*target = ' ';
break;
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 0c9ab62c3df4..9dcaed031843 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -1553,6 +1553,7 @@ static inline void free_dfs_info_array(struct dfs_info3_param *param,
/* Flags */
#define MID_WAIT_CANCELLED 1 /* Cancelled while waiting for response */
+#define MID_DELETED 2 /* Mid has been dequeued/deleted */
/* Types of response buffer returned from SendReceive2 */
#define CIFS_NO_BUFFER 0 /* Response buffer not returned */
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index dc2f4cf08fe9..5657b79dbc99 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -601,10 +601,15 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses)
}
count = 0;
+ /*
+ * We know that all the name entries in the protocols array
+ * are short (< 16 bytes anyway) and are NUL terminated.
+ */
for (i = 0; i < CIFS_NUM_PROT; i++) {
- strncpy(pSMB->DialectsArray+count, protocols[i].name, 16);
- count += strlen(protocols[i].name) + 1;
- /* null at end of source and target buffers anyway */
+ size_t len = strlen(protocols[i].name) + 1;
+
+ memcpy(pSMB->DialectsArray+count, protocols[i].name, len);
+ count += len;
}
inc_rfc1001_len(pSMB, count);
pSMB->ByteCount = cpu_to_le16(count);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index c832a8a1970a..52d71b64c0c6 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -659,7 +659,15 @@ dequeue_mid(struct mid_q_entry *mid, bool malformed)
mid->mid_state = MID_RESPONSE_RECEIVED;
else
mid->mid_state = MID_RESPONSE_MALFORMED;
- list_del_init(&mid->qhead);
+ /*
+ * Trying to handle/dequeue a mid after the send_recv()
+ * function has finished processing it is a bug.
+ */
+ if (mid->mid_flags & MID_DELETED)
+ printk_once(KERN_WARNING
+ "trying to dequeue a deleted mid\n");
+ else
+ list_del_init(&mid->qhead);
spin_unlock(&GlobalMid_Lock);
}
@@ -938,8 +946,7 @@ next_pdu:
} else {
mids[0] = server->ops->find_mid(server, buf);
bufs[0] = buf;
- if (mids[0])
- num_mids = 1;
+ num_mids = 1;
if (!mids[0] || !mids[0]->receive)
length = standard_receive3(server, mids[0]);
@@ -2547,7 +2554,7 @@ cifs_setup_ipc(struct cifs_ses *ses, struct smb_vol *volume_info)
if (tcon == NULL)
return -ENOMEM;
- snprintf(unc, sizeof(unc), "\\\\%s\\IPC$", ses->serverName);
+ snprintf(unc, sizeof(unc), "\\\\%s\\IPC$", ses->server->hostname);
/* cannot fail */
nls_codepage = load_nls_default();
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index d32eaa4b2437..6e8765f44508 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -467,6 +467,8 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path,
oparms.cifs_sb = cifs_sb;
oparms.desired_access = GENERIC_READ;
oparms.create_options = CREATE_NOT_DIR;
+ if (backup_cred(cifs_sb))
+ oparms.create_options |= CREATE_OPEN_BACKUP_INTENT;
oparms.disposition = FILE_OPEN;
oparms.path = path;
oparms.fid = &fid;
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index dacb2c05674c..6926685e513c 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -402,9 +402,17 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv)
(struct smb_com_transaction_change_notify_rsp *)buf;
struct file_notify_information *pnotify;
__u32 data_offset = 0;
+ size_t len = srv->total_read - sizeof(pSMBr->hdr.smb_buf_length);
+
if (get_bcc(buf) > sizeof(struct file_notify_information)) {
data_offset = le32_to_cpu(pSMBr->DataOffset);
+ if (data_offset >
+ len - sizeof(struct file_notify_information)) {
+ cifs_dbg(FYI, "invalid data_offset %u\n",
+ data_offset);
+ return true;
+ }
pnotify = (struct file_notify_information *)
((char *)&pSMBr->hdr.Protocol + data_offset);
cifs_dbg(FYI, "dnotify on %s Action: 0x%x\n",
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index eeab81c9452f..e169e1a5fd35 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -376,8 +376,15 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level)
new_entry = old_entry + sizeof(FIND_FILE_STANDARD_INFO) +
pfData->FileNameLength;
- } else
- new_entry = old_entry + le32_to_cpu(pDirInfo->NextEntryOffset);
+ } else {
+ u32 next_offset = le32_to_cpu(pDirInfo->NextEntryOffset);
+
+ if (old_entry + next_offset < old_entry) {
+ cifs_dbg(VFS, "invalid offset %u\n", next_offset);
+ return NULL;
+ }
+ new_entry = old_entry + next_offset;
+ }
cifs_dbg(FYI, "new entry %p old entry %p\n", new_entry, old_entry);
/* validate that new_entry is not past end of SMB */
if (new_entry >= end_of_smb) {
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index db0453660ff6..6a9c47541c53 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -248,16 +248,20 @@ smb2_check_message(char *buf, unsigned int len, struct TCP_Server_Info *srvr)
* MacOS server pads after SMB2.1 write response with 3 bytes
* of junk. Other servers match RFC1001 len to actual
* SMB2/SMB3 frame length (header + smb2 response specific data)
- * Some windows servers do too when compounding is used.
- * Log the server error (once), but allow it and continue
+ * Some windows servers also pad up to 8 bytes when compounding.
+ * If pad is longer than eight bytes, log the server behavior
+ * (once), since may indicate a problem but allow it and continue
* since the frame is parseable.
*/
if (clc_len < len) {
- printk_once(KERN_WARNING
- "SMB2 server sent bad RFC1001 len %d not %d\n",
- len, clc_len);
+ pr_warn_once(
+ "srv rsp padded more than expected. Length %d not %d for cmd:%d mid:%llu\n",
+ len, clc_len, command, mid);
return 0;
}
+ pr_warn_once(
+ "srv rsp too short, len %d not %d. cmd:%d mid:%llu\n",
+ len, clc_len, command, mid);
return 1;
}
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 247a98e6c856..89985a0a6819 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -630,7 +630,10 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
oparms.tcon = tcon;
oparms.desired_access = FILE_READ_ATTRIBUTES;
oparms.disposition = FILE_OPEN;
- oparms.create_options = 0;
+ if (backup_cred(cifs_sb))
+ oparms.create_options = CREATE_OPEN_BACKUP_INTENT;
+ else
+ oparms.create_options = 0;
oparms.fid = &fid;
oparms.reconnect = false;
@@ -779,7 +782,10 @@ smb2_query_eas(const unsigned int xid, struct cifs_tcon *tcon,
oparms.tcon = tcon;
oparms.desired_access = FILE_READ_EA;
oparms.disposition = FILE_OPEN;
- oparms.create_options = 0;
+ if (backup_cred(cifs_sb))
+ oparms.create_options = CREATE_OPEN_BACKUP_INTENT;
+ else
+ oparms.create_options = 0;
oparms.fid = &fid;
oparms.reconnect = false;
@@ -858,7 +864,10 @@ smb2_set_ea(const unsigned int xid, struct cifs_tcon *tcon,
oparms.tcon = tcon;
oparms.desired_access = FILE_WRITE_EA;
oparms.disposition = FILE_OPEN;
- oparms.create_options = 0;
+ if (backup_cred(cifs_sb))
+ oparms.create_options = CREATE_OPEN_BACKUP_INTENT;
+ else
+ oparms.create_options = 0;
oparms.fid = &fid;
oparms.reconnect = false;
@@ -1453,7 +1462,10 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
oparms.tcon = tcon;
oparms.desired_access = FILE_READ_ATTRIBUTES | FILE_READ_DATA;
oparms.disposition = FILE_OPEN;
- oparms.create_options = 0;
+ if (backup_cred(cifs_sb))
+ oparms.create_options = CREATE_OPEN_BACKUP_INTENT;
+ else
+ oparms.create_options = 0;
oparms.fid = fid;
oparms.reconnect = false;
@@ -1465,7 +1477,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
}
srch_inf->entries_in_buffer = 0;
- srch_inf->index_of_last_entry = 0;
+ srch_inf->index_of_last_entry = 2;
rc = SMB2_query_directory(xid, tcon, fid->persistent_fid,
fid->volatile_fid, 0, srch_inf);
@@ -1857,7 +1869,10 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
oparms.tcon = tcon;
oparms.desired_access = FILE_READ_ATTRIBUTES;
oparms.disposition = FILE_OPEN;
- oparms.create_options = 0;
+ if (backup_cred(cifs_sb))
+ oparms.create_options = CREATE_OPEN_BACKUP_INTENT;
+ else
+ oparms.create_options = 0;
oparms.fid = &fid;
oparms.reconnect = false;
@@ -3639,7 +3654,7 @@ struct smb_version_values smb21_values = {
struct smb_version_values smb3any_values = {
.version_string = SMB3ANY_VERSION_STRING,
.protocol_id = SMB302_PROT_ID, /* doesn't matter, send protocol array */
- .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION,
+ .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION | SMB2_GLOBAL_CAP_DIRECTORY_LEASING,
.large_lock_type = 0,
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
@@ -3660,7 +3675,7 @@ struct smb_version_values smb3any_values = {
struct smb_version_values smbdefault_values = {
.version_string = SMBDEFAULT_VERSION_STRING,
.protocol_id = SMB302_PROT_ID, /* doesn't matter, send protocol array */
- .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION,
+ .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION | SMB2_GLOBAL_CAP_DIRECTORY_LEASING,
.large_lock_type = 0,
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
@@ -3681,7 +3696,7 @@ struct smb_version_values smbdefault_values = {
struct smb_version_values smb30_values = {
.version_string = SMB30_VERSION_STRING,
.protocol_id = SMB30_PROT_ID,
- .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION,
+ .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION | SMB2_GLOBAL_CAP_DIRECTORY_LEASING,
.large_lock_type = 0,
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
@@ -3702,7 +3717,7 @@ struct smb_version_values smb30_values = {
struct smb_version_values smb302_values = {
.version_string = SMB302_VERSION_STRING,
.protocol_id = SMB302_PROT_ID,
- .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION,
+ .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION | SMB2_GLOBAL_CAP_DIRECTORY_LEASING,
.large_lock_type = 0,
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
@@ -3723,7 +3738,7 @@ struct smb_version_values smb302_values = {
struct smb_version_values smb311_values = {
.version_string = SMB311_VERSION_STRING,
.protocol_id = SMB311_PROT_ID,
- .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION,
+ .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION | SMB2_GLOBAL_CAP_DIRECTORY_LEASING,
.large_lock_type = 0,
.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 5740aa809be6..f54d07bda067 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -2178,6 +2178,9 @@ SMB2_open_init(struct cifs_tcon *tcon, struct smb_rqst *rqst, __u8 *oplock,
if (!(server->capabilities & SMB2_GLOBAL_CAP_LEASING) ||
*oplock == SMB2_OPLOCK_LEVEL_NONE)
req->RequestedOplockLevel = *oplock;
+ else if (!(server->capabilities & SMB2_GLOBAL_CAP_DIRECTORY_LEASING) &&
+ (oparms->create_options & CREATE_NOT_FILE))
+ req->RequestedOplockLevel = *oplock; /* no srv lease support */
else {
rc = add_lease_context(server, iov, &n_iov,
oparms->fid->lease_key, oplock);
@@ -2456,14 +2459,14 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
/* We check for obvious errors in the output buffer length and offset */
if (*plen == 0)
goto ioctl_exit; /* server returned no data */
- else if (*plen > 0xFF00) {
+ else if (*plen > rsp_iov.iov_len || *plen > 0xFF00) {
cifs_dbg(VFS, "srv returned invalid ioctl length: %d\n", *plen);
*plen = 0;
rc = -EIO;
goto ioctl_exit;
}
- if (rsp_iov.iov_len < le32_to_cpu(rsp->OutputOffset) + *plen) {
+ if (rsp_iov.iov_len - *plen < le32_to_cpu(rsp->OutputOffset)) {
cifs_dbg(VFS, "Malformed ioctl resp: len %d offset %d\n", *plen,
le32_to_cpu(rsp->OutputOffset));
*plen = 0;
@@ -3574,33 +3577,38 @@ num_entries(char *bufstart, char *end_of_buf, char **lastentry, size_t size)
int len;
unsigned int entrycount = 0;
unsigned int next_offset = 0;
- FILE_DIRECTORY_INFO *entryptr;
+ char *entryptr;
+ FILE_DIRECTORY_INFO *dir_info;
if (bufstart == NULL)
return 0;
- entryptr = (FILE_DIRECTORY_INFO *)bufstart;
+ entryptr = bufstart;
while (1) {
- entryptr = (FILE_DIRECTORY_INFO *)
- ((char *)entryptr + next_offset);
-
- if ((char *)entryptr + size > end_of_buf) {
+ if (entryptr + next_offset < entryptr ||
+ entryptr + next_offset > end_of_buf ||
+ entryptr + next_offset + size > end_of_buf) {
cifs_dbg(VFS, "malformed search entry would overflow\n");
break;
}
- len = le32_to_cpu(entryptr->FileNameLength);
- if ((char *)entryptr + len + size > end_of_buf) {
+ entryptr = entryptr + next_offset;
+ dir_info = (FILE_DIRECTORY_INFO *)entryptr;
+
+ len = le32_to_cpu(dir_info->FileNameLength);
+ if (entryptr + len < entryptr ||
+ entryptr + len > end_of_buf ||
+ entryptr + len + size > end_of_buf) {
cifs_dbg(VFS, "directory entry name would overflow frame end of buf %p\n",
end_of_buf);
break;
}
- *lastentry = (char *)entryptr;
+ *lastentry = entryptr;
entrycount++;
- next_offset = le32_to_cpu(entryptr->NextEntryOffset);
+ next_offset = le32_to_cpu(dir_info->NextEntryOffset);
if (!next_offset)
break;
}
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 78f96fa3d7d9..b48f43963da6 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -142,7 +142,8 @@ void
cifs_delete_mid(struct mid_q_entry *mid)
{
spin_lock(&GlobalMid_Lock);
- list_del(&mid->qhead);
+ list_del_init(&mid->qhead);
+ mid->mid_flags |= MID_DELETED;
spin_unlock(&GlobalMid_Lock);
DeleteMidQEntry(mid);
@@ -772,6 +773,11 @@ cifs_setup_request(struct cifs_ses *ses, struct smb_rqst *rqst)
return mid;
}
+static void
+cifs_noop_callback(struct mid_q_entry *mid)
+{
+}
+
int
compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
const int flags, const int num_rqst, struct smb_rqst *rqst,
@@ -826,8 +832,13 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
}
midQ[i]->mid_state = MID_REQUEST_SUBMITTED;
+ /*
+ * We don't invoke the callback compounds unless it is the last
+ * request.
+ */
+ if (i < num_rqst - 1)
+ midQ[i]->callback = cifs_noop_callback;
}
-
cifs_in_send_inc(ses->server);
rc = smb_send_rqst(ses->server, num_rqst, rqst, flags);
cifs_in_send_dec(ses->server);
@@ -908,6 +919,12 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses,
midQ[i]->resp_buf = NULL;
}
out:
+ /*
+ * This will dequeue all mids. After this it is important that the
+ * demultiplex_thread will not process any of these mids any futher.
+ * This is prevented above by using a noop callback that will not
+ * wake this thread except for the very last PDU.
+ */
for (i = 0; i < num_rqst; i++)
cifs_delete_mid(midQ[i]);
add_credits(ses->server, credits, optype);
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index a9b00942e87d..0c445a03e682 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -560,69 +560,6 @@ static int mt_ioctl_trans(struct file *file,
#define HIDPGETCONNLIST _IOR('H', 210, int)
#define HIDPGETCONNINFO _IOR('H', 211, int)
-
-struct serial_struct32 {
- compat_int_t type;
- compat_int_t line;
- compat_uint_t port;
- compat_int_t irq;
- compat_int_t flags;
- compat_int_t xmit_fifo_size;
- compat_int_t custom_divisor;
- compat_int_t baud_base;
- unsigned short close_delay;
- char io_type;
- char reserved_char[1];
- compat_int_t hub6;
- unsigned short closing_wait; /* time to wait before closing */
- unsigned short closing_wait2; /* no longer used... */
- compat_uint_t iomem_base;
- unsigned short iomem_reg_shift;
- unsigned int port_high;
- /* compat_ulong_t iomap_base FIXME */
- compat_int_t reserved[1];
-};
-
-static int serial_struct_ioctl(struct file *file,
- unsigned cmd, struct serial_struct32 __user *ss32)
-{
- typedef struct serial_struct32 SS32;
- int err;
- struct serial_struct __user *ss = compat_alloc_user_space(sizeof(*ss));
- __u32 udata;
- unsigned int base;
- unsigned char *iomem_base;
-
- if (ss == NULL)
- return -EFAULT;
- if (cmd == TIOCSSERIAL) {
- if (copy_in_user(ss, ss32, offsetof(SS32, iomem_base)) ||
- get_user(udata, &ss32->iomem_base))
- return -EFAULT;
- iomem_base = compat_ptr(udata);
- if (put_user(iomem_base, &ss->iomem_base) ||
- convert_in_user(&ss32->iomem_reg_shift,
- &ss->iomem_reg_shift) ||
- convert_in_user(&ss32->port_high, &ss->port_high) ||
- put_user(0UL, &ss->iomap_base))
- return -EFAULT;
- }
- err = do_ioctl(file, cmd, (unsigned long)ss);
- if (cmd == TIOCGSERIAL && err >= 0) {
- if (copy_in_user(ss32, ss, offsetof(SS32, iomem_base)) ||
- get_user(iomem_base, &ss->iomem_base))
- return -EFAULT;
- base = (unsigned long)iomem_base >> 32 ?
- 0xffffffff : (unsigned)(unsigned long)iomem_base;
- if (put_user(base, &ss32->iomem_base) ||
- convert_in_user(&ss->iomem_reg_shift,
- &ss32->iomem_reg_shift) ||
- convert_in_user(&ss->port_high, &ss32->port_high))
- return -EFAULT;
- }
- return err;
-}
-
#define RTC_IRQP_READ32 _IOR('p', 0x0b, compat_ulong_t)
#define RTC_IRQP_SET32 _IOW('p', 0x0c, compat_ulong_t)
#define RTC_EPOCH_READ32 _IOR('p', 0x0d, compat_ulong_t)
@@ -707,60 +644,8 @@ static int compat_ioctl_preallocate(struct file *file,
static unsigned int ioctl_pointer[] = {
/* compatible ioctls first */
-COMPATIBLE_IOCTL(0x4B50) /* KDGHWCLK - not in the kernel, but don't complain */
-COMPATIBLE_IOCTL(0x4B51) /* KDSHWCLK - not in the kernel, but don't complain */
-
-/* Big T */
-COMPATIBLE_IOCTL(TCGETA)
-COMPATIBLE_IOCTL(TCSETA)
-COMPATIBLE_IOCTL(TCSETAW)
-COMPATIBLE_IOCTL(TCSETAF)
-COMPATIBLE_IOCTL(TCSBRK)
-COMPATIBLE_IOCTL(TCXONC)
-COMPATIBLE_IOCTL(TCFLSH)
-COMPATIBLE_IOCTL(TCGETS)
-COMPATIBLE_IOCTL(TCSETS)
-COMPATIBLE_IOCTL(TCSETSW)
-COMPATIBLE_IOCTL(TCSETSF)
-COMPATIBLE_IOCTL(TIOCLINUX)
-COMPATIBLE_IOCTL(TIOCSBRK)
-COMPATIBLE_IOCTL(TIOCGDEV)
-COMPATIBLE_IOCTL(TIOCCBRK)
-COMPATIBLE_IOCTL(TIOCGSID)
-COMPATIBLE_IOCTL(TIOCGICOUNT)
-COMPATIBLE_IOCTL(TIOCGEXCL)
/* Little t */
-COMPATIBLE_IOCTL(TIOCGETD)
-COMPATIBLE_IOCTL(TIOCSETD)
-COMPATIBLE_IOCTL(TIOCEXCL)
-COMPATIBLE_IOCTL(TIOCNXCL)
-COMPATIBLE_IOCTL(TIOCCONS)
-COMPATIBLE_IOCTL(TIOCGSOFTCAR)
-COMPATIBLE_IOCTL(TIOCSSOFTCAR)
-COMPATIBLE_IOCTL(TIOCSWINSZ)
-COMPATIBLE_IOCTL(TIOCGWINSZ)
-COMPATIBLE_IOCTL(TIOCMGET)
-COMPATIBLE_IOCTL(TIOCMBIC)
-COMPATIBLE_IOCTL(TIOCMBIS)
-COMPATIBLE_IOCTL(TIOCMSET)
-COMPATIBLE_IOCTL(TIOCNOTTY)
-COMPATIBLE_IOCTL(TIOCSTI)
COMPATIBLE_IOCTL(TIOCOUTQ)
-COMPATIBLE_IOCTL(TIOCSPGRP)
-COMPATIBLE_IOCTL(TIOCGPGRP)
-COMPATIBLE_IOCTL(TIOCSERGETLSR)
-#ifdef TIOCSRS485
-COMPATIBLE_IOCTL(TIOCSRS485)
-#endif
-#ifdef TIOCGRS485
-COMPATIBLE_IOCTL(TIOCGRS485)
-#endif
-#ifdef TCGETS2
-COMPATIBLE_IOCTL(TCGETS2)
-COMPATIBLE_IOCTL(TCSETS2)
-COMPATIBLE_IOCTL(TCSETSW2)
-COMPATIBLE_IOCTL(TCSETSF2)
-#endif
/* Little f */
COMPATIBLE_IOCTL(FIOCLEX)
COMPATIBLE_IOCTL(FIONCLEX)
@@ -775,23 +660,6 @@ COMPATIBLE_IOCTL(FIGETBSZ)
COMPATIBLE_IOCTL(FIFREEZE)
COMPATIBLE_IOCTL(FITHAW)
COMPATIBLE_IOCTL(FITRIM)
-COMPATIBLE_IOCTL(KDGETKEYCODE)
-COMPATIBLE_IOCTL(KDSETKEYCODE)
-COMPATIBLE_IOCTL(KDGKBTYPE)
-COMPATIBLE_IOCTL(KDGETMODE)
-COMPATIBLE_IOCTL(KDGKBMODE)
-COMPATIBLE_IOCTL(KDGKBMETA)
-COMPATIBLE_IOCTL(KDGKBENT)
-COMPATIBLE_IOCTL(KDSKBENT)
-COMPATIBLE_IOCTL(KDGKBSENT)
-COMPATIBLE_IOCTL(KDSKBSENT)
-COMPATIBLE_IOCTL(KDGKBDIACR)
-COMPATIBLE_IOCTL(KDSKBDIACR)
-COMPATIBLE_IOCTL(KDGKBDIACRUC)
-COMPATIBLE_IOCTL(KDSKBDIACRUC)
-COMPATIBLE_IOCTL(KDKBDREP)
-COMPATIBLE_IOCTL(KDGKBLED)
-COMPATIBLE_IOCTL(KDGETLED)
#ifdef CONFIG_BLOCK
/* Big S */
COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN)
@@ -1133,11 +1001,6 @@ COMPATIBLE_IOCTL(CAPI_SET_FLAGS)
COMPATIBLE_IOCTL(CAPI_CLR_FLAGS)
COMPATIBLE_IOCTL(CAPI_NCCI_OPENCOUNT)
COMPATIBLE_IOCTL(CAPI_NCCI_GETUNIT)
-/* Siemens Gigaset */
-COMPATIBLE_IOCTL(GIGASET_REDIR)
-COMPATIBLE_IOCTL(GIGASET_CONFIG)
-COMPATIBLE_IOCTL(GIGASET_BRKCHARS)
-COMPATIBLE_IOCTL(GIGASET_VERSION)
/* Misc. */
COMPATIBLE_IOCTL(0x41545900) /* ATYIO_CLKR */
COMPATIBLE_IOCTL(0x41545901) /* ATYIO_CLKW */
@@ -1223,21 +1086,6 @@ COMPATIBLE_IOCTL(JSIOCGAXES)
COMPATIBLE_IOCTL(JSIOCGBUTTONS)
COMPATIBLE_IOCTL(JSIOCGNAME(0))
-#ifdef TIOCGLTC
-COMPATIBLE_IOCTL(TIOCGLTC)
-COMPATIBLE_IOCTL(TIOCSLTC)
-#endif
-#ifdef TIOCSTART
-/*
- * For these two we have definitions in ioctls.h and/or termios.h on
- * some architectures but no actual implemention. Some applications
- * like bash call them if they are defined in the headers, so we provide
- * entries here to avoid syslog message spew.
- */
-COMPATIBLE_IOCTL(TIOCSTART)
-COMPATIBLE_IOCTL(TIOCSTOP)
-#endif
-
/* fat 'r' ioctls. These are handled by fat with ->compat_ioctl,
but we don't want warnings on other file systems. So declare
them as compatible here. */
@@ -1293,10 +1141,6 @@ static long do_ioctl_trans(unsigned int cmd,
case MTIOCPOS32:
return mt_ioctl_trans(file, cmd, argp);
#endif
- /* Serial */
- case TIOCGSERIAL:
- case TIOCSSERIAL:
- return serial_struct_ioctl(file, cmd, argp);
/* Not implemented in the native kernel */
case RTC_IRQP_READ32:
case RTC_IRQP_SET32:
@@ -1316,24 +1160,11 @@ static long do_ioctl_trans(unsigned int cmd,
* so we must not do a compat_ptr() translation.
*/
switch (cmd) {
- /* Big T */
- case TCSBRKP:
- case TIOCMIWAIT:
- case TIOCSCTTY:
/* RAID */
case HOT_REMOVE_DISK:
case HOT_ADD_DISK:
case SET_DISK_FAULTY:
case SET_BITMAP_FILE:
- /* Big K */
- case KDSIGACCEPT:
- case KIOCSOUND:
- case KDMKTONE:
- case KDSETMODE:
- case KDSKBMODE:
- case KDSKBMETA:
- case KDSKBLED:
- case KDSETLED:
return vfs_ioctl(file, cmd, arg);
}
diff --git a/fs/coredump.c b/fs/coredump.c
index 1e2c87acac9b..e42e17e55bfd 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -536,7 +536,7 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
return err;
}
-void do_coredump(const siginfo_t *siginfo)
+void do_coredump(const kernel_siginfo_t *siginfo)
{
struct core_state core_state;
struct core_name cn;
diff --git a/fs/dax.c b/fs/dax.c
index f32d7125ad0f..0fb270f0a0ef 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -447,6 +447,7 @@ bool dax_lock_mapping_entry(struct page *page)
xa_unlock_irq(&mapping->i_pages);
break;
} else if (IS_ERR(entry)) {
+ xa_unlock_irq(&mapping->i_pages);
WARN_ON_ONCE(PTR_ERR(entry) != -EAGAIN);
continue;
}
@@ -665,6 +666,8 @@ struct page *dax_layout_busy_page(struct address_space *mapping)
while (index < end && pagevec_lookup_entries(&pvec, mapping, index,
min(end - index, (pgoff_t)PAGEVEC_SIZE),
indices)) {
+ pgoff_t nr_pages = 1;
+
for (i = 0; i < pagevec_count(&pvec); i++) {
struct page *pvec_ent = pvec.pages[i];
void *entry;
@@ -679,8 +682,15 @@ struct page *dax_layout_busy_page(struct address_space *mapping)
xa_lock_irq(&mapping->i_pages);
entry = get_unlocked_mapping_entry(mapping, index, NULL);
- if (entry)
+ if (entry) {
page = dax_busy_page(entry);
+ /*
+ * Account for multi-order entries at
+ * the end of the pagevec.
+ */
+ if (i + 1 >= pagevec_count(&pvec))
+ nr_pages = 1UL << dax_radix_order(entry);
+ }
put_unlocked_mapping_entry(mapping, index, entry);
xa_unlock_irq(&mapping->i_pages);
if (page)
@@ -695,7 +705,7 @@ struct page *dax_layout_busy_page(struct address_space *mapping)
*/
pagevec_remove_exceptionals(&pvec);
pagevec_release(&pvec);
- index++;
+ index += nr_pages;
if (page)
break;
@@ -1120,21 +1130,12 @@ static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry,
{
struct inode *inode = mapping->host;
unsigned long vaddr = vmf->address;
- vm_fault_t ret = VM_FAULT_NOPAGE;
- struct page *zero_page;
- pfn_t pfn;
-
- zero_page = ZERO_PAGE(0);
- if (unlikely(!zero_page)) {
- ret = VM_FAULT_OOM;
- goto out;
- }
+ pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
+ vm_fault_t ret;
- pfn = page_to_pfn_t(zero_page);
dax_insert_mapping_entry(mapping, vmf, entry, pfn, RADIX_DAX_ZERO_PAGE,
false);
ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
-out:
trace_dax_load_hole(inode, vmf, ret);
return ret;
}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 49121e5a8de2..5c36ceecb5c1 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -593,11 +593,16 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry,
lower_new_dir_dentry = dget_parent(lower_new_dentry);
target_inode = d_inode(new_dentry);
trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
+ rc = -EINVAL;
+ if (lower_old_dentry->d_parent != lower_old_dir_dentry)
+ goto out_lock;
+ if (lower_new_dentry->d_parent != lower_new_dir_dentry)
+ goto out_lock;
+ if (d_unhashed(lower_old_dentry) || d_unhashed(lower_new_dentry))
+ goto out_lock;
/* source should not be ancestor of target */
- if (trap == lower_old_dentry) {
- rc = -EINVAL;
+ if (trap == lower_old_dentry)
goto out_lock;
- }
/* target should not be ancestor of source */
if (trap == lower_new_dentry) {
rc = -ENOTEMPTY;
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 7f7ee18fe179..e4bb9386c045 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -1448,6 +1448,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
}
inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
ei->i_flags = le32_to_cpu(raw_inode->i_flags);
+ ext2_set_inode_flags(inode);
ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
ei->i_frag_no = raw_inode->i_frag;
ei->i_frag_size = raw_inode->i_fsize;
@@ -1517,7 +1518,6 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
}
brelse (bh);
- ext2_set_inode_flags(inode);
unlock_new_inode(inode);
return inode;
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index e2902d394f1b..f93f9881ec18 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -76,7 +76,7 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len)))
error_msg = "rec_len is too small for name_len";
else if (unlikely(((char *) de - buf) + rlen > size))
- error_msg = "directory entry across range";
+ error_msg = "directory entry overrun";
else if (unlikely(le32_to_cpu(de->inode) >
le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count)))
error_msg = "inode out of bounds";
@@ -85,18 +85,16 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
if (filp)
ext4_error_file(filp, function, line, bh->b_blocknr,
- "bad entry in directory: %s - offset=%u(%u), "
- "inode=%u, rec_len=%d, name_len=%d",
- error_msg, (unsigned) (offset % size),
- offset, le32_to_cpu(de->inode),
- rlen, de->name_len);
+ "bad entry in directory: %s - offset=%u, "
+ "inode=%u, rec_len=%d, name_len=%d, size=%d",
+ error_msg, offset, le32_to_cpu(de->inode),
+ rlen, de->name_len, size);
else
ext4_error_inode(dir, function, line, bh->b_blocknr,
- "bad entry in directory: %s - offset=%u(%u), "
- "inode=%u, rec_len=%d, name_len=%d",
- error_msg, (unsigned) (offset % size),
- offset, le32_to_cpu(de->inode),
- rlen, de->name_len);
+ "bad entry in directory: %s - offset=%u, "
+ "inode=%u, rec_len=%d, name_len=%d, size=%d",
+ error_msg, offset, le32_to_cpu(de->inode),
+ rlen, de->name_len, size);
return 1;
}
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 0f0edd1cd0cd..caff935fbeb8 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -43,6 +43,17 @@
#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_EXT4_FS_ENCRYPTION)
#include <linux/fscrypt.h>
+#include <linux/compiler.h>
+
+/* Until this gets included into linux/compiler-gcc.h */
+#ifndef __nonstring
+#if defined(GCC_VERSION) && (GCC_VERSION >= 80000)
+#define __nonstring __attribute__((nonstring))
+#else
+#define __nonstring
+#endif
+#endif
+
/*
* The fourth extended filesystem constants/structures
*/
@@ -675,6 +686,9 @@ enum {
/* Max physical block we can address w/o extents */
#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF
+/* Max logical block we can support */
+#define EXT4_MAX_LOGICAL_BLOCK 0xFFFFFFFF
+
/*
* Structure of an inode on the disk
*/
@@ -1226,7 +1240,7 @@ struct ext4_super_block {
__le32 s_feature_ro_compat; /* readonly-compatible feature set */
/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */
/*78*/ char s_volume_name[16]; /* volume name */
-/*88*/ char s_last_mounted[64]; /* directory where last mounted */
+/*88*/ char s_last_mounted[64] __nonstring; /* directory where last mounted */
/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */
/*
* Performance hints. Directory preallocation should only
@@ -1277,13 +1291,13 @@ struct ext4_super_block {
__le32 s_first_error_time; /* first time an error happened */
__le32 s_first_error_ino; /* inode involved in first error */
__le64 s_first_error_block; /* block involved of first error */
- __u8 s_first_error_func[32]; /* function where the error happened */
+ __u8 s_first_error_func[32] __nonstring; /* function where the error happened */
__le32 s_first_error_line; /* line number where error happened */
__le32 s_last_error_time; /* most recent time of an error */
__le32 s_last_error_ino; /* inode involved in last error */
__le32 s_last_error_line; /* line number where error happened */
__le64 s_last_error_block; /* block involved of last error */
- __u8 s_last_error_func[32]; /* function where the error happened */
+ __u8 s_last_error_func[32] __nonstring; /* function where the error happened */
#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts)
__u8 s_mount_opts[64];
__le32 s_usr_quota_inum; /* inode for tracking user quota */
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 3543fe80a3c4..7b4736022761 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -1753,6 +1753,7 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data)
{
int err, inline_size;
struct ext4_iloc iloc;
+ size_t inline_len;
void *inline_pos;
unsigned int offset;
struct ext4_dir_entry_2 *de;
@@ -1780,8 +1781,9 @@ bool empty_inline_dir(struct inode *dir, int *has_inline_data)
goto out;
}
+ inline_len = ext4_get_inline_size(dir);
offset = EXT4_INLINE_DOTDOT_SIZE;
- while (offset < dir->i_size) {
+ while (offset < inline_len) {
de = ext4_get_inline_entry(dir, &iloc, offset,
&inline_pos, &inline_size);
if (ext4_check_dir_entry(dir, NULL, de,
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d0dd585add6a..d767e993591d 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3413,12 +3413,16 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
{
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
unsigned int blkbits = inode->i_blkbits;
- unsigned long first_block = offset >> blkbits;
- unsigned long last_block = (offset + length - 1) >> blkbits;
+ unsigned long first_block, last_block;
struct ext4_map_blocks map;
bool delalloc = false;
int ret;
+ if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
+ return -EINVAL;
+ first_block = offset >> blkbits;
+ last_block = min_t(loff_t, (offset + length - 1) >> blkbits,
+ EXT4_MAX_LOGICAL_BLOCK);
if (flags & IOMAP_REPORT) {
if (ext4_has_inline_data(inode)) {
@@ -3948,6 +3952,7 @@ static const struct address_space_operations ext4_dax_aops = {
.writepages = ext4_dax_writepages,
.direct_IO = noop_direct_IO,
.set_page_dirty = noop_set_page_dirty,
+ .bmap = ext4_bmap,
.invalidatepage = noop_invalidatepage,
};
@@ -4192,9 +4197,8 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
return 0;
}
-static void ext4_wait_dax_page(struct ext4_inode_info *ei, bool *did_unlock)
+static void ext4_wait_dax_page(struct ext4_inode_info *ei)
{
- *did_unlock = true;
up_write(&ei->i_mmap_sem);
schedule();
down_write(&ei->i_mmap_sem);
@@ -4204,14 +4208,12 @@ int ext4_break_layouts(struct inode *inode)
{
struct ext4_inode_info *ei = EXT4_I(inode);
struct page *page;
- bool retry;
int error;
if (WARN_ON_ONCE(!rwsem_is_locked(&ei->i_mmap_sem)))
return -EINVAL;
do {
- retry = false;
page = dax_layout_busy_page(inode->i_mapping);
if (!page)
return 0;
@@ -4219,8 +4221,8 @@ int ext4_break_layouts(struct inode *inode)
error = ___wait_var_event(&page->_refcount,
atomic_read(&page->_refcount) == 1,
TASK_INTERRUPTIBLE, 0, 0,
- ext4_wait_dax_page(ei, &retry));
- } while (error == 0 && retry);
+ ext4_wait_dax_page(ei));
+ } while (error == 0);
return error;
}
@@ -4895,6 +4897,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
* not initialized on a new filesystem. */
}
ei->i_flags = le32_to_cpu(raw_inode->i_flags);
+ ext4_set_inode_flags(inode);
inode->i_blocks = ext4_inode_blocks(raw_inode, ei);
ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo);
if (ext4_has_feature_64bit(sb))
@@ -5041,7 +5044,6 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
goto bad_inode;
}
brelse(iloc.bh);
- ext4_set_inode_flags(inode);
unlock_new_inode(inode);
return inode;
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 39b07c2d3384..2305b4374fd3 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -49,7 +49,6 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh)
*/
sb_start_write(sb);
ext4_mmp_csum_set(sb, mmp);
- mark_buffer_dirty(bh);
lock_buffer(bh);
bh->b_end_io = end_buffer_write_sync;
get_bh(bh);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 116ff68c5bd4..377d516c475f 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -3478,6 +3478,12 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
int credits;
u8 old_file_type;
+ if (new.inode && new.inode->i_nlink == 0) {
+ EXT4_ERROR_INODE(new.inode,
+ "target of rename is already freed");
+ return -EFSCORRUPTED;
+ }
+
if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT)) &&
(!projid_eq(EXT4_I(new_dir)->i_projid,
EXT4_I(old_dentry->d_inode)->i_projid)))
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index db7590178dfc..2aa62d58d8dd 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -374,13 +374,13 @@ static int io_submit_init_bio(struct ext4_io_submit *io,
bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
if (!bio)
return -ENOMEM;
- wbc_init_bio(io->io_wbc, bio);
bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
bio_set_dev(bio, bh->b_bdev);
bio->bi_end_io = ext4_end_bio;
bio->bi_private = ext4_get_io_end(io->io_end);
io->io_bio = bio;
io->io_next_block = bh->b_blocknr;
+ wbc_init_bio(io->io_wbc, bio);
return 0;
}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index e5fb38451a73..ebbc663d0798 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -19,6 +19,7 @@
int ext4_resize_begin(struct super_block *sb)
{
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
int ret = 0;
if (!capable(CAP_SYS_RESOURCE))
@@ -29,7 +30,7 @@ int ext4_resize_begin(struct super_block *sb)
* because the user tools have no way of handling this. Probably a
* bad time to do it anyways.
*/
- if (EXT4_SB(sb)->s_sbh->b_blocknr !=
+ if (EXT4_B2C(sbi, sbi->s_sbh->b_blocknr) !=
le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
ext4_warning(sb, "won't resize using backup superblock at %llu",
(unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr);
@@ -1986,6 +1987,26 @@ retry:
}
}
+ /*
+ * Make sure the last group has enough space so that it's
+ * guaranteed to have enough space for all metadata blocks
+ * that it might need to hold. (We might not need to store
+ * the inode table blocks in the last block group, but there
+ * will be cases where this might be needed.)
+ */
+ if ((ext4_group_first_block_no(sb, n_group) +
+ ext4_group_overhead_blocks(sb, n_group) + 2 +
+ sbi->s_itb_per_group + sbi->s_cluster_ratio) >= n_blocks_count) {
+ n_blocks_count = ext4_group_first_block_no(sb, n_group);
+ n_group--;
+ n_blocks_count_retry = 0;
+ if (resize_inode) {
+ iput(resize_inode);
+ resize_inode = NULL;
+ }
+ goto retry;
+ }
+
/* extend the last group */
if (n_group == o_group)
add = n_blocks_count - o_blocks_count;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 5863fd22e90b..1145109968ef 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2145,6 +2145,8 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
SEQ_OPTS_PRINT("max_dir_size_kb=%u", sbi->s_max_dir_size_kb);
if (test_opt(sb, DATA_ERR_ABORT))
SEQ_OPTS_PUTS("data_err=abort");
+ if (DUMMY_ENCRYPTION_ENABLED(sbi))
+ SEQ_OPTS_PUTS("test_dummy_encryption");
ext4_show_quota_options(seq, sb);
return 0;
@@ -4378,11 +4380,13 @@ no_journal:
block = ext4_count_free_clusters(sb);
ext4_free_blocks_count_set(sbi->s_es,
EXT4_C2B(sbi, block));
+ ext4_superblock_csum_set(sb);
err = percpu_counter_init(&sbi->s_freeclusters_counter, block,
GFP_KERNEL);
if (!err) {
unsigned long freei = ext4_count_free_inodes(sb);
sbi->s_es->s_free_inodes_count = cpu_to_le32(freei);
+ ext4_superblock_csum_set(sb);
err = percpu_counter_init(&sbi->s_freeinodes_counter, freei,
GFP_KERNEL);
}
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index defc2168de91..f58c0cacc531 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -682,6 +682,7 @@ int fat_count_free_clusters(struct super_block *sb)
if (ops->ent_get(&fatent) == FAT_ENT_FREE)
free++;
} while (fat_ent_next(sbi, &fatent));
+ cond_resched();
}
sbi->free_clusters = free;
sbi->free_clus_valid = 1;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 4137d96534a6..083185174c6d 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -735,7 +735,7 @@ static void send_sigio_to_task(struct task_struct *p,
return;
switch (signum) {
- siginfo_t si;
+ kernel_siginfo_t si;
default:
/* Queue a rt signal with the appropriate fd as its
value. We use SI_SIGIO as the source, not
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 83bfe04456b6..c550512ce335 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -70,20 +70,7 @@ void fscache_free_cookie(struct fscache_cookie *cookie)
}
/*
- * initialise an cookie jar slab element prior to any use
- */
-void fscache_cookie_init_once(void *_cookie)
-{
- struct fscache_cookie *cookie = _cookie;
-
- memset(cookie, 0, sizeof(*cookie));
- spin_lock_init(&cookie->lock);
- spin_lock_init(&cookie->stores_lock);
- INIT_HLIST_HEAD(&cookie->backing_objects);
-}
-
-/*
- * Set the index key in a cookie. The cookie struct has space for a 12-byte
+ * Set the index key in a cookie. The cookie struct has space for a 16-byte
* key plus length and hash, but if that's not big enough, it's instead a
* pointer to a buffer containing 3 bytes of hash, 1 byte of length and then
* the key data.
@@ -93,20 +80,18 @@ static int fscache_set_key(struct fscache_cookie *cookie,
{
unsigned long long h;
u32 *buf;
+ int bufs;
int i;
- cookie->key_len = index_key_len;
+ bufs = DIV_ROUND_UP(index_key_len, sizeof(*buf));
if (index_key_len > sizeof(cookie->inline_key)) {
- buf = kzalloc(index_key_len, GFP_KERNEL);
+ buf = kcalloc(bufs, sizeof(*buf), GFP_KERNEL);
if (!buf)
return -ENOMEM;
cookie->key = buf;
} else {
buf = (u32 *)cookie->inline_key;
- buf[0] = 0;
- buf[1] = 0;
- buf[2] = 0;
}
memcpy(buf, index_key, index_key_len);
@@ -116,7 +101,8 @@ static int fscache_set_key(struct fscache_cookie *cookie,
*/
h = (unsigned long)cookie->parent;
h += index_key_len + cookie->type;
- for (i = 0; i < (index_key_len + sizeof(u32) - 1) / sizeof(u32); i++)
+
+ for (i = 0; i < bufs; i++)
h += buf[i];
cookie->key_hash = h ^ (h >> 32);
@@ -161,7 +147,7 @@ struct fscache_cookie *fscache_alloc_cookie(
struct fscache_cookie *cookie;
/* allocate and initialise a cookie */
- cookie = kmem_cache_alloc(fscache_cookie_jar, GFP_KERNEL);
+ cookie = kmem_cache_zalloc(fscache_cookie_jar, GFP_KERNEL);
if (!cookie)
return NULL;
@@ -192,6 +178,9 @@ struct fscache_cookie *fscache_alloc_cookie(
cookie->netfs_data = netfs_data;
cookie->flags = (1 << FSCACHE_COOKIE_NO_DATA_YET);
cookie->type = def->type;
+ spin_lock_init(&cookie->lock);
+ spin_lock_init(&cookie->stores_lock);
+ INIT_HLIST_HEAD(&cookie->backing_objects);
/* radix tree insertion won't use the preallocation pool unless it's
* told it may not wait */
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h
index f83328a7f048..d6209022e965 100644
--- a/fs/fscache/internal.h
+++ b/fs/fscache/internal.h
@@ -51,7 +51,6 @@ extern struct fscache_cache *fscache_select_cache_for_object(
extern struct kmem_cache *fscache_cookie_jar;
extern void fscache_free_cookie(struct fscache_cookie *);
-extern void fscache_cookie_init_once(void *);
extern struct fscache_cookie *fscache_alloc_cookie(struct fscache_cookie *,
const struct fscache_cookie_def *,
const void *, size_t,
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index 7dce110bf17d..30ad89db1efc 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -143,9 +143,7 @@ static int __init fscache_init(void)
fscache_cookie_jar = kmem_cache_create("fscache_cookie_jar",
sizeof(struct fscache_cookie),
- 0,
- 0,
- fscache_cookie_init_once);
+ 0, 0, NULL);
if (!fscache_cookie_jar) {
pr_notice("Failed to allocate a cookie jar\n");
ret = -ENOMEM;
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 03128ed1f34e..a683d9b27d76 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1057,7 +1057,7 @@ static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
}
}
release_metapath(&mp);
- if (gfs2_is_jdata(ip))
+ if (!gfs2_is_stuffed(ip) && gfs2_is_jdata(ip))
iomap->page_done = gfs2_iomap_journaled_page_done;
return 0;
@@ -1566,7 +1566,7 @@ more_rgrps:
continue;
}
if (bstart) {
- __gfs2_free_blocks(ip, bstart, (u32)blen, meta);
+ __gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta);
(*btotal) += blen;
gfs2_add_inode_blocks(&ip->i_inode, -blen);
}
@@ -1574,7 +1574,7 @@ more_rgrps:
blen = 1;
}
if (bstart) {
- __gfs2_free_blocks(ip, bstart, (u32)blen, meta);
+ __gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta);
(*btotal) += blen;
gfs2_add_inode_blocks(&ip->i_inode, -blen);
}
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index e37002560c11..daa14ab4e31b 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -506,7 +506,8 @@ static int gfs2_dirent_gather(const struct gfs2_dirent *dent,
* For now the most important thing is to check that the various sizes
* are correct.
*/
-static int gfs2_check_dirent(struct gfs2_dirent *dent, unsigned int offset,
+static int gfs2_check_dirent(struct gfs2_sbd *sdp,
+ struct gfs2_dirent *dent, unsigned int offset,
unsigned int size, unsigned int len, int first)
{
const char *msg = "gfs2_dirent too small";
@@ -528,12 +529,12 @@ static int gfs2_check_dirent(struct gfs2_dirent *dent, unsigned int offset,
goto error;
return 0;
error:
- pr_warn("%s: %s (%s)\n",
+ fs_warn(sdp, "%s: %s (%s)\n",
__func__, msg, first ? "first in block" : "not first in block");
return -EIO;
}
-static int gfs2_dirent_offset(const void *buf)
+static int gfs2_dirent_offset(struct gfs2_sbd *sdp, const void *buf)
{
const struct gfs2_meta_header *h = buf;
int offset;
@@ -552,7 +553,8 @@ static int gfs2_dirent_offset(const void *buf)
}
return offset;
wrong_type:
- pr_warn("%s: wrong block type %u\n", __func__, be32_to_cpu(h->mh_type));
+ fs_warn(sdp, "%s: wrong block type %u\n", __func__,
+ be32_to_cpu(h->mh_type));
return -1;
}
@@ -566,7 +568,7 @@ static struct gfs2_dirent *gfs2_dirent_scan(struct inode *inode, void *buf,
unsigned size;
int ret = 0;
- ret = gfs2_dirent_offset(buf);
+ ret = gfs2_dirent_offset(GFS2_SB(inode), buf);
if (ret < 0)
goto consist_inode;
@@ -574,7 +576,7 @@ static struct gfs2_dirent *gfs2_dirent_scan(struct inode *inode, void *buf,
prev = NULL;
dent = buf + offset;
size = be16_to_cpu(dent->de_rec_len);
- if (gfs2_check_dirent(dent, offset, size, len, 1))
+ if (gfs2_check_dirent(GFS2_SB(inode), dent, offset, size, len, 1))
goto consist_inode;
do {
ret = scan(dent, name, opaque);
@@ -586,7 +588,8 @@ static struct gfs2_dirent *gfs2_dirent_scan(struct inode *inode, void *buf,
prev = dent;
dent = buf + offset;
size = be16_to_cpu(dent->de_rec_len);
- if (gfs2_check_dirent(dent, offset, size, len, 0))
+ if (gfs2_check_dirent(GFS2_SB(inode), dent, offset, size,
+ len, 0))
goto consist_inode;
} while(1);
@@ -1043,7 +1046,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
len = BIT(dip->i_depth - be16_to_cpu(oleaf->lf_depth));
half_len = len >> 1;
if (!half_len) {
- pr_warn("i_depth %u lf_depth %u index %u\n",
+ fs_warn(GFS2_SB(inode), "i_depth %u lf_depth %u index %u\n",
dip->i_depth, be16_to_cpu(oleaf->lf_depth), index);
gfs2_consist_inode(dip);
error = -EIO;
@@ -1351,7 +1354,7 @@ static int gfs2_set_cookies(struct gfs2_sbd *sdp, struct buffer_head *bh,
if (!sdp->sd_args.ar_loccookie)
continue;
offset = (char *)(darr[i]) -
- (bh->b_data + gfs2_dirent_offset(bh->b_data));
+ (bh->b_data + gfs2_dirent_offset(sdp, bh->b_data));
offset /= GFS2_MIN_DIRENT_SIZE;
offset += leaf_nr * sdp->sd_max_dents_per_leaf;
if (offset >= GFS2_USE_HASH_FLAG ||
@@ -2018,7 +2021,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
l_blocks++;
}
- gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
+ gfs2_rlist_alloc(&rlist);
for (x = 0; x < rlist.rl_rgrps; x++) {
struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(rlist.rl_ghs[x].gh_gl);
@@ -2039,6 +2042,8 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
bh = leaf_bh;
for (blk = leaf_no; blk; blk = nblk) {
+ struct gfs2_rgrpd *rgd;
+
if (blk != leaf_no) {
error = get_leaf(dip, blk, &bh);
if (error)
@@ -2049,7 +2054,8 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
if (blk != leaf_no)
brelse(bh);
- gfs2_free_meta(dip, blk, 1);
+ rgd = gfs2_blk2rgrpd(sdp, blk, true);
+ gfs2_free_meta(dip, rgd, blk, 1);
gfs2_add_inode_blocks(&dip->i_inode, -1);
}
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 08369c6cd127..45a17b770d97 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -314,6 +314,17 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
return do_gfs2_set_flags(filp, gfsflags, mask);
}
+static int gfs2_getlabel(struct file *filp, char __user *label)
+{
+ struct inode *inode = file_inode(filp);
+ struct gfs2_sbd *sdp = GFS2_SB(inode);
+
+ if (copy_to_user(label, sdp->sd_sb.sb_locktable, GFS2_LOCKNAME_LEN))
+ return -EFAULT;
+
+ return 0;
+}
+
static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
switch(cmd) {
@@ -323,7 +334,10 @@ static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return gfs2_set_flags(filp, (u32 __user *)arg);
case FITRIM:
return gfs2_fitrim(filp, (void __user *)arg);
+ case FS_IOC_GETFSLABEL:
+ return gfs2_getlabel(filp, (char __user *)arg);
}
+
return -ENOTTY;
}
@@ -347,8 +361,8 @@ static void gfs2_size_hint(struct file *filep, loff_t offset, size_t size)
size_t blks = (size + sdp->sd_sb.sb_bsize - 1) >> sdp->sd_sb.sb_bsize_shift;
int hint = min_t(size_t, INT_MAX, blks);
- if (hint > atomic_read(&ip->i_res.rs_sizehint))
- atomic_set(&ip->i_res.rs_sizehint, hint);
+ if (hint > atomic_read(&ip->i_sizehint))
+ atomic_set(&ip->i_sizehint, hint);
}
/**
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 4614ee25f621..05431324b262 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -494,7 +494,8 @@ retry:
do_xmote(gl, gh, LM_ST_UNLOCKED);
break;
default: /* Everything else */
- pr_err("wanted %u got %u\n", gl->gl_target, state);
+ fs_err(gl->gl_name.ln_sbd, "wanted %u got %u\n",
+ gl->gl_target, state);
GLOCK_BUG_ON(gl, 1);
}
spin_unlock(&gl->gl_lockref.lock);
@@ -577,7 +578,7 @@ __acquires(&gl->gl_lockref.lock)
gfs2_glock_queue_work(gl, 0);
}
else if (ret) {
- pr_err("lm_lock ret %d\n", ret);
+ fs_err(sdp, "lm_lock ret %d\n", ret);
GLOCK_BUG_ON(gl, !test_bit(SDF_SHUTDOWN,
&sdp->sd_flags));
}
@@ -1064,13 +1065,13 @@ do_cancel:
return;
trap_recursive:
- pr_err("original: %pSR\n", (void *)gh2->gh_ip);
- pr_err("pid: %d\n", pid_nr(gh2->gh_owner_pid));
- pr_err("lock type: %d req lock state : %d\n",
+ fs_err(sdp, "original: %pSR\n", (void *)gh2->gh_ip);
+ fs_err(sdp, "pid: %d\n", pid_nr(gh2->gh_owner_pid));
+ fs_err(sdp, "lock type: %d req lock state : %d\n",
gh2->gh_gl->gl_name.ln_type, gh2->gh_state);
- pr_err("new: %pSR\n", (void *)gh->gh_ip);
- pr_err("pid: %d\n", pid_nr(gh->gh_owner_pid));
- pr_err("lock type: %d req lock state : %d\n",
+ fs_err(sdp, "new: %pSR\n", (void *)gh->gh_ip);
+ fs_err(sdp, "pid: %d\n", pid_nr(gh->gh_owner_pid));
+ fs_err(sdp, "lock type: %d req lock state : %d\n",
gh->gh_gl->gl_name.ln_type, gh->gh_state);
gfs2_dump_glock(NULL, gl);
BUG();
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index b96d39c28e17..888b62cfd6d1 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -92,7 +92,7 @@ struct gfs2_bitmap {
unsigned long bi_flags;
u32 bi_offset;
u32 bi_start;
- u32 bi_len;
+ u32 bi_bytes;
u32 bi_blocks;
};
@@ -309,10 +309,6 @@ struct gfs2_qadata { /* quota allocation data */
*/
struct gfs2_blkreserv {
- /* components used during write (step 1): */
- atomic_t rs_sizehint; /* hint of the write size */
-
- struct gfs2_holder rs_rgd_gh; /* Filled in by get_local_rgrp */
struct rb_node rs_node; /* link to other block reservations */
struct gfs2_rbm rs_rbm; /* Start of reservation */
u32 rs_free; /* how many blocks are still free */
@@ -417,8 +413,10 @@ struct gfs2_inode {
struct gfs2_holder i_iopen_gh;
struct gfs2_holder i_gh; /* for prepare/commit_write only */
struct gfs2_qadata *i_qadata; /* quota allocation data */
+ struct gfs2_holder i_rgd_gh;
struct gfs2_blkreserv i_res; /* rgrp multi-block reservation */
u64 i_goal; /* goal block for allocations */
+ atomic_t i_sizehint; /* hint of the write size */
struct rw_semaphore i_rw_mutex;
struct list_head i_ordered;
struct list_head i_trunc_list;
@@ -623,6 +621,7 @@ enum {
SDF_RORECOVERY = 7, /* read only recovery */
SDF_SKIP_DLM_UNLOCK = 8,
SDF_FORCE_AIL_FLUSH = 9,
+ SDF_AIL1_IO_ERROR = 10,
};
enum gfs2_freeze_state {
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index ac7caa267ed6..31df26ed7854 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -177,14 +177,14 @@ static void gdlm_bast(void *arg, int mode)
gfs2_glock_cb(gl, LM_ST_SHARED);
break;
default:
- pr_err("unknown bast mode %d\n", mode);
+ fs_err(gl->gl_name.ln_sbd, "unknown bast mode %d\n", mode);
BUG();
}
}
/* convert gfs lock-state to dlm lock-mode */
-static int make_mode(const unsigned int lmstate)
+static int make_mode(struct gfs2_sbd *sdp, const unsigned int lmstate)
{
switch (lmstate) {
case LM_ST_UNLOCKED:
@@ -196,7 +196,7 @@ static int make_mode(const unsigned int lmstate)
case LM_ST_SHARED:
return DLM_LOCK_PR;
}
- pr_err("unknown LM state %d\n", lmstate);
+ fs_err(sdp, "unknown LM state %d\n", lmstate);
BUG();
return -1;
}
@@ -257,7 +257,7 @@ static int gdlm_lock(struct gfs2_glock *gl, unsigned int req_state,
u32 lkf;
char strname[GDLM_STRNAME_BYTES] = "";
- req = make_mode(req_state);
+ req = make_mode(gl->gl_name.ln_sbd, req_state);
lkf = make_flags(gl, flags, req);
gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT);
gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT);
@@ -309,7 +309,7 @@ static void gdlm_put_lock(struct gfs2_glock *gl)
error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK,
NULL, gl);
if (error) {
- pr_err("gdlm_unlock %x,%llx err=%d\n",
+ fs_err(sdp, "gdlm_unlock %x,%llx err=%d\n",
gl->gl_name.ln_type,
(unsigned long long)gl->gl_name.ln_number, error);
return;
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index ee20ea42e7b5..99dd58694ba1 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -108,7 +108,9 @@ __acquires(&sdp->sd_ail_lock)
gfs2_assert(sdp, bd->bd_tr == tr);
if (!buffer_busy(bh)) {
- if (!buffer_uptodate(bh)) {
+ if (!buffer_uptodate(bh) &&
+ !test_and_set_bit(SDF_AIL1_IO_ERROR,
+ &sdp->sd_flags)) {
gfs2_io_error_bh(sdp, bh);
*withdraw = true;
}
@@ -206,7 +208,8 @@ static void gfs2_ail1_empty_one(struct gfs2_sbd *sdp, struct gfs2_trans *tr,
gfs2_assert(sdp, bd->bd_tr == tr);
if (buffer_busy(bh))
continue;
- if (!buffer_uptodate(bh)) {
+ if (!buffer_uptodate(bh) &&
+ !test_and_set_bit(SDF_AIL1_IO_ERROR, &sdp->sd_flags)) {
gfs2_io_error_bh(sdp, bh);
*withdraw = true;
}
@@ -618,7 +621,7 @@ void gfs2_write_revokes(struct gfs2_sbd *sdp)
gfs2_ail1_empty(sdp);
spin_lock(&sdp->sd_ail_lock);
- list_for_each_entry(tr, &sdp->sd_ail1_list, tr_list) {
+ list_for_each_entry_reverse(tr, &sdp->sd_ail1_list, tr_list) {
list_for_each_entry(bd, &tr->tr_ail2_list, bd_ail_st_list) {
if (list_empty(&bd->bd_list)) {
have_revokes = 1;
@@ -642,7 +645,7 @@ done:
}
gfs2_log_lock(sdp);
spin_lock(&sdp->sd_ail_lock);
- list_for_each_entry(tr, &sdp->sd_ail1_list, tr_list) {
+ list_for_each_entry_reverse(tr, &sdp->sd_ail1_list, tr_list) {
list_for_each_entry_safe(bd, tmp, &tr->tr_ail2_list, bd_ail_st_list) {
if (max_revokes == 0)
goto out_of_blocks;
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index f2567f958d00..4c7069b8f3c1 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -81,7 +81,7 @@ static void maybe_release_space(struct gfs2_bufdata *bd)
if (sdp->sd_args.ar_discard)
gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bd->bd_bh, bi, 1, NULL);
memcpy(bi->bi_clone + bi->bi_offset,
- bd->bd_bh->b_data + bi->bi_offset, bi->bi_len);
+ bd->bd_bh->b_data + bi->bi_offset, bi->bi_bytes);
clear_bit(GBF_FULL, &bi->bi_flags);
rgd->rd_free_clone = rgd->rd_free;
rgd->rd_extfail_pt = rgd->rd_free;
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 2d55e2c3333c..c7603063f861 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -39,9 +39,11 @@ static void gfs2_init_inode_once(void *foo)
struct gfs2_inode *ip = foo;
inode_init_once(&ip->i_inode);
+ atomic_set(&ip->i_sizehint, 0);
init_rwsem(&ip->i_rw_mutex);
INIT_LIST_HEAD(&ip->i_trunc_list);
ip->i_qadata = NULL;
+ gfs2_holder_mark_uninitialized(&ip->i_rgd_gh);
memset(&ip->i_res, 0, sizeof(ip->i_res));
RB_CLEAR_NODE(&ip->i_res.rs_node);
ip->i_hash_cache = NULL;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index c2469833b4fb..b041cb8ae383 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -72,13 +72,13 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
if (!sdp)
return NULL;
- sb->s_fs_info = sdp;
sdp->sd_vfs = sb;
sdp->sd_lkstats = alloc_percpu(struct gfs2_pcpu_lkstats);
if (!sdp->sd_lkstats) {
kfree(sdp);
return NULL;
}
+ sb->s_fs_info = sdp;
set_bit(SDF_NOJOURNALID, &sdp->sd_flags);
gfs2_tune_init(&sdp->sd_tune);
@@ -1333,6 +1333,9 @@ static struct dentry *gfs2_mount_meta(struct file_system_type *fs_type,
struct path path;
int error;
+ if (!dev_name || !*dev_name)
+ return ERR_PTR(-EINVAL);
+
error = kern_path(dev_name, LOOKUP_FOLLOW, &path);
if (error) {
pr_warn("path_lookup on %s returned error %d\n",
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 0efae7a0ee80..2ae5a109eea7 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -1183,7 +1183,7 @@ static int print_message(struct gfs2_quota_data *qd, char *type)
*
* Returns: 0 on success.
* min_req = ap->min_target ? ap->min_target : ap->target;
- * quota must allow atleast min_req blks for success and
+ * quota must allow at least min_req blks for success and
* ap->allowed is set to the number of blocks allowed
*
* -EDQUOT otherwise, quota violation. ap->allowed is set to number
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 1ad3256b9cbc..ffe3032b1043 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -90,7 +90,7 @@ static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
{
unsigned char *byte1, *byte2, *end, cur_state;
struct gfs2_bitmap *bi = rbm_bi(rbm);
- unsigned int buflen = bi->bi_len;
+ unsigned int buflen = bi->bi_bytes;
const unsigned int bit = (rbm->offset % GFS2_NBBY) * GFS2_BIT_SIZE;
byte1 = bi->bi_bh->b_data + bi->bi_offset + (rbm->offset / GFS2_NBBY);
@@ -101,12 +101,16 @@ static inline void gfs2_setbit(const struct gfs2_rbm *rbm, bool do_clone,
cur_state = (*byte1 >> bit) & GFS2_BIT_MASK;
if (unlikely(!valid_change[new_state * 4 + cur_state])) {
- pr_warn("buf_blk = 0x%x old_state=%d, new_state=%d\n",
+ struct gfs2_sbd *sdp = rbm->rgd->rd_sbd;
+
+ fs_warn(sdp, "buf_blk = 0x%x old_state=%d, new_state=%d\n",
rbm->offset, cur_state, new_state);
- pr_warn("rgrp=0x%llx bi_start=0x%x\n",
- (unsigned long long)rbm->rgd->rd_addr, bi->bi_start);
- pr_warn("bi_offset=0x%x bi_len=0x%x\n",
- bi->bi_offset, bi->bi_len);
+ fs_warn(sdp, "rgrp=0x%llx bi_start=0x%x biblk: 0x%llx\n",
+ (unsigned long long)rbm->rgd->rd_addr, bi->bi_start,
+ (unsigned long long)bi->bi_bh->b_blocknr);
+ fs_warn(sdp, "bi_offset=0x%x bi_bytes=0x%x block=0x%llx\n",
+ bi->bi_offset, bi->bi_bytes,
+ (unsigned long long)gfs2_rbm_to_block(rbm));
dump_stack();
gfs2_consist_rgrpd(rbm->rgd);
return;
@@ -269,15 +273,10 @@ static u32 gfs2_bitfit(const u8 *buf, const unsigned int len,
static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block)
{
- u64 rblock = block - rbm->rgd->rd_data0;
-
- if (WARN_ON_ONCE(rblock > UINT_MAX))
- return -EINVAL;
- if (block >= rbm->rgd->rd_data0 + rbm->rgd->rd_data)
+ if (!rgrp_contains_block(rbm->rgd, block))
return -E2BIG;
-
rbm->bii = 0;
- rbm->offset = (u32)(rblock);
+ rbm->offset = block - rbm->rgd->rd_data0;
/* Check if the block is within the first block */
if (rbm->offset < rbm_bi(rbm)->bi_blocks)
return 0;
@@ -382,7 +381,7 @@ static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len)
if (bi->bi_clone)
start = bi->bi_clone;
start += bi->bi_offset;
- end = start + bi->bi_len;
+ end = start + bi->bi_bytes;
BUG_ON(rbm.offset & 3);
start += (rbm.offset / GFS2_NBBY);
bytes = min_t(u32, len / GFS2_NBBY, (end - start));
@@ -467,7 +466,7 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd)
count[x] += gfs2_bitcount(rgd,
bi->bi_bh->b_data +
bi->bi_offset,
- bi->bi_len, x);
+ bi->bi_bytes, x);
}
if (count[0] != rgd->rd_free) {
@@ -642,7 +641,10 @@ static void __rs_deltree(struct gfs2_blkreserv *rs)
RB_CLEAR_NODE(&rs->rs_node);
if (rs->rs_free) {
- struct gfs2_bitmap *bi = rbm_bi(&rs->rs_rbm);
+ u64 last_block = gfs2_rbm_to_block(&rs->rs_rbm) +
+ rs->rs_free - 1;
+ struct gfs2_rbm last_rbm = { .rgd = rs->rs_rbm.rgd, };
+ struct gfs2_bitmap *start, *last;
/* return reserved blocks to the rgrp */
BUG_ON(rs->rs_rbm.rgd->rd_reserved < rs->rs_free);
@@ -653,7 +655,13 @@ static void __rs_deltree(struct gfs2_blkreserv *rs)
it will force the number to be recalculated later. */
rgd->rd_extfail_pt += rs->rs_free;
rs->rs_free = 0;
- clear_bit(GBF_FULL, &bi->bi_flags);
+ if (gfs2_rbm_from_block(&last_rbm, last_block))
+ return;
+ start = rbm_bi(&rs->rs_rbm);
+ last = rbm_bi(&last_rbm);
+ do
+ clear_bit(GBF_FULL, &start->bi_flags);
+ while (start++ != last);
}
}
@@ -738,11 +746,13 @@ void gfs2_clear_rgrpd(struct gfs2_sbd *sdp)
static void gfs2_rindex_print(const struct gfs2_rgrpd *rgd)
{
- pr_info("ri_addr = %llu\n", (unsigned long long)rgd->rd_addr);
- pr_info("ri_length = %u\n", rgd->rd_length);
- pr_info("ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0);
- pr_info("ri_data = %u\n", rgd->rd_data);
- pr_info("ri_bitbytes = %u\n", rgd->rd_bitbytes);
+ struct gfs2_sbd *sdp = rgd->rd_sbd;
+
+ fs_info(sdp, "ri_addr = %llu\n", (unsigned long long)rgd->rd_addr);
+ fs_info(sdp, "ri_length = %u\n", rgd->rd_length);
+ fs_info(sdp, "ri_data0 = %llu\n", (unsigned long long)rgd->rd_data0);
+ fs_info(sdp, "ri_data = %u\n", rgd->rd_data);
+ fs_info(sdp, "ri_bitbytes = %u\n", rgd->rd_bitbytes);
}
/**
@@ -780,21 +790,21 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd)
bytes = bytes_left;
bi->bi_offset = sizeof(struct gfs2_rgrp);
bi->bi_start = 0;
- bi->bi_len = bytes;
+ bi->bi_bytes = bytes;
bi->bi_blocks = bytes * GFS2_NBBY;
/* header block */
} else if (x == 0) {
bytes = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_rgrp);
bi->bi_offset = sizeof(struct gfs2_rgrp);
bi->bi_start = 0;
- bi->bi_len = bytes;
+ bi->bi_bytes = bytes;
bi->bi_blocks = bytes * GFS2_NBBY;
/* last block */
} else if (x + 1 == length) {
bytes = bytes_left;
bi->bi_offset = sizeof(struct gfs2_meta_header);
bi->bi_start = rgd->rd_bitbytes - bytes_left;
- bi->bi_len = bytes;
+ bi->bi_bytes = bytes;
bi->bi_blocks = bytes * GFS2_NBBY;
/* other blocks */
} else {
@@ -802,7 +812,7 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd)
sizeof(struct gfs2_meta_header);
bi->bi_offset = sizeof(struct gfs2_meta_header);
bi->bi_start = rgd->rd_bitbytes - bytes_left;
- bi->bi_len = bytes;
+ bi->bi_bytes = bytes;
bi->bi_blocks = bytes * GFS2_NBBY;
}
@@ -814,11 +824,11 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd)
return -EIO;
}
bi = rgd->rd_bits + (length - 1);
- if ((bi->bi_start + bi->bi_len) * GFS2_NBBY != rgd->rd_data) {
+ if ((bi->bi_start + bi->bi_bytes) * GFS2_NBBY != rgd->rd_data) {
if (gfs2_consist_rgrpd(rgd)) {
gfs2_rindex_print(rgd);
fs_err(sdp, "start=%u len=%u offset=%u\n",
- bi->bi_start, bi->bi_len, bi->bi_offset);
+ bi->bi_start, bi->bi_bytes, bi->bi_offset);
}
return -EIO;
}
@@ -1103,12 +1113,35 @@ static int gfs2_rgrp_lvb_valid(struct gfs2_rgrpd *rgd)
{
struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
struct gfs2_rgrp *str = (struct gfs2_rgrp *)rgd->rd_bits[0].bi_bh->b_data;
+ int valid = 1;
- if (rgl->rl_flags != str->rg_flags || rgl->rl_free != str->rg_free ||
- rgl->rl_dinodes != str->rg_dinodes ||
- rgl->rl_igeneration != str->rg_igeneration)
- return 0;
- return 1;
+ if (rgl->rl_flags != str->rg_flags) {
+ printk(KERN_WARNING "GFS2: rgd: %llu lvb flag mismatch %u/%u",
+ (unsigned long long)rgd->rd_addr,
+ be32_to_cpu(rgl->rl_flags), be32_to_cpu(str->rg_flags));
+ valid = 0;
+ }
+ if (rgl->rl_free != str->rg_free) {
+ printk(KERN_WARNING "GFS2: rgd: %llu lvb free mismatch %u/%u",
+ (unsigned long long)rgd->rd_addr,
+ be32_to_cpu(rgl->rl_free), be32_to_cpu(str->rg_free));
+ valid = 0;
+ }
+ if (rgl->rl_dinodes != str->rg_dinodes) {
+ printk(KERN_WARNING "GFS2: rgd: %llu lvb dinode mismatch %u/%u",
+ (unsigned long long)rgd->rd_addr,
+ be32_to_cpu(rgl->rl_dinodes),
+ be32_to_cpu(str->rg_dinodes));
+ valid = 0;
+ }
+ if (rgl->rl_igeneration != str->rg_igeneration) {
+ printk(KERN_WARNING "GFS2: rgd: %llu lvb igen mismatch "
+ "%llu/%llu", (unsigned long long)rgd->rd_addr,
+ (unsigned long long)be64_to_cpu(rgl->rl_igeneration),
+ (unsigned long long)be64_to_cpu(str->rg_igeneration));
+ valid = 0;
+ }
+ return valid;
}
static u32 count_unlinked(struct gfs2_rgrpd *rgd)
@@ -1122,8 +1155,8 @@ static u32 count_unlinked(struct gfs2_rgrpd *rgd)
goal = 0;
buffer = bi->bi_bh->b_data + bi->bi_offset;
WARN_ON(!buffer_uptodate(bi->bi_bh));
- while (goal < bi->bi_len * GFS2_NBBY) {
- goal = gfs2_bitfit(buffer, bi->bi_len, goal,
+ while (goal < bi->bi_blocks) {
+ goal = gfs2_bitfit(buffer, bi->bi_bytes, goal,
GFS2_BLKST_UNLINKED);
if (goal == BFITNOENT)
break;
@@ -1226,7 +1259,7 @@ static int update_rgrp_lvb(struct gfs2_rgrpd *rgd)
rl_flags = be32_to_cpu(rgd->rd_rgl->rl_flags);
rl_flags &= ~GFS2_RDF_MASK;
rgd->rd_flags &= GFS2_RDF_MASK;
- rgd->rd_flags |= (rl_flags | GFS2_RDF_UPTODATE | GFS2_RDF_CHECK);
+ rgd->rd_flags |= (rl_flags | GFS2_RDF_CHECK);
if (rgd->rd_rgl->rl_unlinked == 0)
rgd->rd_flags &= ~GFS2_RDF_CHECK;
rgd->rd_free = be32_to_cpu(rgd->rd_rgl->rl_free);
@@ -1295,7 +1328,7 @@ int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
u32 trimmed = 0;
u8 diff;
- for (x = 0; x < bi->bi_len; x++) {
+ for (x = 0; x < bi->bi_bytes; x++) {
const u8 *clone = bi->bi_clone ? bi->bi_clone : bi->bi_bh->b_data;
clone += bi->bi_offset;
clone += x;
@@ -1541,8 +1574,8 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
if (S_ISDIR(inode->i_mode))
extlen = 1;
else {
- extlen = max_t(u32, atomic_read(&rs->rs_sizehint), ap->target);
- extlen = clamp(extlen, RGRP_RSRV_MINBLKS, free_blocks);
+ extlen = max_t(u32, atomic_read(&ip->i_sizehint), ap->target);
+ extlen = clamp(extlen, (u32)RGRP_RSRV_MINBLKS, free_blocks);
}
if ((rgd->rd_free_clone < rgd->rd_reserved) || (free_blocks < extlen))
return;
@@ -1728,7 +1761,7 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext,
if (state != GFS2_BLKST_UNLINKED && bi->bi_clone)
buffer = bi->bi_clone + bi->bi_offset;
initial_offset = rbm->offset;
- offset = gfs2_bitfit(buffer, bi->bi_len, rbm->offset, state);
+ offset = gfs2_bitfit(buffer, bi->bi_bytes, rbm->offset, state);
if (offset == BFITNOENT)
goto bitmap_full;
rbm->offset = offset;
@@ -1999,7 +2032,7 @@ static inline int fast_to_acquire(struct gfs2_rgrpd *rgd)
* We try our best to find an rgrp that has at least ap->target blocks
* available. After a couple of passes (loops == 2), the prospects of finding
* such an rgrp diminish. At this stage, we return the first rgrp that has
- * atleast ap->min_target blocks available. Either way, we set ap->allowed to
+ * at least ap->min_target blocks available. Either way, we set ap->allowed to
* the number of blocks available in the chosen rgrp.
*
* Returns: 0 on success,
@@ -2053,7 +2086,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
}
error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl,
LM_ST_EXCLUSIVE, flags,
- &rs->rs_rgd_gh);
+ &ip->i_rgd_gh);
if (unlikely(error))
return error;
if (!gfs2_rs_active(rs) && (loops < 2) &&
@@ -2062,13 +2095,13 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
if (sdp->sd_args.ar_rgrplvb) {
error = update_rgrp_lvb(rs->rs_rbm.rgd);
if (unlikely(error)) {
- gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
+ gfs2_glock_dq_uninit(&ip->i_rgd_gh);
return error;
}
}
}
- /* Skip unuseable resource groups */
+ /* Skip unusable resource groups */
if ((rs->rs_rbm.rgd->rd_flags & (GFS2_RGF_NOALLOC |
GFS2_RDF_ERROR)) ||
(loops == 0 && ap->target > rs->rs_rbm.rgd->rd_extfail_pt))
@@ -2105,7 +2138,7 @@ skip_rgrp:
/* Unlock rgrp if required */
if (!rg_locked)
- gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
+ gfs2_glock_dq_uninit(&ip->i_rgd_gh);
next_rgrp:
/* Find the next rgrp, and continue looking */
if (gfs2_select_rgrp(&rs->rs_rbm.rgd, begin))
@@ -2142,10 +2175,8 @@ next_rgrp:
void gfs2_inplace_release(struct gfs2_inode *ip)
{
- struct gfs2_blkreserv *rs = &ip->i_res;
-
- if (gfs2_holder_initialized(&rs->rs_rgd_gh))
- gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
+ if (gfs2_holder_initialized(&ip->i_rgd_gh))
+ gfs2_glock_dq_uninit(&ip->i_rgd_gh);
}
/**
@@ -2184,27 +2215,21 @@ static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode,
/**
* rgblk_free - Change alloc state of given block(s)
* @sdp: the filesystem
+ * @rgd: the resource group the blocks are in
* @bstart: the start of a run of blocks to free
* @blen: the length of the block run (all must lie within ONE RG!)
* @new_state: GFS2_BLKST_XXX the after-allocation block state
- *
- * Returns: Resource group containing the block(s)
*/
-static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
- u32 blen, unsigned char new_state)
+static void rgblk_free(struct gfs2_sbd *sdp, struct gfs2_rgrpd *rgd,
+ u64 bstart, u32 blen, unsigned char new_state)
{
struct gfs2_rbm rbm;
struct gfs2_bitmap *bi, *bi_prev = NULL;
- rbm.rgd = gfs2_blk2rgrpd(sdp, bstart, 1);
- if (!rbm.rgd) {
- if (gfs2_consist(sdp))
- fs_err(sdp, "block = %llu\n", (unsigned long long)bstart);
- return NULL;
- }
-
- gfs2_rbm_from_block(&rbm, bstart);
+ rbm.rgd = rgd;
+ if (WARN_ON_ONCE(gfs2_rbm_from_block(&rbm, bstart)))
+ return;
while (blen--) {
bi = rbm_bi(&rbm);
if (bi != bi_prev) {
@@ -2213,7 +2238,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
GFP_NOFS | __GFP_NOFAIL);
memcpy(bi->bi_clone + bi->bi_offset,
bi->bi_bh->b_data + bi->bi_offset,
- bi->bi_len);
+ bi->bi_bytes);
}
gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh);
bi_prev = bi;
@@ -2221,8 +2246,6 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart,
gfs2_setbit(&rbm, false, new_state);
gfs2_rbm_incr(&rbm);
}
-
- return rbm.rgd;
}
/**
@@ -2244,6 +2267,14 @@ void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl)
(unsigned long long)rgd->rd_addr, rgd->rd_flags,
rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes,
rgd->rd_reserved, rgd->rd_extfail_pt);
+ if (rgd->rd_sbd->sd_args.ar_rgrplvb) {
+ struct gfs2_rgrp_lvb *rgl = rgd->rd_rgl;
+
+ gfs2_print_dbg(seq, " L: f:%02x b:%u i:%u\n",
+ be32_to_cpu(rgl->rl_flags),
+ be32_to_cpu(rgl->rl_free),
+ be32_to_cpu(rgl->rl_dinodes));
+ }
spin_lock(&rgd->rd_rsspin);
for (n = rb_first(&rgd->rd_rstree); n; n = rb_next(&trs->rs_node)) {
trs = rb_entry(n, struct gfs2_blkreserv, rs_node);
@@ -2295,7 +2326,7 @@ static void gfs2_adjust_reservation(struct gfs2_inode *ip,
goto out;
/* We used up our block reservation, so we should
reserve more blocks next time. */
- atomic_add(RGRP_RSRV_ADDBLKS, &rs->rs_sizehint);
+ atomic_add(RGRP_RSRV_ADDBLKS, &ip->i_sizehint);
}
__rs_deltree(rs);
}
@@ -2329,7 +2360,10 @@ static void gfs2_set_alloc_start(struct gfs2_rbm *rbm,
else
goal = rbm->rgd->rd_last_alloc + rbm->rgd->rd_data0;
- gfs2_rbm_from_block(rbm, goal);
+ if (WARN_ON_ONCE(gfs2_rbm_from_block(rbm, goal))) {
+ rbm->bii = 0;
+ rbm->offset = 0;
+ }
}
/**
@@ -2392,7 +2426,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
}
}
if (rbm.rgd->rd_free < *nblocks) {
- pr_warn("nblocks=%u\n", *nblocks);
+ fs_warn(sdp, "nblocks=%u\n", *nblocks);
goto rgrp_error;
}
@@ -2427,20 +2461,19 @@ rgrp_error:
/**
* __gfs2_free_blocks - free a contiguous run of block(s)
* @ip: the inode these blocks are being freed from
+ * @rgd: the resource group the blocks are in
* @bstart: first block of a run of contiguous blocks
* @blen: the length of the block run
* @meta: 1 if the blocks represent metadata
*
*/
-void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta)
+void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
+ u64 bstart, u32 blen, int meta)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- struct gfs2_rgrpd *rgd;
- rgd = rgblk_free(sdp, bstart, blen, GFS2_BLKST_FREE);
- if (!rgd)
- return;
+ rgblk_free(sdp, rgd, bstart, blen, GFS2_BLKST_FREE);
trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE);
rgd->rd_free += blen;
rgd->rd_flags &= ~GFS2_RGF_TRIMMED;
@@ -2455,16 +2488,18 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta)
/**
* gfs2_free_meta - free a contiguous run of data block(s)
* @ip: the inode these blocks are being freed from
+ * @rgd: the resource group the blocks are in
* @bstart: first block of a run of contiguous blocks
* @blen: the length of the block run
*
*/
-void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen)
+void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
+ u64 bstart, u32 blen)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
- __gfs2_free_blocks(ip, bstart, blen, 1);
+ __gfs2_free_blocks(ip, rgd, bstart, blen, 1);
gfs2_statfs_change(sdp, 0, +blen, 0);
gfs2_quota_change(ip, -(s64)blen, ip->i_inode.i_uid, ip->i_inode.i_gid);
}
@@ -2476,9 +2511,10 @@ void gfs2_unlink_di(struct inode *inode)
struct gfs2_rgrpd *rgd;
u64 blkno = ip->i_no_addr;
- rgd = rgblk_free(sdp, blkno, 1, GFS2_BLKST_UNLINKED);
+ rgd = gfs2_blk2rgrpd(sdp, blkno, true);
if (!rgd)
return;
+ rgblk_free(sdp, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED);
gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh);
gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data);
@@ -2488,13 +2524,8 @@ void gfs2_unlink_di(struct inode *inode)
void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip)
{
struct gfs2_sbd *sdp = rgd->rd_sbd;
- struct gfs2_rgrpd *tmp_rgd;
-
- tmp_rgd = rgblk_free(sdp, ip->i_no_addr, 1, GFS2_BLKST_FREE);
- if (!tmp_rgd)
- return;
- gfs2_assert_withdraw(sdp, rgd == tmp_rgd);
+ rgblk_free(sdp, rgd, ip->i_no_addr, 1, GFS2_BLKST_FREE);
if (!rgd->rd_dinodes)
gfs2_consist_rgrpd(rgd);
rgd->rd_dinodes--;
@@ -2538,7 +2569,8 @@ int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type)
rbm.rgd = rgd;
error = gfs2_rbm_from_block(&rbm, no_addr);
- WARN_ON_ONCE(error != 0);
+ if (WARN_ON_ONCE(error))
+ goto fail;
if (gfs2_testbit(&rbm, false) != type)
error = -ESTALE;
@@ -2624,13 +2656,12 @@ void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
* gfs2_rlist_alloc - all RGs have been added to the rlist, now allocate
* and initialize an array of glock holders for them
* @rlist: the list of resource groups
- * @state: the lock state to acquire the RG lock in
*
* FIXME: Don't use NOFAIL
*
*/
-void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state)
+void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist)
{
unsigned int x;
@@ -2639,7 +2670,7 @@ void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state)
GFP_NOFS | __GFP_NOFAIL);
for (x = 0; x < rlist->rl_rgrps; x++)
gfs2_holder_init(rlist->rl_rgd[x]->rd_gl,
- state, 0,
+ LM_ST_EXCLUSIVE, 0,
&rlist->rl_ghs[x]);
}
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index e90478e2f545..b596c3d17988 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -18,8 +18,7 @@
* By reserving 32 blocks at a time, we can optimize / shortcut how we search
* through the bitmaps by looking a word at a time.
*/
-#define RGRP_RSRV_MINBYTES 8
-#define RGRP_RSRV_MINBLKS ((u32)(RGRP_RSRV_MINBYTES * GFS2_NBBY))
+#define RGRP_RSRV_MINBLKS 32
#define RGRP_RSRV_ADDBLKS 64
struct gfs2_rgrpd;
@@ -52,8 +51,10 @@ extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
extern int gfs2_rsqa_alloc(struct gfs2_inode *ip);
extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs);
extern void gfs2_rsqa_delete(struct gfs2_inode *ip, atomic_t *wcount);
-extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta);
-extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
+extern void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
+ u64 bstart, u32 blen, int meta);
+extern void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
+ u64 bstart, u32 blen);
extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
extern void gfs2_unlink_di(struct inode *inode);
extern int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr,
@@ -68,7 +69,7 @@ struct gfs2_rgrp_list {
extern void gfs2_rlist_add(struct gfs2_inode *ip, struct gfs2_rgrp_list *rlist,
u64 block);
-extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state);
+extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist);
extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist);
extern u64 gfs2_ri_total(struct gfs2_sbd *sdp);
extern void gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index c212893534ed..ca71163ff7cf 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -854,10 +854,10 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
return error;
+ flush_workqueue(gfs2_delete_workqueue);
kthread_stop(sdp->sd_quotad_process);
kthread_stop(sdp->sd_logd_process);
- flush_workqueue(gfs2_delete_workqueue);
gfs2_quota_sync(sdp->sd_vfs, 0);
gfs2_statfs_sync(sdp->sd_vfs, 0);
@@ -971,7 +971,7 @@ void gfs2_freeze_func(struct work_struct *work)
error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, 0,
&freeze_gh);
if (error) {
- printk(KERN_INFO "GFS2: couln't get freeze lock : %d\n", error);
+ printk(KERN_INFO "GFS2: couldn't get freeze lock : %d\n", error);
gfs2_assert_withdraw(sdp, 0);
}
else {
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index 064c9a0ef046..423bc2d03dd8 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -74,13 +74,13 @@ fail:
return error;
}
-static void gfs2_print_trans(const struct gfs2_trans *tr)
+static void gfs2_print_trans(struct gfs2_sbd *sdp, const struct gfs2_trans *tr)
{
- pr_warn("Transaction created at: %pSR\n", (void *)tr->tr_ip);
- pr_warn("blocks=%u revokes=%u reserved=%u touched=%u\n",
+ fs_warn(sdp, "Transaction created at: %pSR\n", (void *)tr->tr_ip);
+ fs_warn(sdp, "blocks=%u revokes=%u reserved=%u touched=%u\n",
tr->tr_blocks, tr->tr_revokes, tr->tr_reserved,
test_bit(TR_TOUCHED, &tr->tr_flags));
- pr_warn("Buf %u/%u Databuf %u/%u Revoke %u/%u\n",
+ fs_warn(sdp, "Buf %u/%u Databuf %u/%u Revoke %u/%u\n",
tr->tr_num_buf_new, tr->tr_num_buf_rm,
tr->tr_num_databuf_new, tr->tr_num_databuf_rm,
tr->tr_num_revoke, tr->tr_num_revoke_rm);
@@ -109,7 +109,7 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
if (gfs2_assert_withdraw(sdp, (nbuf <= tr->tr_blocks) &&
(tr->tr_num_revoke <= tr->tr_revokes)))
- gfs2_print_trans(tr);
+ gfs2_print_trans(sdp, tr);
gfs2_log_commit(sdp, tr);
if (alloced && !test_bit(TR_ATTACHED, &tr->tr_flags))
@@ -225,12 +225,13 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh)
set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
mh = (struct gfs2_meta_header *)bd->bd_bh->b_data;
if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) {
- pr_err("Attempting to add uninitialised block to journal (inplace block=%lld)\n",
+ fs_err(sdp, "Attempting to add uninitialised block to "
+ "journal (inplace block=%lld)\n",
(unsigned long long)bd->bd_bh->b_blocknr);
BUG();
}
if (unlikely(state == SFS_FROZEN)) {
- printk(KERN_INFO "GFS2:adding buf while frozen\n");
+ fs_info(sdp, "GFS2:adding buf while frozen\n");
gfs2_assert_withdraw(sdp, 0);
}
gfs2_pin(sdp, bd->bd_bh);
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 59c811de0dc7..0a814ccac41d 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -19,6 +19,7 @@
#include "gfs2.h"
#include "incore.h"
#include "glock.h"
+#include "rgrp.h"
#include "util.h"
struct kmem_cache *gfs2_glock_cachep __read_mostly;
@@ -181,6 +182,8 @@ int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide,
{
struct gfs2_sbd *sdp = rgd->rd_sbd;
int rv;
+
+ gfs2_rgrp_dump(NULL, rgd->rd_gl);
rv = gfs2_lm_withdraw(sdp,
"fatal: filesystem consistency error\n"
" RG = %llu\n"
@@ -256,12 +259,13 @@ void gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh,
const char *function, char *file, unsigned int line,
bool withdraw)
{
- fs_err(sdp,
- "fatal: I/O error\n"
- " block = %llu\n"
- " function = %s, file = %s, line = %u\n",
- (unsigned long long)bh->b_blocknr,
- function, file, line);
+ if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+ fs_err(sdp,
+ "fatal: I/O error\n"
+ " block = %llu\n"
+ " function = %s, file = %s, line = %u\n",
+ (unsigned long long)bh->b_blocknr,
+ function, file, line);
if (withdraw)
gfs2_lm_withdraw(sdp, NULL);
}
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index 96ac4aba4738..9278fecba632 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -86,7 +86,7 @@ static inline int gfs2_meta_check(struct gfs2_sbd *sdp,
struct gfs2_meta_header *mh = (struct gfs2_meta_header *)bh->b_data;
u32 magic = be32_to_cpu(mh->mh_magic);
if (unlikely(magic != GFS2_MAGIC)) {
- pr_err("Magic number missing at %llu\n",
+ fs_err(sdp, "Magic number missing at %llu\n",
(unsigned long long)bh->b_blocknr);
return -EIO;
}
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 38515988aaf7..996c915a9c97 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -283,7 +283,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
blen++;
else {
if (bstart)
- gfs2_free_meta(ip, bstart, blen);
+ gfs2_free_meta(ip, rgd, bstart, blen);
bstart = bn;
blen = 1;
}
@@ -292,7 +292,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh,
gfs2_add_inode_blocks(&ip->i_inode, -1);
}
if (bstart)
- gfs2_free_meta(ip, bstart, blen);
+ gfs2_free_meta(ip, rgd, bstart, blen);
if (prev && !leave) {
u32 len;
@@ -1250,6 +1250,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
{
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
struct gfs2_rgrp_list rlist;
+ struct gfs2_rgrpd *rgd;
struct buffer_head *indbh, *dibh;
__be64 *eablk, *end;
unsigned int rg_blocks = 0;
@@ -1299,11 +1300,10 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
else
goto out;
- gfs2_rlist_alloc(&rlist, LM_ST_EXCLUSIVE);
+ gfs2_rlist_alloc(&rlist);
for (x = 0; x < rlist.rl_rgrps; x++) {
- struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(rlist.rl_ghs[x].gh_gl);
-
+ rgd = gfs2_glock2rgrp(rlist.rl_ghs[x].gh_gl);
rg_blocks += rgd->rd_length;
}
@@ -1320,6 +1320,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
eablk = (__be64 *)(indbh->b_data + sizeof(struct gfs2_meta_header));
bstart = 0;
+ rgd = NULL;
blen = 0;
for (; eablk < end; eablk++) {
@@ -1333,8 +1334,9 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
blen++;
else {
if (bstart)
- gfs2_free_meta(ip, bstart, blen);
+ gfs2_free_meta(ip, rgd, bstart, blen);
bstart = bn;
+ rgd = gfs2_blk2rgrpd(sdp, bstart, true);
blen = 1;
}
@@ -1342,7 +1344,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
gfs2_add_inode_blocks(&ip->i_inode, -1);
}
if (bstart)
- gfs2_free_meta(ip, bstart, blen);
+ gfs2_free_meta(ip, rgd, bstart, blen);
ip->i_diskflags &= ~GFS2_DIF_EA_INDIRECT;
@@ -1391,7 +1393,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip)
if (error)
goto out_gunlock;
- gfs2_free_meta(ip, ip->i_eattr, 1);
+ gfs2_free_meta(ip, rgd, ip->i_eattr, 1);
ip->i_eattr = 0;
gfs2_add_inode_blocks(&ip->i_inode, -1);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 3212c29235ce..2005529af560 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -230,7 +230,7 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
ret = -EXDEV;
if (src_file.file->f_path.mnt != dst_file->f_path.mnt)
goto fdput;
- ret = do_clone_file_range(src_file.file, off, dst_file, destoff, olen);
+ ret = vfs_clone_file_range(src_file.file, off, dst_file, destoff, olen);
fdput:
fdput(src_file);
return ret;
diff --git a/fs/iomap.c b/fs/iomap.c
index 74762b1ec233..ec15cf2ec696 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -1051,6 +1051,7 @@ iomap_page_mkwrite_actor(struct inode *inode, loff_t pos, loff_t length,
} else {
WARN_ON_ONCE(!PageUptodate(page));
iomap_page_create(inode, page);
+ set_page_dirty(page);
}
return length;
@@ -1090,7 +1091,6 @@ int iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops)
length -= ret;
}
- set_page_dirty(page);
wait_for_stable_page(page);
return VM_FAULT_LOCKED;
out_unlock:
diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c
index 453a6a1fff34..2b4d5013dc5d 100644
--- a/fs/jffs2/background.c
+++ b/fs/jffs2/background.c
@@ -125,7 +125,7 @@ static int jffs2_garbage_collect_thread(void *_c)
if (try_to_freeze())
goto again;
- signr = kernel_dequeue_signal(NULL);
+ signr = kernel_dequeue_signal();
switch(signr) {
case SIGSTOP:
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index 87bdf0f4cba1..902a7dd10e5c 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -285,10 +285,8 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent)
sb->s_fs_info = c;
ret = jffs2_parse_options(c, data);
- if (ret) {
- kfree(c);
+ if (ret)
return -EINVAL;
- }
/* Initialize JFFS2 superblock locks, the further initialization will
* be done later */
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 2e71b6e7e646..8c06a6ea862d 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -146,12 +146,16 @@ int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir)
if (default_acl) {
rc = __jfs_set_acl(tid, inode, ACL_TYPE_DEFAULT, default_acl);
posix_acl_release(default_acl);
+ } else {
+ inode->i_default_acl = NULL;
}
if (acl) {
if (!rc)
rc = __jfs_set_acl(tid, inode, ACL_TYPE_ACCESS, acl);
posix_acl_release(acl);
+ } else {
+ inode->i_acl = NULL;
}
JFS_IP(inode)->mode2 = (JFS_IP(inode)->mode2 & 0xffff0000) |
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 054cc761b426..805ae9e8944a 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -166,7 +166,6 @@ void jfs_evict_inode(struct inode *inode)
/*
* Free the inode from the quota allocation.
*/
- dquot_initialize(inode);
dquot_free_inode(inode);
}
} else {
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 09da5cf14e27..65d8fc87ab11 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -247,7 +247,7 @@ static const match_table_t tokens = {
{Opt_resize_nosize, "resize"},
{Opt_errors, "errors=%s"},
{Opt_ignore, "noquota"},
- {Opt_ignore, "quota"},
+ {Opt_quota, "quota"},
{Opt_usrquota, "usrquota"},
{Opt_grpquota, "grpquota"},
{Opt_uid, "uid=%u"},
diff --git a/fs/namespace.c b/fs/namespace.c
index 99186556f8d3..d86830c86ce8 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2642,6 +2642,7 @@ static long exact_copy_from_user(void *to, const void __user * from,
if (!access_ok(VERIFY_READ, from, n))
return n;
+ current->kernel_uaccess_faults_ok++;
while (n) {
if (__get_user(c, f)) {
memset(t, 0, n);
@@ -2651,6 +2652,7 @@ static long exact_copy_from_user(void *to, const void __user * from,
f++;
n--;
}
+ current->kernel_uaccess_faults_ok--;
return n;
}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 34830f6457ea..8220a168282e 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1637,6 +1637,14 @@ static void nfs_state_set_delegation(struct nfs4_state *state,
write_sequnlock(&state->seqlock);
}
+static void nfs_state_clear_delegation(struct nfs4_state *state)
+{
+ write_seqlock(&state->seqlock);
+ nfs4_stateid_copy(&state->stateid, &state->open_stateid);
+ clear_bit(NFS_DELEGATED_STATE, &state->flags);
+ write_sequnlock(&state->seqlock);
+}
+
static int update_open_stateid(struct nfs4_state *state,
const nfs4_stateid *open_stateid,
const nfs4_stateid *delegation,
@@ -2145,10 +2153,7 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx,
if (IS_ERR(opendata))
return PTR_ERR(opendata);
nfs4_stateid_copy(&opendata->o_arg.u.delegation, stateid);
- write_seqlock(&state->seqlock);
- nfs4_stateid_copy(&state->stateid, &state->open_stateid);
- write_sequnlock(&state->seqlock);
- clear_bit(NFS_DELEGATED_STATE, &state->flags);
+ nfs_state_clear_delegation(state);
switch (type & (FMODE_READ|FMODE_WRITE)) {
case FMODE_READ|FMODE_WRITE:
case FMODE_WRITE:
@@ -2601,10 +2606,7 @@ static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state,
const nfs4_stateid *stateid)
{
nfs_remove_bad_delegation(state->inode, stateid);
- write_seqlock(&state->seqlock);
- nfs4_stateid_copy(&state->stateid, &state->open_stateid);
- write_sequnlock(&state->seqlock);
- clear_bit(NFS_DELEGATED_STATE, &state->flags);
+ nfs_state_clear_delegation(state);
}
static void nfs40_clear_delegation_stateid(struct nfs4_state *state)
@@ -2672,15 +2674,20 @@ static void nfs41_check_delegation_stateid(struct nfs4_state *state)
delegation = rcu_dereference(NFS_I(state->inode)->delegation);
if (delegation == NULL) {
rcu_read_unlock();
+ nfs_state_clear_delegation(state);
return;
}
nfs4_stateid_copy(&stateid, &delegation->stateid);
- if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags) ||
- !test_and_clear_bit(NFS_DELEGATION_TEST_EXPIRED,
- &delegation->flags)) {
+ if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) {
+ rcu_read_unlock();
+ nfs_state_clear_delegation(state);
+ return;
+ }
+
+ if (!test_and_clear_bit(NFS_DELEGATION_TEST_EXPIRED,
+ &delegation->flags)) {
rcu_read_unlock();
- nfs_finish_clear_delegation_stateid(state, &stateid);
return;
}
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 3df0eb52da1c..40a08cd483f0 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1390,6 +1390,8 @@ int nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_
if (!nfs4_state_mark_reclaim_nograce(clp, state))
return -EBADF;
+ nfs_inode_find_delegation_state_and_recover(state->inode,
+ &state->stateid);
dprintk("%s: scheduling stateid recovery for server %s\n", __func__,
clp->cl_hostname);
nfs4_schedule_state_manager(clp);
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index a275fba93170..b1483b303e0b 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -1137,7 +1137,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_callback_event,
TP_fast_assign(
__entry->error = error;
__entry->fhandle = nfs_fhandle_hash(fhandle);
- if (inode != NULL) {
+ if (!IS_ERR_OR_NULL(inode)) {
__entry->fileid = NFS_FILEID(inode);
__entry->dev = inode->i_sb->s_dev;
} else {
@@ -1194,7 +1194,7 @@ DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event,
TP_fast_assign(
__entry->error = error;
__entry->fhandle = nfs_fhandle_hash(fhandle);
- if (inode != NULL) {
+ if (!IS_ERR_OR_NULL(inode)) {
__entry->fileid = NFS_FILEID(inode);
__entry->dev = inode->i_sb->s_dev;
} else {
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index e8f232de484f..7d9a51e6b847 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1740,16 +1740,16 @@ static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx,
return ret;
}
-static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo)
+static int pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo)
{
/*
* send layoutcommit as it can hold up layoutreturn due to lseg
* reference
*/
pnfs_layoutcommit_inode(lo->plh_inode, false);
- return !wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN,
+ return wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN,
nfs_wait_bit_killable,
- TASK_UNINTERRUPTIBLE);
+ TASK_KILLABLE);
}
static void nfs_layoutget_begin(struct pnfs_layout_hdr *lo)
@@ -1830,7 +1830,9 @@ pnfs_update_layout(struct inode *ino,
}
lookup_again:
- nfs4_client_recover_expired_lease(clp);
+ lseg = ERR_PTR(nfs4_client_recover_expired_lease(clp));
+ if (IS_ERR(lseg))
+ goto out;
first = false;
spin_lock(&ino->i_lock);
lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
@@ -1863,9 +1865,9 @@ lookup_again:
if (list_empty(&lo->plh_segs) &&
atomic_read(&lo->plh_outstanding) != 0) {
spin_unlock(&ino->i_lock);
- if (wait_var_event_killable(&lo->plh_outstanding,
- atomic_read(&lo->plh_outstanding) == 0
- || !list_empty(&lo->plh_segs)))
+ lseg = ERR_PTR(wait_var_event_killable(&lo->plh_outstanding,
+ atomic_read(&lo->plh_outstanding)));
+ if (IS_ERR(lseg) || !list_empty(&lo->plh_segs))
goto out_put_layout_hdr;
pnfs_put_layout_hdr(lo);
goto lookup_again;
@@ -1898,8 +1900,11 @@ lookup_again:
if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET,
&lo->plh_flags)) {
spin_unlock(&ino->i_lock);
- wait_on_bit(&lo->plh_flags, NFS_LAYOUT_FIRST_LAYOUTGET,
- TASK_UNINTERRUPTIBLE);
+ lseg = ERR_PTR(wait_on_bit(&lo->plh_flags,
+ NFS_LAYOUT_FIRST_LAYOUTGET,
+ TASK_KILLABLE));
+ if (IS_ERR(lseg))
+ goto out_put_layout_hdr;
pnfs_put_layout_hdr(lo);
dprintk("%s retrying\n", __func__);
goto lookup_again;
@@ -1925,7 +1930,8 @@ lookup_again:
if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
spin_unlock(&ino->i_lock);
dprintk("%s wait for layoutreturn\n", __func__);
- if (pnfs_prepare_to_retry_layoutget(lo)) {
+ lseg = ERR_PTR(pnfs_prepare_to_retry_layoutget(lo));
+ if (!IS_ERR(lseg)) {
if (first)
pnfs_clear_first_layoutget(lo);
pnfs_put_layout_hdr(lo);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 55a099e47ba2..b53e76391e52 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -541,7 +541,8 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
__be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst,
u64 dst_pos, u64 count)
{
- return nfserrno(do_clone_file_range(src, src_pos, dst, dst_pos, count));
+ return nfserrno(vfs_clone_file_range(src, src_pos, dst, dst_pos,
+ count));
}
ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index f174397b63a0..ababdbfab537 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -351,16 +351,9 @@ int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is,
iter_info.srcu_idx = srcu_read_lock(&fsnotify_mark_srcu);
- if ((mask & FS_MODIFY) ||
- (test_mask & to_tell->i_fsnotify_mask)) {
- iter_info.marks[FSNOTIFY_OBJ_TYPE_INODE] =
- fsnotify_first_mark(&to_tell->i_fsnotify_marks);
- }
-
- if (mnt && ((mask & FS_MODIFY) ||
- (test_mask & mnt->mnt_fsnotify_mask))) {
- iter_info.marks[FSNOTIFY_OBJ_TYPE_INODE] =
- fsnotify_first_mark(&to_tell->i_fsnotify_marks);
+ iter_info.marks[FSNOTIFY_OBJ_TYPE_INODE] =
+ fsnotify_first_mark(&to_tell->i_fsnotify_marks);
+ if (mnt) {
iter_info.marks[FSNOTIFY_OBJ_TYPE_VFSMOUNT] =
fsnotify_first_mark(&mnt->mnt_fsnotify_marks);
}
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index d9ebe11c8990..1d098c3c00e0 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -342,6 +342,7 @@ int ocfs2_read_blocks(struct ocfs2_caching_info *ci, u64 block, int nr,
* for this bh as it's not marked locally
* uptodate. */
status = -EIO;
+ clear_buffer_needs_validate(bh);
put_bh(bh);
bhs[i] = NULL;
continue;
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index aaca0949fe53..826f0567ec43 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -584,9 +584,9 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
res->last_used = 0;
- spin_lock(&dlm->spinlock);
+ spin_lock(&dlm->track_lock);
list_add_tail(&res->tracking, &dlm->tracking_list);
- spin_unlock(&dlm->spinlock);
+ spin_unlock(&dlm->track_lock);
memset(res->lvb, 0, DLM_LVB_LEN);
memset(res->refmap, 0, sizeof(res->refmap));
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 8e712b614e6e..933aac5da193 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -96,7 +96,9 @@ struct ocfs2_unblock_ctl {
};
/* Lockdep class keys */
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
static struct lock_class_key lockdep_keys[OCFS2_NUM_LOCK_TYPES];
+#endif
static int ocfs2_check_meta_downconvert(struct ocfs2_lock_res *lockres,
int new_level);
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 7869622af22a..7a5ee145c733 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2946,6 +2946,7 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle,
if (map_end & (PAGE_SIZE - 1))
to = map_end & (PAGE_SIZE - 1);
+retry:
page = find_or_create_page(mapping, page_index, GFP_NOFS);
if (!page) {
ret = -ENOMEM;
@@ -2954,11 +2955,18 @@ int ocfs2_duplicate_clusters_by_page(handle_t *handle,
}
/*
- * In case PAGE_SIZE <= CLUSTER_SIZE, This page
- * can't be dirtied before we CoW it out.
+ * In case PAGE_SIZE <= CLUSTER_SIZE, we do not expect a dirty
+ * page, so write it back.
*/
- if (PAGE_SIZE <= OCFS2_SB(sb)->s_clustersize)
- BUG_ON(PageDirty(page));
+ if (PAGE_SIZE <= OCFS2_SB(sb)->s_clustersize) {
+ if (PageDirty(page)) {
+ /*
+ * write_on_page will unlock the page on return
+ */
+ ret = write_one_page(page);
+ goto retry;
+ }
+ }
if (!PageUptodate(page)) {
ret = block_read_full_page(page, ocfs2_get_block);
diff --git a/fs/orangefs/acl.c b/fs/orangefs/acl.c
index 10587413b20e..72d2ff17d27b 100644
--- a/fs/orangefs/acl.c
+++ b/fs/orangefs/acl.c
@@ -167,12 +167,16 @@ int orangefs_init_acl(struct inode *inode, struct inode *dir)
error = __orangefs_set_acl(inode, default_acl,
ACL_TYPE_DEFAULT);
posix_acl_release(default_acl);
+ } else {
+ inode->i_default_acl = NULL;
}
if (acl) {
if (!error)
error = __orangefs_set_acl(inode, acl, ACL_TYPE_ACCESS);
posix_acl_release(acl);
+ } else {
+ inode->i_acl = NULL;
}
/* If mode of the inode was changed, then do a forcible ->setattr */
diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c
index 31932879b716..5e65d818937b 100644
--- a/fs/orangefs/inode.c
+++ b/fs/orangefs/inode.c
@@ -405,7 +405,11 @@ struct inode *orangefs_iget(struct super_block *sb,
orangefs_test_inode,
orangefs_set_inode,
ref);
- if (!inode || !(inode->i_state & I_NEW))
+
+ if (!inode)
+ return ERR_PTR(-ENOMEM);
+
+ if (!(inode->i_state & I_NEW))
return inode;
error = orangefs_inode_getattr(inode, 1, 1, STATX_ALL);
@@ -448,7 +452,7 @@ struct inode *orangefs_new_inode(struct super_block *sb, struct inode *dir,
inode = new_inode(sb);
if (!inode)
- return NULL;
+ return ERR_PTR(-ENOMEM);
orangefs_set_inode(inode, ref);
inode->i_ino = hash; /* needed for stat etc */
diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c
index 625b0580f9be..c8676c996249 100644
--- a/fs/orangefs/namei.c
+++ b/fs/orangefs/namei.c
@@ -58,7 +58,6 @@ static int orangefs_create(struct inode *dir,
goto out;
ref = new_op->downcall.resp.create.refn;
- op_release(new_op);
inode = orangefs_new_inode(dir->i_sb, dir, S_IFREG | mode, 0, &ref);
if (IS_ERR(inode)) {
@@ -92,6 +91,7 @@ static int orangefs_create(struct inode *dir,
mark_inode_dirty_sync(dir);
ret = 0;
out:
+ op_release(new_op);
gossip_debug(GOSSIP_NAME_DEBUG,
"%s: %pd: returning %d\n",
__func__,
@@ -157,7 +157,7 @@ static struct dentry *orangefs_lookup(struct inode *dir, struct dentry *dentry,
new_op->downcall.resp.lookup.refn.fs_id,
ret);
- if (ret >= 0) {
+ if (ret == 0) {
orangefs_set_timeout(dentry);
inode = orangefs_iget(dir->i_sb, &new_op->downcall.resp.lookup.refn);
} else if (ret == -ENOENT) {
@@ -269,7 +269,6 @@ static int orangefs_symlink(struct inode *dir,
}
ref = new_op->downcall.resp.sym.refn;
- op_release(new_op);
inode = orangefs_new_inode(dir->i_sb, dir, S_IFLNK | mode, 0, &ref);
if (IS_ERR(inode)) {
@@ -307,6 +306,7 @@ static int orangefs_symlink(struct inode *dir,
mark_inode_dirty_sync(dir);
ret = 0;
out:
+ op_release(new_op);
return ret;
}
@@ -346,7 +346,6 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
}
ref = new_op->downcall.resp.mkdir.refn;
- op_release(new_op);
inode = orangefs_new_inode(dir->i_sb, dir, S_IFDIR | mode, 0, &ref);
if (IS_ERR(inode)) {
@@ -379,6 +378,7 @@ static int orangefs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode
orangefs_inode_setattr(dir, &iattr);
mark_inode_dirty_sync(dir);
out:
+ op_release(new_op);
return ret;
}
diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c
index dd28079f518c..19739aaee675 100644
--- a/fs/orangefs/orangefs-sysfs.c
+++ b/fs/orangefs/orangefs-sysfs.c
@@ -323,7 +323,7 @@ static ssize_t sysfs_service_op_show(struct kobject *kobj,
/* Can't do a service_operation if the client is not running... */
rc = is_daemon_in_service();
if (rc) {
- pr_info("%s: Client not running :%d:\n",
+ pr_info_ratelimited("%s: Client not running :%d:\n",
__func__,
is_daemon_in_service());
goto out;
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index 296037afecdb..1cc797a08a5b 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -141,7 +141,7 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len)
}
/* Try to use clone_file_range to clone up within the same fs */
- error = vfs_clone_file_range(old_file, 0, new_file, 0, len);
+ error = do_clone_file_range(old_file, 0, new_file, 0, len);
if (!error)
goto out;
/* Couldn't clone, so now we try to copy the data */
diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c
index 32e9282893c9..986313da0c88 100644
--- a/fs/overlayfs/file.c
+++ b/fs/overlayfs/file.c
@@ -131,9 +131,6 @@ static int ovl_open(struct inode *inode, struct file *file)
if (IS_ERR(realfile))
return PTR_ERR(realfile);
- /* For O_DIRECT dentry_open() checks f_mapping->a_ops->direct_IO */
- file->f_mapping = realfile->f_mapping;
-
file->private_data = realfile;
return 0;
@@ -243,8 +240,10 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter)
goto out_unlock;
old_cred = ovl_override_creds(file_inode(file)->i_sb);
+ file_start_write(real.file);
ret = vfs_iter_write(real.file, iter, &iocb->ki_pos,
ovl_iocb_to_rwf(iocb));
+ file_end_write(real.file);
revert_creds(old_cred);
/* Update size */
@@ -334,6 +333,25 @@ static long ovl_fallocate(struct file *file, int mode, loff_t offset, loff_t len
return ret;
}
+static int ovl_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
+{
+ struct fd real;
+ const struct cred *old_cred;
+ int ret;
+
+ ret = ovl_real_fdget(file, &real);
+ if (ret)
+ return ret;
+
+ old_cred = ovl_override_creds(file_inode(file)->i_sb);
+ ret = vfs_fadvise(real.file, offset, len, advice);
+ revert_creds(old_cred);
+
+ fdput(real);
+
+ return ret;
+}
+
static long ovl_real_ioctl(struct file *file, unsigned int cmd,
unsigned long arg)
{
@@ -502,6 +520,7 @@ const struct file_operations ovl_file_operations = {
.fsync = ovl_fsync,
.mmap = ovl_mmap,
.fallocate = ovl_fallocate,
+ .fadvise = ovl_fadvise,
.unlocked_ioctl = ovl_ioctl,
.compat_ioctl = ovl_compat_ioctl,
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index e0bb217c01e2..3b7ed5d2279c 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -467,6 +467,10 @@ static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
return -EOPNOTSUPP;
old_cred = ovl_override_creds(inode->i_sb);
+
+ if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC)
+ filemap_write_and_wait(realinode->i_mapping);
+
err = realinode->i_op->fiemap(realinode, fieinfo, start, len);
revert_creds(old_cred);
@@ -500,6 +504,11 @@ static const struct inode_operations ovl_special_inode_operations = {
.update_time = ovl_update_time,
};
+static const struct address_space_operations ovl_aops = {
+ /* For O_DIRECT dentry_open() checks f_mapping->a_ops->direct_IO */
+ .direct_IO = noop_direct_IO,
+};
+
/*
* It is possible to stack overlayfs instance on top of another
* overlayfs instance as lower layer. We need to annonate the
@@ -571,6 +580,7 @@ static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev,
case S_IFREG:
inode->i_op = &ovl_file_inode_operations;
inode->i_fop = &ovl_file_operations;
+ inode->i_mapping->a_ops = &ovl_aops;
break;
case S_IFDIR:
diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c
index f28711846dd6..9c0ca6a7becf 100644
--- a/fs/overlayfs/namei.c
+++ b/fs/overlayfs/namei.c
@@ -686,7 +686,7 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper,
index = NULL;
goto out;
}
- pr_warn_ratelimited("overlayfs: failed inode index lookup (ino=%lu, key=%*s, err=%i);\n"
+ pr_warn_ratelimited("overlayfs: failed inode index lookup (ino=%lu, key=%.*s, err=%i);\n"
"overlayfs: mount with '-o index=off' to disable inodes index.\n",
d_inode(origin)->i_ino, name.len, name.name,
err);
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index f61839e1054c..a3c0d9584312 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -152,8 +152,8 @@ static inline int ovl_do_setxattr(struct dentry *dentry, const char *name,
const void *value, size_t size, int flags)
{
int err = vfs_setxattr(dentry, name, value, size, flags);
- pr_debug("setxattr(%pd2, \"%s\", \"%*s\", 0x%x) = %i\n",
- dentry, name, (int) size, (char *) value, flags, err);
+ pr_debug("setxattr(%pd2, \"%s\", \"%*pE\", %zu, 0x%x) = %i\n",
+ dentry, name, min((int)size, 48), value, size, flags, err);
return err;
}
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index 2e0fc93c2c06..30adc9d408a0 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -982,16 +982,6 @@ static int ovl_get_upper(struct ovl_fs *ofs, struct path *upperpath)
if (err)
goto out;
- err = -EBUSY;
- if (ovl_inuse_trylock(upperpath->dentry)) {
- ofs->upperdir_locked = true;
- } else if (ofs->config.index) {
- pr_err("overlayfs: upperdir is in-use by another mount, mount with '-o index=off' to override exclusive upperdir protection.\n");
- goto out;
- } else {
- pr_warn("overlayfs: upperdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n");
- }
-
upper_mnt = clone_private_mount(upperpath);
err = PTR_ERR(upper_mnt);
if (IS_ERR(upper_mnt)) {
@@ -1002,6 +992,17 @@ static int ovl_get_upper(struct ovl_fs *ofs, struct path *upperpath)
/* Don't inherit atime flags */
upper_mnt->mnt_flags &= ~(MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME);
ofs->upper_mnt = upper_mnt;
+
+ err = -EBUSY;
+ if (ovl_inuse_trylock(ofs->upper_mnt->mnt_root)) {
+ ofs->upperdir_locked = true;
+ } else if (ofs->config.index) {
+ pr_err("overlayfs: upperdir is in-use by another mount, mount with '-o index=off' to override exclusive upperdir protection.\n");
+ goto out;
+ } else {
+ pr_warn("overlayfs: upperdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n");
+ }
+
err = 0;
out:
return err;
@@ -1101,8 +1102,10 @@ static int ovl_get_workdir(struct ovl_fs *ofs, struct path *upperpath)
goto out;
}
+ ofs->workbasedir = dget(workpath.dentry);
+
err = -EBUSY;
- if (ovl_inuse_trylock(workpath.dentry)) {
+ if (ovl_inuse_trylock(ofs->workbasedir)) {
ofs->workdir_locked = true;
} else if (ofs->config.index) {
pr_err("overlayfs: workdir is in-use by another mount, mount with '-o index=off' to override exclusive workdir protection.\n");
@@ -1111,7 +1114,6 @@ static int ovl_get_workdir(struct ovl_fs *ofs, struct path *upperpath)
pr_warn("overlayfs: workdir is in-use by another mount, accessing files from both mounts will result in undefined behavior.\n");
}
- ofs->workbasedir = dget(workpath.dentry);
err = ovl_make_workdir(ofs, &workpath);
if (err)
goto out;
diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c
index 8cfb62cc8672..ace4fe4c39a9 100644
--- a/fs/overlayfs/util.c
+++ b/fs/overlayfs/util.c
@@ -683,7 +683,7 @@ static void ovl_cleanup_index(struct dentry *dentry)
struct dentry *upperdentry = ovl_dentry_upper(dentry);
struct dentry *index = NULL;
struct inode *inode;
- struct qstr name;
+ struct qstr name = { };
int err;
err = ovl_get_index_name(lowerdentry, &name);
@@ -726,6 +726,7 @@ static void ovl_cleanup_index(struct dentry *dentry)
goto fail;
out:
+ kfree(name.name);
dput(index);
return;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index ccf86f16d9f0..7e9f07bf260d 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -407,6 +407,20 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns,
unsigned long *entries;
int err;
+ /*
+ * The ability to racily run the kernel stack unwinder on a running task
+ * and then observe the unwinder output is scary; while it is useful for
+ * debugging kernel issues, it can also allow an attacker to leak kernel
+ * stack contents.
+ * Doing this in a manner that is at least safe from races would require
+ * some work to ensure that the remote task can not be scheduled; and
+ * even then, this would still expose the unwinder as local attack
+ * surface.
+ * Therefore, this interface is restricted to root.
+ */
+ if (!file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN))
+ return -EACCES;
+
entries = kmalloc_array(MAX_STACK_TRACE_DEPTH, sizeof(*entries),
GFP_KERNEL);
if (!entries)
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index ad72261ee3fe..d297fe4472a9 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -464,6 +464,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
ret = -EFAULT;
goto out;
}
+ m = NULL; /* skip the list anchor */
} else if (m->type == KCORE_VMALLOC) {
vread(buf, (char *)start, tsz);
/* we have to zero-fill user buffer even if no read */
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index cbde728f8ac6..91ae16fbd7d5 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -24,6 +24,8 @@
#include <linux/vmalloc.h>
#include <linux/pagemap.h>
#include <linux/uaccess.h>
+#include <linux/mem_encrypt.h>
+#include <asm/pgtable.h>
#include <asm/io.h>
#include "internal.h"
@@ -98,7 +100,8 @@ static int pfn_is_ram(unsigned long pfn)
/* Reads a page from the oldmem device from given offset. */
static ssize_t read_from_oldmem(char *buf, size_t count,
- u64 *ppos, int userbuf)
+ u64 *ppos, int userbuf,
+ bool encrypted)
{
unsigned long pfn, offset;
size_t nr_bytes;
@@ -120,8 +123,15 @@ static ssize_t read_from_oldmem(char *buf, size_t count,
if (pfn_is_ram(pfn) == 0)
memset(buf, 0, nr_bytes);
else {
- tmp = copy_oldmem_page(pfn, buf, nr_bytes,
- offset, userbuf);
+ if (encrypted)
+ tmp = copy_oldmem_page_encrypted(pfn, buf,
+ nr_bytes,
+ offset,
+ userbuf);
+ else
+ tmp = copy_oldmem_page(pfn, buf, nr_bytes,
+ offset, userbuf);
+
if (tmp < 0)
return tmp;
}
@@ -155,7 +165,7 @@ void __weak elfcorehdr_free(unsigned long long addr)
*/
ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
{
- return read_from_oldmem(buf, count, ppos, 0);
+ return read_from_oldmem(buf, count, ppos, 0, false);
}
/*
@@ -163,7 +173,7 @@ ssize_t __weak elfcorehdr_read(char *buf, size_t count, u64 *ppos)
*/
ssize_t __weak elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos)
{
- return read_from_oldmem(buf, count, ppos, 0);
+ return read_from_oldmem(buf, count, ppos, 0, sme_active());
}
/*
@@ -173,10 +183,21 @@ int __weak remap_oldmem_pfn_range(struct vm_area_struct *vma,
unsigned long from, unsigned long pfn,
unsigned long size, pgprot_t prot)
{
+ prot = pgprot_encrypted(prot);
return remap_pfn_range(vma, from, pfn, size, prot);
}
/*
+ * Architectures which support memory encryption override this.
+ */
+ssize_t __weak
+copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize,
+ unsigned long offset, int userbuf)
+{
+ return copy_oldmem_page(pfn, buf, csize, offset, userbuf);
+}
+
+/*
* Copy to either kernel or user space
*/
static int copy_to(void *target, void *src, size_t size, int userbuf)
@@ -351,7 +372,8 @@ static ssize_t __read_vmcore(char *buffer, size_t buflen, loff_t *fpos,
m->offset + m->size - *fpos,
buflen);
start = m->paddr + *fpos - m->offset;
- tmp = read_from_oldmem(buffer, tsz, &start, userbuf);
+ tmp = read_from_oldmem(buffer, tsz, &start,
+ userbuf, sme_active());
if (tmp < 0)
return tmp;
buflen -= tsz;
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 5fcb845b9fec..8cf2218b46a7 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -482,12 +482,10 @@ static struct file_system_type pstore_fs_type = {
.kill_sb = pstore_kill_sb,
};
-static int __init init_pstore_fs(void)
+int __init pstore_init_fs(void)
{
int err;
- pstore_choose_compression();
-
/* Create a convenient mount point for people to access pstore */
err = sysfs_create_mount_point(fs_kobj, "pstore");
if (err)
@@ -500,14 +498,9 @@ static int __init init_pstore_fs(void)
out:
return err;
}
-module_init(init_pstore_fs)
-static void __exit exit_pstore_fs(void)
+void __exit pstore_exit_fs(void)
{
unregister_filesystem(&pstore_fs_type);
sysfs_remove_mount_point(fs_kobj, "pstore");
}
-module_exit(exit_pstore_fs)
-
-MODULE_AUTHOR("Tony Luck <tony.luck@intel.com>");
-MODULE_LICENSE("GPL");
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index fb767e28aeb2..7062ea4bc57c 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -37,7 +37,8 @@ extern bool pstore_is_mounted(void);
extern void pstore_record_init(struct pstore_record *record,
struct pstore_info *psi);
-/* Called during module_init() */
-extern void __init pstore_choose_compression(void);
+/* Called during pstore init/exit. */
+int __init pstore_init_fs(void);
+void __exit pstore_exit_fs(void);
#endif
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 15e99d5a681d..b821054ca3ed 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -274,36 +274,56 @@ static int pstore_decompress(void *in, void *out,
static void allocate_buf_for_compression(void)
{
+ struct crypto_comp *ctx;
+ int size;
+ char *buf;
+
+ /* Skip if not built-in or compression backend not selected yet. */
if (!IS_ENABLED(CONFIG_PSTORE_COMPRESS) || !zbackend)
return;
+ /* Skip if no pstore backend yet or compression init already done. */
+ if (!psinfo || tfm)
+ return;
+
if (!crypto_has_comp(zbackend->name, 0, 0)) {
- pr_err("No %s compression\n", zbackend->name);
+ pr_err("Unknown compression: %s\n", zbackend->name);
return;
}
- big_oops_buf_sz = zbackend->zbufsize(psinfo->bufsize);
- if (big_oops_buf_sz <= 0)
+ size = zbackend->zbufsize(psinfo->bufsize);
+ if (size <= 0) {
+ pr_err("Invalid compression size for %s: %d\n",
+ zbackend->name, size);
return;
+ }
- big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL);
- if (!big_oops_buf) {
- pr_err("allocate compression buffer error!\n");
+ buf = kmalloc(size, GFP_KERNEL);
+ if (!buf) {
+ pr_err("Failed %d byte compression buffer allocation for: %s\n",
+ size, zbackend->name);
return;
}
- tfm = crypto_alloc_comp(zbackend->name, 0, 0);
- if (IS_ERR_OR_NULL(tfm)) {
- kfree(big_oops_buf);
- big_oops_buf = NULL;
- pr_err("crypto_alloc_comp() failed!\n");
+ ctx = crypto_alloc_comp(zbackend->name, 0, 0);
+ if (IS_ERR_OR_NULL(ctx)) {
+ kfree(buf);
+ pr_err("crypto_alloc_comp('%s') failed: %ld\n", zbackend->name,
+ PTR_ERR(ctx));
return;
}
+
+ /* A non-NULL big_oops_buf indicates compression is available. */
+ tfm = ctx;
+ big_oops_buf_sz = size;
+ big_oops_buf = buf;
+
+ pr_info("Using compression: %s\n", zbackend->name);
}
static void free_buf_for_compression(void)
{
- if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && !IS_ERR_OR_NULL(tfm))
+ if (IS_ENABLED(CONFIG_PSTORE_COMPRESS) && tfm)
crypto_free_comp(tfm);
kfree(big_oops_buf);
big_oops_buf = NULL;
@@ -774,14 +794,43 @@ void __init pstore_choose_compression(void)
for (step = zbackends; step->name; step++) {
if (!strcmp(compress, step->name)) {
zbackend = step;
- pr_info("using %s compression\n", zbackend->name);
return;
}
}
}
+static int __init pstore_init(void)
+{
+ int ret;
+
+ pstore_choose_compression();
+
+ /*
+ * Check if any pstore backends registered earlier but did not
+ * initialize compression because crypto was not ready. If so,
+ * initialize compression now.
+ */
+ allocate_buf_for_compression();
+
+ ret = pstore_init_fs();
+ if (ret)
+ return ret;
+
+ return 0;
+}
+late_initcall(pstore_init);
+
+static void __exit pstore_exit(void)
+{
+ pstore_exit_fs();
+}
+module_exit(pstore_exit)
+
module_param(compress, charp, 0444);
MODULE_PARM_DESC(compress, "Pstore compression to use");
module_param(backend, charp, 0444);
MODULE_PARM_DESC(backend, "Pstore backend to use");
+
+MODULE_AUTHOR("Tony Luck <tony.luck@intel.com>");
+MODULE_LICENSE("GPL");
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index bbd1e357c23d..ffcff6516e89 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -587,9 +587,16 @@ static int ramoops_init_przs(const char *name,
goto fail;
for (i = 0; i < *cnt; i++) {
+ char *label;
+
+ if (*cnt == 1)
+ label = kasprintf(GFP_KERNEL, "ramoops:%s", name);
+ else
+ label = kasprintf(GFP_KERNEL, "ramoops:%s(%d/%d)",
+ name, i, *cnt - 1);
prz_ar[i] = persistent_ram_new(*paddr, zone_sz, sig,
- &cxt->ecc_info,
- cxt->memtype, flags);
+ &cxt->ecc_info,
+ cxt->memtype, flags, label);
if (IS_ERR(prz_ar[i])) {
err = PTR_ERR(prz_ar[i]);
dev_err(dev, "failed to request %s mem region (0x%zx@0x%llx): %d\n",
@@ -619,6 +626,8 @@ static int ramoops_init_prz(const char *name,
struct persistent_ram_zone **prz,
phys_addr_t *paddr, size_t sz, u32 sig)
{
+ char *label;
+
if (!sz)
return 0;
@@ -629,8 +638,9 @@ static int ramoops_init_prz(const char *name,
return -ENOMEM;
}
+ label = kasprintf(GFP_KERNEL, "ramoops:%s", name);
*prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info,
- cxt->memtype, 0);
+ cxt->memtype, 0, label);
if (IS_ERR(*prz)) {
int err = PTR_ERR(*prz);
@@ -898,8 +908,22 @@ static struct platform_driver ramoops_driver = {
},
};
-static void ramoops_register_dummy(void)
+static inline void ramoops_unregister_dummy(void)
+{
+ platform_device_unregister(dummy);
+ dummy = NULL;
+
+ kfree(dummy_data);
+ dummy_data = NULL;
+}
+
+static void __init ramoops_register_dummy(void)
{
+ /*
+ * Prepare a dummy platform data structure to carry the module
+ * parameters. If mem_size isn't set, then there are no module
+ * parameters, and we can skip this.
+ */
if (!mem_size)
return;
@@ -932,21 +956,28 @@ static void ramoops_register_dummy(void)
if (IS_ERR(dummy)) {
pr_info("could not create platform device: %ld\n",
PTR_ERR(dummy));
+ dummy = NULL;
+ ramoops_unregister_dummy();
}
}
static int __init ramoops_init(void)
{
+ int ret;
+
ramoops_register_dummy();
- return platform_driver_register(&ramoops_driver);
+ ret = platform_driver_register(&ramoops_driver);
+ if (ret != 0)
+ ramoops_unregister_dummy();
+
+ return ret;
}
-late_initcall(ramoops_init);
+postcore_initcall(ramoops_init);
static void __exit ramoops_exit(void)
{
platform_driver_unregister(&ramoops_driver);
- platform_device_unregister(dummy);
- kfree(dummy_data);
+ ramoops_unregister_dummy();
}
module_exit(ramoops_exit);
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c
index 951a14edcf51..12e21f789194 100644
--- a/fs/pstore/ram_core.c
+++ b/fs/pstore/ram_core.c
@@ -429,15 +429,20 @@ static void *persistent_ram_vmap(phys_addr_t start, size_t size,
vaddr = vmap(pages, page_count, VM_MAP, prot);
kfree(pages);
- return vaddr;
+ /*
+ * Since vmap() uses page granularity, we must add the offset
+ * into the page here, to get the byte granularity address
+ * into the mapping to represent the actual "start" location.
+ */
+ return vaddr + offset_in_page(start);
}
static void *persistent_ram_iomap(phys_addr_t start, size_t size,
- unsigned int memtype)
+ unsigned int memtype, char *label)
{
void *va;
- if (!request_mem_region(start, size, "persistent_ram")) {
+ if (!request_mem_region(start, size, label ?: "ramoops")) {
pr_err("request mem region (0x%llx@0x%llx) failed\n",
(unsigned long long)size, (unsigned long long)start);
return NULL;
@@ -448,6 +453,11 @@ static void *persistent_ram_iomap(phys_addr_t start, size_t size,
else
va = ioremap_wc(start, size);
+ /*
+ * Since request_mem_region() and ioremap() are byte-granularity
+ * there is no need handle anything special like we do when the
+ * vmap() case in persistent_ram_vmap() above.
+ */
return va;
}
@@ -460,7 +470,8 @@ static int persistent_ram_buffer_map(phys_addr_t start, phys_addr_t size,
if (pfn_valid(start >> PAGE_SHIFT))
prz->vaddr = persistent_ram_vmap(start, size, memtype);
else
- prz->vaddr = persistent_ram_iomap(start, size, memtype);
+ prz->vaddr = persistent_ram_iomap(start, size, memtype,
+ prz->label);
if (!prz->vaddr) {
pr_err("%s: Failed to map 0x%llx pages at 0x%llx\n", __func__,
@@ -468,7 +479,7 @@ static int persistent_ram_buffer_map(phys_addr_t start, phys_addr_t size,
return -ENOMEM;
}
- prz->buffer = prz->vaddr + offset_in_page(start);
+ prz->buffer = prz->vaddr;
prz->buffer_size = size - sizeof(struct persistent_ram_buffer);
return 0;
@@ -515,7 +526,8 @@ void persistent_ram_free(struct persistent_ram_zone *prz)
if (prz->vaddr) {
if (pfn_valid(prz->paddr >> PAGE_SHIFT)) {
- vunmap(prz->vaddr);
+ /* We must vunmap() at page-granularity. */
+ vunmap(prz->vaddr - offset_in_page(prz->paddr));
} else {
iounmap(prz->vaddr);
release_mem_region(prz->paddr, prz->size);
@@ -530,12 +542,13 @@ void persistent_ram_free(struct persistent_ram_zone *prz)
prz->ecc_info.par = NULL;
persistent_ram_free_old(prz);
+ kfree(prz->label);
kfree(prz);
}
struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
u32 sig, struct persistent_ram_ecc_info *ecc_info,
- unsigned int memtype, u32 flags)
+ unsigned int memtype, u32 flags, char *label)
{
struct persistent_ram_zone *prz;
int ret = -ENOMEM;
@@ -549,6 +562,7 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size,
/* Initialize general buffer state. */
raw_spin_lock_init(&prz->buffer_lock);
prz->flags = flags;
+ prz->label = label;
ret = persistent_ram_buffer_map(start, size, prz, memtype);
if (ret)
diff --git a/fs/read_write.c b/fs/read_write.c
index 39b4a21dd933..8a2737f0d61d 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1818,8 +1818,8 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in,
}
EXPORT_SYMBOL(vfs_clone_file_prep_inodes);
-int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
- struct file *file_out, loff_t pos_out, u64 len)
+int do_clone_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out, u64 len)
{
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
@@ -1866,6 +1866,19 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
return ret;
}
+EXPORT_SYMBOL(do_clone_file_range);
+
+int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
+ struct file *file_out, loff_t pos_out, u64 len)
+{
+ int ret;
+
+ file_start_write(file_out);
+ ret = do_clone_file_range(file_in, pos_in, file_out, pos_out, len);
+ file_end_write(file_out);
+
+ return ret;
+}
EXPORT_SYMBOL(vfs_clone_file_range);
/*
diff --git a/fs/signalfd.c b/fs/signalfd.c
index 4fcd1498acf5..757afc7c5895 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -79,7 +79,7 @@ static __poll_t signalfd_poll(struct file *file, poll_table *wait)
* Copied from copy_siginfo_to_user() in kernel/signal.c
*/
static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
- siginfo_t const *kinfo)
+ kernel_siginfo_t const *kinfo)
{
struct signalfd_siginfo new;
@@ -163,7 +163,7 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo,
return sizeof(*uinfo);
}
-static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, siginfo_t *info,
+static ssize_t signalfd_dequeue(struct signalfd_ctx *ctx, kernel_siginfo_t *info,
int nonblock)
{
ssize_t ret;
@@ -215,7 +215,7 @@ static ssize_t signalfd_read(struct file *file, char __user *buf, size_t count,
struct signalfd_siginfo __user *siginfo;
int nonblock = file->f_flags & O_NONBLOCK;
ssize_t ret, total = 0;
- siginfo_t info;
+ kernel_siginfo_t info;
count /= sizeof(struct signalfd_siginfo);
if (!count)
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 23e7042666a7..fec62e9dfbe6 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1912,7 +1912,9 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
mutex_unlock(&c->bu_mutex);
}
- ubifs_assert(c, c->lst.taken_empty_lebs > 0);
+ if (!c->need_recovery)
+ ubifs_assert(c, c->lst.taken_empty_lebs > 0);
+
return 0;
}
@@ -1954,6 +1956,9 @@ static struct ubi_volume_desc *open_ubi(const char *name, int mode)
int dev, vol;
char *endptr;
+ if (!name || !*name)
+ return ERR_PTR(-EINVAL);
+
/* First, try to open using the device node path method */
ubi = ubi_open_volume_path(name, mode);
if (!IS_ERR(ubi))
@@ -2332,8 +2337,8 @@ late_initcall(ubifs_init);
static void __exit ubifs_exit(void)
{
- WARN_ON(list_empty(&ubifs_infos));
- WARN_ON(atomic_long_read(&ubifs_clean_zn_cnt) == 0);
+ WARN_ON(!list_empty(&ubifs_infos));
+ WARN_ON(atomic_long_read(&ubifs_clean_zn_cnt) != 0);
dbg_debugfs_exit();
ubifs_compressors_exit();
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 61afdfee4b28..f5ad1ede7990 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -152,12 +152,6 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
ui->data_len = size;
mutex_lock(&host_ui->ui_mutex);
-
- if (!host->i_nlink) {
- err = -ENOENT;
- goto out_noent;
- }
-
host->i_ctime = current_time(host);
host_ui->xattr_cnt += 1;
host_ui->xattr_size += CALC_DENT_SIZE(fname_len(nm));
@@ -190,7 +184,6 @@ out_cancel:
host_ui->xattr_size -= CALC_XATTR_BYTES(size);
host_ui->xattr_names -= fname_len(nm);
host_ui->flags &= ~UBIFS_CRYPT_FL;
-out_noent:
mutex_unlock(&host_ui->ui_mutex);
out_free:
make_bad_inode(inode);
@@ -242,12 +235,6 @@ static int change_xattr(struct ubifs_info *c, struct inode *host,
mutex_unlock(&ui->ui_mutex);
mutex_lock(&host_ui->ui_mutex);
-
- if (!host->i_nlink) {
- err = -ENOENT;
- goto out_noent;
- }
-
host->i_ctime = current_time(host);
host_ui->xattr_size -= CALC_XATTR_BYTES(old_size);
host_ui->xattr_size += CALC_XATTR_BYTES(size);
@@ -269,7 +256,6 @@ static int change_xattr(struct ubifs_info *c, struct inode *host,
out_cancel:
host_ui->xattr_size -= CALC_XATTR_BYTES(size);
host_ui->xattr_size += CALC_XATTR_BYTES(old_size);
-out_noent:
mutex_unlock(&host_ui->ui_mutex);
make_bad_inode(inode);
out_free:
@@ -496,12 +482,6 @@ static int remove_xattr(struct ubifs_info *c, struct inode *host,
return err;
mutex_lock(&host_ui->ui_mutex);
-
- if (!host->i_nlink) {
- err = -ENOENT;
- goto out_noent;
- }
-
host->i_ctime = current_time(host);
host_ui->xattr_cnt -= 1;
host_ui->xattr_size -= CALC_DENT_SIZE(fname_len(nm));
@@ -521,7 +501,6 @@ out_cancel:
host_ui->xattr_size += CALC_DENT_SIZE(fname_len(nm));
host_ui->xattr_size += CALC_XATTR_BYTES(ui->data_len);
host_ui->xattr_names += fname_len(nm);
-out_noent:
mutex_unlock(&host_ui->ui_mutex);
ubifs_release_budget(c, &req);
make_bad_inode(inode);
@@ -561,9 +540,6 @@ static int ubifs_xattr_remove(struct inode *host, const char *name)
ubifs_assert(c, inode_is_locked(host));
- if (!host->i_nlink)
- return -ENOENT;
-
if (fname_len(&nm) > UBIFS_MAX_NLEN)
return -ENAMETOOLONG;
diff --git a/fs/xattr.c b/fs/xattr.c
index daa732550088..0d6a6a4af861 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -948,17 +948,19 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs,
int err = 0;
#ifdef CONFIG_FS_POSIX_ACL
- if (inode->i_acl) {
- err = xattr_list_one(&buffer, &remaining_size,
- XATTR_NAME_POSIX_ACL_ACCESS);
- if (err)
- return err;
- }
- if (inode->i_default_acl) {
- err = xattr_list_one(&buffer, &remaining_size,
- XATTR_NAME_POSIX_ACL_DEFAULT);
- if (err)
- return err;
+ if (IS_POSIXACL(inode)) {
+ if (inode->i_acl) {
+ err = xattr_list_one(&buffer, &remaining_size,
+ XATTR_NAME_POSIX_ACL_ACCESS);
+ if (err)
+ return err;
+ }
+ if (inode->i_default_acl) {
+ err = xattr_list_one(&buffer, &remaining_size,
+ XATTR_NAME_POSIX_ACL_DEFAULT);
+ if (err)
+ return err;
+ }
}
#endif
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 1e671d4eb6fa..844ed87b1900 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -191,6 +191,128 @@ xfs_attr_calc_size(
return nblks;
}
+STATIC int
+xfs_attr_try_sf_addname(
+ struct xfs_inode *dp,
+ struct xfs_da_args *args)
+{
+
+ struct xfs_mount *mp = dp->i_mount;
+ int error, error2;
+
+ error = xfs_attr_shortform_addname(args);
+ if (error == -ENOSPC)
+ return error;
+
+ /*
+ * Commit the shortform mods, and we're done.
+ * NOTE: this is also the error path (EEXIST, etc).
+ */
+ if (!error && (args->flags & ATTR_KERNOTIME) == 0)
+ xfs_trans_ichgtime(args->trans, dp, XFS_ICHGTIME_CHG);
+
+ if (mp->m_flags & XFS_MOUNT_WSYNC)
+ xfs_trans_set_sync(args->trans);
+
+ error2 = xfs_trans_commit(args->trans);
+ args->trans = NULL;
+ return error ? error : error2;
+}
+
+/*
+ * Set the attribute specified in @args.
+ */
+int
+xfs_attr_set_args(
+ struct xfs_da_args *args,
+ struct xfs_buf **leaf_bp)
+{
+ struct xfs_inode *dp = args->dp;
+ int error;
+
+ /*
+ * If the attribute list is non-existent or a shortform list,
+ * upgrade it to a single-leaf-block attribute list.
+ */
+ if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL ||
+ (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
+ dp->i_d.di_anextents == 0)) {
+
+ /*
+ * Build initial attribute list (if required).
+ */
+ if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS)
+ xfs_attr_shortform_create(args);
+
+ /*
+ * Try to add the attr to the attribute list in the inode.
+ */
+ error = xfs_attr_try_sf_addname(dp, args);
+ if (error != -ENOSPC)
+ return error;
+
+ /*
+ * It won't fit in the shortform, transform to a leaf block.
+ * GROT: another possible req'mt for a double-split btree op.
+ */
+ error = xfs_attr_shortform_to_leaf(args, leaf_bp);
+ if (error)
+ return error;
+
+ /*
+ * Prevent the leaf buffer from being unlocked so that a
+ * concurrent AIL push cannot grab the half-baked leaf
+ * buffer and run into problems with the write verifier.
+ */
+ xfs_trans_bhold(args->trans, *leaf_bp);
+
+ error = xfs_defer_finish(&args->trans);
+ if (error)
+ return error;
+
+ /*
+ * Commit the leaf transformation. We'll need another
+ * (linked) transaction to add the new attribute to the
+ * leaf.
+ */
+ error = xfs_trans_roll_inode(&args->trans, dp);
+ if (error)
+ return error;
+ xfs_trans_bjoin(args->trans, *leaf_bp);
+ *leaf_bp = NULL;
+ }
+
+ if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
+ error = xfs_attr_leaf_addname(args);
+ else
+ error = xfs_attr_node_addname(args);
+ return error;
+}
+
+/*
+ * Remove the attribute specified in @args.
+ */
+int
+xfs_attr_remove_args(
+ struct xfs_da_args *args)
+{
+ struct xfs_inode *dp = args->dp;
+ int error;
+
+ if (!xfs_inode_hasattr(dp)) {
+ error = -ENOATTR;
+ } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
+ ASSERT(dp->i_afp->if_flags & XFS_IFINLINE);
+ error = xfs_attr_shortform_remove(args);
+ } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
+ error = xfs_attr_leaf_removename(args);
+ } else {
+ error = xfs_attr_node_removename(args);
+ }
+
+ return error;
+}
+
int
xfs_attr_set(
struct xfs_inode *dp,
@@ -204,7 +326,7 @@ xfs_attr_set(
struct xfs_da_args args;
struct xfs_trans_res tres;
int rsvd = (flags & ATTR_ROOT) != 0;
- int error, err2, local;
+ int error, local;
XFS_STATS_INC(mp, xs_attr_set);
@@ -255,93 +377,17 @@ xfs_attr_set(
error = xfs_trans_reserve_quota_nblks(args.trans, dp, args.total, 0,
rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES :
XFS_QMOPT_RES_REGBLKS);
- if (error) {
- xfs_iunlock(dp, XFS_ILOCK_EXCL);
- xfs_trans_cancel(args.trans);
- return error;
- }
+ if (error)
+ goto out_trans_cancel;
xfs_trans_ijoin(args.trans, dp, 0);
-
- /*
- * If the attribute list is non-existent or a shortform list,
- * upgrade it to a single-leaf-block attribute list.
- */
- if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL ||
- (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS &&
- dp->i_d.di_anextents == 0)) {
-
- /*
- * Build initial attribute list (if required).
- */
- if (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS)
- xfs_attr_shortform_create(&args);
-
- /*
- * Try to add the attr to the attribute list in
- * the inode.
- */
- error = xfs_attr_shortform_addname(&args);
- if (error != -ENOSPC) {
- /*
- * Commit the shortform mods, and we're done.
- * NOTE: this is also the error path (EEXIST, etc).
- */
- ASSERT(args.trans != NULL);
-
- /*
- * If this is a synchronous mount, make sure that
- * the transaction goes to disk before returning
- * to the user.
- */
- if (mp->m_flags & XFS_MOUNT_WSYNC)
- xfs_trans_set_sync(args.trans);
-
- if (!error && (flags & ATTR_KERNOTIME) == 0) {
- xfs_trans_ichgtime(args.trans, dp,
- XFS_ICHGTIME_CHG);
- }
- err2 = xfs_trans_commit(args.trans);
- xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
- return error ? error : err2;
- }
-
- /*
- * It won't fit in the shortform, transform to a leaf block.
- * GROT: another possible req'mt for a double-split btree op.
- */
- error = xfs_attr_shortform_to_leaf(&args, &leaf_bp);
- if (error)
- goto out;
- /*
- * Prevent the leaf buffer from being unlocked so that a
- * concurrent AIL push cannot grab the half-baked leaf
- * buffer and run into problems with the write verifier.
- */
- xfs_trans_bhold(args.trans, leaf_bp);
- error = xfs_defer_finish(&args.trans);
- if (error)
- goto out;
-
- /*
- * Commit the leaf transformation. We'll need another (linked)
- * transaction to add the new attribute to the leaf, which
- * means that we have to hold & join the leaf buffer here too.
- */
- error = xfs_trans_roll_inode(&args.trans, dp);
- if (error)
- goto out;
- xfs_trans_bjoin(args.trans, leaf_bp);
- leaf_bp = NULL;
- }
-
- if (xfs_bmap_one_block(dp, XFS_ATTR_FORK))
- error = xfs_attr_leaf_addname(&args);
- else
- error = xfs_attr_node_addname(&args);
+ error = xfs_attr_set_args(&args, &leaf_bp);
if (error)
- goto out;
+ goto out_release_leaf;
+ if (!args.trans) {
+ /* shortform attribute has already been committed */
+ goto out_unlock;
+ }
/*
* If this is a synchronous mount, make sure that the
@@ -358,17 +404,17 @@ xfs_attr_set(
*/
xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE);
error = xfs_trans_commit(args.trans);
+out_unlock:
xfs_iunlock(dp, XFS_ILOCK_EXCL);
-
return error;
-out:
+out_release_leaf:
if (leaf_bp)
xfs_trans_brelse(args.trans, leaf_bp);
+out_trans_cancel:
if (args.trans)
xfs_trans_cancel(args.trans);
- xfs_iunlock(dp, XFS_ILOCK_EXCL);
- return error;
+ goto out_unlock;
}
/*
@@ -423,17 +469,7 @@ xfs_attr_remove(
*/
xfs_trans_ijoin(args.trans, dp, 0);
- if (!xfs_inode_hasattr(dp)) {
- error = -ENOATTR;
- } else if (dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) {
- ASSERT(dp->i_afp->if_flags & XFS_IFINLINE);
- error = xfs_attr_shortform_remove(&args);
- } else if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) {
- error = xfs_attr_leaf_removename(&args);
- } else {
- error = xfs_attr_node_removename(&args);
- }
-
+ error = xfs_attr_remove_args(&args);
if (error)
goto out;
@@ -587,7 +623,7 @@ xfs_attr_leaf_addname(
*/
error = xfs_attr3_leaf_to_node(args);
if (error)
- goto out_defer_cancel;
+ return error;
error = xfs_defer_finish(&args->trans);
if (error)
return error;
@@ -675,7 +711,7 @@ xfs_attr_leaf_addname(
error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
/* bp is gone due to xfs_da_shrink_inode */
if (error)
- goto out_defer_cancel;
+ return error;
error = xfs_defer_finish(&args->trans);
if (error)
return error;
@@ -693,9 +729,6 @@ xfs_attr_leaf_addname(
error = xfs_attr3_leaf_clearflag(args);
}
return error;
-out_defer_cancel:
- xfs_defer_cancel(args->trans);
- return error;
}
/*
@@ -738,15 +771,12 @@ xfs_attr_leaf_removename(
error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
/* bp is gone due to xfs_da_shrink_inode */
if (error)
- goto out_defer_cancel;
+ return error;
error = xfs_defer_finish(&args->trans);
if (error)
return error;
}
return 0;
-out_defer_cancel:
- xfs_defer_cancel(args->trans);
- return error;
}
/*
@@ -864,7 +894,7 @@ restart:
state = NULL;
error = xfs_attr3_leaf_to_node(args);
if (error)
- goto out_defer_cancel;
+ goto out;
error = xfs_defer_finish(&args->trans);
if (error)
goto out;
@@ -888,7 +918,7 @@ restart:
*/
error = xfs_da3_split(state);
if (error)
- goto out_defer_cancel;
+ goto out;
error = xfs_defer_finish(&args->trans);
if (error)
goto out;
@@ -984,7 +1014,7 @@ restart:
if (retval && (state->path.active > 1)) {
error = xfs_da3_join(state);
if (error)
- goto out_defer_cancel;
+ goto out;
error = xfs_defer_finish(&args->trans);
if (error)
goto out;
@@ -1013,9 +1043,6 @@ out:
if (error)
return error;
return retval;
-out_defer_cancel:
- xfs_defer_cancel(args->trans);
- goto out;
}
/*
@@ -1107,7 +1134,7 @@ xfs_attr_node_removename(
if (retval && (state->path.active > 1)) {
error = xfs_da3_join(state);
if (error)
- goto out_defer_cancel;
+ goto out;
error = xfs_defer_finish(&args->trans);
if (error)
goto out;
@@ -1138,7 +1165,7 @@ xfs_attr_node_removename(
error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
/* bp is gone due to xfs_da_shrink_inode */
if (error)
- goto out_defer_cancel;
+ goto out;
error = xfs_defer_finish(&args->trans);
if (error)
goto out;
@@ -1150,9 +1177,6 @@ xfs_attr_node_removename(
out:
xfs_da_state_free(state);
return error;
-out_defer_cancel:
- xfs_defer_cancel(args->trans);
- goto out;
}
/*
diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/libxfs/xfs_attr.h
index 033ff8c478e2..bdf52a333f3f 100644
--- a/fs/xfs/xfs_attr.h
+++ b/fs/xfs/libxfs/xfs_attr.h
@@ -140,7 +140,9 @@ int xfs_attr_get(struct xfs_inode *ip, const unsigned char *name,
unsigned char *value, int *valuelenp, int flags);
int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
unsigned char *value, int valuelen, int flags);
+int xfs_attr_set_args(struct xfs_da_args *args, struct xfs_buf **leaf_bp);
int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
+int xfs_attr_remove_args(struct xfs_da_args *args);
int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
int flags, struct attrlist_cursor_kern *cursor);
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index af094063e402..d89363c6b523 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -485,7 +485,7 @@ xfs_attr_rmtval_set(
blkcnt, XFS_BMAPI_ATTRFORK, args->total, &map,
&nmap);
if (error)
- goto out_defer_cancel;
+ return error;
error = xfs_defer_finish(&args->trans);
if (error)
return error;
@@ -553,9 +553,6 @@ xfs_attr_rmtval_set(
}
ASSERT(valuelen == 0);
return 0;
-out_defer_cancel:
- xfs_defer_cancel(args->trans);
- return error;
}
/*
@@ -625,7 +622,7 @@ xfs_attr_rmtval_remove(
error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
XFS_BMAPI_ATTRFORK, 1, &done);
if (error)
- goto out_defer_cancel;
+ return error;
error = xfs_defer_finish(&args->trans);
if (error)
return error;
@@ -638,7 +635,4 @@ xfs_attr_rmtval_remove(
return error;
}
return 0;
-out_defer_cancel:
- xfs_defer_cancel(args->trans);
- return error;
}
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 2760314fdf7f..74d7228e755b 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -673,7 +673,8 @@ xfs_bmap_extents_to_btree(
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
/*
- * Make space in the inode incore.
+ * Make space in the inode incore. This needs to be undone if we fail
+ * to expand the root.
*/
xfs_iroot_realloc(ip, 1, whichfork);
ifp->if_flags |= XFS_IFBROOT;
@@ -711,16 +712,15 @@ xfs_bmap_extents_to_btree(
args.minlen = args.maxlen = args.prod = 1;
args.wasdel = wasdel;
*logflagsp = 0;
- if ((error = xfs_alloc_vextent(&args))) {
- ASSERT(ifp->if_broot == NULL);
- goto err1;
- }
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ goto out_root_realloc;
if (WARN_ON_ONCE(args.fsbno == NULLFSBLOCK)) {
- ASSERT(ifp->if_broot == NULL);
error = -ENOSPC;
- goto err1;
+ goto out_root_realloc;
}
+
/*
* Allocation can't fail, the space was reserved.
*/
@@ -732,9 +732,10 @@ xfs_bmap_extents_to_btree(
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, 1L);
abp = xfs_btree_get_bufl(mp, tp, args.fsbno, 0);
if (!abp) {
- error = -ENOSPC;
- goto err2;
+ error = -EFSCORRUPTED;
+ goto out_unreserve_dquot;
}
+
/*
* Fill in the child block.
*/
@@ -775,11 +776,12 @@ xfs_bmap_extents_to_btree(
*logflagsp = XFS_ILOG_CORE | xfs_ilog_fbroot(whichfork);
return 0;
-err2:
+out_unreserve_dquot:
xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
-err1:
+out_root_realloc:
xfs_iroot_realloc(ip, -1, whichfork);
XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_EXTENTS);
+ ASSERT(ifp->if_broot == NULL);
xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
return error;
@@ -1017,6 +1019,34 @@ xfs_bmap_add_attrfork_local(
return -EFSCORRUPTED;
}
+/* Set an inode attr fork off based on the format */
+int
+xfs_bmap_set_attrforkoff(
+ struct xfs_inode *ip,
+ int size,
+ int *version)
+{
+ switch (ip->i_d.di_format) {
+ case XFS_DINODE_FMT_DEV:
+ ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
+ break;
+ case XFS_DINODE_FMT_LOCAL:
+ case XFS_DINODE_FMT_EXTENTS:
+ case XFS_DINODE_FMT_BTREE:
+ ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
+ if (!ip->i_d.di_forkoff)
+ ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
+ else if ((ip->i_mount->m_flags & XFS_MOUNT_ATTR2) && version)
+ *version = 2;
+ break;
+ default:
+ ASSERT(0);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
/*
* Convert inode from non-attributed to attributed.
* Must not be in a transaction, ip must not be locked.
@@ -1068,26 +1098,9 @@ xfs_bmap_add_attrfork(
xfs_trans_ijoin(tp, ip, 0);
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
- switch (ip->i_d.di_format) {
- case XFS_DINODE_FMT_DEV:
- ip->i_d.di_forkoff = roundup(sizeof(xfs_dev_t), 8) >> 3;
- break;
- case XFS_DINODE_FMT_LOCAL:
- case XFS_DINODE_FMT_EXTENTS:
- case XFS_DINODE_FMT_BTREE:
- ip->i_d.di_forkoff = xfs_attr_shortform_bytesfit(ip, size);
- if (!ip->i_d.di_forkoff)
- ip->i_d.di_forkoff = xfs_default_attroffset(ip) >> 3;
- else if (mp->m_flags & XFS_MOUNT_ATTR2)
- version = 2;
- break;
- default:
- ASSERT(0);
- error = -EINVAL;
+ error = xfs_bmap_set_attrforkoff(ip, size, &version);
+ if (error)
goto trans_cancel;
- }
-
ASSERT(ip->i_afp == NULL);
ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
ip->i_afp->if_flags = XFS_IFEXTENTS;
@@ -4079,8 +4092,7 @@ xfs_bmapi_allocate(
* extents to real extents when we're about to write the data.
*/
if ((!bma->wasdel || (bma->flags & XFS_BMAPI_COWFORK)) &&
- (bma->flags & XFS_BMAPI_PREALLOC) &&
- xfs_sb_version_hasextflgbit(&mp->m_sb))
+ (bma->flags & XFS_BMAPI_PREALLOC))
bma->got.br_state = XFS_EXT_UNWRITTEN;
if (bma->wasdel)
@@ -5243,8 +5255,7 @@ __xfs_bunmapi(
* unmapping part of it. But we can't really
* get rid of part of a realtime extent.
*/
- if (del.br_state == XFS_EXT_UNWRITTEN ||
- !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
+ if (del.br_state == XFS_EXT_UNWRITTEN) {
/*
* This piece is unwritten, or we're not
* using unwritten extents. Skip over it.
@@ -5294,10 +5305,9 @@ __xfs_bunmapi(
del.br_blockcount -= mod;
del.br_startoff += mod;
del.br_startblock += mod;
- } else if ((del.br_startoff == start &&
- (del.br_state == XFS_EXT_UNWRITTEN ||
- tp->t_blk_res == 0)) ||
- !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
+ } else if (del.br_startoff == start &&
+ (del.br_state == XFS_EXT_UNWRITTEN ||
+ tp->t_blk_res == 0)) {
/*
* Can't make it unwritten. There isn't
* a full extent here so just skip it.
@@ -6112,11 +6122,7 @@ xfs_bmap_validate_extent(
XFS_FSB_TO_AGNO(mp, endfsb))
return __this_address;
}
- if (irec->br_state != XFS_EXT_NORM) {
- if (whichfork != XFS_DATA_FORK)
- return __this_address;
- if (!xfs_sb_version_hasextflgbit(&mp->m_sb))
- return __this_address;
- }
+ if (irec->br_state != XFS_EXT_NORM && whichfork != XFS_DATA_FORK)
+ return __this_address;
return NULL;
}
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index b6e9b639e731..488dc8860fd7 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -183,6 +183,7 @@ void xfs_trim_extent(struct xfs_bmbt_irec *irec, xfs_fileoff_t bno,
xfs_filblks_t len);
void xfs_trim_extent_eof(struct xfs_bmbt_irec *, struct xfs_inode *);
int xfs_bmap_add_attrfork(struct xfs_inode *ip, int size, int rsvd);
+int xfs_bmap_set_attrforkoff(struct xfs_inode *ip, int size, int *version);
void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
void __xfs_bmap_add_free(struct xfs_trans *tp, xfs_fsblock_t bno,
xfs_filblks_t len, struct xfs_owner_info *oinfo,
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 059bc44c27e8..9995d5ae380b 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -287,6 +287,8 @@ static inline bool xfs_sb_good_v4_features(struct xfs_sb *sbp)
{
if (!(sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT))
return false;
+ if (!(sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT))
+ return false;
/* check for unknown features in the fs */
if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKBITS) ||
@@ -357,12 +359,6 @@ static inline bool xfs_sb_version_haslogv2(struct xfs_sb *sbp)
(sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT);
}
-static inline bool xfs_sb_version_hasextflgbit(struct xfs_sb *sbp)
-{
- return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 ||
- (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT);
-}
-
static inline bool xfs_sb_version_hassector(struct xfs_sb *sbp)
{
return (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT);
@@ -1016,6 +1012,8 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
#define XFS_DIFLAG_EXTSZINHERIT_BIT 12 /* inherit inode extent size */
#define XFS_DIFLAG_NODEFRAG_BIT 13 /* do not reorganize/defragment */
#define XFS_DIFLAG_FILESTREAM_BIT 14 /* use filestream allocator */
+/* Do not use bit 15, di_flags is legacy and unchanging now */
+
#define XFS_DIFLAG_REALTIME (1 << XFS_DIFLAG_REALTIME_BIT)
#define XFS_DIFLAG_PREALLOC (1 << XFS_DIFLAG_PREALLOC_BIT)
#define XFS_DIFLAG_NEWRTBM (1 << XFS_DIFLAG_NEWRTBM_BIT)
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 30d1d60f1d46..09d9c8cfa4a0 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -415,6 +415,31 @@ xfs_dinode_verify_fork(
return NULL;
}
+static xfs_failaddr_t
+xfs_dinode_verify_forkoff(
+ struct xfs_dinode *dip,
+ struct xfs_mount *mp)
+{
+ if (!XFS_DFORK_Q(dip))
+ return NULL;
+
+ switch (dip->di_format) {
+ case XFS_DINODE_FMT_DEV:
+ if (dip->di_forkoff != (roundup(sizeof(xfs_dev_t), 8) >> 3))
+ return __this_address;
+ break;
+ case XFS_DINODE_FMT_LOCAL: /* fall through ... */
+ case XFS_DINODE_FMT_EXTENTS: /* fall through ... */
+ case XFS_DINODE_FMT_BTREE:
+ if (dip->di_forkoff >= (XFS_LITINO(mp, dip->di_version) >> 3))
+ return __this_address;
+ break;
+ default:
+ return __this_address;
+ }
+ return NULL;
+}
+
xfs_failaddr_t
xfs_dinode_verify(
struct xfs_mount *mp,
@@ -470,6 +495,11 @@ xfs_dinode_verify(
if (mode && (flags & XFS_DIFLAG_REALTIME) && !mp->m_rtdev_targp)
return __this_address;
+ /* check for illegal values of forkoff */
+ fa = xfs_dinode_verify_forkoff(dip, mp);
+ if (fa)
+ return fa;
+
/* Do we have appropriate data fork formats for the mode? */
switch (mode & S_IFMT) {
case S_IFIFO:
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 081f46e30556..b5a82acd7dfe 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -1115,7 +1115,8 @@ xfs_fs_geometry(
geo->version = XFS_FSOP_GEOM_VERSION;
geo->flags = XFS_FSOP_GEOM_FLAGS_NLINK |
- XFS_FSOP_GEOM_FLAGS_DIRV2;
+ XFS_FSOP_GEOM_FLAGS_DIRV2 |
+ XFS_FSOP_GEOM_FLAGS_EXTFLG;
if (xfs_sb_version_hasattr(sbp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_ATTR;
if (xfs_sb_version_hasquota(sbp))
@@ -1124,8 +1125,6 @@ xfs_fs_geometry(
geo->flags |= XFS_FSOP_GEOM_FLAGS_IALIGN;
if (xfs_sb_version_hasdalign(sbp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_DALIGN;
- if (xfs_sb_version_hasextflgbit(sbp))
- geo->flags |= XFS_FSOP_GEOM_FLAGS_EXTFLG;
if (xfs_sb_version_hassector(sbp))
geo->flags |= XFS_FSOP_GEOM_FLAGS_SECTOR;
if (xfs_sb_version_hasasciici(sbp))
diff --git a/fs/xfs/scrub/alloc.c b/fs/xfs/scrub/alloc.c
index 036b5c7021eb..376bcb585ae6 100644
--- a/fs/xfs/scrub/alloc.c
+++ b/fs/xfs/scrub/alloc.c
@@ -17,7 +17,6 @@
#include "xfs_sb.h"
#include "xfs_alloc.h"
#include "xfs_rmap.h"
-#include "xfs_alloc.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c
index 5b3b177c0fc9..e386c9b0b4ab 100644
--- a/fs/xfs/scrub/inode.c
+++ b/fs/xfs/scrub/inode.c
@@ -126,6 +126,7 @@ xchk_inode_flags(
{
struct xfs_mount *mp = sc->mp;
+ /* di_flags are all taken, last bit cannot be used */
if (flags & ~XFS_DIFLAG_ANY)
goto bad;
@@ -172,8 +173,9 @@ xchk_inode_flags2(
{
struct xfs_mount *mp = sc->mp;
+ /* Unknown di_flags2 could be from a future kernel */
if (flags2 & ~XFS_DIFLAG2_ANY)
- goto bad;
+ xchk_ino_set_warning(sc, ino);
/* reflink flag requires reflink feature */
if ((flags2 & XFS_DIFLAG2_REFLINK) &&
diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c
index 9f08dd9bf1d5..4fc0a5ea7673 100644
--- a/fs/xfs/scrub/repair.c
+++ b/fs/xfs/scrub/repair.c
@@ -29,6 +29,8 @@
#include "xfs_ag_resv.h"
#include "xfs_trans_space.h"
#include "xfs_quota.h"
+#include "xfs_attr.h"
+#include "xfs_reflink.h"
#include "scrub/xfs_scrub.h"
#include "scrub/scrub.h"
#include "scrub/common.h"
@@ -692,13 +694,14 @@ xrep_findroot_block(
struct xrep_find_ag_btree *fab,
uint64_t owner,
xfs_agblock_t agbno,
- bool *found_it)
+ bool *done_with_block)
{
struct xfs_mount *mp = ri->sc->mp;
struct xfs_buf *bp;
struct xfs_btree_block *btblock;
xfs_daddr_t daddr;
- int error;
+ int block_level;
+ int error = 0;
daddr = XFS_AGB_TO_DADDR(mp, ri->sc->sa.agno, agbno);
@@ -717,36 +720,111 @@ xrep_findroot_block(
return error;
}
+ /*
+ * Read the buffer into memory so that we can see if it's a match for
+ * our btree type. We have no clue if it is beforehand, and we want to
+ * avoid xfs_trans_read_buf's behavior of dumping the DONE state (which
+ * will cause needless disk reads in subsequent calls to this function)
+ * and logging metadata verifier failures.
+ *
+ * Therefore, pass in NULL buffer ops. If the buffer was already in
+ * memory from some other caller it will already have b_ops assigned.
+ * If it was in memory from a previous unsuccessful findroot_block
+ * call, the buffer won't have b_ops but it should be clean and ready
+ * for us to try to verify if the read call succeeds. The same applies
+ * if the buffer wasn't in memory at all.
+ *
+ * Note: If we never match a btree type with this buffer, it will be
+ * left in memory with NULL b_ops. This shouldn't be a problem unless
+ * the buffer gets written.
+ */
error = xfs_trans_read_buf(mp, ri->sc->tp, mp->m_ddev_targp, daddr,
mp->m_bsize, 0, &bp, NULL);
if (error)
return error;
- /*
- * Does this look like a block matching our fs and higher than any
- * other block we've found so far? If so, reattach buffer verifiers
- * so the AIL won't complain if the buffer is also dirty.
- */
+ /* Ensure the block magic matches the btree type we're looking for. */
btblock = XFS_BUF_TO_BLOCK(bp);
if (be32_to_cpu(btblock->bb_magic) != fab->magic)
goto out;
- if (xfs_sb_version_hascrc(&mp->m_sb) &&
- !uuid_equal(&btblock->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
- goto out;
- bp->b_ops = fab->buf_ops;
- /* Ignore this block if it's lower in the tree than we've seen. */
- if (fab->root != NULLAGBLOCK &&
- xfs_btree_get_level(btblock) < fab->height)
- goto out;
+ /*
+ * If the buffer already has ops applied and they're not the ones for
+ * this btree type, we know this block doesn't match the btree and we
+ * can bail out.
+ *
+ * If the buffer ops match ours, someone else has already validated
+ * the block for us, so we can move on to checking if this is a root
+ * block candidate.
+ *
+ * If the buffer does not have ops, nobody has successfully validated
+ * the contents and the buffer cannot be dirty. If the magic, uuid,
+ * and structure match this btree type then we'll move on to checking
+ * if it's a root block candidate. If there is no match, bail out.
+ */
+ if (bp->b_ops) {
+ if (bp->b_ops != fab->buf_ops)
+ goto out;
+ } else {
+ ASSERT(!xfs_trans_buf_is_dirty(bp));
+ if (!uuid_equal(&btblock->bb_u.s.bb_uuid,
+ &mp->m_sb.sb_meta_uuid))
+ goto out;
+ fab->buf_ops->verify_read(bp);
+ if (bp->b_error) {
+ bp->b_error = 0;
+ goto out;
+ }
- /* Make sure we pass the verifiers. */
- bp->b_ops->verify_read(bp);
- if (bp->b_error)
+ /*
+ * Some read verifiers will (re)set b_ops, so we must be
+ * careful not to blow away any such assignment.
+ */
+ if (!bp->b_ops)
+ bp->b_ops = fab->buf_ops;
+ }
+
+ /*
+ * This block passes the magic/uuid and verifier tests for this btree
+ * type. We don't need the caller to try the other tree types.
+ */
+ *done_with_block = true;
+
+ /*
+ * Compare this btree block's level to the height of the current
+ * candidate root block.
+ *
+ * If the level matches the root we found previously, throw away both
+ * blocks because there can't be two candidate roots.
+ *
+ * If level is lower in the tree than the root we found previously,
+ * ignore this block.
+ */
+ block_level = xfs_btree_get_level(btblock);
+ if (block_level + 1 == fab->height) {
+ fab->root = NULLAGBLOCK;
goto out;
- fab->root = agbno;
- fab->height = xfs_btree_get_level(btblock) + 1;
- *found_it = true;
+ } else if (block_level < fab->height) {
+ goto out;
+ }
+
+ /*
+ * This is the highest block in the tree that we've found so far.
+ * Update the btree height to reflect what we've learned from this
+ * block.
+ */
+ fab->height = block_level + 1;
+
+ /*
+ * If this block doesn't have sibling pointers, then it's the new root
+ * block candidate. Otherwise, the root will be found farther up the
+ * tree.
+ */
+ if (btblock->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) &&
+ btblock->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
+ fab->root = agbno;
+ else
+ fab->root = NULLAGBLOCK;
trace_xrep_findroot_block(mp, ri->sc->sa.agno, agbno,
be32_to_cpu(btblock->bb_magic), fab->height - 1);
@@ -768,7 +846,7 @@ xrep_findroot_rmap(
struct xrep_findroot *ri = priv;
struct xrep_find_ag_btree *fab;
xfs_agblock_t b;
- bool found_it;
+ bool done;
int error = 0;
/* Ignore anything that isn't AG metadata. */
@@ -777,16 +855,16 @@ xrep_findroot_rmap(
/* Otherwise scan each block + btree type. */
for (b = 0; b < rec->rm_blockcount; b++) {
- found_it = false;
+ done = false;
for (fab = ri->btree_info; fab->buf_ops; fab++) {
if (rec->rm_owner != fab->rmap_owner)
continue;
error = xrep_findroot_block(ri, fab,
rec->rm_owner, rec->rm_startblock + b,
- &found_it);
+ &done);
if (error)
return error;
- if (found_it)
+ if (done)
break;
}
}
diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c
index 4bfae1e61d30..1b2344d00525 100644
--- a/fs/xfs/scrub/scrub.c
+++ b/fs/xfs/scrub/scrub.c
@@ -412,19 +412,6 @@ xchk_validate_inputs(
goto out;
}
- error = -EOPNOTSUPP;
- /*
- * We won't scrub any filesystem that doesn't have the ability
- * to record unwritten extents. The option was made default in
- * 2003, removed from mkfs in 2007, and cannot be disabled in
- * v5, so if we find a filesystem without this flag it's either
- * really old or totally unsupported. Avoid it either way.
- * We also don't support v1-v3 filesystems, which aren't
- * mountable.
- */
- if (!xfs_sb_version_hasextflgbit(&mp->m_sb))
- goto out;
-
/*
* We only want to repair read-write v5+ filesystems. Defer the check
* for ops->repair until after our scrub confirms that we need to
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 49f5f5896a43..338b9d9984e0 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -917,7 +917,7 @@ xfs_vm_writepage(
struct writeback_control *wbc)
{
struct xfs_writepage_ctx wpc = {
- .io_type = XFS_IO_INVALID,
+ .io_type = XFS_IO_HOLE,
};
int ret;
@@ -933,7 +933,7 @@ xfs_vm_writepages(
struct writeback_control *wbc)
{
struct xfs_writepage_ctx wpc = {
- .io_type = XFS_IO_INVALID,
+ .io_type = XFS_IO_HOLE,
};
int ret;
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 9af867951a10..494b4338446e 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -12,21 +12,19 @@ extern struct bio_set xfs_ioend_bioset;
* Types of I/O for bmap clustering and I/O completion tracking.
*/
enum {
- XFS_IO_INVALID, /* initial state */
+ XFS_IO_HOLE, /* covers region without any block allocation */
XFS_IO_DELALLOC, /* covers delalloc region */
XFS_IO_UNWRITTEN, /* covers allocated but uninitialized data */
XFS_IO_OVERWRITE, /* covers already allocated extent */
XFS_IO_COW, /* covers copy-on-write extent */
- XFS_IO_HOLE, /* covers region without any block allocation */
};
#define XFS_IO_TYPES \
- { XFS_IO_INVALID, "invalid" }, \
- { XFS_IO_DELALLOC, "delalloc" }, \
- { XFS_IO_UNWRITTEN, "unwritten" }, \
- { XFS_IO_OVERWRITE, "overwrite" }, \
- { XFS_IO_COW, "CoW" }, \
- { XFS_IO_HOLE, "hole" }
+ { XFS_IO_HOLE, "hole" }, \
+ { XFS_IO_DELALLOC, "delalloc" }, \
+ { XFS_IO_UNWRITTEN, "unwritten" }, \
+ { XFS_IO_OVERWRITE, "overwrite" }, \
+ { XFS_IO_COW, "CoW" }
/*
* Structure for buffered I/O completions.
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index addbd74ecd8e..5d263dfdb3bc 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -406,10 +406,10 @@ xfs_getbmap_report_one(
struct xfs_bmbt_irec *got)
{
struct kgetbmap *p = out + bmv->bmv_entries;
- bool shared = false, trimmed = false;
+ bool shared = false;
int error;
- error = xfs_reflink_trim_around_shared(ip, got, &shared, &trimmed);
+ error = xfs_reflink_trim_around_shared(ip, got, &shared);
if (error)
return error;
@@ -702,13 +702,9 @@ xfs_bmap_punch_delalloc_range(
struct xfs_iext_cursor icur;
int error = 0;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- if (!(ifp->if_flags & XFS_IFEXTENTS)) {
- error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK);
- if (error)
- goto out_unlock;
- }
+ ASSERT(ifp->if_flags & XFS_IFEXTENTS);
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
if (!xfs_iext_lookup_extent_before(ip, ifp, &end_fsb, &icur, &got))
goto out_unlock;
@@ -1047,44 +1043,6 @@ out_trans_cancel:
}
static int
-xfs_adjust_extent_unmap_boundaries(
- struct xfs_inode *ip,
- xfs_fileoff_t *startoffset_fsb,
- xfs_fileoff_t *endoffset_fsb)
-{
- struct xfs_mount *mp = ip->i_mount;
- struct xfs_bmbt_irec imap;
- int nimap, error;
- xfs_extlen_t mod = 0;
-
- nimap = 1;
- error = xfs_bmapi_read(ip, *startoffset_fsb, 1, &imap, &nimap, 0);
- if (error)
- return error;
-
- if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
- ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
- div_u64_rem(imap.br_startblock, mp->m_sb.sb_rextsize, &mod);
- if (mod)
- *startoffset_fsb += mp->m_sb.sb_rextsize - mod;
- }
-
- nimap = 1;
- error = xfs_bmapi_read(ip, *endoffset_fsb - 1, 1, &imap, &nimap, 0);
- if (error)
- return error;
-
- if (nimap && imap.br_startblock != HOLESTARTBLOCK) {
- ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
- mod++;
- if (mod && mod != mp->m_sb.sb_rextsize)
- *endoffset_fsb -= mod;
- }
-
- return 0;
-}
-
-static int
xfs_flush_unmap_range(
struct xfs_inode *ip,
xfs_off_t offset,
@@ -1137,19 +1095,8 @@ xfs_free_file_space(
endoffset_fsb = XFS_B_TO_FSBT(mp, offset + len);
/*
- * Need to zero the stuff we're not freeing, on disk. If it's a RT file
- * and we can't use unwritten extents then we actually need to ensure
- * to zero the whole extent, otherwise we just need to take of block
- * boundaries, and xfs_bunmapi will handle the rest.
+ * Need to zero the stuff we're not freeing, on disk.
*/
- if (XFS_IS_REALTIME_INODE(ip) &&
- !xfs_sb_version_hasextflgbit(&mp->m_sb)) {
- error = xfs_adjust_extent_unmap_boundaries(ip, &startoffset_fsb,
- &endoffset_fsb);
- if (error)
- return error;
- }
-
if (endoffset_fsb > startoffset_fsb) {
while (!done) {
error = xfs_unmap_extent(ip, startoffset_fsb,
@@ -1584,7 +1531,7 @@ xfs_swap_extent_rmap(
tirec.br_blockcount, &irec,
&nimaps, 0);
if (error)
- goto out_defer;
+ goto out;
ASSERT(nimaps == 1);
ASSERT(tirec.br_startoff == irec.br_startoff);
trace_xfs_swap_extent_rmap_remap_piece(ip, &irec);
@@ -1599,22 +1546,22 @@ xfs_swap_extent_rmap(
/* Remove the mapping from the donor file. */
error = xfs_bmap_unmap_extent(tp, tip, &uirec);
if (error)
- goto out_defer;
+ goto out;
/* Remove the mapping from the source file. */
error = xfs_bmap_unmap_extent(tp, ip, &irec);
if (error)
- goto out_defer;
+ goto out;
/* Map the donor file's blocks into the source file. */
error = xfs_bmap_map_extent(tp, ip, &uirec);
if (error)
- goto out_defer;
+ goto out;
/* Map the source file's blocks into the donor file. */
error = xfs_bmap_map_extent(tp, tip, &irec);
if (error)
- goto out_defer;
+ goto out;
error = xfs_defer_finish(tpp);
tp = *tpp;
@@ -1636,8 +1583,6 @@ xfs_swap_extent_rmap(
tip->i_d.di_flags2 = tip_flags2;
return 0;
-out_defer:
- xfs_defer_cancel(tp);
out:
trace_xfs_swap_extent_rmap_error(ip, error, _RET_IP_);
tip->i_d.di_flags2 = tip_flags2;
@@ -1830,6 +1775,12 @@ xfs_swap_extents(
if (error)
goto out_unlock;
+ if (xfs_inode_has_cow_data(tip)) {
+ error = xfs_reflink_cancel_cow_range(tip, 0, NULLFILEOFF, true);
+ if (error)
+ return error;
+ }
+
/*
* Extent "swapping" with rmap requires a permanent reservation and
* a block reservation because it's really just a remap operation
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index e839907e8492..b21ea2ba768d 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -37,6 +37,32 @@ static kmem_zone_t *xfs_buf_zone;
#define xb_to_gfp(flags) \
((((flags) & XBF_READ_AHEAD) ? __GFP_NORETRY : GFP_NOFS) | __GFP_NOWARN)
+/*
+ * Locking orders
+ *
+ * xfs_buf_ioacct_inc:
+ * xfs_buf_ioacct_dec:
+ * b_sema (caller holds)
+ * b_lock
+ *
+ * xfs_buf_stale:
+ * b_sema (caller holds)
+ * b_lock
+ * lru_lock
+ *
+ * xfs_buf_rele:
+ * b_lock
+ * pag_buf_lock
+ * lru_lock
+ *
+ * xfs_buftarg_wait_rele
+ * lru_lock
+ * b_lock (trylock due to inversion)
+ *
+ * xfs_buftarg_isolate
+ * lru_lock
+ * b_lock (trylock due to inversion)
+ */
static inline int
xfs_buf_is_vmapped(
@@ -749,6 +775,30 @@ _xfs_buf_read(
return xfs_buf_submit(bp);
}
+/*
+ * If the caller passed in an ops structure and the buffer doesn't have ops
+ * assigned, set the ops and use them to verify the contents. If the contents
+ * cannot be verified, we'll clear XBF_DONE. We assume the buffer has no
+ * recorded errors and is already in XBF_DONE state.
+ */
+int
+xfs_buf_ensure_ops(
+ struct xfs_buf *bp,
+ const struct xfs_buf_ops *ops)
+{
+ ASSERT(bp->b_flags & XBF_DONE);
+ ASSERT(bp->b_error == 0);
+
+ if (!ops || bp->b_ops)
+ return 0;
+
+ bp->b_ops = ops;
+ bp->b_ops->verify_read(bp);
+ if (bp->b_error)
+ bp->b_flags &= ~XBF_DONE;
+ return bp->b_error;
+}
+
xfs_buf_t *
xfs_buf_read_map(
struct xfs_buftarg *target,
@@ -762,26 +812,32 @@ xfs_buf_read_map(
flags |= XBF_READ;
bp = xfs_buf_get_map(target, map, nmaps, flags);
- if (bp) {
- trace_xfs_buf_read(bp, flags, _RET_IP_);
+ if (!bp)
+ return NULL;
- if (!(bp->b_flags & XBF_DONE)) {
- XFS_STATS_INC(target->bt_mount, xb_get_read);
- bp->b_ops = ops;
- _xfs_buf_read(bp, flags);
- } else if (flags & XBF_ASYNC) {
- /*
- * Read ahead call which is already satisfied,
- * drop the buffer
- */
- xfs_buf_relse(bp);
- return NULL;
- } else {
- /* We do not want read in the flags */
- bp->b_flags &= ~XBF_READ;
- }
+ trace_xfs_buf_read(bp, flags, _RET_IP_);
+
+ if (!(bp->b_flags & XBF_DONE)) {
+ XFS_STATS_INC(target->bt_mount, xb_get_read);
+ bp->b_ops = ops;
+ _xfs_buf_read(bp, flags);
+ return bp;
}
+ xfs_buf_ensure_ops(bp, ops);
+
+ if (flags & XBF_ASYNC) {
+ /*
+ * Read ahead call which is already satisfied,
+ * drop the buffer
+ */
+ xfs_buf_relse(bp);
+ return NULL;
+ }
+
+ /* We do not want read in the flags */
+ bp->b_flags &= ~XBF_READ;
+ ASSERT(bp->b_ops != NULL || ops == NULL);
return bp;
}
@@ -1006,8 +1062,18 @@ xfs_buf_rele(
ASSERT(atomic_read(&bp->b_hold) > 0);
- release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
+ /*
+ * We grab the b_lock here first to serialise racing xfs_buf_rele()
+ * calls. The pag_buf_lock being taken on the last reference only
+ * serialises against racing lookups in xfs_buf_find(). IOWs, the second
+ * to last reference we drop here is not serialised against the last
+ * reference until we take bp->b_lock. Hence if we don't grab b_lock
+ * first, the last "release" reference can win the race to the lock and
+ * free the buffer before the second-to-last reference is processed,
+ * leading to a use-after-free scenario.
+ */
spin_lock(&bp->b_lock);
+ release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
if (!release) {
/*
* Drop the in-flight state if the buffer is already on the LRU
@@ -1989,6 +2055,13 @@ xfs_buf_delwri_submit_buffers(
* is only safely useable for callers that can track I/O completion by higher
* level means, e.g. AIL pushing as the @buffer_list is consumed in this
* function.
+ *
+ * Note: this function will skip buffers it would block on, and in doing so
+ * leaves them on @buffer_list so they can be retried on a later pass. As such,
+ * it is up to the caller to ensure that the buffer list is fully submitted or
+ * cancelled appropriately when they are finished with the list. Failure to
+ * cancel or resubmit the list until it is empty will result in leaked buffers
+ * at unmount time.
*/
int
xfs_buf_delwri_submit_nowait(
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 4e3171acd0f8..b9f5511ea998 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -385,4 +385,6 @@ extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int);
#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev)
#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
+int xfs_buf_ensure_ops(struct xfs_buf *bp, const struct xfs_buf_ops *ops);
+
#endif /* __XFS_BUF_H__ */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 1c9d1398980b..12d8455bfbb2 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -532,6 +532,49 @@ xfs_buf_item_push(
}
/*
+ * Drop the buffer log item refcount and take appropriate action. This helper
+ * determines whether the bli must be freed or not, since a decrement to zero
+ * does not necessarily mean the bli is unused.
+ *
+ * Return true if the bli is freed, false otherwise.
+ */
+bool
+xfs_buf_item_put(
+ struct xfs_buf_log_item *bip)
+{
+ struct xfs_log_item *lip = &bip->bli_item;
+ bool aborted;
+ bool dirty;
+
+ /* drop the bli ref and return if it wasn't the last one */
+ if (!atomic_dec_and_test(&bip->bli_refcount))
+ return false;
+
+ /*
+ * We dropped the last ref and must free the item if clean or aborted.
+ * If the bli is dirty and non-aborted, the buffer was clean in the
+ * transaction but still awaiting writeback from previous changes. In
+ * that case, the bli is freed on buffer writeback completion.
+ */
+ aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags) ||
+ XFS_FORCED_SHUTDOWN(lip->li_mountp);
+ dirty = bip->bli_flags & XFS_BLI_DIRTY;
+ if (dirty && !aborted)
+ return false;
+
+ /*
+ * The bli is aborted or clean. An aborted item may be in the AIL
+ * regardless of dirty state. For example, consider an aborted
+ * transaction that invalidated a dirty bli and cleared the dirty
+ * state.
+ */
+ if (aborted)
+ xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR);
+ xfs_buf_item_relse(bip->bli_buf);
+ return true;
+}
+
+/*
* Release the buffer associated with the buf log item. If there is no dirty
* logged data associated with the buffer recorded in the buf log item, then
* free the buf log item and remove the reference to it in the buffer.
@@ -556,76 +599,42 @@ xfs_buf_item_unlock(
{
struct xfs_buf_log_item *bip = BUF_ITEM(lip);
struct xfs_buf *bp = bip->bli_buf;
- bool aborted;
- bool hold = !!(bip->bli_flags & XFS_BLI_HOLD);
- bool dirty = !!(bip->bli_flags & XFS_BLI_DIRTY);
+ bool released;
+ bool hold = bip->bli_flags & XFS_BLI_HOLD;
+ bool stale = bip->bli_flags & XFS_BLI_STALE;
#if defined(DEBUG) || defined(XFS_WARN)
- bool ordered = !!(bip->bli_flags & XFS_BLI_ORDERED);
+ bool ordered = bip->bli_flags & XFS_BLI_ORDERED;
+ bool dirty = bip->bli_flags & XFS_BLI_DIRTY;
#endif
- aborted = test_bit(XFS_LI_ABORTED, &lip->li_flags);
-
- /* Clear the buffer's association with this transaction. */
- bp->b_transp = NULL;
-
- /*
- * The per-transaction state has been copied above so clear it from the
- * bli.
- */
- bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);
-
- /*
- * If the buf item is marked stale, then don't do anything. We'll
- * unlock the buffer and free the buf item when the buffer is unpinned
- * for the last time.
- */
- if (bip->bli_flags & XFS_BLI_STALE) {
- trace_xfs_buf_item_unlock_stale(bip);
- ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
- if (!aborted) {
- atomic_dec(&bip->bli_refcount);
- return;
- }
- }
-
trace_xfs_buf_item_unlock(bip);
/*
- * If the buf item isn't tracking any data, free it, otherwise drop the
- * reference we hold to it. If we are aborting the transaction, this may
- * be the only reference to the buf item, so we free it anyway
- * regardless of whether it is dirty or not. A dirty abort implies a
- * shutdown, anyway.
- *
* The bli dirty state should match whether the blf has logged segments
* except for ordered buffers, where only the bli should be dirty.
*/
ASSERT((!ordered && dirty == xfs_buf_item_dirty_format(bip)) ||
(ordered && dirty && !xfs_buf_item_dirty_format(bip)));
+ ASSERT(!stale || (bip->__bli_format.blf_flags & XFS_BLF_CANCEL));
/*
- * Clean buffers, by definition, cannot be in the AIL. However, aborted
- * buffers may be in the AIL regardless of dirty state. An aborted
- * transaction that invalidates a buffer already in the AIL may have
- * marked it stale and cleared the dirty state, for example.
- *
- * Therefore if we are aborting a buffer and we've just taken the last
- * reference away, we have to check if it is in the AIL before freeing
- * it. We need to free it in this case, because an aborted transaction
- * has already shut the filesystem down and this is the last chance we
- * will have to do so.
+ * Clear the buffer's association with this transaction and
+ * per-transaction state from the bli, which has been copied above.
*/
- if (atomic_dec_and_test(&bip->bli_refcount)) {
- if (aborted) {
- ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp));
- xfs_trans_ail_remove(lip, SHUTDOWN_LOG_IO_ERROR);
- xfs_buf_item_relse(bp);
- } else if (!dirty)
- xfs_buf_item_relse(bp);
- }
+ bp->b_transp = NULL;
+ bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD | XFS_BLI_ORDERED);
- if (!hold)
- xfs_buf_relse(bp);
+ /*
+ * Unref the item and unlock the buffer unless held or stale. Stale
+ * buffers remain locked until final unpin unless the bli is freed by
+ * the unref call. The latter implies shutdown because buffer
+ * invalidation dirties the bli and transaction.
+ */
+ released = xfs_buf_item_put(bip);
+ if (hold || (stale && !released))
+ return;
+ ASSERT(!stale || test_bit(XFS_LI_ABORTED, &lip->li_flags));
+ xfs_buf_relse(bp);
}
/*
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index 3f7d7b72e7e6..90f65f891fab 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -51,6 +51,7 @@ struct xfs_buf_log_item {
int xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *);
void xfs_buf_item_relse(struct xfs_buf *);
+bool xfs_buf_item_put(struct xfs_buf_log_item *);
void xfs_buf_item_log(struct xfs_buf_log_item *, uint, uint);
bool xfs_buf_item_dirty_format(struct xfs_buf_log_item *);
void xfs_buf_attach_iodone(struct xfs_buf *,
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 7c00b8bedfe3..093c2b8d7e20 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -470,20 +470,13 @@ xfs_fs_goingdown(
*/
void
xfs_do_force_shutdown(
- xfs_mount_t *mp,
+ struct xfs_mount *mp,
int flags,
char *fname,
int lnnum)
{
- int logerror;
-
- logerror = flags & SHUTDOWN_LOG_IO_ERROR;
+ bool logerror = flags & SHUTDOWN_LOG_IO_ERROR;
- if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
- xfs_notice(mp,
- "%s(0x%x) called from line %d of file %s. Return address = "PTR_FMT,
- __func__, flags, lnnum, fname, __return_address);
- }
/*
* No need to duplicate efforts.
*/
@@ -499,27 +492,34 @@ xfs_do_force_shutdown(
if (xfs_log_force_umount(mp, logerror))
return;
+ if (flags & SHUTDOWN_FORCE_UMOUNT) {
+ xfs_alert(mp,
+"User initiated shutdown received. Shutting down filesystem");
+ return;
+ }
+
+ xfs_notice(mp,
+"%s(0x%x) called from line %d of file %s. Return address = "PTR_FMT,
+ __func__, flags, lnnum, fname, __return_address);
+
if (flags & SHUTDOWN_CORRUPT_INCORE) {
xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_CORRUPT,
- "Corruption of in-memory data detected. Shutting down filesystem");
+"Corruption of in-memory data detected. Shutting down filesystem");
if (XFS_ERRLEVEL_HIGH <= xfs_error_level)
xfs_stack_trace();
- } else if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
- if (logerror) {
- xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR,
- "Log I/O Error Detected. Shutting down filesystem");
- } else if (flags & SHUTDOWN_DEVICE_REQ) {
- xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
- "All device paths lost. Shutting down filesystem");
- } else if (!(flags & SHUTDOWN_REMOTE_REQ)) {
- xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
- "I/O Error Detected. Shutting down filesystem");
- }
- }
- if (!(flags & SHUTDOWN_FORCE_UMOUNT)) {
- xfs_alert(mp,
- "Please umount the filesystem and rectify the problem(s)");
+ } else if (logerror) {
+ xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_LOGERROR,
+ "Log I/O Error Detected. Shutting down filesystem");
+ } else if (flags & SHUTDOWN_DEVICE_REQ) {
+ xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
+ "All device paths lost. Shutting down filesystem");
+ } else if (!(flags & SHUTDOWN_REMOTE_REQ)) {
+ xfs_alert_tag(mp, XFS_PTAG_SHUTDOWN_IOERROR,
+ "I/O Error Detected. Shutting down filesystem");
}
+
+ xfs_alert(mp,
+ "Please unmount the filesystem and rectify the problem(s)");
}
/*
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index d957a46dc1cb..05db9540e459 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1563,7 +1563,7 @@ xfs_itruncate_extents_flags(
error = xfs_bunmapi(tp, ip, first_unmap_block, unmap_len, flags,
XFS_ITRUNC_MAX_EXTENTS, &done);
if (error)
- goto out_bmap_cancel;
+ goto out;
/*
* Duplicate the transaction that has the permanent
@@ -1599,14 +1599,6 @@ xfs_itruncate_extents_flags(
out:
*tpp = tp;
return error;
-out_bmap_cancel:
- /*
- * If the bunmapi call encounters an error, return to the caller where
- * the transaction can be properly aborted. We just need to make sure
- * we're not holding any resources that we were not when we came in.
- */
- xfs_defer_cancel(tp);
- goto out;
}
int
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 0ef5ece5634c..6e2c08f30f60 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -604,14 +604,6 @@ xfs_ioc_space(
uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
int error;
- /*
- * Only allow the sys admin to reserve space unless
- * unwritten extents are enabled.
- */
- if (!xfs_sb_version_hasextflgbit(&ip->i_mount->m_sb) &&
- !capable(CAP_SYS_ADMIN))
- return -EPERM;
-
if (inode->i_flags & (S_IMMUTABLE|S_APPEND))
return -EPERM;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 6320aca39f39..27c93b5f029d 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -62,6 +62,21 @@ xfs_bmbt_to_iomap(
iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip));
}
+static void
+xfs_hole_to_iomap(
+ struct xfs_inode *ip,
+ struct iomap *iomap,
+ xfs_fileoff_t offset_fsb,
+ xfs_fileoff_t end_fsb)
+{
+ iomap->addr = IOMAP_NULL_ADDR;
+ iomap->type = IOMAP_HOLE;
+ iomap->offset = XFS_FSB_TO_B(ip->i_mount, offset_fsb);
+ iomap->length = XFS_FSB_TO_B(ip->i_mount, end_fsb - offset_fsb);
+ iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
+ iomap->dax_dev = xfs_find_daxdev_for_inode(VFS_I(ip));
+}
+
xfs_extlen_t
xfs_eof_alignment(
struct xfs_inode *ip,
@@ -502,6 +517,7 @@ xfs_file_iomap_begin_delay(
struct inode *inode,
loff_t offset,
loff_t count,
+ unsigned flags,
struct iomap *iomap)
{
struct xfs_inode *ip = XFS_I(inode);
@@ -538,15 +554,23 @@ xfs_file_iomap_begin_delay(
goto out_unlock;
}
+ end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb);
+
eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &icur, &got);
- if (!eof && got.br_startoff <= offset_fsb) {
- if (xfs_is_reflink_inode(ip)) {
- bool shared;
+ if (eof)
+ got.br_startoff = end_fsb; /* fake hole until the end */
- end_fsb = min(XFS_B_TO_FSB(mp, offset + count),
- maxbytes_fsb);
+ if (got.br_startoff <= offset_fsb) {
+ /*
+ * For reflink files we may need a delalloc reservation when
+ * overwriting shared extents. This includes zeroing of
+ * existing extents that contain data.
+ */
+ if (xfs_is_reflink_inode(ip) &&
+ ((flags & IOMAP_WRITE) ||
+ got.br_state != XFS_EXT_UNWRITTEN)) {
xfs_trim_extent(&got, offset_fsb, end_fsb - offset_fsb);
- error = xfs_reflink_reserve_cow(ip, &got, &shared);
+ error = xfs_reflink_reserve_cow(ip, &got);
if (error)
goto out_unlock;
}
@@ -555,6 +579,11 @@ xfs_file_iomap_begin_delay(
goto done;
}
+ if (flags & IOMAP_ZERO) {
+ xfs_hole_to_iomap(ip, iomap, offset_fsb, got.br_startoff);
+ goto out_unlock;
+ }
+
error = xfs_qm_dqattach_locked(ip, false);
if (error)
goto out_unlock;
@@ -1003,16 +1032,17 @@ xfs_file_iomap_begin(
struct xfs_bmbt_irec imap;
xfs_fileoff_t offset_fsb, end_fsb;
int nimaps = 1, error = 0;
- bool shared = false, trimmed = false;
+ bool shared = false;
unsigned lockmode;
if (XFS_FORCED_SHUTDOWN(mp))
return -EIO;
- if (((flags & (IOMAP_WRITE | IOMAP_DIRECT)) == IOMAP_WRITE) &&
+ if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && !(flags & IOMAP_DIRECT) &&
!IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
/* Reserve delalloc blocks for regular writeback. */
- return xfs_file_iomap_begin_delay(inode, offset, length, iomap);
+ return xfs_file_iomap_begin_delay(inode, offset, length, flags,
+ iomap);
}
/*
@@ -1038,8 +1068,7 @@ xfs_file_iomap_begin(
if (flags & IOMAP_REPORT) {
/* Trim the mapping to the nearest shared extent boundary. */
- error = xfs_reflink_trim_around_shared(ip, &imap, &shared,
- &trimmed);
+ error = xfs_reflink_trim_around_shared(ip, &imap, &shared);
if (error)
goto out_unlock;
}
@@ -1065,7 +1094,7 @@ xfs_file_iomap_begin(
if (error)
goto out_unlock;
} else {
- error = xfs_reflink_reserve_cow(ip, &imap, &shared);
+ error = xfs_reflink_reserve_cow(ip, &imap);
if (error)
goto out_unlock;
}
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index c3e74f9128e8..f48ffd7a8d3e 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -471,8 +471,18 @@ xfs_vn_get_link_inline(
struct inode *inode,
struct delayed_call *done)
{
+ char *link;
+
ASSERT(XFS_I(inode)->i_df.if_flags & XFS_IFINLINE);
- return XFS_I(inode)->i_df.if_u1.if_data;
+
+ /*
+ * The VFS crashes on a NULL pointer, so return -EFSCORRUPTED if
+ * if_data is junk.
+ */
+ link = XFS_I(inode)->i_df.if_u1.if_data;
+ if (!link)
+ return ERR_PTR(-EFSCORRUPTED);
+ return link;
}
STATIC int
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index a21dc61ec09e..1fc9e9042e0e 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1570,16 +1570,6 @@ xlog_find_zeroed(
if (last_cycle != 0) { /* log completely written to */
xlog_put_bp(bp);
return 0;
- } else if (first_cycle != 1) {
- /*
- * If the cycle of the last block is zero, the cycle of
- * the first block must be 1. If it's not, maybe we're
- * not looking at a log... Bail out.
- */
- xfs_warn(log->l_mp,
- "Log inconsistent or not a log (last==0, first!=1)");
- error = -EINVAL;
- goto bp_err;
}
/* we have a partially zeroed log */
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 38f405415b88..8eaeec9d58ed 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -182,8 +182,7 @@ int
xfs_reflink_trim_around_shared(
struct xfs_inode *ip,
struct xfs_bmbt_irec *irec,
- bool *shared,
- bool *trimmed)
+ bool *shared)
{
xfs_agnumber_t agno;
xfs_agblock_t agbno;
@@ -209,7 +208,7 @@ xfs_reflink_trim_around_shared(
if (error)
return error;
- *shared = *trimmed = false;
+ *shared = false;
if (fbno == NULLAGBLOCK) {
/* No shared blocks at all. */
return 0;
@@ -222,8 +221,6 @@ xfs_reflink_trim_around_shared(
*/
irec->br_blockcount = flen;
*shared = true;
- if (flen != aglen)
- *trimmed = true;
return 0;
} else {
/*
@@ -233,7 +230,6 @@ xfs_reflink_trim_around_shared(
* start of the shared region.
*/
irec->br_blockcount = fbno - agbno;
- *trimmed = true;
return 0;
}
}
@@ -241,7 +237,7 @@ xfs_reflink_trim_around_shared(
/*
* Trim the passed in imap to the next shared/unshared extent boundary, and
* if imap->br_startoff points to a shared extent reserve space for it in the
- * COW fork. In this case *shared is set to true, else to false.
+ * COW fork.
*
* Note that imap will always contain the block numbers for the existing blocks
* in the data fork, as the upper layers need them for read-modify-write
@@ -250,14 +246,14 @@ xfs_reflink_trim_around_shared(
int
xfs_reflink_reserve_cow(
struct xfs_inode *ip,
- struct xfs_bmbt_irec *imap,
- bool *shared)
+ struct xfs_bmbt_irec *imap)
{
struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
struct xfs_bmbt_irec got;
int error = 0;
- bool eof = false, trimmed;
+ bool eof = false;
struct xfs_iext_cursor icur;
+ bool shared;
/*
* Search the COW fork extent list first. This serves two purposes:
@@ -273,18 +269,16 @@ xfs_reflink_reserve_cow(
if (!eof && got.br_startoff <= imap->br_startoff) {
trace_xfs_reflink_cow_found(ip, imap);
xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
-
- *shared = true;
return 0;
}
/* Trim the mapping to the nearest shared extent boundary. */
- error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
+ error = xfs_reflink_trim_around_shared(ip, imap, &shared);
if (error)
return error;
/* Not shared? Just report the (potentially capped) extent. */
- if (!*shared)
+ if (!shared)
return 0;
/*
@@ -352,6 +346,50 @@ xfs_reflink_convert_cow(
return error;
}
+/*
+ * Find the extent that maps the given range in the COW fork. Even if the extent
+ * is not shared we might have a preallocation for it in the COW fork. If so we
+ * use it that rather than trigger a new allocation.
+ */
+static int
+xfs_find_trim_cow_extent(
+ struct xfs_inode *ip,
+ struct xfs_bmbt_irec *imap,
+ bool *shared,
+ bool *found)
+{
+ xfs_fileoff_t offset_fsb = imap->br_startoff;
+ xfs_filblks_t count_fsb = imap->br_blockcount;
+ struct xfs_iext_cursor icur;
+ struct xfs_bmbt_irec got;
+
+ *found = false;
+
+ /*
+ * If we don't find an overlapping extent, trim the range we need to
+ * allocate to fit the hole we found.
+ */
+ if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got))
+ got.br_startoff = offset_fsb + count_fsb;
+ if (got.br_startoff > offset_fsb) {
+ xfs_trim_extent(imap, imap->br_startoff,
+ got.br_startoff - imap->br_startoff);
+ return xfs_reflink_trim_around_shared(ip, imap, shared);
+ }
+
+ *shared = true;
+ if (isnullstartblock(got.br_startblock)) {
+ xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
+ return 0;
+ }
+
+ /* real extent found - no need to allocate */
+ xfs_trim_extent(&got, offset_fsb, count_fsb);
+ *imap = got;
+ *found = true;
+ return 0;
+}
+
/* Allocate all CoW reservations covering a range of blocks in a file. */
int
xfs_reflink_allocate_cow(
@@ -363,78 +401,64 @@ xfs_reflink_allocate_cow(
struct xfs_mount *mp = ip->i_mount;
xfs_fileoff_t offset_fsb = imap->br_startoff;
xfs_filblks_t count_fsb = imap->br_blockcount;
- struct xfs_bmbt_irec got;
- struct xfs_trans *tp = NULL;
+ struct xfs_trans *tp;
int nimaps, error = 0;
- bool trimmed;
+ bool found;
xfs_filblks_t resaligned;
xfs_extlen_t resblks = 0;
- struct xfs_iext_cursor icur;
-retry:
- ASSERT(xfs_is_reflink_inode(ip));
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+ ASSERT(xfs_is_reflink_inode(ip));
- /*
- * Even if the extent is not shared we might have a preallocation for
- * it in the COW fork. If so use it.
- */
- if (xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got) &&
- got.br_startoff <= offset_fsb) {
- *shared = true;
-
- /* If we have a real allocation in the COW fork we're done. */
- if (!isnullstartblock(got.br_startblock)) {
- xfs_trim_extent(&got, offset_fsb, count_fsb);
- *imap = got;
- goto convert;
- }
+ error = xfs_find_trim_cow_extent(ip, imap, shared, &found);
+ if (error || !*shared)
+ return error;
+ if (found)
+ goto convert;
- xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
- } else {
- error = xfs_reflink_trim_around_shared(ip, imap, shared, &trimmed);
- if (error || !*shared)
- goto out;
- }
+ resaligned = xfs_aligned_fsb_count(imap->br_startoff,
+ imap->br_blockcount, xfs_get_cowextsz_hint(ip));
+ resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
- if (!tp) {
- resaligned = xfs_aligned_fsb_count(imap->br_startoff,
- imap->br_blockcount, xfs_get_cowextsz_hint(ip));
- resblks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned);
+ xfs_iunlock(ip, *lockmode);
+ error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
+ *lockmode = XFS_ILOCK_EXCL;
+ xfs_ilock(ip, *lockmode);
- xfs_iunlock(ip, *lockmode);
- error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
- *lockmode = XFS_ILOCK_EXCL;
- xfs_ilock(ip, *lockmode);
+ if (error)
+ return error;
- if (error)
- return error;
+ error = xfs_qm_dqattach_locked(ip, false);
+ if (error)
+ goto out_trans_cancel;
- error = xfs_qm_dqattach_locked(ip, false);
- if (error)
- goto out;
- goto retry;
+ /*
+ * Check for an overlapping extent again now that we dropped the ilock.
+ */
+ error = xfs_find_trim_cow_extent(ip, imap, shared, &found);
+ if (error || !*shared)
+ goto out_trans_cancel;
+ if (found) {
+ xfs_trans_cancel(tp);
+ goto convert;
}
error = xfs_trans_reserve_quota_nblks(tp, ip, resblks, 0,
XFS_QMOPT_RES_REGBLKS);
if (error)
- goto out;
+ goto out_trans_cancel;
xfs_trans_ijoin(tp, ip, 0);
- nimaps = 1;
-
/* Allocate the entire reservation as unwritten blocks. */
+ nimaps = 1;
error = xfs_bmapi_write(tp, ip, imap->br_startoff, imap->br_blockcount,
XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC,
resblks, imap, &nimaps);
if (error)
- goto out_trans_cancel;
+ goto out_unreserve;
xfs_inode_set_cowblocks_tag(ip);
-
- /* Finish up. */
error = xfs_trans_commit(tp);
if (error)
return error;
@@ -447,12 +471,12 @@ retry:
return -ENOSPC;
convert:
return xfs_reflink_convert_cow_extent(ip, imap, offset_fsb, count_fsb);
-out_trans_cancel:
+
+out_unreserve:
xfs_trans_unreserve_quota_nblks(tp, ip, (long)resblks, 0,
XFS_QMOPT_RES_REGBLKS);
-out:
- if (tp)
- xfs_trans_cancel(tp);
+out_trans_cancel:
+ xfs_trans_cancel(tp);
return error;
}
@@ -666,14 +690,12 @@ xfs_reflink_end_cow(
if (!del.br_blockcount)
goto prev_extent;
- ASSERT(!isnullstartblock(got.br_startblock));
-
/*
- * Don't remap unwritten extents; these are
- * speculatively preallocated CoW extents that have been
- * allocated but have not yet been involved in a write.
+ * Only remap real extent that contain data. With AIO
+ * speculatively preallocations can leak into the range we
+ * are called upon, and we need to skip them.
*/
- if (got.br_state == XFS_EXT_UNWRITTEN)
+ if (!xfs_bmap_is_real_extent(&got))
goto prev_extent;
/* Unmap the old blocks in the data fork. */
@@ -1195,35 +1217,92 @@ retry:
return 0;
}
+/* Unlock both inodes after they've been prepped for a range clone. */
+STATIC void
+xfs_reflink_remap_unlock(
+ struct file *file_in,
+ struct file *file_out)
+{
+ struct inode *inode_in = file_inode(file_in);
+ struct xfs_inode *src = XFS_I(inode_in);
+ struct inode *inode_out = file_inode(file_out);
+ struct xfs_inode *dest = XFS_I(inode_out);
+ bool same_inode = (inode_in == inode_out);
+
+ xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
+ if (!same_inode)
+ xfs_iunlock(src, XFS_MMAPLOCK_SHARED);
+ inode_unlock(inode_out);
+ if (!same_inode)
+ inode_unlock_shared(inode_in);
+}
+
/*
- * Link a range of blocks from one file to another.
+ * If we're reflinking to a point past the destination file's EOF, we must
+ * zero any speculative post-EOF preallocations that sit between the old EOF
+ * and the destination file offset.
*/
-int
-xfs_reflink_remap_range(
+static int
+xfs_reflink_zero_posteof(
+ struct xfs_inode *ip,
+ loff_t pos)
+{
+ loff_t isize = i_size_read(VFS_I(ip));
+
+ if (pos <= isize)
+ return 0;
+
+ trace_xfs_zero_eof(ip, isize, pos - isize);
+ return iomap_zero_range(VFS_I(ip), isize, pos - isize, NULL,
+ &xfs_iomap_ops);
+}
+
+/*
+ * Prepare two files for range cloning. Upon a successful return both inodes
+ * will have the iolock and mmaplock held, the page cache of the out file will
+ * be truncated, and any leases on the out file will have been broken. This
+ * function borrows heavily from xfs_file_aio_write_checks.
+ *
+ * The VFS allows partial EOF blocks to "match" for dedupe even though it hasn't
+ * checked that the bytes beyond EOF physically match. Hence we cannot use the
+ * EOF block in the source dedupe range because it's not a complete block match,
+ * hence can introduce a corruption into the file that has it's block replaced.
+ *
+ * In similar fashion, the VFS file cloning also allows partial EOF blocks to be
+ * "block aligned" for the purposes of cloning entire files. However, if the
+ * source file range includes the EOF block and it lands within the existing EOF
+ * of the destination file, then we can expose stale data from beyond the source
+ * file EOF in the destination file.
+ *
+ * XFS doesn't support partial block sharing, so in both cases we have check
+ * these cases ourselves. For dedupe, we can simply round the length to dedupe
+ * down to the previous whole block and ignore the partial EOF block. While this
+ * means we can't dedupe the last block of a file, this is an acceptible
+ * tradeoff for simplicity on implementation.
+ *
+ * For cloning, we want to share the partial EOF block if it is also the new EOF
+ * block of the destination file. If the partial EOF block lies inside the
+ * existing destination EOF, then we have to abort the clone to avoid exposing
+ * stale data in the destination file. Hence we reject these clone attempts with
+ * -EINVAL in this case.
+ */
+STATIC int
+xfs_reflink_remap_prep(
struct file *file_in,
loff_t pos_in,
struct file *file_out,
loff_t pos_out,
- u64 len,
+ u64 *len,
bool is_dedupe)
{
struct inode *inode_in = file_inode(file_in);
struct xfs_inode *src = XFS_I(inode_in);
struct inode *inode_out = file_inode(file_out);
struct xfs_inode *dest = XFS_I(inode_out);
- struct xfs_mount *mp = src->i_mount;
bool same_inode = (inode_in == inode_out);
- xfs_fileoff_t sfsbno, dfsbno;
- xfs_filblks_t fsblen;
- xfs_extlen_t cowextsize;
+ u64 blkmask = i_blocksize(inode_in) - 1;
ssize_t ret;
- if (!xfs_sb_version_hasreflink(&mp->m_sb))
- return -EOPNOTSUPP;
-
- if (XFS_FORCED_SHUTDOWN(mp))
- return -EIO;
-
/* Lock both files against IO */
ret = xfs_iolock_two_inodes_and_break_layout(inode_in, inode_out);
if (ret)
@@ -1245,33 +1324,115 @@ xfs_reflink_remap_range(
goto out_unlock;
ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out,
- &len, is_dedupe);
+ len, is_dedupe);
if (ret <= 0)
goto out_unlock;
+ /*
+ * If the dedupe data matches, chop off the partial EOF block
+ * from the source file so we don't try to dedupe the partial
+ * EOF block.
+ */
+ if (is_dedupe) {
+ *len &= ~blkmask;
+ } else if (*len & blkmask) {
+ /*
+ * The user is attempting to share a partial EOF block,
+ * if it's inside the destination EOF then reject it.
+ */
+ if (pos_out + *len < i_size_read(inode_out)) {
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+ }
+
/* Attach dquots to dest inode before changing block map */
ret = xfs_qm_dqattach(dest);
if (ret)
goto out_unlock;
- trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
-
/*
- * Clear out post-eof preallocations because we don't have page cache
- * backing the delayed allocations and they'll never get freed on
- * their own.
+ * Zero existing post-eof speculative preallocations in the destination
+ * file.
*/
- if (xfs_can_free_eofblocks(dest, true)) {
- ret = xfs_free_eofblocks(dest);
- if (ret)
- goto out_unlock;
- }
+ ret = xfs_reflink_zero_posteof(dest, pos_out);
+ if (ret)
+ goto out_unlock;
/* Set flags and remap blocks. */
ret = xfs_reflink_set_inode_flag(src, dest);
if (ret)
goto out_unlock;
+ /* Zap any page cache for the destination file's range. */
+ truncate_inode_pages_range(&inode_out->i_data, pos_out,
+ PAGE_ALIGN(pos_out + *len) - 1);
+
+ /* If we're altering the file contents... */
+ if (!is_dedupe) {
+ /*
+ * ...update the timestamps (which will grab the ilock again
+ * from xfs_fs_dirty_inode, so we have to call it before we
+ * take the ilock).
+ */
+ if (!(file_out->f_mode & FMODE_NOCMTIME)) {
+ ret = file_update_time(file_out);
+ if (ret)
+ goto out_unlock;
+ }
+
+ /*
+ * ...clear the security bits if the process is not being run
+ * by root. This keeps people from modifying setuid and setgid
+ * binaries.
+ */
+ ret = file_remove_privs(file_out);
+ if (ret)
+ goto out_unlock;
+ }
+
+ return 1;
+out_unlock:
+ xfs_reflink_remap_unlock(file_in, file_out);
+ return ret;
+}
+
+/*
+ * Link a range of blocks from one file to another.
+ */
+int
+xfs_reflink_remap_range(
+ struct file *file_in,
+ loff_t pos_in,
+ struct file *file_out,
+ loff_t pos_out,
+ u64 len,
+ bool is_dedupe)
+{
+ struct inode *inode_in = file_inode(file_in);
+ struct xfs_inode *src = XFS_I(inode_in);
+ struct inode *inode_out = file_inode(file_out);
+ struct xfs_inode *dest = XFS_I(inode_out);
+ struct xfs_mount *mp = src->i_mount;
+ xfs_fileoff_t sfsbno, dfsbno;
+ xfs_filblks_t fsblen;
+ xfs_extlen_t cowextsize;
+ ssize_t ret;
+
+ if (!xfs_sb_version_hasreflink(&mp->m_sb))
+ return -EOPNOTSUPP;
+
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
+ /* Prepare and then clone file data. */
+ ret = xfs_reflink_remap_prep(file_in, pos_in, file_out, pos_out,
+ &len, is_dedupe);
+ if (ret <= 0)
+ return ret;
+
+ trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out);
+
dfsbno = XFS_B_TO_FSBT(mp, pos_out);
sfsbno = XFS_B_TO_FSBT(mp, pos_in);
fsblen = XFS_B_TO_FSB(mp, len);
@@ -1280,10 +1441,6 @@ xfs_reflink_remap_range(
if (ret)
goto out_unlock;
- /* Zap any page cache for the destination file's range. */
- truncate_inode_pages_range(&inode_out->i_data, pos_out,
- PAGE_ALIGN(pos_out + len) - 1);
-
/*
* Carry the cowextsize hint from src to dest if we're sharing the
* entire source file to the entire destination file, the source file
@@ -1300,12 +1457,7 @@ xfs_reflink_remap_range(
is_dedupe);
out_unlock:
- xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
- if (!same_inode)
- xfs_iunlock(src, XFS_MMAPLOCK_SHARED);
- inode_unlock(inode_out);
- if (!same_inode)
- inode_unlock_shared(inode_in);
+ xfs_reflink_remap_unlock(file_in, file_out);
if (ret)
trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_);
return ret;
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
index c585ad9552b2..7f47202b5639 100644
--- a/fs/xfs/xfs_reflink.h
+++ b/fs/xfs/xfs_reflink.h
@@ -10,10 +10,10 @@ extern int xfs_reflink_find_shared(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_extlen_t aglen,
xfs_agblock_t *fbno, xfs_extlen_t *flen, bool find_maximal);
extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
- struct xfs_bmbt_irec *irec, bool *shared, bool *trimmed);
+ struct xfs_bmbt_irec *irec, bool *shared);
extern int xfs_reflink_reserve_cow(struct xfs_inode *ip,
- struct xfs_bmbt_irec *imap, bool *shared);
+ struct xfs_bmbt_irec *imap);
extern int xfs_reflink_allocate_cow(struct xfs_inode *ip,
struct xfs_bmbt_irec *imap, bool *shared, uint *lockmode);
extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset,
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index 4e4423153071..cc509743facd 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -29,30 +29,30 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
char *desc;
int endpoint;
} xstats[] = {
- { "extent_alloc", XFSSTAT_END_EXTENT_ALLOC },
- { "abt", XFSSTAT_END_ALLOC_BTREE },
- { "blk_map", XFSSTAT_END_BLOCK_MAPPING },
- { "bmbt", XFSSTAT_END_BLOCK_MAP_BTREE },
- { "dir", XFSSTAT_END_DIRECTORY_OPS },
- { "trans", XFSSTAT_END_TRANSACTIONS },
- { "ig", XFSSTAT_END_INODE_OPS },
- { "log", XFSSTAT_END_LOG_OPS },
- { "push_ail", XFSSTAT_END_TAIL_PUSHING },
- { "xstrat", XFSSTAT_END_WRITE_CONVERT },
- { "rw", XFSSTAT_END_READ_WRITE_OPS },
- { "attr", XFSSTAT_END_ATTRIBUTE_OPS },
- { "icluster", XFSSTAT_END_INODE_CLUSTER },
- { "vnodes", XFSSTAT_END_VNODE_OPS },
- { "buf", XFSSTAT_END_BUF },
- { "abtb2", XFSSTAT_END_ABTB_V2 },
- { "abtc2", XFSSTAT_END_ABTC_V2 },
- { "bmbt2", XFSSTAT_END_BMBT_V2 },
- { "ibt2", XFSSTAT_END_IBT_V2 },
- { "fibt2", XFSSTAT_END_FIBT_V2 },
- { "rmapbt", XFSSTAT_END_RMAP_V2 },
- { "refcntbt", XFSSTAT_END_REFCOUNT },
+ { "extent_alloc", xfsstats_offset(xs_abt_lookup) },
+ { "abt", xfsstats_offset(xs_blk_mapr) },
+ { "blk_map", xfsstats_offset(xs_bmbt_lookup) },
+ { "bmbt", xfsstats_offset(xs_dir_lookup) },
+ { "dir", xfsstats_offset(xs_trans_sync) },
+ { "trans", xfsstats_offset(xs_ig_attempts) },
+ { "ig", xfsstats_offset(xs_log_writes) },
+ { "log", xfsstats_offset(xs_try_logspace)},
+ { "push_ail", xfsstats_offset(xs_xstrat_quick)},
+ { "xstrat", xfsstats_offset(xs_write_calls) },
+ { "rw", xfsstats_offset(xs_attr_get) },
+ { "attr", xfsstats_offset(xs_iflush_count)},
+ { "icluster", xfsstats_offset(vn_active) },
+ { "vnodes", xfsstats_offset(xb_get) },
+ { "buf", xfsstats_offset(xs_abtb_2) },
+ { "abtb2", xfsstats_offset(xs_abtc_2) },
+ { "abtc2", xfsstats_offset(xs_bmbt_2) },
+ { "bmbt2", xfsstats_offset(xs_ibt_2) },
+ { "ibt2", xfsstats_offset(xs_fibt_2) },
+ { "fibt2", xfsstats_offset(xs_rmap_2) },
+ { "rmapbt", xfsstats_offset(xs_refcbt_2) },
+ { "refcntbt", xfsstats_offset(xs_qm_dqreclaims)},
/* we print both series of quota information together */
- { "qm", XFSSTAT_END_QM },
+ { "qm", xfsstats_offset(xs_xstrat_bytes)},
};
/* Loop over all stats groups */
@@ -104,6 +104,10 @@ void xfs_stats_clearall(struct xfsstats __percpu *stats)
#ifdef CONFIG_PROC_FS
/* legacy quota interfaces */
#ifdef CONFIG_XFS_QUOTA
+
+#define XFSSTAT_START_XQMSTAT xfsstats_offset(xs_qm_dqreclaims)
+#define XFSSTAT_END_XQMSTAT xfsstats_offset(xs_qm_dquot)
+
static int xqm_proc_show(struct seq_file *m, void *v)
{
/* maximum; incore; ratio free to inuse; freelist */
@@ -119,7 +123,7 @@ static int xqmstat_proc_show(struct seq_file *m, void *v)
int j;
seq_printf(m, "qm");
- for (j = XFSSTAT_END_IBT_V2; j < XFSSTAT_END_XQMSTAT; j++)
+ for (j = XFSSTAT_START_XQMSTAT; j < XFSSTAT_END_XQMSTAT; j++)
seq_printf(m, " %u", counter_val(xfsstats.xs_stats, j));
seq_putc(m, '\n');
return 0;
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
index 130db070e4d8..34d704f703d2 100644
--- a/fs/xfs/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
@@ -41,17 +41,14 @@ enum {
* XFS global statistics
*/
struct __xfsstats {
-# define XFSSTAT_END_EXTENT_ALLOC 4
uint32_t xs_allocx;
uint32_t xs_allocb;
uint32_t xs_freex;
uint32_t xs_freeb;
-# define XFSSTAT_END_ALLOC_BTREE (XFSSTAT_END_EXTENT_ALLOC+4)
uint32_t xs_abt_lookup;
uint32_t xs_abt_compare;
uint32_t xs_abt_insrec;
uint32_t xs_abt_delrec;
-# define XFSSTAT_END_BLOCK_MAPPING (XFSSTAT_END_ALLOC_BTREE+7)
uint32_t xs_blk_mapr;
uint32_t xs_blk_mapw;
uint32_t xs_blk_unmap;
@@ -59,21 +56,17 @@ struct __xfsstats {
uint32_t xs_del_exlist;
uint32_t xs_look_exlist;
uint32_t xs_cmp_exlist;
-# define XFSSTAT_END_BLOCK_MAP_BTREE (XFSSTAT_END_BLOCK_MAPPING+4)
uint32_t xs_bmbt_lookup;
uint32_t xs_bmbt_compare;
uint32_t xs_bmbt_insrec;
uint32_t xs_bmbt_delrec;
-# define XFSSTAT_END_DIRECTORY_OPS (XFSSTAT_END_BLOCK_MAP_BTREE+4)
uint32_t xs_dir_lookup;
uint32_t xs_dir_create;
uint32_t xs_dir_remove;
uint32_t xs_dir_getdents;
-# define XFSSTAT_END_TRANSACTIONS (XFSSTAT_END_DIRECTORY_OPS+3)
uint32_t xs_trans_sync;
uint32_t xs_trans_async;
uint32_t xs_trans_empty;
-# define XFSSTAT_END_INODE_OPS (XFSSTAT_END_TRANSACTIONS+7)
uint32_t xs_ig_attempts;
uint32_t xs_ig_found;
uint32_t xs_ig_frecycle;
@@ -81,13 +74,11 @@ struct __xfsstats {
uint32_t xs_ig_dup;
uint32_t xs_ig_reclaims;
uint32_t xs_ig_attrchg;
-# define XFSSTAT_END_LOG_OPS (XFSSTAT_END_INODE_OPS+5)
uint32_t xs_log_writes;
uint32_t xs_log_blocks;
uint32_t xs_log_noiclogs;
uint32_t xs_log_force;
uint32_t xs_log_force_sleep;
-# define XFSSTAT_END_TAIL_PUSHING (XFSSTAT_END_LOG_OPS+10)
uint32_t xs_try_logspace;
uint32_t xs_sleep_logspace;
uint32_t xs_push_ail;
@@ -98,22 +89,17 @@ struct __xfsstats {
uint32_t xs_push_ail_flushing;
uint32_t xs_push_ail_restarts;
uint32_t xs_push_ail_flush;
-# define XFSSTAT_END_WRITE_CONVERT (XFSSTAT_END_TAIL_PUSHING+2)
uint32_t xs_xstrat_quick;
uint32_t xs_xstrat_split;
-# define XFSSTAT_END_READ_WRITE_OPS (XFSSTAT_END_WRITE_CONVERT+2)
uint32_t xs_write_calls;
uint32_t xs_read_calls;
-# define XFSSTAT_END_ATTRIBUTE_OPS (XFSSTAT_END_READ_WRITE_OPS+4)
uint32_t xs_attr_get;
uint32_t xs_attr_set;
uint32_t xs_attr_remove;
uint32_t xs_attr_list;
-# define XFSSTAT_END_INODE_CLUSTER (XFSSTAT_END_ATTRIBUTE_OPS+3)
uint32_t xs_iflush_count;
uint32_t xs_icluster_flushcnt;
uint32_t xs_icluster_flushinode;
-# define XFSSTAT_END_VNODE_OPS (XFSSTAT_END_INODE_CLUSTER+8)
uint32_t vn_active; /* # vnodes not on free lists */
uint32_t vn_alloc; /* # times vn_alloc called */
uint32_t vn_get; /* # times vn_get called */
@@ -122,7 +108,6 @@ struct __xfsstats {
uint32_t vn_reclaim; /* # times vn_reclaim called */
uint32_t vn_remove; /* # times vn_remove called */
uint32_t vn_free; /* # times vn_free called */
-#define XFSSTAT_END_BUF (XFSSTAT_END_VNODE_OPS+9)
uint32_t xb_get;
uint32_t xb_create;
uint32_t xb_get_locked;
@@ -133,28 +118,19 @@ struct __xfsstats {
uint32_t xb_page_found;
uint32_t xb_get_read;
/* Version 2 btree counters */
-#define XFSSTAT_END_ABTB_V2 (XFSSTAT_END_BUF + __XBTS_MAX)
uint32_t xs_abtb_2[__XBTS_MAX];
-#define XFSSTAT_END_ABTC_V2 (XFSSTAT_END_ABTB_V2 + __XBTS_MAX)
uint32_t xs_abtc_2[__XBTS_MAX];
-#define XFSSTAT_END_BMBT_V2 (XFSSTAT_END_ABTC_V2 + __XBTS_MAX)
uint32_t xs_bmbt_2[__XBTS_MAX];
-#define XFSSTAT_END_IBT_V2 (XFSSTAT_END_BMBT_V2 + __XBTS_MAX)
uint32_t xs_ibt_2[__XBTS_MAX];
-#define XFSSTAT_END_FIBT_V2 (XFSSTAT_END_IBT_V2 + __XBTS_MAX)
uint32_t xs_fibt_2[__XBTS_MAX];
-#define XFSSTAT_END_RMAP_V2 (XFSSTAT_END_FIBT_V2 + __XBTS_MAX)
uint32_t xs_rmap_2[__XBTS_MAX];
-#define XFSSTAT_END_REFCOUNT (XFSSTAT_END_RMAP_V2 + __XBTS_MAX)
uint32_t xs_refcbt_2[__XBTS_MAX];
-#define XFSSTAT_END_XQMSTAT (XFSSTAT_END_REFCOUNT + 6)
uint32_t xs_qm_dqreclaims;
uint32_t xs_qm_dqreclaim_misses;
uint32_t xs_qm_dquot_dups;
uint32_t xs_qm_dqcachemisses;
uint32_t xs_qm_dqcachehits;
uint32_t xs_qm_dqwants;
-#define XFSSTAT_END_QM (XFSSTAT_END_XQMSTAT+2)
uint32_t xs_qm_dquot;
uint32_t xs_qm_dquot_unused;
/* Extra precision counters */
@@ -163,10 +139,12 @@ struct __xfsstats {
uint64_t xs_read_bytes;
};
+#define xfsstats_offset(f) (offsetof(struct __xfsstats, f)/sizeof(uint32_t))
+
struct xfsstats {
union {
struct __xfsstats s;
- uint32_t a[XFSSTAT_END_XQMSTAT];
+ uint32_t a[xfsstats_offset(xs_qm_dquot)];
};
};
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 207ee302b1bb..d3e6cd063688 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -43,6 +43,7 @@
#include <linux/dax.h>
#include <linux/init.h>
#include <linux/slab.h>
+#include <linux/magic.h>
#include <linux/mount.h>
#include <linux/mempool.h>
#include <linux/writeback.h>
@@ -933,6 +934,32 @@ xfs_fs_alloc_inode(
return NULL;
}
+#ifdef DEBUG
+static void
+xfs_check_delalloc(
+ struct xfs_inode *ip,
+ int whichfork)
+{
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_bmbt_irec got;
+ struct xfs_iext_cursor icur;
+
+ if (!ifp || !xfs_iext_lookup_extent(ip, ifp, 0, &icur, &got))
+ return;
+ do {
+ if (isnullstartblock(got.br_startblock)) {
+ xfs_warn(ip->i_mount,
+ "ino %llx %s fork has delalloc extent at [0x%llx:0x%llx]",
+ ip->i_ino,
+ whichfork == XFS_DATA_FORK ? "data" : "cow",
+ got.br_startoff, got.br_blockcount);
+ }
+ } while (xfs_iext_next_extent(ifp, &icur, &got));
+}
+#else
+#define xfs_check_delalloc(ip, whichfork) do { } while (0)
+#endif
+
/*
* Now that the generic code is guaranteed not to be accessing
* the linux inode, we can inactivate and reclaim the inode.
@@ -951,7 +978,12 @@ xfs_fs_destroy_inode(
xfs_inactive(ip);
- ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
+ if (!XFS_FORCED_SHUTDOWN(ip->i_mount) && ip->i_delayed_blks) {
+ xfs_check_delalloc(ip, XFS_DATA_FORK);
+ xfs_check_delalloc(ip, XFS_COW_FORK);
+ ASSERT(0);
+ }
+
XFS_STATS_INC(ip->i_mount, vn_reclaim);
/*
@@ -1097,7 +1129,7 @@ xfs_fs_statfs(
xfs_extlen_t lsize;
int64_t ffree;
- statp->f_type = XFS_SB_MAGIC;
+ statp->f_type = XFS_SUPER_MAGIC;
statp->f_namelen = MAXNAMELEN - 1;
id = huge_encode_dev(mp->m_ddev_targp->bt_dev);
@@ -1650,7 +1682,7 @@ xfs_fs_fill_super(
* we must configure the block size in the superblock before we run the
* full mount process as the mount process can lookup and cache inodes.
*/
- sb->s_magic = XFS_SB_MAGIC;
+ sb->s_magic = XFS_SUPER_MAGIC;
sb->s_blocksize = mp->m_sb.sb_blocksize;
sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1;
sb->s_maxbytes = xfs_max_file_offset(sb->s_blocksize_bits);
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index ad315e83bc02..3043e5ed6495 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -473,7 +473,6 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock);
-DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed);
DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push);
DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf);
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index bedc5a5133a5..912b42f5fe4a 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -259,6 +259,14 @@ xfs_trans_alloc(
struct xfs_trans *tp;
int error;
+ /*
+ * Allocate the handle before we do our freeze accounting and setting up
+ * GFP_NOFS allocation context so that we avoid lockdep false positives
+ * by doing GFP_KERNEL allocations inside sb_start_intwrite().
+ */
+ tp = kmem_zone_zalloc(xfs_trans_zone,
+ (flags & XFS_TRANS_NOFS) ? KM_NOFS : KM_SLEEP);
+
if (!(flags & XFS_TRANS_NO_WRITECOUNT))
sb_start_intwrite(mp->m_super);
@@ -270,8 +278,6 @@ xfs_trans_alloc(
mp->m_super->s_writers.frozen == SB_FREEZE_COMPLETE);
atomic_inc(&mp->m_active_trans);
- tp = kmem_zone_zalloc(xfs_trans_zone,
- (flags & XFS_TRANS_NOFS) ? KM_NOFS : KM_SLEEP);
tp->t_magic = XFS_TRANS_HEADER_MAGIC;
tp->t_flags = flags;
tp->t_mountp = mp;
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index c3d278e96ad1..a0c5dbda18aa 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -220,6 +220,7 @@ void xfs_trans_ijoin(struct xfs_trans *, struct xfs_inode *, uint);
void xfs_trans_log_buf(struct xfs_trans *, struct xfs_buf *, uint,
uint);
void xfs_trans_dirty_buf(struct xfs_trans *, struct xfs_buf *);
+bool xfs_trans_buf_is_dirty(struct xfs_buf *bp);
void xfs_trans_log_inode(xfs_trans_t *, struct xfs_inode *, uint);
void xfs_extent_free_init_defer_op(void);
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 55326f971cb3..d3a4e89bf4a0 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -531,17 +531,33 @@ xfsaild(
set_current_state(TASK_INTERRUPTIBLE);
/*
- * Check kthread_should_stop() after we set the task state
- * to guarantee that we either see the stop bit and exit or
- * the task state is reset to runnable such that it's not
- * scheduled out indefinitely and detects the stop bit at
- * next iteration.
- *
+ * Check kthread_should_stop() after we set the task state to
+ * guarantee that we either see the stop bit and exit or the
+ * task state is reset to runnable such that it's not scheduled
+ * out indefinitely and detects the stop bit at next iteration.
* A memory barrier is included in above task state set to
* serialize again kthread_stop().
*/
if (kthread_should_stop()) {
__set_current_state(TASK_RUNNING);
+
+ /*
+ * The caller forces out the AIL before stopping the
+ * thread in the common case, which means the delwri
+ * queue is drained. In the shutdown case, the queue may
+ * still hold relogged buffers that haven't been
+ * submitted because they were pinned since added to the
+ * queue.
+ *
+ * Log I/O error processing stales the underlying buffer
+ * and clears the delwri state, expecting the buf to be
+ * removed on the next submission attempt. That won't
+ * happen if we're shutting down, so this is the last
+ * opportunity to release such buffers from the queue.
+ */
+ ASSERT(list_empty(&ailp->ail_buf_list) ||
+ XFS_FORCED_SHUTDOWN(ailp->ail_mount));
+ xfs_buf_delwri_cancel(&ailp->ail_buf_list);
break;
}
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 15919f67a88f..629f1479c9d2 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -264,11 +264,39 @@ xfs_trans_read_buf_map(
return -EIO;
}
+ /*
+ * Check if the caller is trying to read a buffer that is
+ * already attached to the transaction yet has no buffer ops
+ * assigned. Ops are usually attached when the buffer is
+ * attached to the transaction, or by the read caller if
+ * special circumstances. That didn't happen, which is not
+ * how this is supposed to go.
+ *
+ * If the buffer passes verification we'll let this go, but if
+ * not we have to shut down. Let the transaction cleanup code
+ * release this buffer when it kills the tranaction.
+ */
+ ASSERT(bp->b_ops != NULL);
+ error = xfs_buf_ensure_ops(bp, ops);
+ if (error) {
+ xfs_buf_ioerror_alert(bp, __func__);
+
+ if (tp->t_flags & XFS_TRANS_DIRTY)
+ xfs_force_shutdown(tp->t_mountp,
+ SHUTDOWN_META_IO_ERROR);
+
+ /* bad CRC means corrupted metadata */
+ if (error == -EFSBADCRC)
+ error = -EFSCORRUPTED;
+ return error;
+ }
+
bip = bp->b_log_item;
bip->bli_recur++;
ASSERT(atomic_read(&bip->bli_refcount) > 0);
trace_xfs_trans_read_buf_recur(bip);
+ ASSERT(bp->b_ops != NULL || ops == NULL);
*bpp = bp;
return 0;
}
@@ -316,55 +344,58 @@ xfs_trans_read_buf_map(
_xfs_trans_bjoin(tp, bp, 1);
trace_xfs_trans_read_buf(bp->b_log_item);
}
+ ASSERT(bp->b_ops != NULL || ops == NULL);
*bpp = bp;
return 0;
}
+/* Has this buffer been dirtied by anyone? */
+bool
+xfs_trans_buf_is_dirty(
+ struct xfs_buf *bp)
+{
+ struct xfs_buf_log_item *bip = bp->b_log_item;
+
+ if (!bip)
+ return false;
+ ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
+ return test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags);
+}
+
/*
- * Release the buffer bp which was previously acquired with one of the
- * xfs_trans_... buffer allocation routines if the buffer has not
- * been modified within this transaction. If the buffer is modified
- * within this transaction, do decrement the recursion count but do
- * not release the buffer even if the count goes to 0. If the buffer is not
- * modified within the transaction, decrement the recursion count and
- * release the buffer if the recursion count goes to 0.
+ * Release a buffer previously joined to the transaction. If the buffer is
+ * modified within this transaction, decrement the recursion count but do not
+ * release the buffer even if the count goes to 0. If the buffer is not modified
+ * within the transaction, decrement the recursion count and release the buffer
+ * if the recursion count goes to 0.
*
- * If the buffer is to be released and it was not modified before
- * this transaction began, then free the buf_log_item associated with it.
+ * If the buffer is to be released and it was not already dirty before this
+ * transaction began, then also free the buf_log_item associated with it.
*
- * If the transaction pointer is NULL, make this just a normal
- * brelse() call.
+ * If the transaction pointer is NULL, this is a normal xfs_buf_relse() call.
*/
void
xfs_trans_brelse(
- xfs_trans_t *tp,
- xfs_buf_t *bp)
+ struct xfs_trans *tp,
+ struct xfs_buf *bp)
{
- struct xfs_buf_log_item *bip;
- int freed;
+ struct xfs_buf_log_item *bip = bp->b_log_item;
- /*
- * Default to a normal brelse() call if the tp is NULL.
- */
- if (tp == NULL) {
- ASSERT(bp->b_transp == NULL);
+ ASSERT(bp->b_transp == tp);
+
+ if (!tp) {
xfs_buf_relse(bp);
return;
}
- ASSERT(bp->b_transp == tp);
- bip = bp->b_log_item;
+ trace_xfs_trans_brelse(bip);
ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
- ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
- ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL));
ASSERT(atomic_read(&bip->bli_refcount) > 0);
- trace_xfs_trans_brelse(bip);
-
/*
- * If the release is just for a recursive lock,
- * then decrement the count and return.
+ * If the release is for a recursive lookup, then decrement the count
+ * and return.
*/
if (bip->bli_recur > 0) {
bip->bli_recur--;
@@ -372,64 +403,24 @@ xfs_trans_brelse(
}
/*
- * If the buffer is dirty within this transaction, we can't
+ * If the buffer is invalidated or dirty in this transaction, we can't
* release it until we commit.
*/
if (test_bit(XFS_LI_DIRTY, &bip->bli_item.li_flags))
return;
-
- /*
- * If the buffer has been invalidated, then we can't release
- * it until the transaction commits to disk unless it is re-dirtied
- * as part of this transaction. This prevents us from pulling
- * the item from the AIL before we should.
- */
if (bip->bli_flags & XFS_BLI_STALE)
return;
- ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
-
/*
- * Free up the log item descriptor tracking the released item.
+ * Unlink the log item from the transaction and clear the hold flag, if
+ * set. We wouldn't want the next user of the buffer to get confused.
*/
+ ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
xfs_trans_del_item(&bip->bli_item);
+ bip->bli_flags &= ~XFS_BLI_HOLD;
- /*
- * Clear the hold flag in the buf log item if it is set.
- * We wouldn't want the next user of the buffer to
- * get confused.
- */
- if (bip->bli_flags & XFS_BLI_HOLD) {
- bip->bli_flags &= ~XFS_BLI_HOLD;
- }
-
- /*
- * Drop our reference to the buf log item.
- */
- freed = atomic_dec_and_test(&bip->bli_refcount);
-
- /*
- * If the buf item is not tracking data in the log, then we must free it
- * before releasing the buffer back to the free pool.
- *
- * If the fs has shutdown and we dropped the last reference, it may fall
- * on us to release a (possibly dirty) bli if it never made it to the
- * AIL (e.g., the aborted unpin already happened and didn't release it
- * due to our reference). Since we're already shutdown and need
- * ail_lock, just force remove from the AIL and release the bli here.
- */
- if (XFS_FORCED_SHUTDOWN(tp->t_mountp) && freed) {
- xfs_trans_ail_remove(&bip->bli_item, SHUTDOWN_LOG_IO_ERROR);
- xfs_buf_item_relse(bp);
- } else if (!(bip->bli_flags & XFS_BLI_DIRTY)) {
-/***
- ASSERT(bp->b_pincount == 0);
-***/
- ASSERT(atomic_read(&bip->bli_refcount) == 0);
- ASSERT(!test_bit(XFS_LI_IN_AIL, &bip->bli_item.li_flags));
- ASSERT(!(bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF));
- xfs_buf_item_relse(bp);
- }
+ /* drop the reference to the bli */
+ xfs_buf_item_put(bip);
bp->b_transp = NULL;
xfs_buf_relse(bp);