aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/Kconfig57
-rw-r--r--fs/Makefile6
-rw-r--r--fs/affs/file.c2
-rw-r--r--fs/affs/inode.c3
-rw-r--r--fs/afs/write.c2
-rw-r--r--fs/aio.c100
-rw-r--r--fs/anon_inodes.c7
-rw-r--r--fs/autofs/inode.c2
-rw-r--r--fs/autofs4/inode.c4
-rw-r--r--fs/bad_inode.c6
-rw-r--r--fs/befs/linuxvfs.c5
-rw-r--r--fs/binfmt_aout.c81
-rw-r--r--fs/binfmt_misc.c3
-rw-r--r--fs/bio-integrity.c2
-rw-r--r--fs/bio.c320
-rw-r--r--fs/block_dev.c9
-rw-r--r--fs/buffer.c23
-rw-r--r--fs/cifs/Makefile2
-rw-r--r--fs/cifs/cifsfs.c7
-rw-r--r--fs/cifs/cifsfs.h1
-rw-r--r--fs/cifs/fcntl.c118
-rw-r--r--fs/cifs/file.c2
-rw-r--r--fs/cifs/inode.c2
-rw-r--r--fs/coda/file.c12
-rw-r--r--fs/configfs/inode.c3
-rw-r--r--fs/cramfs/inode.c2
-rw-r--r--fs/dcache.c25
-rw-r--r--fs/dcookies.c28
-rw-r--r--fs/debugfs/inode.c3
-rw-r--r--fs/devpts/inode.c472
-rw-r--r--fs/dquot.c436
-rw-r--r--fs/ecryptfs/file.c15
-rw-r--r--fs/ecryptfs/inode.c6
-rw-r--r--fs/ecryptfs/mmap.c2
-rw-r--r--fs/exec.c57
-rw-r--r--fs/ext2/ialloc.c6
-rw-r--r--fs/ext2/inode.c7
-rw-r--r--fs/ext2/namei.c15
-rw-r--r--fs/ext3/ialloc.c6
-rw-r--r--fs/ext3/inode.c9
-rw-r--r--fs/ext3/namei.c18
-rw-r--r--fs/ext3/super.c16
-rw-r--r--fs/ext4/ext4_sb.h6
-rw-r--r--fs/ext4/ialloc.c6
-rw-r--r--fs/ext4/inode.c11
-rw-r--r--fs/ext4/namei.c17
-rw-r--r--fs/ext4/super.c23
-rw-r--r--fs/fat/dir.c1
-rw-r--r--fs/fat/inode.c2
-rw-r--r--fs/fat/namei_vfat.c2
-rw-r--r--fs/file_table.c10
-rw-r--r--fs/filesystems.c23
-rw-r--r--fs/freevxfs/vxfs_inode.c4
-rw-r--r--fs/fuse/file.c4
-rw-r--r--fs/gfs2/ops_address.c2
-rw-r--r--fs/hostfs/hostfs_kern.c2
-rw-r--r--fs/hugetlbfs/inode.c1
-rw-r--r--fs/inode.c270
-rw-r--r--fs/isofs/inode.c6
-rw-r--r--fs/jbd2/commit.c9
-rw-r--r--fs/jbd2/journal.c19
-rw-r--r--fs/jbd2/transaction.c47
-rw-r--r--fs/jffs2/file.c2
-rw-r--r--fs/jfs/inode.c8
-rw-r--r--fs/jfs/jfs_imap.c10
-rw-r--r--fs/jfs/jfs_inode.c29
-rw-r--r--fs/jfs/namei.c24
-rw-r--r--fs/libfs.c7
-rw-r--r--fs/lockd/clntlock.c23
-rw-r--r--fs/lockd/host.c10
-rw-r--r--fs/lockd/svc.c6
-rw-r--r--fs/namei.c160
-rw-r--r--fs/namespace.c2
-rw-r--r--fs/nfs/callback.c36
-rw-r--r--fs/nfs/client.c95
-rw-r--r--fs/nfs/delegation.c260
-rw-r--r--fs/nfs/delegation.h33
-rw-r--r--fs/nfs/dir.c24
-rw-r--r--fs/nfs/file.c2
-rw-r--r--fs/nfs/inode.c13
-rw-r--r--fs/nfs/internal.h14
-rw-r--r--fs/nfs/mount_clnt.c34
-rw-r--r--fs/nfs/nfs4_fs.h32
-rw-r--r--fs/nfs/nfs4proc.c431
-rw-r--r--fs/nfs/nfs4renewd.c22
-rw-r--r--fs/nfs/nfs4state.c415
-rw-r--r--fs/nfs/nfs4xdr.c1235
-rw-r--r--fs/nfs/nfsroot.c27
-rw-r--r--fs/nfs/read.c6
-rw-r--r--fs/nfs/super.c44
-rw-r--r--fs/nfs_common/nfsacl.c4
-rw-r--r--fs/nfsctl.c5
-rw-r--r--fs/nfsd/nfs4callback.c9
-rw-r--r--fs/nfsd/nfs4state.c12
-rw-r--r--fs/nfsd/vfs.c43
-rw-r--r--fs/notify/Kconfig2
-rw-r--r--fs/notify/Makefile2
-rw-r--r--fs/notify/dnotify/Kconfig10
-rw-r--r--fs/notify/dnotify/Makefile1
-rw-r--r--fs/notify/dnotify/dnotify.c (renamed from fs/dnotify.c)3
-rw-r--r--fs/notify/inotify/Kconfig27
-rw-r--r--fs/notify/inotify/Makefile2
-rw-r--r--fs/notify/inotify/inotify.c (renamed from fs/inotify.c)0
-rw-r--r--fs/notify/inotify/inotify_user.c (renamed from fs/inotify_user.c)4
-rw-r--r--fs/ntfs/inode.c3
-rw-r--r--fs/ocfs2/Makefile7
-rw-r--r--fs/ocfs2/acl.c479
-rw-r--r--fs/ocfs2/acl.h58
-rw-r--r--fs/ocfs2/alloc.c710
-rw-r--r--fs/ocfs2/alloc.h30
-rw-r--r--fs/ocfs2/aops.c59
-rw-r--r--fs/ocfs2/blockcheck.c477
-rw-r--r--fs/ocfs2/blockcheck.h82
-rw-r--r--fs/ocfs2/buffer_head_io.c32
-rw-r--r--fs/ocfs2/buffer_head_io.h27
-rw-r--r--fs/ocfs2/cluster/masklog.c1
-rw-r--r--fs/ocfs2/cluster/masklog.h1
-rw-r--r--fs/ocfs2/dir.c399
-rw-r--r--fs/ocfs2/dir.h2
-rw-r--r--fs/ocfs2/dlm/dlmast.c52
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h3
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c53
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c1
-rw-r--r--fs/ocfs2/dlm/dlmfs.c2
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c42
-rw-r--r--fs/ocfs2/dlm/dlmthread.c3
-rw-r--r--fs/ocfs2/dlmglue.c168
-rw-r--r--fs/ocfs2/dlmglue.h19
-rw-r--r--fs/ocfs2/extent_map.c96
-rw-r--r--fs/ocfs2/extent_map.h24
-rw-r--r--fs/ocfs2/file.c209
-rw-r--r--fs/ocfs2/file.h3
-rw-r--r--fs/ocfs2/inode.c175
-rw-r--r--fs/ocfs2/inode.h18
-rw-r--r--fs/ocfs2/journal.c364
-rw-r--r--fs/ocfs2/journal.h128
-rw-r--r--fs/ocfs2/localalloc.c26
-rw-r--r--fs/ocfs2/namei.c318
-rw-r--r--fs/ocfs2/ocfs2.h46
-rw-r--r--fs/ocfs2/ocfs2_fs.h213
-rw-r--r--fs/ocfs2/ocfs2_jbd_compat.h82
-rw-r--r--fs/ocfs2/ocfs2_lockid.h5
-rw-r--r--fs/ocfs2/quota.h119
-rw-r--r--fs/ocfs2/quota_global.c1025
-rw-r--r--fs/ocfs2/quota_local.c1253
-rw-r--r--fs/ocfs2/resize.c76
-rw-r--r--fs/ocfs2/slot_map.c4
-rw-r--r--fs/ocfs2/suballoc.c363
-rw-r--r--fs/ocfs2/suballoc.h18
-rw-r--r--fs/ocfs2/super.c328
-rw-r--r--fs/ocfs2/symlink.c2
-rw-r--r--fs/ocfs2/xattr.c2984
-rw-r--r--fs/ocfs2/xattr.h45
-rw-r--r--fs/omfs/inode.c1
-rw-r--r--fs/open.c7
-rw-r--r--fs/openpromfs/inode.c3
-rw-r--r--fs/pipe.c7
-rw-r--r--fs/proc/base.c4
-rw-r--r--fs/proc/proc_sysctl.c1
-rw-r--r--fs/proc/stat.c7
-rw-r--r--fs/quota.c11
-rw-r--r--fs/quota_tree.c645
-rw-r--r--fs/quota_tree.h25
-rw-r--r--fs/quota_v1.c28
-rw-r--r--fs/quota_v2.c631
-rw-r--r--fs/quotaio_v1.h33
-rw-r--r--fs/quotaio_v2.h60
-rw-r--r--fs/ramfs/inode.c1
-rw-r--r--fs/read_write.c13
-rw-r--r--fs/reiserfs/inode.c30
-rw-r--r--fs/reiserfs/namei.c8
-rw-r--r--fs/reiserfs/super.c10
-rw-r--r--fs/romfs/inode.c1
-rw-r--r--fs/seq_file.c13
-rw-r--r--fs/smbfs/file.c2
-rw-r--r--fs/stat.c2
-rw-r--r--fs/sync.c48
-rw-r--r--fs/sysfs/inode.c3
-rw-r--r--fs/sysv/inode.c6
-rw-r--r--fs/ubifs/budget.c208
-rw-r--r--fs/ubifs/commit.c25
-rw-r--r--fs/ubifs/compress.c18
-rw-r--r--fs/ubifs/debug.c265
-rw-r--r--fs/ubifs/debug.h117
-rw-r--r--fs/ubifs/file.c17
-rw-r--r--fs/ubifs/ioctl.c2
-rw-r--r--fs/ubifs/journal.c4
-rw-r--r--fs/ubifs/key.h32
-rw-r--r--fs/ubifs/lprops.c14
-rw-r--r--fs/ubifs/lpt.c45
-rw-r--r--fs/ubifs/lpt_commit.c210
-rw-r--r--fs/ubifs/orphan.c2
-rw-r--r--fs/ubifs/replay.c15
-rw-r--r--fs/ubifs/sb.c20
-rw-r--r--fs/ubifs/super.c255
-rw-r--r--fs/ubifs/tnc.c31
-rw-r--r--fs/ubifs/tnc_commit.c9
-rw-r--r--fs/ubifs/ubifs-media.h7
-rw-r--r--fs/ubifs/ubifs.h111
-rw-r--r--fs/xattr.c2
-rw-r--r--fs/xfs/Makefile6
-rw-r--r--fs/xfs/linux-2.6/sv.h22
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c66
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.h3
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c87
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.h30
-rw-r--r--fs/xfs/linux-2.6/xfs_cred.h8
-rw-r--r--fs/xfs/linux-2.6/xfs_export.c1
-rw-r--r--fs/xfs/linux-2.6/xfs_file.c189
-rw-r--r--fs/xfs/linux-2.6/xfs_fs_subr.c23
-rw-r--r--fs/xfs/linux-2.6/xfs_globals.c8
-rw-r--r--fs/xfs/linux-2.6/xfs_globals.h1
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.c223
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl.h82
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl32.c849
-rw-r--r--fs/xfs/linux-2.6/xfs_ioctl32.h214
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.c122
-rw-r--r--fs/xfs/linux-2.6/xfs_iops.h1
-rw-r--r--fs/xfs/linux-2.6/xfs_linux.h13
-rw-r--r--fs/xfs/linux-2.6/xfs_lrw.c50
-rw-r--r--fs/xfs/linux-2.6/xfs_stats.c6
-rw-r--r--fs/xfs/linux-2.6/xfs_stats.h65
-rw-r--r--fs/xfs/linux-2.6/xfs_super.c884
-rw-r--r--fs/xfs/linux-2.6/xfs_super.h15
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.c762
-rw-r--r--fs/xfs/linux-2.6/xfs_sync.h55
-rw-r--r--fs/xfs/linux-2.6/xfs_sysctl.c11
-rw-r--r--fs/xfs/linux-2.6/xfs_sysctl.h3
-rw-r--r--fs/xfs/linux-2.6/xfs_vfs.h77
-rw-r--r--fs/xfs/linux-2.6/xfs_vnode.c145
-rw-r--r--fs/xfs/linux-2.6/xfs_vnode.h72
-rw-r--r--fs/xfs/quota/xfs_dquot.c39
-rw-r--r--fs/xfs/quota/xfs_dquot.h4
-rw-r--r--fs/xfs/quota/xfs_dquot_item.c45
-rw-r--r--fs/xfs/quota/xfs_qm.c57
-rw-r--r--fs/xfs/quota/xfs_qm.h3
-rw-r--r--fs/xfs/quota/xfs_qm_bhv.c5
-rw-r--r--fs/xfs/quota/xfs_qm_syscalls.c151
-rw-r--r--fs/xfs/support/debug.c39
-rw-r--r--fs/xfs/support/debug.h2
-rw-r--r--fs/xfs/support/ktrace.c9
-rw-r--r--fs/xfs/xfs.h2
-rw-r--r--fs/xfs/xfs_acl.c2
-rw-r--r--fs/xfs/xfs_ag.h15
-rw-r--r--fs/xfs/xfs_alloc.c264
-rw-r--r--fs/xfs/xfs_alloc.h27
-rw-r--r--fs/xfs/xfs_alloc_btree.c2387
-rw-r--r--fs/xfs/xfs_alloc_btree.h107
-rw-r--r--fs/xfs/xfs_arch.h39
-rw-r--r--fs/xfs/xfs_bit.h3
-rw-r--r--fs/xfs/xfs_bmap.c410
-rw-r--r--fs/xfs/xfs_bmap.h72
-rw-r--r--fs/xfs/xfs_bmap_btree.c2617
-rw-r--r--fs/xfs/xfs_bmap_btree.h171
-rw-r--r--fs/xfs/xfs_btree.c3596
-rw-r--r--fs/xfs/xfs_btree.h392
-rw-r--r--fs/xfs/xfs_btree_trace.c249
-rw-r--r--fs/xfs/xfs_btree_trace.h116
-rw-r--r--fs/xfs/xfs_buf_item.c45
-rw-r--r--fs/xfs/xfs_clnt.h105
-rw-r--r--fs/xfs/xfs_da_btree.h24
-rw-r--r--fs/xfs/xfs_dfrag.c8
-rw-r--r--fs/xfs/xfs_dfrag.h2
-rw-r--r--fs/xfs/xfs_dinode.h148
-rw-r--r--fs/xfs/xfs_dir2_sf.h7
-rw-r--r--fs/xfs/xfs_dmops.c5
-rw-r--r--fs/xfs/xfs_error.c15
-rw-r--r--fs/xfs/xfs_error.h12
-rw-r--r--fs/xfs/xfs_extfree_item.c45
-rw-r--r--fs/xfs/xfs_fs.h22
-rw-r--r--fs/xfs/xfs_fsops.c30
-rw-r--r--fs/xfs/xfs_ialloc.c449
-rw-r--r--fs/xfs/xfs_ialloc.h31
-rw-r--r--fs/xfs/xfs_ialloc_btree.c2193
-rw-r--r--fs/xfs/xfs_ialloc_btree.h111
-rw-r--r--fs/xfs/xfs_iget.c735
-rw-r--r--fs/xfs/xfs_imap.h40
-rw-r--r--fs/xfs/xfs_inode.c587
-rw-r--r--fs/xfs/xfs_inode.h375
-rw-r--r--fs/xfs/xfs_inode_item.c45
-rw-r--r--fs/xfs/xfs_inode_item.h41
-rw-r--r--fs/xfs/xfs_iomap.c28
-rw-r--r--fs/xfs/xfs_itable.c102
-rw-r--r--fs/xfs/xfs_itable.h14
-rw-r--r--fs/xfs/xfs_log.c81
-rw-r--r--fs/xfs/xfs_log.h4
-rw-r--r--fs/xfs/xfs_log_priv.h48
-rw-r--r--fs/xfs/xfs_log_recover.c416
-rw-r--r--fs/xfs/xfs_mount.c81
-rw-r--r--fs/xfs/xfs_mount.h73
-rw-r--r--fs/xfs/xfs_qmops.c5
-rw-r--r--fs/xfs/xfs_quota.h8
-rw-r--r--fs/xfs/xfs_rename.c61
-rw-r--r--fs/xfs/xfs_rtalloc.c41
-rw-r--r--fs/xfs/xfs_rw.c2
-rw-r--r--fs/xfs/xfs_sb.h167
-rw-r--r--fs/xfs/xfs_trans.c22
-rw-r--r--fs/xfs/xfs_trans.h322
-rw-r--r--fs/xfs/xfs_trans_ail.c362
-rw-r--r--fs/xfs/xfs_trans_buf.c7
-rw-r--r--fs/xfs/xfs_trans_inode.c30
-rw-r--r--fs/xfs/xfs_trans_item.c10
-rw-r--r--fs/xfs/xfs_trans_priv.h98
-rw-r--r--fs/xfs/xfs_utils.c12
-rw-r--r--fs/xfs/xfs_vfsops.c757
-rw-r--r--fs/xfs/xfs_vfsops.h16
-rw-r--r--fs/xfs/xfs_vnodeops.c354
-rw-r--r--fs/xfs/xfs_vnodeops.h10
308 files changed, 23805 insertions, 18455 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 522469a7eca3..f9b6e2979aaa 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -189,6 +189,8 @@ config OCFS2_FS
select CONFIGFS_FS
select JBD2
select CRC32
+ select QUOTA
+ select QUOTA_TREE
help
OCFS2 is a general purpose extent based shared disk cluster file
system with many similarities to ext3. It supports 64 bit inode
@@ -258,56 +260,18 @@ config OCFS2_DEBUG_FS
this option for debugging only as it is likely to decrease
performance of the filesystem.
-config OCFS2_COMPAT_JBD
- bool "Use JBD for compatibility"
+config OCFS2_FS_POSIX_ACL
+ bool "OCFS2 POSIX Access Control Lists"
depends on OCFS2_FS
+ select FS_POSIX_ACL
default n
- select JBD
help
- The ocfs2 filesystem now uses JBD2 for its journalling. JBD2
- is backwards compatible with JBD. It is safe to say N here.
- However, if you really want to use the original JBD, say Y here.
+ POSIX Access Control Lists (ACLs) support permissions for users and
+ groups beyond the owner/group/world scheme.
endif # BLOCK
-config DNOTIFY
- bool "Dnotify support"
- default y
- help
- Dnotify is a directory-based per-fd file change notification system
- that uses signals to communicate events to user-space. There exist
- superior alternatives, but some applications may still rely on
- dnotify.
-
- If unsure, say Y.
-
-config INOTIFY
- bool "Inotify file change notification support"
- default y
- ---help---
- Say Y here to enable inotify support. Inotify is a file change
- notification system and a replacement for dnotify. Inotify fixes
- numerous shortcomings in dnotify and introduces several new features
- including multiple file events, one-shot support, and unmount
- notification.
-
- For more information, see <file:Documentation/filesystems/inotify.txt>
-
- If unsure, say Y.
-
-config INOTIFY_USER
- bool "Inotify support for userspace"
- depends on INOTIFY
- default y
- ---help---
- Say Y here to enable inotify support for userspace, including the
- associated system calls. Inotify allows monitoring of both files and
- directories via a single open fd. Events are read from the file
- descriptor, which is also select()- and poll()-able.
-
- For more information, see <file:Documentation/filesystems/inotify.txt>
-
- If unsure, say Y.
+source "fs/notify/Kconfig"
config QUOTA
bool "Quota support"
@@ -340,6 +304,10 @@ config PRINT_QUOTA_WARNING
Note that this behavior is currently deprecated and may go away in
future. Please use notification via netlink socket instead.
+# Generic support for tree structured quota files. Selected when needed.
+config QUOTA_TREE
+ tristate
+
config QFMT_V1
tristate "Old quota format support"
depends on QUOTA
@@ -351,6 +319,7 @@ config QFMT_V1
config QFMT_V2
tristate "Quota format v2 support"
depends on QUOTA
+ select QUOTA_TREE
help
This quota format allows using quotas with 32-bit UIDs/GIDs. If you
need this functionality say Y here.
diff --git a/fs/Makefile b/fs/Makefile
index d9f8afe6f0c4..c830611550d3 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -20,8 +20,7 @@ obj-y += no-block.o
endif
obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o
-obj-$(CONFIG_INOTIFY) += inotify.o
-obj-$(CONFIG_INOTIFY_USER) += inotify_user.o
+obj-y += notify/
obj-$(CONFIG_EPOLL) += eventpoll.o
obj-$(CONFIG_ANON_INODES) += anon_inodes.o
obj-$(CONFIG_SIGNALFD) += signalfd.o
@@ -55,10 +54,9 @@ obj-$(CONFIG_GENERIC_ACL) += generic_acl.o
obj-$(CONFIG_QUOTA) += dquot.o
obj-$(CONFIG_QFMT_V1) += quota_v1.o
obj-$(CONFIG_QFMT_V2) += quota_v2.o
+obj-$(CONFIG_QUOTA_TREE) += quota_tree.o
obj-$(CONFIG_QUOTACTL) += quota.o
-obj-$(CONFIG_DNOTIFY) += dnotify.o
-
obj-$(CONFIG_PROC_FS) += proc/
obj-y += partitions/
obj-$(CONFIG_SYSFS) += sysfs/
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 1377b1240b6e..9246cb4aa018 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -628,7 +628,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
}
index = pos >> PAGE_CACHE_SHIFT;
- page = __grab_cache_page(mapping, index);
+ page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
*pagep = page;
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 415d9c67ac16..3c4ec7d864c4 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -119,8 +119,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
goto bad_inode;
#else
inode->i_mode |= S_IFDIR;
- inode->i_op = NULL;
- inode->i_fop = NULL;
+ /* ... and leave ->i_op and ->i_fop pointing to empty */
break;
#endif
case ST_LINKFILE:
diff --git a/fs/afs/write.c b/fs/afs/write.c
index d6b85dab35fc..3fb36d433621 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -144,7 +144,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
candidate->state = AFS_WBACK_PENDING;
init_waitqueue_head(&candidate->waitq);
- page = __grab_cache_page(mapping, index);
+ page = grab_cache_page_write_begin(mapping, index, flags);
if (!page) {
kfree(candidate);
return -ENOMEM;
diff --git a/fs/aio.c b/fs/aio.c
index f658441d5666..d6f89d3c15e8 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -191,6 +191,20 @@ static int aio_setup_ring(struct kioctx *ctx)
kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km); \
} while(0)
+static void ctx_rcu_free(struct rcu_head *head)
+{
+ struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
+ unsigned nr_events = ctx->max_reqs;
+
+ kmem_cache_free(kioctx_cachep, ctx);
+
+ if (nr_events) {
+ spin_lock(&aio_nr_lock);
+ BUG_ON(aio_nr - nr_events > aio_nr);
+ aio_nr -= nr_events;
+ spin_unlock(&aio_nr_lock);
+ }
+}
/* __put_ioctx
* Called when the last user of an aio context has gone away,
@@ -198,8 +212,6 @@ static int aio_setup_ring(struct kioctx *ctx)
*/
static void __put_ioctx(struct kioctx *ctx)
{
- unsigned nr_events = ctx->max_reqs;
-
BUG_ON(ctx->reqs_active);
cancel_delayed_work(&ctx->wq);
@@ -208,14 +220,7 @@ static void __put_ioctx(struct kioctx *ctx)
mmdrop(ctx->mm);
ctx->mm = NULL;
pr_debug("__put_ioctx: freeing %p\n", ctx);
- kmem_cache_free(kioctx_cachep, ctx);
-
- if (nr_events) {
- spin_lock(&aio_nr_lock);
- BUG_ON(aio_nr - nr_events > aio_nr);
- aio_nr -= nr_events;
- spin_unlock(&aio_nr_lock);
- }
+ call_rcu(&ctx->rcu_head, ctx_rcu_free);
}
#define get_ioctx(kioctx) do { \
@@ -235,6 +240,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
{
struct mm_struct *mm;
struct kioctx *ctx;
+ int did_sync = 0;
/* Prevent overflows */
if ((nr_events > (0x10000000U / sizeof(struct io_event))) ||
@@ -267,21 +273,30 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
goto out_freectx;
/* limit the number of system wide aios */
- spin_lock(&aio_nr_lock);
- if (aio_nr + ctx->max_reqs > aio_max_nr ||
- aio_nr + ctx->max_reqs < aio_nr)
- ctx->max_reqs = 0;
- else
- aio_nr += ctx->max_reqs;
- spin_unlock(&aio_nr_lock);
+ do {
+ spin_lock_bh(&aio_nr_lock);
+ if (aio_nr + nr_events > aio_max_nr ||
+ aio_nr + nr_events < aio_nr)
+ ctx->max_reqs = 0;
+ else
+ aio_nr += ctx->max_reqs;
+ spin_unlock_bh(&aio_nr_lock);
+ if (ctx->max_reqs || did_sync)
+ break;
+
+ /* wait for rcu callbacks to have completed before giving up */
+ synchronize_rcu();
+ did_sync = 1;
+ ctx->max_reqs = nr_events;
+ } while (1);
+
if (ctx->max_reqs == 0)
goto out_cleanup;
/* now link into global list. */
- write_lock(&mm->ioctx_list_lock);
- ctx->next = mm->ioctx_list;
- mm->ioctx_list = ctx;
- write_unlock(&mm->ioctx_list_lock);
+ spin_lock(&mm->ioctx_lock);
+ hlist_add_head_rcu(&ctx->list, &mm->ioctx_list);
+ spin_unlock(&mm->ioctx_lock);
dprintk("aio: allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
ctx, ctx->user_id, current->mm, ctx->ring_info.nr);
@@ -375,11 +390,12 @@ ssize_t wait_on_sync_kiocb(struct kiocb *iocb)
*/
void exit_aio(struct mm_struct *mm)
{
- struct kioctx *ctx = mm->ioctx_list;
- mm->ioctx_list = NULL;
- while (ctx) {
- struct kioctx *next = ctx->next;
- ctx->next = NULL;
+ struct kioctx *ctx;
+
+ while (!hlist_empty(&mm->ioctx_list)) {
+ ctx = hlist_entry(mm->ioctx_list.first, struct kioctx, list);
+ hlist_del_rcu(&ctx->list);
+
aio_cancel_all(ctx);
wait_for_all_aios(ctx);
@@ -394,7 +410,6 @@ void exit_aio(struct mm_struct *mm)
atomic_read(&ctx->users), ctx->dead,
ctx->reqs_active);
put_ioctx(ctx);
- ctx = next;
}
}
@@ -555,19 +570,21 @@ int aio_put_req(struct kiocb *req)
static struct kioctx *lookup_ioctx(unsigned long ctx_id)
{
- struct kioctx *ioctx;
- struct mm_struct *mm;
+ struct mm_struct *mm = current->mm;
+ struct kioctx *ctx = NULL;
+ struct hlist_node *n;
- mm = current->mm;
- read_lock(&mm->ioctx_list_lock);
- for (ioctx = mm->ioctx_list; ioctx; ioctx = ioctx->next)
- if (likely(ioctx->user_id == ctx_id && !ioctx->dead)) {
- get_ioctx(ioctx);
+ rcu_read_lock();
+
+ hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) {
+ if (ctx->user_id == ctx_id && !ctx->dead) {
+ get_ioctx(ctx);
break;
}
- read_unlock(&mm->ioctx_list_lock);
+ }
- return ioctx;
+ rcu_read_unlock();
+ return ctx;
}
/*
@@ -1215,19 +1232,14 @@ out:
static void io_destroy(struct kioctx *ioctx)
{
struct mm_struct *mm = current->mm;
- struct kioctx **tmp;
int was_dead;
/* delete the entry from the list is someone else hasn't already */
- write_lock(&mm->ioctx_list_lock);
+ spin_lock(&mm->ioctx_lock);
was_dead = ioctx->dead;
ioctx->dead = 1;
- for (tmp = &mm->ioctx_list; *tmp && *tmp != ioctx;
- tmp = &(*tmp)->next)
- ;
- if (*tmp)
- *tmp = ioctx->next;
- write_unlock(&mm->ioctx_list_lock);
+ hlist_del_rcu(&ioctx->list);
+ spin_unlock(&mm->ioctx_lock);
dprintk("aio_release(%p)\n", ioctx);
if (likely(!was_dead))
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index c16d9be1b017..3bbdb9d02376 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -79,9 +79,12 @@ int anon_inode_getfd(const char *name, const struct file_operations *fops,
if (IS_ERR(anon_inode_inode))
return -ENODEV;
+ if (fops->owner && !try_module_get(fops->owner))
+ return -ENOENT;
+
error = get_unused_fd_flags(flags);
if (error < 0)
- return error;
+ goto err_module;
fd = error;
/*
@@ -128,6 +131,8 @@ err_dput:
dput(dentry);
err_put_unused_fd:
put_unused_fd(fd);
+err_module:
+ module_put(fops->owner);
return error;
}
EXPORT_SYMBOL_GPL(anon_inode_getfd);
diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c
index c773680d5c60..e1734f2d6e26 100644
--- a/fs/autofs/inode.c
+++ b/fs/autofs/inode.c
@@ -251,13 +251,11 @@ struct inode *autofs_iget(struct super_block *sb, unsigned long ino)
inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
inode->i_nlink = 2;
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
- inode->i_blocks = 0;
if (ino == AUTOFS_ROOT_INO) {
inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
inode->i_op = &autofs_root_inode_operations;
inode->i_fop = &autofs_root_operations;
- inode->i_uid = inode->i_gid = 0; /* Changed in read_super */
goto done;
}
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 7b19802cfef4..cfc23e53b6f4 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -455,11 +455,7 @@ struct inode *autofs4_get_inode(struct super_block *sb,
if (sb->s_root) {
inode->i_uid = sb->s_root->d_inode->i_uid;
inode->i_gid = sb->s_root->d_inode->i_gid;
- } else {
- inode->i_uid = 0;
- inode->i_gid = 0;
}
- inode->i_blocks = 0;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
if (S_ISDIR(inf->mode)) {
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 5f1538c03b1b..a05287a23f62 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -132,11 +132,6 @@ static int bad_file_check_flags(int flags)
return -EIO;
}
-static int bad_file_dir_notify(struct file *file, unsigned long arg)
-{
- return -EIO;
-}
-
static int bad_file_flock(struct file *filp, int cmd, struct file_lock *fl)
{
return -EIO;
@@ -179,7 +174,6 @@ static const struct file_operations bad_file_ops =
.sendpage = bad_file_sendpage,
.get_unmapped_area = bad_file_get_unmapped_area,
.check_flags = bad_file_check_flags,
- .dir_notify = bad_file_dir_notify,
.flock = bad_file_flock,
.splice_write = bad_file_splice_write,
.splice_read = bad_file_splice_read,
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index b6dfee37c7b7..d06cb023ad02 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -378,7 +378,8 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
inode->i_size = 0;
inode->i_blocks = befs_sb->block_size / VFS_BLOCK_SIZE;
strncpy(befs_ino->i_data.symlink, raw_inode->data.symlink,
- BEFS_SYMLINK_LEN);
+ BEFS_SYMLINK_LEN - 1);
+ befs_ino->i_data.symlink[BEFS_SYMLINK_LEN - 1] = '\0';
} else {
int num_blks;
@@ -477,6 +478,8 @@ befs_follow_link(struct dentry *dentry, struct nameidata *nd)
kfree(link);
befs_error(sb, "Failed to read entire long symlink");
link = ERR_PTR(-EIO);
+ } else {
+ link[len - 1] = '\0';
}
} else {
link = befs_ino->i_data.symlink;
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index f1f3f4192a60..b639dcf7c778 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -95,92 +95,55 @@ static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, u
int has_dumped = 0;
unsigned long dump_start, dump_size;
struct user dump;
-#if defined(__alpha__)
+#ifdef __alpha__
# define START_DATA(u) (u.start_data)
-#elif defined(__arm__)
+#else
# define START_DATA(u) ((u.u_tsize << PAGE_SHIFT) + u.start_code)
-#elif defined(__sparc__)
-# define START_DATA(u) (u.u_tsize)
-#elif defined(__i386__) || defined(__mc68000__) || defined(__arch_um__)
-# define START_DATA(u) (u.u_tsize << PAGE_SHIFT)
#endif
-#ifdef __sparc__
-# define START_STACK(u) ((regs->u_regs[UREG_FP]) & ~(PAGE_SIZE - 1))
-#else
# define START_STACK(u) (u.start_stack)
-#endif
fs = get_fs();
set_fs(KERNEL_DS);
has_dumped = 1;
current->flags |= PF_DUMPCORE;
strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm));
-#ifndef __sparc__
dump.u_ar0 = offsetof(struct user, regs);
-#endif
dump.signal = signr;
aout_dump_thread(regs, &dump);
/* If the size of the dump file exceeds the rlimit, then see what would happen
if we wrote the stack, but not the data area. */
-#ifdef __sparc__
- if ((dump.u_dsize + dump.u_ssize) > limit)
- dump.u_dsize = 0;
-#else
if ((dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE > limit)
dump.u_dsize = 0;
-#endif
/* Make sure we have enough room to write the stack and data areas. */
-#ifdef __sparc__
- if (dump.u_ssize > limit)
- dump.u_ssize = 0;
-#else
if ((dump.u_ssize + 1) * PAGE_SIZE > limit)
dump.u_ssize = 0;
-#endif
/* make sure we actually have a data and stack area to dump */
set_fs(USER_DS);
-#ifdef __sparc__
- if (!access_ok(VERIFY_READ, (void __user *)START_DATA(dump), dump.u_dsize))
- dump.u_dsize = 0;
- if (!access_ok(VERIFY_READ, (void __user *)START_STACK(dump), dump.u_ssize))
- dump.u_ssize = 0;
-#else
if (!access_ok(VERIFY_READ, (void __user *)START_DATA(dump), dump.u_dsize << PAGE_SHIFT))
dump.u_dsize = 0;
if (!access_ok(VERIFY_READ, (void __user *)START_STACK(dump), dump.u_ssize << PAGE_SHIFT))
dump.u_ssize = 0;
-#endif
set_fs(KERNEL_DS);
/* struct user */
DUMP_WRITE(&dump,sizeof(dump));
/* Now dump all of the user data. Include malloced stuff as well */
-#ifndef __sparc__
DUMP_SEEK(PAGE_SIZE);
-#endif
/* now we start writing out the user space info */
set_fs(USER_DS);
/* Dump the data area */
if (dump.u_dsize != 0) {
dump_start = START_DATA(dump);
-#ifdef __sparc__
- dump_size = dump.u_dsize;
-#else
dump_size = dump.u_dsize << PAGE_SHIFT;
-#endif
DUMP_WRITE(dump_start,dump_size);
}
/* Now prepare to dump the stack area */
if (dump.u_ssize != 0) {
dump_start = START_STACK(dump);
-#ifdef __sparc__
- dump_size = dump.u_ssize;
-#else
dump_size = dump.u_ssize << PAGE_SHIFT;
-#endif
DUMP_WRITE(dump_start,dump_size);
}
/* Finally dump the task struct. Not be used by gdb, but could be useful */
@@ -205,29 +168,24 @@ static unsigned long __user *create_aout_tables(char __user *p, struct linux_bin
int envc = bprm->envc;
sp = (void __user *)((-(unsigned long)sizeof(char *)) & (unsigned long) p);
-#ifdef __sparc__
- /* This imposes the proper stack alignment for a new process. */
- sp = (void __user *) (((unsigned long) sp) & ~7);
- if ((envc+argc+3)&1) --sp;
-#endif
#ifdef __alpha__
/* whee.. test-programs are so much fun. */
put_user(0, --sp);
put_user(0, --sp);
if (bprm->loader) {
put_user(0, --sp);
- put_user(0x3eb, --sp);
+ put_user(1003, --sp);
put_user(bprm->loader, --sp);
- put_user(0x3ea, --sp);
+ put_user(1002, --sp);
}
put_user(bprm->exec, --sp);
- put_user(0x3e9, --sp);
+ put_user(1001, --sp);
#endif
sp -= envc+1;
envp = (char __user * __user *) sp;
sp -= argc+1;
argv = (char __user * __user *) sp;
-#if defined(__i386__) || defined(__mc68000__) || defined(__arm__) || defined(__arch_um__)
+#ifndef __alpha__
put_user((unsigned long) envp,--sp);
put_user((unsigned long) argv,--sp);
#endif
@@ -300,13 +258,8 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
return retval;
/* OK, This is the point of no return */
-#if defined(__alpha__)
+#ifdef __alpha__
SET_AOUT_PERSONALITY(bprm, ex);
-#elif defined(__sparc__)
- set_personality(PER_SUNOS);
-#if !defined(__sparc_v9__)
- memcpy(&current->thread.core_exec, &ex, sizeof(struct exec));
-#endif
#else
set_personality(PER_LINUX);
#endif
@@ -322,24 +275,6 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
install_exec_creds(bprm);
current->flags &= ~PF_FORKNOEXEC;
-#ifdef __sparc__
- if (N_MAGIC(ex) == NMAGIC) {
- loff_t pos = fd_offset;
- /* Fuck me plenty... */
- /* <AOL></AOL> */
- down_write(&current->mm->mmap_sem);
- error = do_brk(N_TXTADDR(ex), ex.a_text);
- up_write(&current->mm->mmap_sem);
- bprm->file->f_op->read(bprm->file, (char *) N_TXTADDR(ex),
- ex.a_text, &pos);
- down_write(&current->mm->mmap_sem);
- error = do_brk(N_DATADDR(ex), ex.a_data);
- up_write(&current->mm->mmap_sem);
- bprm->file->f_op->read(bprm->file, (char *) N_DATADDR(ex),
- ex.a_data, &pos);
- goto beyond_if;
- }
-#endif
if (N_MAGIC(ex) == OMAGIC) {
unsigned long text_addr, map_size;
@@ -347,7 +282,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
text_addr = N_TXTADDR(ex);
-#if defined(__alpha__) || defined(__sparc__)
+#ifdef __alpha__
pos = fd_offset;
map_size = ex.a_text+ex.a_data + PAGE_SIZE - 1;
#else
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index f2744ab4e5b3..e1158cb4fbd6 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -496,9 +496,6 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode)
if (inode) {
inode->i_mode = mode;
- inode->i_uid = 0;
- inode->i_gid = 0;
- inode->i_blocks = 0;
inode->i_atime = inode->i_mtime = inode->i_ctime =
current_fs_time(inode->i_sb);
}
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
index 19caf7c962ac..77ebc3c263d6 100644
--- a/fs/bio-integrity.c
+++ b/fs/bio-integrity.c
@@ -111,7 +111,7 @@ void bio_integrity_free(struct bio *bio, struct bio_set *bs)
&& bip->bip_buf != NULL)
kfree(bip->bip_buf);
- mempool_free(bip->bip_vec, bs->bvec_pools[bip->bip_pool]);
+ bvec_free_bs(bs, bip->bip_vec, bip->bip_pool);
mempool_free(bip, bs->bio_integrity_pool);
bio->bi_integrity = NULL;
diff --git a/fs/bio.c b/fs/bio.c
index df99c882b807..711cee103602 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -31,7 +31,11 @@
DEFINE_TRACE(block_split);
-static struct kmem_cache *bio_slab __read_mostly;
+/*
+ * Test patch to inline a certain number of bi_io_vec's inside the bio
+ * itself, to shrink a bio data allocation from two mempool calls to one
+ */
+#define BIO_INLINE_VECS 4
static mempool_t *bio_split_pool __read_mostly;
@@ -40,9 +44,8 @@ static mempool_t *bio_split_pool __read_mostly;
* break badly! cannot be bigger than what you can fit into an
* unsigned short
*/
-
#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) }
-static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
+struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES),
};
#undef BV
@@ -53,12 +56,121 @@ static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
*/
struct bio_set *fs_bio_set;
+/*
+ * Our slab pool management
+ */
+struct bio_slab {
+ struct kmem_cache *slab;
+ unsigned int slab_ref;
+ unsigned int slab_size;
+ char name[8];
+};
+static DEFINE_MUTEX(bio_slab_lock);
+static struct bio_slab *bio_slabs;
+static unsigned int bio_slab_nr, bio_slab_max;
+
+static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
+{
+ unsigned int sz = sizeof(struct bio) + extra_size;
+ struct kmem_cache *slab = NULL;
+ struct bio_slab *bslab;
+ unsigned int i, entry = -1;
+
+ mutex_lock(&bio_slab_lock);
+
+ i = 0;
+ while (i < bio_slab_nr) {
+ struct bio_slab *bslab = &bio_slabs[i];
+
+ if (!bslab->slab && entry == -1)
+ entry = i;
+ else if (bslab->slab_size == sz) {
+ slab = bslab->slab;
+ bslab->slab_ref++;
+ break;
+ }
+ i++;
+ }
+
+ if (slab)
+ goto out_unlock;
+
+ if (bio_slab_nr == bio_slab_max && entry == -1) {
+ bio_slab_max <<= 1;
+ bio_slabs = krealloc(bio_slabs,
+ bio_slab_max * sizeof(struct bio_slab),
+ GFP_KERNEL);
+ if (!bio_slabs)
+ goto out_unlock;
+ }
+ if (entry == -1)
+ entry = bio_slab_nr++;
+
+ bslab = &bio_slabs[entry];
+
+ snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry);
+ slab = kmem_cache_create(bslab->name, sz, 0, SLAB_HWCACHE_ALIGN, NULL);
+ if (!slab)
+ goto out_unlock;
+
+ printk("bio: create slab <%s> at %d\n", bslab->name, entry);
+ bslab->slab = slab;
+ bslab->slab_ref = 1;
+ bslab->slab_size = sz;
+out_unlock:
+ mutex_unlock(&bio_slab_lock);
+ return slab;
+}
+
+static void bio_put_slab(struct bio_set *bs)
+{
+ struct bio_slab *bslab = NULL;
+ unsigned int i;
+
+ mutex_lock(&bio_slab_lock);
+
+ for (i = 0; i < bio_slab_nr; i++) {
+ if (bs->bio_slab == bio_slabs[i].slab) {
+ bslab = &bio_slabs[i];
+ break;
+ }
+ }
+
+ if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n"))
+ goto out;
+
+ WARN_ON(!bslab->slab_ref);
+
+ if (--bslab->slab_ref)
+ goto out;
+
+ kmem_cache_destroy(bslab->slab);
+ bslab->slab = NULL;
+
+out:
+ mutex_unlock(&bio_slab_lock);
+}
+
unsigned int bvec_nr_vecs(unsigned short idx)
{
return bvec_slabs[idx].nr_vecs;
}
-struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct bio_set *bs)
+void bvec_free_bs(struct bio_set *bs, struct bio_vec *bv, unsigned int idx)
+{
+ BIO_BUG_ON(idx >= BIOVEC_NR_POOLS);
+
+ if (idx == BIOVEC_MAX_IDX)
+ mempool_free(bv, bs->bvec_pool);
+ else {
+ struct biovec_slab *bvs = bvec_slabs + idx;
+
+ kmem_cache_free(bvs->slab, bv);
+ }
+}
+
+struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx,
+ struct bio_set *bs)
{
struct bio_vec *bvl;
@@ -67,60 +179,85 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct
* If not, this is a bio_kmalloc() allocation and just do a
* kzalloc() for the exact number of vecs right away.
*/
- if (bs) {
+ if (!bs)
+ bvl = kmalloc(nr * sizeof(struct bio_vec), gfp_mask);
+
+ /*
+ * see comment near bvec_array define!
+ */
+ switch (nr) {
+ case 1:
+ *idx = 0;
+ break;
+ case 2 ... 4:
+ *idx = 1;
+ break;
+ case 5 ... 16:
+ *idx = 2;
+ break;
+ case 17 ... 64:
+ *idx = 3;
+ break;
+ case 65 ... 128:
+ *idx = 4;
+ break;
+ case 129 ... BIO_MAX_PAGES:
+ *idx = 5;
+ break;
+ default:
+ return NULL;
+ }
+
+ /*
+ * idx now points to the pool we want to allocate from. only the
+ * 1-vec entry pool is mempool backed.
+ */
+ if (*idx == BIOVEC_MAX_IDX) {
+fallback:
+ bvl = mempool_alloc(bs->bvec_pool, gfp_mask);
+ } else {
+ struct biovec_slab *bvs = bvec_slabs + *idx;
+ gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
+
/*
- * see comment near bvec_array define!
+ * Make this allocation restricted and don't dump info on
+ * allocation failures, since we'll fall back to the mempool
+ * in case of failure.
*/
- switch (nr) {
- case 1:
- *idx = 0;
- break;
- case 2 ... 4:
- *idx = 1;
- break;
- case 5 ... 16:
- *idx = 2;
- break;
- case 17 ... 64:
- *idx = 3;
- break;
- case 65 ... 128:
- *idx = 4;
- break;
- case 129 ... BIO_MAX_PAGES:
- *idx = 5;
- break;
- default:
- return NULL;
- }
+ __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
/*
- * idx now points to the pool we want to allocate from
+ * Try a slab allocation. If this fails and __GFP_WAIT
+ * is set, retry with the 1-entry mempool
*/
- bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask);
- if (bvl)
- memset(bvl, 0,
- bvec_nr_vecs(*idx) * sizeof(struct bio_vec));
- } else
- bvl = kzalloc(nr * sizeof(struct bio_vec), gfp_mask);
+ bvl = kmem_cache_alloc(bvs->slab, __gfp_mask);
+ if (unlikely(!bvl && (gfp_mask & __GFP_WAIT))) {
+ *idx = BIOVEC_MAX_IDX;
+ goto fallback;
+ }
+ }
return bvl;
}
-void bio_free(struct bio *bio, struct bio_set *bio_set)
+void bio_free(struct bio *bio, struct bio_set *bs)
{
- if (bio->bi_io_vec) {
- const int pool_idx = BIO_POOL_IDX(bio);
+ void *p;
- BIO_BUG_ON(pool_idx >= BIOVEC_NR_POOLS);
-
- mempool_free(bio->bi_io_vec, bio_set->bvec_pools[pool_idx]);
- }
+ if (bio_has_allocated_vec(bio))
+ bvec_free_bs(bs, bio->bi_io_vec, BIO_POOL_IDX(bio));
if (bio_integrity(bio))
- bio_integrity_free(bio, bio_set);
+ bio_integrity_free(bio, bs);
+
+ /*
+ * If we have front padding, adjust the bio pointer before freeing
+ */
+ p = bio;
+ if (bs->front_pad)
+ p -= bs->front_pad;
- mempool_free(bio, bio_set->bio_pool);
+ mempool_free(p, bs->bio_pool);
}
/*
@@ -133,7 +270,8 @@ static void bio_fs_destructor(struct bio *bio)
static void bio_kmalloc_destructor(struct bio *bio)
{
- kfree(bio->bi_io_vec);
+ if (bio_has_allocated_vec(bio))
+ kfree(bio->bi_io_vec);
kfree(bio);
}
@@ -157,16 +295,20 @@ void bio_init(struct bio *bio)
* for a &struct bio to become free. If a %NULL @bs is passed in, we will
* fall back to just using @kmalloc to allocate the required memory.
*
- * allocate bio and iovecs from the memory pools specified by the
- * bio_set structure, or @kmalloc if none given.
+ * Note that the caller must set ->bi_destructor on successful return
+ * of a bio, to do the appropriate freeing of the bio once the reference
+ * count drops to zero.
**/
struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
{
- struct bio *bio;
+ struct bio *bio = NULL;
+
+ if (bs) {
+ void *p = mempool_alloc(bs->bio_pool, gfp_mask);
- if (bs)
- bio = mempool_alloc(bs->bio_pool, gfp_mask);
- else
+ if (p)
+ bio = p + bs->front_pad;
+ } else
bio = kmalloc(sizeof(*bio), gfp_mask);
if (likely(bio)) {
@@ -176,7 +318,15 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
if (likely(nr_iovecs)) {
unsigned long uninitialized_var(idx);
- bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs);
+ if (nr_iovecs <= BIO_INLINE_VECS) {
+ idx = 0;
+ bvl = bio->bi_inline_vecs;
+ nr_iovecs = BIO_INLINE_VECS;
+ } else {
+ bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx,
+ bs);
+ nr_iovecs = bvec_nr_vecs(idx);
+ }
if (unlikely(!bvl)) {
if (bs)
mempool_free(bio, bs->bio_pool);
@@ -186,7 +336,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
goto out;
}
bio->bi_flags |= idx << BIO_POOL_OFFSET;
- bio->bi_max_vecs = bvec_nr_vecs(idx);
+ bio->bi_max_vecs = nr_iovecs;
}
bio->bi_io_vec = bvl;
}
@@ -1346,30 +1496,18 @@ EXPORT_SYMBOL(bio_sector_offset);
*/
static int biovec_create_pools(struct bio_set *bs, int pool_entries)
{
- int i;
+ struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX;
- for (i = 0; i < BIOVEC_NR_POOLS; i++) {
- struct biovec_slab *bp = bvec_slabs + i;
- mempool_t **bvp = bs->bvec_pools + i;
+ bs->bvec_pool = mempool_create_slab_pool(pool_entries, bp->slab);
+ if (!bs->bvec_pool)
+ return -ENOMEM;
- *bvp = mempool_create_slab_pool(pool_entries, bp->slab);
- if (!*bvp)
- return -ENOMEM;
- }
return 0;
}
static void biovec_free_pools(struct bio_set *bs)
{
- int i;
-
- for (i = 0; i < BIOVEC_NR_POOLS; i++) {
- mempool_t *bvp = bs->bvec_pools[i];
-
- if (bvp)
- mempool_destroy(bvp);
- }
-
+ mempool_destroy(bs->bvec_pool);
}
void bioset_free(struct bio_set *bs)
@@ -1379,25 +1517,49 @@ void bioset_free(struct bio_set *bs)
bioset_integrity_free(bs);
biovec_free_pools(bs);
+ bio_put_slab(bs);
kfree(bs);
}
-struct bio_set *bioset_create(int bio_pool_size, int bvec_pool_size)
+/**
+ * bioset_create - Create a bio_set
+ * @pool_size: Number of bio and bio_vecs to cache in the mempool
+ * @front_pad: Number of bytes to allocate in front of the returned bio
+ *
+ * Description:
+ * Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller
+ * to ask for a number of bytes to be allocated in front of the bio.
+ * Front pad allocation is useful for embedding the bio inside
+ * another structure, to avoid allocating extra data to go with the bio.
+ * Note that the bio must be embedded at the END of that structure always,
+ * or things will break badly.
+ */
+struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
{
- struct bio_set *bs = kzalloc(sizeof(*bs), GFP_KERNEL);
+ unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
+ struct bio_set *bs;
+ bs = kzalloc(sizeof(*bs), GFP_KERNEL);
if (!bs)
return NULL;
- bs->bio_pool = mempool_create_slab_pool(bio_pool_size, bio_slab);
+ bs->front_pad = front_pad;
+
+ bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad);
+ if (!bs->bio_slab) {
+ kfree(bs);
+ return NULL;
+ }
+
+ bs->bio_pool = mempool_create_slab_pool(pool_size, bs->bio_slab);
if (!bs->bio_pool)
goto bad;
- if (bioset_integrity_create(bs, bio_pool_size))
+ if (bioset_integrity_create(bs, pool_size))
goto bad;
- if (!biovec_create_pools(bs, bvec_pool_size))
+ if (!biovec_create_pools(bs, pool_size))
return bs;
bad:
@@ -1421,12 +1583,16 @@ static void __init biovec_init_slabs(void)
static int __init init_bio(void)
{
- bio_slab = KMEM_CACHE(bio, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
+ bio_slab_max = 2;
+ bio_slab_nr = 0;
+ bio_slabs = kzalloc(bio_slab_max * sizeof(struct bio_slab), GFP_KERNEL);
+ if (!bio_slabs)
+ panic("bio: can't allocate bios\n");
bio_integrity_init_slab();
biovec_init_slabs();
- fs_bio_set = bioset_create(BIO_POOL_SIZE, 2);
+ fs_bio_set = bioset_create(BIO_POOL_SIZE, 0);
if (!fs_bio_set)
panic("bio: can't allocate bios\n");
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 99e0ae1a4c78..349a26c10001 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -326,12 +326,13 @@ static struct file_system_type bd_type = {
.kill_sb = kill_anon_super,
};
-static struct vfsmount *bd_mnt __read_mostly;
-struct super_block *blockdev_superblock;
+struct super_block *blockdev_superblock __read_mostly;
void __init bdev_cache_init(void)
{
int err;
+ struct vfsmount *bd_mnt;
+
bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
SLAB_MEM_SPREAD|SLAB_PANIC),
@@ -373,7 +374,7 @@ struct block_device *bdget(dev_t dev)
struct block_device *bdev;
struct inode *inode;
- inode = iget5_locked(bd_mnt->mnt_sb, hash(dev),
+ inode = iget5_locked(blockdev_superblock, hash(dev),
bdev_test, bdev_set, &dev);
if (!inode)
@@ -463,7 +464,7 @@ void bd_forget(struct inode *inode)
spin_lock(&bdev_lock);
if (inode->i_bdev) {
- if (inode->i_sb != blockdev_superblock)
+ if (!sb_is_blkdev_sb(inode->i_sb))
bdev = inode->i_bdev;
__bd_forget(inode);
}
diff --git a/fs/buffer.c b/fs/buffer.c
index 10179cfa1152..a13f09b696f7 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -99,10 +99,18 @@ __clear_page_buffers(struct page *page)
page_cache_release(page);
}
+
+static int quiet_error(struct buffer_head *bh)
+{
+ if (!test_bit(BH_Quiet, &bh->b_state) && printk_ratelimit())
+ return 0;
+ return 1;
+}
+
+
static void buffer_io_error(struct buffer_head *bh)
{
char b[BDEVNAME_SIZE];
-
printk(KERN_ERR "Buffer I/O error on device %s, logical block %Lu\n",
bdevname(bh->b_bdev, b),
(unsigned long long)bh->b_blocknr);
@@ -144,7 +152,7 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
if (uptodate) {
set_buffer_uptodate(bh);
} else {
- if (!buffer_eopnotsupp(bh) && printk_ratelimit()) {
+ if (!buffer_eopnotsupp(bh) && !quiet_error(bh)) {
buffer_io_error(bh);
printk(KERN_WARNING "lost page write due to "
"I/O error on %s\n",
@@ -394,7 +402,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
set_buffer_uptodate(bh);
} else {
clear_buffer_uptodate(bh);
- if (printk_ratelimit())
+ if (!quiet_error(bh))
buffer_io_error(bh);
SetPageError(page);
}
@@ -455,7 +463,7 @@ static void end_buffer_async_write(struct buffer_head *bh, int uptodate)
if (uptodate) {
set_buffer_uptodate(bh);
} else {
- if (printk_ratelimit()) {
+ if (!quiet_error(bh)) {
buffer_io_error(bh);
printk(KERN_WARNING "lost page write due to "
"I/O error on %s\n",
@@ -1988,7 +1996,7 @@ int block_write_begin(struct file *file, struct address_space *mapping,
page = *pagep;
if (page == NULL) {
ownpage = 1;
- page = __grab_cache_page(mapping, index);
+ page = grab_cache_page_write_begin(mapping, index, flags);
if (!page) {
status = -ENOMEM;
goto out;
@@ -2494,7 +2502,7 @@ int nobh_write_begin(struct file *file, struct address_space *mapping,
from = pos & (PAGE_CACHE_SIZE - 1);
to = from + len;
- page = __grab_cache_page(mapping, index);
+ page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
*pagep = page;
@@ -2913,6 +2921,9 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
set_bit(BH_Eopnotsupp, &bh->b_state);
}
+ if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags)))
+ set_bit(BH_Quiet, &bh->b_state);
+
bh->b_end_io(bh, test_bit(BIO_UPTODATE, &bio->bi_flags));
bio_put(bio);
}
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index 6ba43fb346fb..9948c0030e86 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -5,7 +5,7 @@ obj-$(CONFIG_CIFS) += cifs.o
cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o \
- md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o fcntl.o \
+ md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
readdir.o ioctl.o sess.o export.o cifsacl.o
cifs-$(CONFIG_CIFS_UPCALL) += cifs_spnego.o
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 0005a194a75c..13ea53251dcf 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -747,7 +747,6 @@ const struct file_operations cifs_file_ops = {
#endif /* CONFIG_CIFS_POSIX */
#ifdef CONFIG_CIFS_EXPERIMENTAL
- .dir_notify = cifs_dir_notify,
.setlease = cifs_setlease,
#endif /* CONFIG_CIFS_EXPERIMENTAL */
};
@@ -768,7 +767,6 @@ const struct file_operations cifs_file_direct_ops = {
#endif /* CONFIG_CIFS_POSIX */
.llseek = cifs_llseek,
#ifdef CONFIG_CIFS_EXPERIMENTAL
- .dir_notify = cifs_dir_notify,
.setlease = cifs_setlease,
#endif /* CONFIG_CIFS_EXPERIMENTAL */
};
@@ -789,7 +787,6 @@ const struct file_operations cifs_file_nobrl_ops = {
#endif /* CONFIG_CIFS_POSIX */
#ifdef CONFIG_CIFS_EXPERIMENTAL
- .dir_notify = cifs_dir_notify,
.setlease = cifs_setlease,
#endif /* CONFIG_CIFS_EXPERIMENTAL */
};
@@ -809,7 +806,6 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
#endif /* CONFIG_CIFS_POSIX */
.llseek = cifs_llseek,
#ifdef CONFIG_CIFS_EXPERIMENTAL
- .dir_notify = cifs_dir_notify,
.setlease = cifs_setlease,
#endif /* CONFIG_CIFS_EXPERIMENTAL */
};
@@ -818,9 +814,6 @@ const struct file_operations cifs_dir_ops = {
.readdir = cifs_readdir,
.release = cifs_closedir,
.read = generic_read_dir,
-#ifdef CONFIG_CIFS_EXPERIMENTAL
- .dir_notify = cifs_dir_notify,
-#endif /* CONFIG_CIFS_EXPERIMENTAL */
.unlocked_ioctl = cifs_ioctl,
.llseek = generic_file_llseek,
};
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 2ce04c73d74e..7ac481841f87 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -76,7 +76,6 @@ extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
extern const struct file_operations cifs_dir_ops;
extern int cifs_dir_open(struct inode *inode, struct file *file);
extern int cifs_readdir(struct file *file, void *direntry, filldir_t filldir);
-extern int cifs_dir_notify(struct file *, unsigned long arg);
/* Functions related to dir entries */
extern struct dentry_operations cifs_dentry_ops;
diff --git a/fs/cifs/fcntl.c b/fs/cifs/fcntl.c
deleted file mode 100644
index 5a57581eb4b2..000000000000
--- a/fs/cifs/fcntl.c
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * fs/cifs/fcntl.c
- *
- * vfs operations that deal with the file control API
- *
- * Copyright (C) International Business Machines Corp., 2003,2004
- * Author(s): Steve French (sfrench@us.ibm.com)
- *
- * This library is free software; you can redistribute it and/or modify
- * it under the terms of the GNU Lesser General Public License as published
- * by the Free Software Foundation; either version 2.1 of the License, or
- * (at your option) any later version.
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
- * the GNU Lesser General Public License for more details.
- *
- * You should have received a copy of the GNU Lesser General Public License
- * along with this library; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-#include <linux/fs.h>
-#include <linux/stat.h>
-#include <linux/fcntl.h>
-#include "cifsglob.h"
-#include "cifsproto.h"
-#include "cifs_unicode.h"
-#include "cifs_debug.h"
-#include "cifsfs.h"
-
-static __u32 convert_to_cifs_notify_flags(unsigned long fcntl_notify_flags)
-{
- __u32 cifs_ntfy_flags = 0;
-
- /* No way on Linux VFS to ask to monitor xattr
- changes (and no stream support either */
- if (fcntl_notify_flags & DN_ACCESS)
- cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_LAST_ACCESS;
- if (fcntl_notify_flags & DN_MODIFY) {
- /* What does this mean on directories? */
- cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_LAST_WRITE |
- FILE_NOTIFY_CHANGE_SIZE;
- }
- if (fcntl_notify_flags & DN_CREATE) {
- cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_CREATION |
- FILE_NOTIFY_CHANGE_LAST_WRITE;
- }
- if (fcntl_notify_flags & DN_DELETE)
- cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_LAST_WRITE;
- if (fcntl_notify_flags & DN_RENAME) {
- /* BB review this - checking various server behaviors */
- cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_DIR_NAME |
- FILE_NOTIFY_CHANGE_FILE_NAME;
- }
- if (fcntl_notify_flags & DN_ATTRIB) {
- cifs_ntfy_flags |= FILE_NOTIFY_CHANGE_SECURITY |
- FILE_NOTIFY_CHANGE_ATTRIBUTES;
- }
-/* if (fcntl_notify_flags & DN_MULTISHOT) {
- cifs_ntfy_flags |= ;
- } */ /* BB fixme - not sure how to handle this with CIFS yet */
-
- return cifs_ntfy_flags;
-}
-
-int cifs_dir_notify(struct file *file, unsigned long arg)
-{
- int xid;
- int rc = -EINVAL;
- int oplock = 0;
- struct cifs_sb_info *cifs_sb;
- struct cifsTconInfo *pTcon;
- char *full_path = NULL;
- __u32 filter = FILE_NOTIFY_CHANGE_NAME | FILE_NOTIFY_CHANGE_ATTRIBUTES;
- __u16 netfid;
-
- if (experimEnabled == 0)
- return 0;
-
- xid = GetXid();
- cifs_sb = CIFS_SB(file->f_path.dentry->d_sb);
- pTcon = cifs_sb->tcon;
-
- full_path = build_path_from_dentry(file->f_path.dentry);
-
- if (full_path == NULL) {
- rc = -ENOMEM;
- } else {
- cFYI(1, ("dir notify on file %s Arg 0x%lx", full_path, arg));
- rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN,
- GENERIC_READ | SYNCHRONIZE, 0 /* create options */,
- &netfid, &oplock, NULL, cifs_sb->local_nls,
- cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
- /* BB fixme - add this handle to a notify handle list */
- if (rc) {
- cFYI(1, ("Could not open directory for notify"));
- } else {
- filter = convert_to_cifs_notify_flags(arg);
- if (filter != 0) {
- rc = CIFSSMBNotify(xid, pTcon,
- 0 /* no subdirs */, netfid,
- filter, file, arg & DN_MULTISHOT,
- cifs_sb->local_nls);
- } else {
- rc = -EINVAL;
- }
- /* BB add code to close file eventually (at unmount
- it would close automatically but may be a way
- to do it easily when inode freed or when
- notify info is cleared/changed */
- cFYI(1, ("notify rc %d", rc));
- }
- }
-
- FreeXid(xid);
- return rc;
-}
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index b1e1fc6a6e6a..12bb656fbe75 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2074,7 +2074,7 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping,
cFYI(1, ("write_begin from %lld len %d", (long long)pos, len));
- page = __grab_cache_page(mapping, index);
+ page = grab_cache_page_write_begin(mapping, index, flags);
if (!page) {
rc = -ENOMEM;
goto out;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index f247da9f4edc..5ab9896fdcb2 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1641,7 +1641,7 @@ do_expand:
i_size_write(inode, offset);
spin_unlock(&inode->i_lock);
out_truncate:
- if (inode->i_op && inode->i_op->truncate)
+ if (inode->i_op->truncate)
inode->i_op->truncate(inode);
return 0;
out_sig:
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 466303db2df6..6a347fbc998a 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -201,8 +201,7 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
{
struct file *host_file;
- struct dentry *host_dentry;
- struct inode *host_inode, *coda_inode = coda_dentry->d_inode;
+ struct inode *coda_inode = coda_dentry->d_inode;
struct coda_file_info *cfi;
int err = 0;
@@ -214,14 +213,7 @@ int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
host_file = cfi->cfi_container;
- if (host_file->f_op && host_file->f_op->fsync) {
- host_dentry = host_file->f_path.dentry;
- host_inode = host_dentry->d_inode;
- mutex_lock(&host_inode->i_mutex);
- err = host_file->f_op->fsync(host_file, host_dentry, datasync);
- mutex_unlock(&host_inode->i_mutex);
- }
-
+ err = vfs_fsync(host_file, host_file->f_path.dentry, datasync);
if ( !err && !datasync ) {
lock_kernel();
err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode));
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 4803ccc94480..5d349d38e056 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -117,8 +117,6 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
static inline void set_default_inode_attr(struct inode * inode, mode_t mode)
{
inode->i_mode = mode;
- inode->i_uid = 0;
- inode->i_gid = 0;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
}
@@ -136,7 +134,6 @@ struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd)
{
struct inode * inode = new_inode(configfs_sb);
if (inode) {
- inode->i_blocks = 0;
inode->i_mapping->a_ops = &configfs_aops;
inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
inode->i_op = &configfs_inode_operations;
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index f40423eb1a14..a07338d2d140 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -83,8 +83,6 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
inode->i_op = &page_symlink_inode_operations;
inode->i_data.a_ops = &cramfs_aops;
} else {
- inode->i_size = 0;
- inode->i_blocks = 0;
init_special_inode(inode, inode->i_mode,
old_decode_dev(cramfs_inode->size));
}
diff --git a/fs/dcache.c b/fs/dcache.c
index a1d86c7f3e66..e88c23b85a32 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -34,7 +34,6 @@
#include <linux/bootmem.h>
#include "internal.h"
-
int sysctl_vfs_cache_pressure __read_mostly = 100;
EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
@@ -948,9 +947,6 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name)
dentry->d_op = NULL;
dentry->d_fsdata = NULL;
dentry->d_mounted = 0;
-#ifdef CONFIG_PROFILING
- dentry->d_cookie = NULL;
-#endif
INIT_HLIST_NODE(&dentry->d_hash);
INIT_LIST_HEAD(&dentry->d_lru);
INIT_LIST_HEAD(&dentry->d_subdirs);
@@ -1336,7 +1332,7 @@ err_out:
*
* Searches the children of the parent dentry for the name in question. If
* the dentry is found its reference count is incremented and the dentry
- * is returned. The caller must use d_put to free the entry when it has
+ * is returned. The caller must use dput to free the entry when it has
* finished using it. %NULL is returned on failure.
*
* __d_lookup is dcache_lock free. The hash list is protected using RCU.
@@ -1620,8 +1616,11 @@ static void switch_names(struct dentry *dentry, struct dentry *target)
*/
memcpy(dentry->d_iname, target->d_name.name,
target->d_name.len + 1);
+ dentry->d_name.len = target->d_name.len;
+ return;
}
}
+ do_switch(dentry->d_name.len, target->d_name.len);
}
/*
@@ -1681,7 +1680,6 @@ already_unhashed:
/* Switch the names.. */
switch_names(dentry, target);
- do_switch(dentry->d_name.len, target->d_name.len);
do_switch(dentry->d_name.hash, target->d_name.hash);
/* ... and switch the parents */
@@ -1791,7 +1789,6 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
struct dentry *dparent, *aparent;
switch_names(dentry, anon);
- do_switch(dentry->d_name.len, anon->d_name.len);
do_switch(dentry->d_name.hash, anon->d_name.hash);
dparent = dentry->d_parent;
@@ -1911,7 +1908,8 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
* Convert a dentry into an ASCII path name. If the entry has been deleted
* the string " (deleted)" is appended. Note that this is ambiguous.
*
- * Returns the buffer or an error code if the path was too long.
+ * Returns a pointer into the buffer or an error code if the
+ * path was too long.
*
* "buflen" should be positive. Caller holds the dcache_lock.
*
@@ -1987,7 +1985,10 @@ Elong:
* Convert a dentry into an ASCII path name. If the entry has been deleted
* the string " (deleted)" is appended. Note that this is ambiguous.
*
- * Returns the buffer or an error code if the path was too long.
+ * Returns a pointer into the buffer or an error code if the path was
+ * too long. Note: Callers should use the returned pointer, not the passed
+ * in buffer, to use the name! The implementation often starts at an offset
+ * into the buffer, and may leave 0 bytes at the start.
*
* "buflen" should be positive.
*/
@@ -2313,9 +2314,6 @@ static void __init dcache_init(void)
/* SLAB cache for __getname() consumers */
struct kmem_cache *names_cachep __read_mostly;
-/* SLAB cache for file structures */
-struct kmem_cache *filp_cachep __read_mostly;
-
EXPORT_SYMBOL(d_genocide);
void __init vfs_caches_init_early(void)
@@ -2337,9 +2335,6 @@ void __init vfs_caches_init(unsigned long mempages)
names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
- filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
-
dcache_init();
inode_init();
files_init(mempages);
diff --git a/fs/dcookies.c b/fs/dcookies.c
index 855d4b1d619a..180e9fec4ad8 100644
--- a/fs/dcookies.c
+++ b/fs/dcookies.c
@@ -93,10 +93,15 @@ static struct dcookie_struct *alloc_dcookie(struct path *path)
{
struct dcookie_struct *dcs = kmem_cache_alloc(dcookie_cache,
GFP_KERNEL);
+ struct dentry *d;
if (!dcs)
return NULL;
- path->dentry->d_cookie = dcs;
+ d = path->dentry;
+ spin_lock(&d->d_lock);
+ d->d_flags |= DCACHE_COOKIE;
+ spin_unlock(&d->d_lock);
+
dcs->path = *path;
path_get(path);
hash_dcookie(dcs);
@@ -119,14 +124,14 @@ int get_dcookie(struct path *path, unsigned long *cookie)
goto out;
}
- dcs = path->dentry->d_cookie;
-
- if (!dcs)
+ if (path->dentry->d_flags & DCACHE_COOKIE) {
+ dcs = find_dcookie((unsigned long)path->dentry);
+ } else {
dcs = alloc_dcookie(path);
-
- if (!dcs) {
- err = -ENOMEM;
- goto out;
+ if (!dcs) {
+ err = -ENOMEM;
+ goto out;
+ }
}
*cookie = dcookie_value(dcs);
@@ -251,7 +256,12 @@ out_kmem:
static void free_dcookie(struct dcookie_struct * dcs)
{
- dcs->path.dentry->d_cookie = NULL;
+ struct dentry *d = dcs->path.dentry;
+
+ spin_lock(&d->d_lock);
+ d->d_flags &= ~DCACHE_COOKIE;
+ spin_unlock(&d->d_lock);
+
path_put(&dcs->path);
kmem_cache_free(dcookie_cache, dcs);
}
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 3dbe2169cf36..81ae9ea3c6e1 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -37,9 +37,6 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d
if (inode) {
inode->i_mode = mode;
- inode->i_uid = 0;
- inode->i_gid = 0;
- inode->i_blocks = 0;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
switch (mode & S_IFMT) {
default:
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 5d61b7c06e13..5f3231b9633f 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -27,25 +27,32 @@
#define DEVPTS_SUPER_MAGIC 0x1cd1
#define DEVPTS_DEFAULT_MODE 0600
+/*
+ * ptmx is a new node in /dev/pts and will be unused in legacy (single-
+ * instance) mode. To prevent surprises in user space, set permissions of
+ * ptmx to 0. Use 'chmod' or remount with '-o ptmxmode' to set meaningful
+ * permissions.
+ */
+#define DEVPTS_DEFAULT_PTMX_MODE 0000
#define PTMX_MINOR 2
extern int pty_limit; /* Config limit on Unix98 ptys */
-static DEFINE_IDA(allocated_ptys);
static DEFINE_MUTEX(allocated_ptys_lock);
static struct vfsmount *devpts_mnt;
-static struct dentry *devpts_root;
-static struct {
+struct pts_mount_opts {
int setuid;
int setgid;
uid_t uid;
gid_t gid;
umode_t mode;
-} config = {.mode = DEVPTS_DEFAULT_MODE};
+ umode_t ptmxmode;
+ int newinstance;
+};
enum {
- Opt_uid, Opt_gid, Opt_mode,
+ Opt_uid, Opt_gid, Opt_mode, Opt_ptmxmode, Opt_newinstance,
Opt_err
};
@@ -53,18 +60,50 @@ static const match_table_t tokens = {
{Opt_uid, "uid=%u"},
{Opt_gid, "gid=%u"},
{Opt_mode, "mode=%o"},
+#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
+ {Opt_ptmxmode, "ptmxmode=%o"},
+ {Opt_newinstance, "newinstance"},
+#endif
{Opt_err, NULL}
};
-static int devpts_remount(struct super_block *sb, int *flags, char *data)
+struct pts_fs_info {
+ struct ida allocated_ptys;
+ struct pts_mount_opts mount_opts;
+ struct dentry *ptmx_dentry;
+};
+
+static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb)
+{
+ return sb->s_fs_info;
+}
+
+static inline struct super_block *pts_sb_from_inode(struct inode *inode)
+{
+#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
+ if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
+ return inode->i_sb;
+#endif
+ return devpts_mnt->mnt_sb;
+}
+
+#define PARSE_MOUNT 0
+#define PARSE_REMOUNT 1
+
+static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts)
{
char *p;
- config.setuid = 0;
- config.setgid = 0;
- config.uid = 0;
- config.gid = 0;
- config.mode = DEVPTS_DEFAULT_MODE;
+ opts->setuid = 0;
+ opts->setgid = 0;
+ opts->uid = 0;
+ opts->gid = 0;
+ opts->mode = DEVPTS_DEFAULT_MODE;
+ opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
+
+ /* newinstance makes sense only on initial mount */
+ if (op == PARSE_MOUNT)
+ opts->newinstance = 0;
while ((p = strsep(&data, ",")) != NULL) {
substring_t args[MAX_OPT_ARGS];
@@ -79,20 +118,32 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data)
case Opt_uid:
if (match_int(&args[0], &option))
return -EINVAL;
- config.uid = option;
- config.setuid = 1;
+ opts->uid = option;
+ opts->setuid = 1;
break;
case Opt_gid:
if (match_int(&args[0], &option))
return -EINVAL;
- config.gid = option;
- config.setgid = 1;
+ opts->gid = option;
+ opts->setgid = 1;
break;
case Opt_mode:
if (match_octal(&args[0], &option))
return -EINVAL;
- config.mode = option & S_IALLUGO;
+ opts->mode = option & S_IALLUGO;
+ break;
+#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
+ case Opt_ptmxmode:
+ if (match_octal(&args[0], &option))
+ return -EINVAL;
+ opts->ptmxmode = option & S_IALLUGO;
+ break;
+ case Opt_newinstance:
+ /* newinstance makes sense only on initial mount */
+ if (op == PARSE_MOUNT)
+ opts->newinstance = 1;
break;
+#endif
default:
printk(KERN_ERR "devpts: called with bogus options\n");
return -EINVAL;
@@ -102,13 +153,106 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data)
return 0;
}
+#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
+static int mknod_ptmx(struct super_block *sb)
+{
+ int mode;
+ int rc = -ENOMEM;
+ struct dentry *dentry;
+ struct inode *inode;
+ struct dentry *root = sb->s_root;
+ struct pts_fs_info *fsi = DEVPTS_SB(sb);
+ struct pts_mount_opts *opts = &fsi->mount_opts;
+
+ mutex_lock(&root->d_inode->i_mutex);
+
+ /* If we have already created ptmx node, return */
+ if (fsi->ptmx_dentry) {
+ rc = 0;
+ goto out;
+ }
+
+ dentry = d_alloc_name(root, "ptmx");
+ if (!dentry) {
+ printk(KERN_NOTICE "Unable to alloc dentry for ptmx node\n");
+ goto out;
+ }
+
+ /*
+ * Create a new 'ptmx' node in this mount of devpts.
+ */
+ inode = new_inode(sb);
+ if (!inode) {
+ printk(KERN_ERR "Unable to alloc inode for ptmx node\n");
+ dput(dentry);
+ goto out;
+ }
+
+ inode->i_ino = 2;
+ inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+
+ mode = S_IFCHR|opts->ptmxmode;
+ init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 2));
+
+ d_add(dentry, inode);
+
+ fsi->ptmx_dentry = dentry;
+ rc = 0;
+
+ printk(KERN_DEBUG "Created ptmx node in devpts ino %lu\n",
+ inode->i_ino);
+out:
+ mutex_unlock(&root->d_inode->i_mutex);
+ return rc;
+}
+
+static void update_ptmx_mode(struct pts_fs_info *fsi)
+{
+ struct inode *inode;
+ if (fsi->ptmx_dentry) {
+ inode = fsi->ptmx_dentry->d_inode;
+ inode->i_mode = S_IFCHR|fsi->mount_opts.ptmxmode;
+ }
+}
+#else
+static inline void update_ptmx_mode(struct pts_fs_info *fsi)
+{
+ return;
+}
+#endif
+
+static int devpts_remount(struct super_block *sb, int *flags, char *data)
+{
+ int err;
+ struct pts_fs_info *fsi = DEVPTS_SB(sb);
+ struct pts_mount_opts *opts = &fsi->mount_opts;
+
+ err = parse_mount_options(data, PARSE_REMOUNT, opts);
+
+ /*
+ * parse_mount_options() restores options to default values
+ * before parsing and may have changed ptmxmode. So, update the
+ * mode in the inode too. Bogus options don't fail the remount,
+ * so do this even on error return.
+ */
+ update_ptmx_mode(fsi);
+
+ return err;
+}
+
static int devpts_show_options(struct seq_file *seq, struct vfsmount *vfs)
{
- if (config.setuid)
- seq_printf(seq, ",uid=%u", config.uid);
- if (config.setgid)
- seq_printf(seq, ",gid=%u", config.gid);
- seq_printf(seq, ",mode=%03o", config.mode);
+ struct pts_fs_info *fsi = DEVPTS_SB(vfs->mnt_sb);
+ struct pts_mount_opts *opts = &fsi->mount_opts;
+
+ if (opts->setuid)
+ seq_printf(seq, ",uid=%u", opts->uid);
+ if (opts->setgid)
+ seq_printf(seq, ",gid=%u", opts->gid);
+ seq_printf(seq, ",mode=%03o", opts->mode);
+#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
+ seq_printf(seq, ",ptmxmode=%03o", opts->ptmxmode);
+#endif
return 0;
}
@@ -119,10 +263,25 @@ static const struct super_operations devpts_sops = {
.show_options = devpts_show_options,
};
+/*
+ * Allocate zeroed per-superblock devpts state, initialise its pty index
+ * allocator and set the default mode/ptmxmode mount options.
+ *
+ * Return: the new pts_fs_info, or NULL if the allocation failed.
+ */
+static void *new_pts_fs_info(void)
+{
+	struct pts_fs_info *info = kzalloc(sizeof(struct pts_fs_info),
+					   GFP_KERNEL);
+
+	if (info) {
+		ida_init(&info->allocated_ptys);
+		info->mount_opts.mode = DEVPTS_DEFAULT_MODE;
+		info->mount_opts.ptmxmode = DEVPTS_DEFAULT_PTMX_MODE;
+	}
+
+	return info;
+}
+
static int
devpts_fill_super(struct super_block *s, void *data, int silent)
{
- struct inode * inode;
+ struct inode *inode;
s->s_blocksize = 1024;
s->s_blocksize_bits = 10;
@@ -130,39 +289,240 @@ devpts_fill_super(struct super_block *s, void *data, int silent)
s->s_op = &devpts_sops;
s->s_time_gran = 1;
+ s->s_fs_info = new_pts_fs_info();
+ if (!s->s_fs_info)
+ goto fail;
+
inode = new_inode(s);
if (!inode)
- goto fail;
+ goto free_fsi;
inode->i_ino = 1;
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
- inode->i_blocks = 0;
- inode->i_uid = inode->i_gid = 0;
inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR;
inode->i_op = &simple_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
inode->i_nlink = 2;
- devpts_root = s->s_root = d_alloc_root(inode);
+ s->s_root = d_alloc_root(inode);
if (s->s_root)
return 0;
-
- printk("devpts: get root dentry failed\n");
+
+ printk(KERN_ERR "devpts: get root dentry failed\n");
iput(inode);
+
+free_fsi:
+ kfree(s->s_fs_info);
fail:
return -ENOMEM;
}
+#ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
+/*
+ * Superblock test callback passed to sget(): true only for the
+ * superblock behind devpts_mnt. Matches nothing while devpts_mnt is
+ * not set. @p is unused.
+ */
+static int compare_init_pts_sb(struct super_block *s, void *p)
+{
+	return devpts_mnt ? devpts_mnt->mnt_sb == s : 0;
+}
+
+/*
+ * Parse the mount options in @data into @opts without modifying @data.
+ *
+ * devpts parses its options twice during a mount because it supports
+ * two modes of operation. The first pass happens in devpts_get_sb() to
+ * decide between single-instance and multi-instance mode; the second
+ * pass happens in devpts_remount() or new_pts_mount(), depending on the
+ * mode chosen.
+ *
+ * Parsing modifies the option string, which would make a later parse of
+ * the same buffer incorrect, so the parse is run on a private copy.
+ *
+ * Return: 0 on success (or when @data is NULL), -errno on error.
+ */
+static int safe_parse_mount_options(void *data, struct pts_mount_opts *opts)
+{
+	char *copy;
+	int err;
+
+	if (data == NULL)
+		return 0;
+
+	/* Use kstrdup() ? */
+	copy = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (copy == NULL)
+		return -ENOMEM;
+
+	memcpy(copy, data, PAGE_SIZE);
+	err = parse_mount_options(copy, PARSE_MOUNT, opts);
+	kfree(copy);
+
+	return err;
+}
+
+/*
+ * Mount a new (private) instance of devpts. PTYs created in this
+ * instance are independent of the PTYs in other devpts instances.
+ *
+ * A fresh superblock is obtained with get_sb_nodev(), the mount options
+ * in @data are parsed into it and its ptmx node is created.
+ *
+ * Return: 0 on success, -errno on error.
+ */
+static int new_pts_mount(struct file_system_type *fs_type, int flags,
+		void *data, struct vfsmount *mnt)
+{
+	int err;
+	struct pts_fs_info *fsi;
+	struct pts_mount_opts *opts;
+
+	printk(KERN_NOTICE "devpts: newinstance mount\n");
+
+	err = get_sb_nodev(fs_type, flags, data, devpts_fill_super, mnt);
+	if (err)
+		return err;
+
+	fsi = DEVPTS_SB(mnt->mnt_sb);
+	opts = &fsi->mount_opts;
+
+	err = parse_mount_options(data, PARSE_MOUNT, opts);
+	if (err)
+		goto fail;
+
+	err = mknod_ptmx(mnt->mnt_sb);
+	if (err)
+		goto fail;
+
+	return 0;
+
+fail:
+	dput(mnt->mnt_sb->s_root);
+	/*
+	 * The superblock's s_umount is still write-locked at this point
+	 * (it is only dropped by the caller after a successful mount), so
+	 * release it before giving up our reference, exactly as the error
+	 * path in get_init_pts_sb() does. Calling deactivate_super() with
+	 * the lock held would leave it locked / self-deadlock.
+	 */
+	up_write(&mnt->mnt_sb->s_umount);
+	deactivate_super(mnt->mnt_sb);
+	return err;
+}
+
+/*
+ * Tell whether the 'newinstance' mount option is present in @data.
+ *
+ * The options are parsed on a scratch copy (see
+ * safe_parse_mount_options()), so @data is left untouched.
+ *
+ * Return: the parser's error code if parsing failed (eg: invalid mount
+ *	   options specified)
+ *	 : 1 if 'newinstance' mount option was specified
+ *	 : 0 if 'newinstance' mount option was NOT specified, or @data
+ *	   is NULL
+ */
+static int is_new_instance_mount(void *data)
+{
+	struct pts_mount_opts opts;
+	int rc;
+
+	if (!data)
+		return 0;
+
+	rc = safe_parse_mount_options(data, &opts);
+	if (rc)
+		return rc;
+
+	return opts.newinstance;
+}
+
+/*
+ * get_init_pts_sb()
+ *
+ * Needed to support multiple-namespace semantics in devpts while
+ * keeping the existing 'single-namespace' semantics backward
+ * compatible: every mount of devpts WITHOUT the 'newinstance' option
+ * has to bind to the initial kernel mount, the way get_sb_single()
+ * would.
+ *
+ * Mounts WITH the 'newinstance' option create a new private namespace.
+ *
+ * get_sb_single() itself cannot be used for the single-mount case,
+ * because get_sb_single()/sget() find and use the super-block of the
+ * most recent devpts mount. That mount may be a 'newinstance' one, in
+ * which case get_sb_single() would pick the newinstance super-block
+ * rather than the initial one.
+ *
+ * This function is identical to get_sb_single() except that it always
+ * selects the 'single-namespace' super-block (via
+ * compare_init_pts_sb()), even when private-namespace ('newinstance')
+ * super-blocks exist.
+ */
+static int get_init_pts_sb(struct file_system_type *fs_type, int flags,
+		void *data, struct vfsmount *mnt)
+{
+	struct super_block *sb;
+
+	sb = sget(fs_type, compare_init_pts_sb, set_anon_super, NULL);
+	if (IS_ERR(sb))
+		return PTR_ERR(sb);
+
+	/* No root dentry yet: the super-block still has to be filled in. */
+	if (sb->s_root == NULL) {
+		int silent = (flags & MS_SILENT) ? 1 : 0;
+		int err;
+
+		sb->s_flags = flags;
+		err = devpts_fill_super(sb, data, silent);
+		if (err) {
+			up_write(&sb->s_umount);
+			deactivate_super(sb);
+			return err;
+		}
+		sb->s_flags |= MS_ACTIVE;
+	}
+
+	/* The result of the remount is deliberately not propagated. */
+	do_remount_sb(sb, flags, data, 0);
+	return simple_set_mnt(mnt, sb);
+}
+
+/*
+ * Mount or remount the initial kernel mount of devpts. This type of
+ * mount maintains the legacy, single-instance semantics, while the
+ * kernel still allows multiple-instances.
+ *
+ * Return: 0 on success, -errno on error.
+ */
+static int init_pts_mount(struct file_system_type *fs_type, int flags,
+		void *data, struct vfsmount *mnt)
+{
+	int err;
+
+	err = get_init_pts_sb(fs_type, flags, data, mnt);
+	if (err)
+		return err;
+
+	err = mknod_ptmx(mnt->mnt_sb);
+	if (err) {
+		dput(mnt->mnt_sb->s_root);
+		/*
+		 * get_init_pts_sb() returns with s_umount still
+		 * write-locked; drop it before releasing the super-block
+		 * reference, as get_init_pts_sb()'s own error path does.
+		 */
+		up_write(&mnt->mnt_sb->s_umount);
+		deactivate_super(mnt->mnt_sb);
+	}
+
+	return err;
+}
+
+/*
+ * Mount entry point: route the request to a private instance when the
+ * 'newinstance' option is given, otherwise to the initial instance.
+ * Option-parsing errors are returned unchanged. @dev_name is unused.
+ */
static int devpts_get_sb(struct file_system_type *fs_type,
int flags, const char *dev_name, void *data, struct vfsmount *mnt)
{
+	int new = is_new_instance_mount(data);
+
+	if (new < 0)
+		return new;
+
+	return new ? new_pts_mount(fs_type, flags, data, mnt)
+		   : init_pts_mount(fs_type, flags, data, mnt);
+}
+#else
+/*
+ * Legacy-only variant of the mount entry point: every mount goes
+ * through get_sb_single() with devpts_fill_super(), i.e. only the
+ * single-instance semantics are supported (no multiple-instance
+ * semantics). @dev_name is unused.
+ */
+static int devpts_get_sb(struct file_system_type *fs_type, int flags,
+ const char *dev_name, void *data, struct vfsmount *mnt)
+{
return get_sb_single(fs_type, flags, data, devpts_fill_super, mnt);
}
+#endif
+
+/*
+ * kill_sb handler: free the per-superblock devpts state, then tear the
+ * super-block down with kill_litter_super().
+ */
+static void devpts_kill_sb(struct super_block *sb)
+{
+	kfree(DEVPTS_SB(sb));
+	kill_litter_super(sb);
+}
static struct file_system_type devpts_fs_type = {
.owner = THIS_MODULE,
.name = "devpts",
.get_sb = devpts_get_sb,
- .kill_sb = kill_anon_super,
+ .kill_sb = devpts_kill_sb,
};
/*
@@ -172,16 +532,17 @@ static struct file_system_type devpts_fs_type = {
int devpts_new_index(struct inode *ptmx_inode)
{
+ struct super_block *sb = pts_sb_from_inode(ptmx_inode);
+ struct pts_fs_info *fsi = DEVPTS_SB(sb);
int index;
int ida_ret;
retry:
- if (!ida_pre_get(&allocated_ptys, GFP_KERNEL)) {
+ if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL))
return -ENOMEM;
- }
mutex_lock(&allocated_ptys_lock);
- ida_ret = ida_get_new(&allocated_ptys, &index);
+ ida_ret = ida_get_new(&fsi->allocated_ptys, &index);
if (ida_ret < 0) {
mutex_unlock(&allocated_ptys_lock);
if (ida_ret == -EAGAIN)
@@ -190,7 +551,7 @@ retry:
}
if (index >= pty_limit) {
- ida_remove(&allocated_ptys, index);
+ ida_remove(&fsi->allocated_ptys, index);
mutex_unlock(&allocated_ptys_lock);
return -EIO;
}
@@ -200,18 +561,26 @@ retry:
void devpts_kill_index(struct inode *ptmx_inode, int idx)
{
+ struct super_block *sb = pts_sb_from_inode(ptmx_inode);
+ struct pts_fs_info *fsi = DEVPTS_SB(sb);
+
mutex_lock(&allocated_ptys_lock);
- ida_remove(&allocated_ptys, idx);
+ ida_remove(&fsi->allocated_ptys, idx);
mutex_unlock(&allocated_ptys_lock);
}
int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
{
- int number = tty->index; /* tty layer puts index from devpts_new_index() in here */
+ /* tty layer puts index from devpts_new_index() in here */
+ int number = tty->index;
struct tty_driver *driver = tty->driver;
dev_t device = MKDEV(driver->major, driver->minor_start+number);
struct dentry *dentry;
- struct inode *inode = new_inode(devpts_mnt->mnt_sb);
+ struct super_block *sb = pts_sb_from_inode(ptmx_inode);
+ struct inode *inode = new_inode(sb);
+ struct dentry *root = sb->s_root;
+ struct pts_fs_info *fsi = DEVPTS_SB(sb);
+ struct pts_mount_opts *opts = &fsi->mount_opts;
char s[12];
/* We're supposed to be given the slave end of a pty */
@@ -221,25 +590,25 @@ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
if (!inode)
return -ENOMEM;
- inode->i_ino = number+2;
- inode->i_uid = config.setuid ? config.uid : current_fsuid();
- inode->i_gid = config.setgid ? config.gid : current_fsgid();
+ inode->i_ino = number + 3;
+ inode->i_uid = opts->setuid ? opts->uid : current_fsuid();
+ inode->i_gid = opts->setgid ? opts->gid : current_fsgid();
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
- init_special_inode(inode, S_IFCHR|config.mode, device);
+ init_special_inode(inode, S_IFCHR|opts->mode, device);
inode->i_private = tty;
tty->driver_data = inode;
sprintf(s, "%d", number);
- mutex_lock(&devpts_root->d_inode->i_mutex);
+ mutex_lock(&root->d_inode->i_mutex);
- dentry = d_alloc_name(devpts_root, s);
+ dentry = d_alloc_name(root, s);
if (!IS_ERR(dentry)) {
d_add(dentry, inode);
- fsnotify_create(devpts_root->d_inode, dentry);
+ fsnotify_create(root->d_inode, dentry);
}
- mutex_unlock(&devpts_root->d_inode->i_mutex);
+ mutex_unlock(&root->d_inode->i_mutex);
return 0;
}
@@ -256,20 +625,27 @@ struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number)
void devpts_pty_kill(struct tty_struct *tty)
{
struct inode *inode = tty->driver_data;
+ struct super_block *sb = pts_sb_from_inode(inode);
+ struct dentry *root = sb->s_root;
struct dentry *dentry;
BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR));
- mutex_lock(&devpts_root->d_inode->i_mutex);
+ mutex_lock(&root->d_inode->i_mutex);
dentry = d_find_alias(inode);
- if (dentry && !IS_ERR(dentry)) {
+ if (IS_ERR(dentry))
+ goto out;
+
+ if (dentry) {
inode->i_nlink--;
d_delete(dentry);
- dput(dentry);
+ dput(dentry); /* d_alloc_name() in devpts_pty_new() */
}
- mutex_unlock(&devpts_root->d_inode->i_mutex);
+ dput(dentry); /* d_find_alias above */
+out:
+ mutex_unlock(&root->d_inode->i_mutex);
}
static int __init init_devpts_fs(void)
diff --git a/fs/dquot.c b/fs/dquot.c
index c237ccc8581c..61bfff64e5af 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -211,8 +211,6 @@ static struct hlist_head *dquot_hash;
struct dqstats dqstats;
-static void dqput(struct dquot *dquot);
-
static inline unsigned int
hashfn(const struct super_block *sb, unsigned int id, int type)
{
@@ -415,6 +413,17 @@ out_dqlock:
return ret;
}
+/*
+ * Generic destroy_dquot implementation (installed in dquot_operations):
+ * give @dquot back to the dquot_cachep slab cache.
+ */
+void dquot_destroy(struct dquot *dquot)
+{
+ kmem_cache_free(dquot_cachep, dquot);
+}
+EXPORT_SYMBOL(dquot_destroy);
+
+/*
+ * Free @dquot through the destroy_dquot operation of the filesystem it
+ * belongs to (dquot->dq_sb->dq_op), so filesystem-specific allocators
+ * are honoured.
+ */
+static inline void do_destroy_dquot(struct dquot *dquot)
+{
+ dquot->dq_sb->dq_op->destroy_dquot(dquot);
+}
+
/* Invalidate all dquots on the list. Note that this function is called after
* quota is disabled and pointers from inodes removed so there cannot be new
* quota users. There can still be some users of quotas due to inodes being
@@ -463,9 +472,44 @@ restart:
remove_dquot_hash(dquot);
remove_free_dquot(dquot);
remove_inuse(dquot);
- kmem_cache_free(dquot_cachep, dquot);
+ do_destroy_dquot(dquot);
+ }
+ spin_unlock(&dq_list_lock);
+}
+
+/*
+ * Call callback for every active dquot on given filesystem.
+ *
+ * Walks inuse_list and invokes @fn(dquot, @priv) for each dquot that has
+ * DQ_ACTIVE_B set and belongs to @sb. dqonoff_mutex is held for the
+ * whole scan; dq_list_lock is dropped around each @fn call while a
+ * reference on the dquot is held.
+ *
+ * Return: the first negative value returned by @fn (the scan stops
+ * there), otherwise the value of the last @fn call, or 0 if @fn was
+ * never called.
+ */
+int dquot_scan_active(struct super_block *sb,
+ int (*fn)(struct dquot *dquot, unsigned long priv),
+ unsigned long priv)
+{
+ struct dquot *dquot, *old_dquot = NULL;
+ int ret = 0;
+
+ mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
+ spin_lock(&dq_list_lock);
+ list_for_each_entry(dquot, &inuse_list, dq_inuse) {
+ if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags))
+ continue;
+ if (dquot->dq_sb != sb)
+ continue;
+ /* Now we have active dquot so we can just increase use count */
+ atomic_inc(&dquot->dq_count);
+ dqstats.lookups++;
+ spin_unlock(&dq_list_lock);
+ /* Drop the reference on the previously visited dquot only now,
+ * after the list lock is released. old_dquot is NULL on the
+ * first iteration - presumably dqput() tolerates that; confirm. */
+ dqput(old_dquot);
+ old_dquot = dquot;
+ ret = fn(dquot, priv);
+ /* dq_list_lock is not held here, so 'out' must not unlock it */
+ if (ret < 0)
+ goto out;
+ spin_lock(&dq_list_lock);
+ /* We are safe to continue now because our dquot could not
+ * be moved out of the inuse list while we hold the reference */
}
spin_unlock(&dq_list_lock);
+out:
+ /* Release the reference still held on the last visited dquot */
+ dqput(old_dquot);
+ mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
+ return ret;
}
int vfs_quota_sync(struct super_block *sb, int type)
@@ -479,7 +523,7 @@ int vfs_quota_sync(struct super_block *sb, int type)
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
if (type != -1 && cnt != type)
continue;
- if (!sb_has_quota_enabled(sb, cnt))
+ if (!sb_has_quota_active(sb, cnt))
continue;
spin_lock(&dq_list_lock);
dirty = &dqopt->info[cnt].dqi_dirty_list;
@@ -504,8 +548,8 @@ int vfs_quota_sync(struct super_block *sb, int type)
}
for (cnt = 0; cnt < MAXQUOTAS; cnt++)
- if ((cnt == type || type == -1) && sb_has_quota_enabled(sb, cnt)
- && info_dirty(&dqopt->info[cnt]))
+ if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt)
+ && info_dirty(&dqopt->info[cnt]))
sb->dq_op->write_info(sb, cnt);
spin_lock(&dq_list_lock);
dqstats.syncs++;
@@ -527,7 +571,7 @@ static void prune_dqcache(int count)
remove_dquot_hash(dquot);
remove_free_dquot(dquot);
remove_inuse(dquot);
- kmem_cache_free(dquot_cachep, dquot);
+ do_destroy_dquot(dquot);
count--;
head = free_dquots.prev;
}
@@ -558,7 +602,7 @@ static struct shrinker dqcache_shrinker = {
* NOTE: If you change this function please check whether dqput_blocks() works right...
* MUST be called with either dqptr_sem or dqonoff_mutex held
*/
-static void dqput(struct dquot *dquot)
+void dqput(struct dquot *dquot)
{
int ret;
@@ -584,7 +628,7 @@ we_slept:
/* We have more than one user... nothing to do */
atomic_dec(&dquot->dq_count);
/* Releasing dquot during quotaoff phase? */
- if (!sb_has_quota_enabled(dquot->dq_sb, dquot->dq_type) &&
+ if (!sb_has_quota_active(dquot->dq_sb, dquot->dq_type) &&
atomic_read(&dquot->dq_count) == 1)
wake_up(&dquot->dq_wait_unused);
spin_unlock(&dq_list_lock);
@@ -625,11 +669,17 @@ we_slept:
spin_unlock(&dq_list_lock);
}
+/*
+ * Generic alloc_dquot implementation (installed in dquot_operations):
+ * return a zeroed dquot from the dquot_cachep slab cache, allocated
+ * with GFP_NOFS. @sb and @type are not used by this implementation.
+ */
+struct dquot *dquot_alloc(struct super_block *sb, int type)
+{
+ return kmem_cache_zalloc(dquot_cachep, GFP_NOFS);
+}
+EXPORT_SYMBOL(dquot_alloc);
+
static struct dquot *get_empty_dquot(struct super_block *sb, int type)
{
struct dquot *dquot;
- dquot = kmem_cache_zalloc(dquot_cachep, GFP_NOFS);
+ dquot = sb->dq_op->alloc_dquot(sb, type);
if(!dquot)
return NODQUOT;
@@ -647,15 +697,33 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type)
}
/*
+ * Check whether the dquot for (@sb, @id, @type) is in memory.
+ * Returns 1 if it is found in the dquot hash, 0 if it is not or if
+ * quota of @type is not active on @sb.
+ * MUST be called with either dqptr_sem or dqonoff_mutex held
+ */
+int dquot_is_cached(struct super_block *sb, unsigned int id, int type)
+{
+	unsigned int hashent = hashfn(sb, id, type);
+	int cached;
+
+	if (!sb_has_quota_active(sb, type))
+		return 0;
+
+	spin_lock(&dq_list_lock);
+	cached = find_dquot(hashent, sb, id, type) != NODQUOT;
+	spin_unlock(&dq_list_lock);
+
+	return cached;
+}
+
+/*
* Get reference to dquot
* MUST be called with either dqptr_sem or dqonoff_mutex held
*/
-static struct dquot *dqget(struct super_block *sb, unsigned int id, int type)
+struct dquot *dqget(struct super_block *sb, unsigned int id, int type)
{
unsigned int hashent = hashfn(sb, id, type);
struct dquot *dquot, *empty = NODQUOT;
- if (!sb_has_quota_enabled(sb, type))
+ if (!sb_has_quota_active(sb, type))
return NODQUOT;
we_slept:
spin_lock(&dq_list_lock);
@@ -682,7 +750,7 @@ we_slept:
dqstats.lookups++;
spin_unlock(&dq_list_lock);
if (empty)
- kmem_cache_free(dquot_cachep, empty);
+ do_destroy_dquot(empty);
}
/* Wait for dq_lock - after this we know that either dquot_release() is already
* finished or it will be canceled due to dq_count > 1 test */
@@ -820,7 +888,7 @@ static void drop_dquot_ref(struct super_block *sb, int type)
}
}
-static inline void dquot_incr_inodes(struct dquot *dquot, unsigned long number)
+static inline void dquot_incr_inodes(struct dquot *dquot, qsize_t number)
{
dquot->dq_dqb.dqb_curinodes += number;
}
@@ -830,9 +898,10 @@ static inline void dquot_incr_space(struct dquot *dquot, qsize_t number)
dquot->dq_dqb.dqb_curspace += number;
}
-static inline void dquot_decr_inodes(struct dquot *dquot, unsigned long number)
+static inline void dquot_decr_inodes(struct dquot *dquot, qsize_t number)
{
- if (dquot->dq_dqb.dqb_curinodes > number)
+ if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE ||
+ dquot->dq_dqb.dqb_curinodes >= number)
dquot->dq_dqb.dqb_curinodes -= number;
else
dquot->dq_dqb.dqb_curinodes = 0;
@@ -843,11 +912,12 @@ static inline void dquot_decr_inodes(struct dquot *dquot, unsigned long number)
static inline void dquot_decr_space(struct dquot *dquot, qsize_t number)
{
- if (dquot->dq_dqb.dqb_curspace > number)
+ if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE ||
+ dquot->dq_dqb.dqb_curspace >= number)
dquot->dq_dqb.dqb_curspace -= number;
else
dquot->dq_dqb.dqb_curspace = 0;
- if (toqb(dquot->dq_dqb.dqb_curspace) <= dquot->dq_dqb.dqb_bsoftlimit)
+ if (dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit)
dquot->dq_dqb.dqb_btime = (time_t) 0;
clear_bit(DQ_BLKS_B, &dquot->dq_flags);
}
@@ -1023,10 +1093,11 @@ static inline char ignore_hardlimit(struct dquot *dquot)
}
/* needs dq_data_lock */
-static int check_idq(struct dquot *dquot, ulong inodes, char *warntype)
+static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype)
{
*warntype = QUOTA_NL_NOWARN;
- if (inodes <= 0 || test_bit(DQ_FAKE_B, &dquot->dq_flags))
+ if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) ||
+ test_bit(DQ_FAKE_B, &dquot->dq_flags))
return QUOTA_OK;
if (dquot->dq_dqb.dqb_ihardlimit &&
@@ -1058,11 +1129,12 @@ static int check_idq(struct dquot *dquot, ulong inodes, char *warntype)
static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *warntype)
{
*warntype = QUOTA_NL_NOWARN;
- if (space <= 0 || test_bit(DQ_FAKE_B, &dquot->dq_flags))
+ if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) ||
+ test_bit(DQ_FAKE_B, &dquot->dq_flags))
return QUOTA_OK;
if (dquot->dq_dqb.dqb_bhardlimit &&
- toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bhardlimit &&
+ dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bhardlimit &&
!ignore_hardlimit(dquot)) {
if (!prealloc)
*warntype = QUOTA_NL_BHARDWARN;
@@ -1070,7 +1142,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
}
if (dquot->dq_dqb.dqb_bsoftlimit &&
- toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bsoftlimit &&
+ dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bsoftlimit &&
dquot->dq_dqb.dqb_btime && get_seconds() >= dquot->dq_dqb.dqb_btime &&
!ignore_hardlimit(dquot)) {
if (!prealloc)
@@ -1079,7 +1151,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
}
if (dquot->dq_dqb.dqb_bsoftlimit &&
- toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bsoftlimit &&
+ dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bsoftlimit &&
dquot->dq_dqb.dqb_btime == 0) {
if (!prealloc) {
*warntype = QUOTA_NL_BSOFTWARN;
@@ -1096,10 +1168,11 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war
return QUOTA_OK;
}
-static int info_idq_free(struct dquot *dquot, ulong inodes)
+static int info_idq_free(struct dquot *dquot, qsize_t inodes)
{
if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
- dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit)
+ dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit ||
+ !sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type))
return QUOTA_NL_NOWARN;
if (dquot->dq_dqb.dqb_curinodes - inodes <= dquot->dq_dqb.dqb_isoftlimit)
@@ -1113,15 +1186,13 @@ static int info_idq_free(struct dquot *dquot, ulong inodes)
static int info_bdq_free(struct dquot *dquot, qsize_t space)
{
if (test_bit(DQ_FAKE_B, &dquot->dq_flags) ||
- toqb(dquot->dq_dqb.dqb_curspace) <= dquot->dq_dqb.dqb_bsoftlimit)
+ dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit)
return QUOTA_NL_NOWARN;
- if (toqb(dquot->dq_dqb.dqb_curspace - space) <=
- dquot->dq_dqb.dqb_bsoftlimit)
+ if (dquot->dq_dqb.dqb_curspace - space <= dquot->dq_dqb.dqb_bsoftlimit)
return QUOTA_NL_BSOFTBELOW;
- if (toqb(dquot->dq_dqb.dqb_curspace) >= dquot->dq_dqb.dqb_bhardlimit &&
- toqb(dquot->dq_dqb.dqb_curspace - space) <
- dquot->dq_dqb.dqb_bhardlimit)
+ if (dquot->dq_dqb.dqb_curspace >= dquot->dq_dqb.dqb_bhardlimit &&
+ dquot->dq_dqb.dqb_curspace - space < dquot->dq_dqb.dqb_bhardlimit)
return QUOTA_NL_BHARDBELOW;
return QUOTA_NL_NOWARN;
}
@@ -1166,17 +1237,23 @@ out_err:
* Release all quotas referenced by inode
* Transaction must be started at an entry
*/
-int dquot_drop(struct inode *inode)
+int dquot_drop_locked(struct inode *inode)
{
int cnt;
- down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
if (inode->i_dquot[cnt] != NODQUOT) {
dqput(inode->i_dquot[cnt]);
inode->i_dquot[cnt] = NODQUOT;
}
}
+ return 0;
+}
+
+int dquot_drop(struct inode *inode)
+{
+ down_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
+ dquot_drop_locked(inode);
up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
return 0;
}
@@ -1264,7 +1341,7 @@ warn_put_all:
/*
* This operation can block, but only after everything is updated
*/
-int dquot_alloc_inode(const struct inode *inode, unsigned long number)
+int dquot_alloc_inode(const struct inode *inode, qsize_t number)
{
int cnt, ret = NO_QUOTA;
char warntype[MAXQUOTAS];
@@ -1349,7 +1426,7 @@ out_sub:
/*
* This operation can block, but only after everything is updated
*/
-int dquot_free_inode(const struct inode *inode, unsigned long number)
+int dquot_free_inode(const struct inode *inode, qsize_t number)
{
unsigned int cnt;
char warntype[MAXQUOTAS];
@@ -1495,7 +1572,7 @@ warn_put_all:
/* Wrapper for transferring ownership of an inode */
int vfs_dq_transfer(struct inode *inode, struct iattr *iattr)
{
- if (sb_any_quota_enabled(inode->i_sb) && !IS_NOQUOTA(inode)) {
+ if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) {
vfs_dq_init(inode);
if (inode->i_sb->dq_op->transfer(inode, iattr) == NO_QUOTA)
return 1;
@@ -1533,54 +1610,27 @@ struct dquot_operations dquot_operations = {
.acquire_dquot = dquot_acquire,
.release_dquot = dquot_release,
.mark_dirty = dquot_mark_dquot_dirty,
- .write_info = dquot_commit_info
+ .write_info = dquot_commit_info,
+ .alloc_dquot = dquot_alloc,
+ .destroy_dquot = dquot_destroy,
};
-static inline void set_enable_flags(struct quota_info *dqopt, int type)
-{
- switch (type) {
- case USRQUOTA:
- dqopt->flags |= DQUOT_USR_ENABLED;
- dqopt->flags &= ~DQUOT_USR_SUSPENDED;
- break;
- case GRPQUOTA:
- dqopt->flags |= DQUOT_GRP_ENABLED;
- dqopt->flags &= ~DQUOT_GRP_SUSPENDED;
- break;
- }
-}
-
-static inline void reset_enable_flags(struct quota_info *dqopt, int type,
- int remount)
-{
- switch (type) {
- case USRQUOTA:
- dqopt->flags &= ~DQUOT_USR_ENABLED;
- if (remount)
- dqopt->flags |= DQUOT_USR_SUSPENDED;
- else
- dqopt->flags &= ~DQUOT_USR_SUSPENDED;
- break;
- case GRPQUOTA:
- dqopt->flags &= ~DQUOT_GRP_ENABLED;
- if (remount)
- dqopt->flags |= DQUOT_GRP_SUSPENDED;
- else
- dqopt->flags &= ~DQUOT_GRP_SUSPENDED;
- break;
- }
-}
-
-
/*
* Turn quota off on a device. type == -1 ==> quotaoff for all types (umount)
*/
-int vfs_quota_off(struct super_block *sb, int type, int remount)
+int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags)
{
int cnt, ret = 0;
struct quota_info *dqopt = sb_dqopt(sb);
struct inode *toputinode[MAXQUOTAS];
+ /* Cannot turn off usage accounting without turning off limits, or
+ * suspend quotas and simultaneously turn quotas off. */
+ if ((flags & DQUOT_USAGE_ENABLED && !(flags & DQUOT_LIMITS_ENABLED))
+ || (flags & DQUOT_SUSPENDED && flags & (DQUOT_LIMITS_ENABLED |
+ DQUOT_USAGE_ENABLED)))
+ return -EINVAL;
+
/* We need to serialize quota_off() for device */
mutex_lock(&dqopt->dqonoff_mutex);
@@ -1589,7 +1639,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
* sometimes we are called when fill_super() failed and calling
* sync_fs() in such cases does no good.
*/
- if (!sb_any_quota_enabled(sb) && !sb_any_quota_suspended(sb)) {
+ if (!sb_any_quota_loaded(sb)) {
mutex_unlock(&dqopt->dqonoff_mutex);
return 0;
}
@@ -1597,17 +1647,28 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
toputinode[cnt] = NULL;
if (type != -1 && cnt != type)
continue;
- /* If we keep inodes of quota files after remount and quotaoff
- * is called, drop kept inodes. */
- if (!remount && sb_has_quota_suspended(sb, cnt)) {
- iput(dqopt->files[cnt]);
- dqopt->files[cnt] = NULL;
- reset_enable_flags(dqopt, cnt, 0);
+ if (!sb_has_quota_loaded(sb, cnt))
continue;
+
+ if (flags & DQUOT_SUSPENDED) {
+ dqopt->flags |=
+ dquot_state_flag(DQUOT_SUSPENDED, cnt);
+ } else {
+ dqopt->flags &= ~dquot_state_flag(flags, cnt);
+ /* Turning off suspended quotas? */
+ if (!sb_has_quota_loaded(sb, cnt) &&
+ sb_has_quota_suspended(sb, cnt)) {
+ dqopt->flags &= ~dquot_state_flag(
+ DQUOT_SUSPENDED, cnt);
+ iput(dqopt->files[cnt]);
+ dqopt->files[cnt] = NULL;
+ continue;
+ }
}
- if (!sb_has_quota_enabled(sb, cnt))
+
+ /* We still have to keep quota loaded? */
+ if (sb_has_quota_loaded(sb, cnt) && !(flags & DQUOT_SUSPENDED))
continue;
- reset_enable_flags(dqopt, cnt, remount);
/* Note: these are blocking operations */
drop_dquot_ref(sb, cnt);
@@ -1623,7 +1684,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
put_quota_format(dqopt->info[cnt].dqi_format);
toputinode[cnt] = dqopt->files[cnt];
- if (!remount)
+ if (!sb_has_quota_loaded(sb, cnt))
dqopt->files[cnt] = NULL;
dqopt->info[cnt].dqi_flags = 0;
dqopt->info[cnt].dqi_igrace = 0;
@@ -1631,6 +1692,11 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
dqopt->ops[cnt] = NULL;
}
mutex_unlock(&dqopt->dqonoff_mutex);
+
+ /* Skip syncing and setting flags if quota files are hidden */
+ if (dqopt->flags & DQUOT_QUOTA_SYS_FILE)
+ goto put_inodes;
+
/* Sync the superblock so that buffers with quota data are written to
* disk (and so userspace sees correct data afterwards). */
if (sb->s_op->sync_fs)
@@ -1646,7 +1712,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
mutex_lock(&dqopt->dqonoff_mutex);
/* If quota was reenabled in the meantime, we have
* nothing to do */
- if (!sb_has_quota_enabled(sb, cnt)) {
+ if (!sb_has_quota_loaded(sb, cnt)) {
mutex_lock_nested(&toputinode[cnt]->i_mutex, I_MUTEX_QUOTA);
toputinode[cnt]->i_flags &= ~(S_IMMUTABLE |
S_NOATIME | S_NOQUOTA);
@@ -1655,26 +1721,43 @@ int vfs_quota_off(struct super_block *sb, int type, int remount)
mark_inode_dirty(toputinode[cnt]);
}
mutex_unlock(&dqopt->dqonoff_mutex);
+ }
+ if (sb->s_bdev)
+ invalidate_bdev(sb->s_bdev);
+put_inodes:
+ for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+ if (toputinode[cnt]) {
/* On remount RO, we keep the inode pointer so that we
- * can reenable quota on the subsequent remount RW.
- * But we have better not keep inode pointer when there
- * is pending delete on the quota file... */
- if (!remount)
+ * can reenable quota on the subsequent remount RW. We
+ * have to check 'flags' variable and not use sb_has_
+ * function because another quotaon / quotaoff could
+ * change global state before we got here. We refuse
+ * to suspend quotas when there is pending delete on
+ * the quota file... */
+ if (!(flags & DQUOT_SUSPENDED))
iput(toputinode[cnt]);
else if (!toputinode[cnt]->i_nlink)
ret = -EBUSY;
}
- if (sb->s_bdev)
- invalidate_bdev(sb->s_bdev);
return ret;
}
+/*
+ * Compatibility wrapper around vfs_quota_disable(): with @remount set
+ * the quotas are suspended, otherwise both usage accounting and limit
+ * enforcement are turned off.
+ */
+int vfs_quota_off(struct super_block *sb, int type, int remount)
+{
+	unsigned int flags;
+
+	if (remount)
+		flags = DQUOT_SUSPENDED;
+	else
+		flags = DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED;
+
+	return vfs_quota_disable(sb, type, flags);
+}
+
/*
* Turn quotas on on a device
*/
-/* Helper function when we already have the inode */
-static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
+/*
+ * Helper function to turn quotas on when we already have the inode of
+ * quota file and no quota information is loaded.
+ */
+static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
+ unsigned int flags)
{
struct quota_format_type *fmt = find_quota_format(format_id);
struct super_block *sb = inode->i_sb;
@@ -1696,27 +1779,37 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
error = -EINVAL;
goto out_fmt;
}
+ /* Usage always has to be set... */
+ if (!(flags & DQUOT_USAGE_ENABLED)) {
+ error = -EINVAL;
+ goto out_fmt;
+ }
- /* As we bypass the pagecache we must now flush the inode so that
- * we see all the changes from userspace... */
- write_inode_now(inode, 1);
- /* And now flush the block cache so that kernel sees the changes */
- invalidate_bdev(sb->s_bdev);
+ if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
+ /* As we bypass the pagecache we must now flush the inode so
+ * that we see all the changes from userspace... */
+ write_inode_now(inode, 1);
+ /* And now flush the block cache so that kernel sees the
+ * changes */
+ invalidate_bdev(sb->s_bdev);
+ }
mutex_lock(&inode->i_mutex);
mutex_lock(&dqopt->dqonoff_mutex);
- if (sb_has_quota_enabled(sb, type) ||
- sb_has_quota_suspended(sb, type)) {
+ if (sb_has_quota_loaded(sb, type)) {
error = -EBUSY;
goto out_lock;
}
- /* We don't want quota and atime on quota files (deadlocks possible)
- * Also nobody should write to the file - we use special IO operations
- * which ignore the immutable bit. */
- down_write(&dqopt->dqptr_sem);
- oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA);
- inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
- up_write(&dqopt->dqptr_sem);
- sb->dq_op->drop(inode);
+
+ if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) {
+ /* We don't want quota and atime on quota files (deadlocks
+ * possible) Also nobody should write to the file - we use
+ * special IO operations which ignore the immutable bit. */
+ down_write(&dqopt->dqptr_sem);
+ oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA);
+ inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
+ up_write(&dqopt->dqptr_sem);
+ sb->dq_op->drop(inode);
+ }
error = -EIO;
dqopt->files[type] = igrab(inode);
@@ -1737,7 +1830,7 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id)
}
mutex_unlock(&dqopt->dqio_mutex);
mutex_unlock(&inode->i_mutex);
- set_enable_flags(dqopt, type);
+ dqopt->flags |= dquot_state_flag(flags, type);
add_dquot_ref(sb, type);
mutex_unlock(&dqopt->dqonoff_mutex);
@@ -1770,20 +1863,23 @@ static int vfs_quota_on_remount(struct super_block *sb, int type)
struct quota_info *dqopt = sb_dqopt(sb);
struct inode *inode;
int ret;
+ unsigned int flags;
mutex_lock(&dqopt->dqonoff_mutex);
if (!sb_has_quota_suspended(sb, type)) {
mutex_unlock(&dqopt->dqonoff_mutex);
return 0;
}
- BUG_ON(sb_has_quota_enabled(sb, type));
-
inode = dqopt->files[type];
dqopt->files[type] = NULL;
- reset_enable_flags(dqopt, type, 0);
+ flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
+ DQUOT_LIMITS_ENABLED, type);
+ dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, type);
mutex_unlock(&dqopt->dqonoff_mutex);
- ret = vfs_quota_on_inode(inode, type, dqopt->info[type].dqi_fmt_id);
+ flags = dquot_generic_flag(flags, type);
+ ret = vfs_load_quota_inode(inode, type, dqopt->info[type].dqi_fmt_id,
+ flags);
iput(inode);
return ret;
@@ -1799,12 +1895,12 @@ int vfs_quota_on_path(struct super_block *sb, int type, int format_id,
if (path->mnt->mnt_sb != sb)
error = -EXDEV;
else
- error = vfs_quota_on_inode(path->dentry->d_inode, type,
- format_id);
+ error = vfs_load_quota_inode(path->dentry->d_inode, type,
+ format_id, DQUOT_USAGE_ENABLED |
+ DQUOT_LIMITS_ENABLED);
return error;
}
-/* Actual function called from quotactl() */
int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name,
int remount)
{
@@ -1823,6 +1919,50 @@ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name,
}
/*
+ * More powerful function for turning on quotas allowing setting
+ * of individual quota flags
+ */
+int vfs_quota_enable(struct inode *inode, int type, int format_id,
+ unsigned int flags)
+{ /* Enable the quota features named in flags for quota type on inode's superblock; returns 0, -EBUSY, or whatever the remount/load helper returns. */
+ int ret = 0;
+ struct super_block *sb = inode->i_sb;
+ struct quota_info *dqopt = sb_dqopt(sb);
+
+ /* Just unsuspend quotas? */
+ if (flags & DQUOT_SUSPENDED)
+ return vfs_quota_on_remount(sb, type);
+ if (!flags) /* no flags requested: nothing to do */
+ return 0;
+ /* Just updating flags needed? */
+ if (sb_has_quota_loaded(sb, type)) { /* unlocked first look; repeated below under dqonoff_mutex */
+ mutex_lock(&dqopt->dqonoff_mutex);
+ /* Now do a reliable test... */
+ if (!sb_has_quota_loaded(sb, type)) {
+ mutex_unlock(&dqopt->dqonoff_mutex);
+ goto load_quota; /* no longer loaded once we hold the mutex: take the full load path */
+ }
+ if (flags & DQUOT_USAGE_ENABLED &&
+ sb_has_quota_usage_enabled(sb, type)) {
+ ret = -EBUSY; /* requested usage flag is already enabled for this type */
+ goto out_lock;
+ }
+ if (flags & DQUOT_LIMITS_ENABLED &&
+ sb_has_quota_limits_enabled(sb, type)) {
+ ret = -EBUSY; /* requested limits flag is already enabled for this type */
+ goto out_lock;
+ }
+ sb_dqopt(sb)->flags |= dquot_state_flag(flags, type); /* same quota_info as dqopt; only the state bits are updated */
+out_lock:
+ mutex_unlock(&dqopt->dqonoff_mutex);
+ return ret;
+ }
+
+load_quota:
+ return vfs_load_quota_inode(inode, type, format_id, flags); /* quota not loaded yet: load it with the requested flags */
+}
+
+/*
* This function is used when filesystem needs to initialize quotas
* during mount time.
*/
@@ -1843,7 +1983,8 @@ int vfs_quota_on_mount(struct super_block *sb, char *qf_name,
error = security_quota_on(dentry);
if (!error)
- error = vfs_quota_on_inode(dentry->d_inode, type, format_id);
+ error = vfs_load_quota_inode(dentry->d_inode, type, format_id,
+ DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
out:
dput(dentry);
@@ -1866,14 +2007,24 @@ int vfs_dq_quota_on_remount(struct super_block *sb)
return ret;
}
+static inline qsize_t qbtos(qsize_t blocks)
+{ /* Convert a count of quota blocks to space by shifting left QIF_DQBLKSIZE_BITS (space is presumably in bytes - confirm). */
+ return blocks << QIF_DQBLKSIZE_BITS;
+}
+
+static inline qsize_t stoqb(qsize_t space)
+{ /* Convert space to quota blocks; adding QIF_DQBLKSIZE - 1 before the shift rounds up (assumes QIF_DQBLKSIZE == 1 << QIF_DQBLKSIZE_BITS - confirm). */
+ return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS;
+}
+
/* Generic routine for getting common part of quota structure */
static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di)
{
struct mem_dqblk *dm = &dquot->dq_dqb;
spin_lock(&dq_data_lock);
- di->dqb_bhardlimit = dm->dqb_bhardlimit;
- di->dqb_bsoftlimit = dm->dqb_bsoftlimit;
+ di->dqb_bhardlimit = stoqb(dm->dqb_bhardlimit);
+ di->dqb_bsoftlimit = stoqb(dm->dqb_bsoftlimit);
di->dqb_curspace = dm->dqb_curspace;
di->dqb_ihardlimit = dm->dqb_ihardlimit;
di->dqb_isoftlimit = dm->dqb_isoftlimit;
@@ -1918,28 +2069,36 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di)
if (di->dqb_valid & QIF_SPACE) {
dm->dqb_curspace = di->dqb_curspace;
check_blim = 1;
+ __set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
}
if (di->dqb_valid & QIF_BLIMITS) {
- dm->dqb_bsoftlimit = di->dqb_bsoftlimit;
- dm->dqb_bhardlimit = di->dqb_bhardlimit;
+ dm->dqb_bsoftlimit = qbtos(di->dqb_bsoftlimit);
+ dm->dqb_bhardlimit = qbtos(di->dqb_bhardlimit);
check_blim = 1;
+ __set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
}
if (di->dqb_valid & QIF_INODES) {
dm->dqb_curinodes = di->dqb_curinodes;
check_ilim = 1;
+ __set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
}
if (di->dqb_valid & QIF_ILIMITS) {
dm->dqb_isoftlimit = di->dqb_isoftlimit;
dm->dqb_ihardlimit = di->dqb_ihardlimit;
check_ilim = 1;
+ __set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
}
- if (di->dqb_valid & QIF_BTIME)
+ if (di->dqb_valid & QIF_BTIME) {
dm->dqb_btime = di->dqb_btime;
- if (di->dqb_valid & QIF_ITIME)
+ __set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
+ }
+ if (di->dqb_valid & QIF_ITIME) {
dm->dqb_itime = di->dqb_itime;
+ __set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
+ }
if (check_blim) {
- if (!dm->dqb_bsoftlimit || toqb(dm->dqb_curspace) < dm->dqb_bsoftlimit) {
+ if (!dm->dqb_bsoftlimit || dm->dqb_curspace < dm->dqb_bsoftlimit) {
dm->dqb_btime = 0;
clear_bit(DQ_BLKS_B, &dquot->dq_flags);
}
@@ -1970,12 +2129,14 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d
int rc;
mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
- if (!(dquot = dqget(sb, id, type))) {
- mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
- return -ESRCH;
+ dquot = dqget(sb, id, type);
+ if (!dquot) {
+ rc = -ESRCH;
+ goto out;
}
rc = do_set_dqblk(dquot, di);
dqput(dquot);
+out:
mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
return rc;
}
@@ -1986,7 +2147,7 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
struct mem_dqinfo *mi;
mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
- if (!sb_has_quota_enabled(sb, type)) {
+ if (!sb_has_quota_active(sb, type)) {
mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
return -ESRCH;
}
@@ -2005,11 +2166,12 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
{
struct mem_dqinfo *mi;
+ int err = 0;
mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
- if (!sb_has_quota_enabled(sb, type)) {
- mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
- return -ESRCH;
+ if (!sb_has_quota_active(sb, type)) {
+ err = -ESRCH;
+ goto out;
}
mi = sb_dqopt(sb)->info + type;
spin_lock(&dq_data_lock);
@@ -2023,8 +2185,9 @@ int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
mark_info_dirty(sb, type);
/* Force write to disk */
sb->dq_op->write_info(sb, type);
+out:
mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
- return 0;
+ return err;
}
struct quotactl_ops vfs_quotactl_ops = {
@@ -2186,10 +2349,13 @@ EXPORT_SYMBOL(register_quota_format);
EXPORT_SYMBOL(unregister_quota_format);
EXPORT_SYMBOL(dqstats);
EXPORT_SYMBOL(dq_data_lock);
+EXPORT_SYMBOL(vfs_quota_enable);
EXPORT_SYMBOL(vfs_quota_on);
EXPORT_SYMBOL(vfs_quota_on_path);
EXPORT_SYMBOL(vfs_quota_on_mount);
+EXPORT_SYMBOL(vfs_quota_disable);
EXPORT_SYMBOL(vfs_quota_off);
+EXPORT_SYMBOL(dquot_scan_active);
EXPORT_SYMBOL(vfs_quota_sync);
EXPORT_SYMBOL(vfs_get_dqinfo);
EXPORT_SYMBOL(vfs_set_dqinfo);
@@ -2202,7 +2368,11 @@ EXPORT_SYMBOL(dquot_release);
EXPORT_SYMBOL(dquot_mark_dquot_dirty);
EXPORT_SYMBOL(dquot_initialize);
EXPORT_SYMBOL(dquot_drop);
+EXPORT_SYMBOL(dquot_drop_locked);
EXPORT_SYMBOL(vfs_dq_drop);
+EXPORT_SYMBOL(dqget);
+EXPORT_SYMBOL(dqput);
+EXPORT_SYMBOL(dquot_is_cached);
EXPORT_SYMBOL(dquot_alloc_space);
EXPORT_SYMBOL(dquot_alloc_inode);
EXPORT_SYMBOL(dquot_free_space);
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index eb3dc4c7ac06..713834371229 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -275,18 +275,9 @@ static int ecryptfs_release(struct inode *inode, struct file *file)
static int
ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync)
{
- struct file *lower_file = ecryptfs_file_to_lower(file);
- struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
- struct inode *lower_inode = lower_dentry->d_inode;
- int rc = -EINVAL;
-
- if (lower_inode->i_fop->fsync) {
- mutex_lock(&lower_inode->i_mutex);
- rc = lower_inode->i_fop->fsync(lower_file, lower_dentry,
- datasync);
- mutex_unlock(&lower_inode->i_mutex);
- }
- return rc;
+ return vfs_fsync(ecryptfs_file_to_lower(file),
+ ecryptfs_dentry_to_lower(dentry),
+ datasync);
}
static int ecryptfs_fasync(int fd, struct file *file, int flag)
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 89209f00f9c7..0111906a8877 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -612,8 +612,7 @@ ecryptfs_readlink(struct dentry *dentry, char __user * buf, int bufsiz)
struct ecryptfs_crypt_stat *crypt_stat;
lower_dentry = ecryptfs_dentry_to_lower(dentry);
- if (!lower_dentry->d_inode->i_op ||
- !lower_dentry->d_inode->i_op->readlink) {
+ if (!lower_dentry->d_inode->i_op->readlink) {
rc = -EINVAL;
goto out;
}
@@ -673,10 +672,11 @@ static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd)
ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ "
"dentry->d_name.name = [%s]\n", dentry->d_name.name);
rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len);
- buf[rc] = '\0';
set_fs(old_fs);
if (rc < 0)
goto out_free;
+ else
+ buf[rc] = '\0';
rc = 0;
nd_set_link(nd, buf);
goto out;
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index 04d7b3fa1ac6..46cec2b69796 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -288,7 +288,7 @@ static int ecryptfs_write_begin(struct file *file,
loff_t prev_page_end_size;
int rc = 0;
- page = __grab_cache_page(mapping, index);
+ page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
*pagep = page;
diff --git a/fs/exec.c b/fs/exec.c
index 1f59ea079cbb..9c33f542dc77 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -51,17 +51,13 @@
#include <linux/audit.h>
#include <linux/tracehook.h>
#include <linux/kmod.h>
+#include <linux/fsnotify.h>
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/tlb.h>
#include "internal.h"
-#ifdef __alpha__
-/* for /sbin/loader handling in search_binary_handler() */
-#include <linux/a.out.h>
-#endif
-
int core_uses_pid;
char core_pattern[CORENAME_MAX_SIZE] = "core";
int suid_dumpable = 0;
@@ -127,7 +123,8 @@ asmlinkage long sys_uselib(const char __user * library)
if (nd.path.mnt->mnt_flags & MNT_NOEXEC)
goto exit;
- error = vfs_permission(&nd, MAY_READ | MAY_EXEC | MAY_OPEN);
+ error = inode_permission(nd.path.dentry->d_inode,
+ MAY_READ | MAY_EXEC | MAY_OPEN);
if (error)
goto exit;
@@ -136,6 +133,8 @@ asmlinkage long sys_uselib(const char __user * library)
if (IS_ERR(file))
goto out;
+ fsnotify_open(file->f_path.dentry);
+
error = -ENOEXEC;
if(file->f_op) {
struct linux_binfmt * fmt;
@@ -680,7 +679,7 @@ struct file *open_exec(const char *name)
if (nd.path.mnt->mnt_flags & MNT_NOEXEC)
goto out_path_put;
- err = vfs_permission(&nd, MAY_EXEC | MAY_OPEN);
+ err = inode_permission(nd.path.dentry->d_inode, MAY_EXEC | MAY_OPEN);
if (err)
goto out_path_put;
@@ -688,6 +687,8 @@ struct file *open_exec(const char *name)
if (IS_ERR(file))
return file;
+ fsnotify_open(file->f_path.dentry);
+
err = deny_write_access(file);
if (err) {
fput(file);
@@ -773,7 +774,6 @@ static int de_thread(struct task_struct *tsk)
struct signal_struct *sig = tsk->signal;
struct sighand_struct *oldsighand = tsk->sighand;
spinlock_t *lock = &oldsighand->siglock;
- struct task_struct *leader = NULL;
int count;
if (thread_group_empty(tsk))
@@ -811,7 +811,7 @@ static int de_thread(struct task_struct *tsk)
* and to assume its PID:
*/
if (!thread_group_leader(tsk)) {
- leader = tsk->group_leader;
+ struct task_struct *leader = tsk->group_leader;
sig->notify_count = -1; /* for exit_notify() */
for (;;) {
@@ -863,8 +863,9 @@ static int de_thread(struct task_struct *tsk)
BUG_ON(leader->exit_state != EXIT_ZOMBIE);
leader->exit_state = EXIT_DEAD;
-
write_unlock_irq(&tasklist_lock);
+
+ release_task(leader);
}
sig->group_exit_task = NULL;
@@ -873,8 +874,6 @@ static int de_thread(struct task_struct *tsk)
no_thread_group:
exit_itimers(sig);
flush_itimer_signals();
- if (leader)
- release_task(leader);
if (atomic_read(&oldsighand->count) != 1) {
struct sighand_struct *newsighand;
@@ -1173,41 +1172,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
unsigned int depth = bprm->recursion_depth;
int try,retval;
struct linux_binfmt *fmt;
-#ifdef __alpha__
- /* handle /sbin/loader.. */
- {
- struct exec * eh = (struct exec *) bprm->buf;
-
- if (!bprm->loader && eh->fh.f_magic == 0x183 &&
- (eh->fh.f_flags & 0x3000) == 0x3000)
- {
- struct file * file;
- unsigned long loader;
- allow_write_access(bprm->file);
- fput(bprm->file);
- bprm->file = NULL;
-
- loader = bprm->vma->vm_end - sizeof(void *);
-
- file = open_exec("/sbin/loader");
- retval = PTR_ERR(file);
- if (IS_ERR(file))
- return retval;
-
- /* Remember if the application is TASO. */
- bprm->taso = eh->ah.entry < 0x100000000UL;
-
- bprm->file = file;
- bprm->loader = loader;
- retval = prepare_binprm(bprm);
- if (retval<0)
- return retval;
- /* should call search_binary_handler recursively here,
- but it does not matter */
- }
- }
-#endif
retval = security_bprm_check(bprm);
if (retval)
return retval;
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 8d0add625870..c454d5db28a5 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -585,7 +585,10 @@ got:
spin_lock(&sbi->s_next_gen_lock);
inode->i_generation = sbi->s_next_generation++;
spin_unlock(&sbi->s_next_gen_lock);
- insert_inode_hash(inode);
+ if (insert_inode_locked(inode) < 0) {
+ err = -EINVAL;
+ goto fail_drop;
+ }
if (DQUOT_ALLOC_INODE(inode)) {
err = -EDQUOT;
@@ -612,6 +615,7 @@ fail_drop:
DQUOT_DROP(inode);
inode->i_flags |= S_NOQUOTA;
inode->i_nlink = 0;
+ unlock_new_inode(inode);
iput(inode);
return ERR_PTR(err);
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 7658b33e2653..02b39a5deb74 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -32,6 +32,7 @@
#include <linux/buffer_head.h>
#include <linux/mpage.h>
#include <linux/fiemap.h>
+#include <linux/namei.h>
#include "ext2.h"
#include "acl.h"
#include "xip.h"
@@ -1286,9 +1287,11 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
else
inode->i_mapping->a_ops = &ext2_aops;
} else if (S_ISLNK(inode->i_mode)) {
- if (ext2_inode_is_fast_symlink(inode))
+ if (ext2_inode_is_fast_symlink(inode)) {
inode->i_op = &ext2_fast_symlink_inode_operations;
- else {
+ nd_terminate_link(ei->i_data, inode->i_size,
+ sizeof(ei->i_data) - 1);
+ } else {
inode->i_op = &ext2_symlink_inode_operations;
if (test_opt(inode->i_sb, NOBH))
inode->i_mapping->a_ops = &ext2_nobh_aops;
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 2a747252ec12..90ea17998a73 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -41,9 +41,11 @@ static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode)
int err = ext2_add_link(dentry, inode);
if (!err) {
d_instantiate(dentry, inode);
+ unlock_new_inode(inode);
return 0;
}
inode_dec_link_count(inode);
+ unlock_new_inode(inode);
iput(inode);
return err;
}
@@ -170,6 +172,7 @@ out:
out_fail:
inode_dec_link_count(inode);
+ unlock_new_inode(inode);
iput (inode);
goto out;
}
@@ -178,6 +181,7 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
struct dentry *dentry)
{
struct inode *inode = old_dentry->d_inode;
+ int err;
if (inode->i_nlink >= EXT2_LINK_MAX)
return -EMLINK;
@@ -186,7 +190,14 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir,
inode_inc_link_count(inode);
atomic_inc(&inode->i_count);
- return ext2_add_nondir(dentry, inode);
+ err = ext2_add_link(dentry, inode);
+ if (!err) {
+ d_instantiate(dentry, inode);
+ return 0;
+ }
+ inode_dec_link_count(inode);
+ iput(inode);
+ return err;
}
static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)
@@ -222,12 +233,14 @@ static int ext2_mkdir(struct inode * dir, struct dentry * dentry, int mode)
goto out_fail;
d_instantiate(dentry, inode);
+ unlock_new_inode(inode);
out:
return err;
out_fail:
inode_dec_link_count(inode);
inode_dec_link_count(inode);
+ unlock_new_inode(inode);
iput(inode);
out_dir:
inode_dec_link_count(dir);
diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c
index 490bd0ed7896..5655fbcbd11f 100644
--- a/fs/ext3/ialloc.c
+++ b/fs/ext3/ialloc.c
@@ -579,7 +579,10 @@ got:
ext3_set_inode_flags(inode);
if (IS_DIRSYNC(inode))
handle->h_sync = 1;
- insert_inode_hash(inode);
+ if (insert_inode_locked(inode) < 0) {
+ err = -EINVAL;
+ goto fail_drop;
+ }
spin_lock(&sbi->s_next_gen_lock);
inode->i_generation = sbi->s_next_generation++;
spin_unlock(&sbi->s_next_gen_lock);
@@ -627,6 +630,7 @@ fail_drop:
DQUOT_DROP(inode);
inode->i_flags |= S_NOQUOTA;
inode->i_nlink = 0;
+ unlock_new_inode(inode);
iput(inode);
brelse(bitmap_bh);
return ERR_PTR(err);
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index f8424ad89971..5fa453b49a64 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -37,6 +37,7 @@
#include <linux/uio.h>
#include <linux/bio.h>
#include <linux/fiemap.h>
+#include <linux/namei.h>
#include "xattr.h"
#include "acl.h"
@@ -1160,7 +1161,7 @@ static int ext3_write_begin(struct file *file, struct address_space *mapping,
to = from + len;
retry:
- page = __grab_cache_page(mapping, index);
+ page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
*pagep = page;
@@ -2817,9 +2818,11 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino)
inode->i_op = &ext3_dir_inode_operations;
inode->i_fop = &ext3_dir_operations;
} else if (S_ISLNK(inode->i_mode)) {
- if (ext3_inode_is_fast_symlink(inode))
+ if (ext3_inode_is_fast_symlink(inode)) {
inode->i_op = &ext3_fast_symlink_inode_operations;
- else {
+ nd_terminate_link(ei->i_data, inode->i_size,
+ sizeof(ei->i_data) - 1);
+ } else {
inode->i_op = &ext3_symlink_inode_operations;
ext3_set_aops(inode);
}
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 3e5edc92aa0b..1dd2abe6313e 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1652,9 +1652,11 @@ static int ext3_add_nondir(handle_t *handle,
if (!err) {
ext3_mark_inode_dirty(handle, inode);
d_instantiate(dentry, inode);
+ unlock_new_inode(inode);
return 0;
}
drop_nlink(inode);
+ unlock_new_inode(inode);
iput(inode);
return err;
}
@@ -1765,6 +1767,7 @@ retry:
dir_block = ext3_bread (handle, inode, 0, 1, &err);
if (!dir_block) {
drop_nlink(inode); /* is this nlink == 0? */
+ unlock_new_inode(inode);
ext3_mark_inode_dirty(handle, inode);
iput (inode);
goto out_stop;
@@ -1792,6 +1795,7 @@ retry:
err = ext3_add_entry (handle, dentry, inode);
if (err) {
inode->i_nlink = 0;
+ unlock_new_inode(inode);
ext3_mark_inode_dirty(handle, inode);
iput (inode);
goto out_stop;
@@ -1800,6 +1804,7 @@ retry:
ext3_update_dx_flag(dir);
ext3_mark_inode_dirty(handle, dir);
d_instantiate(dentry, inode);
+ unlock_new_inode(inode);
out_stop:
ext3_journal_stop(handle);
if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
@@ -2170,10 +2175,10 @@ retry:
* We have a transaction open. All is sweetness. It also sets
* i_size in generic_commit_write().
*/
- err = __page_symlink(inode, symname, l,
- mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
+ err = __page_symlink(inode, symname, l, 1);
if (err) {
drop_nlink(inode);
+ unlock_new_inode(inode);
ext3_mark_inode_dirty(handle, inode);
iput (inode);
goto out_stop;
@@ -2221,7 +2226,14 @@ retry:
inc_nlink(inode);
atomic_inc(&inode->i_count);
- err = ext3_add_nondir(handle, dentry, inode);
+ err = ext3_add_entry(handle, dentry, inode);
+ if (!err) {
+ ext3_mark_inode_dirty(handle, inode);
+ d_instantiate(dentry, inode);
+ } else {
+ drop_nlink(inode);
+ iput(inode);
+ }
ext3_journal_stop(handle);
if (err == -ENOSPC && ext3_should_retry_alloc(dir->i_sb, &retries))
goto retry;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index f6c94f232ec1..c22d01467bd1 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -713,7 +713,9 @@ static struct dquot_operations ext3_quota_operations = {
.acquire_dquot = ext3_acquire_dquot,
.release_dquot = ext3_release_dquot,
.mark_dirty = ext3_mark_dquot_dirty,
- .write_info = ext3_write_info
+ .write_info = ext3_write_info,
+ .alloc_dquot = dquot_alloc,
+ .destroy_dquot = dquot_destroy,
};
static struct quotactl_ops ext3_qctl_operations = {
@@ -1035,8 +1037,7 @@ static int parse_options (char *options, struct super_block *sb,
case Opt_grpjquota:
qtype = GRPQUOTA;
set_qf_name:
- if ((sb_any_quota_enabled(sb) ||
- sb_any_quota_suspended(sb)) &&
+ if (sb_any_quota_loaded(sb) &&
!sbi->s_qf_names[qtype]) {
printk(KERN_ERR
"EXT3-fs: Cannot change journaled "
@@ -1075,8 +1076,7 @@ set_qf_name:
case Opt_offgrpjquota:
qtype = GRPQUOTA;
clear_qf_name:
- if ((sb_any_quota_enabled(sb) ||
- sb_any_quota_suspended(sb)) &&
+ if (sb_any_quota_loaded(sb) &&
sbi->s_qf_names[qtype]) {
printk(KERN_ERR "EXT3-fs: Cannot change "
"journaled quota options when "
@@ -1095,8 +1095,7 @@ clear_qf_name:
case Opt_jqfmt_vfsv0:
qfmt = QFMT_VFS_V0;
set_qf_format:
- if ((sb_any_quota_enabled(sb) ||
- sb_any_quota_suspended(sb)) &&
+ if (sb_any_quota_loaded(sb) &&
sbi->s_jquota_fmt != qfmt) {
printk(KERN_ERR "EXT3-fs: Cannot change "
"journaled quota options when "
@@ -1115,8 +1114,7 @@ set_qf_format:
set_opt(sbi->s_mount_opt, GRPQUOTA);
break;
case Opt_noquota:
- if (sb_any_quota_enabled(sb) ||
- sb_any_quota_suspended(sb)) {
+ if (sb_any_quota_loaded(sb)) {
printk(KERN_ERR "EXT3-fs: Cannot change quota "
"options when quota turned on.\n");
return 0;
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 445fde603df8..b21f16713db0 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -146,4 +146,10 @@ struct ext4_sb_info {
struct flex_groups *s_flex_groups;
};
+static inline spinlock_t *
+sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group)
+{ /* Return the spinlock that bgl_lock_ptr() selects for block_group within sbi->s_blockgroup_lock. */
+ return bgl_lock_ptr(&sbi->s_blockgroup_lock, block_group);
+}
+
#endif /* _EXT4_SB */
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 08cac9fcace2..6e6052879aa2 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -826,7 +826,10 @@ got:
ext4_set_inode_flags(inode);
if (IS_DIRSYNC(inode))
handle->h_sync = 1;
- insert_inode_hash(inode);
+ if (insert_inode_locked(inode) < 0) {
+ err = -EINVAL;
+ goto fail_drop;
+ }
spin_lock(&sbi->s_next_gen_lock);
inode->i_generation = sbi->s_next_generation++;
spin_unlock(&sbi->s_next_gen_lock);
@@ -881,6 +884,7 @@ fail_drop:
DQUOT_DROP(inode);
inode->i_flags |= S_NOQUOTA;
inode->i_nlink = 0;
+ unlock_new_inode(inode);
iput(inode);
brelse(bitmap_bh);
return ERR_PTR(err);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index be21a5ae33cb..6702a49992a6 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -34,6 +34,7 @@
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/mpage.h>
+#include <linux/namei.h>
#include <linux/uio.h>
#include <linux/bio.h>
#include "ext4_jbd2.h"
@@ -1345,7 +1346,7 @@ retry:
goto out;
}
- page = __grab_cache_page(mapping, index);
+ page = grab_cache_page_write_begin(mapping, index, flags);
if (!page) {
ext4_journal_stop(handle);
ret = -ENOMEM;
@@ -2549,7 +2550,7 @@ retry:
goto out;
}
- page = __grab_cache_page(mapping, index);
+ page = grab_cache_page_write_begin(mapping, index, flags);
if (!page) {
ext4_journal_stop(handle);
ret = -ENOMEM;
@@ -4164,9 +4165,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
inode->i_op = &ext4_dir_inode_operations;
inode->i_fop = &ext4_dir_operations;
} else if (S_ISLNK(inode->i_mode)) {
- if (ext4_inode_is_fast_symlink(inode))
+ if (ext4_inode_is_fast_symlink(inode)) {
inode->i_op = &ext4_fast_symlink_inode_operations;
- else {
+ nd_terminate_link(ei->i_data, inode->i_size,
+ sizeof(ei->i_data) - 1);
+ } else {
inode->i_op = &ext4_symlink_inode_operations;
ext4_set_aops(inode);
}
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 63adcb792988..9fd2a5e1be4d 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -1693,9 +1693,11 @@ static int ext4_add_nondir(handle_t *handle,
if (!err) {
ext4_mark_inode_dirty(handle, inode);
d_instantiate(dentry, inode);
+ unlock_new_inode(inode);
return 0;
}
drop_nlink(inode);
+ unlock_new_inode(inode);
iput(inode);
return err;
}
@@ -1830,6 +1832,7 @@ retry:
if (err) {
out_clear_inode:
clear_nlink(inode);
+ unlock_new_inode(inode);
ext4_mark_inode_dirty(handle, inode);
iput(inode);
goto out_stop;
@@ -1838,6 +1841,7 @@ out_clear_inode:
ext4_update_dx_flag(dir);
ext4_mark_inode_dirty(handle, dir);
d_instantiate(dentry, inode);
+ unlock_new_inode(inode);
out_stop:
ext4_journal_stop(handle);
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
@@ -2208,10 +2212,10 @@ retry:
* We have a transaction open. All is sweetness. It also sets
* i_size in generic_commit_write().
*/
- err = __page_symlink(inode, symname, l,
- mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
+ err = __page_symlink(inode, symname, l, 1);
if (err) {
clear_nlink(inode);
+ unlock_new_inode(inode);
ext4_mark_inode_dirty(handle, inode);
iput(inode);
goto out_stop;
@@ -2262,7 +2266,14 @@ retry:
ext4_inc_count(handle, inode);
atomic_inc(&inode->i_count);
- err = ext4_add_nondir(handle, dentry, inode);
+ err = ext4_add_entry(handle, dentry, inode);
+ if (!err) {
+ ext4_mark_inode_dirty(handle, inode);
+ d_instantiate(dentry, inode);
+ } else {
+ drop_nlink(inode);
+ iput(inode);
+ }
ext4_journal_stop(handle);
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e4a241c65dbe..9494bb249390 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -803,7 +803,9 @@ static struct dquot_operations ext4_quota_operations = {
.acquire_dquot = ext4_acquire_dquot,
.release_dquot = ext4_release_dquot,
.mark_dirty = ext4_mark_dquot_dirty,
- .write_info = ext4_write_info
+ .write_info = ext4_write_info,
+ .alloc_dquot = dquot_alloc,
+ .destroy_dquot = dquot_destroy,
};
static struct quotactl_ops ext4_qctl_operations = {
@@ -1142,8 +1144,7 @@ static int parse_options(char *options, struct super_block *sb,
case Opt_grpjquota:
qtype = GRPQUOTA;
set_qf_name:
- if ((sb_any_quota_enabled(sb) ||
- sb_any_quota_suspended(sb)) &&
+ if (sb_any_quota_loaded(sb) &&
!sbi->s_qf_names[qtype]) {
printk(KERN_ERR
"EXT4-fs: Cannot change journaled "
@@ -1182,8 +1183,7 @@ set_qf_name:
case Opt_offgrpjquota:
qtype = GRPQUOTA;
clear_qf_name:
- if ((sb_any_quota_enabled(sb) ||
- sb_any_quota_suspended(sb)) &&
+ if (sb_any_quota_loaded(sb) &&
sbi->s_qf_names[qtype]) {
printk(KERN_ERR "EXT4-fs: Cannot change "
"journaled quota options when "
@@ -1202,8 +1202,7 @@ clear_qf_name:
case Opt_jqfmt_vfsv0:
qfmt = QFMT_VFS_V0;
set_qf_format:
- if ((sb_any_quota_enabled(sb) ||
- sb_any_quota_suspended(sb)) &&
+ if (sb_any_quota_loaded(sb) &&
sbi->s_jquota_fmt != qfmt) {
printk(KERN_ERR "EXT4-fs: Cannot change "
"journaled quota options when "
@@ -1222,7 +1221,7 @@ set_qf_format:
set_opt(sbi->s_mount_opt, GRPQUOTA);
break;
case Opt_noquota:
- if (sb_any_quota_enabled(sb)) {
+ if (sb_any_quota_loaded(sb)) {
printk(KERN_ERR "EXT4-fs: Cannot change quota "
"options when quota turned on.\n");
return 0;
@@ -1721,7 +1720,7 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files)
/* small i_blocks in vfs inode? */
if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
/*
- * CONFIG_LSF is not enabled implies the inode
+ * CONFIG_LBD is not enabled implies the inode
* i_block represent total blocks in 512 bytes
* 32 == size of vfs inode i_blocks * 8
*/
@@ -1764,7 +1763,7 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
/*
- * !has_huge_files or CONFIG_LSF is not enabled
+ * !has_huge_files or CONFIG_LBD is not enabled
* implies the inode i_block represent total blocks in
* 512 bytes 32 == size of vfs inode i_blocks * 8
*/
@@ -2021,13 +2020,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
if (has_huge_files) {
/*
* Large file size enabled file system can only be
- * mount if kernel is build with CONFIG_LSF
+ * mount if kernel is build with CONFIG_LBD
*/
if (sizeof(root->i_blocks) < sizeof(u64) &&
!(sb->s_flags & MS_RDONLY)) {
printk(KERN_ERR "EXT4-fs: %s: Filesystem with huge "
"files cannot be mounted read-write "
- "without CONFIG_LSF.\n", sb->s_id);
+ "without CONFIG_LBD.\n", sb->s_id);
goto failed_mount;
}
}
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 67e058357098..3a7f603b6982 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -841,7 +841,6 @@ const struct file_operations fat_dir_operations = {
.compat_ioctl = fat_compat_dir_ioctl,
#endif
.fsync = file_fsync,
- .llseek = generic_file_llseek,
};
static int fat_get_short_entry(struct inode *dir, loff_t *pos,
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index d937aaf77374..6b74d09adbe5 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -749,6 +749,8 @@ static struct dentry *fat_get_parent(struct dentry *child)
brelse(bh);
parent = d_obtain_alias(inode);
+ if (!IS_ERR(parent))
+ parent->d_op = sb->s_root->d_op;
out:
unlock_super(sb);
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c
index bf326d4356a3..8ae32e37673c 100644
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -78,7 +78,7 @@ static int vfat_revalidate_ci(struct dentry *dentry, struct nameidata *nd)
* for creation.
*/
if (!(nd->flags & (LOOKUP_CONTINUE | LOOKUP_PARENT))) {
- if (nd->flags & LOOKUP_CREATE)
+ if (nd->flags & (LOOKUP_CREATE | LOOKUP_RENAME_TARGET))
return 0;
}
diff --git a/fs/file_table.c b/fs/file_table.c
index 0fbcacc3ea75..bbeeac6efa1a 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -32,6 +32,9 @@ struct files_stat_struct files_stat = {
/* public. Not pretty! */
__cacheline_aligned_in_smp DEFINE_SPINLOCK(files_lock);
+/* SLAB cache for file structures */
+static struct kmem_cache *filp_cachep __read_mostly;
+
static struct percpu_counter nr_files __cacheline_aligned_in_smp;
static inline void file_free_rcu(struct rcu_head *head)
@@ -397,7 +400,12 @@ too_bad:
void __init files_init(unsigned long mempages)
{
int n;
- /* One file with associated inode and dcache is very roughly 1K.
+
+ filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0,
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+
+ /*
+ * One file with associated inode and dcache is very roughly 1K.
* Per default don't use more than 10% of our memory for files.
*/
diff --git a/fs/filesystems.c b/fs/filesystems.c
index d0e20ced62dd..d488dcd7f2bb 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -253,24 +253,27 @@ static int __init proc_filesystems_init(void)
module_init(proc_filesystems_init);
#endif
-struct file_system_type *get_fs_type(const char *name)
+static struct file_system_type *__get_fs_type(const char *name, int len)
{
struct file_system_type *fs;
- const char *dot = strchr(name, '.');
- unsigned len = dot ? dot - name : strlen(name);
read_lock(&file_systems_lock);
fs = *(find_filesystem(name, len));
if (fs && !try_module_get(fs->owner))
fs = NULL;
read_unlock(&file_systems_lock);
- if (!fs && (request_module("%.*s", len, name) == 0)) {
- read_lock(&file_systems_lock);
- fs = *(find_filesystem(name, len));
- if (fs && !try_module_get(fs->owner))
- fs = NULL;
- read_unlock(&file_systems_lock);
- }
+ return fs;
+}
+
+struct file_system_type *get_fs_type(const char *name)
+{
+ struct file_system_type *fs;
+ const char *dot = strchr(name, '.');
+ int len = dot ? dot - name : strlen(name);
+
+ fs = __get_fs_type(name, len);
+ if (!fs && (request_module("%.*s", len, name) == 0))
+ fs = __get_fs_type(name, len);
if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) {
put_filesystem(fs);
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index 9f3f2ceb73f0..03a6ea5e99f7 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -325,8 +325,10 @@ vxfs_iget(struct super_block *sbp, ino_t ino)
if (!VXFS_ISIMMED(vip)) {
ip->i_op = &page_symlink_inode_operations;
ip->i_mapping->a_ops = &vxfs_aops;
- } else
+ } else {
ip->i_op = &vxfs_immed_symlink_iops;
+ vip->vii_immed.vi_immed[ip->i_size] = '\0';
+ }
} else
init_special_inode(ip, ip->i_mode, old_decode_dev(vip->vii_rdev));
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 34930a964b82..4c9ee7011265 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -646,7 +646,7 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping,
{
pgoff_t index = pos >> PAGE_CACHE_SHIFT;
- *pagep = __grab_cache_page(mapping, index);
+ *pagep = grab_cache_page_write_begin(mapping, index, flags);
if (!*pagep)
return -ENOMEM;
return 0;
@@ -779,7 +779,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
break;
err = -ENOMEM;
- page = __grab_cache_page(mapping, index);
+ page = grab_cache_page_write_begin(mapping, index, 0);
if (!page)
break;
diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c
index 27563816e1c5..15f710f2d4da 100644
--- a/fs/gfs2/ops_address.c
+++ b/fs/gfs2/ops_address.c
@@ -675,7 +675,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping,
goto out_trans_fail;
error = -ENOMEM;
- page = __grab_cache_page(mapping, index);
+ page = grab_cache_page_write_begin(mapping, index, flags);
*pagep = page;
if (unlikely(!page))
goto out_endtrans;
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 3a31451ac170..5c538e0ec14b 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -501,7 +501,7 @@ int hostfs_write_begin(struct file *file, struct address_space *mapping,
{
pgoff_t index = pos >> PAGE_CACHE_SHIFT;
- *pagep = __grab_cache_page(mapping, index);
+ *pagep = grab_cache_page_write_begin(mapping, index, flags);
if (!*pagep)
return -ENOMEM;
return 0;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 7d479ce3aceb..0ab0c6f5f438 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -506,7 +506,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid,
inode->i_mode = mode;
inode->i_uid = uid;
inode->i_gid = gid;
- inode->i_blocks = 0;
inode->i_mapping->a_ops = &hugetlbfs_aops;
inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/inode.c b/fs/inode.c
index 0487ddba1397..bd48e5e6d3e8 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -108,84 +108,102 @@ static void wake_up_inode(struct inode *inode)
wake_up_bit(&inode->i_state, __I_LOCK);
}
-static struct inode *alloc_inode(struct super_block *sb)
+/**
+ * inode_init_always - perform inode structure initialisation
+ * @sb: superblock inode belongs to.
+ * @inode: inode to initialise
+ *
+ * These are initializations that need to be done on every inode
+ * allocation as the fields are not initialised by slab allocation.
+ */
+struct inode *inode_init_always(struct super_block *sb, struct inode *inode)
{
static const struct address_space_operations empty_aops;
static struct inode_operations empty_iops;
static const struct file_operations empty_fops;
- struct inode *inode;
- if (sb->s_op->alloc_inode)
- inode = sb->s_op->alloc_inode(sb);
- else
- inode = (struct inode *) kmem_cache_alloc(inode_cachep, GFP_KERNEL);
-
- if (inode) {
- struct address_space * const mapping = &inode->i_data;
-
- inode->i_sb = sb;
- inode->i_blkbits = sb->s_blocksize_bits;
- inode->i_flags = 0;
- atomic_set(&inode->i_count, 1);
- inode->i_op = &empty_iops;
- inode->i_fop = &empty_fops;
- inode->i_nlink = 1;
- atomic_set(&inode->i_writecount, 0);
- inode->i_size = 0;
- inode->i_blocks = 0;
- inode->i_bytes = 0;
- inode->i_generation = 0;
+ struct address_space * const mapping = &inode->i_data;
+
+ inode->i_sb = sb;
+ inode->i_blkbits = sb->s_blocksize_bits;
+ inode->i_flags = 0;
+ atomic_set(&inode->i_count, 1);
+ inode->i_op = &empty_iops;
+ inode->i_fop = &empty_fops;
+ inode->i_nlink = 1;
+ inode->i_uid = 0;
+ inode->i_gid = 0;
+ atomic_set(&inode->i_writecount, 0);
+ inode->i_size = 0;
+ inode->i_blocks = 0;
+ inode->i_bytes = 0;
+ inode->i_generation = 0;
#ifdef CONFIG_QUOTA
- memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
+ memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
#endif
- inode->i_pipe = NULL;
- inode->i_bdev = NULL;
- inode->i_cdev = NULL;
- inode->i_rdev = 0;
- inode->dirtied_when = 0;
- if (security_inode_alloc(inode)) {
- if (inode->i_sb->s_op->destroy_inode)
- inode->i_sb->s_op->destroy_inode(inode);
- else
- kmem_cache_free(inode_cachep, (inode));
- return NULL;
- }
+ inode->i_pipe = NULL;
+ inode->i_bdev = NULL;
+ inode->i_cdev = NULL;
+ inode->i_rdev = 0;
+ inode->dirtied_when = 0;
+ if (security_inode_alloc(inode)) {
+ if (inode->i_sb->s_op->destroy_inode)
+ inode->i_sb->s_op->destroy_inode(inode);
+ else
+ kmem_cache_free(inode_cachep, (inode));
+ return NULL;
+ }
- spin_lock_init(&inode->i_lock);
- lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
+ spin_lock_init(&inode->i_lock);
+ lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
- mutex_init(&inode->i_mutex);
- lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key);
+ mutex_init(&inode->i_mutex);
+ lockdep_set_class(&inode->i_mutex, &sb->s_type->i_mutex_key);
- init_rwsem(&inode->i_alloc_sem);
- lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key);
+ init_rwsem(&inode->i_alloc_sem);
+ lockdep_set_class(&inode->i_alloc_sem, &sb->s_type->i_alloc_sem_key);
- mapping->a_ops = &empty_aops;
- mapping->host = inode;
- mapping->flags = 0;
- mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE);
- mapping->assoc_mapping = NULL;
- mapping->backing_dev_info = &default_backing_dev_info;
- mapping->writeback_index = 0;
+ mapping->a_ops = &empty_aops;
+ mapping->host = inode;
+ mapping->flags = 0;
+ mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE);
+ mapping->assoc_mapping = NULL;
+ mapping->backing_dev_info = &default_backing_dev_info;
+ mapping->writeback_index = 0;
- /*
- * If the block_device provides a backing_dev_info for client
- * inodes then use that. Otherwise the inode share the bdev's
- * backing_dev_info.
- */
- if (sb->s_bdev) {
- struct backing_dev_info *bdi;
+ /*
+ * If the block_device provides a backing_dev_info for client
+ * inodes then use that. Otherwise the inode share the bdev's
+ * backing_dev_info.
+ */
+ if (sb->s_bdev) {
+ struct backing_dev_info *bdi;
- bdi = sb->s_bdev->bd_inode_backing_dev_info;
- if (!bdi)
- bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
- mapping->backing_dev_info = bdi;
- }
- inode->i_private = NULL;
- inode->i_mapping = mapping;
+ bdi = sb->s_bdev->bd_inode_backing_dev_info;
+ if (!bdi)
+ bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
+ mapping->backing_dev_info = bdi;
}
+ inode->i_private = NULL;
+ inode->i_mapping = mapping;
+
return inode;
}
+EXPORT_SYMBOL(inode_init_always);
+
+static struct inode *alloc_inode(struct super_block *sb)
+{
+ struct inode *inode;
+
+ if (sb->s_op->alloc_inode)
+ inode = sb->s_op->alloc_inode(sb);
+ else
+ inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL);
+
+ if (inode)
+ return inode_init_always(sb, inode);
+ return NULL;
+}
void destroy_inode(struct inode *inode)
{
@@ -196,6 +214,7 @@ void destroy_inode(struct inode *inode)
else
kmem_cache_free(inode_cachep, (inode));
}
+EXPORT_SYMBOL(destroy_inode);
/*
@@ -534,6 +553,49 @@ repeat:
return node ? inode : NULL;
}
+static unsigned long hash(struct super_block *sb, unsigned long hashval)
+{
+ unsigned long tmp;
+
+ tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
+ L1_CACHE_BYTES;
+ tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
+ return tmp & I_HASHMASK;
+}
+
+static inline void
+__inode_add_to_lists(struct super_block *sb, struct hlist_head *head,
+ struct inode *inode)
+{
+ inodes_stat.nr_inodes++;
+ list_add(&inode->i_list, &inode_in_use);
+ list_add(&inode->i_sb_list, &sb->s_inodes);
+ if (head)
+ hlist_add_head(&inode->i_hash, head);
+}
+
+/**
+ * inode_add_to_lists - add a new inode to relevant lists
+ * @sb: superblock inode belongs to.
+ * @inode: inode to mark in use
+ *
+ * When an inode is allocated it needs to be accounted for, added to the in use
+ * list, the owning superblock and the inode hash. This needs to be done under
+ * the inode_lock, so export a function to do this rather than the inode lock
+ * itself. We calculate the hash list to add to here so it is all internal
+ * which requires the caller to have already set up the inode number in the
+ * inode to add.
+ */
+void inode_add_to_lists(struct super_block *sb, struct inode *inode)
+{
+ struct hlist_head *head = inode_hashtable + hash(sb, inode->i_ino);
+
+ spin_lock(&inode_lock);
+ __inode_add_to_lists(sb, head, inode);
+ spin_unlock(&inode_lock);
+}
+EXPORT_SYMBOL_GPL(inode_add_to_lists);
+
/**
* new_inode - obtain an inode
* @sb: superblock
@@ -561,9 +623,7 @@ struct inode *new_inode(struct super_block *sb)
inode = alloc_inode(sb);
if (inode) {
spin_lock(&inode_lock);
- inodes_stat.nr_inodes++;
- list_add(&inode->i_list, &inode_in_use);
- list_add(&inode->i_sb_list, &sb->s_inodes);
+ __inode_add_to_lists(sb, NULL, inode);
inode->i_ino = ++last_ino;
inode->i_state = 0;
spin_unlock(&inode_lock);
@@ -622,10 +682,7 @@ static struct inode * get_new_inode(struct super_block *sb, struct hlist_head *h
if (set(inode, data))
goto set_failed;
- inodes_stat.nr_inodes++;
- list_add(&inode->i_list, &inode_in_use);
- list_add(&inode->i_sb_list, &sb->s_inodes);
- hlist_add_head(&inode->i_hash, head);
+ __inode_add_to_lists(sb, head, inode);
inode->i_state = I_LOCK|I_NEW;
spin_unlock(&inode_lock);
@@ -671,10 +728,7 @@ static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_he
old = find_inode_fast(sb, head, ino);
if (!old) {
inode->i_ino = ino;
- inodes_stat.nr_inodes++;
- list_add(&inode->i_list, &inode_in_use);
- list_add(&inode->i_sb_list, &sb->s_inodes);
- hlist_add_head(&inode->i_hash, head);
+ __inode_add_to_lists(sb, head, inode);
inode->i_state = I_LOCK|I_NEW;
spin_unlock(&inode_lock);
@@ -698,16 +752,6 @@ static struct inode * get_new_inode_fast(struct super_block *sb, struct hlist_he
return inode;
}
-static unsigned long hash(struct super_block *sb, unsigned long hashval)
-{
- unsigned long tmp;
-
- tmp = (hashval * (unsigned long)sb) ^ (GOLDEN_RATIO_PRIME + hashval) /
- L1_CACHE_BYTES;
- tmp = tmp ^ ((tmp ^ GOLDEN_RATIO_PRIME) >> I_HASHBITS);
- return tmp & I_HASHMASK;
-}
-
/**
* iunique - get a unique inode number
* @sb: superblock
@@ -990,6 +1034,65 @@ struct inode *iget_locked(struct super_block *sb, unsigned long ino)
EXPORT_SYMBOL(iget_locked);
+int insert_inode_locked(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+ ino_t ino = inode->i_ino;
+ struct hlist_head *head = inode_hashtable + hash(sb, ino);
+ struct inode *old;
+
+ inode->i_state |= I_LOCK|I_NEW;
+ while (1) {
+ spin_lock(&inode_lock);
+ old = find_inode_fast(sb, head, ino);
+ if (likely(!old)) {
+ hlist_add_head(&inode->i_hash, head);
+ spin_unlock(&inode_lock);
+ return 0;
+ }
+ __iget(old);
+ spin_unlock(&inode_lock);
+ wait_on_inode(old);
+ if (unlikely(!hlist_unhashed(&old->i_hash))) {
+ iput(old);
+ return -EBUSY;
+ }
+ iput(old);
+ }
+}
+
+EXPORT_SYMBOL(insert_inode_locked);
+
+int insert_inode_locked4(struct inode *inode, unsigned long hashval,
+ int (*test)(struct inode *, void *), void *data)
+{
+ struct super_block *sb = inode->i_sb;
+ struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+ struct inode *old;
+
+ inode->i_state |= I_LOCK|I_NEW;
+
+ while (1) {
+ spin_lock(&inode_lock);
+ old = find_inode(sb, head, test, data);
+ if (likely(!old)) {
+ hlist_add_head(&inode->i_hash, head);
+ spin_unlock(&inode_lock);
+ return 0;
+ }
+ __iget(old);
+ spin_unlock(&inode_lock);
+ wait_on_inode(old);
+ if (unlikely(!hlist_unhashed(&old->i_hash))) {
+ iput(old);
+ return -EBUSY;
+ }
+ iput(old);
+ }
+}
+
+EXPORT_SYMBOL(insert_inode_locked4);
+
/**
* __insert_inode_hash - hash an inode
* @inode: unhashed inode
@@ -1292,6 +1395,7 @@ int inode_wait(void *word)
schedule();
return 0;
}
+EXPORT_SYMBOL(inode_wait);
/*
* If we try to find an inode in the inode hash while it is being
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 3f8af0f1505b..6147ec3643a0 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -855,10 +855,6 @@ root_found:
}
sbi->s_joliet_level = joliet_level;
- /* check the root inode */
- if (!inode->i_op)
- goto out_bad_root;
-
/* Make sure the root inode is a directory */
if (!S_ISDIR(inode->i_mode)) {
printk(KERN_WARNING
@@ -886,8 +882,6 @@ root_found:
/*
* Display error messages and free resources.
*/
-out_bad_root:
- printk(KERN_WARNING "%s: root inode not initialized\n", __func__);
out_iput:
iput(inode);
goto out_no_inode;
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index ebc667bc54a8..c8a1bace685a 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -509,6 +509,10 @@ void jbd2_journal_commit_transaction(journal_t *journal)
if (is_journal_aborted(journal)) {
clear_buffer_jbddirty(jh2bh(jh));
JBUFFER_TRACE(jh, "journal is aborting: refile");
+ jbd2_buffer_abort_trigger(jh,
+ jh->b_frozen_data ?
+ jh->b_frozen_triggers :
+ jh->b_triggers);
jbd2_journal_refile_buffer(journal, jh);
/* If that was the last one, we need to clean up
* any descriptor buffers which may have been
@@ -844,6 +848,9 @@ restart_loop:
* data.
*
* Otherwise, we can just throw away the frozen data now.
+ *
+ * We also know that the frozen data has already fired
+ * its triggers if they exist, so we can clear that too.
*/
if (jh->b_committed_data) {
jbd2_free(jh->b_committed_data, bh->b_size);
@@ -851,10 +858,12 @@ restart_loop:
if (jh->b_frozen_data) {
jh->b_committed_data = jh->b_frozen_data;
jh->b_frozen_data = NULL;
+ jh->b_frozen_triggers = NULL;
}
} else if (jh->b_frozen_data) {
jbd2_free(jh->b_frozen_data, bh->b_size);
jh->b_frozen_data = NULL;
+ jh->b_frozen_triggers = NULL;
}
spin_lock(&journal->j_list_lock);
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index e70d657a19f8..f6bff9d6f8df 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -50,6 +50,7 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates);
EXPORT_SYMBOL(jbd2_journal_get_write_access);
EXPORT_SYMBOL(jbd2_journal_get_create_access);
EXPORT_SYMBOL(jbd2_journal_get_undo_access);
+EXPORT_SYMBOL(jbd2_journal_set_triggers);
EXPORT_SYMBOL(jbd2_journal_dirty_metadata);
EXPORT_SYMBOL(jbd2_journal_release_buffer);
EXPORT_SYMBOL(jbd2_journal_forget);
@@ -290,6 +291,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction,
struct page *new_page;
unsigned int new_offset;
struct buffer_head *bh_in = jh2bh(jh_in);
+ struct jbd2_buffer_trigger_type *triggers;
/*
* The buffer really shouldn't be locked: only the current committing
@@ -314,13 +316,23 @@ repeat:
done_copy_out = 1;
new_page = virt_to_page(jh_in->b_frozen_data);
new_offset = offset_in_page(jh_in->b_frozen_data);
+ triggers = jh_in->b_frozen_triggers;
} else {
new_page = jh2bh(jh_in)->b_page;
new_offset = offset_in_page(jh2bh(jh_in)->b_data);
+ triggers = jh_in->b_triggers;
}
mapped_data = kmap_atomic(new_page, KM_USER0);
/*
+ * Fire any commit trigger. Do this before checking for escaping,
+ * as the trigger may modify the magic offset. If a copy-out
+ * happens afterwards, it will have the correct data in the buffer.
+ */
+ jbd2_buffer_commit_trigger(jh_in, mapped_data + new_offset,
+ triggers);
+
+ /*
* Check for escaping
*/
if (*((__be32 *)(mapped_data + new_offset)) ==
@@ -352,6 +364,13 @@ repeat:
new_page = virt_to_page(tmp);
new_offset = offset_in_page(tmp);
done_copy_out = 1;
+
+ /*
+ * This isn't strictly necessary, as we're using frozen
+ * data for the escaping, but it keeps consistency with
+ * b_frozen_data usage.
+ */
+ jh_in->b_frozen_triggers = jh_in->b_triggers;
}
/*
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 39b7805a599a..4f925a4f3d05 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -741,6 +741,12 @@ done:
source = kmap_atomic(page, KM_USER0);
memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size);
kunmap_atomic(source, KM_USER0);
+
+ /*
+ * Now that the frozen data is saved off, we need to store
+ * any matching triggers.
+ */
+ jh->b_frozen_triggers = jh->b_triggers;
}
jbd_unlock_bh_state(bh);
@@ -944,6 +950,47 @@ out:
}
/**
+ * void jbd2_journal_set_triggers() - Add triggers for commit writeout
+ * @bh: buffer to trigger on
+ * @type: struct jbd2_buffer_trigger_type containing the trigger(s).
+ *
+ * Set any triggers on this journal_head. This is always safe, because
+ * triggers for a committing buffer will be saved off, and triggers for
+ * a running transaction will match the buffer in that transaction.
+ *
+ * Call with NULL to clear the triggers.
+ */
+void jbd2_journal_set_triggers(struct buffer_head *bh,
+ struct jbd2_buffer_trigger_type *type)
+{
+ struct journal_head *jh = bh2jh(bh);
+
+ jh->b_triggers = type;
+}
+
+void jbd2_buffer_commit_trigger(struct journal_head *jh, void *mapped_data,
+ struct jbd2_buffer_trigger_type *triggers)
+{
+ struct buffer_head *bh = jh2bh(jh);
+
+ if (!triggers || !triggers->t_commit)
+ return;
+
+ triggers->t_commit(triggers, bh, mapped_data, bh->b_size);
+}
+
+void jbd2_buffer_abort_trigger(struct journal_head *jh,
+ struct jbd2_buffer_trigger_type *triggers)
+{
+ if (!triggers || !triggers->t_abort)
+ return;
+
+ triggers->t_abort(triggers, jh2bh(jh));
+}
+
+
+
+/**
* int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata
* @handle: transaction to add buffer to.
* @bh: buffer to mark
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 5a98aa87c853..5edc2bf20581 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -132,7 +132,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
uint32_t pageofs = index << PAGE_CACHE_SHIFT;
int ret = 0;
- pg = __grab_cache_page(mapping, index);
+ pg = grab_cache_page_write_begin(mapping, index, flags);
if (!pg)
return -ENOMEM;
*pagep = pg;
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 210339784b56..b00ee9f05a06 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -59,8 +59,14 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino)
if (inode->i_size >= IDATASIZE) {
inode->i_op = &page_symlink_inode_operations;
inode->i_mapping->a_ops = &jfs_aops;
- } else
+ } else {
inode->i_op = &jfs_symlink_inode_operations;
+ /*
+ * The inline data should be null-terminated, but
+ * don't let on-disk corruption crash the kernel
+ */
+ JFS_IP(inode)->i_inline[inode->i_size] = '\0';
+ }
} else {
inode->i_op = &jfs_file_inode_operations;
init_special_inode(inode, inode->i_mode, inode->i_rdev);
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index d6363d8309d0..0f94381ca6d0 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -58,9 +58,9 @@
/*
* __mark_inode_dirty expects inodes to be hashed. Since we don't want
- * special inodes in the fileset inode space, we hash them to a dummy head
+ * special inodes in the fileset inode space, we make them appear hashed,
+ * but do not put them on any lists.
*/
-static HLIST_HEAD(aggregate_hash);
/*
* imap locks
@@ -496,7 +496,11 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary)
/* release the page */
release_metapage(mp);
- hlist_add_head(&ip->i_hash, &aggregate_hash);
+ /*
+ * Pointing pprev at our own next field makes the inode look hashed
+ * without being on any list; hlist_del() will work fine and require no locking.
+ */
+ ip->i_hash.pprev = &ip->i_hash.next;
return (ip);
}
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 70022fd1c539..d4d142c2edd4 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -79,7 +79,8 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
inode = new_inode(sb);
if (!inode) {
jfs_warn("ialloc: new_inode returned NULL!");
- return ERR_PTR(-ENOMEM);
+ rc = -ENOMEM;
+ goto fail;
}
jfs_inode = JFS_IP(inode);
@@ -89,8 +90,12 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
jfs_warn("ialloc: diAlloc returned %d!", rc);
if (rc == -EIO)
make_bad_inode(inode);
- iput(inode);
- return ERR_PTR(rc);
+ goto fail_put;
+ }
+
+ if (insert_inode_locked(inode) < 0) {
+ rc = -EINVAL;
+ goto fail_unlock;
}
inode->i_uid = current_fsuid();
@@ -112,11 +117,8 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
* Allocate inode to quota.
*/
if (DQUOT_ALLOC_INODE(inode)) {
- DQUOT_DROP(inode);
- inode->i_flags |= S_NOQUOTA;
- inode->i_nlink = 0;
- iput(inode);
- return ERR_PTR(-EDQUOT);
+ rc = -EDQUOT;
+ goto fail_drop;
}
inode->i_mode = mode;
@@ -158,4 +160,15 @@ struct inode *ialloc(struct inode *parent, umode_t mode)
jfs_info("ialloc returns inode = 0x%p\n", inode);
return inode;
+
+fail_drop:
+ DQUOT_DROP(inode);
+ inode->i_flags |= S_NOQUOTA;
+fail_unlock:
+ inode->i_nlink = 0;
+ unlock_new_inode(inode);
+fail_put:
+ iput(inode);
+fail:
+ return ERR_PTR(rc);
}
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index cc3cedffbfa1..b4de56b851e4 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -155,7 +155,6 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
ip->i_fop = &jfs_file_operations;
ip->i_mapping->a_ops = &jfs_aops;
- insert_inode_hash(ip);
mark_inode_dirty(ip);
dip->i_ctime = dip->i_mtime = CURRENT_TIME;
@@ -171,9 +170,12 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode,
if (rc) {
free_ea_wmap(ip);
ip->i_nlink = 0;
+ unlock_new_inode(ip);
iput(ip);
- } else
+ } else {
d_instantiate(dentry, ip);
+ unlock_new_inode(ip);
+ }
out2:
free_UCSname(&dname);
@@ -289,7 +291,6 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
ip->i_op = &jfs_dir_inode_operations;
ip->i_fop = &jfs_dir_operations;
- insert_inode_hash(ip);
mark_inode_dirty(ip);
/* update parent directory inode */
@@ -306,9 +307,12 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode)
if (rc) {
free_ea_wmap(ip);
ip->i_nlink = 0;
+ unlock_new_inode(ip);
iput(ip);
- } else
+ } else {
d_instantiate(dentry, ip);
+ unlock_new_inode(ip);
+ }
out2:
free_UCSname(&dname);
@@ -1019,7 +1023,6 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
goto out3;
}
- insert_inode_hash(ip);
mark_inode_dirty(ip);
dip->i_ctime = dip->i_mtime = CURRENT_TIME;
@@ -1039,9 +1042,12 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
if (rc) {
free_ea_wmap(ip);
ip->i_nlink = 0;
+ unlock_new_inode(ip);
iput(ip);
- } else
+ } else {
d_instantiate(dentry, ip);
+ unlock_new_inode(ip);
+ }
out2:
free_UCSname(&dname);
@@ -1399,7 +1405,6 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
jfs_ip->dev = new_encode_dev(rdev);
init_special_inode(ip, ip->i_mode, rdev);
- insert_inode_hash(ip);
mark_inode_dirty(ip);
dir->i_ctime = dir->i_mtime = CURRENT_TIME;
@@ -1417,9 +1422,12 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry,
if (rc) {
free_ea_wmap(ip);
ip->i_nlink = 0;
+ unlock_new_inode(ip);
iput(ip);
- } else
+ } else {
d_instantiate(dentry, ip);
+ unlock_new_inode(ip);
+ }
out1:
free_UCSname(&dname);
diff --git a/fs/libfs.c b/fs/libfs.c
index e960a8321902..49b44099dabb 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -231,7 +231,6 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
*/
root->i_ino = 1;
root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR;
- root->i_uid = root->i_gid = 0;
root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME;
dentry = d_alloc(NULL, &d_name);
if (!dentry) {
@@ -360,7 +359,7 @@ int simple_write_begin(struct file *file, struct address_space *mapping,
index = pos >> PAGE_CACHE_SHIFT;
from = pos & (PAGE_CACHE_SIZE - 1);
- page = __grab_cache_page(mapping, index);
+ page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
@@ -436,8 +435,6 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files
*/
inode->i_ino = 1;
inode->i_mode = S_IFDIR | 0755;
- inode->i_uid = inode->i_gid = 0;
- inode->i_blocks = 0;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
inode->i_op = &simple_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
@@ -464,8 +461,6 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files
if (!inode)
goto out;
inode->i_mode = S_IFREG | files->mode;
- inode->i_uid = inode->i_gid = 0;
- inode->i_blocks = 0;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
inode->i_fop = files->ops;
inode->i_ino = i;
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index 8307dd64bf46..1f3b0fc0d351 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -14,6 +14,7 @@
#include <linux/sunrpc/svc.h>
#include <linux/lockd/lockd.h>
#include <linux/smp_lock.h>
+#include <linux/kthread.h>
#define NLMDBG_FACILITY NLMDBG_CLIENT
@@ -60,7 +61,7 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init)
host = nlmclnt_lookup_host(nlm_init->address, nlm_init->addrlen,
nlm_init->protocol, nlm_version,
- nlm_init->hostname);
+ nlm_init->hostname, nlm_init->noresvport);
if (host == NULL) {
lockd_down();
return ERR_PTR(-ENOLCK);
@@ -191,11 +192,15 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
void
nlmclnt_recovery(struct nlm_host *host)
{
+ struct task_struct *task;
+
if (!host->h_reclaiming++) {
nlm_get_host(host);
- __module_get(THIS_MODULE);
- if (kernel_thread(reclaimer, host, CLONE_FS | CLONE_FILES) < 0)
- module_put(THIS_MODULE);
+ task = kthread_run(reclaimer, host, "%s-reclaim", host->h_name);
+ if (IS_ERR(task))
+ printk(KERN_ERR "lockd: unable to spawn reclaimer "
+ "thread. Locks for %s won't be reclaimed! "
+ "(%ld)\n", host->h_name, PTR_ERR(task));
}
}
@@ -207,7 +212,6 @@ reclaimer(void *ptr)
struct file_lock *fl, *next;
u32 nsmstate;
- daemonize("%s-reclaim", host->h_name);
allow_signal(SIGKILL);
down_write(&host->h_rwsem);
@@ -233,7 +237,12 @@ restart:
list_for_each_entry_safe(fl, next, &host->h_reclaim, fl_u.nfs_fl.list) {
list_del_init(&fl->fl_u.nfs_fl.list);
- /* Why are we leaking memory here? --okir */
+ /*
+ * sending this thread a SIGKILL will result in any unreclaimed
+ * locks being removed from the h_granted list. This means that
+ * the kernel will not attempt to reclaim them again if a new
+ * reclaimer thread is spawned for this host.
+ */
if (signalled())
continue;
if (nlmclnt_reclaim(host, fl) != 0)
@@ -261,5 +270,5 @@ restart:
nlm_release_host(host);
lockd_down();
unlock_kernel();
- module_put_and_exit(0);
+ return 0;
}
diff --git a/fs/lockd/host.c b/fs/lockd/host.c
index e05d04416037..abdebf76b820 100644
--- a/fs/lockd/host.c
+++ b/fs/lockd/host.c
@@ -48,6 +48,7 @@ struct nlm_lookup_host_info {
const size_t hostname_len; /* it's length */
const struct sockaddr *src_sap; /* our address (optional) */
const size_t src_len; /* it's length */
+ const int noresvport; /* use non-priv port */
};
/*
@@ -222,6 +223,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni)
host->h_nsmstate = 0; /* real NSM state */
host->h_nsmhandle = nsm;
host->h_server = ni->server;
+ host->h_noresvport = ni->noresvport;
hlist_add_head(&host->h_hash, chain);
INIT_LIST_HEAD(&host->h_lockowners);
spin_lock_init(&host->h_lock);
@@ -272,6 +274,7 @@ nlm_destroy_host(struct nlm_host *host)
* @protocol: transport protocol to use
* @version: NLM protocol version
* @hostname: '\0'-terminated hostname of server
+ * @noresvport: 1 if non-privileged port should be used
*
* Returns an nlm_host structure that matches the passed-in
* [server address, transport protocol, NLM version, server hostname].
@@ -281,7 +284,9 @@ nlm_destroy_host(struct nlm_host *host)
struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
const size_t salen,
const unsigned short protocol,
- const u32 version, const char *hostname)
+ const u32 version,
+ const char *hostname,
+ int noresvport)
{
const struct sockaddr source = {
.sa_family = AF_UNSPEC,
@@ -296,6 +301,7 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap,
.hostname_len = strlen(hostname),
.src_sap = &source,
.src_len = sizeof(source),
+ .noresvport = noresvport,
};
dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__,
@@ -417,6 +423,8 @@ nlm_bind_host(struct nlm_host *host)
*/
if (!host->h_server)
args.flags |= RPC_CLNT_CREATE_HARDRTRY;
+ if (host->h_noresvport)
+ args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
clnt = rpc_create(&args);
if (!IS_ERR(clnt))
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 56b076736b56..252d80163d02 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -45,7 +45,7 @@
static struct svc_program nlmsvc_program;
struct nlmsvc_binding * nlmsvc_ops;
-EXPORT_SYMBOL(nlmsvc_ops);
+EXPORT_SYMBOL_GPL(nlmsvc_ops);
static DEFINE_MUTEX(nlmsvc_mutex);
static unsigned int nlmsvc_users;
@@ -300,7 +300,7 @@ out:
mutex_unlock(&nlmsvc_mutex);
return error;
}
-EXPORT_SYMBOL(lockd_up);
+EXPORT_SYMBOL_GPL(lockd_up);
/*
* Decrement the user count and bring down lockd if we're the last.
@@ -329,7 +329,7 @@ lockd_down(void)
out:
mutex_unlock(&nlmsvc_mutex);
}
-EXPORT_SYMBOL(lockd_down);
+EXPORT_SYMBOL_GPL(lockd_down);
#ifdef CONFIG_SYSCTL
diff --git a/fs/namei.c b/fs/namei.c
index af3783fff1de..f05bed242422 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -226,6 +226,16 @@ int generic_permission(struct inode *inode, int mask,
return -EACCES;
}
+/**
+ * inode_permission - check for access rights to a given inode
+ * @inode: inode to check permission on
+ * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
+ *
+ * Used to check for read/write/execute permissions on an inode.
+ * We use "fsuid" for this, letting us set arbitrary permissions
+ * for filesystem access without changing the "normal" uids which
+ * are used for other things.
+ */
int inode_permission(struct inode *inode, int mask)
{
int retval;
@@ -247,8 +257,7 @@ int inode_permission(struct inode *inode, int mask)
return -EACCES;
}
- /* Ordinary permission routines do not understand MAY_APPEND. */
- if (inode->i_op && inode->i_op->permission)
+ if (inode->i_op->permission)
retval = inode->i_op->permission(inode, mask);
else
retval = generic_permission(inode, mask, NULL);
@@ -265,21 +274,6 @@ int inode_permission(struct inode *inode, int mask)
}
/**
- * vfs_permission - check for access rights to a given path
- * @nd: lookup result that describes the path
- * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
- *
- * Used to check for read/write/execute permissions on a path.
- * We use "fsuid" for this, letting us set arbitrary permissions
- * for filesystem access without changing the "normal" uids which
- * are used for other things.
- */
-int vfs_permission(struct nameidata *nd, int mask)
-{
- return inode_permission(nd->path.dentry->d_inode, mask);
-}
-
-/**
* file_permission - check for additional access rights to a given file
* @file: file to check access rights for
* @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC)
@@ -289,7 +283,7 @@ int vfs_permission(struct nameidata *nd, int mask)
*
* Note:
* Do not use this function in new code. All access checks should
- * be done using vfs_permission().
+ * be done using inode_permission().
*/
int file_permission(struct file *file, int mask)
{
@@ -438,7 +432,7 @@ static int exec_permission_lite(struct inode *inode)
{
umode_t mode = inode->i_mode;
- if (inode->i_op && inode->i_op->permission)
+ if (inode->i_op->permission)
return -EAGAIN;
if (current_fsuid() == inode->i_uid)
@@ -527,18 +521,6 @@ out_unlock:
return result;
}
-/* SMP-safe */
-static __always_inline void
-walk_init_root(const char *name, struct nameidata *nd)
-{
- struct fs_struct *fs = current->fs;
-
- read_lock(&fs->lock);
- nd->path = fs->root;
- path_get(&fs->root);
- read_unlock(&fs->lock);
-}
-
/*
* Wrapper to retry pathname resolution whenever the underlying
* file system returns an ESTALE.
@@ -576,9 +558,16 @@ static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *l
goto fail;
if (*link == '/') {
+ struct fs_struct *fs = current->fs;
+
path_put(&nd->path);
- walk_init_root(link, nd);
+
+ read_lock(&fs->lock);
+ nd->path = fs->root;
+ path_get(&fs->root);
+ read_unlock(&fs->lock);
}
+
res = link_path_walk(link, nd);
if (nd->depth || res || nd->last_type!=LAST_NORM)
return res;
@@ -859,7 +848,8 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
nd->flags |= LOOKUP_CONTINUE;
err = exec_permission_lite(inode);
if (err == -EAGAIN)
- err = vfs_permission(nd, MAY_EXEC);
+ err = inode_permission(nd->path.dentry->d_inode,
+ MAY_EXEC);
if (err)
break;
@@ -918,9 +908,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
inode = next.dentry->d_inode;
if (!inode)
goto out_dput;
- err = -ENOTDIR;
- if (!inode->i_op)
- goto out_dput;
if (inode->i_op->follow_link) {
err = do_follow_link(&next, nd);
@@ -930,9 +917,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
inode = nd->path.dentry->d_inode;
if (!inode)
break;
- err = -ENOTDIR;
- if (!inode->i_op)
- break;
} else
path_to_nameidata(&next, nd);
err = -ENOTDIR;
@@ -971,7 +955,7 @@ last_component:
break;
inode = next.dentry->d_inode;
if ((lookup_flags & LOOKUP_FOLLOW)
- && inode && inode->i_op && inode->i_op->follow_link) {
+ && inode && inode->i_op->follow_link) {
err = do_follow_link(&next, nd);
if (err)
goto return_err;
@@ -983,7 +967,7 @@ last_component:
break;
if (lookup_flags & LOOKUP_DIRECTORY) {
err = -ENOTDIR;
- if (!inode->i_op || !inode->i_op->lookup)
+ if (!inode->i_op->lookup)
break;
}
goto return_base;
@@ -1479,7 +1463,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
if (error)
return error;
- if (!dir->i_op || !dir->i_op->create)
+ if (!dir->i_op->create)
return -EACCES; /* shouldn't it be ENOSYS? */
mode &= S_IALLUGO;
mode |= S_IFREG;
@@ -1493,9 +1477,9 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode,
return error;
}
-int may_open(struct nameidata *nd, int acc_mode, int flag)
+int may_open(struct path *path, int acc_mode, int flag)
{
- struct dentry *dentry = nd->path.dentry;
+ struct dentry *dentry = path->dentry;
struct inode *inode = dentry->d_inode;
int error;
@@ -1516,13 +1500,13 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
flag &= ~O_TRUNC;
} else if (S_ISBLK(inode->i_mode) || S_ISCHR(inode->i_mode)) {
- if (nd->path.mnt->mnt_flags & MNT_NODEV)
+ if (path->mnt->mnt_flags & MNT_NODEV)
return -EACCES;
flag &= ~O_TRUNC;
}
- error = vfs_permission(nd, acc_mode);
+ error = inode_permission(inode, acc_mode);
if (error)
return error;
/*
@@ -1556,6 +1540,9 @@ int may_open(struct nameidata *nd, int acc_mode, int flag)
* Refuse to truncate files with mandatory locks held on them.
*/
error = locks_verify_locked(inode);
+ if (!error)
+ error = security_path_truncate(path, 0,
+ ATTR_MTIME|ATTR_CTIME|ATTR_OPEN);
if (!error) {
DQUOT_INIT(inode);
@@ -1586,14 +1573,18 @@ static int __open_namei_create(struct nameidata *nd, struct path *path,
if (!IS_POSIXACL(dir->d_inode))
mode &= ~current->fs->umask;
+ error = security_path_mknod(&nd->path, path->dentry, mode, 0);
+ if (error)
+ goto out_unlock;
error = vfs_create(dir->d_inode, path->dentry, mode, nd);
+out_unlock:
mutex_unlock(&dir->d_inode->i_mutex);
dput(nd->path.dentry);
nd->path.dentry = path->dentry;
if (error)
return error;
/* Don't check for write permission, don't truncate */
- return may_open(nd, 0, flag & ~O_TRUNC);
+ return may_open(&nd->path, 0, flag & ~O_TRUNC);
}
/*
@@ -1755,7 +1746,7 @@ do_last:
error = -ENOENT;
if (!path.dentry->d_inode)
goto exit_dput;
- if (path.dentry->d_inode->i_op && path.dentry->d_inode->i_op->follow_link)
+ if (path.dentry->d_inode->i_op->follow_link)
goto do_link;
path_to_nameidata(&path, &nd);
@@ -1779,7 +1770,7 @@ ok:
if (error)
goto exit;
}
- error = may_open(&nd, acc_mode, flag);
+ error = may_open(&nd.path, acc_mode, flag);
if (error) {
if (will_write)
mnt_drop_write(nd.path.mnt);
@@ -1936,7 +1927,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
return -EPERM;
- if (!dir->i_op || !dir->i_op->mknod)
+ if (!dir->i_op->mknod)
return -EPERM;
error = devcgroup_inode_mknod(mode, dev);
@@ -1999,6 +1990,9 @@ asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
error = mnt_want_write(nd.path.mnt);
if (error)
goto out_dput;
+ error = security_path_mknod(&nd.path, dentry, mode, dev);
+ if (error)
+ goto out_drop_write;
switch (mode & S_IFMT) {
case 0: case S_IFREG:
error = vfs_create(nd.path.dentry->d_inode,dentry,mode,&nd);
@@ -2011,6 +2005,7 @@ asmlinkage long sys_mknodat(int dfd, const char __user *filename, int mode,
error = vfs_mknod(nd.path.dentry->d_inode,dentry,mode,0);
break;
}
+out_drop_write:
mnt_drop_write(nd.path.mnt);
out_dput:
dput(dentry);
@@ -2034,7 +2029,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
if (error)
return error;
- if (!dir->i_op || !dir->i_op->mkdir)
+ if (!dir->i_op->mkdir)
return -EPERM;
mode &= (S_IRWXUGO|S_ISVTX);
@@ -2070,7 +2065,11 @@ asmlinkage long sys_mkdirat(int dfd, const char __user *pathname, int mode)
error = mnt_want_write(nd.path.mnt);
if (error)
goto out_dput;
+ error = security_path_mkdir(&nd.path, dentry, mode);
+ if (error)
+ goto out_drop_write;
error = vfs_mkdir(nd.path.dentry->d_inode, dentry, mode);
+out_drop_write:
mnt_drop_write(nd.path.mnt);
out_dput:
dput(dentry);
@@ -2121,7 +2120,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
if (error)
return error;
- if (!dir->i_op || !dir->i_op->rmdir)
+ if (!dir->i_op->rmdir)
return -EPERM;
DQUOT_INIT(dir);
@@ -2180,7 +2179,11 @@ static long do_rmdir(int dfd, const char __user *pathname)
error = mnt_want_write(nd.path.mnt);
if (error)
goto exit3;
+ error = security_path_rmdir(&nd.path, dentry);
+ if (error)
+ goto exit4;
error = vfs_rmdir(nd.path.dentry->d_inode, dentry);
+exit4:
mnt_drop_write(nd.path.mnt);
exit3:
dput(dentry);
@@ -2204,7 +2207,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry)
if (error)
return error;
- if (!dir->i_op || !dir->i_op->unlink)
+ if (!dir->i_op->unlink)
return -EPERM;
DQUOT_INIT(dir);
@@ -2265,7 +2268,11 @@ static long do_unlinkat(int dfd, const char __user *pathname)
error = mnt_want_write(nd.path.mnt);
if (error)
goto exit2;
+ error = security_path_unlink(&nd.path, dentry);
+ if (error)
+ goto exit3;
error = vfs_unlink(nd.path.dentry->d_inode, dentry);
+exit3:
mnt_drop_write(nd.path.mnt);
exit2:
dput(dentry);
@@ -2307,7 +2314,7 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
if (error)
return error;
- if (!dir->i_op || !dir->i_op->symlink)
+ if (!dir->i_op->symlink)
return -EPERM;
error = security_inode_symlink(dir, dentry, oldname);
@@ -2346,7 +2353,11 @@ asmlinkage long sys_symlinkat(const char __user *oldname,
error = mnt_want_write(nd.path.mnt);
if (error)
goto out_dput;
+ error = security_path_symlink(&nd.path, dentry, from);
+ if (error)
+ goto out_drop_write;
error = vfs_symlink(nd.path.dentry->d_inode, dentry, from);
+out_drop_write:
mnt_drop_write(nd.path.mnt);
out_dput:
dput(dentry);
@@ -2384,7 +2395,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
*/
if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
return -EPERM;
- if (!dir->i_op || !dir->i_op->link)
+ if (!dir->i_op->link)
return -EPERM;
if (S_ISDIR(inode->i_mode))
return -EPERM;
@@ -2443,7 +2454,11 @@ asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
error = mnt_want_write(nd.path.mnt);
if (error)
goto out_dput;
+ error = security_path_link(old_path.dentry, &nd.path, new_dentry);
+ if (error)
+ goto out_drop_write;
error = vfs_link(old_path.dentry, nd.path.dentry->d_inode, new_dentry);
+out_drop_write:
mnt_drop_write(nd.path.mnt);
out_dput:
dput(new_dentry);
@@ -2587,7 +2602,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (error)
return error;
- if (!old_dir->i_op || !old_dir->i_op->rename)
+ if (!old_dir->i_op->rename)
return -EPERM;
DQUOT_INIT(old_dir);
@@ -2679,8 +2694,13 @@ asmlinkage long sys_renameat(int olddfd, const char __user *oldname,
error = mnt_want_write(oldnd.path.mnt);
if (error)
goto exit5;
+ error = security_path_rename(&oldnd.path, old_dentry,
+ &newnd.path, new_dentry);
+ if (error)
+ goto exit6;
error = vfs_rename(old_dir->d_inode, old_dentry,
new_dir->d_inode, new_dentry);
+exit6:
mnt_drop_write(oldnd.path.mnt);
exit5:
dput(new_dentry);
@@ -2750,13 +2770,16 @@ int vfs_follow_link(struct nameidata *nd, const char *link)
/* get the link contents into pagecache */
static char *page_getlink(struct dentry * dentry, struct page **ppage)
{
- struct page * page;
+ char *kaddr;
+ struct page *page;
struct address_space *mapping = dentry->d_inode->i_mapping;
page = read_mapping_page(mapping, 0, NULL);
if (IS_ERR(page))
return (char*)page;
*ppage = page;
- return kmap(page);
+ kaddr = kmap(page);
+ nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1);
+ return kaddr;
}
int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
@@ -2788,18 +2811,23 @@ void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie)
}
}
-int __page_symlink(struct inode *inode, const char *symname, int len,
- gfp_t gfp_mask)
+/*
+ * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
+ */
+int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
{
struct address_space *mapping = inode->i_mapping;
struct page *page;
void *fsdata;
int err;
char *kaddr;
+ unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE;
+ if (nofs)
+ flags |= AOP_FLAG_NOFS;
retry:
err = pagecache_write_begin(NULL, mapping, 0, len-1,
- AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
+ flags, &page, &fsdata);
if (err)
goto fail;
@@ -2823,7 +2851,7 @@ fail:
int page_symlink(struct inode *inode, const char *symname, int len)
{
return __page_symlink(inode, symname, len,
- mapping_gfp_mask(inode->i_mapping));
+ !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS));
}
const struct inode_operations page_symlink_inode_operations = {
@@ -2849,7 +2877,6 @@ EXPORT_SYMBOL(path_lookup);
EXPORT_SYMBOL(kern_path);
EXPORT_SYMBOL(vfs_path_lookup);
EXPORT_SYMBOL(inode_permission);
-EXPORT_SYMBOL(vfs_permission);
EXPORT_SYMBOL(file_permission);
EXPORT_SYMBOL(unlock_rename);
EXPORT_SYMBOL(vfs_create);
@@ -2865,3 +2892,10 @@ EXPORT_SYMBOL(vfs_symlink);
EXPORT_SYMBOL(vfs_unlink);
EXPORT_SYMBOL(dentry_unhash);
EXPORT_SYMBOL(generic_readlink);
+
+/* to be mentioned only in INIT_TASK */
+struct fs_struct init_fs = {
+ .count = ATOMIC_INIT(1),
+ .lock = __RW_LOCK_UNLOCKED(init_fs.lock),
+ .umask = 0022,
+};
diff --git a/fs/namespace.c b/fs/namespace.c
index 1c09cab8f7cf..a40685d800a8 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1990,7 +1990,7 @@ static struct mnt_namespace *dup_mnt_ns(struct mnt_namespace *mnt_ns,
if (!new_ns->root) {
up_write(&namespace_sem);
kfree(new_ns);
- return ERR_PTR(-ENOMEM);;
+ return ERR_PTR(-ENOMEM);
}
spin_lock(&vfsmount_lock);
list_add_tail(&new_ns->list, &new_ns->root->mnt_list);
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index c2e9cfd9e5a4..3e634f2a1083 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -16,6 +16,7 @@
#include <linux/mutex.h>
#include <linux/freezer.h>
#include <linux/kthread.h>
+#include <linux/sunrpc/svcauth_gss.h>
#include <net/inet_sock.h>
@@ -182,10 +183,34 @@ void nfs_callback_down(void)
mutex_unlock(&nfs_callback_mutex);
}
+static int check_gss_callback_principal(struct nfs_client *clp,
+ struct svc_rqst *rqstp)
+{
+ struct rpc_clnt *r = clp->cl_rpcclient;
+ char *p = svc_gss_principal(rqstp);
+
+ /*
+ * It might just be a normal user principal, in which case
+ * userspace won't bother to tell us the name at all.
+ */
+ if (p == NULL)
+ return SVC_DENIED;
+
+ /* Expect a GSS_C_NT_HOSTBASED_NAME like "nfs@serverhostname" */
+
+ if (memcmp(p, "nfs@", 4) != 0)
+ return SVC_DENIED;
+ p += 4;
+ if (strcmp(p, r->cl_server) != 0)
+ return SVC_DENIED;
+ return SVC_OK;
+}
+
static int nfs_callback_authenticate(struct svc_rqst *rqstp)
{
struct nfs_client *clp;
RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
+ int ret = SVC_OK;
/* Don't talk to strangers */
clp = nfs_find_client(svc_addr(rqstp), 4);
@@ -194,21 +219,22 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp)
dprintk("%s: %s NFSv4 callback!\n", __func__,
svc_print_addr(rqstp, buf, sizeof(buf)));
- nfs_put_client(clp);
switch (rqstp->rq_authop->flavour) {
case RPC_AUTH_NULL:
if (rqstp->rq_proc != CB_NULL)
- return SVC_DENIED;
+ ret = SVC_DENIED;
break;
case RPC_AUTH_UNIX:
break;
case RPC_AUTH_GSS:
- /* FIXME: RPCSEC_GSS handling? */
+ ret = check_gss_callback_principal(clp, rqstp);
+ break;
default:
- return SVC_DENIED;
+ ret = SVC_DENIED;
}
- return SVC_OK;
+ nfs_put_client(clp);
+ return ret;
}
/*
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 7547600b6174..9b728f3565a1 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -143,7 +143,6 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_
clp->cl_proto = cl_init->proto;
#ifdef CONFIG_NFS_V4
- init_rwsem(&clp->cl_sem);
INIT_LIST_HEAD(&clp->cl_delegations);
spin_lock_init(&clp->cl_lock);
INIT_DELAYED_WORK(&clp->cl_renewd, nfs4_renew_state);
@@ -224,31 +223,54 @@ void nfs_put_client(struct nfs_client *clp)
}
}
-static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1,
- const struct sockaddr_in *sa2)
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+static const struct in6_addr *nfs_map_ipv4_addr(const struct sockaddr *sa, struct in6_addr *addr_mapped)
{
- return sa1->sin_addr.s_addr == sa2->sin_addr.s_addr;
+ switch (sa->sa_family) {
+ default:
+ return NULL;
+ case AF_INET6:
+ return &((const struct sockaddr_in6 *)sa)->sin6_addr;
+ break;
+ case AF_INET:
+ ipv6_addr_set_v4mapped(((const struct sockaddr_in *)sa)->sin_addr.s_addr,
+ addr_mapped);
+ return addr_mapped;
+ }
}
-static int nfs_sockaddr_match_ipaddr6(const struct sockaddr_in6 *sa1,
- const struct sockaddr_in6 *sa2)
+static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
+ const struct sockaddr *sa2)
+{
+ const struct in6_addr *addr1;
+ const struct in6_addr *addr2;
+ struct in6_addr addr1_mapped;
+ struct in6_addr addr2_mapped;
+
+ addr1 = nfs_map_ipv4_addr(sa1, &addr1_mapped);
+ if (likely(addr1 != NULL)) {
+ addr2 = nfs_map_ipv4_addr(sa2, &addr2_mapped);
+ if (likely(addr2 != NULL))
+ return ipv6_addr_equal(addr1, addr2);
+ }
+ return 0;
+}
+#else
+static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1,
+ const struct sockaddr_in *sa2)
{
- return ipv6_addr_equal(&sa1->sin6_addr, &sa2->sin6_addr);
+ return sa1->sin_addr.s_addr == sa2->sin_addr.s_addr;
}
static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
const struct sockaddr *sa2)
{
- switch (sa1->sa_family) {
- case AF_INET:
- return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1,
- (const struct sockaddr_in *)sa2);
- case AF_INET6:
- return nfs_sockaddr_match_ipaddr6((const struct sockaddr_in6 *)sa1,
- (const struct sockaddr_in6 *)sa2);
- }
- BUG();
+ if (unlikely(sa1->sa_family != AF_INET || sa2->sa_family != AF_INET))
+ return 0;
+ return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1,
+ (const struct sockaddr_in *)sa2);
}
+#endif
/*
* Find a client by IP address and protocol version
@@ -270,8 +292,6 @@ struct nfs_client *nfs_find_client(const struct sockaddr *addr, u32 nfsversion)
if (clp->rpc_ops->version != nfsversion)
continue;
- if (addr->sa_family != clap->sa_family)
- continue;
/* Match only the IP address, not the port number */
if (!nfs_sockaddr_match_ipaddr(addr, clap))
continue;
@@ -305,8 +325,6 @@ struct nfs_client *nfs_find_client_next(struct nfs_client *clp)
if (clp->rpc_ops->version != nfsvers)
continue;
- if (sap->sa_family != clap->sa_family)
- continue;
/* Match only the IP address, not the port number */
if (!nfs_sockaddr_match_ipaddr(sap, clap))
continue;
@@ -470,7 +488,7 @@ static void nfs_init_timeout_values(struct rpc_timeout *to, int proto,
static int nfs_create_rpc_client(struct nfs_client *clp,
const struct rpc_timeout *timeparms,
rpc_authflavor_t flavor,
- int flags)
+ int discrtry, int noresvport)
{
struct rpc_clnt *clnt = NULL;
struct rpc_create_args args = {
@@ -482,9 +500,13 @@ static int nfs_create_rpc_client(struct nfs_client *clp,
.program = &nfs_program,
.version = clp->rpc_ops->version,
.authflavor = flavor,
- .flags = flags,
};
+ if (discrtry)
+ args.flags |= RPC_CLNT_CREATE_DISCRTRY;
+ if (noresvport)
+ args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
+
if (!IS_ERR(clp->cl_rpcclient))
return 0;
@@ -522,6 +544,8 @@ static int nfs_start_lockd(struct nfs_server *server)
.protocol = server->flags & NFS_MOUNT_TCP ?
IPPROTO_TCP : IPPROTO_UDP,
.nfs_version = clp->rpc_ops->version,
+ .noresvport = server->flags & NFS_MOUNT_NORESVPORT ?
+ 1 : 0,
};
if (nlm_init.nfs_version > 3)
@@ -623,7 +647,8 @@ static int nfs_init_client(struct nfs_client *clp,
* Create a client RPC handle for doing FSSTAT with UNIX auth only
* - RFC 2623, sec 2.3.2
*/
- error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX, 0);
+ error = nfs_create_rpc_client(clp, timeparms, RPC_AUTH_UNIX,
+ 0, data->flags & NFS_MOUNT_NORESVPORT);
if (error < 0)
goto error;
nfs_mark_client_ready(clp, NFS_CS_READY);
@@ -965,7 +990,8 @@ error:
static int nfs4_init_client(struct nfs_client *clp,
const struct rpc_timeout *timeparms,
const char *ip_addr,
- rpc_authflavor_t authflavour)
+ rpc_authflavor_t authflavour,
+ int flags)
{
int error;
@@ -979,7 +1005,7 @@ static int nfs4_init_client(struct nfs_client *clp,
clp->rpc_ops = &nfs_v4_clientops;
error = nfs_create_rpc_client(clp, timeparms, authflavour,
- RPC_CLNT_CREATE_DISCRTRY);
+ 1, flags & NFS_MOUNT_NORESVPORT);
if (error < 0)
goto error;
memcpy(clp->cl_ipaddr, ip_addr, sizeof(clp->cl_ipaddr));
@@ -1030,7 +1056,8 @@ static int nfs4_set_client(struct nfs_server *server,
error = PTR_ERR(clp);
goto error;
}
- error = nfs4_init_client(clp, timeparms, ip_addr, authflavour);
+ error = nfs4_init_client(clp, timeparms, ip_addr, authflavour,
+ server->flags);
if (error < 0)
goto error_put;
@@ -1059,6 +1086,10 @@ static int nfs4_init_server(struct nfs_server *server,
nfs_init_timeout_values(&timeparms, data->nfs_server.protocol,
data->timeo, data->retrans);
+ /* Initialise the client representation from the mount data */
+ server->flags = data->flags;
+ server->caps |= NFS_CAP_ATOMIC_OPEN;
+
/* Get a client record */
error = nfs4_set_client(server,
data->nfs_server.hostname,
@@ -1071,10 +1102,6 @@ static int nfs4_init_server(struct nfs_server *server,
if (error < 0)
goto error;
- /* Initialise the client representation from the mount data */
- server->flags = data->flags;
- server->caps |= NFS_CAP_ATOMIC_OPEN;
-
if (data->rsize)
server->rsize = nfs_block_size(data->rsize, NULL);
if (data->wsize)
@@ -1177,6 +1204,10 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
parent_server = NFS_SB(data->sb);
parent_client = parent_server->nfs_client;
+ /* Initialise the client representation from the parent server */
+ nfs_server_copy_userdata(server, parent_server);
+ server->caps |= NFS_CAP_ATOMIC_OPEN;
+
/* Get a client representation.
* Note: NFSv4 always uses TCP, */
error = nfs4_set_client(server, data->hostname,
@@ -1189,10 +1220,6 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data,
if (error < 0)
goto error;
- /* Initialise the client representation from the parent server */
- nfs_server_copy_userdata(server, parent_server);
- server->caps |= NFS_CAP_ATOMIC_OPEN;
-
error = nfs_init_server_rpcclient(server, parent_server->client->cl_timeout, data->authflavor);
if (error < 0)
goto error;
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index cc563cfa6940..968225a88015 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -43,6 +43,27 @@ static void nfs_free_delegation(struct nfs_delegation *delegation)
put_rpccred(cred);
}
+void nfs_mark_delegation_referenced(struct nfs_delegation *delegation)
+{
+ set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags);
+}
+
+int nfs_have_delegation(struct inode *inode, fmode_t flags)
+{
+ struct nfs_delegation *delegation;
+ int ret = 0;
+
+ flags &= FMODE_READ|FMODE_WRITE;
+ rcu_read_lock();
+ delegation = rcu_dereference(NFS_I(inode)->delegation);
+ if (delegation != NULL && (delegation->type & flags) == flags) {
+ nfs_mark_delegation_referenced(delegation);
+ ret = 1;
+ }
+ rcu_read_unlock();
+ return ret;
+}
+
static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_state *state)
{
struct inode *inode = state->inode;
@@ -119,7 +140,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, st
delegation->maxsize = res->maxsize;
oldcred = delegation->cred;
delegation->cred = get_rpccred(cred);
- delegation->flags &= ~NFS_DELEGATION_NEED_RECLAIM;
+ clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
NFS_I(inode)->delegation_state = delegation->type;
smp_wmb();
put_rpccred(oldcred);
@@ -134,19 +155,35 @@ static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *
return res;
}
+static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation)
+{
+ struct inode *inode = NULL;
+
+ spin_lock(&delegation->lock);
+ if (delegation->inode != NULL)
+ inode = igrab(delegation->inode);
+ spin_unlock(&delegation->lock);
+ return inode;
+}
+
static struct nfs_delegation *nfs_detach_delegation_locked(struct nfs_inode *nfsi, const nfs4_stateid *stateid)
{
struct nfs_delegation *delegation = rcu_dereference(nfsi->delegation);
if (delegation == NULL)
goto nomatch;
+ spin_lock(&delegation->lock);
if (stateid != NULL && memcmp(delegation->stateid.data, stateid->data,
sizeof(delegation->stateid.data)) != 0)
- goto nomatch;
+ goto nomatch_unlock;
list_del_rcu(&delegation->super_list);
+ delegation->inode = NULL;
nfsi->delegation_state = 0;
rcu_assign_pointer(nfsi->delegation, NULL);
+ spin_unlock(&delegation->lock);
return delegation;
+nomatch_unlock:
+ spin_unlock(&delegation->lock);
nomatch:
return NULL;
}
@@ -172,6 +209,8 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
delegation->change_attr = nfsi->change_attr;
delegation->cred = get_rpccred(cred);
delegation->inode = inode;
+ delegation->flags = 1<<NFS_DELEGATION_REFERENCED;
+ spin_lock_init(&delegation->lock);
spin_lock(&clp->cl_lock);
if (rcu_dereference(nfsi->delegation) != NULL) {
@@ -226,22 +265,47 @@ static void nfs_msync_inode(struct inode *inode)
*/
static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegation *delegation)
{
- struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
struct nfs_inode *nfsi = NFS_I(inode);
nfs_msync_inode(inode);
- down_read(&clp->cl_sem);
/* Guard against new delegated open calls */
down_write(&nfsi->rwsem);
nfs_delegation_claim_opens(inode, &delegation->stateid);
up_write(&nfsi->rwsem);
- up_read(&clp->cl_sem);
nfs_msync_inode(inode);
return nfs_do_return_delegation(inode, delegation, 1);
}
/*
+ * Return all delegations that have been marked for return
+ */
+void nfs_client_return_marked_delegations(struct nfs_client *clp)
+{
+ struct nfs_delegation *delegation;
+ struct inode *inode;
+
+restart:
+ rcu_read_lock();
+ list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
+ if (!test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags))
+ continue;
+ inode = nfs_delegation_grab_inode(delegation);
+ if (inode == NULL)
+ continue;
+ spin_lock(&clp->cl_lock);
+ delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL);
+ spin_unlock(&clp->cl_lock);
+ rcu_read_unlock();
+ if (delegation != NULL)
+ __nfs_inode_return_delegation(inode, delegation);
+ iput(inode);
+ goto restart;
+ }
+ rcu_read_unlock();
+}
+
+/*
* This function returns the delegation without reclaiming opens
* or protecting against delegation reclaims.
* It is therefore really only safe to be called from
@@ -279,83 +343,55 @@ int nfs_inode_return_delegation(struct inode *inode)
return err;
}
+static void nfs_mark_return_delegation(struct nfs_client *clp, struct nfs_delegation *delegation)
+{
+ set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
+ set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
+}
+
/*
* Return all delegations associated to a super block
*/
-void nfs_return_all_delegations(struct super_block *sb)
+void nfs_super_return_all_delegations(struct super_block *sb)
{
struct nfs_client *clp = NFS_SB(sb)->nfs_client;
struct nfs_delegation *delegation;
- struct inode *inode;
if (clp == NULL)
return;
-restart:
rcu_read_lock();
list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
- if (delegation->inode->i_sb != sb)
- continue;
- inode = igrab(delegation->inode);
- if (inode == NULL)
- continue;
- spin_lock(&clp->cl_lock);
- delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL);
- spin_unlock(&clp->cl_lock);
- rcu_read_unlock();
- if (delegation != NULL)
- __nfs_inode_return_delegation(inode, delegation);
- iput(inode);
- goto restart;
+ spin_lock(&delegation->lock);
+ if (delegation->inode != NULL && delegation->inode->i_sb == sb)
+ set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
+ spin_unlock(&delegation->lock);
}
rcu_read_unlock();
+ nfs_client_return_marked_delegations(clp);
}
-static int nfs_do_expire_all_delegations(void *ptr)
+static void nfs_client_mark_return_all_delegations(struct nfs_client *clp)
{
- struct nfs_client *clp = ptr;
struct nfs_delegation *delegation;
- struct inode *inode;
- allow_signal(SIGKILL);
-restart:
- if (test_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state) != 0)
- goto out;
- if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0)
- goto out;
rcu_read_lock();
list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
- inode = igrab(delegation->inode);
- if (inode == NULL)
- continue;
- spin_lock(&clp->cl_lock);
- delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL);
- spin_unlock(&clp->cl_lock);
- rcu_read_unlock();
- if (delegation)
- __nfs_inode_return_delegation(inode, delegation);
- iput(inode);
- goto restart;
+ set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
+ set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
}
rcu_read_unlock();
-out:
- nfs_put_client(clp);
- module_put_and_exit(0);
+}
+
+static void nfs_delegation_run_state_manager(struct nfs_client *clp)
+{
+ if (test_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state))
+ nfs4_schedule_state_manager(clp);
}
void nfs_expire_all_delegations(struct nfs_client *clp)
{
- struct task_struct *task;
-
- __module_get(THIS_MODULE);
- atomic_inc(&clp->cl_count);
- task = kthread_run(nfs_do_expire_all_delegations, clp,
- "%s-delegreturn",
- rpc_peeraddr2str(clp->cl_rpcclient,
- RPC_DISPLAY_ADDR));
- if (!IS_ERR(task))
- return;
- nfs_put_client(clp);
- module_put(THIS_MODULE);
+ nfs_client_mark_return_all_delegations(clp);
+ nfs_delegation_run_state_manager(clp);
}
/*
@@ -363,68 +399,29 @@ void nfs_expire_all_delegations(struct nfs_client *clp)
*/
void nfs_handle_cb_pathdown(struct nfs_client *clp)
{
- struct nfs_delegation *delegation;
- struct inode *inode;
-
if (clp == NULL)
return;
-restart:
+ nfs_client_mark_return_all_delegations(clp);
+}
+
+static void nfs_client_mark_return_unreferenced_delegations(struct nfs_client *clp)
+{
+ struct nfs_delegation *delegation;
+
rcu_read_lock();
list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
- inode = igrab(delegation->inode);
- if (inode == NULL)
+ if (test_and_clear_bit(NFS_DELEGATION_REFERENCED, &delegation->flags))
continue;
- spin_lock(&clp->cl_lock);
- delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL);
- spin_unlock(&clp->cl_lock);
- rcu_read_unlock();
- if (delegation != NULL)
- __nfs_inode_return_delegation(inode, delegation);
- iput(inode);
- goto restart;
+ set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
+ set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
}
rcu_read_unlock();
}
-struct recall_threadargs {
- struct inode *inode;
- struct nfs_client *clp;
- const nfs4_stateid *stateid;
-
- struct completion started;
- int result;
-};
-
-static int recall_thread(void *data)
+void nfs_expire_unreferenced_delegations(struct nfs_client *clp)
{
- struct recall_threadargs *args = (struct recall_threadargs *)data;
- struct inode *inode = igrab(args->inode);
- struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
- struct nfs_inode *nfsi = NFS_I(inode);
- struct nfs_delegation *delegation;
-
- daemonize("nfsv4-delegreturn");
-
- nfs_msync_inode(inode);
- down_read(&clp->cl_sem);
- down_write(&nfsi->rwsem);
- spin_lock(&clp->cl_lock);
- delegation = nfs_detach_delegation_locked(nfsi, args->stateid);
- if (delegation != NULL)
- args->result = 0;
- else
- args->result = -ENOENT;
- spin_unlock(&clp->cl_lock);
- complete(&args->started);
- nfs_delegation_claim_opens(inode, args->stateid);
- up_write(&nfsi->rwsem);
- up_read(&clp->cl_sem);
- nfs_msync_inode(inode);
-
- if (delegation != NULL)
- nfs_do_return_delegation(inode, delegation, 1);
- iput(inode);
- module_put_and_exit(0);
+ nfs_client_mark_return_unreferenced_delegations(clp);
+ nfs_delegation_run_state_manager(clp);
}
/*
@@ -432,22 +429,20 @@ static int recall_thread(void *data)
*/
int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid)
{
- struct recall_threadargs data = {
- .inode = inode,
- .stateid = stateid,
- };
- int status;
+ struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+ struct nfs_delegation *delegation;
- init_completion(&data.started);
- __module_get(THIS_MODULE);
- status = kernel_thread(recall_thread, &data, CLONE_KERNEL);
- if (status < 0)
- goto out_module_put;
- wait_for_completion(&data.started);
- return data.result;
-out_module_put:
- module_put(THIS_MODULE);
- return status;
+ rcu_read_lock();
+ delegation = rcu_dereference(NFS_I(inode)->delegation);
+ if (delegation == NULL || memcmp(delegation->stateid.data, stateid->data,
+ sizeof(delegation->stateid.data)) != 0) {
+ rcu_read_unlock();
+ return -ENOENT;
+ }
+ nfs_mark_return_delegation(clp, delegation);
+ rcu_read_unlock();
+ nfs_delegation_run_state_manager(clp);
+ return 0;
}
/*
@@ -459,10 +454,14 @@ struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs
struct inode *res = NULL;
rcu_read_lock();
list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
- if (nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) {
+ spin_lock(&delegation->lock);
+ if (delegation->inode != NULL &&
+ nfs_compare_fh(fhandle, &NFS_I(delegation->inode)->fh) == 0) {
res = igrab(delegation->inode);
- break;
}
+ spin_unlock(&delegation->lock);
+ if (res != NULL)
+ break;
}
rcu_read_unlock();
return res;
@@ -476,7 +475,7 @@ void nfs_delegation_mark_reclaim(struct nfs_client *clp)
struct nfs_delegation *delegation;
rcu_read_lock();
list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list)
- delegation->flags |= NFS_DELEGATION_NEED_RECLAIM;
+ set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
rcu_read_unlock();
}
@@ -486,17 +485,22 @@ void nfs_delegation_mark_reclaim(struct nfs_client *clp)
void nfs_delegation_reap_unclaimed(struct nfs_client *clp)
{
struct nfs_delegation *delegation;
+ struct inode *inode;
restart:
rcu_read_lock();
list_for_each_entry_rcu(delegation, &clp->cl_delegations, super_list) {
- if ((delegation->flags & NFS_DELEGATION_NEED_RECLAIM) == 0)
+ if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) == 0)
+ continue;
+ inode = nfs_delegation_grab_inode(delegation);
+ if (inode == NULL)
continue;
spin_lock(&clp->cl_lock);
- delegation = nfs_detach_delegation_locked(NFS_I(delegation->inode), NULL);
+ delegation = nfs_detach_delegation_locked(NFS_I(inode), NULL);
spin_unlock(&clp->cl_lock);
rcu_read_unlock();
if (delegation != NULL)
nfs_free_delegation(delegation);
+ iput(inode);
goto restart;
}
rcu_read_unlock();
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index f1c5e2a5d88e..09f383795174 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -17,14 +17,20 @@ struct nfs_delegation {
struct rpc_cred *cred;
struct inode *inode;
nfs4_stateid stateid;
- int type;
-#define NFS_DELEGATION_NEED_RECLAIM 1
- long flags;
+ fmode_t type;
loff_t maxsize;
__u64 change_attr;
+ unsigned long flags;
+ spinlock_t lock;
struct rcu_head rcu;
};
+enum {
+ NFS_DELEGATION_NEED_RECLAIM = 0,
+ NFS_DELEGATION_RETURN,
+ NFS_DELEGATION_REFERENCED,
+};
+
int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
int nfs_inode_return_delegation(struct inode *inode);
@@ -32,9 +38,11 @@ int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *s
void nfs_inode_return_delegation_noreclaim(struct inode *inode);
struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle);
-void nfs_return_all_delegations(struct super_block *sb);
+void nfs_super_return_all_delegations(struct super_block *sb);
void nfs_expire_all_delegations(struct nfs_client *clp);
+void nfs_expire_unreferenced_delegations(struct nfs_client *clp);
void nfs_handle_cb_pathdown(struct nfs_client *clp);
+void nfs_client_return_marked_delegations(struct nfs_client *clp);
void nfs_delegation_mark_reclaim(struct nfs_client *clp);
void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
@@ -45,22 +53,11 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state
int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl);
int nfs4_copy_delegation_stateid(nfs4_stateid *dst, struct inode *inode);
-static inline int nfs_have_delegation(struct inode *inode, int flags)
-{
- struct nfs_delegation *delegation;
- int ret = 0;
-
- flags &= FMODE_READ|FMODE_WRITE;
- rcu_read_lock();
- delegation = rcu_dereference(NFS_I(inode)->delegation);
- if (delegation != NULL && (delegation->type & flags) == flags)
- ret = 1;
- rcu_read_unlock();
- return ret;
-}
+void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
+int nfs_have_delegation(struct inode *inode, fmode_t flags);
#else
-static inline int nfs_have_delegation(struct inode *inode, int flags)
+static inline int nfs_have_delegation(struct inode *inode, fmode_t flags)
{
return 0;
}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 3e64b98f3a93..e35c8199f82f 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -799,6 +799,9 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
goto out_bad;
}
+ if (nfs_have_delegation(inode, FMODE_READ))
+ goto out_set_verifier;
+
/* Force a full look up iff the parent directory has changed */
if (!nfs_is_exclusive_create(dir, nd) && nfs_check_verifier(dir, dentry)) {
if (nfs_lookup_verify_inode(inode, nd))
@@ -817,6 +820,7 @@ static int nfs_lookup_revalidate(struct dentry * dentry, struct nameidata *nd)
if ((error = nfs_refresh_inode(inode, &fattr)) != 0)
goto out_bad;
+out_set_verifier:
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
out_valid:
dput(parent);
@@ -973,7 +977,7 @@ struct dentry_operations nfs4_dentry_operations = {
* Use intent information to determine whether we need to substitute
* the NFSv4-style stateful OPEN for the LOOKUP call
*/
-static int is_atomic_open(struct inode *dir, struct nameidata *nd)
+static int is_atomic_open(struct nameidata *nd)
{
if (nd == NULL || nfs_lookup_check_intent(nd, LOOKUP_OPEN) == 0)
return 0;
@@ -996,7 +1000,7 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry
dir->i_sb->s_id, dir->i_ino, dentry->d_name.name);
/* Check that we are indeed trying to open this file */
- if (!is_atomic_open(dir, nd))
+ if (!is_atomic_open(nd))
goto no_open;
if (dentry->d_name.len > NFS_SERVER(dir)->namelen) {
@@ -1047,10 +1051,10 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
struct inode *dir;
int openflags, ret = 0;
+ if (!is_atomic_open(nd))
+ goto no_open;
parent = dget_parent(dentry);
dir = parent->d_inode;
- if (!is_atomic_open(dir, nd))
- goto no_open;
/* We can't create new files in nfs_open_revalidate(), so we
* optimize away revalidation of negative dentries.
*/
@@ -1062,11 +1066,11 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd)
/* NFS only supports OPEN on regular files */
if (!S_ISREG(inode->i_mode))
- goto no_open;
+ goto no_open_dput;
openflags = nd->intent.open.flags;
/* We cannot do exclusive creation on a positive dentry */
if ((openflags & (O_CREAT|O_EXCL)) == (O_CREAT|O_EXCL))
- goto no_open;
+ goto no_open_dput;
/* We can't create new files, or truncate existing ones here */
openflags &= ~(O_CREAT|O_TRUNC);
@@ -1081,10 +1085,9 @@ out:
if (!ret)
d_drop(dentry);
return ret;
-no_open:
+no_open_dput:
dput(parent);
- if (inode != NULL && nfs_have_delegation(inode, FMODE_READ))
- return 1;
+no_open:
return nfs_lookup_revalidate(dentry, nd);
}
#endif /* CONFIG_NFSV4 */
@@ -1794,7 +1797,8 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str
cache = nfs_access_search_rbtree(inode, cred);
if (cache == NULL)
goto out;
- if (!time_in_range(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
+ if (!nfs_have_delegation(inode, FMODE_READ) &&
+ !time_in_range_open(jiffies, cache->jiffies, cache->jiffies + nfsi->attrtimeo))
goto out_stale;
res->jiffies = cache->jiffies;
res->cred = cache->cred;
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index d319b49f8f06..90f292b520d2 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -354,7 +354,7 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
file->f_path.dentry->d_name.name,
mapping->host->i_ino, len, (long long) pos);
- page = __grab_cache_page(mapping, index);
+ page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
*pagep = page;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index d22eb383e1cf..0c381686171e 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -592,7 +592,7 @@ static void nfs_file_set_open_context(struct file *filp, struct nfs_open_context
/*
* Given an inode, search for an open context with the desired characteristics
*/
-struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, int mode)
+struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, fmode_t mode)
{
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_open_context *pos, *ctx = NULL;
@@ -712,14 +712,7 @@ int nfs_attribute_timeout(struct inode *inode)
if (nfs_have_delegation(inode, FMODE_READ))
return 0;
- /*
- * Special case: if the attribute timeout is set to 0, then always
- * treat the cache as having expired (unless holding
- * a delegation).
- */
- if (nfsi->attrtimeo == 0)
- return 1;
- return !time_in_range(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
+ return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo);
}
/**
@@ -1182,7 +1175,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
nfsi->attrtimeo_timestamp = now;
nfsi->attr_gencount = nfs_inc_attr_generation_counter();
} else {
- if (!time_in_range(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
+ if (!time_in_range_open(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) {
if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode))
nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode);
nfsi->attrtimeo_timestamp = now;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index d212ee41caf2..340ede8f608f 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -63,6 +63,20 @@ struct nfs_parsed_mount_data {
struct security_mnt_opts lsm_opts;
};
+/* mount_clnt.c */
+struct nfs_mount_request {
+ struct sockaddr *sap;
+ size_t salen;
+ char *hostname;
+ char *dirpath;
+ u32 version;
+ unsigned short protocol;
+ struct nfs_fh *fh;
+ int noresvport;
+};
+
+extern int nfs_mount(struct nfs_mount_request *info);
+
/* client.c */
extern struct rpc_program nfs_program;
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 086a6830d785..ca905a5bb1ba 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -29,47 +29,43 @@ struct mnt_fhstatus {
/**
* nfs_mount - Obtain an NFS file handle for the given host and path
- * @addr: pointer to server's address
- * @len: size of server's address
- * @hostname: name of server host, or NULL
- * @path: pointer to string containing export path to mount
- * @version: mount version to use for this request
- * @protocol: transport protocol to use for thie request
- * @fh: pointer to location to place returned file handle
+ * @info: pointer to mount request arguments
*
* Uses default timeout parameters specified by underlying transport.
*/
-int nfs_mount(struct sockaddr *addr, size_t len, char *hostname, char *path,
- int version, int protocol, struct nfs_fh *fh)
+int nfs_mount(struct nfs_mount_request *info)
{
struct mnt_fhstatus result = {
- .fh = fh
+ .fh = info->fh
};
struct rpc_message msg = {
- .rpc_argp = path,
+ .rpc_argp = info->dirpath,
.rpc_resp = &result,
};
struct rpc_create_args args = {
- .protocol = protocol,
- .address = addr,
- .addrsize = len,
- .servername = hostname,
+ .protocol = info->protocol,
+ .address = info->sap,
+ .addrsize = info->salen,
+ .servername = info->hostname,
.program = &mnt_program,
- .version = version,
+ .version = info->version,
.authflavor = RPC_AUTH_UNIX,
- .flags = 0,
};
struct rpc_clnt *mnt_clnt;
int status;
dprintk("NFS: sending MNT request for %s:%s\n",
- (hostname ? hostname : "server"), path);
+ (info->hostname ? info->hostname : "server"),
+ info->dirpath);
+
+ if (info->noresvport)
+ args.flags |= RPC_CLNT_CREATE_NONPRIVPORT;
mnt_clnt = rpc_create(&args);
if (IS_ERR(mnt_clnt))
goto out_clnt_err;
- if (version == NFS_MNT3_VERSION)
+ if (info->version == NFS_MNT3_VERSION)
msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC3_MNT];
else
msg.rpc_proc = &mnt_clnt->cl_procinfo[MNTPROC_MNT];
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index ea790645fda6..4e4d33204376 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -38,8 +38,12 @@ struct idmap;
((err) != NFSERR_NOFILEHANDLE))
enum nfs4_client_state {
- NFS4CLNT_STATE_RECOVER = 0,
+ NFS4CLNT_MANAGER_RUNNING = 0,
+ NFS4CLNT_CHECK_LEASE,
NFS4CLNT_LEASE_EXPIRED,
+ NFS4CLNT_RECLAIM_REBOOT,
+ NFS4CLNT_RECLAIM_NOGRACE,
+ NFS4CLNT_DELEGRETURN,
};
/*
@@ -90,12 +94,18 @@ struct nfs4_state_owner {
spinlock_t so_lock;
atomic_t so_count;
+ unsigned long so_flags;
struct list_head so_states;
struct list_head so_delegations;
struct nfs_seqid_counter so_seqid;
struct rpc_sequence so_sequence;
};
+enum {
+ NFS_OWNER_RECLAIM_REBOOT,
+ NFS_OWNER_RECLAIM_NOGRACE
+};
+
/*
* struct nfs4_state maintains the client-side state for a given
* (state_owner,inode) tuple (OPEN) or state_owner (LOCK).
@@ -128,6 +138,8 @@ enum {
NFS_O_RDONLY_STATE, /* OPEN stateid has read-only state */
NFS_O_WRONLY_STATE, /* OPEN stateid has write-only state */
NFS_O_RDWR_STATE, /* OPEN stateid has read/write state */
+ NFS_STATE_RECLAIM_REBOOT, /* OPEN stateid server rebooted */
+ NFS_STATE_RECLAIM_NOGRACE, /* OPEN stateid needs to recover state */
};
struct nfs4_state {
@@ -149,7 +161,7 @@ struct nfs4_state {
unsigned int n_rdonly; /* Number of read-only references */
unsigned int n_wronly; /* Number of write-only references */
unsigned int n_rdwr; /* Number of read/write references */
- int state; /* State on the server (R,W, or RW) */
+ fmode_t state; /* State on the server (R,W, or RW) */
atomic_t count;
};
@@ -157,9 +169,12 @@ struct nfs4_state {
struct nfs4_exception {
long timeout;
int retry;
+ struct nfs4_state *state;
};
struct nfs4_state_recovery_ops {
+ int owner_flag_bit;
+ int state_flag_bit;
int (*recover_open)(struct nfs4_state_owner *, struct nfs4_state *);
int (*recover_lock)(struct nfs4_state *, struct file_lock *);
};
@@ -174,7 +189,6 @@ extern ssize_t nfs4_listxattr(struct dentry *, char *, size_t);
/* nfs4proc.c */
-extern int nfs4_map_errors(int err);
extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *);
extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct rpc_cred *);
extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *);
@@ -187,7 +201,7 @@ extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
struct nfs4_fs_locations *fs_locations, struct page *page);
extern struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops;
-extern struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops;
+extern struct nfs4_state_recovery_ops nfs4_nograce_recovery_ops;
extern const u32 nfs4_fattr_bitmap[2];
extern const u32 nfs4_statfs_bitmap[2];
@@ -202,16 +216,18 @@ extern void nfs4_kill_renewd(struct nfs_client *);
extern void nfs4_renew_state(struct work_struct *);
/* nfs4state.c */
-struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp);
+struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp);
extern struct nfs4_state_owner * nfs4_get_state_owner(struct nfs_server *, struct rpc_cred *);
extern void nfs4_put_state_owner(struct nfs4_state_owner *);
extern struct nfs4_state * nfs4_get_open_state(struct inode *, struct nfs4_state_owner *);
extern void nfs4_put_open_state(struct nfs4_state *);
-extern void nfs4_close_state(struct path *, struct nfs4_state *, mode_t);
-extern void nfs4_close_sync(struct path *, struct nfs4_state *, mode_t);
-extern void nfs4_state_set_mode_locked(struct nfs4_state *, mode_t);
+extern void nfs4_close_state(struct path *, struct nfs4_state *, fmode_t);
+extern void nfs4_close_sync(struct path *, struct nfs4_state *, fmode_t);
+extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
extern void nfs4_schedule_state_recovery(struct nfs_client *);
+extern void nfs4_schedule_state_manager(struct nfs_client *);
+extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state);
extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl);
extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 83e700a2b0c0..8dde84b988d9 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -62,14 +62,12 @@
struct nfs4_opendata;
static int _nfs4_proc_open(struct nfs4_opendata *data);
static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *);
-static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *);
-static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception);
-static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs_client *clp);
+static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *);
static int _nfs4_proc_lookup(struct inode *dir, const struct qstr *name, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr);
/* Prevent leaks of NFSv4 errors into userland */
-int nfs4_map_errors(int err)
+static int nfs4_map_errors(int err)
{
if (err < -1000) {
dprintk("%s could not handle NFSv4 error %d\n",
@@ -195,6 +193,83 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
kunmap_atomic(start, KM_USER0);
}
+static int nfs4_wait_bit_killable(void *word)
+{
+ if (fatal_signal_pending(current))
+ return -ERESTARTSYS;
+ schedule();
+ return 0;
+}
+
+static int nfs4_wait_clnt_recover(struct nfs_client *clp)
+{
+ int res;
+
+ might_sleep();
+
+ res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING,
+ nfs4_wait_bit_killable, TASK_KILLABLE);
+ return res;
+}
+
+static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
+{
+ int res = 0;
+
+ might_sleep();
+
+ if (*timeout <= 0)
+ *timeout = NFS4_POLL_RETRY_MIN;
+ if (*timeout > NFS4_POLL_RETRY_MAX)
+ *timeout = NFS4_POLL_RETRY_MAX;
+ schedule_timeout_killable(*timeout);
+ if (fatal_signal_pending(current))
+ res = -ERESTARTSYS;
+ *timeout <<= 1;
+ return res;
+}
+
+/* This is the error handling routine for processes that are allowed
+ * to sleep.
+ */
+static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
+{
+ struct nfs_client *clp = server->nfs_client;
+ struct nfs4_state *state = exception->state;
+ int ret = errorcode;
+
+ exception->retry = 0;
+ switch(errorcode) {
+ case 0:
+ return 0;
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_BAD_STATEID:
+ case -NFS4ERR_OPENMODE:
+ if (state == NULL)
+ break;
+ nfs4_state_mark_reclaim_nograce(clp, state);
+ case -NFS4ERR_STALE_CLIENTID:
+ case -NFS4ERR_STALE_STATEID:
+ case -NFS4ERR_EXPIRED:
+ nfs4_schedule_state_recovery(clp);
+ ret = nfs4_wait_clnt_recover(clp);
+ if (ret == 0)
+ exception->retry = 1;
+ break;
+ case -NFS4ERR_FILE_OPEN:
+ case -NFS4ERR_GRACE:
+ case -NFS4ERR_DELAY:
+ ret = nfs4_delay(server->client, &exception->timeout);
+ if (ret != 0)
+ break;
+ case -NFS4ERR_OLD_STATEID:
+ exception->retry = 1;
+ }
+ /* We failed to handle the error */
+ return nfs4_map_errors(ret);
+}
+
+
static void renew_lease(const struct nfs_server *server, unsigned long timestamp)
{
struct nfs_client *clp = server->nfs_client;
@@ -248,7 +323,7 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p)
}
static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
- struct nfs4_state_owner *sp, int flags,
+ struct nfs4_state_owner *sp, fmode_t fmode, int flags,
const struct iattr *attrs)
{
struct dentry *parent = dget_parent(path->dentry);
@@ -268,7 +343,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct path *path,
p->owner = sp;
atomic_inc(&sp->so_count);
p->o_arg.fh = NFS_FH(dir);
- p->o_arg.open_flags = flags,
+ p->o_arg.open_flags = flags;
+ p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE);
p->o_arg.clientid = server->nfs_client->cl_clientid;
p->o_arg.id = sp->so_owner_id.id;
p->o_arg.name = &p->path.dentry->d_name;
@@ -324,10 +400,13 @@ static int nfs4_wait_for_completion_rpc_task(struct rpc_task *task)
return ret;
}
-static int can_open_cached(struct nfs4_state *state, int mode)
+static int can_open_cached(struct nfs4_state *state, fmode_t mode, int open_mode)
{
int ret = 0;
- switch (mode & (FMODE_READ|FMODE_WRITE|O_EXCL)) {
+
+ if (open_mode & O_EXCL)
+ goto out;
+ switch (mode & (FMODE_READ|FMODE_WRITE)) {
case FMODE_READ:
ret |= test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0;
break;
@@ -337,21 +416,23 @@ static int can_open_cached(struct nfs4_state *state, int mode)
case FMODE_READ|FMODE_WRITE:
ret |= test_bit(NFS_O_RDWR_STATE, &state->flags) != 0;
}
+out:
return ret;
}
-static int can_open_delegated(struct nfs_delegation *delegation, mode_t open_flags)
+static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode)
{
- if ((delegation->type & open_flags) != open_flags)
+ if ((delegation->type & fmode) != fmode)
return 0;
- if (delegation->flags & NFS_DELEGATION_NEED_RECLAIM)
+ if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags))
return 0;
+ nfs_mark_delegation_referenced(delegation);
return 1;
}
-static void update_open_stateflags(struct nfs4_state *state, mode_t open_flags)
+static void update_open_stateflags(struct nfs4_state *state, fmode_t fmode)
{
- switch (open_flags) {
+ switch (fmode) {
case FMODE_WRITE:
state->n_wronly++;
break;
@@ -361,15 +442,15 @@ static void update_open_stateflags(struct nfs4_state *state, mode_t open_flags)
case FMODE_READ|FMODE_WRITE:
state->n_rdwr++;
}
- nfs4_state_set_mode_locked(state, state->state | open_flags);
+ nfs4_state_set_mode_locked(state, state->state | fmode);
}
-static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, int open_flags)
+static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
{
if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
memcpy(state->stateid.data, stateid->data, sizeof(state->stateid.data));
memcpy(state->open_stateid.data, stateid->data, sizeof(state->open_stateid.data));
- switch (open_flags) {
+ switch (fmode) {
case FMODE_READ:
set_bit(NFS_O_RDONLY_STATE, &state->flags);
break;
@@ -381,16 +462,15 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *
}
}
-static void nfs_set_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, int open_flags)
+static void nfs_set_open_stateid(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
{
write_seqlock(&state->seqlock);
- nfs_set_open_stateid_locked(state, stateid, open_flags);
+ nfs_set_open_stateid_locked(state, stateid, fmode);
write_sequnlock(&state->seqlock);
}
-static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, nfs4_stateid *deleg_stateid, int open_flags)
+static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode)
{
- open_flags &= (FMODE_READ|FMODE_WRITE);
/*
* Protect the call to nfs4_state_set_mode_locked and
* serialise the stateid update
@@ -401,20 +481,60 @@ static void update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_sta
set_bit(NFS_DELEGATED_STATE, &state->flags);
}
if (open_stateid != NULL)
- nfs_set_open_stateid_locked(state, open_stateid, open_flags);
+ nfs_set_open_stateid_locked(state, open_stateid, fmode);
write_sequnlock(&state->seqlock);
spin_lock(&state->owner->so_lock);
- update_open_stateflags(state, open_flags);
+ update_open_stateflags(state, fmode);
spin_unlock(&state->owner->so_lock);
}
-static void nfs4_return_incompatible_delegation(struct inode *inode, mode_t open_flags)
+static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, nfs4_stateid *delegation, fmode_t fmode)
+{
+ struct nfs_inode *nfsi = NFS_I(state->inode);
+ struct nfs_delegation *deleg_cur;
+ int ret = 0;
+
+ fmode &= (FMODE_READ|FMODE_WRITE);
+
+ rcu_read_lock();
+ deleg_cur = rcu_dereference(nfsi->delegation);
+ if (deleg_cur == NULL)
+ goto no_delegation;
+
+ spin_lock(&deleg_cur->lock);
+ if (nfsi->delegation != deleg_cur ||
+ (deleg_cur->type & fmode) != fmode)
+ goto no_delegation_unlock;
+
+ if (delegation == NULL)
+ delegation = &deleg_cur->stateid;
+ else if (memcmp(deleg_cur->stateid.data, delegation->data, NFS4_STATEID_SIZE) != 0)
+ goto no_delegation_unlock;
+
+ nfs_mark_delegation_referenced(deleg_cur);
+ __update_open_stateid(state, open_stateid, &deleg_cur->stateid, fmode);
+ ret = 1;
+no_delegation_unlock:
+ spin_unlock(&deleg_cur->lock);
+no_delegation:
+ rcu_read_unlock();
+
+ if (!ret && open_stateid != NULL) {
+ __update_open_stateid(state, open_stateid, NULL, fmode);
+ ret = 1;
+ }
+
+ return ret;
+}
+
+
+static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmode)
{
struct nfs_delegation *delegation;
rcu_read_lock();
delegation = rcu_dereference(NFS_I(inode)->delegation);
- if (delegation == NULL || (delegation->type & open_flags) == open_flags) {
+ if (delegation == NULL || (delegation->type & fmode) == fmode) {
rcu_read_unlock();
return;
}
@@ -427,27 +547,28 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
struct nfs4_state *state = opendata->state;
struct nfs_inode *nfsi = NFS_I(state->inode);
struct nfs_delegation *delegation;
- int open_mode = opendata->o_arg.open_flags & (FMODE_READ|FMODE_WRITE|O_EXCL);
+ int open_mode = opendata->o_arg.open_flags & O_EXCL;
+ fmode_t fmode = opendata->o_arg.fmode;
nfs4_stateid stateid;
int ret = -EAGAIN;
- rcu_read_lock();
- delegation = rcu_dereference(nfsi->delegation);
for (;;) {
- if (can_open_cached(state, open_mode)) {
+ if (can_open_cached(state, fmode, open_mode)) {
spin_lock(&state->owner->so_lock);
- if (can_open_cached(state, open_mode)) {
- update_open_stateflags(state, open_mode);
+ if (can_open_cached(state, fmode, open_mode)) {
+ update_open_stateflags(state, fmode);
spin_unlock(&state->owner->so_lock);
- rcu_read_unlock();
goto out_return_state;
}
spin_unlock(&state->owner->so_lock);
}
- if (delegation == NULL)
- break;
- if (!can_open_delegated(delegation, open_mode))
+ rcu_read_lock();
+ delegation = rcu_dereference(nfsi->delegation);
+ if (delegation == NULL ||
+ !can_open_delegated(delegation, fmode)) {
+ rcu_read_unlock();
break;
+ }
/* Save the delegation */
memcpy(stateid.data, delegation->stateid.data, sizeof(stateid.data));
rcu_read_unlock();
@@ -455,19 +576,11 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata)
if (ret != 0)
goto out;
ret = -EAGAIN;
- rcu_read_lock();
- delegation = rcu_dereference(nfsi->delegation);
- /* If no delegation, try a cached open */
- if (delegation == NULL)
- continue;
- /* Is the delegation still valid? */
- if (memcmp(stateid.data, delegation->stateid.data, sizeof(stateid.data)) != 0)
- continue;
- rcu_read_unlock();
- update_open_stateid(state, NULL, &stateid, open_mode);
- goto out_return_state;
+
+ /* Try to update the stateid using the delegation */
+ if (update_open_stateid(state, NULL, &stateid, fmode))
+ goto out_return_state;
}
- rcu_read_unlock();
out:
return ERR_PTR(ret);
out_return_state:
@@ -480,7 +593,6 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data
struct inode *inode;
struct nfs4_state *state = NULL;
struct nfs_delegation *delegation;
- nfs4_stateid *deleg_stateid = NULL;
int ret;
if (!data->rpc_done) {
@@ -507,7 +619,7 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data
if (delegation)
delegation_flags = delegation->flags;
rcu_read_unlock();
- if (!(delegation_flags & NFS_DELEGATION_NEED_RECLAIM))
+ if ((delegation_flags & 1UL<<NFS_DELEGATION_NEED_RECLAIM) == 0)
nfs_inode_set_delegation(state->inode,
data->owner->so_cred,
&data->o_res);
@@ -516,12 +628,9 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data
data->owner->so_cred,
&data->o_res);
}
- rcu_read_lock();
- delegation = rcu_dereference(NFS_I(inode)->delegation);
- if (delegation != NULL)
- deleg_stateid = &delegation->stateid;
- update_open_stateid(state, &data->o_res.stateid, deleg_stateid, data->o_arg.open_flags);
- rcu_read_unlock();
+
+ update_open_stateid(state, &data->o_res.stateid, NULL,
+ data->o_arg.fmode);
iput(inode);
out:
return state;
@@ -552,7 +661,7 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context
{
struct nfs4_opendata *opendata;
- opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, NULL);
+ opendata = nfs4_opendata_alloc(&ctx->path, state->owner, 0, 0, NULL);
if (opendata == NULL)
return ERR_PTR(-ENOMEM);
opendata->state = state;
@@ -560,12 +669,13 @@ static struct nfs4_opendata *nfs4_open_recoverdata_alloc(struct nfs_open_context
return opendata;
}
-static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, mode_t openflags, struct nfs4_state **res)
+static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmode, struct nfs4_state **res)
{
struct nfs4_state *newstate;
int ret;
- opendata->o_arg.open_flags = openflags;
+ opendata->o_arg.open_flags = 0;
+ opendata->o_arg.fmode = fmode;
memset(&opendata->o_res, 0, sizeof(opendata->o_res));
memset(&opendata->c_res, 0, sizeof(opendata->c_res));
nfs4_init_opendata_res(opendata);
@@ -575,7 +685,7 @@ static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, mode_t openf
newstate = nfs4_opendata_to_nfs4_state(opendata);
if (IS_ERR(newstate))
return PTR_ERR(newstate);
- nfs4_close_state(&opendata->path, newstate, openflags);
+ nfs4_close_state(&opendata->path, newstate, fmode);
*res = newstate;
return 0;
}
@@ -631,7 +741,7 @@ static int _nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state
{
struct nfs_delegation *delegation;
struct nfs4_opendata *opendata;
- int delegation_type = 0;
+ fmode_t delegation_type = 0;
int status;
opendata = nfs4_open_recoverdata_alloc(ctx, state);
@@ -641,7 +751,7 @@ static int _nfs4_do_open_reclaim(struct nfs_open_context *ctx, struct nfs4_state
opendata->o_arg.fh = NFS_FH(state->inode);
rcu_read_lock();
delegation = rcu_dereference(NFS_I(state->inode)->delegation);
- if (delegation != NULL && (delegation->flags & NFS_DELEGATION_NEED_RECLAIM) != 0)
+ if (delegation != NULL && test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) != 0)
delegation_type = delegation->type;
rcu_read_unlock();
opendata->o_arg.u.delegation_type = delegation_type;
@@ -744,7 +854,7 @@ static void nfs4_open_confirm_release(void *calldata)
goto out_free;
state = nfs4_opendata_to_nfs4_state(data);
if (!IS_ERR(state))
- nfs4_close_state(&data->path, state, data->o_arg.open_flags);
+ nfs4_close_state(&data->path, state, data->o_arg.fmode);
out_free:
nfs4_opendata_put(data);
}
@@ -808,12 +918,12 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
if (data->state != NULL) {
struct nfs_delegation *delegation;
- if (can_open_cached(data->state, data->o_arg.open_flags & (FMODE_READ|FMODE_WRITE|O_EXCL)))
+ if (can_open_cached(data->state, data->o_arg.fmode, data->o_arg.open_flags))
goto out_no_action;
rcu_read_lock();
delegation = rcu_dereference(NFS_I(data->state->inode)->delegation);
if (delegation != NULL &&
- (delegation->flags & NFS_DELEGATION_NEED_RECLAIM) == 0) {
+ test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) == 0) {
rcu_read_unlock();
goto out_no_action;
}
@@ -877,7 +987,7 @@ static void nfs4_open_release(void *calldata)
goto out_free;
state = nfs4_opendata_to_nfs4_state(data);
if (!IS_ERR(state))
- nfs4_close_state(&data->path, state, data->o_arg.open_flags);
+ nfs4_close_state(&data->path, state, data->o_arg.fmode);
out_free:
nfs4_opendata_put(data);
}
@@ -955,10 +1065,11 @@ static int nfs4_recover_expired_lease(struct nfs_server *server)
int ret;
for (;;) {
- ret = nfs4_wait_clnt_recover(server->client, clp);
+ ret = nfs4_wait_clnt_recover(clp);
if (ret != 0)
return ret;
- if (!test_and_clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
+ if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) &&
+ !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state))
break;
nfs4_schedule_state_recovery(clp);
}
@@ -993,8 +1104,9 @@ static inline int nfs4_do_open_expired(struct nfs_open_context *ctx, struct nfs4
do {
err = _nfs4_open_expired(ctx, state);
- if (err == -NFS4ERR_DELAY)
- nfs4_handle_exception(server, err, &exception);
+ if (err != -NFS4ERR_DELAY)
+ break;
+ nfs4_handle_exception(server, err, &exception);
} while (exception.retry);
return err;
}
@@ -1031,12 +1143,11 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct
/*
* Returns a referenced nfs4_state
*/
-static int _nfs4_do_open(struct inode *dir, struct path *path, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res)
+static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, int flags, struct iattr *sattr, struct rpc_cred *cred, struct nfs4_state **res)
{
struct nfs4_state_owner *sp;
struct nfs4_state *state = NULL;
struct nfs_server *server = NFS_SERVER(dir);
- struct nfs_client *clp = server->nfs_client;
struct nfs4_opendata *opendata;
int status;
@@ -1050,12 +1161,11 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, int flags, struct
if (status != 0)
goto err_put_state_owner;
if (path->dentry->d_inode != NULL)
- nfs4_return_incompatible_delegation(path->dentry->d_inode, flags & (FMODE_READ|FMODE_WRITE));
- down_read(&clp->cl_sem);
+ nfs4_return_incompatible_delegation(path->dentry->d_inode, fmode);
status = -ENOMEM;
- opendata = nfs4_opendata_alloc(path, sp, flags, sattr);
+ opendata = nfs4_opendata_alloc(path, sp, fmode, flags, sattr);
if (opendata == NULL)
- goto err_release_rwsem;
+ goto err_put_state_owner;
if (path->dentry->d_inode != NULL)
opendata->state = nfs4_get_open_state(path->dentry->d_inode, sp);
@@ -1073,13 +1183,10 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, int flags, struct
goto err_opendata_put;
nfs4_opendata_put(opendata);
nfs4_put_state_owner(sp);
- up_read(&clp->cl_sem);
*res = state;
return 0;
err_opendata_put:
nfs4_opendata_put(opendata);
-err_release_rwsem:
- up_read(&clp->cl_sem);
err_put_state_owner:
nfs4_put_state_owner(sp);
out_err:
@@ -1088,14 +1195,14 @@ out_err:
}
-static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, int flags, struct iattr *sattr, struct rpc_cred *cred)
+static struct nfs4_state *nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, int flags, struct iattr *sattr, struct rpc_cred *cred)
{
struct nfs4_exception exception = { };
struct nfs4_state *res;
int status;
do {
- status = _nfs4_do_open(dir, path, flags, sattr, cred, &res);
+ status = _nfs4_do_open(dir, path, fmode, flags, sattr, cred, &res);
if (status == 0)
break;
/* NOTE: BAD_SEQID means the server and client disagree about the
@@ -1230,10 +1337,13 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
renew_lease(server, calldata->timestamp);
break;
case -NFS4ERR_STALE_STATEID:
+ case -NFS4ERR_OLD_STATEID:
+ case -NFS4ERR_BAD_STATEID:
case -NFS4ERR_EXPIRED:
- break;
+ if (calldata->arg.fmode == 0)
+ break;
default:
- if (nfs4_async_handle_error(task, server) == -EAGAIN) {
+ if (nfs4_async_handle_error(task, server, state) == -EAGAIN) {
rpc_restart_call(task);
return;
}
@@ -1272,10 +1382,10 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
nfs_fattr_init(calldata->res.fattr);
if (test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0) {
task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
- calldata->arg.open_flags = FMODE_READ;
+ calldata->arg.fmode = FMODE_READ;
} else if (test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0) {
task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE];
- calldata->arg.open_flags = FMODE_WRITE;
+ calldata->arg.fmode = FMODE_WRITE;
}
calldata->timestamp = jiffies;
rpc_call_start(task);
@@ -1328,6 +1438,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid);
if (calldata->arg.seqid == NULL)
goto out_free_calldata;
+ calldata->arg.fmode = 0;
calldata->arg.bitmask = server->attr_bitmask;
calldata->res.fattr = &calldata->fattr;
calldata->res.seqid = calldata->arg.seqid;
@@ -1354,13 +1465,13 @@ out:
return status;
}
-static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct nfs4_state *state)
+static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct nfs4_state *state, fmode_t fmode)
{
struct file *filp;
int ret;
/* If the open_intent is for execute, we have an extra check to make */
- if (nd->intent.open.flags & FMODE_EXEC) {
+ if (fmode & FMODE_EXEC) {
ret = nfs_may_open(state->inode,
state->owner->so_cred,
nd->intent.open.flags);
@@ -1376,7 +1487,7 @@ static int nfs4_intent_set_file(struct nameidata *nd, struct path *path, struct
}
ret = PTR_ERR(filp);
out_close:
- nfs4_close_sync(path, state, nd->intent.open.flags);
+ nfs4_close_sync(path, state, fmode & (FMODE_READ|FMODE_WRITE));
return ret;
}
@@ -1392,6 +1503,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
struct rpc_cred *cred;
struct nfs4_state *state;
struct dentry *res;
+ fmode_t fmode = nd->intent.open.flags & (FMODE_READ | FMODE_WRITE | FMODE_EXEC);
if (nd->flags & LOOKUP_CREATE) {
attr.ia_mode = nd->intent.open.create_mode;
@@ -1409,7 +1521,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
parent = dentry->d_parent;
/* Protect against concurrent sillydeletes */
nfs_block_sillyrename(parent);
- state = nfs4_do_open(dir, &path, nd->intent.open.flags, &attr, cred);
+ state = nfs4_do_open(dir, &path, fmode, nd->intent.open.flags, &attr, cred);
put_rpccred(cred);
if (IS_ERR(state)) {
if (PTR_ERR(state) == -ENOENT) {
@@ -1424,7 +1536,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
path.dentry = res;
nfs_set_verifier(path.dentry, nfs_save_change_attribute(dir));
nfs_unblock_sillyrename(parent);
- nfs4_intent_set_file(nd, &path, state);
+ nfs4_intent_set_file(nd, &path, state, fmode);
return res;
}
@@ -1437,11 +1549,12 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st
};
struct rpc_cred *cred;
struct nfs4_state *state;
+ fmode_t fmode = openflags & (FMODE_READ | FMODE_WRITE);
cred = rpc_lookup_cred();
if (IS_ERR(cred))
return PTR_ERR(cred);
- state = nfs4_do_open(dir, &path, openflags, NULL, cred);
+ state = nfs4_do_open(dir, &path, fmode, openflags, NULL, cred);
put_rpccred(cred);
if (IS_ERR(state)) {
switch (PTR_ERR(state)) {
@@ -1458,10 +1571,10 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st
}
if (state->inode == dentry->d_inode) {
nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
- nfs4_intent_set_file(nd, &path, state);
+ nfs4_intent_set_file(nd, &path, state, fmode);
return 1;
}
- nfs4_close_sync(&path, state, openflags);
+ nfs4_close_sync(&path, state, fmode);
out_drop:
d_drop(dentry);
return 0;
@@ -1887,6 +2000,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
};
struct nfs4_state *state;
struct rpc_cred *cred;
+ fmode_t fmode = flags & (FMODE_READ | FMODE_WRITE);
int status = 0;
cred = rpc_lookup_cred();
@@ -1894,7 +2008,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
status = PTR_ERR(cred);
goto out;
}
- state = nfs4_do_open(dir, &path, flags, sattr, cred);
+ state = nfs4_do_open(dir, &path, fmode, flags, sattr, cred);
d_drop(dentry);
if (IS_ERR(state)) {
status = PTR_ERR(state);
@@ -1910,9 +2024,9 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
nfs_post_op_update_inode(state->inode, &fattr);
}
if (status == 0 && (nd->flags & LOOKUP_OPEN) != 0)
- status = nfs4_intent_set_file(nd, &path, state);
+ status = nfs4_intent_set_file(nd, &path, state, fmode);
else
- nfs4_close_sync(&path, state, flags);
+ nfs4_close_sync(&path, state, fmode);
out_putcred:
put_rpccred(cred);
out:
@@ -1974,7 +2088,7 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir)
{
struct nfs_removeres *res = task->tk_msg.rpc_resp;
- if (nfs4_async_handle_error(task, res->server) == -EAGAIN)
+ if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN)
return 0;
update_changeattr(dir, &res->cinfo);
nfs_post_op_update_inode(dir, &res->dir_attr);
@@ -2402,7 +2516,7 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data)
{
struct nfs_server *server = NFS_SERVER(data->inode);
- if (nfs4_async_handle_error(task, server) == -EAGAIN) {
+ if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) {
rpc_restart_call(task);
return -EAGAIN;
}
@@ -2423,7 +2537,7 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data)
{
struct inode *inode = data->inode;
- if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) {
+ if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) {
rpc_restart_call(task);
return -EAGAIN;
}
@@ -2449,7 +2563,7 @@ static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data)
{
struct inode *inode = data->inode;
- if (nfs4_async_handle_error(task, NFS_SERVER(inode)) == -EAGAIN) {
+ if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) {
rpc_restart_call(task);
return -EAGAIN;
}
@@ -2742,19 +2856,25 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen
}
static int
-nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server)
+nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state)
{
struct nfs_client *clp = server->nfs_client;
if (!clp || task->tk_status >= 0)
return 0;
switch(task->tk_status) {
+ case -NFS4ERR_ADMIN_REVOKED:
+ case -NFS4ERR_BAD_STATEID:
+ case -NFS4ERR_OPENMODE:
+ if (state == NULL)
+ break;
+ nfs4_state_mark_reclaim_nograce(clp, state);
case -NFS4ERR_STALE_CLIENTID:
case -NFS4ERR_STALE_STATEID:
case -NFS4ERR_EXPIRED:
rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL);
nfs4_schedule_state_recovery(clp);
- if (test_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state) == 0)
+ if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0)
rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task);
task->tk_status = 0;
return -EAGAIN;
@@ -2772,79 +2892,6 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server)
return 0;
}
-static int nfs4_wait_bit_killable(void *word)
-{
- if (fatal_signal_pending(current))
- return -ERESTARTSYS;
- schedule();
- return 0;
-}
-
-static int nfs4_wait_clnt_recover(struct rpc_clnt *clnt, struct nfs_client *clp)
-{
- int res;
-
- might_sleep();
-
- rwsem_acquire(&clp->cl_sem.dep_map, 0, 0, _RET_IP_);
-
- res = wait_on_bit(&clp->cl_state, NFS4CLNT_STATE_RECOVER,
- nfs4_wait_bit_killable, TASK_KILLABLE);
-
- rwsem_release(&clp->cl_sem.dep_map, 1, _RET_IP_);
- return res;
-}
-
-static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
-{
- int res = 0;
-
- might_sleep();
-
- if (*timeout <= 0)
- *timeout = NFS4_POLL_RETRY_MIN;
- if (*timeout > NFS4_POLL_RETRY_MAX)
- *timeout = NFS4_POLL_RETRY_MAX;
- schedule_timeout_killable(*timeout);
- if (fatal_signal_pending(current))
- res = -ERESTARTSYS;
- *timeout <<= 1;
- return res;
-}
-
-/* This is the error handling routine for processes that are allowed
- * to sleep.
- */
-static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, struct nfs4_exception *exception)
-{
- struct nfs_client *clp = server->nfs_client;
- int ret = errorcode;
-
- exception->retry = 0;
- switch(errorcode) {
- case 0:
- return 0;
- case -NFS4ERR_STALE_CLIENTID:
- case -NFS4ERR_STALE_STATEID:
- case -NFS4ERR_EXPIRED:
- nfs4_schedule_state_recovery(clp);
- ret = nfs4_wait_clnt_recover(server->client, clp);
- if (ret == 0)
- exception->retry = 1;
- break;
- case -NFS4ERR_FILE_OPEN:
- case -NFS4ERR_GRACE:
- case -NFS4ERR_DELAY:
- ret = nfs4_delay(server->client, &exception->timeout);
- if (ret != 0)
- break;
- case -NFS4ERR_OLD_STATEID:
- exception->retry = 1;
- }
- /* We failed to handle the error */
- return nfs4_map_errors(ret);
-}
-
int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short port, struct rpc_cred *cred)
{
nfs4_verifier sc_verifier;
@@ -2916,7 +2963,6 @@ static int _nfs4_proc_setclientid_confirm(struct nfs_client *clp, struct rpc_cre
spin_lock(&clp->cl_lock);
clp->cl_lease_time = fsinfo.lease_time * HZ;
clp->cl_last_renewal = now;
- clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
spin_unlock(&clp->cl_lock);
}
return status;
@@ -3074,7 +3120,6 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
struct nfs4_lock_state *lsp;
int status;
- down_read(&clp->cl_sem);
arg.lock_owner.clientid = clp->cl_clientid;
status = nfs4_set_lock_state(state, request);
if (status != 0)
@@ -3091,7 +3136,6 @@ static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock
}
request->fl_ops->fl_release_private(request);
out:
- up_read(&clp->cl_sem);
return status;
}
@@ -3181,11 +3225,13 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
sizeof(calldata->lsp->ls_stateid.data));
renew_lease(calldata->server, calldata->timestamp);
break;
+ case -NFS4ERR_BAD_STATEID:
+ case -NFS4ERR_OLD_STATEID:
case -NFS4ERR_STALE_STATEID:
case -NFS4ERR_EXPIRED:
break;
default:
- if (nfs4_async_handle_error(task, calldata->server) == -EAGAIN)
+ if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN)
rpc_restart_call(task);
}
}
@@ -3248,6 +3294,7 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl,
static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request)
{
+ struct nfs_inode *nfsi = NFS_I(state->inode);
struct nfs_seqid *seqid;
struct nfs4_lock_state *lsp;
struct rpc_task *task;
@@ -3257,8 +3304,12 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
status = nfs4_set_lock_state(state, request);
/* Unlock _before_ we do the RPC call */
request->fl_flags |= FL_EXISTS;
- if (do_vfs_lock(request->fl_file, request) == -ENOENT)
+ down_read(&nfsi->rwsem);
+ if (do_vfs_lock(request->fl_file, request) == -ENOENT) {
+ up_read(&nfsi->rwsem);
goto out;
+ }
+ up_read(&nfsi->rwsem);
if (status != 0)
goto out;
/* Is this a delegated lock? */
@@ -3484,7 +3535,7 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request
static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
{
- struct nfs_client *clp = state->owner->so_client;
+ struct nfs_inode *nfsi = NFS_I(state->inode);
unsigned char fl_flags = request->fl_flags;
int status;
@@ -3496,19 +3547,13 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
status = do_vfs_lock(request->fl_file, request);
if (status < 0)
goto out;
- down_read(&clp->cl_sem);
+ down_read(&nfsi->rwsem);
if (test_bit(NFS_DELEGATED_STATE, &state->flags)) {
- struct nfs_inode *nfsi = NFS_I(state->inode);
/* Yes: cache locks! */
- down_read(&nfsi->rwsem);
/* ...but avoid races with delegation recall... */
- if (test_bit(NFS_DELEGATED_STATE, &state->flags)) {
- request->fl_flags = fl_flags & ~FL_SLEEP;
- status = do_vfs_lock(request->fl_file, request);
- up_read(&nfsi->rwsem);
- goto out_unlock;
- }
- up_read(&nfsi->rwsem);
+ request->fl_flags = fl_flags & ~FL_SLEEP;
+ status = do_vfs_lock(request->fl_file, request);
+ goto out_unlock;
}
status = _nfs4_do_setlk(state, cmd, request, 0);
if (status != 0)
@@ -3518,7 +3563,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
if (do_vfs_lock(request->fl_file, request) < 0)
printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __func__);
out_unlock:
- up_read(&clp->cl_sem);
+ up_read(&nfsi->rwsem);
out:
request->fl_flags = fl_flags;
return status;
@@ -3664,11 +3709,15 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
}
struct nfs4_state_recovery_ops nfs4_reboot_recovery_ops = {
+ .owner_flag_bit = NFS_OWNER_RECLAIM_REBOOT,
+ .state_flag_bit = NFS_STATE_RECLAIM_REBOOT,
.recover_open = nfs4_open_reclaim,
.recover_lock = nfs4_lock_reclaim,
};
-struct nfs4_state_recovery_ops nfs4_network_partition_recovery_ops = {
+struct nfs4_state_recovery_ops nfs4_nograce_recovery_ops = {
+ .owner_flag_bit = NFS_OWNER_RECLAIM_NOGRACE,
+ .state_flag_bit = NFS_STATE_RECLAIM_NOGRACE,
.recover_open = nfs4_open_expired,
.recover_lock = nfs4_lock_expired,
};
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index 3305acbbe2ae..f524e932ff7b 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -65,7 +65,6 @@ nfs4_renew_state(struct work_struct *work)
long lease, timeout;
unsigned long last, now;
- down_read(&clp->cl_sem);
dprintk("%s: start\n", __func__);
/* Are there any active superblocks? */
if (list_empty(&clp->cl_superblocks))
@@ -77,17 +76,19 @@ nfs4_renew_state(struct work_struct *work)
timeout = (2 * lease) / 3 + (long)last - (long)now;
/* Are we close to a lease timeout? */
if (time_after(now, last + lease/3)) {
- cred = nfs4_get_renew_cred(clp);
+ cred = nfs4_get_renew_cred_locked(clp);
+ spin_unlock(&clp->cl_lock);
if (cred == NULL) {
- set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
- spin_unlock(&clp->cl_lock);
+ if (list_empty(&clp->cl_delegations)) {
+ set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+ goto out;
+ }
nfs_expire_all_delegations(clp);
- goto out;
+ } else {
+ /* Queue an asynchronous RENEW. */
+ nfs4_proc_async_renew(clp, cred);
+ put_rpccred(cred);
}
- spin_unlock(&clp->cl_lock);
- /* Queue an asynchronous RENEW. */
- nfs4_proc_async_renew(clp, cred);
- put_rpccred(cred);
timeout = (2 * lease) / 3;
spin_lock(&clp->cl_lock);
} else
@@ -100,12 +101,11 @@ nfs4_renew_state(struct work_struct *work)
cancel_delayed_work(&clp->cl_renewd);
schedule_delayed_work(&clp->cl_renewd, timeout);
spin_unlock(&clp->cl_lock);
+ nfs_expire_unreferenced_delegations(clp);
out:
- up_read(&clp->cl_sem);
dprintk("%s: done\n", __func__);
}
-/* Must be called with clp->cl_sem locked for writes */
void
nfs4_schedule_state_renewal(struct nfs_client *clp)
{
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 401ef8b28f97..2022fe47966f 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -71,14 +71,12 @@ static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred)
return status;
}
-static struct rpc_cred *nfs4_get_machine_cred(struct nfs_client *clp)
+static struct rpc_cred *nfs4_get_machine_cred_locked(struct nfs_client *clp)
{
struct rpc_cred *cred = NULL;
- spin_lock(&clp->cl_lock);
if (clp->cl_machine_cred != NULL)
cred = get_rpccred(clp->cl_machine_cred);
- spin_unlock(&clp->cl_lock);
return cred;
}
@@ -94,7 +92,7 @@ static void nfs4_clear_machine_cred(struct nfs_client *clp)
put_rpccred(cred);
}
-struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp)
+struct rpc_cred *nfs4_get_renew_cred_locked(struct nfs_client *clp)
{
struct nfs4_state_owner *sp;
struct rb_node *pos;
@@ -110,13 +108,24 @@ struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp)
return cred;
}
+static struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp)
+{
+ struct rpc_cred *cred;
+
+ spin_lock(&clp->cl_lock);
+ cred = nfs4_get_renew_cred_locked(clp);
+ spin_unlock(&clp->cl_lock);
+ return cred;
+}
+
static struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp)
{
struct nfs4_state_owner *sp;
struct rb_node *pos;
struct rpc_cred *cred;
- cred = nfs4_get_machine_cred(clp);
+ spin_lock(&clp->cl_lock);
+ cred = nfs4_get_machine_cred_locked(clp);
if (cred != NULL)
goto out;
pos = rb_first(&clp->cl_state_owners);
@@ -125,6 +134,7 @@ static struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp)
cred = get_rpccred(sp->so_cred);
}
out:
+ spin_unlock(&clp->cl_lock);
return cred;
}
@@ -295,10 +305,6 @@ nfs4_drop_state_owner(struct nfs4_state_owner *sp)
}
}
-/*
- * Note: must be called with clp->cl_sem held in order to prevent races
- * with reboot recovery!
- */
struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct rpc_cred *cred)
{
struct nfs_client *clp = server->nfs_client;
@@ -327,10 +333,6 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct
return sp;
}
-/*
- * Must be called with clp->cl_sem held in order to avoid races
- * with state recovery...
- */
void nfs4_put_state_owner(struct nfs4_state_owner *sp)
{
struct nfs_client *clp = sp->so_client;
@@ -361,18 +363,18 @@ nfs4_alloc_open_state(void)
}
void
-nfs4_state_set_mode_locked(struct nfs4_state *state, mode_t mode)
+nfs4_state_set_mode_locked(struct nfs4_state *state, fmode_t fmode)
{
- if (state->state == mode)
+ if (state->state == fmode)
return;
/* NB! List reordering - see the reclaim code for why. */
- if ((mode & FMODE_WRITE) != (state->state & FMODE_WRITE)) {
- if (mode & FMODE_WRITE)
+ if ((fmode & FMODE_WRITE) != (state->state & FMODE_WRITE)) {
+ if (fmode & FMODE_WRITE)
list_move(&state->open_states, &state->owner->so_states);
else
list_move_tail(&state->open_states, &state->owner->so_states);
}
- state->state = mode;
+ state->state = fmode;
}
static struct nfs4_state *
@@ -432,10 +434,6 @@ out:
return state;
}
-/*
- * Beware! Caller must be holding exactly one
- * reference to clp->cl_sem!
- */
void nfs4_put_open_state(struct nfs4_state *state)
{
struct inode *inode = state->inode;
@@ -456,16 +454,16 @@ void nfs4_put_open_state(struct nfs4_state *state)
/*
* Close the current file.
*/
-static void __nfs4_close(struct path *path, struct nfs4_state *state, mode_t mode, int wait)
+static void __nfs4_close(struct path *path, struct nfs4_state *state, fmode_t fmode, int wait)
{
struct nfs4_state_owner *owner = state->owner;
int call_close = 0;
- int newstate;
+ fmode_t newstate;
atomic_inc(&owner->so_count);
/* Protect against nfs4_find_state() */
spin_lock(&owner->so_lock);
- switch (mode & (FMODE_READ | FMODE_WRITE)) {
+ switch (fmode & (FMODE_READ | FMODE_WRITE)) {
case FMODE_READ:
state->n_rdonly--;
break;
@@ -500,14 +498,14 @@ static void __nfs4_close(struct path *path, struct nfs4_state *state, mode_t mod
nfs4_do_close(path, state, wait);
}
-void nfs4_close_state(struct path *path, struct nfs4_state *state, mode_t mode)
+void nfs4_close_state(struct path *path, struct nfs4_state *state, fmode_t fmode)
{
- __nfs4_close(path, state, mode, 0);
+ __nfs4_close(path, state, fmode, 0);
}
-void nfs4_close_sync(struct path *path, struct nfs4_state *state, mode_t mode)
+void nfs4_close_sync(struct path *path, struct nfs4_state *state, fmode_t fmode)
{
- __nfs4_close(path, state, mode, 1);
+ __nfs4_close(path, state, fmode, 1);
}
/*
@@ -568,7 +566,6 @@ static void nfs4_free_lock_state(struct nfs4_lock_state *lsp)
* Return a compatible lock_state. If no initialized lock_state structure
* exists, return an uninitialized one.
*
- * The caller must be holding clp->cl_sem
*/
static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner)
{
@@ -770,32 +767,34 @@ unlock:
return status;
}
-static int reclaimer(void *);
+static int nfs4_run_state_manager(void *);
-static inline void nfs4_clear_recover_bit(struct nfs_client *clp)
+static void nfs4_clear_state_manager_bit(struct nfs_client *clp)
{
smp_mb__before_clear_bit();
- clear_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state);
+ clear_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state);
smp_mb__after_clear_bit();
- wake_up_bit(&clp->cl_state, NFS4CLNT_STATE_RECOVER);
+ wake_up_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING);
rpc_wake_up(&clp->cl_rpcwaitq);
}
/*
- * State recovery routine
+ * Schedule the nfs_client asynchronous state management routine
*/
-static void nfs4_recover_state(struct nfs_client *clp)
+void nfs4_schedule_state_manager(struct nfs_client *clp)
{
struct task_struct *task;
+ if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)
+ return;
__module_get(THIS_MODULE);
atomic_inc(&clp->cl_count);
- task = kthread_run(reclaimer, clp, "%s-reclaim",
+ task = kthread_run(nfs4_run_state_manager, clp, "%s-manager",
rpc_peeraddr2str(clp->cl_rpcclient,
RPC_DISPLAY_ADDR));
if (!IS_ERR(task))
return;
- nfs4_clear_recover_bit(clp);
+ nfs4_clear_state_manager_bit(clp);
nfs_put_client(clp);
module_put(THIS_MODULE);
}
@@ -807,16 +806,42 @@ void nfs4_schedule_state_recovery(struct nfs_client *clp)
{
if (!clp)
return;
- if (test_and_set_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state) == 0)
- nfs4_recover_state(clp);
+ if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
+ set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
+ nfs4_schedule_state_manager(clp);
}
-static int nfs4_reclaim_locks(struct nfs4_state_recovery_ops *ops, struct nfs4_state *state)
+static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state)
+{
+
+ set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
+ /* Don't recover state that expired before the reboot */
+ if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) {
+ clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
+ return 0;
+ }
+ set_bit(NFS_OWNER_RECLAIM_REBOOT, &state->owner->so_flags);
+ set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
+ return 1;
+}
+
+int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state)
+{
+ set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags);
+ clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
+ set_bit(NFS_OWNER_RECLAIM_NOGRACE, &state->owner->so_flags);
+ set_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state);
+ return 1;
+}
+
+static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_recovery_ops *ops)
{
struct inode *inode = state->inode;
+ struct nfs_inode *nfsi = NFS_I(inode);
struct file_lock *fl;
int status = 0;
+ down_write(&nfsi->rwsem);
for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
continue;
@@ -839,12 +864,14 @@ static int nfs4_reclaim_locks(struct nfs4_state_recovery_ops *ops, struct nfs4_s
goto out_err;
}
}
+ up_write(&nfsi->rwsem);
return 0;
out_err:
+ up_write(&nfsi->rwsem);
return status;
}
-static int nfs4_reclaim_open_state(struct nfs4_state_recovery_ops *ops, struct nfs4_state_owner *sp)
+static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs4_state_recovery_ops *ops)
{
struct nfs4_state *state;
struct nfs4_lock_state *lock;
@@ -858,28 +885,34 @@ static int nfs4_reclaim_open_state(struct nfs4_state_recovery_ops *ops, struct n
* recovering after a network partition or a reboot from a
* server that doesn't support a grace period.
*/
+restart:
+ spin_lock(&sp->so_lock);
list_for_each_entry(state, &sp->so_states, open_states) {
+ if (!test_and_clear_bit(ops->state_flag_bit, &state->flags))
+ continue;
if (state->state == 0)
continue;
+ atomic_inc(&state->count);
+ spin_unlock(&sp->so_lock);
status = ops->recover_open(sp, state);
if (status >= 0) {
- status = nfs4_reclaim_locks(ops, state);
- if (status < 0)
- goto out_err;
- list_for_each_entry(lock, &state->lock_states, ls_locks) {
- if (!(lock->ls_flags & NFS_LOCK_INITIALIZED))
- printk("%s: Lock reclaim failed!\n",
+ status = nfs4_reclaim_locks(state, ops);
+ if (status >= 0) {
+ list_for_each_entry(lock, &state->lock_states, ls_locks) {
+ if (!(lock->ls_flags & NFS_LOCK_INITIALIZED))
+ printk("%s: Lock reclaim failed!\n",
__func__);
+ }
+ nfs4_put_open_state(state);
+ goto restart;
}
- continue;
}
switch (status) {
default:
printk(KERN_ERR "%s: unhandled error %d. Zeroing state\n",
__func__, status);
case -ENOENT:
- case -NFS4ERR_RECLAIM_BAD:
- case -NFS4ERR_RECLAIM_CONFLICT:
+ case -ESTALE:
/*
* Open state on this file cannot be recovered
* All we can do is revert to using the zero stateid.
@@ -889,84 +922,176 @@ static int nfs4_reclaim_open_state(struct nfs4_state_recovery_ops *ops, struct n
/* Mark the file as being 'closed' */
state->state = 0;
break;
+ case -NFS4ERR_RECLAIM_BAD:
+ case -NFS4ERR_RECLAIM_CONFLICT:
+ nfs4_state_mark_reclaim_nograce(sp->so_client, state);
+ break;
case -NFS4ERR_EXPIRED:
case -NFS4ERR_NO_GRACE:
+ nfs4_state_mark_reclaim_nograce(sp->so_client, state);
case -NFS4ERR_STALE_CLIENTID:
goto out_err;
}
+ nfs4_put_open_state(state);
+ goto restart;
}
+ spin_unlock(&sp->so_lock);
return 0;
out_err:
+ nfs4_put_open_state(state);
return status;
}
-static void nfs4_state_mark_reclaim(struct nfs_client *clp)
+static void nfs4_clear_open_state(struct nfs4_state *state)
+{
+ struct nfs4_lock_state *lock;
+
+ clear_bit(NFS_DELEGATED_STATE, &state->flags);
+ clear_bit(NFS_O_RDONLY_STATE, &state->flags);
+ clear_bit(NFS_O_WRONLY_STATE, &state->flags);
+ clear_bit(NFS_O_RDWR_STATE, &state->flags);
+ list_for_each_entry(lock, &state->lock_states, ls_locks) {
+ lock->ls_seqid.flags = 0;
+ lock->ls_flags &= ~NFS_LOCK_INITIALIZED;
+ }
+}
+
+static void nfs4_state_mark_reclaim_helper(struct nfs_client *clp, int (*mark_reclaim)(struct nfs_client *clp, struct nfs4_state *state))
{
struct nfs4_state_owner *sp;
struct rb_node *pos;
struct nfs4_state *state;
- struct nfs4_lock_state *lock;
/* Reset all sequence ids to zero */
for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
- sp->so_seqid.counter = 0;
sp->so_seqid.flags = 0;
spin_lock(&sp->so_lock);
list_for_each_entry(state, &sp->so_states, open_states) {
- clear_bit(NFS_DELEGATED_STATE, &state->flags);
- clear_bit(NFS_O_RDONLY_STATE, &state->flags);
- clear_bit(NFS_O_WRONLY_STATE, &state->flags);
- clear_bit(NFS_O_RDWR_STATE, &state->flags);
- list_for_each_entry(lock, &state->lock_states, ls_locks) {
- lock->ls_seqid.counter = 0;
- lock->ls_seqid.flags = 0;
- lock->ls_flags &= ~NFS_LOCK_INITIALIZED;
- }
+ if (mark_reclaim(clp, state))
+ nfs4_clear_open_state(state);
}
spin_unlock(&sp->so_lock);
}
}
-static int reclaimer(void *ptr)
+static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp)
+{
+ /* Mark all delegations for reclaim */
+ nfs_delegation_mark_reclaim(clp);
+ nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_reboot);
+}
+
+static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
{
- struct nfs_client *clp = ptr;
struct nfs4_state_owner *sp;
struct rb_node *pos;
- struct nfs4_state_recovery_ops *ops;
- struct rpc_cred *cred;
+ struct nfs4_state *state;
+
+ if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
+ return;
+
+ for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
+ sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
+ spin_lock(&sp->so_lock);
+ list_for_each_entry(state, &sp->so_states, open_states) {
+ if (!test_and_clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags))
+ continue;
+ nfs4_state_mark_reclaim_nograce(clp, state);
+ }
+ spin_unlock(&sp->so_lock);
+ }
+
+ nfs_delegation_reap_unclaimed(clp);
+}
+
+static void nfs_delegation_clear_all(struct nfs_client *clp)
+{
+ nfs_delegation_mark_reclaim(clp);
+ nfs_delegation_reap_unclaimed(clp);
+}
+
+static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp)
+{
+ nfs_delegation_clear_all(clp);
+ nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce);
+}
+
+static void nfs4_state_end_reclaim_nograce(struct nfs_client *clp)
+{
+ clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state);
+}
+
+static void nfs4_recovery_handle_error(struct nfs_client *clp, int error)
+{
+ switch (error) {
+ case -NFS4ERR_CB_PATH_DOWN:
+ nfs_handle_cb_pathdown(clp);
+ break;
+ case -NFS4ERR_STALE_CLIENTID:
+ case -NFS4ERR_LEASE_MOVED:
+ set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+ nfs4_state_start_reclaim_reboot(clp);
+ break;
+ case -NFS4ERR_EXPIRED:
+ set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+ nfs4_state_start_reclaim_nograce(clp);
+ }
+}
+
+static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops)
+{
+ struct rb_node *pos;
int status = 0;
- allow_signal(SIGKILL);
+restart:
+ spin_lock(&clp->cl_lock);
+ for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
+ struct nfs4_state_owner *sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
+ if (!test_and_clear_bit(ops->owner_flag_bit, &sp->so_flags))
+ continue;
+ atomic_inc(&sp->so_count);
+ spin_unlock(&clp->cl_lock);
+ status = nfs4_reclaim_open_state(sp, ops);
+ if (status < 0) {
+ set_bit(ops->owner_flag_bit, &sp->so_flags);
+ nfs4_put_state_owner(sp);
+ nfs4_recovery_handle_error(clp, status);
+ return status;
+ }
+ nfs4_put_state_owner(sp);
+ goto restart;
+ }
+ spin_unlock(&clp->cl_lock);
+ return status;
+}
- /* Ensure exclusive access to NFSv4 state */
- down_write(&clp->cl_sem);
- /* Are there any NFS mounts out there? */
- if (list_empty(&clp->cl_superblocks))
- goto out;
-restart_loop:
- ops = &nfs4_network_partition_recovery_ops;
- /* Are there any open files on this volume? */
+static int nfs4_check_lease(struct nfs_client *clp)
+{
+ struct rpc_cred *cred;
+ int status = -NFS4ERR_EXPIRED;
+
+ /* Is the client already known to have an expired lease? */
+ if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state))
+ return 0;
cred = nfs4_get_renew_cred(clp);
- if (cred != NULL) {
- /* Yes there are: try to renew the old lease */
- status = nfs4_proc_renew(clp, cred);
- put_rpccred(cred);
- switch (status) {
- case 0:
- case -NFS4ERR_CB_PATH_DOWN:
- goto out;
- case -NFS4ERR_STALE_CLIENTID:
- case -NFS4ERR_LEASE_MOVED:
- ops = &nfs4_reboot_recovery_ops;
- }
- } else {
- /* "reboot" to ensure we clear all state on the server */
- clp->cl_boot_time = CURRENT_TIME;
+ if (cred == NULL) {
+ cred = nfs4_get_setclientid_cred(clp);
+ if (cred == NULL)
+ goto out;
}
- /* We're going to have to re-establish a clientid */
- nfs4_state_mark_reclaim(clp);
- status = -ENOENT;
+ status = nfs4_proc_renew(clp, cred);
+ put_rpccred(cred);
+out:
+ nfs4_recovery_handle_error(clp, status);
+ return status;
+}
+
+static int nfs4_reclaim_lease(struct nfs_client *clp)
+{
+ struct rpc_cred *cred;
+ int status = -ENOENT;
+
cred = nfs4_get_setclientid_cred(clp);
if (cred != NULL) {
status = nfs4_init_client(clp, cred);
@@ -974,42 +1099,90 @@ restart_loop:
/* Handle case where the user hasn't set up machine creds */
if (status == -EACCES && cred == clp->cl_machine_cred) {
nfs4_clear_machine_cred(clp);
- goto restart_loop;
+ status = -EAGAIN;
}
}
- if (status)
- goto out_error;
- /* Mark all delegations for reclaim */
- nfs_delegation_mark_reclaim(clp);
- /* Note: list is protected by exclusive lock on cl->cl_sem */
- for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) {
- sp = rb_entry(pos, struct nfs4_state_owner, so_client_node);
- status = nfs4_reclaim_open_state(ops, sp);
- if (status < 0) {
- if (status == -NFS4ERR_NO_GRACE) {
- ops = &nfs4_network_partition_recovery_ops;
- status = nfs4_reclaim_open_state(ops, sp);
+ return status;
+}
+
+static void nfs4_state_manager(struct nfs_client *clp)
+{
+ int status = 0;
+
+ /* Ensure exclusive access to NFSv4 state */
+ for(;;) {
+ if (test_and_clear_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) {
+ /* We're going to have to re-establish a clientid */
+ status = nfs4_reclaim_lease(clp);
+ if (status) {
+ set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
+ if (status == -EAGAIN)
+ continue;
+ goto out_error;
}
+ clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
+ }
+
+ if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) {
+ status = nfs4_check_lease(clp);
+ if (status != 0)
+ continue;
+ }
+
+ /* First recover reboot state... */
+ if (test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) {
+ status = nfs4_do_reclaim(clp, &nfs4_reboot_recovery_ops);
if (status == -NFS4ERR_STALE_CLIENTID)
- goto restart_loop;
- if (status == -NFS4ERR_EXPIRED)
- goto restart_loop;
+ continue;
+ nfs4_state_end_reclaim_reboot(clp);
+ continue;
+ }
+
+ /* Now recover expired state... */
+ if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) {
+ status = nfs4_do_reclaim(clp, &nfs4_nograce_recovery_ops);
+ if (status < 0) {
+ set_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state);
+ if (status == -NFS4ERR_STALE_CLIENTID)
+ continue;
+ if (status == -NFS4ERR_EXPIRED)
+ continue;
+ goto out_error;
+ } else
+ nfs4_state_end_reclaim_nograce(clp);
+ continue;
}
+
+ if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) {
+ nfs_client_return_marked_delegations(clp);
+ continue;
+ }
+
+ nfs4_clear_state_manager_bit(clp);
+ /* Did we race with an attempt to give us more work? */
+ if (clp->cl_state == 0)
+ break;
+ if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)
+ break;
}
- nfs_delegation_reap_unclaimed(clp);
-out:
- up_write(&clp->cl_sem);
- if (status == -NFS4ERR_CB_PATH_DOWN)
- nfs_handle_cb_pathdown(clp);
- nfs4_clear_recover_bit(clp);
+ return;
+out_error:
+ printk(KERN_WARNING "Error: state manager failed on NFSv4 server %s"
+ " with error %d\n", clp->cl_hostname, -status);
+ if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state))
+ nfs4_state_end_reclaim_reboot(clp);
+ nfs4_clear_state_manager_bit(clp);
+}
+
+static int nfs4_run_state_manager(void *ptr)
+{
+ struct nfs_client *clp = ptr;
+
+ allow_signal(SIGKILL);
+ nfs4_state_manager(clp);
nfs_put_client(clp);
module_put_and_exit(0);
return 0;
-out_error:
- printk(KERN_WARNING "Error: state recovery failed on NFSv4 server %s"
- " with error %d\n", clp->cl_hostname, -status);
- set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
- goto out;
}
/*
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index b916297d2334..d1e4c8f8a0a9 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -8,7 +8,7 @@
*
* Kendrick Smith <kmsmith@umich.edu>
* Andy Adamson <andros@umich.edu>
- *
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -67,7 +67,7 @@ static int nfs4_stat_to_errno(int);
#define NFS4_MAXTAGLEN 0
#endif
-/* lock,open owner id:
+/* lock,open owner id:
* we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2)
*/
#define open_owner_id_maxsz (1 + 4)
@@ -541,6 +541,7 @@ static struct {
struct compound_hdr {
int32_t status;
uint32_t nops;
+ __be32 * nops_p;
uint32_t taglen;
char * tag;
};
@@ -578,7 +579,7 @@ static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *
xdr_encode_opaque(p, str, len);
}
-static int encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
+static void encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
{
__be32 *p;
@@ -588,8 +589,13 @@ static int encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
WRITE32(hdr->taglen);
WRITEMEM(hdr->tag, hdr->taglen);
WRITE32(NFS4_MINOR_VERSION);
+ hdr->nops_p = p;
WRITE32(hdr->nops);
- return 0;
+}
+
+static void encode_nops(struct compound_hdr *hdr)
+{
+ *hdr->nops_p = htonl(hdr->nops);
}
static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *verf)
@@ -601,7 +607,7 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve
xdr_encode_opaque_fixed(p, verf->data, NFS4_VERIFIER_SIZE);
}
-static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server)
+static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs_server *server)
{
char owner_name[IDMAP_NAMESZ];
char owner_group[IDMAP_NAMESZ];
@@ -612,7 +618,6 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s
int len;
uint32_t bmval0 = 0;
uint32_t bmval1 = 0;
- int status;
/*
* We reserve enough space to write the entire attribute buffer at once.
@@ -709,7 +714,7 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s
bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET;
WRITE32(NFS4_SET_TO_SERVER_TIME);
}
-
+
/*
* Now we backfill the bitmap and the attribute buffer length.
*/
@@ -723,23 +728,20 @@ static int encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const s
*q++ = htonl(bmval1);
*q++ = htonl(len);
- status = 0;
/* out: */
- return status;
}
-static int encode_access(struct xdr_stream *xdr, u32 access)
+static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hdr *hdr)
{
__be32 *p;
RESERVE_SPACE(8);
WRITE32(OP_ACCESS);
WRITE32(access);
-
- return 0;
+ hdr->nops++;
}
-static int encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg)
+static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr)
{
__be32 *p;
@@ -747,26 +749,24 @@ static int encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg)
WRITE32(OP_CLOSE);
WRITE32(arg->seqid->sequence->counter);
WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
-
- return 0;
+ hdr->nops++;
}
-static int encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *args)
+static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr)
{
__be32 *p;
-
- RESERVE_SPACE(16);
- WRITE32(OP_COMMIT);
- WRITE64(args->offset);
- WRITE32(args->count);
- return 0;
+ RESERVE_SPACE(16);
+ WRITE32(OP_COMMIT);
+ WRITE64(args->offset);
+ WRITE32(args->count);
+ hdr->nops++;
}
-static int encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *create)
+static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *create, struct compound_hdr *hdr)
{
__be32 *p;
-
+
RESERVE_SPACE(8);
WRITE32(OP_CREATE);
WRITE32(create->ftype);
@@ -791,64 +791,62 @@ static int encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg *c
RESERVE_SPACE(4 + create->name->len);
WRITE32(create->name->len);
WRITEMEM(create->name->name, create->name->len);
+ hdr->nops++;
- return encode_attrs(xdr, create->attrs, create->server);
+ encode_attrs(xdr, create->attrs, create->server);
}
-static int encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap)
+static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr)
{
- __be32 *p;
+ __be32 *p;
- RESERVE_SPACE(12);
- WRITE32(OP_GETATTR);
- WRITE32(1);
- WRITE32(bitmap);
- return 0;
+ RESERVE_SPACE(12);
+ WRITE32(OP_GETATTR);
+ WRITE32(1);
+ WRITE32(bitmap);
+ hdr->nops++;
}
-static int encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1)
+static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm1, struct compound_hdr *hdr)
{
- __be32 *p;
+ __be32 *p;
- RESERVE_SPACE(16);
- WRITE32(OP_GETATTR);
- WRITE32(2);
- WRITE32(bm0);
- WRITE32(bm1);
- return 0;
+ RESERVE_SPACE(16);
+ WRITE32(OP_GETATTR);
+ WRITE32(2);
+ WRITE32(bm0);
+ WRITE32(bm1);
+ hdr->nops++;
}
-static int encode_getfattr(struct xdr_stream *xdr, const u32* bitmask)
+static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
{
- return encode_getattr_two(xdr,
- bitmask[0] & nfs4_fattr_bitmap[0],
- bitmask[1] & nfs4_fattr_bitmap[1]);
+ encode_getattr_two(xdr, bitmask[0] & nfs4_fattr_bitmap[0],
+ bitmask[1] & nfs4_fattr_bitmap[1], hdr);
}
-static int encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask)
+static void encode_fsinfo(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
{
- return encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0],
- bitmask[1] & nfs4_fsinfo_bitmap[1]);
+ encode_getattr_two(xdr, bitmask[0] & nfs4_fsinfo_bitmap[0],
+ bitmask[1] & nfs4_fsinfo_bitmap[1], hdr);
}
-static int encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask)
+static void encode_fs_locations(struct xdr_stream *xdr, const u32* bitmask, struct compound_hdr *hdr)
{
- return encode_getattr_two(xdr,
- bitmask[0] & nfs4_fs_locations_bitmap[0],
- bitmask[1] & nfs4_fs_locations_bitmap[1]);
+ encode_getattr_two(xdr, bitmask[0] & nfs4_fs_locations_bitmap[0],
+ bitmask[1] & nfs4_fs_locations_bitmap[1], hdr);
}
-static int encode_getfh(struct xdr_stream *xdr)
+static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
{
__be32 *p;
RESERVE_SPACE(4);
WRITE32(OP_GETFH);
-
- return 0;
+ hdr->nops++;
}
-static int encode_link(struct xdr_stream *xdr, const struct qstr *name)
+static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
{
__be32 *p;
@@ -856,8 +854,7 @@ static int encode_link(struct xdr_stream *xdr, const struct qstr *name)
WRITE32(OP_LINK);
WRITE32(name->len);
WRITEMEM(name->name, name->len);
-
- return 0;
+ hdr->nops++;
}
static inline int nfs4_lock_type(struct file_lock *fl, int block)
@@ -878,7 +875,7 @@ static inline uint64_t nfs4_lock_length(struct file_lock *fl)
* opcode,type,reclaim,offset,length,new_lock_owner = 32
* open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40
*/
-static int encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args)
+static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args, struct compound_hdr *hdr)
{
__be32 *p;
@@ -904,11 +901,10 @@ static int encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args)
WRITEMEM(args->lock_stateid->data, NFS4_STATEID_SIZE);
WRITE32(args->lock_seqid->sequence->counter);
}
-
- return 0;
+ hdr->nops++;
}
-static int encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args)
+static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *args, struct compound_hdr *hdr)
{
__be32 *p;
@@ -921,11 +917,10 @@ static int encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *arg
WRITE32(16);
WRITEMEM("lock id:", 8);
WRITE64(args->lock_owner.id);
-
- return 0;
+ hdr->nops++;
}
-static int encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args)
+static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *args, struct compound_hdr *hdr)
{
__be32 *p;
@@ -936,11 +931,10 @@ static int encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *arg
WRITEMEM(args->stateid->data, NFS4_STATEID_SIZE);
WRITE64(args->fl->fl_start);
WRITE64(nfs4_lock_length(args->fl));
-
- return 0;
+ hdr->nops++;
}
-static int encode_lookup(struct xdr_stream *xdr, const struct qstr *name)
+static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
{
int len = name->len;
__be32 *p;
@@ -949,27 +943,26 @@ static int encode_lookup(struct xdr_stream *xdr, const struct qstr *name)
WRITE32(OP_LOOKUP);
WRITE32(len);
WRITEMEM(name->name, len);
-
- return 0;
+ hdr->nops++;
}
-static void encode_share_access(struct xdr_stream *xdr, int open_flags)
+static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode)
{
__be32 *p;
RESERVE_SPACE(8);
- switch (open_flags & (FMODE_READ|FMODE_WRITE)) {
- case FMODE_READ:
- WRITE32(NFS4_SHARE_ACCESS_READ);
- break;
- case FMODE_WRITE:
- WRITE32(NFS4_SHARE_ACCESS_WRITE);
- break;
- case FMODE_READ|FMODE_WRITE:
- WRITE32(NFS4_SHARE_ACCESS_BOTH);
- break;
- default:
- BUG();
+ switch (fmode & (FMODE_READ|FMODE_WRITE)) {
+ case FMODE_READ:
+ WRITE32(NFS4_SHARE_ACCESS_READ);
+ break;
+ case FMODE_WRITE:
+ WRITE32(NFS4_SHARE_ACCESS_WRITE);
+ break;
+ case FMODE_READ|FMODE_WRITE:
+ WRITE32(NFS4_SHARE_ACCESS_BOTH);
+ break;
+ default:
+ WRITE32(0);
}
WRITE32(0); /* for linux, share_deny = 0 always */
}
@@ -984,7 +977,7 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
RESERVE_SPACE(8);
WRITE32(OP_OPEN);
WRITE32(arg->seqid->sequence->counter);
- encode_share_access(xdr, arg->open_flags);
+ encode_share_access(xdr, arg->fmode);
RESERVE_SPACE(28);
WRITE64(arg->clientid);
WRITE32(16);
@@ -998,13 +991,13 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op
RESERVE_SPACE(4);
switch(arg->open_flags & O_EXCL) {
- case 0:
- WRITE32(NFS4_CREATE_UNCHECKED);
- encode_attrs(xdr, arg->u.attrs, arg->server);
- break;
- default:
- WRITE32(NFS4_CREATE_EXCLUSIVE);
- encode_nfs4_verifier(xdr, &arg->u.verifier);
+ case 0:
+ WRITE32(NFS4_CREATE_UNCHECKED);
+ encode_attrs(xdr, arg->u.attrs, arg->server);
+ break;
+ default:
+ WRITE32(NFS4_CREATE_EXCLUSIVE);
+ encode_nfs4_verifier(xdr, &arg->u.verifier);
}
}
@@ -1014,33 +1007,33 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a
RESERVE_SPACE(4);
switch (arg->open_flags & O_CREAT) {
- case 0:
- WRITE32(NFS4_OPEN_NOCREATE);
- break;
- default:
- BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL);
- WRITE32(NFS4_OPEN_CREATE);
- encode_createmode(xdr, arg);
+ case 0:
+ WRITE32(NFS4_OPEN_NOCREATE);
+ break;
+ default:
+ BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL);
+ WRITE32(NFS4_OPEN_CREATE);
+ encode_createmode(xdr, arg);
}
}
-static inline void encode_delegation_type(struct xdr_stream *xdr, int delegation_type)
+static inline void encode_delegation_type(struct xdr_stream *xdr, fmode_t delegation_type)
{
__be32 *p;
RESERVE_SPACE(4);
switch (delegation_type) {
- case 0:
- WRITE32(NFS4_OPEN_DELEGATE_NONE);
- break;
- case FMODE_READ:
- WRITE32(NFS4_OPEN_DELEGATE_READ);
- break;
- case FMODE_WRITE|FMODE_READ:
- WRITE32(NFS4_OPEN_DELEGATE_WRITE);
- break;
- default:
- BUG();
+ case 0:
+ WRITE32(NFS4_OPEN_DELEGATE_NONE);
+ break;
+ case FMODE_READ:
+ WRITE32(NFS4_OPEN_DELEGATE_READ);
+ break;
+ case FMODE_WRITE|FMODE_READ:
+ WRITE32(NFS4_OPEN_DELEGATE_WRITE);
+ break;
+ default:
+ BUG();
}
}
@@ -1053,7 +1046,7 @@ static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr *
encode_string(xdr, name->len, name->name);
}
-static inline void encode_claim_previous(struct xdr_stream *xdr, int type)
+static inline void encode_claim_previous(struct xdr_stream *xdr, fmode_t type)
{
__be32 *p;
@@ -1072,27 +1065,27 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc
encode_string(xdr, name->len, name->name);
}
-static int encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg)
+static void encode_open(struct xdr_stream *xdr, const struct nfs_openargs *arg, struct compound_hdr *hdr)
{
encode_openhdr(xdr, arg);
encode_opentype(xdr, arg);
switch (arg->claim) {
- case NFS4_OPEN_CLAIM_NULL:
- encode_claim_null(xdr, arg->name);
- break;
- case NFS4_OPEN_CLAIM_PREVIOUS:
- encode_claim_previous(xdr, arg->u.delegation_type);
- break;
- case NFS4_OPEN_CLAIM_DELEGATE_CUR:
- encode_claim_delegate_cur(xdr, arg->name, &arg->u.delegation);
- break;
- default:
- BUG();
+ case NFS4_OPEN_CLAIM_NULL:
+ encode_claim_null(xdr, arg->name);
+ break;
+ case NFS4_OPEN_CLAIM_PREVIOUS:
+ encode_claim_previous(xdr, arg->u.delegation_type);
+ break;
+ case NFS4_OPEN_CLAIM_DELEGATE_CUR:
+ encode_claim_delegate_cur(xdr, arg->name, &arg->u.delegation);
+ break;
+ default:
+ BUG();
}
- return 0;
+ hdr->nops++;
}
-static int encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg)
+static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_confirmargs *arg, struct compound_hdr *hdr)
{
__be32 *p;
@@ -1100,11 +1093,10 @@ static int encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_con
WRITE32(OP_OPEN_CONFIRM);
WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
WRITE32(arg->seqid->sequence->counter);
-
- return 0;
+ hdr->nops++;
}
-static int encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg)
+static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr)
{
__be32 *p;
@@ -1112,12 +1104,12 @@ static int encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closea
WRITE32(OP_OPEN_DOWNGRADE);
WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE);
WRITE32(arg->seqid->sequence->counter);
- encode_share_access(xdr, arg->open_flags);
- return 0;
+ encode_share_access(xdr, arg->fmode);
+ hdr->nops++;
}
-static int
-encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh)
+static void
+encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hdr *hdr)
{
int len = fh->size;
__be32 *p;
@@ -1126,18 +1118,16 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh)
WRITE32(OP_PUTFH);
WRITE32(len);
WRITEMEM(fh->data, len);
-
- return 0;
+ hdr->nops++;
}
-static int encode_putrootfh(struct xdr_stream *xdr)
+static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr)
{
- __be32 *p;
-
- RESERVE_SPACE(4);
- WRITE32(OP_PUTROOTFH);
+ __be32 *p;
- return 0;
+ RESERVE_SPACE(4);
+ WRITE32(OP_PUTROOTFH);
+ hdr->nops++;
}
static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx)
@@ -1153,7 +1143,7 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context
WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE);
}
-static int encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args)
+static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr)
{
__be32 *p;
@@ -1165,11 +1155,10 @@ static int encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args)
RESERVE_SPACE(12);
WRITE64(args->offset);
WRITE32(args->count);
-
- return 0;
+ hdr->nops++;
}
-static int encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req)
+static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg *readdir, struct rpc_rqst *req, struct compound_hdr *hdr)
{
uint32_t attrs[2] = {
FATTR4_WORD0_RDATTR_ERROR|FATTR4_WORD0_FILEID,
@@ -1191,6 +1180,7 @@ static int encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
WRITE32(attrs[0] & readdir->bitmask[0]);
WRITE32(attrs[1] & readdir->bitmask[1]);
+ hdr->nops++;
dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n",
__func__,
(unsigned long long)readdir->cookie,
@@ -1198,21 +1188,18 @@ static int encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg
((u32 *)readdir->verifier.data)[1],
attrs[0] & readdir->bitmask[0],
attrs[1] & readdir->bitmask[1]);
-
- return 0;
}
-static int encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req)
+static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink *readlink, struct rpc_rqst *req, struct compound_hdr *hdr)
{
__be32 *p;
RESERVE_SPACE(4);
WRITE32(OP_READLINK);
-
- return 0;
+ hdr->nops++;
}
-static int encode_remove(struct xdr_stream *xdr, const struct qstr *name)
+static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr)
{
__be32 *p;
@@ -1220,11 +1207,10 @@ static int encode_remove(struct xdr_stream *xdr, const struct qstr *name)
WRITE32(OP_REMOVE);
WRITE32(name->len);
WRITEMEM(name->name, name->len);
-
- return 0;
+ hdr->nops++;
}
-static int encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, const struct qstr *newname)
+static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, const struct qstr *newname, struct compound_hdr *hdr)
{
__be32 *p;
@@ -1232,38 +1218,35 @@ static int encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, con
WRITE32(OP_RENAME);
WRITE32(oldname->len);
WRITEMEM(oldname->name, oldname->len);
-
+
RESERVE_SPACE(4 + newname->len);
WRITE32(newname->len);
WRITEMEM(newname->name, newname->len);
-
- return 0;
+ hdr->nops++;
}
-static int encode_renew(struct xdr_stream *xdr, const struct nfs_client *client_stateid)
+static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client_stateid, struct compound_hdr *hdr)
{
__be32 *p;
RESERVE_SPACE(12);
WRITE32(OP_RENEW);
WRITE64(client_stateid->cl_clientid);
-
- return 0;
+ hdr->nops++;
}
-static int
-encode_restorefh(struct xdr_stream *xdr)
+static void
+encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
{
__be32 *p;
RESERVE_SPACE(4);
WRITE32(OP_RESTOREFH);
-
- return 0;
+ hdr->nops++;
}
static int
-encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg)
+encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compound_hdr *hdr)
{
__be32 *p;
@@ -1278,36 +1261,32 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg)
RESERVE_SPACE(4);
WRITE32(arg->acl_len);
xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len);
+ hdr->nops++;
return 0;
}
-static int
-encode_savefh(struct xdr_stream *xdr)
+static void
+encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr)
{
__be32 *p;
RESERVE_SPACE(4);
WRITE32(OP_SAVEFH);
-
- return 0;
+ hdr->nops++;
}
-static int encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *arg, const struct nfs_server *server)
+static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs *arg, const struct nfs_server *server, struct compound_hdr *hdr)
{
- int status;
__be32 *p;
-
- RESERVE_SPACE(4+NFS4_STATEID_SIZE);
- WRITE32(OP_SETATTR);
- WRITEMEM(arg->stateid.data, NFS4_STATEID_SIZE);
- if ((status = encode_attrs(xdr, arg->iap, server)))
- return status;
-
- return 0;
+ RESERVE_SPACE(4+NFS4_STATEID_SIZE);
+ WRITE32(OP_SETATTR);
+ WRITEMEM(arg->stateid.data, NFS4_STATEID_SIZE);
+ hdr->nops++;
+ encode_attrs(xdr, arg->iap, server);
}
-static int encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid)
+static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr)
{
__be32 *p;
@@ -1322,23 +1301,21 @@ static int encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclien
encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr);
RESERVE_SPACE(4);
WRITE32(setclientid->sc_cb_ident);
-
- return 0;
+ hdr->nops++;
}
-static int encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_client *client_state)
+static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_client *client_state, struct compound_hdr *hdr)
{
- __be32 *p;
-
- RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE);
- WRITE32(OP_SETCLIENTID_CONFIRM);
- WRITE64(client_state->cl_clientid);
- WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
+ __be32 *p;
- return 0;
+ RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE);
+ WRITE32(OP_SETCLIENTID_CONFIRM);
+ WRITE64(client_state->cl_clientid);
+ WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE);
+ hdr->nops++;
}
-static int encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args)
+static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args, struct compound_hdr *hdr)
{
__be32 *p;
@@ -1353,11 +1330,10 @@ static int encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *args
WRITE32(args->count);
xdr_write_pages(xdr, args->pages, args->pgbase, args->count);
-
- return 0;
+ hdr->nops++;
}
-static int encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid)
+static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *stateid, struct compound_hdr *hdr)
{
__be32 *p;
@@ -1365,8 +1341,7 @@ static int encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *statei
WRITE32(OP_DELEGRETURN);
WRITEMEM(stateid->data, NFS4_STATEID_SIZE);
- return 0;
-
+ hdr->nops++;
}
/*
* END OF "GENERIC" ENCODE ROUTINES.
@@ -1379,21 +1354,16 @@ static int nfs4_xdr_enc_access(struct rpc_rqst *req, __be32 *p, const struct nfs
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
- .nops = 3,
+ .nops = 0,
};
- int status;
xdr_init_encode(&xdr, &req->rq_snd_buf, p);
encode_compound_hdr(&xdr, &hdr);
- status = encode_putfh(&xdr, args->fh);
- if (status != 0)
- goto out;
- status = encode_access(&xdr, args->access);
- if (status != 0)
- goto out;
- status = encode_getfattr(&xdr, args->bitmask);
-out:
- return status;
+ encode_putfh(&xdr, args->fh, &hdr);
+ encode_access(&xdr, args->access, &hdr);
+ encode_getfattr(&xdr, args->bitmask, &hdr);
+ encode_nops(&hdr);
+ return 0;
}
/*
@@ -1403,21 +1373,17 @@ static int nfs4_xdr_enc_lookup(struct rpc_rqst *req, __be32 *p, const struct nfs
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
- .nops = 4,
+ .nops = 0,
};
- int status;
xdr_init_encode(&xdr, &req->rq_snd_buf, p);
encode_compound_hdr(&xdr, &hdr);
- if ((status = encode_putfh(&xdr, args->dir_fh)) != 0)
- goto out;
- if ((status = encode_lookup(&xdr, args->name)) != 0)
- goto out;
- if ((status = encode_getfh(&xdr)) != 0)
- goto out;
- status = encode_getfattr(&xdr, args->bitmask);
-out:
- return status;
+ encode_putfh(&xdr, args->dir_fh, &hdr);
+ encode_lookup(&xdr, args->name, &hdr);
+ encode_getfh(&xdr, &hdr);
+ encode_getfattr(&xdr, args->bitmask, &hdr);
+ encode_nops(&hdr);
+ return 0;
}
/*
@@ -1427,18 +1393,16 @@ static int nfs4_xdr_enc_lookup_root(struct rpc_rqst *req, __be32 *p, const struc
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
- .nops = 3,
+ .nops = 0,
};
- int status;
xdr_init_encode(&xdr, &req->rq_snd_buf, p);
encode_compound_hdr(&xdr, &hdr);
- if ((status = encode_putrootfh(&xdr)) != 0)
- goto out;
- if ((status = encode_getfh(&xdr)) == 0)
- status = encode_getfattr(&xdr, args->bitmask);
-out:
- return status;
+ encode_putrootfh(&xdr, &hdr);
+ encode_getfh(&xdr, &hdr);
+ encode_getfattr(&xdr, args->bitmask, &hdr);
+ encode_nops(&hdr);
+ return 0;
}
/*
@@ -1448,19 +1412,16 @@ static int nfs4_xdr_enc_remove(struct rpc_rqst *req, __be32 *p, const struct nfs
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
- .nops = 3,
+ .nops = 0,
};
- int status;
xdr_init_encode(&xdr, &req->rq_snd_buf, p);
encode_compound_hdr(&xdr, &hdr);
- if ((status = encode_putfh(&xdr, args->fh)) != 0)
- goto out;
- if ((status = encode_remove(&xdr, &args->name)) != 0)
- goto out;
- status = encode_getfattr(&xdr, args->bitmask);
-out:
- return status;
+ encode_putfh(&xdr, args->fh, &hdr);
+ encode_remove(&xdr, &args->name, &hdr);
+ encode_getfattr(&xdr, args->bitmask, &hdr);
+ encode_nops(&hdr);
+ return 0;
}
/*
@@ -1470,27 +1431,20 @@ static int nfs4_xdr_enc_rename(struct rpc_rqst *req, __be32 *p, const struct nfs
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
- .nops = 7,
+ .nops = 0,
};
- int status;
xdr_init_encode(&xdr, &req->rq_snd_buf, p);
encode_compound_hdr(&xdr, &hdr);
- if ((status = encode_putfh(&xdr, args->old_dir)) != 0)
- goto out;
- if ((status = encode_savefh(&xdr)) != 0)
- goto out;
- if ((status = encode_putfh(&xdr, args->new_dir)) != 0)
- goto out;
- if ((status = encode_rename(&xdr, args->old_name, args->new_name)) != 0)
- goto out;
- if ((status = encode_getfattr(&xdr, args->bitmask)) != 0)
- goto out;
- if ((status = encode_restorefh(&xdr)) != 0)
- goto out;
- status = encode_getfattr(&xdr, args->bitmask);
-out:
- return status;
+ encode_putfh(&xdr, args->old_dir, &hdr);
+ encode_savefh(&xdr, &hdr);
+ encode_putfh(&xdr, args->new_dir, &hdr);
+ encode_rename(&xdr, args->old_name, args->new_name, &hdr);
+ encode_getfattr(&xdr, args->bitmask, &hdr);
+ encode_restorefh(&xdr, &hdr);
+ encode_getfattr(&xdr, args->bitmask, &hdr);
+ encode_nops(&hdr);
+ return 0;
}
/*
@@ -1500,27 +1454,20 @@ static int nfs4_xdr_enc_link(struct rpc_rqst *req, __be32 *p, const struct nfs4_
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
- .nops = 7,
+ .nops = 0,
};
- int status;
xdr_init_encode(&xdr, &req->rq_snd_buf, p);
encode_compound_hdr(&xdr, &hdr);
- if ((status = encode_putfh(&xdr, args->fh)) != 0)
- goto out;
- if ((status = encode_savefh(&xdr)) != 0)
- goto out;
- if ((status = encode_putfh(&xdr, args->dir_fh)) != 0)
- goto out;
- if ((status = encode_link(&xdr, args->name)) != 0)
- goto out;
- if ((status = encode_getfattr(&xdr, args->bitmask)) != 0)
- goto out;
- if ((status = encode_restorefh(&xdr)) != 0)
- goto out;
- status = encode_getfattr(&xdr, args->bitmask);
-out:
- return status;
+ encode_putfh(&xdr, args->fh, &hdr);
+ encode_savefh(&xdr, &hdr);
+ encode_putfh(&xdr, args->dir_fh, &hdr);
+ encode_link(&xdr, args->name, &hdr);
+ encode_getfattr(&xdr, args->bitmask, &hdr);
+ encode_restorefh(&xdr, &hdr);
+ encode_getfattr(&xdr, args->bitmask, &hdr);
+ encode_nops(&hdr);
+ return 0;
}
/*
@@ -1530,27 +1477,20 @@ static int nfs4_xdr_enc_create(struct rpc_rqst *req, __be32 *p, const struct nfs
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
- .nops = 7,
+ .nops = 0,
};
- int status;
xdr_init_encode(&xdr, &req->rq_snd_buf, p);
encode_compound_hdr(&xdr, &hdr);
- if ((status = encode_putfh(&xdr, args->dir_fh)) != 0)
- goto out;
- if ((status = encode_savefh(&xdr)) != 0)
- goto out;
- if ((status = encode_create(&xdr, args)) != 0)
- goto out;
- if ((status = encode_getfh(&xdr)) != 0)
- goto out;
- if ((status = encode_getfattr(&xdr, args->bitmask)) != 0)
- goto out;
- if ((status = encode_restorefh(&xdr)) != 0)
- goto out;
- status = encode_getfattr(&xdr, args->bitmask);
-out:
- return status;
+ encode_putfh(&xdr, args->dir_fh, &hdr);
+ encode_savefh(&xdr, &hdr);
+ encode_create(&xdr, args, &hdr);
+ encode_getfh(&xdr, &hdr);
+ encode_getfattr(&xdr, args->bitmask, &hdr);
+ encode_restorefh(&xdr, &hdr);
+ encode_getfattr(&xdr, args->bitmask, &hdr);
+ encode_nops(&hdr);
+ return 0;
}
/*
@@ -1568,15 +1508,15 @@ static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, __be32 *p, const struct nf
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
- .nops = 2,
+ .nops = 0,
};
- int status;
xdr_init_encode(&xdr, &req->rq_snd_buf, p);
encode_compound_hdr(&xdr, &hdr);
- if ((status = encode_putfh(&xdr, args->fh)) == 0)
- status = encode_getfattr(&xdr, args->bitmask);
- return status;
+ encode_putfh(&xdr, args->fh, &hdr);
+ encode_getfattr(&xdr, args->bitmask, &hdr);
+ encode_nops(&hdr);
+ return 0;
}
/*
@@ -1584,23 +1524,18 @@ static int nfs4_xdr_enc_getattr(struct rpc_rqst *req, __be32 *p, const struct nf
*/
static int nfs4_xdr_enc_close(struct rpc_rqst *req, __be32 *p, struct nfs_closeargs *args)
{
- struct xdr_stream xdr;
- struct compound_hdr hdr = {
- .nops = 3,
- };
- int status;
-
- xdr_init_encode(&xdr, &req->rq_snd_buf, p);
- encode_compound_hdr(&xdr, &hdr);
- status = encode_putfh(&xdr, args->fh);
- if(status)
- goto out;
- status = encode_close(&xdr, args);
- if (status != 0)
- goto out;
- status = encode_getfattr(&xdr, args->bitmask);
-out:
- return status;
+ struct xdr_stream xdr;
+ struct compound_hdr hdr = {
+ .nops = 0,
+ };
+
+ xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+ encode_compound_hdr(&xdr, &hdr);
+ encode_putfh(&xdr, args->fh, &hdr);
+ encode_close(&xdr, args, &hdr);
+ encode_getfattr(&xdr, args->bitmask, &hdr);
+ encode_nops(&hdr);
+ return 0;
}
/*
@@ -1610,33 +1545,20 @@ static int nfs4_xdr_enc_open(struct rpc_rqst *req, __be32 *p, struct nfs_openarg
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
- .nops = 7,
+ .nops = 0,
};
- int status;
xdr_init_encode(&xdr, &req->rq_snd_buf, p);
encode_compound_hdr(&xdr, &hdr);
- status = encode_putfh(&xdr, args->fh);
- if (status)
- goto out;
- status = encode_savefh(&xdr);
- if (status)
- goto out;
- status = encode_open(&xdr, args);
- if (status)
- goto out;
- status = encode_getfh(&xdr);
- if (status)
- goto out;
- status = encode_getfattr(&xdr, args->bitmask);
- if (status)
- goto out;
- status = encode_restorefh(&xdr);
- if (status)
- goto out;
- status = encode_getfattr(&xdr, args->bitmask);
-out:
- return status;
+ encode_putfh(&xdr, args->fh, &hdr);
+ encode_savefh(&xdr, &hdr);
+ encode_open(&xdr, args, &hdr);
+ encode_getfh(&xdr, &hdr);
+ encode_getfattr(&xdr, args->bitmask, &hdr);
+ encode_restorefh(&xdr, &hdr);
+ encode_getfattr(&xdr, args->bitmask, &hdr);
+ encode_nops(&hdr);
+ return 0;
}
/*
@@ -1646,18 +1568,15 @@ static int nfs4_xdr_enc_open_confirm(struct rpc_rqst *req, __be32 *p, struct nfs
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
- .nops = 2,
+ .nops = 0,
};
- int status;
xdr_init_encode(&xdr, &req->rq_snd_buf, p);
encode_compound_hdr(&xdr, &hdr);
- status = encode_putfh(&xdr, args->fh);
- if(status)
- goto out;
- status = encode_open_confirm(&xdr, args);
-out:
- return status;
+ encode_putfh(&xdr, args->fh, &hdr);
+ encode_open_confirm(&xdr, args, &hdr);
+ encode_nops(&hdr);
+ return 0;
}
/*
@@ -1667,21 +1586,16 @@ static int nfs4_xdr_enc_open_noattr(struct rpc_rqst *req, __be32 *p, struct nfs_
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
- .nops = 3,
+ .nops = 0,
};
- int status;
xdr_init_encode(&xdr, &req->rq_snd_buf, p);
encode_compound_hdr(&xdr, &hdr);
- status = encode_putfh(&xdr, args->fh);
- if (status)
- goto out;
- status = encode_open(&xdr, args);
- if (status)
- goto out;
- status = encode_getfattr(&xdr, args->bitmask);
-out:
- return status;
+ encode_putfh(&xdr, args->fh, &hdr);
+ encode_open(&xdr, args, &hdr);
+ encode_getfattr(&xdr, args->bitmask, &hdr);
+ encode_nops(&hdr);
+ return 0;
}
/*
@@ -1691,21 +1605,16 @@ static int nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, __be32 *p, struct n
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
- .nops = 3,
+ .nops = 0,
};
- int status;
xdr_init_encode(&xdr, &req->rq_snd_buf, p);
encode_compound_hdr(&xdr, &hdr);
- status = encode_putfh(&xdr, args->fh);
- if (status)
- goto out;
- status = encode_open_downgrade(&xdr, args);
- if (status != 0)
- goto out;
- status = encode_getfattr(&xdr, args->bitmask);
-out:
- return status;
+ encode_putfh(&xdr, args->fh, &hdr);
+ encode_open_downgrade(&xdr, args, &hdr);
+ encode_getfattr(&xdr, args->bitmask, &hdr);
+ encode_nops(&hdr);
+ return 0;
}
/*
@@ -1715,18 +1624,15 @@ static int nfs4_xdr_enc_lock(struct rpc_rqst *req, __be32 *p, struct nfs_lock_ar
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
- .nops = 2,
+ .nops = 0,
};
- int status;
xdr_init_encode(&xdr, &req->rq_snd_buf, p);
encode_compound_hdr(&xdr, &hdr);
- status = encode_putfh(&xdr, args->fh);
- if(status)
- goto out;
- status = encode_lock(&xdr, args);
-out:
- return status;
+ encode_putfh(&xdr, args->fh, &hdr);
+ encode_lock(&xdr, args, &hdr);
+ encode_nops(&hdr);
+ return 0;
}
/*
@@ -1736,18 +1642,15 @@ static int nfs4_xdr_enc_lockt(struct rpc_rqst *req, __be32 *p, struct nfs_lockt_
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
- .nops = 2,
+ .nops = 0,
};
- int status;
xdr_init_encode(&xdr, &req->rq_snd_buf, p);
encode_compound_hdr(&xdr, &hdr);
- status = encode_putfh(&xdr, args->fh);
- if(status)
- goto out;
- status = encode_lockt(&xdr, args);
-out:
- return status;
+ encode_putfh(&xdr, args->fh, &hdr);
+ encode_lockt(&xdr, args, &hdr);
+ encode_nops(&hdr);
+ return 0;
}
/*
@@ -1757,18 +1660,15 @@ static int nfs4_xdr_enc_locku(struct rpc_rqst *req, __be32 *p, struct nfs_locku_
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
- .nops = 2,
+ .nops = 0,
};
- int status;
xdr_init_encode(&xdr, &req->rq_snd_buf, p);
encode_compound_hdr(&xdr, &hdr);
- status = encode_putfh(&xdr, args->fh);
- if(status)
- goto out;
- status = encode_locku(&xdr, args);
-out:
- return status;
+ encode_putfh(&xdr, args->fh, &hdr);
+ encode_locku(&xdr, args, &hdr);
+ encode_nops(&hdr);
+ return 0;
}
/*
@@ -1778,18 +1678,15 @@ static int nfs4_xdr_enc_readlink(struct rpc_rqst *req, __be32 *p, const struct n
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
- .nops = 2,
+ .nops = 0,
};
struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
unsigned int replen;
- int status;
xdr_init_encode(&xdr, &req->rq_snd_buf, p);
encode_compound_hdr(&xdr, &hdr);
- status = encode_putfh(&xdr, args->fh);
- if(status)
- goto out;
- status = encode_readlink(&xdr, args, req);
+ encode_putfh(&xdr, args->fh, &hdr);
+ encode_readlink(&xdr, args, req, &hdr);
/* set up reply kvec
* toplevel_status + taglen + rescount + OP_PUTFH + status
@@ -1798,9 +1695,8 @@ static int nfs4_xdr_enc_readlink(struct rpc_rqst *req, __be32 *p, const struct n
replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_readlink_sz) << 2;
xdr_inline_pages(&req->rq_rcv_buf, replen, args->pages,
args->pgbase, args->pglen);
-
-out:
- return status;
+ encode_nops(&hdr);
+ return 0;
}
/*
@@ -1810,18 +1706,15 @@ static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nf
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
- .nops = 2,
+ .nops = 0,
};
struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
int replen;
- int status;
xdr_init_encode(&xdr, &req->rq_snd_buf, p);
encode_compound_hdr(&xdr, &hdr);
- status = encode_putfh(&xdr, args->fh);
- if(status)
- goto out;
- status = encode_readdir(&xdr, args, req);
+ encode_putfh(&xdr, args->fh, &hdr);
+ encode_readdir(&xdr, args, req, &hdr);
/* set up reply kvec
* toplevel_status + taglen + rescount + OP_PUTFH + status
@@ -1833,9 +1726,8 @@ static int nfs4_xdr_enc_readdir(struct rpc_rqst *req, __be32 *p, const struct nf
dprintk("%s: inlined page args = (%u, %p, %u, %u)\n",
__func__, replen, args->pages,
args->pgbase, args->count);
-
-out:
- return status;
+ encode_nops(&hdr);
+ return 0;
}
/*
@@ -1846,18 +1738,14 @@ static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readarg
struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
struct xdr_stream xdr;
struct compound_hdr hdr = {
- .nops = 2,
+ .nops = 0,
};
- int replen, status;
+ int replen;
xdr_init_encode(&xdr, &req->rq_snd_buf, p);
encode_compound_hdr(&xdr, &hdr);
- status = encode_putfh(&xdr, args->fh);
- if (status)
- goto out;
- status = encode_read(&xdr, args);
- if (status)
- goto out;
+ encode_putfh(&xdr, args->fh, &hdr);
+ encode_read(&xdr, args, &hdr);
/* set up reply kvec
* toplevel status + taglen=0 + rescount + OP_PUTFH + status
@@ -1867,33 +1755,27 @@ static int nfs4_xdr_enc_read(struct rpc_rqst *req, __be32 *p, struct nfs_readarg
xdr_inline_pages(&req->rq_rcv_buf, replen,
args->pages, args->pgbase, args->count);
req->rq_rcv_buf.flags |= XDRBUF_READ;
-out:
- return status;
+ encode_nops(&hdr);
+ return 0;
}
/*
* Encode an SETATTR request
*/
static int nfs4_xdr_enc_setattr(struct rpc_rqst *req, __be32 *p, struct nfs_setattrargs *args)
-
{
- struct xdr_stream xdr;
- struct compound_hdr hdr = {
- .nops = 3,
- };
- int status;
-
- xdr_init_encode(&xdr, &req->rq_snd_buf, p);
- encode_compound_hdr(&xdr, &hdr);
- status = encode_putfh(&xdr, args->fh);
- if(status)
- goto out;
- status = encode_setattr(&xdr, args, args->server);
- if(status)
- goto out;
- status = encode_getfattr(&xdr, args->bitmask);
-out:
- return status;
+ struct xdr_stream xdr;
+ struct compound_hdr hdr = {
+ .nops = 0,
+ };
+
+ xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+ encode_compound_hdr(&xdr, &hdr);
+ encode_putfh(&xdr, args->fh, &hdr);
+ encode_setattr(&xdr, args, args->server, &hdr);
+ encode_getfattr(&xdr, args->bitmask, &hdr);
+ encode_nops(&hdr);
+ return 0;
}
/*
@@ -1906,22 +1788,21 @@ nfs4_xdr_enc_getacl(struct rpc_rqst *req, __be32 *p,
struct xdr_stream xdr;
struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
struct compound_hdr hdr = {
- .nops = 2,
+ .nops = 0,
};
- int replen, status;
+ int replen;
xdr_init_encode(&xdr, &req->rq_snd_buf, p);
encode_compound_hdr(&xdr, &hdr);
- status = encode_putfh(&xdr, args->fh);
- if (status)
- goto out;
- status = encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0);
+ encode_putfh(&xdr, args->fh, &hdr);
+ encode_getattr_two(&xdr, FATTR4_WORD0_ACL, 0, &hdr);
+
/* set up reply buffer: */
replen = (RPC_REPHDRSIZE + auth->au_rslack + NFS4_dec_getacl_sz) << 2;
xdr_inline_pages(&req->rq_rcv_buf, replen,
args->acl_pages, args->acl_pgbase, args->acl_len);
-out:
- return status;
+ encode_nops(&hdr);
+ return 0;
}
/*
@@ -1931,22 +1812,17 @@ static int nfs4_xdr_enc_write(struct rpc_rqst *req, __be32 *p, struct nfs_writea
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
- .nops = 3,
+ .nops = 0,
};
- int status;
xdr_init_encode(&xdr, &req->rq_snd_buf, p);
encode_compound_hdr(&xdr, &hdr);
- status = encode_putfh(&xdr, args->fh);
- if (status)
- goto out;
- status = encode_write(&xdr, args);
- if (status)
- goto out;
+ encode_putfh(&xdr, args->fh, &hdr);
+ encode_write(&xdr, args, &hdr);
req->rq_snd_buf.flags |= XDRBUF_WRITE;
- status = encode_getfattr(&xdr, args->bitmask);
-out:
- return status;
+ encode_getfattr(&xdr, args->bitmask, &hdr);
+ encode_nops(&hdr);
+ return 0;
}
/*
@@ -1956,21 +1832,16 @@ static int nfs4_xdr_enc_commit(struct rpc_rqst *req, __be32 *p, struct nfs_write
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
- .nops = 3,
+ .nops = 0,
};
- int status;
xdr_init_encode(&xdr, &req->rq_snd_buf, p);
encode_compound_hdr(&xdr, &hdr);
- status = encode_putfh(&xdr, args->fh);
- if (status)
- goto out;
- status = encode_commit(&xdr, args);
- if (status)
- goto out;
- status = encode_getfattr(&xdr, args->bitmask);
-out:
- return status;
+ encode_putfh(&xdr, args->fh, &hdr);
+ encode_commit(&xdr, args, &hdr);
+ encode_getfattr(&xdr, args->bitmask, &hdr);
+ encode_nops(&hdr);
+ return 0;
}
/*
@@ -1980,16 +1851,15 @@ static int nfs4_xdr_enc_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs4_fsin
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
- .nops = 2,
+ .nops = 0,
};
- int status;
xdr_init_encode(&xdr, &req->rq_snd_buf, p);
encode_compound_hdr(&xdr, &hdr);
- status = encode_putfh(&xdr, args->fh);
- if (!status)
- status = encode_fsinfo(&xdr, args->bitmask);
- return status;
+ encode_putfh(&xdr, args->fh, &hdr);
+ encode_fsinfo(&xdr, args->bitmask, &hdr);
+ encode_nops(&hdr);
+ return 0;
}
/*
@@ -1999,17 +1869,16 @@ static int nfs4_xdr_enc_pathconf(struct rpc_rqst *req, __be32 *p, const struct n
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
- .nops = 2,
+ .nops = 0,
};
- int status;
xdr_init_encode(&xdr, &req->rq_snd_buf, p);
encode_compound_hdr(&xdr, &hdr);
- status = encode_putfh(&xdr, args->fh);
- if (!status)
- status = encode_getattr_one(&xdr,
- args->bitmask[0] & nfs4_pathconf_bitmap[0]);
- return status;
+ encode_putfh(&xdr, args->fh, &hdr);
+ encode_getattr_one(&xdr, args->bitmask[0] & nfs4_pathconf_bitmap[0],
+ &hdr);
+ encode_nops(&hdr);
+ return 0;
}
/*
@@ -2019,18 +1888,16 @@ static int nfs4_xdr_enc_statfs(struct rpc_rqst *req, __be32 *p, const struct nfs
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
- .nops = 2,
+ .nops = 0,
};
- int status;
xdr_init_encode(&xdr, &req->rq_snd_buf, p);
encode_compound_hdr(&xdr, &hdr);
- status = encode_putfh(&xdr, args->fh);
- if (status == 0)
- status = encode_getattr_two(&xdr,
- args->bitmask[0] & nfs4_statfs_bitmap[0],
- args->bitmask[1] & nfs4_statfs_bitmap[1]);
- return status;
+ encode_putfh(&xdr, args->fh, &hdr);
+ encode_getattr_two(&xdr, args->bitmask[0] & nfs4_statfs_bitmap[0],
+ args->bitmask[1] & nfs4_statfs_bitmap[1], &hdr);
+ encode_nops(&hdr);
+ return 0;
}
/*
@@ -2040,19 +1907,18 @@ static int nfs4_xdr_enc_server_caps(struct rpc_rqst *req, __be32 *p, const struc
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
- .nops = 2,
+ .nops = 0,
};
- int status;
xdr_init_encode(&xdr, &req->rq_snd_buf, p);
encode_compound_hdr(&xdr, &hdr);
- status = encode_putfh(&xdr, fhandle);
- if (status == 0)
- status = encode_getattr_one(&xdr, FATTR4_WORD0_SUPPORTED_ATTRS|
- FATTR4_WORD0_LINK_SUPPORT|
- FATTR4_WORD0_SYMLINK_SUPPORT|
- FATTR4_WORD0_ACLSUPPORT);
- return status;
+ encode_putfh(&xdr, fhandle, &hdr);
+ encode_getattr_one(&xdr, FATTR4_WORD0_SUPPORTED_ATTRS|
+ FATTR4_WORD0_LINK_SUPPORT|
+ FATTR4_WORD0_SYMLINK_SUPPORT|
+ FATTR4_WORD0_ACLSUPPORT, &hdr);
+ encode_nops(&hdr);
+ return 0;
}
/*
@@ -2062,12 +1928,14 @@ static int nfs4_xdr_enc_renew(struct rpc_rqst *req, __be32 *p, struct nfs_client
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
- .nops = 1,
+ .nops = 0,
};
xdr_init_encode(&xdr, &req->rq_snd_buf, p);
encode_compound_hdr(&xdr, &hdr);
- return encode_renew(&xdr, clp);
+ encode_renew(&xdr, clp, &hdr);
+ encode_nops(&hdr);
+ return 0;
}
/*
@@ -2077,12 +1945,14 @@ static int nfs4_xdr_enc_setclientid(struct rpc_rqst *req, __be32 *p, struct nfs4
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
- .nops = 1,
+ .nops = 0,
};
xdr_init_encode(&xdr, &req->rq_snd_buf, p);
encode_compound_hdr(&xdr, &hdr);
- return encode_setclientid(&xdr, sc);
+ encode_setclientid(&xdr, sc, &hdr);
+ encode_nops(&hdr);
+ return 0;
}
/*
@@ -2092,19 +1962,17 @@ static int nfs4_xdr_enc_setclientid_confirm(struct rpc_rqst *req, __be32 *p, str
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
- .nops = 3,
+ .nops = 0,
};
const u32 lease_bitmap[2] = { FATTR4_WORD0_LEASE_TIME, 0 };
- int status;
xdr_init_encode(&xdr, &req->rq_snd_buf, p);
encode_compound_hdr(&xdr, &hdr);
- status = encode_setclientid_confirm(&xdr, clp);
- if (!status)
- status = encode_putrootfh(&xdr);
- if (!status)
- status = encode_fsinfo(&xdr, lease_bitmap);
- return status;
+ encode_setclientid_confirm(&xdr, clp, &hdr);
+ encode_putrootfh(&xdr, &hdr);
+ encode_fsinfo(&xdr, lease_bitmap, &hdr);
+ encode_nops(&hdr);
+ return 0;
}
/*
@@ -2114,21 +1982,16 @@ static int nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, __be32 *p, const struc
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
- .nops = 3,
+ .nops = 0,
};
- int status;
xdr_init_encode(&xdr, &req->rq_snd_buf, p);
encode_compound_hdr(&xdr, &hdr);
- status = encode_putfh(&xdr, args->fhandle);
- if (status != 0)
- goto out;
- status = encode_delegreturn(&xdr, args->stateid);
- if (status != 0)
- goto out;
- status = encode_getfattr(&xdr, args->bitmask);
-out:
- return status;
+ encode_putfh(&xdr, args->fhandle, &hdr);
+ encode_delegreturn(&xdr, args->stateid, &hdr);
+ encode_getfattr(&xdr, args->bitmask, &hdr);
+ encode_nops(&hdr);
+ return 0;
}
/*
@@ -2138,20 +2001,17 @@ static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs
{
struct xdr_stream xdr;
struct compound_hdr hdr = {
- .nops = 3,
+ .nops = 0,
};
struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth;
int replen;
- int status;
xdr_init_encode(&xdr, &req->rq_snd_buf, p);
encode_compound_hdr(&xdr, &hdr);
- if ((status = encode_putfh(&xdr, args->dir_fh)) != 0)
- goto out;
- if ((status = encode_lookup(&xdr, args->name)) != 0)
- goto out;
- if ((status = encode_fs_locations(&xdr, args->bitmask)) != 0)
- goto out;
+ encode_putfh(&xdr, args->dir_fh, &hdr);
+ encode_lookup(&xdr, args->name, &hdr);
+ encode_fs_locations(&xdr, args->bitmask, &hdr);
+
/* set up reply
* toplevel_status + OP_PUTFH + status
* + OP_LOOKUP + status + OP_GETATTR + status = 7
@@ -2159,8 +2019,8 @@ static int nfs4_xdr_enc_fs_locations(struct rpc_rqst *req, __be32 *p, struct nfs
replen = (RPC_REPHDRSIZE + auth->au_rslack + 7) << 2;
xdr_inline_pages(&req->rq_rcv_buf, replen, &args->page,
0, PAGE_SIZE);
-out:
- return status;
+ encode_nops(&hdr);
+ return 0;
}
/*
@@ -2217,11 +2077,13 @@ static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr)
READ_BUF(8);
READ32(hdr->status);
READ32(hdr->taglen);
-
+
READ_BUF(hdr->taglen + 4);
hdr->tag = (char *)p;
p += XDR_QUADLEN(hdr->taglen);
READ32(hdr->nops);
+ if (unlikely(hdr->nops < 1))
+ return nfs4_stat_to_errno(hdr->status);
return 0;
}
@@ -3047,8 +2909,7 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res)
{
__be32 *savep;
- uint32_t attrlen,
- bitmap[2] = {0};
+ uint32_t attrlen, bitmap[2] = {0};
int status;
if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
@@ -3070,14 +2931,13 @@ xdr_error:
dprintk("%s: xdr returned %d!\n", __func__, -status);
return status;
}
-
+
static int decode_statfs(struct xdr_stream *xdr, struct nfs_fsstat *fsstat)
{
__be32 *savep;
- uint32_t attrlen,
- bitmap[2] = {0};
+ uint32_t attrlen, bitmap[2] = {0};
int status;
-
+
if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
goto xdr_error;
if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
@@ -3107,10 +2967,9 @@ xdr_error:
static int decode_pathconf(struct xdr_stream *xdr, struct nfs_pathconf *pathconf)
{
__be32 *savep;
- uint32_t attrlen,
- bitmap[2] = {0};
+ uint32_t attrlen, bitmap[2] = {0};
int status;
-
+
if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
goto xdr_error;
if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
@@ -3256,7 +3115,7 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh)
static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
{
int status;
-
+
status = decode_op_hdr(xdr, OP_LINK);
if (status)
return status;
@@ -3344,27 +3203,27 @@ static int decode_lookup(struct xdr_stream *xdr)
/* This is too sick! */
static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize)
{
- __be32 *p;
+ __be32 *p;
uint32_t limit_type, nblocks, blocksize;
READ_BUF(12);
READ32(limit_type);
switch (limit_type) {
- case 1:
- READ64(*maxsize);
- break;
- case 2:
- READ32(nblocks);
- READ32(blocksize);
- *maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
+ case 1:
+ READ64(*maxsize);
+ break;
+ case 2:
+ READ32(nblocks);
+ READ32(blocksize);
+ *maxsize = (uint64_t)nblocks * (uint64_t)blocksize;
}
return 0;
}
static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
{
- __be32 *p;
- uint32_t delegation_type;
+ __be32 *p;
+ uint32_t delegation_type;
READ_BUF(4);
READ32(delegation_type);
@@ -3375,13 +3234,14 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
READ_BUF(NFS4_STATEID_SIZE+4);
COPYMEM(res->delegation.data, NFS4_STATEID_SIZE);
READ32(res->do_recall);
+
switch (delegation_type) {
- case NFS4_OPEN_DELEGATE_READ:
- res->delegation_type = FMODE_READ;
- break;
- case NFS4_OPEN_DELEGATE_WRITE:
- res->delegation_type = FMODE_WRITE|FMODE_READ;
- if (decode_space_limit(xdr, &res->maxsize) < 0)
+ case NFS4_OPEN_DELEGATE_READ:
+ res->delegation_type = FMODE_READ;
+ break;
+ case NFS4_OPEN_DELEGATE_WRITE:
+ res->delegation_type = FMODE_WRITE|FMODE_READ;
+ if (decode_space_limit(xdr, &res->maxsize) < 0)
return -EIO;
}
return decode_ace(xdr, NULL, res->server->nfs_client);
@@ -3389,27 +3249,27 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
{
- __be32 *p;
+ __be32 *p;
uint32_t savewords, bmlen, i;
- int status;
+ int status;
- status = decode_op_hdr(xdr, OP_OPEN);
+ status = decode_op_hdr(xdr, OP_OPEN);
if (status != -EIO)
nfs_increment_open_seqid(status, res->seqid);
- if (status)
- return status;
- READ_BUF(NFS4_STATEID_SIZE);
- COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
+ if (status)
+ return status;
+ READ_BUF(NFS4_STATEID_SIZE);
+ COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
- decode_change_info(xdr, &res->cinfo);
+ decode_change_info(xdr, &res->cinfo);
- READ_BUF(8);
- READ32(res->rflags);
- READ32(bmlen);
- if (bmlen > 10)
- goto xdr_error;
+ READ_BUF(8);
+ READ32(res->rflags);
+ READ32(bmlen);
+ if (bmlen > 10)
+ goto xdr_error;
- READ_BUF(bmlen << 2);
+ READ_BUF(bmlen << 2);
savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE);
for (i = 0; i < savewords; ++i)
READ32(res->attrset[i]);
@@ -3424,17 +3284,17 @@ xdr_error:
static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res)
{
- __be32 *p;
+ __be32 *p;
int status;
- status = decode_op_hdr(xdr, OP_OPEN_CONFIRM);
+ status = decode_op_hdr(xdr, OP_OPEN_CONFIRM);
if (status != -EIO)
nfs_increment_open_seqid(status, res->seqid);
- if (status)
- return status;
- READ_BUF(NFS4_STATEID_SIZE);
- COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
- return 0;
+ if (status)
+ return status;
+ READ_BUF(NFS4_STATEID_SIZE);
+ COPYMEM(res->stateid.data, NFS4_STATEID_SIZE);
+ return 0;
}
static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res)
@@ -3562,7 +3422,7 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n
dprintk("NFS: readdir reply truncated!\n");
entry[1] = 1;
}
-out:
+out:
kunmap_atomic(kaddr, KM_USER0);
return 0;
short_pkt:
@@ -3718,7 +3578,6 @@ static int decode_setattr(struct xdr_stream *xdr, struct nfs_setattrres *res)
uint32_t bmlen;
int status;
-
status = decode_op_hdr(xdr, OP_SETATTR);
if (status)
return status;
@@ -3738,7 +3597,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp)
READ32(opnum);
if (opnum != OP_SETCLIENTID) {
dprintk("nfs: decode_setclientid: Server returned operation"
- " %d\n", opnum);
+ " %d\n", opnum);
return -EIO;
}
READ32(nfserr);
@@ -3792,34 +3651,34 @@ static int decode_delegreturn(struct xdr_stream *xdr)
}
/*
+ * END OF "GENERIC" DECODE ROUTINES.
+ */
+
+/*
* Decode OPEN_DOWNGRADE response
*/
static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res)
{
- struct xdr_stream xdr;
- struct compound_hdr hdr;
- int status;
-
- xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
- status = decode_compound_hdr(&xdr, &hdr);
- if (status)
- goto out;
- status = decode_putfh(&xdr);
- if (status)
- goto out;
- status = decode_open_downgrade(&xdr, res);
+ struct xdr_stream xdr;
+ struct compound_hdr hdr;
+ int status;
+
+ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+ status = decode_compound_hdr(&xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_putfh(&xdr);
+ if (status)
+ goto out;
+ status = decode_open_downgrade(&xdr, res);
if (status != 0)
goto out;
decode_getfattr(&xdr, res->fattr, res->server);
out:
- return status;
+ return status;
}
/*
- * END OF "GENERIC" DECODE ROUTINES.
- */
-
-/*
* Decode ACCESS response
*/
static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_accessres *res)
@@ -3827,7 +3686,7 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_ac
struct xdr_stream xdr;
struct compound_hdr hdr;
int status;
-
+
xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
goto out;
@@ -3850,7 +3709,7 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lo
struct xdr_stream xdr;
struct compound_hdr hdr;
int status;
-
+
xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
goto out;
@@ -3873,7 +3732,7 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, __be32 *p, struct nf
struct xdr_stream xdr;
struct compound_hdr hdr;
int status;
-
+
xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
goto out;
@@ -3893,7 +3752,7 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_rem
struct xdr_stream xdr;
struct compound_hdr hdr;
int status;
-
+
xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
goto out;
@@ -3914,7 +3773,7 @@ static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_re
struct xdr_stream xdr;
struct compound_hdr hdr;
int status;
-
+
xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
goto out;
@@ -3944,7 +3803,7 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_link
struct xdr_stream xdr;
struct compound_hdr hdr;
int status;
-
+
xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
goto out;
@@ -3977,7 +3836,7 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_cr
struct xdr_stream xdr;
struct compound_hdr hdr;
int status;
-
+
xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
if ((status = decode_compound_hdr(&xdr, &hdr)) != 0)
goto out;
@@ -4014,7 +3873,7 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_g
struct xdr_stream xdr;
struct compound_hdr hdr;
int status;
-
+
xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
status = decode_compound_hdr(&xdr, &hdr);
if (status)
@@ -4025,7 +3884,6 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_g
status = decode_getfattr(&xdr, res->fattr, res->server);
out:
return status;
-
}
/*
@@ -4034,21 +3892,20 @@ out:
static int
nfs4_xdr_enc_setacl(struct rpc_rqst *req, __be32 *p, struct nfs_setaclargs *args)
{
- struct xdr_stream xdr;
- struct compound_hdr hdr = {
- .nops = 2,
- };
- int status;
-
- xdr_init_encode(&xdr, &req->rq_snd_buf, p);
- encode_compound_hdr(&xdr, &hdr);
- status = encode_putfh(&xdr, args->fh);
- if (status)
- goto out;
- status = encode_setacl(&xdr, args);
-out:
- return status;
+ struct xdr_stream xdr;
+ struct compound_hdr hdr = {
+ .nops = 0,
+ };
+ int status;
+
+ xdr_init_encode(&xdr, &req->rq_snd_buf, p);
+ encode_compound_hdr(&xdr, &hdr);
+ encode_putfh(&xdr, args->fh, &hdr);
+ status = encode_setacl(&xdr, args, &hdr);
+ encode_nops(&hdr);
+ return status;
}
+
/*
* Decode SETACL response
*/
@@ -4099,18 +3956,18 @@ out:
*/
static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_closeres *res)
{
- struct xdr_stream xdr;
- struct compound_hdr hdr;
- int status;
-
- xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
- status = decode_compound_hdr(&xdr, &hdr);
- if (status)
- goto out;
- status = decode_putfh(&xdr);
- if (status)
- goto out;
- status = decode_close(&xdr, res);
+ struct xdr_stream xdr;
+ struct compound_hdr hdr;
+ int status;
+
+ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+ status = decode_compound_hdr(&xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_putfh(&xdr);
+ if (status)
+ goto out;
+ status = decode_close(&xdr, res);
if (status != 0)
goto out;
/*
@@ -4121,7 +3978,7 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos
*/
decode_getfattr(&xdr, res->fattr, res->server);
out:
- return status;
+ return status;
}
/*
@@ -4129,23 +3986,23 @@ out:
*/
static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res)
{
- struct xdr_stream xdr;
- struct compound_hdr hdr;
- int status;
-
- xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
- status = decode_compound_hdr(&xdr, &hdr);
- if (status)
- goto out;
- status = decode_putfh(&xdr);
- if (status)
- goto out;
- status = decode_savefh(&xdr);
+ struct xdr_stream xdr;
+ struct compound_hdr hdr;
+ int status;
+
+ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+ status = decode_compound_hdr(&xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_putfh(&xdr);
+ if (status)
+ goto out;
+ status = decode_savefh(&xdr);
+ if (status)
+ goto out;
+ status = decode_open(&xdr, res);
if (status)
goto out;
- status = decode_open(&xdr, res);
- if (status)
- goto out;
if (decode_getfh(&xdr, &res->fh) != 0)
goto out;
if (decode_getfattr(&xdr, res->f_attr, res->server) != 0)
@@ -4154,7 +4011,7 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openr
goto out;
decode_getfattr(&xdr, res->dir_attr, res->server);
out:
- return status;
+ return status;
}
/*
@@ -4162,20 +4019,20 @@ out:
*/
static int nfs4_xdr_dec_open_confirm(struct rpc_rqst *rqstp, __be32 *p, struct nfs_open_confirmres *res)
{
- struct xdr_stream xdr;
- struct compound_hdr hdr;
- int status;
-
- xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
- status = decode_compound_hdr(&xdr, &hdr);
- if (status)
- goto out;
- status = decode_putfh(&xdr);
- if (status)
- goto out;
- status = decode_open_confirm(&xdr, res);
+ struct xdr_stream xdr;
+ struct compound_hdr hdr;
+ int status;
+
+ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+ status = decode_compound_hdr(&xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_putfh(&xdr);
+ if (status)
+ goto out;
+ status = decode_open_confirm(&xdr, res);
out:
- return status;
+ return status;
}
/*
@@ -4183,23 +4040,23 @@ out:
*/
static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openres *res)
{
- struct xdr_stream xdr;
- struct compound_hdr hdr;
- int status;
-
- xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
- status = decode_compound_hdr(&xdr, &hdr);
- if (status)
- goto out;
- status = decode_putfh(&xdr);
- if (status)
- goto out;
- status = decode_open(&xdr, res);
- if (status)
- goto out;
+ struct xdr_stream xdr;
+ struct compound_hdr hdr;
+ int status;
+
+ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+ status = decode_compound_hdr(&xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_putfh(&xdr);
+ if (status)
+ goto out;
+ status = decode_open(&xdr, res);
+ if (status)
+ goto out;
decode_getfattr(&xdr, res->f_attr, res->server);
out:
- return status;
+ return status;
}
/*
@@ -4207,25 +4064,25 @@ out:
*/
static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_setattrres *res)
{
- struct xdr_stream xdr;
- struct compound_hdr hdr;
- int status;
-
- xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
- status = decode_compound_hdr(&xdr, &hdr);
- if (status)
- goto out;
- status = decode_putfh(&xdr);
- if (status)
- goto out;
- status = decode_setattr(&xdr, res);
- if (status)
- goto out;
+ struct xdr_stream xdr;
+ struct compound_hdr hdr;
+ int status;
+
+ xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p);
+ status = decode_compound_hdr(&xdr, &hdr);
+ if (status)
+ goto out;
+ status = decode_putfh(&xdr);
+ if (status)
+ goto out;
+ status = decode_setattr(&xdr, res);
+ if (status)
+ goto out;
status = decode_getfattr(&xdr, res->fattr, res->server);
if (status == NFS4ERR_DELAY)
status = 0;
out:
- return status;
+ return status;
}
/*
@@ -4421,8 +4278,6 @@ static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs_fsinf
status = decode_putfh(&xdr);
if (!status)
status = decode_fsinfo(&xdr, fsinfo);
- if (!status)
- status = nfs4_stat_to_errno(hdr.status);
return status;
}
@@ -4511,8 +4366,6 @@ static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p,
status = decode_compound_hdr(&xdr, &hdr);
if (!status)
status = decode_setclientid(&xdr, clp);
- if (!status)
- status = nfs4_stat_to_errno(hdr.status);
return status;
}
@@ -4533,8 +4386,6 @@ static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, __be32 *p, str
status = decode_putrootfh(&xdr);
if (!status)
status = decode_fsinfo(&xdr, fsinfo);
- if (!status)
- status = nfs4_stat_to_errno(hdr.status);
return status;
}
@@ -4715,7 +4566,7 @@ nfs4_stat_to_errno(int stat)
.p_replen = NFS4_##restype##_sz, \
.p_statidx = NFSPROC4_CLNT_##proc, \
.p_name = #proc, \
- }
+}
struct rpc_procinfo nfs4_procedures[] = {
PROC(READ, enc_read, dec_read),
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index d74d16ce0d49..d9ef602fbc5a 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -86,6 +86,8 @@
#include <net/ipconfig.h>
#include <linux/parser.h>
+#include "internal.h"
+
/* Define this to allow debugging output */
#undef NFSROOT_DEBUG
#define NFSDBG_FACILITY NFSDBG_ROOT
@@ -100,7 +102,7 @@ static char nfs_root_name[256] __initdata = "";
static __be32 servaddr __initdata = 0;
/* Name of directory to mount */
-static char nfs_path[NFS_MAXPATHLEN] __initdata = { 0, };
+static char nfs_export_path[NFS_MAXPATHLEN] __initdata = { 0, };
/* NFS-related data */
static struct nfs_mount_data nfs_data __initdata = { 0, };/* NFS mount info */
@@ -312,7 +314,7 @@ static int __init root_nfs_name(char *name)
printk(KERN_ERR "Root-NFS: Pathname for remote directory too long.\n");
return -1;
}
- sprintf(nfs_path, buf, cp);
+ sprintf(nfs_export_path, buf, cp);
return 1;
}
@@ -340,7 +342,7 @@ static int __init root_nfs_addr(void)
static void __init root_nfs_print(void)
{
printk(KERN_NOTICE "Root-NFS: Mounting %s on server %s as root\n",
- nfs_path, nfs_data.hostname);
+ nfs_export_path, nfs_data.hostname);
printk(KERN_NOTICE "Root-NFS: rsize = %d, wsize = %d, timeo = %d, retrans = %d\n",
nfs_data.rsize, nfs_data.wsize, nfs_data.timeo, nfs_data.retrans);
printk(KERN_NOTICE "Root-NFS: acreg (min,max) = (%d,%d), acdir (min,max) = (%d,%d)\n",
@@ -485,18 +487,23 @@ static int __init root_nfs_get_handle(void)
{
struct nfs_fh fh;
struct sockaddr_in sin;
+ struct nfs_mount_request request = {
+ .sap = (struct sockaddr *)&sin,
+ .salen = sizeof(sin),
+ .dirpath = nfs_export_path,
+ .version = (nfs_data.flags & NFS_MOUNT_VER3) ?
+ NFS_MNT3_VERSION : NFS_MNT_VERSION,
+ .protocol = (nfs_data.flags & NFS_MOUNT_TCP) ?
+ XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP,
+ .fh = &fh,
+ };
int status;
- int protocol = (nfs_data.flags & NFS_MOUNT_TCP) ?
- XPRT_TRANSPORT_TCP : XPRT_TRANSPORT_UDP;
- int version = (nfs_data.flags & NFS_MOUNT_VER3) ?
- NFS_MNT3_VERSION : NFS_MNT_VERSION;
set_sockaddr(&sin, servaddr, htons(mount_port));
- status = nfs_mount((struct sockaddr *) &sin, sizeof(sin), NULL,
- nfs_path, version, protocol, &fh);
+ status = nfs_mount(&request);
if (status < 0)
printk(KERN_ERR "Root-NFS: Server returned error %d "
- "while mounting %s\n", status, nfs_path);
+ "while mounting %s\n", status, nfs_export_path);
else {
nfs_data.root.size = fh.size;
memcpy(nfs_data.root.data, fh.data, fh.size);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 40d17987d0e8..f856004bb7fa 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -533,12 +533,6 @@ readpage_async_filler(void *data, struct page *page)
unsigned int len;
int error;
- error = nfs_wb_page(inode, page);
- if (error)
- goto out_unlock;
- if (PageUptodate(page))
- goto out_unlock;
-
len = nfs_page_length(page);
if (len == 0)
return nfs_return_empty_page(page);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index bb0313ac9e1f..d6686f4786dc 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -75,6 +75,7 @@ enum {
Opt_acl, Opt_noacl,
Opt_rdirplus, Opt_nordirplus,
Opt_sharecache, Opt_nosharecache,
+ Opt_resvport, Opt_noresvport,
/* Mount options that take integer arguments */
Opt_port,
@@ -129,6 +130,8 @@ static const match_table_t nfs_mount_option_tokens = {
{ Opt_nordirplus, "nordirplus" },
{ Opt_sharecache, "sharecache" },
{ Opt_nosharecache, "nosharecache" },
+ { Opt_resvport, "resvport" },
+ { Opt_noresvport, "noresvport" },
{ Opt_port, "port=%u" },
{ Opt_rsize, "rsize=%u" },
@@ -512,7 +515,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss,
{ NFS_MOUNT_NONLM, ",nolock", "" },
{ NFS_MOUNT_NOACL, ",noacl", "" },
{ NFS_MOUNT_NORDIRPLUS, ",nordirplus", "" },
- { NFS_MOUNT_UNSHARED, ",nosharecache", ""},
+ { NFS_MOUNT_UNSHARED, ",nosharecache", "" },
+ { NFS_MOUNT_NORESVPORT, ",noresvport", "" },
{ 0, NULL, NULL }
};
const struct proc_nfs_info *nfs_infop;
@@ -1033,6 +1037,12 @@ static int nfs_parse_mount_options(char *raw,
case Opt_nosharecache:
mnt->flags |= NFS_MOUNT_UNSHARED;
break;
+ case Opt_resvport:
+ mnt->flags &= ~NFS_MOUNT_NORESVPORT;
+ break;
+ case Opt_noresvport:
+ mnt->flags |= NFS_MOUNT_NORESVPORT;
+ break;
/*
* options that take numeric values
@@ -1327,8 +1337,14 @@ out_security_failure:
static int nfs_try_mount(struct nfs_parsed_mount_data *args,
struct nfs_fh *root_fh)
{
- struct sockaddr *sap = (struct sockaddr *)&args->mount_server.address;
- char *hostname;
+ struct nfs_mount_request request = {
+ .sap = (struct sockaddr *)
+ &args->mount_server.address,
+ .dirpath = args->nfs_server.export_path,
+ .protocol = args->mount_server.protocol,
+ .fh = root_fh,
+ .noresvport = args->flags & NFS_MOUNT_NORESVPORT,
+ };
int status;
if (args->mount_server.version == 0) {
@@ -1337,42 +1353,38 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args,
else
args->mount_server.version = NFS_MNT_VERSION;
}
+ request.version = args->mount_server.version;
if (args->mount_server.hostname)
- hostname = args->mount_server.hostname;
+ request.hostname = args->mount_server.hostname;
else
- hostname = args->nfs_server.hostname;
+ request.hostname = args->nfs_server.hostname;
/*
* Construct the mount server's address.
*/
if (args->mount_server.address.ss_family == AF_UNSPEC) {
- memcpy(sap, &args->nfs_server.address,
+ memcpy(request.sap, &args->nfs_server.address,
args->nfs_server.addrlen);
args->mount_server.addrlen = args->nfs_server.addrlen;
}
+ request.salen = args->mount_server.addrlen;
/*
* autobind will be used if mount_server.port == 0
*/
- nfs_set_port(sap, args->mount_server.port);
+ nfs_set_port(request.sap, args->mount_server.port);
/*
* Now ask the mount server to map our export path
* to a file handle.
*/
- status = nfs_mount(sap,
- args->mount_server.addrlen,
- hostname,
- args->nfs_server.export_path,
- args->mount_server.version,
- args->mount_server.protocol,
- root_fh);
+ status = nfs_mount(&request);
if (status == 0)
return 0;
dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n",
- hostname, status);
+ request.hostname, status);
return status;
}
@@ -2419,7 +2431,7 @@ static void nfs4_kill_super(struct super_block *sb)
{
struct nfs_server *server = NFS_SB(sb);
- nfs_return_all_delegations(sb);
+ nfs_super_return_all_delegations(sb);
kill_anon_super(sb);
nfs4_renewd_prepare_shutdown(server);
diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c
index c11f5375d7c1..04133aacb1e5 100644
--- a/fs/nfs_common/nfsacl.c
+++ b/fs/nfs_common/nfsacl.c
@@ -29,8 +29,8 @@
MODULE_LICENSE("GPL");
-EXPORT_SYMBOL(nfsacl_encode);
-EXPORT_SYMBOL(nfsacl_decode);
+EXPORT_SYMBOL_GPL(nfsacl_encode);
+EXPORT_SYMBOL_GPL(nfsacl_decode);
struct nfsacl_encode_desc {
struct xdr_array2_desc desc;
diff --git a/fs/nfsctl.c b/fs/nfsctl.c
index b1acbd6ab6fb..b27451909dff 100644
--- a/fs/nfsctl.c
+++ b/fs/nfsctl.c
@@ -38,9 +38,10 @@ static struct file *do_open(char *name, int flags)
return ERR_PTR(error);
if (flags == O_RDWR)
- error = may_open(&nd,MAY_READ|MAY_WRITE,FMODE_READ|FMODE_WRITE);
+ error = may_open(&nd.path, MAY_READ|MAY_WRITE,
+ FMODE_READ|FMODE_WRITE);
else
- error = may_open(&nd, MAY_WRITE, FMODE_WRITE);
+ error = may_open(&nd.path, MAY_WRITE, FMODE_WRITE);
if (!error)
return dentry_open(nd.path.dentry, nd.path.mnt, flags,
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 094747a1227c..6d7d8c02c197 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -358,6 +358,7 @@ static struct rpc_program cb_program = {
.nrvers = ARRAY_SIZE(nfs_cb_version),
.version = nfs_cb_version,
.stats = &cb_stats,
+ .pipe_dir_name = "/nfsd4_cb",
};
/* Reference counting, callback cleanup, etc., all look racy as heck.
@@ -382,8 +383,9 @@ static int do_probe_callback(void *data)
.program = &cb_program,
.prognumber = cb->cb_prog,
.version = nfs_cb_version[1]->number,
- .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */
+ .authflavor = clp->cl_flavor,
.flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
+ .client_name = clp->cl_principal,
};
struct rpc_message msg = {
.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
@@ -392,6 +394,11 @@ static int do_probe_callback(void *data)
struct rpc_clnt *client;
int status;
+ if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) {
+ status = nfserr_cb_path_down;
+ goto out_err;
+ }
+
/* Initialize address */
memset(&addr, 0, sizeof(addr));
addr.sin_family = AF_INET;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index bf4cd46a5a11..13e0e074dbb8 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -54,6 +54,7 @@
#include <linux/mutex.h>
#include <linux/lockd/bind.h>
#include <linux/module.h>
+#include <linux/sunrpc/svcauth_gss.h>
#define NFSDDBG_FACILITY NFSDDBG_PROC
@@ -377,6 +378,7 @@ free_client(struct nfs4_client *clp)
shutdown_callback_client(clp);
if (clp->cl_cred.cr_group_info)
put_group_info(clp->cl_cred.cr_group_info);
+ kfree(clp->cl_principal);
kfree(clp->cl_name.data);
kfree(clp);
}
@@ -696,6 +698,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
unsigned int strhashval;
struct nfs4_client *conf, *unconf, *new;
__be32 status;
+ char *princ;
char dname[HEXDIR_LEN];
if (!check_name(clname))
@@ -783,6 +786,15 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
}
copy_verf(new, &clverifier);
new->cl_addr = sin->sin_addr.s_addr;
+ new->cl_flavor = rqstp->rq_flavor;
+ princ = svc_gss_principal(rqstp);
+ if (princ) {
+ new->cl_principal = kstrdup(princ, GFP_KERNEL);
+ if (new->cl_principal == NULL) {
+ free_client(new);
+ goto out;
+ }
+ }
copy_cred(&new->cl_cred, &rqstp->rq_cred);
gen_confirm(new);
gen_callback(new, setclid);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index d1c5f787b365..44aa92aba891 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -744,45 +744,16 @@ nfsd_close(struct file *filp)
fput(filp);
}
-/*
- * Sync a file
- * As this calls fsync (not fdatasync) there is no need for a write_inode
- * after it.
- */
-static inline int nfsd_dosync(struct file *filp, struct dentry *dp,
- const struct file_operations *fop)
-{
- struct inode *inode = dp->d_inode;
- int (*fsync) (struct file *, struct dentry *, int);
- int err;
-
- err = filemap_fdatawrite(inode->i_mapping);
- if (err == 0 && fop && (fsync = fop->fsync))
- err = fsync(filp, dp, 0);
- if (err == 0)
- err = filemap_fdatawait(inode->i_mapping);
-
- return err;
-}
-
-
static int
nfsd_sync(struct file *filp)
{
- int err;
- struct inode *inode = filp->f_path.dentry->d_inode;
- dprintk("nfsd: sync file %s\n", filp->f_path.dentry->d_name.name);
- mutex_lock(&inode->i_mutex);
- err=nfsd_dosync(filp, filp->f_path.dentry, filp->f_op);
- mutex_unlock(&inode->i_mutex);
-
- return err;
+ return vfs_fsync(filp, filp->f_path.dentry, 0);
}
int
-nfsd_sync_dir(struct dentry *dp)
+nfsd_sync_dir(struct dentry *dentry)
{
- return nfsd_dosync(NULL, dp, dp->d_inode->i_fop);
+ return vfs_fsync(NULL, dentry, 0);
}
/*
@@ -1211,7 +1182,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
dirp = dentry->d_inode;
err = nfserr_notdir;
- if(!dirp->i_op || !dirp->i_op->lookup)
+ if (!dirp->i_op->lookup)
goto out;
/*
* Check whether the response file handle has been verified yet.
@@ -1347,7 +1318,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
/* Get all the sanity checks out of the way before
* we lock the parent. */
err = nfserr_notdir;
- if(!dirp->i_op || !dirp->i_op->lookup)
+ if (!dirp->i_op->lookup)
goto out;
fh_lock_nested(fhp, I_MUTEX_PARENT);
@@ -1482,7 +1453,7 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp)
inode = dentry->d_inode;
err = nfserr_inval;
- if (!inode->i_op || !inode->i_op->readlink)
+ if (!inode->i_op->readlink)
goto out;
touch_atime(fhp->fh_export->ex_path.mnt, dentry);
@@ -2162,7 +2133,7 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl)
size_t size;
int error;
- if (!IS_POSIXACL(inode) || !inode->i_op ||
+ if (!IS_POSIXACL(inode) ||
!inode->i_op->setxattr || !inode->i_op->removexattr)
return -EOPNOTSUPP;
switch(type) {
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
new file mode 100644
index 000000000000..50914d7303c6
--- /dev/null
+++ b/fs/notify/Kconfig
@@ -0,0 +1,2 @@
+source "fs/notify/dnotify/Kconfig"
+source "fs/notify/inotify/Kconfig"
diff --git a/fs/notify/Makefile b/fs/notify/Makefile
new file mode 100644
index 000000000000..5a95b6010ce7
--- /dev/null
+++ b/fs/notify/Makefile
@@ -0,0 +1,2 @@
+obj-y += dnotify/
+obj-y += inotify/
diff --git a/fs/notify/dnotify/Kconfig b/fs/notify/dnotify/Kconfig
new file mode 100644
index 000000000000..26adf5dfa646
--- /dev/null
+++ b/fs/notify/dnotify/Kconfig
@@ -0,0 +1,10 @@
+config DNOTIFY
+ bool "Dnotify support"
+ default y
+ help
+ Dnotify is a directory-based per-fd file change notification system
+ that uses signals to communicate events to user-space. There exist
+ superior alternatives, but some applications may still rely on
+ dnotify.
+
+ If unsure, say Y.
diff --git a/fs/notify/dnotify/Makefile b/fs/notify/dnotify/Makefile
new file mode 100644
index 000000000000..f145251dcadb
--- /dev/null
+++ b/fs/notify/dnotify/Makefile
@@ -0,0 +1 @@
+obj-$(CONFIG_DNOTIFY) += dnotify.o
diff --git a/fs/dnotify.c b/fs/notify/dnotify/dnotify.c
index 676073b8dda5..b0aa2cde80bd 100644
--- a/fs/dnotify.c
+++ b/fs/notify/dnotify/dnotify.c
@@ -115,9 +115,6 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg)
dn->dn_next = inode->i_dnotify;
inode->i_dnotify = dn;
spin_unlock(&inode->i_lock);
-
- if (filp->f_op && filp->f_op->dir_notify)
- return filp->f_op->dir_notify(filp, arg);
return 0;
out_free:
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig
new file mode 100644
index 000000000000..446792841023
--- /dev/null
+++ b/fs/notify/inotify/Kconfig
@@ -0,0 +1,27 @@
+config INOTIFY
+ bool "Inotify file change notification support"
+ default y
+ ---help---
+ Say Y here to enable inotify support. Inotify is a file change
+ notification system and a replacement for dnotify. Inotify fixes
+ numerous shortcomings in dnotify and introduces several new features
+ including multiple file events, one-shot support, and unmount
+ notification.
+
+ For more information, see <file:Documentation/filesystems/inotify.txt>
+
+ If unsure, say Y.
+
+config INOTIFY_USER
+ bool "Inotify support for userspace"
+ depends on INOTIFY
+ default y
+ ---help---
+ Say Y here to enable inotify support for userspace, including the
+ associated system calls. Inotify allows monitoring of both files and
+ directories via a single open fd. Events are read from the file
+ descriptor, which is also select()- and poll()-able.
+
+ For more information, see <file:Documentation/filesystems/inotify.txt>
+
+ If unsure, say Y.
diff --git a/fs/notify/inotify/Makefile b/fs/notify/inotify/Makefile
new file mode 100644
index 000000000000..e290f3bb9d8d
--- /dev/null
+++ b/fs/notify/inotify/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_INOTIFY) += inotify.o
+obj-$(CONFIG_INOTIFY_USER) += inotify_user.o
diff --git a/fs/inotify.c b/fs/notify/inotify/inotify.c
index dae3f28f30d4..dae3f28f30d4 100644
--- a/fs/inotify.c
+++ b/fs/notify/inotify/inotify.c
diff --git a/fs/inotify_user.c b/fs/notify/inotify/inotify_user.c
index e2425bbd871f..81b8644b0136 100644
--- a/fs/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -76,10 +76,10 @@ struct inotify_device {
struct mutex ev_mutex; /* protects event queue */
struct mutex up_mutex; /* synchronizes watch updates */
struct list_head events; /* list of queued events */
- atomic_t count; /* reference count */
struct user_struct *user; /* user who opened this dev */
struct inotify_handle *ih; /* inotify handle */
struct fasync_struct *fa; /* async notification */
+ atomic_t count; /* reference count */
unsigned int queue_size; /* size of the queue (bytes) */
unsigned int event_count; /* number of pending events */
unsigned int max_events; /* maximum number of events */
@@ -704,7 +704,7 @@ fput_and_out:
return ret;
}
-asmlinkage long sys_inotify_rm_watch(int fd, u32 wd)
+asmlinkage long sys_inotify_rm_watch(int fd, __s32 wd)
{
struct file *filp;
struct inotify_device *dev;
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index e9da092e2772..86bef156cf0a 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1406,9 +1406,6 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi)
ni->allocated_size = sle64_to_cpu(
a->data.non_resident.allocated_size);
}
- /* Setup the operations for this attribute inode. */
- vi->i_op = NULL;
- vi->i_fop = NULL;
if (NInoMstProtected(ni))
vi->i_mapping->a_ops = &ntfs_mst_aops;
else
diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile
index 589dcdfdfe3c..01596079dd63 100644
--- a/fs/ocfs2/Makefile
+++ b/fs/ocfs2/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_OCFS2_FS_USERSPACE_CLUSTER) += ocfs2_stack_user.o
ocfs2-objs := \
alloc.o \
aops.o \
+ blockcheck.o \
buffer_head_io.o \
dcache.o \
dir.o \
@@ -35,8 +36,14 @@ ocfs2-objs := \
sysfile.o \
uptodate.o \
ver.o \
+ quota_local.o \
+ quota_global.o \
xattr.o
+ifeq ($(CONFIG_OCFS2_FS_POSIX_ACL),y)
+ocfs2-objs += acl.o
+endif
+
ocfs2_stackglue-objs := stackglue.o
ocfs2_stack_o2cb-objs := stack_o2cb.o
ocfs2_stack_user-objs := stack_user.o
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
new file mode 100644
index 000000000000..12dfb44c22e5
--- /dev/null
+++ b/fs/ocfs2/acl.c
@@ -0,0 +1,479 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * acl.c
+ *
+ * Copyright (C) 2004, 2008 Oracle. All rights reserved.
+ *
+ * CREDITS:
+ * Lots of code in this file is copied from linux/fs/ext3/acl.c.
+ * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+
+#define MLOG_MASK_PREFIX ML_INODE
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+#include "alloc.h"
+#include "dlmglue.h"
+#include "file.h"
+#include "ocfs2_fs.h"
+
+#include "xattr.h"
+#include "acl.h"
+
+/*
+ * Convert an on-disk ACL xattr value into an in-memory posix_acl.
+ *
+ * @value: buffer holding an array of little-endian struct ocfs2_acl_entry
+ *         records, or NULL when there is nothing to convert.
+ * @size:  length of @value in bytes.
+ *
+ * Returns NULL when @value is NULL, ERR_PTR(-EINVAL) when @size is too
+ * small to hold a single entry, ERR_PTR(-ENOMEM) when the allocation
+ * fails, and otherwise a newly allocated posix_acl. Trailing bytes that
+ * do not make up a whole entry are ignored.
+ */
+static struct posix_acl *ocfs2_acl_from_xattr(const void *value, size_t size)
+{
+	/*
+	 * The buffer is laid out as on-disk entries, so it is sized and
+	 * walked with struct ocfs2_acl_entry rather than the in-memory
+	 * struct posix_acl_entry, and it is never written through.
+	 */
+	const struct ocfs2_acl_entry *entry = value;
+	struct posix_acl *acl;
+	int n, count;
+
+	if (!value)
+		return NULL;
+	if (size < sizeof(struct ocfs2_acl_entry))
+		return ERR_PTR(-EINVAL);
+
+	count = size / sizeof(struct ocfs2_acl_entry);
+	/* count is an int: reject a quotient that did not fit. */
+	if (count < 0)
+		return ERR_PTR(-EINVAL);
+	if (count == 0)
+		return NULL;
+
+	acl = posix_acl_alloc(count, GFP_NOFS);
+	if (!acl)
+		return ERR_PTR(-ENOMEM);
+
+	for (n = 0; n < count; n++, entry++) {
+		acl->a_entries[n].e_tag  = le16_to_cpu(entry->e_tag);
+		acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm);
+		acl->a_entries[n].e_id   = le32_to_cpu(entry->e_id);
+	}
+	return acl;
+}
+
+/*
+ * Convert an in-memory posix_acl into its on-disk xattr representation.
+ *
+ * @acl:  ACL to serialise; must not be NULL.
+ * @size: out parameter, set to the byte length of the returned buffer
+ *        (it is written even when the allocation then fails).
+ *
+ * Returns a kmalloc'ed array of little-endian struct ocfs2_acl_entry that
+ * the caller must kfree(), or ERR_PTR(-ENOMEM) on allocation failure.
+ */
+static void *ocfs2_acl_to_xattr(const struct posix_acl *acl, size_t *size)
+{
+	struct ocfs2_acl_entry *entry;
+	void *ocfs2_acl;
+	size_t n;
+
+	/* The buffer holds on-disk entries, so size it with that struct. */
+	*size = acl->a_count * sizeof(struct ocfs2_acl_entry);
+
+	ocfs2_acl = kmalloc(*size, GFP_NOFS);
+	if (!ocfs2_acl)
+		return ERR_PTR(-ENOMEM);
+
+	entry = ocfs2_acl;
+	for (n = 0; n < acl->a_count; n++, entry++) {
+		entry->e_tag  = cpu_to_le16(acl->a_entries[n].e_tag);
+		entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm);
+		entry->e_id   = cpu_to_le32(acl->a_entries[n].e_id);
+	}
+	return ocfs2_acl;
+}
+
+/*
+ * Read the ACL of the given type from @inode's xattrs using the supplied
+ * dinode buffer; this function does not take the inode lock itself.
+ *
+ * Returns NULL when ACLs are disabled by mount option or no ACL is stored,
+ * ERR_PTR(-EINVAL) for an unknown @type, ERR_PTR(-ENOMEM) on allocation
+ * failure, ERR_PTR(err) for any other xattr read error, and otherwise the
+ * converted posix_acl.
+ */
+static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode,
+					      int type,
+					      struct buffer_head *di_bh)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct posix_acl *acl = NULL;
+	char *buf = NULL;
+	int xattr_index;
+	int len;
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return NULL;
+
+	if (type == ACL_TYPE_ACCESS)
+		xattr_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
+	else if (type == ACL_TYPE_DEFAULT)
+		xattr_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
+	else
+		return ERR_PTR(-EINVAL);
+
+	/*
+	 * Query with no buffer first; a positive result is used as the
+	 * number of bytes to allocate for the real read.
+	 */
+	len = ocfs2_xattr_get_nolock(inode, di_bh, xattr_index, "", NULL, 0);
+	if (len > 0) {
+		buf = kmalloc(len, GFP_NOFS);
+		if (!buf)
+			return ERR_PTR(-ENOMEM);
+		len = ocfs2_xattr_get_nolock(inode, di_bh, xattr_index,
+					     "", buf, len);
+	}
+
+	if (len > 0)
+		acl = ocfs2_acl_from_xattr(buf, len);
+	else if (len != 0 && len != -ENODATA)
+		acl = ERR_PTR(len);
+	/* len == 0 or -ENODATA: no ACL stored, acl stays NULL. */
+
+	kfree(buf);
+
+	return acl;
+}
+
+
+/*
+ * Get the posix ACL of the given type for @inode.
+ *
+ * Takes the inode lock (third argument 0) around the xattr read and drops
+ * it again before returning. Returns NULL when ACLs are disabled by mount
+ * option, ERR_PTR(err) when taking the lock fails, and otherwise whatever
+ * ocfs2_get_acl_nolock() returns.
+ */
+static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct buffer_head *di_bh = NULL;
+	struct posix_acl *acl;
+	int status;
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return NULL;
+
+	status = ocfs2_inode_lock(inode, &di_bh, 0);
+	if (status < 0) {
+		mlog_errno(status);
+		return ERR_PTR(status);
+	}
+
+	acl = ocfs2_get_acl_nolock(inode, type, di_bh);
+
+	ocfs2_inode_unlock(inode, 0);
+	brelse(di_bh);
+
+	return acl;
+}
+
+/*
+ * Set the access or default ACL of an inode.
+ *
+ * @handle:  running transaction, or NULL; selects which xattr setter is
+ *           used (ocfs2_xattr_set_handle() vs. ocfs2_xattr_set()).
+ * @inode:   inode whose ACL is being set; symlinks get -EOPNOTSUPP.
+ * @di_bh:   passed through to ocfs2_xattr_set_handle() only.
+ * @type:    ACL_TYPE_ACCESS or ACL_TYPE_DEFAULT; anything else is -EINVAL.
+ * @acl:     ACL to store, or NULL, in which case a NULL value of size 0 is
+ *           handed to the xattr layer.
+ * @meta_ac, @data_ac: passed through to ocfs2_xattr_set_handle() only.
+ *
+ * Returns 0 or a negative errno.
+ *
+ * NOTE(review): for ACL_TYPE_ACCESS, inode->i_mode is updated before the
+ * xattr write is attempted and is not restored if that write (or the
+ * allocation in ocfs2_acl_to_xattr()) fails; nothing in this function marks
+ * the inode dirty either. Confirm callers cope with that.
+ */
+static int ocfs2_set_acl(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *di_bh,
+ int type,
+ struct posix_acl *acl,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_alloc_context *data_ac)
+{
+ int name_index;
+ void *value = NULL;
+ size_t size = 0;
+ int ret;
+
+ if (S_ISLNK(inode->i_mode))
+ return -EOPNOTSUPP;
+
+ switch (type) {
+ case ACL_TYPE_ACCESS:
+ name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
+ if (acl) {
+ mode_t mode = inode->i_mode;
+ ret = posix_acl_equiv_mode(acl, &mode);
+ if (ret < 0)
+ return ret;
+ else {
+ /* Fold the ACL into the mode bits. */
+ inode->i_mode = mode;
+ /*
+ * A 0 return means no ACL is stored: acl is dropped here, so
+ * value stays NULL and size stays 0 below.
+ */
+ if (ret == 0)
+ acl = NULL;
+ }
+ }
+ break;
+ case ACL_TYPE_DEFAULT:
+ name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
+ /* Only directories take a default ACL; clearing one elsewhere is a no-op. */
+ if (!S_ISDIR(inode->i_mode))
+ return acl ? -EACCES : 0;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (acl) {
+ value = ocfs2_acl_to_xattr(acl, &size);
+ if (IS_ERR(value))
+ return (int)PTR_ERR(value);
+ }
+
+ if (handle)
+ ret = ocfs2_xattr_set_handle(handle, inode, di_bh, name_index,
+ "", value, size, 0,
+ meta_ac, data_ac);
+ else
+ ret = ocfs2_xattr_set(inode, name_index, "", value, size, 0);
+
+ /* value is NULL when no ACL was serialised; kfree(NULL) is a no-op. */
+ kfree(value);
+
+ return ret;
+}
+
+/*
+ * Check @mask against the access ACL of @inode.
+ *
+ * Returns the result of posix_acl_permission() when an access ACL exists,
+ * the error from ocfs2_get_acl() when the lookup fails, and -EAGAIN when
+ * the inode has no access ACL.
+ */
+int ocfs2_check_acl(struct inode *inode, int mask)
+{
+	struct posix_acl *acl;
+	int allowed;
+
+	acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS);
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+	if (!acl)
+		return -EAGAIN;
+
+	allowed = posix_acl_permission(inode, acl, mask);
+	posix_acl_release(acl);
+
+	return allowed;
+}
+
+/*
+ * Bring the access ACL of @inode in line with its current i_mode.
+ *
+ * Returns -EOPNOTSUPP for symlinks, 0 when ACLs are disabled by mount
+ * option or the inode has no access ACL, the lookup error when reading
+ * the ACL fails, -ENOMEM when cloning fails, and otherwise the result of
+ * posix_acl_chmod_masq() / ocfs2_set_acl().
+ */
+int ocfs2_acl_chmod(struct inode *inode)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct posix_acl *current_acl, *copy;
+	int status;
+
+	if (S_ISLNK(inode->i_mode))
+		return -EOPNOTSUPP;
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return 0;
+
+	current_acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS);
+	if (!current_acl)
+		return 0;
+	if (IS_ERR(current_acl))
+		return PTR_ERR(current_acl);
+
+	/* Work on a private copy; the original reference is dropped now. */
+	copy = posix_acl_clone(current_acl, GFP_KERNEL);
+	posix_acl_release(current_acl);
+	if (!copy)
+		return -ENOMEM;
+
+	status = posix_acl_chmod_masq(copy, inode->i_mode);
+	if (status == 0)
+		status = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS,
+				       copy, NULL, NULL);
+
+	posix_acl_release(copy);
+	return status;
+}
+
+/*
+ * Initialize the ACLs of a new inode. If parent directory has default ACL,
+ * then clone to new inode. Called from ocfs2_mknod.
+ *
+ * @handle:  passed through to ocfs2_set_acl().
+ * @inode:   the new inode; its i_mode may be adjusted here.
+ * @dir:     parent directory whose default ACL is consulted.
+ * @di_bh:   passed through to ocfs2_set_acl() for @inode.
+ * @dir_bh:  buffer used to read @dir's default ACL without locking.
+ * @meta_ac, @data_ac: passed through to ocfs2_set_acl().
+ *
+ * Returns 0 on success or a negative errno. Symlinks are left untouched
+ * (no ACL lookup and no umask applied).
+ */
+int ocfs2_init_acl(handle_t *handle,
+ struct inode *inode,
+ struct inode *dir,
+ struct buffer_head *di_bh,
+ struct buffer_head *dir_bh,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_alloc_context *data_ac)
+{
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct posix_acl *acl = NULL;
+ int ret = 0;
+
+ if (!S_ISLNK(inode->i_mode)) {
+ if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
+ acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT,
+ dir_bh);
+ if (IS_ERR(acl))
+ return PTR_ERR(acl);
+ }
+ /* No default ACL to inherit: fall back to the process umask. */
+ if (!acl)
+ inode->i_mode &= ~current->fs->umask;
+ }
+ if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) {
+ struct posix_acl *clone;
+ mode_t mode;
+
+ /* A new directory inherits the parent's default ACL as its own default. */
+ if (S_ISDIR(inode->i_mode)) {
+ ret = ocfs2_set_acl(handle, inode, di_bh,
+ ACL_TYPE_DEFAULT, acl,
+ meta_ac, data_ac);
+ if (ret)
+ goto cleanup;
+ }
+ clone = posix_acl_clone(acl, GFP_NOFS);
+ /* Preset the error so the goto below reports -ENOMEM. */
+ ret = -ENOMEM;
+ if (!clone)
+ goto cleanup;
+
+ mode = inode->i_mode;
+ ret = posix_acl_create_masq(clone, &mode);
+ if (ret >= 0) {
+ inode->i_mode = mode;
+ /* Only a positive return stores an access ACL; 0 leaves just the mode. */
+ if (ret > 0) {
+ ret = ocfs2_set_acl(handle, inode,
+ di_bh, ACL_TYPE_ACCESS,
+ clone, meta_ac, data_ac);
+ }
+ }
+ posix_acl_release(clone);
+ }
+cleanup:
+ /* acl may still be NULL here. */
+ posix_acl_release(acl);
+ return ret;
+}
+
+/*
+ * xattr ->list hook for the access ACL name.
+ *
+ * Copies POSIX_ACL_XATTR_ACCESS into @list when @list is non-NULL and
+ * @list_len is large enough. Returns 0 when ACLs are disabled by mount
+ * option, otherwise sizeof(POSIX_ACL_XATTR_ACCESS) whether or not a copy
+ * was made. @name and @name_len are unused.
+ */
+static size_t ocfs2_xattr_list_acl_access(struct inode *inode,
+					  char *list,
+					  size_t list_len,
+					  const char *name,
+					  size_t name_len)
+{
+	const size_t needed = sizeof(POSIX_ACL_XATTR_ACCESS);
+
+	if (!(OCFS2_SB(inode->i_sb)->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return 0;
+
+	if (list != NULL && list_len >= needed)
+		memcpy(list, POSIX_ACL_XATTR_ACCESS, needed);
+
+	return needed;
+}
+
+/*
+ * xattr ->list hook for the default ACL name.
+ *
+ * Copies POSIX_ACL_XATTR_DEFAULT into @list when @list is non-NULL and
+ * @list_len is large enough. Returns 0 when ACLs are disabled by mount
+ * option, otherwise sizeof(POSIX_ACL_XATTR_DEFAULT) whether or not a copy
+ * was made. @name and @name_len are unused.
+ */
+static size_t ocfs2_xattr_list_acl_default(struct inode *inode,
+					   char *list,
+					   size_t list_len,
+					   const char *name,
+					   size_t name_len)
+{
+	const size_t needed = sizeof(POSIX_ACL_XATTR_DEFAULT);
+
+	if (!(OCFS2_SB(inode->i_sb)->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return 0;
+
+	if (list != NULL && list_len >= needed)
+		memcpy(list, POSIX_ACL_XATTR_DEFAULT, needed);
+
+	return needed;
+}
+
+/*
+ * Common ->get implementation for both ACL xattr handlers.
+ *
+ * Looks up the ACL of @type and serialises it into @buffer/@size with
+ * posix_acl_to_xattr(), whose result is returned. Returns -EOPNOTSUPP when
+ * ACLs are disabled by mount option, -ENODATA when the inode has no such
+ * ACL, or the lookup error.
+ */
+static int ocfs2_xattr_get_acl(struct inode *inode,
+			       int type,
+			       void *buffer,
+			       size_t size)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct posix_acl *acl;
+	int written;
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return -EOPNOTSUPP;
+
+	acl = ocfs2_get_acl(inode, type);
+	if (!acl)
+		return -ENODATA;
+	if (IS_ERR(acl))
+		return PTR_ERR(acl);
+
+	written = posix_acl_to_xattr(acl, buffer, size);
+	posix_acl_release(acl);
+
+	return written;
+}
+
+/*
+ * ->get hook for the access ACL handler. Only the empty name suffix is
+ * accepted; anything else yields -EINVAL.
+ */
+static int ocfs2_xattr_get_acl_access(struct inode *inode,
+				      const char *name,
+				      void *buffer,
+				      size_t size)
+{
+	return strcmp(name, "") != 0
+		? -EINVAL
+		: ocfs2_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size);
+}
+
+/*
+ * ->get hook for the default ACL handler. Only the empty name suffix is
+ * accepted; anything else yields -EINVAL.
+ */
+static int ocfs2_xattr_get_acl_default(struct inode *inode,
+				       const char *name,
+				       void *buffer,
+				       size_t size)
+{
+	return strcmp(name, "") != 0
+		? -EINVAL
+		: ocfs2_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size);
+}
+
+/*
+ * Common ->set implementation for both ACL xattr handlers.
+ *
+ * Parses @value/@size with posix_acl_from_xattr() (a NULL @value means
+ * "no ACL"), validates the result and hands it to ocfs2_set_acl() without
+ * a transaction handle. Returns -EOPNOTSUPP when ACLs are disabled by
+ * mount option, -EPERM when is_owner_or_cap() refuses the caller, or the
+ * error from parsing, validation or ocfs2_set_acl().
+ */
+static int ocfs2_xattr_set_acl(struct inode *inode,
+			       int type,
+			       const void *value,
+			       size_t size)
+{
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct posix_acl *acl = NULL;
+	int status = 0;
+
+	if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL))
+		return -EOPNOTSUPP;
+
+	if (!is_owner_or_cap(inode))
+		return -EPERM;
+
+	if (value) {
+		acl = posix_acl_from_xattr(value, size);
+		if (IS_ERR(acl))
+			return PTR_ERR(acl);
+	}
+
+	/* Validate only when the parse produced an ACL. */
+	if (acl)
+		status = posix_acl_valid(acl);
+
+	if (!status)
+		status = ocfs2_set_acl(NULL, inode, NULL, type, acl,
+				       NULL, NULL);
+
+	posix_acl_release(acl);
+	return status;
+}
+
+/*
+ * ->set hook for the access ACL handler. Only the empty name suffix is
+ * accepted; anything else yields -EINVAL. @flags is unused.
+ */
+static int ocfs2_xattr_set_acl_access(struct inode *inode,
+				      const char *name,
+				      const void *value,
+				      size_t size,
+				      int flags)
+{
+	return strcmp(name, "") != 0
+		? -EINVAL
+		: ocfs2_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size);
+}
+
+/*
+ * ->set hook for the default ACL handler. Only the empty name suffix is
+ * accepted; anything else yields -EINVAL. @flags is unused.
+ */
+static int ocfs2_xattr_set_acl_default(struct inode *inode,
+				       const char *name,
+				       const void *value,
+				       size_t size,
+				       int flags)
+{
+	return strcmp(name, "") != 0
+		? -EINVAL
+		: ocfs2_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size);
+}
+
+/*
+ * xattr handler for the POSIX access ACL attribute: wires the
+ * POSIX_ACL_XATTR_ACCESS prefix to the access list/get/set hooks above.
+ */
+struct xattr_handler ocfs2_xattr_acl_access_handler = {
+ .prefix = POSIX_ACL_XATTR_ACCESS,
+ .list = ocfs2_xattr_list_acl_access,
+ .get = ocfs2_xattr_get_acl_access,
+ .set = ocfs2_xattr_set_acl_access,
+};
+
+/*
+ * xattr handler for the POSIX default ACL attribute: wires the
+ * POSIX_ACL_XATTR_DEFAULT prefix to the default list/get/set hooks above.
+ */
+struct xattr_handler ocfs2_xattr_acl_default_handler = {
+ .prefix = POSIX_ACL_XATTR_DEFAULT,
+ .list = ocfs2_xattr_list_acl_default,
+ .get = ocfs2_xattr_get_acl_default,
+ .set = ocfs2_xattr_set_acl_default,
+};
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
new file mode 100644
index 000000000000..8f6389ed4da5
--- /dev/null
+++ b/fs/ocfs2/acl.h
@@ -0,0 +1,58 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * acl.h
+ *
+ * Copyright (C) 2004, 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef OCFS2_ACL_H
+#define OCFS2_ACL_H
+
+#include <linux/posix_acl_xattr.h>
+
+/*
+ * One ACL entry as stored in the xattr value; all fields are little-endian.
+ * An ACL value is an array of these entries.
+ */
+struct ocfs2_acl_entry {
+ __le16 e_tag; /* converted to/from posix_acl_entry.e_tag */
+ __le16 e_perm; /* converted to/from posix_acl_entry.e_perm */
+ __le32 e_id; /* converted to/from posix_acl_entry.e_id */
+};
+
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+
+extern int ocfs2_check_acl(struct inode *, int);
+extern int ocfs2_acl_chmod(struct inode *);
+extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
+ struct buffer_head *, struct buffer_head *,
+ struct ocfs2_alloc_context *,
+ struct ocfs2_alloc_context *);
+
+#else /* CONFIG_OCFS2_FS_POSIX_ACL*/
+
+#define ocfs2_check_acl NULL
+/* Stub used when CONFIG_OCFS2_FS_POSIX_ACL is not set: nothing to update. */
+static inline int ocfs2_acl_chmod(struct inode *inode)
+{
+ return 0;
+}
+/*
+ * Stub used when CONFIG_OCFS2_FS_POSIX_ACL is not set: ignores all of its
+ * arguments and reports success.
+ */
+static inline int ocfs2_init_acl(handle_t *handle,
+ struct inode *inode,
+ struct inode *dir,
+ struct buffer_head *di_bh,
+ struct buffer_head *dir_bh,
+ struct ocfs2_alloc_context *meta_ac,
+ struct ocfs2_alloc_context *data_ac)
+{
+ return 0;
+}
+
+#endif /* CONFIG_OCFS2_FS_POSIX_ACL*/
+
+#endif /* OCFS2_ACL_H */
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 0cc2deb9394c..54ff4c77aaa3 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -28,6 +28,7 @@
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/swap.h>
+#include <linux/quotaops.h>
#define MLOG_MASK_PREFIX ML_DISK_ALLOC
#include <cluster/masklog.h>
@@ -36,6 +37,7 @@
#include "alloc.h"
#include "aops.h"
+#include "blockcheck.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "inode.h"
@@ -46,6 +48,7 @@
#include "file.h"
#include "super.h"
#include "uptodate.h"
+#include "xattr.h"
#include "buffer_head_io.h"
@@ -187,20 +190,12 @@ static int ocfs2_dinode_insert_check(struct inode *inode,
static int ocfs2_dinode_sanity_check(struct inode *inode,
struct ocfs2_extent_tree *et)
{
- int ret = 0;
- struct ocfs2_dinode *di;
+ struct ocfs2_dinode *di = et->et_object;
BUG_ON(et->et_ops != &ocfs2_dinode_et_ops);
+ BUG_ON(!OCFS2_IS_VALID_DINODE(di));
- di = et->et_object;
- if (!OCFS2_IS_VALID_DINODE(di)) {
- ret = -EIO;
- ocfs2_error(inode->i_sb,
- "Inode %llu has invalid path root",
- (unsigned long long)OCFS2_I(inode)->ip_blkno);
- }
-
- return ret;
+ return 0;
}
static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
@@ -213,36 +208,33 @@ static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et)
static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et)
{
- struct ocfs2_xattr_value_root *xv = et->et_object;
+ struct ocfs2_xattr_value_buf *vb = et->et_object;
- et->et_root_el = &xv->xr_list;
+ et->et_root_el = &vb->vb_xv->xr_list;
}
static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et,
u64 blkno)
{
- struct ocfs2_xattr_value_root *xv =
- (struct ocfs2_xattr_value_root *)et->et_object;
+ struct ocfs2_xattr_value_buf *vb = et->et_object;
- xv->xr_last_eb_blk = cpu_to_le64(blkno);
+ vb->vb_xv->xr_last_eb_blk = cpu_to_le64(blkno);
}
static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et)
{
- struct ocfs2_xattr_value_root *xv =
- (struct ocfs2_xattr_value_root *) et->et_object;
+ struct ocfs2_xattr_value_buf *vb = et->et_object;
- return le64_to_cpu(xv->xr_last_eb_blk);
+ return le64_to_cpu(vb->vb_xv->xr_last_eb_blk);
}
static void ocfs2_xattr_value_update_clusters(struct inode *inode,
struct ocfs2_extent_tree *et,
u32 clusters)
{
- struct ocfs2_xattr_value_root *xv =
- (struct ocfs2_xattr_value_root *)et->et_object;
+ struct ocfs2_xattr_value_buf *vb = et->et_object;
- le32_add_cpu(&xv->xr_clusters, clusters);
+ le32_add_cpu(&vb->vb_xv->xr_clusters, clusters);
}
static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
@@ -304,11 +296,13 @@ static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
struct inode *inode,
struct buffer_head *bh,
+ ocfs2_journal_access_func access,
void *obj,
struct ocfs2_extent_tree_operations *ops)
{
et->et_ops = ops;
et->et_root_bh = bh;
+ et->et_root_journal_access = access;
if (!obj)
obj = (void *)bh->b_data;
et->et_object = obj;
@@ -324,23 +318,23 @@ void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
struct inode *inode,
struct buffer_head *bh)
{
- __ocfs2_init_extent_tree(et, inode, bh, NULL, &ocfs2_dinode_et_ops);
+ __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_di,
+ NULL, &ocfs2_dinode_et_ops);
}
void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
struct inode *inode,
struct buffer_head *bh)
{
- __ocfs2_init_extent_tree(et, inode, bh, NULL,
- &ocfs2_xattr_tree_et_ops);
+ __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_xb,
+ NULL, &ocfs2_xattr_tree_et_ops);
}
void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
struct inode *inode,
- struct buffer_head *bh,
- struct ocfs2_xattr_value_root *xv)
+ struct ocfs2_xattr_value_buf *vb)
{
- __ocfs2_init_extent_tree(et, inode, bh, xv,
+ __ocfs2_init_extent_tree(et, inode, vb->vb_bh, vb->vb_access, vb,
&ocfs2_xattr_value_et_ops);
}
@@ -362,6 +356,15 @@ static inline void ocfs2_et_update_clusters(struct inode *inode,
et->et_ops->eo_update_clusters(inode, et, clusters);
}
+/*
+ * Journal the root buffer of an extent tree.  Both the access
+ * function and the root buffer head come from the tree itself; the
+ * caller only supplies the handle, the inode and the access type.
+ * Returns whatever the tree's access function returns.
+ */
+static inline int ocfs2_et_root_journal_access(handle_t *handle,
+					       struct inode *inode,
+					       struct ocfs2_extent_tree *et,
+					       int type)
+{
+	struct buffer_head *root_bh = et->et_root_bh;
+
+	return et->et_root_journal_access(handle, inode, root_bh, type);
+}
+
static inline int ocfs2_et_insert_check(struct inode *inode,
struct ocfs2_extent_tree *et,
struct ocfs2_extent_rec *rec)
@@ -402,12 +405,14 @@ struct ocfs2_path_item {
#define OCFS2_MAX_PATH_DEPTH 5
struct ocfs2_path {
- int p_tree_depth;
- struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH];
+ int p_tree_depth;
+ ocfs2_journal_access_func p_root_access;
+ struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH];
};
#define path_root_bh(_path) ((_path)->p_node[0].bh)
#define path_root_el(_path) ((_path)->p_node[0].el)
+#define path_root_access(_path)((_path)->p_root_access)
#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
#define path_num_items(_path) ((_path)->p_tree_depth + 1)
@@ -440,6 +445,8 @@ static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
*/
if (keep_root)
depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
+ else
+ path_root_access(path) = NULL;
path->p_tree_depth = depth;
}
@@ -465,6 +472,7 @@ static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src)
BUG_ON(path_root_bh(dest) != path_root_bh(src));
BUG_ON(path_root_el(dest) != path_root_el(src));
+ BUG_ON(path_root_access(dest) != path_root_access(src));
ocfs2_reinit_path(dest, 1);
@@ -486,6 +494,7 @@ static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
int i;
BUG_ON(path_root_bh(dest) != path_root_bh(src));
+ BUG_ON(path_root_access(dest) != path_root_access(src));
for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
brelse(dest->p_node[i].bh);
@@ -521,7 +530,8 @@ static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
}
static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
- struct ocfs2_extent_list *root_el)
+ struct ocfs2_extent_list *root_el,
+ ocfs2_journal_access_func access)
{
struct ocfs2_path *path;
@@ -533,11 +543,48 @@ static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
get_bh(root_bh);
path_root_bh(path) = root_bh;
path_root_el(path) = root_el;
+ path_root_access(path) = access;
}
return path;
}
+/*
+ * Build a new path through ocfs2_new_path(), seeded with the root
+ * buffer head, root extent list and root access function of @path.
+ * Returns whatever ocfs2_new_path() returns.
+ */
+static struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path)
+{
+	struct buffer_head *root_bh = path_root_bh(path);
+	struct ocfs2_extent_list *root_el = path_root_el(path);
+
+	return ocfs2_new_path(root_bh, root_el, path_root_access(path));
+}
+
+/*
+ * Build a new path through ocfs2_new_path(), seeded with the root
+ * buffer head, root extent list and root journal access function
+ * recorded in the extent tree @et.  Returns whatever
+ * ocfs2_new_path() returns.
+ */
+static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et)
+{
+	struct buffer_head *root_bh = et->et_root_bh;
+	struct ocfs2_extent_list *root_el = et->et_root_el;
+
+	return ocfs2_new_path(root_bh, root_el, et->et_root_journal_access);
+}
+
+/*
+ * Journal the buffer at depth idx of a path for writing.  The root
+ * (idx == 0) goes through the path's root access function, falling
+ * back to ocfs2_journal_access() when the path has none set.  Every
+ * deeper buffer is an extent block and uses
+ * ocfs2_journal_access_eb().
+ *
+ * The name sits awkwardly next to ocfs2_journal_access_path(), but
+ * no better one has turned up.
+ */
+static int ocfs2_path_bh_journal_access(handle_t *handle,
+					struct inode *inode,
+					struct ocfs2_path *path,
+					int idx)
+{
+	ocfs2_journal_access_func access;
+
+	if (idx) {
+		access = ocfs2_journal_access_eb;
+	} else {
+		access = path_root_access(path);
+		if (!access)
+			access = ocfs2_journal_access;
+	}
+
+	return access(handle, inode, path->p_node[idx].bh,
+		      OCFS2_JOURNAL_ACCESS_WRITE);
+}
+
/*
* Convenience function to journal all components in a path.
*/
@@ -550,8 +597,7 @@ static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
goto out;
for(i = 0; i < path_num_items(path); i++) {
- ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_path_bh_journal_access(handle, inode, path, i);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -686,6 +732,80 @@ struct ocfs2_merge_ctxt {
int c_split_covers_rec;
};
+/*
+ * Validation callback passed to ocfs2_read_block() by
+ * ocfs2_read_extent_block().  Checks, in order: the metadata
+ * checksum, the extent block signature, that h_blkno matches the
+ * block the buffer was read from, and that h_fs_generation matches
+ * the superblock's generation.
+ *
+ * Returns 0 when every check passes, the ecc error code when the
+ * checksum fails, and -EINVAL (after ocfs2_error()) for any of the
+ * later checks.
+ */
+static int ocfs2_validate_extent_block(struct super_block *sb,
+				       struct buffer_head *bh)
+{
+	struct ocfs2_extent_block *eb;
+	int status;
+
+	mlog(0, "Validating extent block %llu\n",
+	     (unsigned long long)bh->b_blocknr);
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	eb = (struct ocfs2_extent_block *)bh->b_data;
+
+	/*
+	 * A checksum failure is reported to the caller, but the
+	 * filesystem is otherwise left running: the damage is known
+	 * to be confined to this one block.
+	 */
+	status = ocfs2_validate_meta_ecc(sb, bh->b_data, &eb->h_check);
+	if (status) {
+		mlog(ML_ERROR, "Checksum failed for extent block %llu\n",
+		     (unsigned long long)bh->b_blocknr);
+		return status;
+	}
+
+	/*
+	 * Everything below is a fatal error.  status is 0 here, so it
+	 * is returned unchanged when no check trips.
+	 */
+	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+		ocfs2_error(sb,
+			    "Extent block #%llu has bad signature %.*s",
+			    (unsigned long long)bh->b_blocknr, 7,
+			    eb->h_signature);
+		status = -EINVAL;
+	} else if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) {
+		ocfs2_error(sb,
+			    "Extent block #%llu has an invalid h_blkno "
+			    "of %llu",
+			    (unsigned long long)bh->b_blocknr,
+			    (unsigned long long)le64_to_cpu(eb->h_blkno));
+		status = -EINVAL;
+	} else if (le32_to_cpu(eb->h_fs_generation) !=
+		   OCFS2_SB(sb)->fs_generation) {
+		ocfs2_error(sb,
+			    "Extent block #%llu has an invalid "
+			    "h_fs_generation of #%u",
+			    (unsigned long long)bh->b_blocknr,
+			    le32_to_cpu(eb->h_fs_generation));
+		status = -EINVAL;
+	}
+
+	return status;
+}
+
+/*
+ * Read extent block eb_blkno through ocfs2_read_block(), with
+ * ocfs2_validate_extent_block() as the validation callback.
+ *
+ * *bh may already point at a buffer head on entry; it is passed down
+ * as-is.  *bh is only written on success, and only when the caller
+ * passed in NULL.  Returns the ocfs2_read_block() result.
+ */
+int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
+			    struct buffer_head **bh)
+{
+	struct buffer_head *eb_bh = *bh;
+	int status;
+
+	status = ocfs2_read_block(inode, eb_blkno, &eb_bh,
+				  ocfs2_validate_extent_block);
+	if (status)
+		return status;
+
+	/* The caller had no bh of its own, so hand up the one we got. */
+	if (!*bh)
+		*bh = eb_bh;
+
+	return 0;
+}
+
+
/*
* How many free extents have we got before we need more meta data?
*/
@@ -705,8 +825,7 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb,
last_eb_blk = ocfs2_et_get_last_eb_blk(et);
if (last_eb_blk) {
- retval = ocfs2_read_block(inode, last_eb_blk,
- &eb_bh);
+ retval = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh);
if (retval < 0) {
mlog_errno(retval);
goto bail;
@@ -768,8 +887,8 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
}
ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
- status = ocfs2_journal_access(handle, inode, bhs[i],
- OCFS2_JOURNAL_ACCESS_CREATE);
+ status = ocfs2_journal_access_eb(handle, inode, bhs[i],
+ OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -908,15 +1027,12 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
for(i = 0; i < new_blocks; i++) {
bh = new_eb_bhs[i];
eb = (struct ocfs2_extent_block *) bh->b_data;
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
- status = -EIO;
- goto bail;
- }
+ /* ocfs2_create_new_meta_bhs() should create it right! */
+ BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
eb_el = &eb->h_list;
- status = ocfs2_journal_access(handle, inode, bh,
- OCFS2_JOURNAL_ACCESS_CREATE);
+ status = ocfs2_journal_access_eb(handle, inode, bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -955,21 +1071,21 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
* journal_dirty erroring as it won't unless we've aborted the
* handle (in which case we would never be here) so reserving
* the write with journal_access is all we need to do. */
- status = ocfs2_journal_access(handle, inode, *last_eb_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_eb(handle, inode, *last_eb_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
- status = ocfs2_journal_access(handle, inode, et->et_root_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_et_root_journal_access(handle, inode, et,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
if (eb_bh) {
- status = ocfs2_journal_access(handle, inode, eb_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_eb(handle, inode, eb_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1052,17 +1168,14 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
}
eb = (struct ocfs2_extent_block *) new_eb_bh->b_data;
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
- status = -EIO;
- goto bail;
- }
+ /* ocfs2_create_new_meta_bhs() should create it right! */
+ BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
eb_el = &eb->h_list;
root_el = et->et_root_el;
- status = ocfs2_journal_access(handle, inode, new_eb_bh,
- OCFS2_JOURNAL_ACCESS_CREATE);
+ status = ocfs2_journal_access_eb(handle, inode, new_eb_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1080,8 +1193,8 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
goto bail;
}
- status = ocfs2_journal_access(handle, inode, et->et_root_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_et_root_journal_access(handle, inode, et,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1176,18 +1289,13 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb,
brelse(bh);
bh = NULL;
- status = ocfs2_read_block(inode, blkno, &bh);
+ status = ocfs2_read_extent_block(inode, blkno, &bh);
if (status < 0) {
mlog_errno(status);
goto bail;
}
eb = (struct ocfs2_extent_block *) bh->b_data;
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
- status = -EIO;
- goto bail;
- }
el = &eb->h_list;
if (le16_to_cpu(el->l_next_free_rec) <
@@ -1540,7 +1648,7 @@ static int __ocfs2_find_path(struct inode *inode,
brelse(bh);
bh = NULL;
- ret = ocfs2_read_block(inode, blkno, &bh);
+ ret = ocfs2_read_extent_block(inode, blkno, &bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -1548,11 +1656,6 @@ static int __ocfs2_find_path(struct inode *inode,
eb = (struct ocfs2_extent_block *) bh->b_data;
el = &eb->h_list;
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
- ret = -EIO;
- goto out;
- }
if (le16_to_cpu(el->l_next_free_rec) >
le16_to_cpu(el->l_count)) {
@@ -1860,25 +1963,23 @@ static int ocfs2_rotate_subtree_right(struct inode *inode,
root_bh = left_path->p_node[subtree_index].bh;
BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
- ret = ocfs2_journal_access(handle, inode, root_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
+ subtree_index);
if (ret) {
mlog_errno(ret);
goto out;
}
for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
- ret = ocfs2_journal_access(handle, inode,
- right_path->p_node[i].bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_path_bh_journal_access(handle, inode,
+ right_path, i);
if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_journal_access(handle, inode,
- left_path->p_node[i].bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_path_bh_journal_access(handle, inode,
+ left_path, i);
if (ret) {
mlog_errno(ret);
goto out;
@@ -2102,8 +2203,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode,
*ret_left_path = NULL;
- left_path = ocfs2_new_path(path_root_bh(right_path),
- path_root_el(right_path));
+ left_path = ocfs2_new_path_from_path(right_path);
if (!left_path) {
ret = -ENOMEM;
mlog_errno(ret);
@@ -2398,9 +2498,9 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
return -EAGAIN;
if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) {
- ret = ocfs2_journal_access(handle, inode,
- path_leaf_bh(right_path),
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_eb(handle, inode,
+ path_leaf_bh(right_path),
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out;
@@ -2417,8 +2517,8 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
* We have to update i_last_eb_blk during the meta
* data delete.
*/
- ret = ocfs2_journal_access(handle, inode, et_root_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_et_root_journal_access(handle, inode, et,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out;
@@ -2433,25 +2533,23 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle,
*/
BUG_ON(right_has_empty && !del_right_subtree);
- ret = ocfs2_journal_access(handle, inode, root_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
+ subtree_index);
if (ret) {
mlog_errno(ret);
goto out;
}
for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
- ret = ocfs2_journal_access(handle, inode,
- right_path->p_node[i].bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_path_bh_journal_access(handle, inode,
+ right_path, i);
if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_journal_access(handle, inode,
- left_path->p_node[i].bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_path_bh_journal_access(handle, inode,
+ left_path, i);
if (ret) {
mlog_errno(ret);
goto out;
@@ -2596,16 +2694,17 @@ out:
static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode,
handle_t *handle,
- struct buffer_head *bh,
- struct ocfs2_extent_list *el)
+ struct ocfs2_path *path)
{
int ret;
+ struct buffer_head *bh = path_leaf_bh(path);
+ struct ocfs2_extent_list *el = path_leaf_el(path);
if (!ocfs2_is_empty_extent(&el->l_recs[0]))
return 0;
- ret = ocfs2_journal_access(handle, inode, bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_path_bh_journal_access(handle, inode, path,
+ path_num_items(path) - 1);
if (ret) {
mlog_errno(ret);
goto out;
@@ -2644,8 +2743,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
goto out;
}
- left_path = ocfs2_new_path(path_root_bh(path),
- path_root_el(path));
+ left_path = ocfs2_new_path_from_path(path);
if (!left_path) {
ret = -ENOMEM;
mlog_errno(ret);
@@ -2654,8 +2752,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
ocfs2_cp_path(left_path, path);
- right_path = ocfs2_new_path(path_root_bh(path),
- path_root_el(path));
+ right_path = ocfs2_new_path_from_path(path);
if (!right_path) {
ret = -ENOMEM;
mlog_errno(ret);
@@ -2689,9 +2786,8 @@ static int __ocfs2_rotate_tree_left(struct inode *inode,
* Caller might still want to make changes to the
* tree root, so re-add it to the journal here.
*/
- ret = ocfs2_journal_access(handle, inode,
- path_root_bh(left_path),
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_path_bh_journal_access(handle, inode,
+ left_path, 0);
if (ret) {
mlog_errno(ret);
goto out;
@@ -2785,8 +2881,7 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle,
* We have a path to the left of this one - it needs
* an update too.
*/
- left_path = ocfs2_new_path(path_root_bh(path),
- path_root_el(path));
+ left_path = ocfs2_new_path_from_path(path);
if (!left_path) {
ret = -ENOMEM;
mlog_errno(ret);
@@ -2875,8 +2970,7 @@ rightmost_no_delete:
* it up front.
*/
ret = ocfs2_rotate_rightmost_leaf_left(inode, handle,
- path_leaf_bh(path),
- path_leaf_el(path));
+ path);
if (ret)
mlog_errno(ret);
goto out;
@@ -3027,8 +3121,7 @@ static int ocfs2_get_right_path(struct inode *inode,
/* This function shouldn't be called for the rightmost leaf. */
BUG_ON(right_cpos == 0);
- right_path = ocfs2_new_path(path_root_bh(left_path),
- path_root_el(left_path));
+ right_path = ocfs2_new_path_from_path(left_path);
if (!right_path) {
ret = -ENOMEM;
mlog_errno(ret);
@@ -3111,8 +3204,8 @@ static int ocfs2_merge_rec_right(struct inode *inode,
root_bh = left_path->p_node[subtree_index].bh;
BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
- ret = ocfs2_journal_access(handle, inode, root_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
+ subtree_index);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3120,17 +3213,15 @@ static int ocfs2_merge_rec_right(struct inode *inode,
for (i = subtree_index + 1;
i < path_num_items(right_path); i++) {
- ret = ocfs2_journal_access(handle, inode,
- right_path->p_node[i].bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_path_bh_journal_access(handle, inode,
+ right_path, i);
if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_journal_access(handle, inode,
- left_path->p_node[i].bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_path_bh_journal_access(handle, inode,
+ left_path, i);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3142,8 +3233,8 @@ static int ocfs2_merge_rec_right(struct inode *inode,
right_rec = &el->l_recs[index + 1];
}
- ret = ocfs2_journal_access(handle, inode, bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_path_bh_journal_access(handle, inode, left_path,
+ path_num_items(left_path) - 1);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3199,8 +3290,7 @@ static int ocfs2_get_left_path(struct inode *inode,
/* This function shouldn't be called for the leftmost leaf. */
BUG_ON(left_cpos == 0);
- left_path = ocfs2_new_path(path_root_bh(right_path),
- path_root_el(right_path));
+ left_path = ocfs2_new_path_from_path(right_path);
if (!left_path) {
ret = -ENOMEM;
mlog_errno(ret);
@@ -3283,8 +3373,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
root_bh = left_path->p_node[subtree_index].bh;
BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
- ret = ocfs2_journal_access(handle, inode, root_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
+ subtree_index);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3292,17 +3382,15 @@ static int ocfs2_merge_rec_left(struct inode *inode,
for (i = subtree_index + 1;
i < path_num_items(right_path); i++) {
- ret = ocfs2_journal_access(handle, inode,
- right_path->p_node[i].bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_path_bh_journal_access(handle, inode,
+ right_path, i);
if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_journal_access(handle, inode,
- left_path->p_node[i].bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_path_bh_journal_access(handle, inode,
+ left_path, i);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3314,8 +3402,8 @@ static int ocfs2_merge_rec_left(struct inode *inode,
has_empty_extent = 1;
}
- ret = ocfs2_journal_access(handle, inode, bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_path_bh_journal_access(handle, inode, right_path,
+ path_num_items(right_path) - 1);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3732,8 +3820,7 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
* leftmost leaf.
*/
if (left_cpos) {
- left_path = ocfs2_new_path(path_root_bh(right_path),
- path_root_el(right_path));
+ left_path = ocfs2_new_path_from_path(right_path);
if (!left_path) {
ret = -ENOMEM;
mlog_errno(ret);
@@ -3958,8 +4045,8 @@ static int ocfs2_do_insert_extent(struct inode *inode,
el = et->et_root_el;
- ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_et_root_journal_access(handle, inode, et,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3970,7 +4057,7 @@ static int ocfs2_do_insert_extent(struct inode *inode,
goto out_update_clusters;
}
- right_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
+ right_path = ocfs2_new_path_from_et(et);
if (!right_path) {
ret = -ENOMEM;
mlog_errno(ret);
@@ -4020,8 +4107,8 @@ static int ocfs2_do_insert_extent(struct inode *inode,
* ocfs2_rotate_tree_right() might have extended the
* transaction without re-journaling our tree root.
*/
- ret = ocfs2_journal_access(handle, inode, et->et_root_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_et_root_journal_access(handle, inode, et,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out;
@@ -4082,8 +4169,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
goto out;
if (left_cpos != 0) {
- left_path = ocfs2_new_path(path_root_bh(path),
- path_root_el(path));
+ left_path = ocfs2_new_path_from_path(path);
if (!left_path)
goto out;
@@ -4097,8 +4183,15 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
le16_to_cpu(new_el->l_count)) {
bh = path_leaf_bh(left_path);
eb = (struct ocfs2_extent_block *)bh->b_data;
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
- eb);
+ ocfs2_error(inode->i_sb,
+ "Extent block #%llu has an "
+ "invalid l_next_free_rec of "
+ "%d. It should have "
+ "matched the l_count of %d",
+ (unsigned long long)le64_to_cpu(eb->h_blkno),
+ le16_to_cpu(new_el->l_next_free_rec),
+ le16_to_cpu(new_el->l_count));
+ status = -EINVAL;
goto out;
}
rec = &new_el->l_recs[
@@ -4132,8 +4225,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
if (right_cpos == 0)
goto out;
- right_path = ocfs2_new_path(path_root_bh(path),
- path_root_el(path));
+ right_path = ocfs2_new_path_from_path(path);
if (!right_path)
goto out;
@@ -4147,8 +4239,12 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path,
if (le16_to_cpu(new_el->l_next_free_rec) <= 1) {
bh = path_leaf_bh(right_path);
eb = (struct ocfs2_extent_block *)bh->b_data;
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
- eb);
+ ocfs2_error(inode->i_sb,
+ "Extent block #%llu has an "
+ "invalid l_next_free_rec of %d",
+ (unsigned long long)le64_to_cpu(eb->h_blkno),
+ le16_to_cpu(new_el->l_next_free_rec));
+ status = -EINVAL;
goto out;
}
rec = &new_el->l_recs[1];
@@ -4294,7 +4390,9 @@ static int ocfs2_figure_insert_type(struct inode *inode,
* ocfs2_figure_insert_type() and ocfs2_add_branch()
* may want it later.
*/
- ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), &bh);
+ ret = ocfs2_read_extent_block(inode,
+ ocfs2_et_get_last_eb_blk(et),
+ &bh);
if (ret) {
mlog_exit(ret);
goto out;
@@ -4320,7 +4418,7 @@ static int ocfs2_figure_insert_type(struct inode *inode,
return 0;
}
- path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
+ path = ocfs2_new_path_from_et(et);
if (!path) {
ret = -ENOMEM;
mlog_errno(ret);
@@ -4531,9 +4629,9 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb,
BUG_ON(num_bits > clusters_to_add);
- /* reserve our write early -- insert_extent may update the inode */
- status = ocfs2_journal_access(handle, inode, et->et_root_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ /* reserve our write early -- insert_extent may update the tree root */
+ status = ocfs2_et_root_journal_access(handle, inode, et,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -4760,20 +4858,15 @@ static int __ocfs2_mark_extent_written(struct inode *inode,
if (path->p_tree_depth) {
struct ocfs2_extent_block *eb;
- ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et),
- &last_eb_bh);
+ ret = ocfs2_read_extent_block(inode,
+ ocfs2_et_get_last_eb_blk(et),
+ &last_eb_bh);
if (ret) {
mlog_exit(ret);
goto out;
}
eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
- ret = -EROFS;
- goto out;
- }
-
rightmost_el = &eb->h_list;
} else
rightmost_el = path_root_el(path);
@@ -4854,7 +4947,7 @@ int ocfs2_mark_extent_written(struct inode *inode,
if (et->et_ops == &ocfs2_dinode_et_ops)
ocfs2_extent_map_trunc(inode, 0);
- left_path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
+ left_path = ocfs2_new_path_from_et(et);
if (!left_path) {
ret = -ENOMEM;
mlog_errno(ret);
@@ -4918,8 +5011,9 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et,
depth = path->p_tree_depth;
if (depth > 0) {
- ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et),
- &last_eb_bh);
+ ret = ocfs2_read_extent_block(inode,
+ ocfs2_et_get_last_eb_blk(et),
+ &last_eb_bh);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -5025,8 +5119,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle,
}
if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) {
- left_path = ocfs2_new_path(path_root_bh(path),
- path_root_el(path));
+ left_path = ocfs2_new_path_from_path(path);
if (!left_path) {
ret = -ENOMEM;
mlog_errno(ret);
@@ -5135,7 +5228,7 @@ int ocfs2_remove_extent(struct inode *inode,
ocfs2_extent_map_trunc(inode, 0);
- path = ocfs2_new_path(et->et_root_bh, et->et_root_el);
+ path = ocfs2_new_path_from_et(et);
if (!path) {
ret = -ENOMEM;
mlog_errno(ret);
@@ -5255,6 +5348,78 @@ out:
return ret;
}
+/*
+ * Remove len clusters starting at logical offset cpos from the
+ * extent tree et, then append the matching physical range (starting
+ * at cluster phys_cpos) to the truncate log.
+ *
+ * The truncate log inode's i_mutex is held for the whole operation,
+ * and the truncate log is flushed first if it reports that it needs
+ * it.  Everything after ocfs2_start_trans() runs inside one
+ * transaction.
+ *
+ * Returns 0 on success or the first error encountered.
+ */
+int ocfs2_remove_btree_range(struct inode *inode,
+			     struct ocfs2_extent_tree *et,
+			     u32 cpos, u32 phys_cpos, u32 len,
+			     struct ocfs2_cached_dealloc_ctxt *dealloc)
+{
+	int ret;
+	u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct inode *tl_inode = osb->osb_tl_inode;
+	handle_t *handle;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+
+	ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	mutex_lock(&tl_inode->i_mutex);
+
+	if (ocfs2_truncate_log_needs_flush(osb)) {
+		ret = __ocfs2_flush_truncate_log(osb);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb));
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * From here on a transaction is running, so every failure must
+	 * leave through out_commit.  Jumping to out would skip
+	 * ocfs2_commit_trans() and leak the handle.
+	 */
+	ret = ocfs2_et_root_journal_access(handle, inode, et,
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac,
+				  dealloc);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	/* len is unsigned; -len is the wrapped delta the helper adds. */
+	ocfs2_et_update_clusters(inode, et, -len);
+
+	ret = ocfs2_journal_dirty(handle, et->et_root_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+out:
+	mutex_unlock(&tl_inode->i_mutex);
+
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	return ret;
+}
+
int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb)
{
struct buffer_head *tl_bh = osb->osb_tl_bh;
@@ -5308,13 +5473,13 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
di = (struct ocfs2_dinode *) tl_bh->b_data;
- tl = &di->id2.i_dealloc;
- if (!OCFS2_IS_VALID_DINODE(di)) {
- OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
- status = -EIO;
- goto bail;
- }
+ /* tl_bh is loaded from ocfs2_truncate_log_init(). It's validated
+ * by the underlying call to ocfs2_read_inode_block(), so any
+ * corruption is a code bug */
+ BUG_ON(!OCFS2_IS_VALID_DINODE(di));
+
+ tl = &di->id2.i_dealloc;
tl_count = le16_to_cpu(tl->tl_count);
mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) ||
tl_count == 0,
@@ -5332,8 +5497,8 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
goto bail;
}
- status = ocfs2_journal_access(handle, tl_inode, tl_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, tl_inode, tl_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -5394,8 +5559,8 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb,
while (i >= 0) {
/* Caller has given us at least enough credits to
* update the truncate log dinode */
- status = ocfs2_journal_access(handle, tl_inode, tl_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, tl_inode, tl_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -5464,13 +5629,13 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
BUG_ON(mutex_trylock(&tl_inode->i_mutex));
di = (struct ocfs2_dinode *) tl_bh->b_data;
- tl = &di->id2.i_dealloc;
- if (!OCFS2_IS_VALID_DINODE(di)) {
- OCFS2_RO_ON_INVALID_DINODE(osb->sb, di);
- status = -EIO;
- goto out;
- }
+ /* tl_bh is loaded from ocfs2_truncate_log_init(). It's validated
+ * by the underlying call to ocfs2_read_inode_block(), so any
+ * corruption is a code bug */
+ BUG_ON(!OCFS2_IS_VALID_DINODE(di));
+
+ tl = &di->id2.i_dealloc;
num_to_flush = le16_to_cpu(tl->tl_used);
mlog(0, "Flush %u records from truncate log #%llu\n",
num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno);
@@ -5586,7 +5751,7 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb,
goto bail;
}
- status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
+ status = ocfs2_read_inode_block(inode, &bh);
if (status < 0) {
iput(inode);
mlog_errno(status);
@@ -5625,13 +5790,13 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
}
di = (struct ocfs2_dinode *) tl_bh->b_data;
- tl = &di->id2.i_dealloc;
- if (!OCFS2_IS_VALID_DINODE(di)) {
- OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di);
- status = -EIO;
- goto bail;
- }
+ /* tl_bh is loaded from ocfs2_get_truncate_log_info(). It's
+ * validated by the underlying call to ocfs2_read_inode_block(),
+ * so any corruption is a code bug */
+ BUG_ON(!OCFS2_IS_VALID_DINODE(di));
+
+ tl = &di->id2.i_dealloc;
if (le16_to_cpu(tl->tl_used)) {
mlog(0, "We'll have %u logs to recover\n",
le16_to_cpu(tl->tl_used));
@@ -5651,6 +5816,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
* tl_used. */
tl->tl_used = 0;
+ ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check);
status = ocfs2_write_block(osb, tl_bh, tl_inode);
if (status < 0) {
mlog_errno(status);
@@ -5800,7 +5966,10 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
*/
/*
- * Describes a single block free from a suballocator
+ * Describes a single bit freed from a suballocator. For the block
+ * suballocators, it represents one block. For the global cluster
+ * allocator, it represents a run of clusters, and free_bit holds
+ * the number of clusters in that run.
*/
struct ocfs2_cached_block_free {
struct ocfs2_cached_block_free *free_next;
@@ -5815,10 +5984,10 @@ struct ocfs2_per_slot_free_list {
struct ocfs2_cached_block_free *f_first;
};
-static int ocfs2_free_cached_items(struct ocfs2_super *osb,
- int sysfile_type,
- int slot,
- struct ocfs2_cached_block_free *head)
+static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
+ int sysfile_type,
+ int slot,
+ struct ocfs2_cached_block_free *head)
{
int ret;
u64 bg_blkno;
@@ -5893,6 +6062,82 @@ out:
return ret;
}
+/*
+ * Record a (blkno, bit) pair for deferred freeing by pushing a new
+ * item onto the front of ctxt->c_global_allocator.
+ *
+ * Returns 0 on success or -ENOMEM if the item cannot be allocated,
+ * in which case the list is left untouched.
+ */
+int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
+				u64 blkno, unsigned int bit)
+{
+	struct ocfs2_cached_block_free *entry;
+	int ret;
+
+	entry = kmalloc(sizeof(*entry), GFP_NOFS);
+	if (!entry) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		return ret;
+	}
+
+	mlog(0, "Insert clusters: (bit %u, blk %llu)\n",
+	     bit, (unsigned long long)blkno);
+
+	entry->free_blk = blkno;
+	entry->free_bit = bit;
+
+	/* Link in at the head of the global allocator list. */
+	entry->free_next = ctxt->c_global_allocator;
+	ctxt->c_global_allocator = entry;
+
+	return 0;
+}
+
+/*
+ * Walk the list starting at head and append each item's
+ * (free_blk, free_bit) pair to the truncate log, one transaction
+ * per item, freeing the item afterwards.  The truncate log is
+ * flushed first whenever it reports that it needs it.
+ *
+ * The truncate log inode's i_mutex is held across the walk.  On the
+ * first failure the walk stops and the remaining items are freed
+ * without being logged.  Returns 0 or the error that stopped it.
+ */
+static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
+				      struct ocfs2_cached_block_free *head)
+{
+	struct inode *tl_inode = osb->osb_tl_inode;
+	struct ocfs2_cached_block_free *cur, *next;
+	handle_t *handle;
+	int ret = 0;
+
+	mutex_lock(&tl_inode->i_mutex);
+
+	while (head) {
+		if (ocfs2_truncate_log_needs_flush(osb)) {
+			ret = __ocfs2_flush_truncate_log(osb);
+			if (ret < 0) {
+				mlog_errno(ret);
+				break;
+			}
+		}
+
+		handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			mlog_errno(ret);
+			break;
+		}
+
+		cur = head;
+		ret = ocfs2_truncate_log_append(osb, handle, cur->free_blk,
+						cur->free_bit);
+
+		ocfs2_commit_trans(osb, handle);
+
+		/* The item is consumed whether or not the append worked. */
+		head = cur->free_next;
+		kfree(cur);
+
+		if (ret < 0) {
+			mlog_errno(ret);
+			break;
+		}
+	}
+
+	mutex_unlock(&tl_inode->i_mutex);
+
+	/* An early exit above may have left items behind; drop them. */
+	for (; head; head = next) {
+		next = head->free_next;
+		kfree(head);
+	}
+
+	return ret;
+}
+
int ocfs2_run_deallocs(struct ocfs2_super *osb,
struct ocfs2_cached_dealloc_ctxt *ctxt)
{
@@ -5908,8 +6153,10 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb,
if (fl->f_first) {
mlog(0, "Free items: (type %u, slot %d)\n",
fl->f_inode_type, fl->f_slot);
- ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type,
- fl->f_slot, fl->f_first);
+ ret2 = ocfs2_free_cached_blocks(osb,
+ fl->f_inode_type,
+ fl->f_slot,
+ fl->f_first);
if (ret2)
mlog_errno(ret2);
if (!ret)
@@ -5920,6 +6167,17 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb,
kfree(fl);
}
+ if (ctxt->c_global_allocator) {
+ ret2 = ocfs2_free_cached_clusters(osb,
+ ctxt->c_global_allocator);
+ if (ret2)
+ mlog_errno(ret2);
+ if (!ret)
+ ret = ret2;
+
+ ctxt->c_global_allocator = NULL;
+ }
+
return ret;
}
@@ -6075,11 +6333,10 @@ static int ocfs2_find_new_last_ext_blk(struct inode *inode,
eb = (struct ocfs2_extent_block *) bh->b_data;
el = &eb->h_list;
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
- ret = -EROFS;
- goto out;
- }
+
+ /* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block().
+ * Any corruption is a code bug. */
+ BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb));
*new_last_eb = bh;
get_bh(*new_last_eb);
@@ -6326,8 +6583,8 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
}
if (last_eb_bh) {
- status = ocfs2_journal_access(handle, inode, last_eb_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_eb(handle, inode, last_eb_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -6350,6 +6607,8 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
goto bail;
}
+ vfs_dq_free_space_nodirty(inode,
+ ocfs2_clusters_to_bytes(osb->sb, clusters_to_del));
spin_lock(&OCFS2_I(inode)->ip_lock);
OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
clusters_to_del;
@@ -6436,11 +6695,6 @@ static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle,
mlog_errno(ret);
else if (ocfs2_should_order_data(inode)) {
ret = ocfs2_jbd2_file_inode(handle, inode);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
- ret = walk_page_buffers(handle, page_buffers(page),
- from, to, &partial,
- ocfs2_journal_dirty_data);
-#endif
if (ret < 0)
mlog_errno(ret);
}
@@ -6663,6 +6917,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
struct page **pages = NULL;
loff_t end = osb->s_clustersize;
struct ocfs2_extent_tree et;
+ int did_quota = 0;
has_data = i_size_read(inode) ? 1 : 0;
@@ -6682,15 +6937,16 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
}
}
- handle = ocfs2_start_trans(osb, OCFS2_INLINE_TO_EXTENTS_CREDITS);
+ handle = ocfs2_start_trans(osb,
+ ocfs2_inline_to_extents_credits(osb->sb));
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
mlog_errno(ret);
goto out_unlock;
}
- ret = ocfs2_journal_access(handle, inode, di_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_di(handle, inode, di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out_commit;
@@ -6701,6 +6957,13 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
unsigned int page_end;
u64 phys;
+ if (vfs_dq_alloc_space_nodirty(inode,
+ ocfs2_clusters_to_bytes(osb->sb, 1))) {
+ ret = -EDQUOT;
+ goto out_commit;
+ }
+ did_quota = 1;
+
ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off,
&num);
if (ret) {
@@ -6774,6 +7037,10 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
}
out_commit:
+ if (ret < 0 && did_quota)
+ vfs_dq_free_space_nodirty(inode,
+ ocfs2_clusters_to_bytes(osb->sb, 1));
+
ocfs2_commit_trans(osb, handle);
out_unlock:
@@ -6813,7 +7080,8 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
i_size_read(inode));
- path = ocfs2_new_path(fe_bh, &di->id2.i_list);
+ path = ocfs2_new_path(fe_bh, &di->id2.i_list,
+ ocfs2_journal_access_di);
if (!path) {
status = -ENOMEM;
mlog_errno(status);
@@ -6984,20 +7252,14 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc);
if (fe->id2.i_list.l_tree_depth) {
- status = ocfs2_read_block(inode, le64_to_cpu(fe->i_last_eb_blk),
- &last_eb_bh);
+ status = ocfs2_read_extent_block(inode,
+ le64_to_cpu(fe->i_last_eb_blk),
+ &last_eb_bh);
if (status < 0) {
mlog_errno(status);
goto bail;
}
eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-
- brelse(last_eb_bh);
- status = -EIO;
- goto bail;
- }
}
(*tc)->tc_last_eb_bh = last_eb_bh;
@@ -7052,8 +7314,8 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh,
goto out;
}
- ret = ocfs2_journal_access(handle, inode, di_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_di(handle, inode, di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out_commit;
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 70257c84cfbe..cceff5c37f47 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -45,7 +45,9 @@
*
* ocfs2_extent_tree contains info for the root of the b-tree, it must have a
* root ocfs2_extent_list and a root_bh so that they can be used in the b-tree
- * functions.
+ * functions. With metadata ecc, we now call different journal_access
+ * functions for each type of metadata, so it must have the
+ * root_journal_access function.
* ocfs2_extent_tree_operations abstract the normal operations we do for
* the root of extent b-tree.
*/
@@ -54,6 +56,7 @@ struct ocfs2_extent_tree {
struct ocfs2_extent_tree_operations *et_ops;
struct buffer_head *et_root_bh;
struct ocfs2_extent_list *et_root_el;
+ ocfs2_journal_access_func et_root_journal_access;
void *et_object;
unsigned int et_max_leaf_clusters;
};
@@ -68,10 +71,18 @@ void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et,
void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et,
struct inode *inode,
struct buffer_head *bh);
+struct ocfs2_xattr_value_buf;
void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et,
struct inode *inode,
- struct buffer_head *bh,
- struct ocfs2_xattr_value_root *xv);
+ struct ocfs2_xattr_value_buf *vb);
+
+/*
+ * Read an extent block into *bh. If *bh is NULL, a bh will be
+ * allocated. This is a cached read. The extent block will be validated
+ * with ocfs2_validate_extent_block().
+ */
+int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno,
+ struct buffer_head **bh);
struct ocfs2_alloc_context;
int ocfs2_insert_extent(struct ocfs2_super *osb,
@@ -110,6 +121,11 @@ int ocfs2_remove_extent(struct inode *inode,
u32 cpos, u32 len, handle_t *handle,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_cached_dealloc_ctxt *dealloc);
+int ocfs2_remove_btree_range(struct inode *inode,
+ struct ocfs2_extent_tree *et,
+ u32 cpos, u32 phys_cpos, u32 len,
+ struct ocfs2_cached_dealloc_ctxt *dealloc);
+
int ocfs2_num_free_extents(struct ocfs2_super *osb,
struct inode *inode,
struct ocfs2_extent_tree *et);
@@ -167,10 +183,18 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb);
*/
struct ocfs2_cached_dealloc_ctxt {
struct ocfs2_per_slot_free_list *c_first_suballocator;
+ struct ocfs2_cached_block_free *c_global_allocator;
};
static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c)
{
c->c_first_suballocator = NULL;
+ c->c_global_allocator = NULL;
+}
+int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt,
+ u64 blkno, unsigned int bit);
+static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c)
+{
+ return c->c_global_allocator != NULL;
}
int ocfs2_run_deallocs(struct ocfs2_super *osb,
struct ocfs2_cached_dealloc_ctxt *ctxt);
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index c22543b33420..a067a6cffb01 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -27,6 +27,7 @@
#include <linux/swap.h>
#include <linux/pipe_fs_i.h>
#include <linux/mpage.h>
+#include <linux/quotaops.h>
#define MLOG_MASK_PREFIX ML_FILE_IO
#include <cluster/masklog.h>
@@ -68,20 +69,13 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
goto bail;
}
- status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
+ status = ocfs2_read_inode_block(inode, &bh);
if (status < 0) {
mlog_errno(status);
goto bail;
}
fe = (struct ocfs2_dinode *) bh->b_data;
- if (!OCFS2_IS_VALID_DINODE(fe)) {
- mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n",
- (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
- fe->i_signature);
- goto bail;
- }
-
if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
le32_to_cpu(fe->i_clusters))) {
mlog(ML_ERROR, "block offset is outside the allocated size: "
@@ -262,7 +256,7 @@ static int ocfs2_readpage_inline(struct inode *inode, struct page *page)
BUG_ON(!PageLocked(page));
BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
- ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
+ ret = ocfs2_read_inode_block(inode, &di_bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -481,12 +475,6 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
if (ocfs2_should_order_data(inode)) {
ret = ocfs2_jbd2_file_inode(handle, inode);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
- ret = walk_page_buffers(handle,
- page_buffers(page),
- from, to, NULL,
- ocfs2_journal_dirty_data);
-#endif
if (ret < 0)
mlog_errno(ret);
}
@@ -1072,15 +1060,8 @@ static void ocfs2_write_failure(struct inode *inode,
tmppage = wc->w_pages[i];
if (page_has_buffers(tmppage)) {
- if (ocfs2_should_order_data(inode)) {
+ if (ocfs2_should_order_data(inode))
ocfs2_jbd2_file_inode(wc->w_handle, inode);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
- walk_page_buffers(wc->w_handle,
- page_buffers(tmppage),
- from, to, NULL,
- ocfs2_journal_dirty_data);
-#endif
- }
block_commit_write(tmppage, from, to);
}
@@ -1531,8 +1512,8 @@ static int ocfs2_write_begin_inline(struct address_space *mapping,
goto out;
}
- ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
ocfs2_commit_trans(osb, handle);
@@ -1750,15 +1731,20 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
wc->w_handle = handle;
+ if (clusters_to_alloc && vfs_dq_alloc_space_nodirty(inode,
+ ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc))) {
+ ret = -EDQUOT;
+ goto out_commit;
+ }
/*
* We don't want this to fail in ocfs2_write_end(), so do it
* here.
*/
- ret = ocfs2_journal_access(handle, inode, wc->w_di_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
- goto out_commit;
+ goto out_quota;
}
/*
@@ -1771,14 +1757,14 @@ int ocfs2_write_begin_nolock(struct address_space *mapping,
mmap_page);
if (ret) {
mlog_errno(ret);
- goto out_commit;
+ goto out_quota;
}
ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
len);
if (ret) {
mlog_errno(ret);
- goto out_commit;
+ goto out_quota;
}
if (data_ac)
@@ -1790,6 +1776,10 @@ success:
*pagep = wc->w_target_page;
*fsdata = wc;
return 0;
+out_quota:
+ if (clusters_to_alloc)
+ vfs_dq_free_space(inode,
+ ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc));
out_commit:
ocfs2_commit_trans(osb, handle);
@@ -1919,15 +1909,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping,
}
if (page_has_buffers(tmppage)) {
- if (ocfs2_should_order_data(inode)) {
+ if (ocfs2_should_order_data(inode))
ocfs2_jbd2_file_inode(wc->w_handle, inode);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
- walk_page_buffers(wc->w_handle,
- page_buffers(tmppage),
- from, to, NULL,
- ocfs2_journal_dirty_data);
-#endif
- }
block_commit_write(tmppage, from, to);
}
}
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
new file mode 100644
index 000000000000..2a947c44e594
--- /dev/null
+++ b/fs/ocfs2/blockcheck.c
@@ -0,0 +1,477 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * blockcheck.c
+ *
+ * Checksum and ECC codes for the OCFS2 userspace library.
+ *
+ * Copyright (C) 2006, 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/crc32.h>
+#include <linux/buffer_head.h>
+#include <linux/bitops.h>
+#include <asm/byteorder.h>
+
+#include <cluster/masklog.h>
+
+#include "ocfs2.h"
+
+#include "blockcheck.h"
+
+
+/*
+ * We use the following conventions:
+ *
+ * d = # data bits
+ * p = # parity bits
+ * c = # total code bits (d + p)
+ */
+
+
+/*
+ * Calculate the bit offset in the hamming code buffer based on the bit's
+ * offset in the data buffer. Since the hamming code reserves all
+ * power-of-two bits for parity, the data bit number and the code bit
+ * number are offset by all the parity bits beforehand.
+ *
+ * Recall that bit numbers in hamming code are 1-based. This function
+ * takes the 0-based data bit from the caller.
+ *
+ * An example. Take bit 1 of the data buffer. 1 is a power of two (2^0),
+ * so it's a parity bit. 2 is a power of two (2^1), so it's a parity bit.
+ * 3 is not a power of two. So bit 1 of the data buffer ends up as bit 3
+ * in the code buffer.
+ *
+ * The caller can pass in *p if it wants to keep track of the most recent
+ * number of parity bits added. This allows the function to start the
+ * calculation at the last place.
+ */
+static unsigned int calc_code_bit(unsigned int i, unsigned int *p_cache)
+{
+ unsigned int b, p = 0;
+
+ /*
+ * Data bits are 0-based, but we're talking code bits, which
+ * are 1-based.
+ */
+ b = i + 1;
+
+ /* Start from the caller's cached parity-bit count, if one was given */
+ if (p_cache)
+ p = *p_cache;
+ b += p;
+
+ /*
+ * For every power of two below our bit number, bump our bit.
+ *
+ * We compare with (b + 1) because we have to compare with what b
+ * would be _if_ it were bumped up by the parity bit. Capice?
+ *
+ * p starts at 0 or at the cached count loaded above.
+ */
+ for (; (1 << p) < (b + 1); p++)
+ b++;
+ /* Hand the updated parity-bit count back for the caller's next call */
+ if (p_cache)
+ *p_cache = p;
+
+ return b;
+}
+
+/*
+ * This is the low level encoder function. It can be called across
+ * multiple hunks just like the crc32 code. 'd' is the number of bits
+ * _in_this_hunk_. nr is the bit offset of this hunk. So, if you had
+ * two 512B buffers, you would do it like so:
+ *
+ * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0);
+ * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8);
+ *
+ * If you just have one buffer, use ocfs2_hamming_encode_block().
+ */
+u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d, unsigned int nr)
+{
+ unsigned int i, b, p = 0;
+
+ BUG_ON(!d);
+
+ /*
+ * b is the hamming code bit number. Hamming code specifies a
+ * 1-based array, but C uses 0-based. So 'i' is for C, and 'b' is
+ * for the algorithm.
+ *
+ * The i++ in the for loop is so that the start offset passed
+ * to ocfs2_find_next_bit() is one greater than the previously
+ * found bit.
+ */
+ for (i = 0; (i = ocfs2_find_next_bit(data, d, i)) < d; i++)
+ {
+ /*
+ * i is the offset in this hunk, nr + i is the total bit
+ * offset.
+ */
+ b = calc_code_bit(nr + i, &p);
+
+ /*
+ * Data bits in the resultant code are checked by
+ * parity bits that are part of the bit number
+ * representation. Huh?
+ *
+ * <wikipedia href="http://en.wikipedia.org/wiki/Hamming_code">
+ * In other words, the parity bit at position 2^k
+ * checks bits in positions having bit k set in
+ * their binary representation. Conversely, for
+ * instance, bit 13, i.e. 1101(2), is checked by
+ * bits 1000(2) = 8, 0100(2)=4 and 0001(2) = 1.
+ * </wikipedia>
+ *
+ * Note that 'k' is the _code_ bit number. 'b' in
+ * our loop.
+ */
+ parity ^= b;
+ }
+
+ /* While the data buffer was treated as little endian, the
+ * return value is in host endian. */
+ return parity;
+}
+
+u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize)
+{
+ return ocfs2_hamming_encode(0, data, blocksize * 8, 0); /* one hunk: initial parity 0, bit offset 0 */
+}
+
+/*
+ * Like ocfs2_hamming_encode(), this can handle hunks. nr is the bit
+ * offset of the current hunk. If bit to be fixed is not part of the
+ * current hunk, this does nothing.
+ *
+ * If you only have one hunk, use ocfs2_hamming_fix_block().
+ */
+void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr,
+ unsigned int fix)
+{
+ unsigned int i, b;
+
+ BUG_ON(!d);
+
+ /*
+ * If the bit to fix has an hweight of 1, it's a parity bit. One
+ * busted parity bit is its own error. Nothing to do here.
+ */
+ if (hweight32(fix) == 1)
+ return;
+
+ /*
+ * nr + d is the bit right past the data hunk we're looking at.
+ * If fix is at or after that, nothing to do
+ */
+ if (fix >= calc_code_bit(nr + d, NULL))
+ return;
+
+ /*
+ * nr is the offset in the data hunk we're starting at. Let's
+ * start b at the offset in the code buffer. See hamming_encode()
+ * for a more detailed description of 'b'.
+ */
+ b = calc_code_bit(nr, NULL);
+ /* If the fix is before this hunk, nothing to do */
+ if (fix < b)
+ return;
+
+ for (i = 0; i < d; i++, b++)
+ {
+ /* Skip past parity bits */
+ while (hweight32(b) == 1)
+ b++;
+
+ /*
+ * i is the offset in this data hunk.
+ * nr + i is the offset in the total data buffer.
+ * b is the offset in the total code buffer.
+ *
+ * Thus, when b == fix, bit i in the current hunk needs
+ * fixing.
+ */
+ if (b == fix)
+ {
+ if (ocfs2_test_bit(i, data))
+ ocfs2_clear_bit(i, data);
+ else
+ ocfs2_set_bit(i, data);
+ break;
+ }
+ }
+}
+
+void ocfs2_hamming_fix_block(void *data, unsigned int blocksize,
+ unsigned int fix)
+{
+ ocfs2_hamming_fix(data, blocksize * 8, 0, fix); /* one hunk starting at bit offset 0 */
+}
+
+/*
+ * This function generates check information for a block.
+ * data is the block to be checked. bc is a pointer to the
+ * ocfs2_block_check structure describing the crc32 and the ecc.
+ *
+ * bc should be a pointer inside data, as the function will
+ * take care of zeroing it before calculating the check information. If
+ * bc does not point inside data, the caller must make sure any inline
+ * ocfs2_block_check structures are zeroed.
+ *
+ * The data buffer must be in on-disk endian (little endian for ocfs2).
+ * bc will be filled with little-endian values and will be ready to go to
+ * disk.
+ */
+void ocfs2_block_check_compute(void *data, size_t blocksize,
+ struct ocfs2_block_check *bc)
+{
+ u32 crc;
+ u32 ecc;
+
+ memset(bc, 0, sizeof(struct ocfs2_block_check));
+ /* bc stays zeroed while both codes are calculated over data. */
+ crc = crc32_le(~0, data, blocksize);
+ ecc = ocfs2_hamming_encode_block(data, blocksize);
+
+ /*
+ * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
+ * larger than 16 bits.
+ */
+ BUG_ON(ecc > USHORT_MAX);
+ /* Store both codes little-endian, ready to go to disk. */
+ bc->bc_crc32e = cpu_to_le32(crc);
+ bc->bc_ecc = cpu_to_le16((u16)ecc);
+}
+
+/*
+ * This function validates existing check information. Like _compute,
+ * the function will take care of zeroing bc before calculating check codes.
+ * If bc is not a pointer inside data, the caller must have zeroed any
+ * inline ocfs2_block_check structures.
+ *
+ * Again, the data passed in should be the on-disk endian.
+ */
+int ocfs2_block_check_validate(void *data, size_t blocksize,
+ struct ocfs2_block_check *bc)
+{
+ int rc = 0;
+ struct ocfs2_block_check check;
+ u32 crc, ecc;
+ /* Keep the stored codes in host endian; bc itself is zeroed below. */
+ check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
+ check.bc_ecc = le16_to_cpu(bc->bc_ecc);
+
+ memset(bc, 0, sizeof(struct ocfs2_block_check));
+
+ /* Fast path - if the crc32 validates, we're good to go */
+ crc = crc32_le(~0, data, blocksize);
+ if (crc == check.bc_crc32e)
+ goto out;
+
+ mlog(ML_ERROR,
+ "CRC32 failed: stored: %u, computed %u. Applying ECC.\n",
+ (unsigned int)check.bc_crc32e, (unsigned int)crc);
+
+ /* Ok, try ECC fixups; this may flip one bit of data in place */
+ ecc = ocfs2_hamming_encode_block(data, blocksize);
+ ocfs2_hamming_fix_block(data, blocksize, ecc ^ check.bc_ecc);
+
+ /* And check the crc32 again */
+ crc = crc32_le(~0, data, blocksize);
+ if (crc == check.bc_crc32e)
+ goto out;
+
+ mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
+ (unsigned int)check.bc_crc32e, (unsigned int)crc);
+
+ rc = -EIO;
+ /* Success and failure both put the stored codes back into bc. */
+out:
+ bc->bc_crc32e = cpu_to_le32(check.bc_crc32e);
+ bc->bc_ecc = cpu_to_le16(check.bc_ecc);
+
+ return rc;
+}
+
+/*
+ * This function generates check information for a list of buffer_heads.
+ * bhs is the blocks to be checked. bc is a pointer to the
+ * ocfs2_block_check structure describing the crc32 and the ecc.
+ *
+ * bc should be a pointer inside data, as the function will
+ * take care of zeroing it before calculating the check information. If
+ * bc does not point inside data, the caller must make sure any inline
+ * ocfs2_block_check structures are zeroed.
+ *
+ * The data buffer must be in on-disk endian (little endian for ocfs2).
+ * bc will be filled with little-endian values and will be ready to go to
+ * disk.
+ */
+void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
+ struct ocfs2_block_check *bc)
+{
+ int i;
+ u32 crc, ecc;
+
+ BUG_ON(nr < 0);
+ /* With no buffers, bc is left untouched. */
+ if (!nr)
+ return;
+
+ memset(bc, 0, sizeof(struct ocfs2_block_check));
+
+ for (i = 0, crc = ~0, ecc = 0; i < nr; i++) {
+ crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
+ /*
+ * The number of bits in a buffer is obviously b_size*8.
+ * The offset of this buffer is b_size*i, so the bit offset
+ * of this buffer is b_size*8*i.
+ */
+ ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data,
+ bhs[i]->b_size * 8,
+ bhs[i]->b_size * 8 * i);
+ }
+
+ /*
+ * No ecc'd ocfs2 structure is larger than 4K, so ecc fits in 16 bits.
+ * NOTE(review): the (u16) cast in the loop already truncates ecc, so this looks like it can never fire - confirm.
+ */
+ BUG_ON(ecc > USHORT_MAX);
+
+ bc->bc_crc32e = cpu_to_le32(crc);
+ bc->bc_ecc = cpu_to_le16((u16)ecc);
+}
+
+/*
+ * This function validates existing check information on a list of
+ * buffer_heads. Like _compute_bhs, the function will take care of
+ * zeroing bc before calculating check codes. If bc is not a pointer
+ * inside data, the caller must have zeroed any inline
+ * ocfs2_block_check structures.
+ *
+ * Again, the data passed in should be the on-disk endian.
+ */
+int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
+ struct ocfs2_block_check *bc)
+{
+ int i, rc = 0;
+ struct ocfs2_block_check check;
+ u32 crc, ecc, fix;
+
+ BUG_ON(nr < 0);
+
+ if (!nr)
+ return 0;
+ /* Keep the stored codes in host endian; bc itself is zeroed below. */
+ check.bc_crc32e = le32_to_cpu(bc->bc_crc32e);
+ check.bc_ecc = le16_to_cpu(bc->bc_ecc);
+
+ memset(bc, 0, sizeof(struct ocfs2_block_check));
+
+ /* Fast path - if the crc32 validates, we're good to go */
+ for (i = 0, crc = ~0; i < nr; i++)
+ crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
+ if (crc == check.bc_crc32e)
+ goto out;
+
+ mlog(ML_ERROR,
+ "CRC32 failed: stored: %u, computed %u. Applying ECC.\n",
+ (unsigned int)check.bc_crc32e, (unsigned int)crc);
+
+ /* Ok, try ECC fixups: recompute the ecc, then xor it with the stored one */
+ for (i = 0, ecc = 0; i < nr; i++) {
+ /*
+ * The number of bits in a buffer is obviously b_size*8.
+ * The offset of this buffer is b_size*i, so the bit offset
+ * of this buffer is b_size*8*i.
+ */
+ ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data,
+ bhs[i]->b_size * 8,
+ bhs[i]->b_size * 8 * i);
+ }
+ fix = ecc ^ check.bc_ecc;
+ for (i = 0; i < nr; i++) {
+ /*
+ * Try the fix against each buffer. It will affect at most
+ * one of them.
+ */
+ ocfs2_hamming_fix(bhs[i]->b_data, bhs[i]->b_size * 8,
+ bhs[i]->b_size * 8 * i, fix);
+ }
+
+ /* And check the crc32 again */
+ for (i = 0, crc = ~0; i < nr; i++)
+ crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size);
+ if (crc == check.bc_crc32e)
+ goto out;
+
+ mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n",
+ (unsigned int)check.bc_crc32e, (unsigned int)crc);
+
+ rc = -EIO;
+ /* Success and failure both put the stored codes back into bc. */
+out:
+ bc->bc_crc32e = cpu_to_le32(check.bc_crc32e);
+ bc->bc_ecc = cpu_to_le16(check.bc_ecc);
+
+ return rc;
+}
+
+/*
+ * These are the main API. They check the superblock flag before
+ * calling the underlying operations.
+ *
+ * They expect the buffer(s) to be in disk format.
+ */
+void ocfs2_compute_meta_ecc(struct super_block *sb, void *data,
+ struct ocfs2_block_check *bc)
+{
+ if (ocfs2_meta_ecc(OCFS2_SB(sb))) /* otherwise bc is left untouched */
+ ocfs2_block_check_compute(data, sb->s_blocksize, bc);
+}
+
+int ocfs2_validate_meta_ecc(struct super_block *sb, void *data,
+ struct ocfs2_block_check *bc)
+{
+ int rc = 0;
+ /* Returns 0 without checking anything when ocfs2_meta_ecc() is false. */
+ if (ocfs2_meta_ecc(OCFS2_SB(sb)))
+ rc = ocfs2_block_check_validate(data, sb->s_blocksize, bc);
+
+ return rc;
+}
+
+void ocfs2_compute_meta_ecc_bhs(struct super_block *sb,
+ struct buffer_head **bhs, int nr,
+ struct ocfs2_block_check *bc)
+{
+ if (ocfs2_meta_ecc(OCFS2_SB(sb))) /* otherwise bc is left untouched */
+ ocfs2_block_check_compute_bhs(bhs, nr, bc);
+}
+
+int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
+ struct buffer_head **bhs, int nr,
+ struct ocfs2_block_check *bc)
+{
+ int rc = 0;
+ /* Returns 0 without checking anything when ocfs2_meta_ecc() is false. */
+ if (ocfs2_meta_ecc(OCFS2_SB(sb)))
+ rc = ocfs2_block_check_validate_bhs(bhs, nr, bc);
+
+ return rc;
+}
+
diff --git a/fs/ocfs2/blockcheck.h b/fs/ocfs2/blockcheck.h
new file mode 100644
index 000000000000..70ec3feda32f
--- /dev/null
+++ b/fs/ocfs2/blockcheck.h
@@ -0,0 +1,82 @@
+/* -*- mode: c; c-basic-offset: 8; -*-
+ * vim: noexpandtab sw=8 ts=8 sts=0:
+ *
+ * blockcheck.h
+ *
+ * Checksum and ECC codes for the OCFS2 userspace library.
+ *
+ * Copyright (C) 2004, 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License, version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+
+#ifndef OCFS2_BLOCKCHECK_H
+#define OCFS2_BLOCKCHECK_H
+
+
+/* High level block API */
+void ocfs2_compute_meta_ecc(struct super_block *sb, void *data,
+ struct ocfs2_block_check *bc);
+int ocfs2_validate_meta_ecc(struct super_block *sb, void *data,
+ struct ocfs2_block_check *bc);
+void ocfs2_compute_meta_ecc_bhs(struct super_block *sb,
+ struct buffer_head **bhs, int nr,
+ struct ocfs2_block_check *bc);
+int ocfs2_validate_meta_ecc_bhs(struct super_block *sb,
+ struct buffer_head **bhs, int nr,
+ struct ocfs2_block_check *bc);
+
+/* Lower level API */
+void ocfs2_block_check_compute(void *data, size_t blocksize,
+ struct ocfs2_block_check *bc);
+int ocfs2_block_check_validate(void *data, size_t blocksize,
+ struct ocfs2_block_check *bc);
+void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
+ struct ocfs2_block_check *bc);
+int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr,
+ struct ocfs2_block_check *bc);
+
+/*
+ * Hamming code functions
+ */
+
+/*
+ * Encoding hamming code parity bits for a buffer.
+ *
+ * This is the low level encoder function. It can be called across
+ * multiple hunks just like the crc32 code. 'd' is the number of bits
+ * _in_this_hunk_. nr is the bit offset of this hunk. So, if you had
+ * two 512B buffers, you would do it like so:
+ *
+ * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0);
+ * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8);
+ *
+ * If you just have one buffer, use ocfs2_hamming_encode_block().
+ */
+u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d,
+ unsigned int nr);
+/*
+ * Fix a buffer with a bit error. The 'fix' is the original parity
+ * xor'd with the parity calculated now.
+ *
+ * Like ocfs2_hamming_encode(), this can handle hunks. nr is the bit
+ * offset of the current hunk. If bit to be fixed is not part of the
+ * current hunk, this does nothing.
+ *
+ * If you only have one buffer, use ocfs2_hamming_fix_block().
+ */
+void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr,
+ unsigned int fix);
+
+/* Convenience wrappers for a single buffer of data */
+extern u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize);
+extern void ocfs2_hamming_fix_block(void *data, unsigned int blocksize,
+ unsigned int fix);
+#endif
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index 3a178ec48d7c..15c8e6deee2e 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -39,6 +39,18 @@
#include "buffer_head_io.h"
+/*
+ * Bits on bh->b_state used by ocfs2.
+ *
+ * These MUST be after the JBD2 bits. Hence, we use BH_JBDPrivateStart.
+ */
+enum ocfs2_state_bits {
+ BH_NeedsValidate = BH_JBDPrivateStart, /* buffer was submitted for read with a validate callback that has not run yet */
+};
+
+/* Expand the magic b_state functions */
+BUFFER_FNS(NeedsValidate, needs_validate);
+
int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
struct inode *inode)
{
@@ -166,7 +178,9 @@ bail:
}
int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
- struct buffer_head *bhs[], int flags)
+ struct buffer_head *bhs[], int flags,
+ int (*validate)(struct super_block *sb,
+ struct buffer_head *bh))
{
int status = 0;
int i, ignore_cache = 0;
@@ -298,6 +312,8 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
clear_buffer_uptodate(bh);
get_bh(bh); /* for end_buffer_read_sync() */
+ if (validate)
+ set_buffer_needs_validate(bh);
bh->b_end_io = end_buffer_read_sync;
submit_bh(READ, bh);
continue;
@@ -328,6 +344,20 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
bhs[i] = NULL;
continue;
}
+
+ if (buffer_needs_validate(bh)) {
+ /* We never set NeedsValidate if the
+ * buffer was held by the journal, so
+ * that better not have changed */
+ BUG_ON(buffer_jbd(bh));
+ clear_buffer_needs_validate(bh);
+ status = validate(inode->i_sb, bh);
+ if (status) {
+ put_bh(bh);
+ bhs[i] = NULL;
+ continue;
+ }
+ }
}
/* Always set the buffer in the cache, even if it was
diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h
index 75e1dcb1ade7..c75d682dadd8 100644
--- a/fs/ocfs2/buffer_head_io.h
+++ b/fs/ocfs2/buffer_head_io.h
@@ -31,21 +31,24 @@
void ocfs2_end_buffer_io_sync(struct buffer_head *bh,
int uptodate);
-static inline int ocfs2_read_block(struct inode *inode,
- u64 off,
- struct buffer_head **bh);
-
int ocfs2_write_block(struct ocfs2_super *osb,
struct buffer_head *bh,
struct inode *inode);
-int ocfs2_read_blocks(struct inode *inode,
- u64 block,
- int nr,
- struct buffer_head *bhs[],
- int flags);
int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block,
unsigned int nr, struct buffer_head *bhs[]);
+/*
+ * If not NULL, validate() will be called on a buffer that is freshly
+ * read from disk. It will not be called if the buffer was in cache.
+ * Note that if validate() is being used for this buffer, it needs to
+ * be set even for a READAHEAD call, as it marks the buffer for later
+ * validation.
+ */
+int ocfs2_read_blocks(struct inode *inode, u64 block, int nr,
+ struct buffer_head *bhs[], int flags,
+ int (*validate)(struct super_block *sb,
+ struct buffer_head *bh));
+
int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
struct buffer_head *bh);
@@ -53,7 +56,9 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb,
#define OCFS2_BH_READAHEAD 8
static inline int ocfs2_read_block(struct inode *inode, u64 off,
- struct buffer_head **bh)
+ struct buffer_head **bh,
+ int (*validate)(struct super_block *sb,
+ struct buffer_head *bh))
{
int status = 0;
@@ -63,7 +68,7 @@ static inline int ocfs2_read_block(struct inode *inode, u64 off,
goto bail;
}
- status = ocfs2_read_blocks(inode, off, 1, bh, 0);
+ status = ocfs2_read_blocks(inode, off, 1, bh, 0, validate);
bail:
return status;
diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c
index d8a0cb92cef6..96df5416993e 100644
--- a/fs/ocfs2/cluster/masklog.c
+++ b/fs/ocfs2/cluster/masklog.c
@@ -110,6 +110,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = {
define_mask(QUORUM),
define_mask(EXPORT),
define_mask(XATTR),
+ define_mask(QUOTA),
define_mask(ERROR),
define_mask(NOTICE),
define_mask(KTHREAD),
diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h
index 57670c680471..7e72a81bc2d4 100644
--- a/fs/ocfs2/cluster/masklog.h
+++ b/fs/ocfs2/cluster/masklog.h
@@ -113,6 +113,7 @@
#define ML_QUORUM 0x0000000008000000ULL /* net connection quorum */
#define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */
#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */
+#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */
/* bits that are infrequently given and frequently matched in the high word */
#define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */
#define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 026e6eb85187..f2c4098cf337 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -40,6 +40,7 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
+#include <linux/quotaops.h>
#define MLOG_MASK_PREFIX ML_NAMEI
#include <cluster/masklog.h>
@@ -47,6 +48,7 @@
#include "ocfs2.h"
#include "alloc.h"
+#include "blockcheck.h"
#include "dir.h"
#include "dlmglue.h"
#include "extent_map.h"
@@ -82,47 +84,72 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
struct ocfs2_alloc_context *meta_ac,
struct buffer_head **new_bh);
-static struct buffer_head *ocfs2_bread(struct inode *inode,
- int block, int *err, int reada)
+/*
+ * These are distinct checks because future versions of the file system will
+ * want to have a trailing dirent structure independent of indexing.
+ */
+static int ocfs2_dir_has_trailer(struct inode *dir)
{
- struct buffer_head *bh = NULL;
- int tmperr;
- u64 p_blkno;
- int readflags = 0;
+ if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+ return 0;
- if (reada)
- readflags |= OCFS2_BH_READAHEAD;
+ return ocfs2_meta_ecc(OCFS2_SB(dir->i_sb));
+}
- if (((u64)block << inode->i_sb->s_blocksize_bits) >=
- i_size_read(inode)) {
- BUG_ON(!reada);
- return NULL;
- }
+static int ocfs2_supports_dir_trailer(struct ocfs2_super *osb)
+{
+ return ocfs2_meta_ecc(osb);
+}
- down_read(&OCFS2_I(inode)->ip_alloc_sem);
- tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
- NULL);
- up_read(&OCFS2_I(inode)->ip_alloc_sem);
- if (tmperr < 0) {
- mlog_errno(tmperr);
- goto fail;
- }
+static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb)
+{
+ return sb->s_blocksize - sizeof(struct ocfs2_dir_block_trailer);
+}
- tmperr = ocfs2_read_blocks(inode, p_blkno, 1, &bh, readflags);
- if (tmperr < 0)
- goto fail;
+#define ocfs2_trailer_from_bh(_bh, _sb) ((struct ocfs2_dir_block_trailer *) ((_bh)->b_data + ocfs2_dir_trailer_blk_off((_sb))))
- tmperr = 0;
+/* XXX ocfs2_block_dqtrailer() is similar but not quite - can we make
+ * them more consistent? */
+struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
+ void *data)
+{
+ char *p = data;
- *err = 0;
- return bh;
+ p += blocksize - sizeof(struct ocfs2_dir_block_trailer);
+ return (struct ocfs2_dir_block_trailer *)p;
+}
-fail:
- brelse(bh);
- bh = NULL;
+/*
+ * XXX: This is executed once on every dirent. We should consider optimizing
+ * it.
+ */
+static int ocfs2_skip_dir_trailer(struct inode *dir,
+ struct ocfs2_dir_entry *de,
+ unsigned long offset,
+ unsigned long blklen)
+{
+ unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer);
- *err = -EIO;
- return NULL;
+ if (!ocfs2_dir_has_trailer(dir))
+ return 0;
+
+ if (offset != toff)
+ return 0;
+
+ return 1;
+}
+
+static void ocfs2_init_dir_trailer(struct inode *inode,
+ struct buffer_head *bh)
+{
+ struct ocfs2_dir_block_trailer *trailer;
+
+ trailer = ocfs2_trailer_from_bh(bh, inode->i_sb);
+ strcpy(trailer->db_signature, OCFS2_DIR_TRAILER_SIGNATURE);
+ trailer->db_compat_rec_len =
+ cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer));
+ trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno);
+ trailer->db_blkno = cpu_to_le64(bh->b_blocknr);
}
/*
@@ -231,7 +258,7 @@ static struct buffer_head *ocfs2_find_entry_id(const char *name,
struct ocfs2_dinode *di;
struct ocfs2_inline_data *data;
- ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh);
+ ret = ocfs2_read_inode_block(dir, &di_bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -250,6 +277,108 @@ out:
return NULL;
}
+/*
+ * Validation callback for directory blocks; ocfs2_read_dir_block() passes
+ * it to ocfs2_read_virt_blocks().
+ *
+ * Only the metadata ecc kept in the block's trailer is checked here. The
+ * trailer signature and block numbers are checked by
+ * ocfs2_read_dir_block(), which has the inode available.
+ *
+ * Returns 0 on success, or whatever ocfs2_validate_meta_ecc() returned.
+ */
+static int ocfs2_validate_dir_block(struct super_block *sb,
+				    struct buffer_head *bh)
+{
+	int rc;
+	struct ocfs2_dir_block_trailer *trailer =
+		ocfs2_trailer_from_bh(bh, sb);
+
+	/*
+	 * We don't validate dirents here, that's handled
+	 * in-place when the code walks them.
+	 */
+	mlog(0, "Validating dirblock %llu\n",
+	     (unsigned long long)bh->b_blocknr);
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	/*
+	 * If the ecc fails, we return the error but otherwise
+	 * leave the filesystem running.  We know any error is
+	 * local to this block.
+	 *
+	 * Note that we are safe to call this even if the directory
+	 * doesn't have a trailer.  Filesystems without metaecc will do
+	 * nothing, and filesystems with it will have one.
+	 */
+	rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &trailer->db_check);
+	if (rc)
+		/* b_blocknr is a directory block here, not an inode block. */
+		mlog(ML_ERROR, "Checksum failed for dir block %llu\n",
+		     (unsigned long long)bh->b_blocknr);
+
+	return rc;
+}
+
+/*
+ * Read virtual block v_block of a directory and sanity check it.
+ *
+ * *bh may be NULL, in which case the buffer obtained from
+ * ocfs2_read_virt_blocks() is handed back through it on success.  If *bh
+ * is non-NULL it is passed down to ocfs2_read_virt_blocks() and is never
+ * released by the trailer checks below.
+ *
+ * Unless OCFS2_BH_READAHEAD is set, a directory that has a trailer gets
+ * the trailer's signature, db_blkno and db_parent_dinode verified.
+ *
+ * This function forces all errors to -EIO for consistency with its
+ * predecessor, ocfs2_bread().  We haven't audited what returning the
+ * real error codes would do to callers.  We log the real codes with
+ * mlog_errno() before we squash them.
+ */
+static int ocfs2_read_dir_block(struct inode *inode, u64 v_block,
+				struct buffer_head **bh, int flags)
+{
+	int rc = 0;
+	struct buffer_head *tmp = *bh;
+	struct ocfs2_dir_block_trailer *trailer;
+
+	rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags,
+				    ocfs2_validate_dir_block);
+	if (rc) {
+		mlog_errno(rc);
+		goto out;
+	}
+
+	/*
+	 * We check the trailer here rather than in
+	 * ocfs2_validate_dir_block() because that function doesn't have
+	 * the inode to test.
+	 */
+	if (!(flags & OCFS2_BH_READAHEAD) &&
+	    ocfs2_dir_has_trailer(inode)) {
+		trailer = ocfs2_trailer_from_bh(tmp, inode->i_sb);
+		if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) {
+			ocfs2_error(inode->i_sb,
+				    "Invalid dirblock #%llu: "
+				    "signature = %.*s\n",
+				    (unsigned long long)tmp->b_blocknr, 7,
+				    trailer->db_signature);
+			goto out_release;
+		}
+		if (le64_to_cpu(trailer->db_blkno) != tmp->b_blocknr) {
+			ocfs2_error(inode->i_sb,
+				    "Directory block #%llu has an invalid "
+				    "db_blkno of %llu",
+				    (unsigned long long)tmp->b_blocknr,
+				    (unsigned long long)le64_to_cpu(trailer->db_blkno));
+			goto out_release;
+		}
+		if (le64_to_cpu(trailer->db_parent_dinode) !=
+		    OCFS2_I(inode)->ip_blkno) {
+			/* Report the field that actually mismatched. */
+			ocfs2_error(inode->i_sb,
+				    "Directory block #%llu on dinode "
+				    "#%llu has an invalid parent_dinode "
+				    "of %llu",
+				    (unsigned long long)tmp->b_blocknr,
+				    (unsigned long long)OCFS2_I(inode)->ip_blkno,
+				    (unsigned long long)le64_to_cpu(trailer->db_parent_dinode));
+			goto out_release;
+		}
+	}
+
+	/* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */
+	if (!*bh)
+		*bh = tmp;
+
+	return 0;
+
+out_release:
+	/*
+	 * The read succeeded but the trailer is bad.  If the buffer was
+	 * obtained here (the caller passed in NULL) nobody else will drop
+	 * it, so release it now.  A caller-supplied bh stays with the caller.
+	 */
+	if (!*bh)
+		brelse(tmp);
+out:
+	return -EIO;
+}
+
static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen,
struct inode *dir,
struct ocfs2_dir_entry **res_dir)
@@ -296,15 +425,17 @@ restart:
}
num++;
- bh = ocfs2_bread(dir, b++, &err, 1);
+ bh = NULL;
+ err = ocfs2_read_dir_block(dir, b++, &bh,
+ OCFS2_BH_READAHEAD);
bh_use[ra_max] = bh;
}
}
if ((bh = bh_use[ra_ptr++]) == NULL)
goto next;
- if (ocfs2_read_block(dir, block, &bh)) {
+ if (ocfs2_read_dir_block(dir, block, &bh, 0)) {
/* read error, skip block & hope for the best.
- * ocfs2_read_block() has released the bh. */
+ * ocfs2_read_dir_block() has released the bh. */
ocfs2_error(dir->i_sb, "reading directory %llu, "
"offset %lu\n",
(unsigned long long)OCFS2_I(dir)->ip_blkno,
@@ -381,14 +512,18 @@ int ocfs2_update_entry(struct inode *dir, handle_t *handle,
struct inode *new_entry_inode)
{
int ret;
+ ocfs2_journal_access_func access = ocfs2_journal_access_db;
/*
* The same code works fine for both inline-data and extent
- * based directories, so no need to split this up.
+ * based directories, so no need to split this up. The only
+ * difference is the journal_access function.
*/
- ret = ocfs2_journal_access(handle, dir, de_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+ access = ocfs2_journal_access_di;
+
+ ret = access(handle, dir, de_bh, OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out;
@@ -410,9 +545,13 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
{
struct ocfs2_dir_entry *de, *pde;
int i, status = -ENOENT;
+ ocfs2_journal_access_func access = ocfs2_journal_access_db;
mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh);
+ if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
+ access = ocfs2_journal_access_di;
+
i = 0;
pde = NULL;
de = (struct ocfs2_dir_entry *) first_de;
@@ -423,8 +562,8 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir,
goto bail;
}
if (de == de_del) {
- status = ocfs2_journal_access(handle, dir, bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = access(handle, dir, bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
status = -EIO;
mlog_errno(status);
@@ -458,7 +597,7 @@ static inline int ocfs2_delete_entry_id(handle_t *handle,
struct ocfs2_dinode *di;
struct ocfs2_inline_data *data;
- ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh);
+ ret = ocfs2_read_inode_block(dir, &di_bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -576,6 +715,16 @@ int __ocfs2_add_entry(handle_t *handle,
goto bail;
}
+ /* We're guaranteed that we should have space, so we
+ * can't possibly have hit the trailer...right? */
+ mlog_bug_on_msg(ocfs2_skip_dir_trailer(dir, de, offset, size),
+ "Hit dir trailer trying to insert %.*s "
+ "(namelen %d) into directory %llu. "
+ "offset is %lu, trailer offset is %d\n",
+ namelen, name, namelen,
+ (unsigned long long)parent_fe_bh->b_blocknr,
+ offset, ocfs2_dir_trailer_blk_off(dir->i_sb));
+
if (ocfs2_dirent_would_fit(de, rec_len)) {
dir->i_mtime = dir->i_ctime = CURRENT_TIME;
retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
@@ -584,8 +733,14 @@ int __ocfs2_add_entry(handle_t *handle,
goto bail;
}
- status = ocfs2_journal_access(handle, dir, insert_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ if (insert_bh == parent_fe_bh)
+ status = ocfs2_journal_access_di(handle, dir,
+ insert_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ else
+ status = ocfs2_journal_access_db(handle, dir,
+ insert_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
/* By now the buffer is marked for journaling */
offset += le16_to_cpu(de->rec_len);
if (le64_to_cpu(de->inode)) {
@@ -611,6 +766,7 @@ int __ocfs2_add_entry(handle_t *handle,
retval = 0;
goto bail;
}
+
offset += le16_to_cpu(de->rec_len);
de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len));
}
@@ -636,7 +792,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode,
struct ocfs2_inline_data *data;
struct ocfs2_dir_entry *de;
- ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
+ ret = ocfs2_read_inode_block(inode, &di_bh);
if (ret) {
mlog(ML_ERROR, "Unable to read inode block for dir %llu\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -724,7 +880,6 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
int i, stored;
struct buffer_head * bh, * tmp;
struct ocfs2_dir_entry * de;
- int err;
struct super_block * sb = inode->i_sb;
unsigned int ra_sectors = 16;
@@ -735,12 +890,8 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
while (!error && !stored && *f_pos < i_size_read(inode)) {
blk = (*f_pos) >> sb->s_blocksize_bits;
- bh = ocfs2_bread(inode, blk, &err, 0);
- if (!bh) {
- mlog(ML_ERROR,
- "directory #%llu contains a hole at offset %lld\n",
- (unsigned long long)OCFS2_I(inode)->ip_blkno,
- *f_pos);
+ if (ocfs2_read_dir_block(inode, blk, &bh, 0)) {
+ /* Skip the corrupt dirblock and keep trying */
*f_pos += sb->s_blocksize - offset;
continue;
}
@@ -754,8 +905,10 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode,
|| (((last_ra_blk - blk) << 9) <= (ra_sectors / 2))) {
for (i = ra_sectors >> (sb->s_blocksize_bits - 9);
i > 0; i--) {
- tmp = ocfs2_bread(inode, ++blk, &err, 1);
- brelse(tmp);
+ tmp = NULL;
+ if (!ocfs2_read_dir_block(inode, ++blk, &tmp,
+ OCFS2_BH_READAHEAD))
+ brelse(tmp);
}
last_ra_blk = blk;
ra_sectors = 8;
@@ -828,6 +981,7 @@ revalidate:
}
offset = 0;
brelse(bh);
+ bh = NULL;
}
stored = 0;
@@ -1050,9 +1204,15 @@ int ocfs2_empty_dir(struct inode *inode)
return !priv.seen_other;
}
-static void ocfs2_fill_initial_dirents(struct inode *inode,
- struct inode *parent,
- char *start, unsigned int size)
+/*
+ * Fills "." and ".." dirents in a new directory block. Returns dirent for
+ * "..", which might be used during creation of a directory with a trailing
+ * header. It is otherwise safe to ignore the return code.
+ */
+static struct ocfs2_dir_entry *ocfs2_fill_initial_dirents(struct inode *inode,
+ struct inode *parent,
+ char *start,
+ unsigned int size)
{
struct ocfs2_dir_entry *de = (struct ocfs2_dir_entry *)start;
@@ -1069,6 +1229,8 @@ static void ocfs2_fill_initial_dirents(struct inode *inode,
de->name_len = 2;
strcpy(de->name, "..");
ocfs2_set_de_type(de, S_IFDIR);
+
+ return de;
}
/*
@@ -1086,8 +1248,8 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb,
struct ocfs2_inline_data *data = &di->id2.i_data;
unsigned int size = le16_to_cpu(data->id_count);
- ret = ocfs2_journal_access(handle, inode, di_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_di(handle, inode, di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out;
@@ -1121,10 +1283,15 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
struct ocfs2_alloc_context *data_ac)
{
int status;
+ unsigned int size = osb->sb->s_blocksize;
struct buffer_head *new_bh = NULL;
+ struct ocfs2_dir_entry *de;
mlog_entry_void();
+ if (ocfs2_supports_dir_trailer(osb))
+ size = ocfs2_dir_trailer_blk_off(parent->i_sb);
+
status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh,
data_ac, NULL, &new_bh);
if (status < 0) {
@@ -1134,16 +1301,17 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb,
ocfs2_set_new_buffer_uptodate(inode, new_bh);
- status = ocfs2_journal_access(handle, inode, new_bh,
- OCFS2_JOURNAL_ACCESS_CREATE);
+ status = ocfs2_journal_access_db(handle, inode, new_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
memset(new_bh->b_data, 0, osb->sb->s_blocksize);
- ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data,
- osb->sb->s_blocksize);
+ de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size);
+ if (ocfs2_supports_dir_trailer(osb))
+ ocfs2_init_dir_trailer(inode, new_bh);
status = ocfs2_journal_dirty(handle, new_bh);
if (status < 0) {
@@ -1184,13 +1352,27 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb,
data_ac);
}
+/*
+ * Expand rec_len of the rightmost dirent in a directory block so that it
+ * contains the end of our valid space for dirents. We do this during
+ * expansion from an inline directory to one with extents. The first dir block
+ * in that case is taken from the inline data portion of the inode block.
+ *
+ * We add the dir trailer if this filesystem wants it.
+ */
static void ocfs2_expand_last_dirent(char *start, unsigned int old_size,
- unsigned int new_size)
+ struct super_block *sb)
{
struct ocfs2_dir_entry *de;
struct ocfs2_dir_entry *prev_de;
char *de_buf, *limit;
- unsigned int bytes = new_size - old_size;
+ unsigned int new_size = sb->s_blocksize;
+ unsigned int bytes;
+
+ if (ocfs2_supports_dir_trailer(OCFS2_SB(sb)))
+ new_size = ocfs2_dir_trailer_blk_off(sb);
+
+ bytes = new_size - old_size;
limit = start + old_size;
de_buf = start;
@@ -1216,9 +1398,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
unsigned int blocks_wanted,
struct buffer_head **first_block_bh)
{
- int ret, credits = OCFS2_INLINE_TO_EXTENTS_CREDITS;
u32 alloc, bit_off, len;
struct super_block *sb = dir->i_sb;
+ int ret, credits = ocfs2_inline_to_extents_credits(sb);
u64 blkno, bytes = blocks_wanted << sb->s_blocksize_bits;
struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
struct ocfs2_inode_info *oi = OCFS2_I(dir);
@@ -1227,6 +1409,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
handle_t *handle;
struct ocfs2_extent_tree et;
+ int did_quota = 0;
ocfs2_init_dinode_extent_tree(&et, dir, di_bh);
@@ -1264,6 +1447,12 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
goto out_sem;
}
+ if (vfs_dq_alloc_space_nodirty(dir,
+ ocfs2_clusters_to_bytes(osb->sb, alloc))) {
+ ret = -EDQUOT;
+ goto out_commit;
+ }
+ did_quota = 1;
/*
* Try to claim as many clusters as the bitmap can give though
* if we only get one now, that's enough to continue. The rest
@@ -1290,8 +1479,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
ocfs2_set_new_buffer_uptodate(dir, dirdata_bh);
- ret = ocfs2_journal_access(handle, dir, dirdata_bh,
- OCFS2_JOURNAL_ACCESS_CREATE);
+ ret = ocfs2_journal_access_db(handle, dir, dirdata_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
if (ret) {
mlog_errno(ret);
goto out_commit;
@@ -1300,8 +1489,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir));
memset(dirdata_bh->b_data + i_size_read(dir), 0,
sb->s_blocksize - i_size_read(dir));
- ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir),
- sb->s_blocksize);
+ ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), sb);
+ if (ocfs2_supports_dir_trailer(osb))
+ ocfs2_init_dir_trailer(dir, dirdata_bh);
ret = ocfs2_journal_dirty(handle, dirdata_bh);
if (ret) {
@@ -1317,8 +1507,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
* We let the later dirent insert modify c/mtime - to the user
* the data hasn't changed.
*/
- ret = ocfs2_journal_access(handle, dir, di_bh,
- OCFS2_JOURNAL_ACCESS_CREATE);
+ ret = ocfs2_journal_access_di(handle, dir, di_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
if (ret) {
mlog_errno(ret);
goto out_commit;
@@ -1386,6 +1576,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh,
dirdata_bh = NULL;
out_commit:
+ if (ret < 0 && did_quota)
+ vfs_dq_free_space_nodirty(dir,
+ ocfs2_clusters_to_bytes(osb->sb, 2));
ocfs2_commit_trans(osb, handle);
out_sem:
@@ -1410,7 +1603,7 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
struct buffer_head **new_bh)
{
int status;
- int extend;
+ int extend, did_quota = 0;
u64 p_blkno, v_blkno;
spin_lock(&OCFS2_I(dir)->ip_lock);
@@ -1420,6 +1613,13 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
if (extend) {
u32 offset = OCFS2_I(dir)->ip_clusters;
+ if (vfs_dq_alloc_space_nodirty(dir,
+ ocfs2_clusters_to_bytes(sb, 1))) {
+ status = -EDQUOT;
+ goto bail;
+ }
+ did_quota = 1;
+
status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset,
1, 0, parent_fe_bh, handle,
data_ac, meta_ac, NULL);
@@ -1445,6 +1645,8 @@ static int ocfs2_do_extend_dir(struct super_block *sb,
}
status = 0;
bail:
+ if (did_quota && status < 0)
+ vfs_dq_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1));
mlog_exit(status);
return status;
}
@@ -1569,16 +1771,22 @@ do_extend:
ocfs2_set_new_buffer_uptodate(dir, new_bh);
- status = ocfs2_journal_access(handle, dir, new_bh,
- OCFS2_JOURNAL_ACCESS_CREATE);
+ status = ocfs2_journal_access_db(handle, dir, new_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
goto bail;
}
memset(new_bh->b_data, 0, sb->s_blocksize);
+
de = (struct ocfs2_dir_entry *) new_bh->b_data;
de->inode = 0;
- de->rec_len = cpu_to_le16(sb->s_blocksize);
+ if (ocfs2_dir_has_trailer(dir)) {
+ de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb));
+ ocfs2_init_dir_trailer(dir, new_bh);
+ } else {
+ de->rec_len = cpu_to_le16(sb->s_blocksize);
+ }
status = ocfs2_journal_dirty(handle, new_bh);
if (status < 0) {
mlog_errno(status);
@@ -1620,11 +1828,21 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
unsigned int *blocks_wanted)
{
int ret;
+ struct super_block *sb = dir->i_sb;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
struct ocfs2_dir_entry *de, *last_de = NULL;
char *de_buf, *limit;
unsigned long offset = 0;
- unsigned int rec_len, new_rec_len;
+ unsigned int rec_len, new_rec_len, free_space = dir->i_sb->s_blocksize;
+
+ /*
+ * This calculates how many free bytes we'd have in block zero, should
+ * this function force expansion to an extent tree.
+ */
+ if (ocfs2_supports_dir_trailer(OCFS2_SB(sb)))
+ free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir);
+ else
+ free_space = dir->i_sb->s_blocksize - i_size_read(dir);
de_buf = di->id2.i_data.id_data;
limit = de_buf + i_size_read(dir);
@@ -1641,6 +1859,11 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
ret = -EEXIST;
goto out;
}
+ /*
+ * No need to check for a trailing dirent record here as
+ * they're not used for inline dirs.
+ */
+
if (ocfs2_dirent_would_fit(de, rec_len)) {
/* Ok, we found a spot. Return this bh and let
* the caller actually fill it in. */
@@ -1661,7 +1884,7 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh,
* dirent can be found.
*/
*blocks_wanted = 1;
- new_rec_len = le16_to_cpu(last_de->rec_len) + (dir->i_sb->s_blocksize - i_size_read(dir));
+ new_rec_len = le16_to_cpu(last_de->rec_len) + free_space;
if (new_rec_len < (rec_len + OCFS2_DIR_REC_LEN(last_de->name_len)))
*blocks_wanted = 2;
@@ -1679,9 +1902,10 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
struct ocfs2_dir_entry *de;
struct super_block *sb = dir->i_sb;
int status;
+ int blocksize = dir->i_sb->s_blocksize;
- bh = ocfs2_bread(dir, 0, &status, 0);
- if (!bh) {
+ status = ocfs2_read_dir_block(dir, 0, &bh, 0);
+ if (status) {
mlog_errno(status);
goto bail;
}
@@ -1702,11 +1926,10 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
status = -ENOSPC;
goto bail;
}
- bh = ocfs2_bread(dir,
- offset >> sb->s_blocksize_bits,
- &status,
- 0);
- if (!bh) {
+ status = ocfs2_read_dir_block(dir,
+ offset >> sb->s_blocksize_bits,
+ &bh, 0);
+ if (status) {
mlog_errno(status);
goto bail;
}
@@ -1721,6 +1944,11 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
status = -EEXIST;
goto bail;
}
+
+ if (ocfs2_skip_dir_trailer(dir, de, offset % blocksize,
+ blocksize))
+ goto next;
+
if (ocfs2_dirent_would_fit(de, rec_len)) {
/* Ok, we found a spot. Return this bh and let
* the caller actually fill it in. */
@@ -1729,6 +1957,7 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
status = 0;
goto bail;
}
+next:
offset += le16_to_cpu(de->rec_len);
de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len));
}
diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h
index ce48b9080d87..c511e2e18e9f 100644
--- a/fs/ocfs2/dir.h
+++ b/fs/ocfs2/dir.h
@@ -83,4 +83,6 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb,
struct buffer_head *fe_bh,
struct ocfs2_alloc_context *data_ac);
+struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize,
+ void *data);
#endif /* OCFS2_DIR_H */
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 644bee55d8ba..d07ddbe4b283 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -275,6 +275,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
struct list_head *iter, *head=NULL;
u64 cookie;
u32 flags;
+ u8 node;
if (!dlm_grab(dlm)) {
dlm_error(DLM_REJECTED);
@@ -286,18 +287,21 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
name = past->name;
locklen = past->namelen;
- cookie = be64_to_cpu(past->cookie);
+ cookie = past->cookie;
flags = be32_to_cpu(past->flags);
+ node = past->node_idx;
if (locklen > DLM_LOCKID_NAME_MAX) {
ret = DLM_IVBUFLEN;
- mlog(ML_ERROR, "Invalid name length in proxy ast handler!\n");
+ mlog(ML_ERROR, "Invalid name length (%d) in proxy ast "
+ "handler!\n", locklen);
goto leave;
}
if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) ==
(LKM_PUT_LVB|LKM_GET_LVB)) {
- mlog(ML_ERROR, "both PUT and GET lvb specified\n");
+ mlog(ML_ERROR, "Both PUT and GET lvb specified, (0x%x)\n",
+ flags);
ret = DLM_BADARGS;
goto leave;
}
@@ -310,22 +314,21 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
if (past->type != DLM_AST &&
past->type != DLM_BAST) {
mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu"
- "name=%.*s\n", past->type,
- dlm_get_lock_cookie_node(cookie),
- dlm_get_lock_cookie_seq(cookie),
- locklen, name);
+ "name=%.*s, node=%u\n", past->type,
+ dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
+ locklen, name, node);
ret = DLM_IVLOCKID;
goto leave;
}
res = dlm_lookup_lockres(dlm, name, locklen);
if (!res) {
- mlog(0, "got %sast for unknown lockres! "
- "cookie=%u:%llu, name=%.*s, namelen=%u\n",
- past->type == DLM_AST ? "" : "b",
- dlm_get_lock_cookie_node(cookie),
- dlm_get_lock_cookie_seq(cookie),
- locklen, name, locklen);
+ mlog(0, "Got %sast for unknown lockres! cookie=%u:%llu, "
+ "name=%.*s, node=%u\n", (past->type == DLM_AST ? "" : "b"),
+ dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
+ locklen, name, node);
ret = DLM_IVLOCKID;
goto leave;
}
@@ -337,12 +340,12 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
spin_lock(&res->spinlock);
if (res->state & DLM_LOCK_RES_RECOVERING) {
- mlog(0, "responding with DLM_RECOVERING!\n");
+ mlog(0, "Responding with DLM_RECOVERING!\n");
ret = DLM_RECOVERING;
goto unlock_out;
}
if (res->state & DLM_LOCK_RES_MIGRATING) {
- mlog(0, "responding with DLM_MIGRATING!\n");
+ mlog(0, "Responding with DLM_MIGRATING!\n");
ret = DLM_MIGRATING;
goto unlock_out;
}
@@ -351,7 +354,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
lock = NULL;
list_for_each(iter, head) {
lock = list_entry (iter, struct dlm_lock, list);
- if (be64_to_cpu(lock->ml.cookie) == cookie)
+ if (lock->ml.cookie == cookie)
goto do_ast;
}
@@ -363,15 +366,15 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
list_for_each(iter, head) {
lock = list_entry (iter, struct dlm_lock, list);
- if (be64_to_cpu(lock->ml.cookie) == cookie)
+ if (lock->ml.cookie == cookie)
goto do_ast;
}
- mlog(0, "got %sast for unknown lock! cookie=%u:%llu, "
- "name=%.*s, namelen=%u\n", past->type == DLM_AST ? "" : "b",
- dlm_get_lock_cookie_node(cookie),
- dlm_get_lock_cookie_seq(cookie),
- locklen, name, locklen);
+ mlog(0, "Got %sast for unknown lock! cookie=%u:%llu, name=%.*s, "
+ "node=%u\n", past->type == DLM_AST ? "" : "b",
+ dlm_get_lock_cookie_node(be64_to_cpu(cookie)),
+ dlm_get_lock_cookie_seq(be64_to_cpu(cookie)),
+ locklen, name, node);
ret = DLM_NORMAL;
unlock_out:
@@ -383,8 +386,8 @@ do_ast:
if (past->type == DLM_AST) {
/* do not alter lock refcount. switching lists. */
list_move_tail(&lock->list, &res->granted);
- mlog(0, "ast: adding to granted list... type=%d, "
- "convert_type=%d\n", lock->ml.type, lock->ml.convert_type);
+ mlog(0, "ast: Adding to granted list... type=%d, "
+ "convert_type=%d\n", lock->ml.type, lock->ml.convert_type);
if (lock->ml.convert_type != LKM_IVMODE) {
lock->ml.type = lock->ml.convert_type;
lock->ml.convert_type = LKM_IVMODE;
@@ -408,7 +411,6 @@ do_ast:
dlm_do_local_bast(dlm, res, lock, past->blocked_type);
leave:
-
if (res)
dlm_lockres_put(res);
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index d5a86fb81a49..bb53714813ab 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -140,6 +140,7 @@ struct dlm_ctxt
unsigned int purge_count;
spinlock_t spinlock;
spinlock_t ast_lock;
+ spinlock_t track_lock;
char *name;
u8 node_num;
u32 key;
@@ -316,6 +317,8 @@ struct dlm_lock_resource
* put on a list for the dlm thread to run. */
unsigned long last_used;
+ struct dlm_ctxt *dlm;
+
unsigned migration_pending:1;
atomic_t asts_reserved;
spinlock_t spinlock;
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 1b81dcba175d..b32f60a5acfb 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -630,43 +630,38 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos)
{
struct debug_lockres *dl = m->private;
struct dlm_ctxt *dlm = dl->dl_ctxt;
+ struct dlm_lock_resource *oldres = dl->dl_res;
struct dlm_lock_resource *res = NULL;
+ struct list_head *track_list;
- spin_lock(&dlm->spinlock);
+ spin_lock(&dlm->track_lock);
+ if (oldres)
+ track_list = &oldres->tracking;
+ else
+ track_list = &dlm->tracking_list;
- if (dl->dl_res) {
- list_for_each_entry(res, &dl->dl_res->tracking, tracking) {
- if (dl->dl_res) {
- dlm_lockres_put(dl->dl_res);
- dl->dl_res = NULL;
- }
- if (&res->tracking == &dlm->tracking_list) {
- mlog(0, "End of list found, %p\n", res);
- dl = NULL;
- break;
- }
+ list_for_each_entry(res, track_list, tracking) {
+ if (&res->tracking == &dlm->tracking_list)
+ res = NULL;
+ else
dlm_lockres_get(res);
- dl->dl_res = res;
- break;
- }
- } else {
- if (!list_empty(&dlm->tracking_list)) {
- list_for_each_entry(res, &dlm->tracking_list, tracking)
- break;
- dlm_lockres_get(res);
- dl->dl_res = res;
- } else
- dl = NULL;
+ break;
}
+ spin_unlock(&dlm->track_lock);
- if (dl) {
- spin_lock(&dl->dl_res->spinlock);
- dump_lockres(dl->dl_res, dl->dl_buf, dl->dl_len - 1);
- spin_unlock(&dl->dl_res->spinlock);
- }
+ if (oldres)
+ dlm_lockres_put(oldres);
- spin_unlock(&dlm->spinlock);
+ dl->dl_res = res;
+
+ if (res) {
+ spin_lock(&res->spinlock);
+ dump_lockres(res, dl->dl_buf, dl->dl_len - 1);
+ spin_unlock(&res->spinlock);
+ } else
+ dl = NULL;
+ /* passed to seq_show */
return dl;
}
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 63f8125824e8..d8d578f45613 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -1550,6 +1550,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
spin_lock_init(&dlm->spinlock);
spin_lock_init(&dlm->master_lock);
spin_lock_init(&dlm->ast_lock);
+ spin_lock_init(&dlm->track_lock);
INIT_LIST_HEAD(&dlm->list);
INIT_LIST_HEAD(&dlm->dirty_list);
INIT_LIST_HEAD(&dlm->reco.resources);
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 6f7a77d54020..1c9efb406a96 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -341,7 +341,6 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
inode->i_mode = mode;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
- inode->i_blocks = 0;
inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
inc_nlink(inode);
@@ -367,7 +366,6 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
inode->i_mode = mode;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
- inode->i_blocks = 0;
inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 44f87caf3683..54e182a27caf 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -505,8 +505,10 @@ void dlm_change_lockres_owner(struct dlm_ctxt *dlm,
static void dlm_lockres_release(struct kref *kref)
{
struct dlm_lock_resource *res;
+ struct dlm_ctxt *dlm;
res = container_of(kref, struct dlm_lock_resource, refs);
+ dlm = res->dlm;
/* This should not happen -- all lockres' have a name
* associated with them at init time. */
@@ -515,6 +517,7 @@ static void dlm_lockres_release(struct kref *kref)
mlog(0, "destroying lockres %.*s\n", res->lockname.len,
res->lockname.name);
+ spin_lock(&dlm->track_lock);
if (!list_empty(&res->tracking))
list_del_init(&res->tracking);
else {
@@ -522,6 +525,9 @@ static void dlm_lockres_release(struct kref *kref)
res->lockname.len, res->lockname.name);
dlm_print_one_lock_resource(res);
}
+ spin_unlock(&dlm->track_lock);
+
+ dlm_put(dlm);
if (!hlist_unhashed(&res->hash_node) ||
!list_empty(&res->granted) ||
@@ -595,6 +601,10 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
res->migration_pending = 0;
res->inflight_locks = 0;
+ /* this dlm reference is dropped by dlm_put() in dlm_lockres_release() */
+ dlm_grab(dlm);
+ res->dlm = dlm;
+
kref_init(&res->refs);
/* just for consistency */
@@ -722,14 +732,21 @@ lookup:
if (tmpres) {
int dropping_ref = 0;
+ spin_unlock(&dlm->spinlock);
+
spin_lock(&tmpres->spinlock);
+ /* We wait for the other thread that is mastering the resource */
+ if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
+ __dlm_wait_on_lockres(tmpres);
+ BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN);
+ }
+
if (tmpres->owner == dlm->node_num) {
BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF);
dlm_lockres_grab_inflight_ref(dlm, tmpres);
} else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF)
dropping_ref = 1;
spin_unlock(&tmpres->spinlock);
- spin_unlock(&dlm->spinlock);
/* wait until done messaging the master, drop our ref to allow
* the lockres to be purged, start over. */
@@ -2949,7 +2966,7 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
struct dlm_node_iter *iter)
{
struct dlm_migrate_request migrate;
- int ret, status = 0;
+ int ret, skip, status = 0;
int nodenum;
memset(&migrate, 0, sizeof(migrate));
@@ -2966,12 +2983,27 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
nodenum == new_master)
continue;
+ /* We could race with a node exiting the domain. If it has already exited, skip it. */
+ spin_lock(&dlm->spinlock);
+ skip = (!test_bit(nodenum, dlm->domain_map));
+ spin_unlock(&dlm->spinlock);
+ if (skip) {
+ clear_bit(nodenum, iter->node_map);
+ continue;
+ }
+
ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key,
&migrate, sizeof(migrate), nodenum,
&status);
- if (ret < 0)
- mlog_errno(ret);
- else if (status < 0) {
+ if (ret < 0) {
+ mlog(0, "migrate_request returned %d!\n", ret);
+ if (!dlm_is_host_down(ret)) {
+ mlog(ML_ERROR, "unhandled error=%d!\n", ret);
+ BUG();
+ }
+ clear_bit(nodenum, iter->node_map);
+ ret = 0;
+ } else if (status < 0) {
mlog(0, "migrate request (node %u) returned %d!\n",
nodenum, status);
ret = status;
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 4060bb328bc8..d1295203029f 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -181,7 +181,8 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm,
spin_lock(&res->spinlock);
/* This ensures that clear refmap is sent after the set */
- __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
+ __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_SETREF_INPROG |
+ DLM_LOCK_RES_MIGRATING));
spin_unlock(&res->spinlock);
/* clear our bit from the master's refmap, ignore errors */
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 6e6cc0a2e5f7..f731ab491795 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -32,6 +32,7 @@
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/time.h>
+#include <linux/quotaops.h>
#define MLOG_MASK_PREFIX ML_DLM_GLUE
#include <cluster/masklog.h>
@@ -51,6 +52,7 @@
#include "slot_map.h"
#include "super.h"
#include "uptodate.h"
+#include "quota.h"
#include "buffer_head_io.h"
@@ -68,6 +70,7 @@ struct ocfs2_mask_waiter {
static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres);
static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres);
static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres);
+static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres);
/*
* Return value from ->downconvert_worker functions.
@@ -102,6 +105,7 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb,
struct ocfs2_lock_res *lockres);
+static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres);
#define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres)
@@ -111,8 +115,7 @@ static void ocfs2_dump_meta_lvb_info(u64 level,
unsigned int line,
struct ocfs2_lock_res *lockres)
{
- struct ocfs2_meta_lvb *lvb =
- (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
+ struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
mlog(level, "LVB information for %s (called from %s:%u):\n",
lockres->l_name, function, line);
@@ -258,6 +261,12 @@ static struct ocfs2_lock_res_ops ocfs2_flock_lops = {
.flags = 0,
};
+static struct ocfs2_lock_res_ops ocfs2_qinfo_lops = { /* ops for OCFS2_LOCK_TYPE_QINFO lock resources */
+ .set_lvb = ocfs2_set_qinfo_lvb, /* fills the LVB from the in-memory quota info */
+ .get_osb = ocfs2_get_qinfo_osb, /* resolves the ocfs2_super through lockres->l_priv */
+ .flags = LOCK_TYPE_REQUIRES_REFRESH | LOCK_TYPE_USES_LVB,
+};
+
static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
{
return lockres->l_type == OCFS2_LOCK_TYPE_META ||
@@ -279,6 +288,13 @@ static inline struct ocfs2_dentry_lock *ocfs2_lock_res_dl(struct ocfs2_lock_res
return (struct ocfs2_dentry_lock *)lockres->l_priv;
}
+static inline struct ocfs2_mem_dqinfo *ocfs2_lock_res_qinfo(struct ocfs2_lock_res *lockres)
+{ /* Return the quota info kept in l_priv of a quota-info lock resource. */
+ BUG_ON(lockres->l_type != OCFS2_LOCK_TYPE_QINFO); /* any other lock type is a caller bug */
+
+ return (struct ocfs2_mem_dqinfo *)lockres->l_priv;
+}
+
static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres)
{
if (lockres->l_ops->get_osb)
@@ -507,6 +523,13 @@ static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres)
return OCFS2_SB(inode->i_sb);
}
+static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres)
+{ /* ->get_osb hook installed in ocfs2_qinfo_lops. */
+ struct ocfs2_mem_dqinfo *info = lockres->l_priv; /* no l_type check here, unlike ocfs2_lock_res_qinfo() */
+
+ return OCFS2_SB(info->dqi_gi.dqi_sb);
+}
+
static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres)
{
struct ocfs2_file_private *fp = lockres->l_priv;
@@ -609,6 +632,17 @@ void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
lockres->l_flags |= OCFS2_LOCK_NOCACHE;
}
+void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
+ struct ocfs2_mem_dqinfo *info)
+{ /* Initialise @lockres as the quota-info cluster lock belonging to @info. */
+ ocfs2_lock_res_init_once(lockres);
+ ocfs2_build_lock_name(OCFS2_LOCK_TYPE_QINFO, info->dqi_gi.dqi_type, /* the quota type goes into the lock name */
+ 0, lockres->l_name);
+ ocfs2_lock_res_init_common(OCFS2_SB(info->dqi_gi.dqi_sb), lockres,
+ OCFS2_LOCK_TYPE_QINFO, &ocfs2_qinfo_lops,
+ info); /* NOTE(review): @info presumably becomes lockres->l_priv - confirm in ocfs2_lock_res_init_common() */
+}
+
void ocfs2_lock_res_free(struct ocfs2_lock_res *res)
{
mlog_entry_void();
@@ -1829,7 +1863,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode)
mlog_entry_void();
- lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
+ lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
/*
* Invalidate the LVB of a deleted inode - this way other
@@ -1881,7 +1915,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
mlog_meta_lvb(0, lockres);
- lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
+ lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
/* We're safe here without the lockres lock... */
spin_lock(&oi->ip_lock);
@@ -1916,8 +1950,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode,
struct ocfs2_lock_res *lockres)
{
- struct ocfs2_meta_lvb *lvb =
- (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb);
+ struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
if (lvb->lvb_version == OCFS2_LVB_VERSION
&& be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation)
@@ -2024,7 +2057,7 @@ static int ocfs2_inode_lock_update(struct inode *inode,
} else {
/* Boo, we have to go to disk. */
/* read bh, cast, ocfs2_refresh_inode */
- status = ocfs2_read_block(inode, oi->ip_blkno, bh);
+ status = ocfs2_read_inode_block(inode, bh);
if (status < 0) {
mlog_errno(status);
goto bail_refresh;
@@ -2032,18 +2065,14 @@ static int ocfs2_inode_lock_update(struct inode *inode,
fe = (struct ocfs2_dinode *) (*bh)->b_data;
/* This is a good chance to make sure we're not
- * locking an invalid object.
+ * locking an invalid object. ocfs2_read_inode_block()
+ * already checked that the inode block is sane.
*
* We bug on a stale inode here because we checked
* above whether it was wiped from disk. The wiping
* node provides a guarantee that we receive that
* message and can mark the inode before dropping any
* locks associated with it. */
- if (!OCFS2_IS_VALID_DINODE(fe)) {
- OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
- status = -EIO;
- goto bail_refresh;
- }
mlog_bug_on_msg(inode->i_generation !=
le32_to_cpu(fe->i_generation),
"Invalid dinode %llu disk generation: %u "
@@ -2085,7 +2114,7 @@ static int ocfs2_assign_bh(struct inode *inode,
return 0;
}
- status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, ret_bh);
+ status = ocfs2_read_inode_block(inode, ret_bh);
if (status < 0)
mlog_errno(status);
@@ -3449,6 +3478,117 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
return UNBLOCK_CONTINUE_POST;
}
+static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres)
+{ /* ->set_lvb hook: copy the in-memory quota info into the lock value block. */
+ struct ocfs2_qinfo_lvb *lvb;
+ struct ocfs2_mem_dqinfo *oinfo = ocfs2_lock_res_qinfo(lockres);
+ struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb,
+ oinfo->dqi_gi.dqi_type);
+
+ mlog_entry_void();
+
+ lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+ lvb->lvb_version = OCFS2_QINFO_LVB_VERSION; /* ocfs2_refresh_qinfo() checks this before using the LVB */
+ lvb->lvb_bgrace = cpu_to_be32(info->dqi_bgrace); /* all 32-bit fields are stored big-endian */
+ lvb->lvb_igrace = cpu_to_be32(info->dqi_igrace);
+ lvb->lvb_syncms = cpu_to_be32(oinfo->dqi_syncms);
+ lvb->lvb_blocks = cpu_to_be32(oinfo->dqi_gi.dqi_blocks);
+ lvb->lvb_free_blk = cpu_to_be32(oinfo->dqi_gi.dqi_free_blk);
+ lvb->lvb_free_entry = cpu_to_be32(oinfo->dqi_gi.dqi_free_entry); /* lvb_reserved[] is not written here */
+
+ mlog_exit_void();
+}
+
+void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex)
+{ /* Drop the quota-info cluster lock at the level selected by @ex. */
+ struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
+ struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb);
+ int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; /* same mapping as in ocfs2_qinfo_lock() */
+
+ mlog_entry_void();
+ if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb)) /* ocfs2_qinfo_lock() takes no cluster lock in these two cases */
+ ocfs2_cluster_unlock(osb, lockres, level);
+ mlog_exit_void();
+}
+
+static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo)
+{ /* Reload quota info from the LVB when its version matches, otherwise from disk. Returns 0 or the read error. */
+ struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb,
+ oinfo->dqi_gi.dqi_type);
+ struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
+ struct ocfs2_qinfo_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb);
+ struct buffer_head *bh = NULL;
+ struct ocfs2_global_disk_dqinfo *gdinfo;
+ int status = 0;
+
+ if (lvb->lvb_version == OCFS2_QINFO_LVB_VERSION) { /* LVB layout written by ocfs2_set_qinfo_lvb(); fields are big-endian */
+ info->dqi_bgrace = be32_to_cpu(lvb->lvb_bgrace);
+ info->dqi_igrace = be32_to_cpu(lvb->lvb_igrace);
+ oinfo->dqi_syncms = be32_to_cpu(lvb->lvb_syncms);
+ oinfo->dqi_gi.dqi_blocks = be32_to_cpu(lvb->lvb_blocks);
+ oinfo->dqi_gi.dqi_free_blk = be32_to_cpu(lvb->lvb_free_blk);
+ oinfo->dqi_gi.dqi_free_entry =
+ be32_to_cpu(lvb->lvb_free_entry);
+ } else {
+ status = ocfs2_read_quota_block(oinfo->dqi_gqinode, 0, &bh); /* NOTE(review): 0 is presumably the block index within dqi_gqinode - confirm */
+ if (status) {
+ mlog_errno(status);
+ goto bail;
+ }
+ gdinfo = (struct ocfs2_global_disk_dqinfo *)
+ (bh->b_data + OCFS2_GLOBAL_INFO_OFF); /* on-disk info sits at a fixed offset; its fields are little-endian */
+ info->dqi_bgrace = le32_to_cpu(gdinfo->dqi_bgrace);
+ info->dqi_igrace = le32_to_cpu(gdinfo->dqi_igrace);
+ oinfo->dqi_syncms = le32_to_cpu(gdinfo->dqi_syncms);
+ oinfo->dqi_gi.dqi_blocks = le32_to_cpu(gdinfo->dqi_blocks);
+ oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(gdinfo->dqi_free_blk);
+ oinfo->dqi_gi.dqi_free_entry =
+ le32_to_cpu(gdinfo->dqi_free_entry);
+ brelse(bh);
+ ocfs2_track_lock_refresh(lockres); /* only called on the path that had to read from disk */
+ }
+
+bail:
+ return status;
+}
+
+/* Lock quota info (EX if @ex, else PR). The caller must hold at least a shared lock on
+ * the quota file so that quota info can be safely refreshed from disk. Returns 0 or -errno. */
+int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex)
+{
+ struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock;
+ struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb);
+ int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR;
+ int status = 0;
+
+ mlog_entry_void();
+
+ /* On RO devices, locking really isn't needed... */
+ if (ocfs2_is_hard_readonly(osb)) {
+ if (ex)
+ status = -EROFS; /* exclusive request on a hard read-only device fails; a shared one returns 0 with no lock taken */
+ goto bail;
+ }
+ if (ocfs2_mount_local(osb)) /* local mount: return 0 without taking a cluster lock */
+ goto bail;
+
+ status = ocfs2_cluster_lock(osb, lockres, level, 0, 0);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ if (!ocfs2_should_refresh_lock_res(lockres))
+ goto bail;
+ /* OK, we have the lock but we need to refresh the quota info */
+ status = ocfs2_refresh_qinfo(oinfo);
+ if (status)
+ ocfs2_qinfo_unlock(oinfo, ex); /* refresh failed: do not return with the lock held */
+ ocfs2_complete_lock_res_refresh(lockres, status); /* runs on success and on failure (after the unlock above) */
+bail:
+ mlog_exit(status);
+ return status;
+}
+
/*
* This is the filesystem locking protocol. It provides the lock handling
* hooks for the underlying DLM. It has a maximum version number.
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index 2bb01f09c1b1..3f8d9986b8e0 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -49,6 +49,19 @@ struct ocfs2_meta_lvb {
__be32 lvb_reserved2;
};
+#define OCFS2_QINFO_LVB_VERSION 1
+
+struct ocfs2_qinfo_lvb { /* quota info as carried in the lock value block; 32-bit fields are big-endian */
+ __u8 lvb_version; /* OCFS2_QINFO_LVB_VERSION when filled by ocfs2_set_qinfo_lvb() */
+ __u8 lvb_reserved[3];
+ __be32 lvb_bgrace; /* mem_dqinfo dqi_bgrace */
+ __be32 lvb_igrace; /* mem_dqinfo dqi_igrace */
+ __be32 lvb_syncms; /* ocfs2_mem_dqinfo dqi_syncms */
+ __be32 lvb_blocks; /* dqi_gi.dqi_blocks */
+ __be32 lvb_free_blk; /* dqi_gi.dqi_free_blk */
+ __be32 lvb_free_entry; /* dqi_gi.dqi_free_entry */
+};
+
/* ocfs2_inode_lock_full() 'arg_flags' flags */
/* don't wait on recovery. */
#define OCFS2_META_LOCK_RECOVERY (0x01)
@@ -69,6 +82,9 @@ void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl,
struct ocfs2_file_private;
void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
struct ocfs2_file_private *fp);
+struct ocfs2_mem_dqinfo;
+void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
+ struct ocfs2_mem_dqinfo *info);
void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
int ocfs2_create_new_inode_locks(struct inode *inode);
int ocfs2_drop_inode_locks(struct inode *inode);
@@ -103,6 +119,9 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex);
void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
int ocfs2_file_lock(struct file *file, int ex, int trylock);
void ocfs2_file_unlock(struct file *file);
+int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex);
+void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex);
+
void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres);
void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 2baedac58234..f2bb1a04d253 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -293,7 +293,7 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
struct ocfs2_extent_block *eb;
struct ocfs2_extent_list *el;
- ret = ocfs2_read_block(inode, last_eb_blk, &eb_bh);
+ ret = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -302,12 +302,6 @@ static int ocfs2_last_eb_is_empty(struct inode *inode,
eb = (struct ocfs2_extent_block *) eb_bh->b_data;
el = &eb->h_list;
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
- ret = -EROFS;
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
- goto out;
- }
-
if (el->l_tree_depth) {
ocfs2_error(inode->i_sb,
"Inode %lu has non zero tree depth in "
@@ -381,23 +375,16 @@ static int ocfs2_figure_hole_clusters(struct inode *inode,
if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
goto no_more_extents;
- ret = ocfs2_read_block(inode,
- le64_to_cpu(eb->h_next_leaf_blk),
- &next_eb_bh);
+ ret = ocfs2_read_extent_block(inode,
+ le64_to_cpu(eb->h_next_leaf_blk),
+ &next_eb_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
- next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
-
- if (!OCFS2_IS_VALID_EXTENT_BLOCK(next_eb)) {
- ret = -EROFS;
- OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, next_eb);
- goto out;
- }
+ next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
el = &next_eb->h_list;
-
i = ocfs2_search_for_hole_index(el, v_cluster);
}
@@ -630,7 +617,7 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
if (ret == 0)
goto out;
- ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh);
+ ret = ocfs2_read_inode_block(inode, &di_bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -819,3 +806,74 @@ out:
return ret;
}
+
+int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
+ struct buffer_head *bhs[], int flags,
+ int (*validate)(struct super_block *sb,
+ struct buffer_head *bh))
+{ /* Read @nr blocks starting at logical block @v_block of @inode into @bhs; returns 0 or -errno. */
+ int rc = 0;
+ u64 p_block, p_count;
+ int i, count, done = 0;
+
+ mlog_entry("(inode = %p, v_block = %llu, nr = %d, bhs = %p, "
+ "flags = %x, validate = %p)\n",
+ inode, (unsigned long long)v_block, nr, bhs, flags,
+ validate);
+
+ if (((v_block + nr - 1) << inode->i_sb->s_blocksize_bits) >=
+ i_size_read(inode)) { /* the last requested block starts at or beyond i_size */
+ BUG_ON(!(flags & OCFS2_BH_READAHEAD)); /* only readahead may ask for this; it returns 0 without reading */
+ goto out;
+ }
+
+ while (done < nr) {
+ down_read(&OCFS2_I(inode)->ip_alloc_sem); /* held only around the extent lookup */
+ rc = ocfs2_extent_map_get_blocks(inode, v_block + done,
+ &p_block, &p_count, NULL);
+ up_read(&OCFS2_I(inode)->ip_alloc_sem);
+ if (rc) {
+ mlog_errno(rc);
+ break;
+ }
+
+ if (!p_block) { /* no physical block behind this offset: reported as -EIO */
+ rc = -EIO;
+ mlog(ML_ERROR,
+ "Inode #%llu contains a hole at offset %llu\n",
+ (unsigned long long)OCFS2_I(inode)->ip_blkno,
+ (unsigned long long)(v_block + done) <<
+ inode->i_sb->s_blocksize_bits);
+ break;
+ }
+
+ count = nr - done;
+ if (p_count < count) /* NOTE(review): p_count is presumably the contiguous run length from the lookup - confirm */
+ count = p_count;
+
+ /*
+ * If the caller passed us bhs, they should have come
+ * from a previous readahead call to this function. Thus,
+ * they should have the right b_blocknr.
+ */
+ for (i = 0; i < count; i++) {
+ if (!bhs[done + i])
+ continue;
+ BUG_ON(bhs[done + i]->b_blocknr != (p_block + i));
+ }
+
+ rc = ocfs2_read_blocks(inode, p_block, count, bhs + done,
+ flags, validate);
+ if (rc) {
+ mlog_errno(rc);
+ break;
+ }
+ done += count; /* continue with the next run of blocks */
+ }
+
+out:
+ mlog_exit(rc);
+ return rc;
+}
+
+
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index 1c4aa8b06f34..b7dd9731b462 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -57,4 +57,28 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
u32 *p_cluster, u32 *num_clusters,
struct ocfs2_extent_list *el);
+int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
+ struct buffer_head *bhs[], int flags,
+ int (*validate)(struct super_block *sb,
+ struct buffer_head *bh));
+static inline int ocfs2_read_virt_block(struct inode *inode, u64 v_block,
+ struct buffer_head **bh,
+ int (*validate)(struct super_block *sb,
+ struct buffer_head *bh))
+{ /* Single-block wrapper around ocfs2_read_virt_blocks(); returns 0, -EINVAL for a NULL @bh, or the read error. */
+ int status = 0;
+
+ if (bh == NULL) {
+ printk(KERN_ERR "ocfs2: bh == NULL\n"); /* explicit log level rather than the default one */
+ status = -EINVAL;
+ goto bail;
+ }
+
+ status = ocfs2_read_virt_blocks(inode, v_block, 1, bh, 0, validate); /* nr = 1, flags = 0 */
+
+bail:
+ return status;
+}
+
+
#endif /* _EXTENT_MAP_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index e2570a3bc2b2..e8f795f978aa 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -35,6 +35,7 @@
#include <linux/mount.h>
#include <linux/writeback.h>
#include <linux/falloc.h>
+#include <linux/quotaops.h>
#define MLOG_MASK_PREFIX ML_INODE
#include <cluster/masklog.h>
@@ -56,6 +57,8 @@
#include "suballoc.h"
#include "super.h"
#include "xattr.h"
+#include "acl.h"
+#include "quota.h"
#include "buffer_head_io.h"
@@ -253,8 +256,8 @@ int ocfs2_update_inode_atime(struct inode *inode,
goto out;
}
- ret = ocfs2_journal_access(handle, inode, bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_di(handle, inode, bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out_commit;
@@ -303,9 +306,9 @@ bail:
return status;
}
-static int ocfs2_simple_size_update(struct inode *inode,
- struct buffer_head *di_bh,
- u64 new_i_size)
+int ocfs2_simple_size_update(struct inode *inode,
+ struct buffer_head *di_bh,
+ u64 new_i_size)
{
int ret;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
@@ -350,8 +353,8 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
goto out;
}
- status = ocfs2_journal_access(handle, inode, fe_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, inode, fe_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto out_commit;
@@ -401,12 +404,9 @@ static int ocfs2_truncate_file(struct inode *inode,
(unsigned long long)OCFS2_I(inode)->ip_blkno,
(unsigned long long)new_i_size);
+ /* We trust di_bh because it comes from ocfs2_inode_lock(), which
+ * already validated it */
fe = (struct ocfs2_dinode *) di_bh->b_data;
- if (!OCFS2_IS_VALID_DINODE(fe)) {
- OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
- status = -EIO;
- goto bail;
- }
mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode),
"Inode %llu, inode i_size = %lld != di "
@@ -536,6 +536,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
enum ocfs2_alloc_restarted why;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_extent_tree et;
+ int did_quota = 0;
mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
@@ -545,18 +546,12 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
*/
BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));
- status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh);
+ status = ocfs2_read_inode_block(inode, &bh);
if (status < 0) {
mlog_errno(status);
goto leave;
}
-
fe = (struct ocfs2_dinode *) bh->b_data;
- if (!OCFS2_IS_VALID_DINODE(fe)) {
- OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
- status = -EIO;
- goto leave;
- }
restart_all:
BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
@@ -585,11 +580,18 @@ restart_all:
}
restarted_transaction:
+ if (vfs_dq_alloc_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb,
+ clusters_to_add))) {
+ status = -EDQUOT;
+ goto leave;
+ }
+ did_quota = 1;
+
/* reserve a write to the file entry early on - that we if we
* run out of credits in the allocation path, we can still
* update i_size. */
- status = ocfs2_journal_access(handle, inode, bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, inode, bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -622,6 +624,10 @@ restarted_transaction:
spin_lock(&OCFS2_I(inode)->ip_lock);
clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
spin_unlock(&OCFS2_I(inode)->ip_lock);
+ /* Release unused quota reservation */
+ vfs_dq_free_space(inode,
+ ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
+ did_quota = 0;
if (why != RESTART_NONE && clusters_to_add) {
if (why == RESTART_META) {
@@ -654,6 +660,9 @@ restarted_transaction:
OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode));
leave:
+ if (status < 0 && did_quota)
+ vfs_dq_free_space(inode,
+ ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
if (handle) {
ocfs2_commit_trans(osb, handle);
handle = NULL;
@@ -885,6 +894,9 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
struct ocfs2_super *osb = OCFS2_SB(sb);
struct buffer_head *bh = NULL;
handle_t *handle = NULL;
+ int locked[MAXQUOTAS] = {0, 0};
+ int credits, qtype;
+ struct ocfs2_mem_dqinfo *oinfo;
mlog_entry("(0x%p, '%.*s')\n", dentry,
dentry->d_name.len, dentry->d_name.name);
@@ -955,11 +967,47 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
}
}
- handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
- if (IS_ERR(handle)) {
- status = PTR_ERR(handle);
- mlog_errno(status);
- goto bail_unlock;
+ if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
+ (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
+ credits = OCFS2_INODE_UPDATE_CREDITS;
+ if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid
+ && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+ OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
+ oinfo = sb_dqinfo(sb, USRQUOTA)->dqi_priv;
+ status = ocfs2_lock_global_qf(oinfo, 1);
+ if (status < 0)
+ goto bail_unlock;
+ credits += ocfs2_calc_qinit_credits(sb, USRQUOTA) +
+ ocfs2_calc_qdel_credits(sb, USRQUOTA);
+ locked[USRQUOTA] = 1;
+ }
+ if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid
+ && OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+ OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
+ oinfo = sb_dqinfo(sb, GRPQUOTA)->dqi_priv;
+ status = ocfs2_lock_global_qf(oinfo, 1);
+ if (status < 0)
+ goto bail_unlock;
+ credits += ocfs2_calc_qinit_credits(sb, GRPQUOTA) +
+ ocfs2_calc_qdel_credits(sb, GRPQUOTA);
+ locked[GRPQUOTA] = 1;
+ }
+ handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto bail_unlock;
+ }
+ status = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0;
+ if (status < 0)
+ goto bail_commit;
+ } else {
+ handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto bail_unlock;
+ }
}
/*
@@ -982,6 +1030,12 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
bail_commit:
ocfs2_commit_trans(osb, handle);
bail_unlock:
+ for (qtype = 0; qtype < MAXQUOTAS; qtype++) {
+ if (!locked[qtype])
+ continue;
+ oinfo = sb_dqinfo(sb, qtype)->dqi_priv;
+ ocfs2_unlock_global_qf(oinfo, 1);
+ }
ocfs2_inode_unlock(inode, 1);
bail_unlock_rw:
if (size_change)
@@ -989,6 +1043,12 @@ bail_unlock_rw:
bail:
brelse(bh);
+ if (!status && attr->ia_valid & ATTR_MODE) {
+ status = ocfs2_acl_chmod(inode);
+ if (status < 0)
+ mlog_errno(status);
+ }
+
mlog_exit(status);
return status;
}
@@ -1035,7 +1095,7 @@ int ocfs2_permission(struct inode *inode, int mask)
goto out;
}
- ret = generic_permission(inode, mask, NULL);
+ ret = generic_permission(inode, mask, ocfs2_check_acl);
ocfs2_inode_unlock(inode, 0);
out:
@@ -1061,8 +1121,8 @@ static int __ocfs2_write_remove_suid(struct inode *inode,
goto out;
}
- ret = ocfs2_journal_access(handle, inode, bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_di(handle, inode, bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
goto out_trans;
@@ -1128,9 +1188,8 @@ static int ocfs2_write_remove_suid(struct inode *inode)
{
int ret;
struct buffer_head *bh = NULL;
- struct ocfs2_inode_info *oi = OCFS2_I(inode);
- ret = ocfs2_read_block(inode, oi->ip_blkno, &bh);
+ ret = ocfs2_read_inode_block(inode, &bh);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -1156,8 +1215,7 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode,
struct buffer_head *di_bh = NULL;
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
- ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno,
- &di_bh);
+ ret = ocfs2_read_inode_block(inode, &di_bh);
if (ret) {
mlog_errno(ret);
goto out;
@@ -1226,83 +1284,6 @@ out:
return ret;
}
-static int __ocfs2_remove_inode_range(struct inode *inode,
- struct buffer_head *di_bh,
- u32 cpos, u32 phys_cpos, u32 len,
- struct ocfs2_cached_dealloc_ctxt *dealloc)
-{
- int ret;
- u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct inode *tl_inode = osb->osb_tl_inode;
- handle_t *handle;
- struct ocfs2_alloc_context *meta_ac = NULL;
- struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
- struct ocfs2_extent_tree et;
-
- ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
-
- ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
- if (ret) {
- mlog_errno(ret);
- return ret;
- }
-
- mutex_lock(&tl_inode->i_mutex);
-
- if (ocfs2_truncate_log_needs_flush(osb)) {
- ret = __ocfs2_flush_truncate_log(osb);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
- }
-
- handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- mlog_errno(ret);
- goto out;
- }
-
- ret = ocfs2_journal_access(handle, inode, di_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
-
- ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
- dealloc);
- if (ret) {
- mlog_errno(ret);
- goto out_commit;
- }
-
- OCFS2_I(inode)->ip_clusters -= len;
- di->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters);
-
- ret = ocfs2_journal_dirty(handle, di_bh);
- if (ret) {
- mlog_errno(ret);
- goto out_commit;
- }
-
- ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
- if (ret)
- mlog_errno(ret);
-
-out_commit:
- ocfs2_commit_trans(osb, handle);
-out:
- mutex_unlock(&tl_inode->i_mutex);
-
- if (meta_ac)
- ocfs2_free_alloc_context(meta_ac);
-
- return ret;
-}
-
/*
* Truncate a byte range, avoiding pages within partial clusters. This
* preserves those pages for the zeroing code to write to.
@@ -1402,7 +1383,9 @@ static int ocfs2_remove_inode_range(struct inode *inode,
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_cached_dealloc_ctxt dealloc;
struct address_space *mapping = inode->i_mapping;
+ struct ocfs2_extent_tree et;
+ ocfs2_init_dinode_extent_tree(&et, inode, di_bh);
ocfs2_init_dealloc_ctxt(&dealloc);
if (byte_len == 0)
@@ -1458,9 +1441,9 @@ static int ocfs2_remove_inode_range(struct inode *inode,
/* Only do work for non-holes */
if (phys_cpos != 0) {
- ret = __ocfs2_remove_inode_range(inode, di_bh, cpos,
- phys_cpos, alloc_size,
- &dealloc);
+ ret = ocfs2_remove_btree_range(inode, &et, cpos,
+ phys_cpos, alloc_size,
+ &dealloc);
if (ret) {
mlog_errno(ret);
goto out;
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index e92382cbca5f..172f9fbc9fc7 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -51,6 +51,9 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
struct ocfs2_alloc_context *data_ac,
struct ocfs2_alloc_context *meta_ac,
enum ocfs2_alloc_restarted *reason_ret);
+int ocfs2_simple_size_update(struct inode *inode,
+ struct buffer_head *di_bh,
+ u64 new_i_size);
int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size,
u64 zero_to);
int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 7aa00d511874..229e707bc050 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -28,6 +28,7 @@
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
+#include <linux/quotaops.h>
#include <asm/byteorder.h>
@@ -37,6 +38,7 @@
#include "ocfs2.h"
#include "alloc.h"
+#include "blockcheck.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "file.h"
@@ -214,12 +216,11 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque)
return 0;
}
-int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
- int create_ino)
+void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
+ int create_ino)
{
struct super_block *sb;
struct ocfs2_super *osb;
- int status = -EINVAL;
int use_plocks = 1;
mlog_entry("(0x%p, size:%llu)\n", inode,
@@ -232,25 +233,17 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks())
use_plocks = 0;
- /* this means that read_inode cannot create a superblock inode
- * today. change if needed. */
- if (!OCFS2_IS_VALID_DINODE(fe) ||
- !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
- mlog(0, "Invalid dinode: i_ino=%lu, i_blkno=%llu, "
- "signature = %.*s, flags = 0x%x\n",
- inode->i_ino,
- (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
- fe->i_signature, le32_to_cpu(fe->i_flags));
- goto bail;
- }
+ /*
+ * These have all been checked by ocfs2_read_inode_block() or set
+ * by ocfs2_mknod_locked(), so a failure is a code bug.
+ */
+ BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); /* This means that read_inode
+ cannot create a superblock
+ inode today. change if
+ that is needed. */
+ BUG_ON(!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL)));
+ BUG_ON(le32_to_cpu(fe->i_fs_generation) != osb->fs_generation);
- if (le32_to_cpu(fe->i_fs_generation) != osb->fs_generation) {
- mlog(ML_ERROR, "file entry generation does not match "
- "superblock! osb->fs_generation=%x, "
- "fe->i_fs_generation=%x\n",
- osb->fs_generation, le32_to_cpu(fe->i_fs_generation));
- goto bail;
- }
OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
@@ -284,14 +277,18 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
inode->i_nlink = le16_to_cpu(fe->i_links_count);
- if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL))
+ if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) {
OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE;
+ inode->i_flags |= S_NOQUOTA;
+ }
if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) {
OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino);
} else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) {
OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP;
+ } else if (fe->i_flags & cpu_to_le32(OCFS2_QUOTA_FL)) {
+ inode->i_flags |= S_NOQUOTA;
} else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) {
mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino);
/* we can't actually hit this as read_inode can't
@@ -354,10 +351,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
ocfs2_set_inode_flags(inode);
- status = 0;
-bail:
- mlog_exit(status);
- return status;
+ mlog_exit_void();
}
static int ocfs2_read_locked_inode(struct inode *inode,
@@ -460,11 +454,14 @@ static int ocfs2_read_locked_inode(struct inode *inode,
}
}
- if (can_lock)
- status = ocfs2_read_blocks(inode, args->fi_blkno, 1, &bh,
- OCFS2_BH_IGNORE_CACHE);
- else
+ if (can_lock) {
+ status = ocfs2_read_inode_block_full(inode, &bh,
+ OCFS2_BH_IGNORE_CACHE);
+ } else {
status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh);
+ if (!status)
+ status = ocfs2_validate_inode_block(osb->sb, bh);
+ }
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -472,12 +469,6 @@ static int ocfs2_read_locked_inode(struct inode *inode,
status = -EINVAL;
fe = (struct ocfs2_dinode *) bh->b_data;
- if (!OCFS2_IS_VALID_DINODE(fe)) {
- mlog(0, "Invalid dinode #%llu: signature = %.*s\n",
- (unsigned long long)args->fi_blkno, 7,
- fe->i_signature);
- goto bail;
- }
/*
* This is a code bug. Right now the caller needs to
@@ -491,10 +482,9 @@ static int ocfs2_read_locked_inode(struct inode *inode,
if (S_ISCHR(le16_to_cpu(fe->i_mode)) ||
S_ISBLK(le16_to_cpu(fe->i_mode)))
- inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
+ inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
- if (ocfs2_populate_inode(inode, fe, 0) < 0)
- goto bail;
+ ocfs2_populate_inode(inode, fe, 0);
BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno));
@@ -547,8 +537,8 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
goto out;
}
- status = ocfs2_journal_access(handle, inode, fe_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, inode, fe_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto out;
@@ -615,7 +605,8 @@ static int ocfs2_remove_inode(struct inode *inode,
goto bail;
}
- handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS);
+ handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS +
+ ocfs2_quota_trans_credits(inode->i_sb));
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
mlog_errno(status);
@@ -630,8 +621,8 @@ static int ocfs2_remove_inode(struct inode *inode,
}
/* set the inodes dtime */
- status = ocfs2_journal_access(handle, inode, di_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, inode, di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail_commit;
@@ -647,6 +638,7 @@ static int ocfs2_remove_inode(struct inode *inode,
}
ocfs2_remove_from_cache(inode, di_bh);
+ vfs_dq_free_inode(inode);
status = ocfs2_free_dinode(handle, inode_alloc_inode,
inode_alloc_bh, di);
@@ -929,7 +921,10 @@ void ocfs2_delete_inode(struct inode *inode)
mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
- if (is_bad_inode(inode)) {
+ /* When we fail in read_inode() we mark inode as bad. The second test
+ * catches the case when inode allocation fails before allocating
+ * a block for inode. */
+ if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno) {
mlog(0, "Skipping delete of bad inode\n");
goto bail;
}
@@ -1195,8 +1190,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
mlog_entry("(inode %llu)\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno);
- status = ocfs2_journal_access(handle, inode, bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, inode, bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -1264,3 +1259,89 @@ void ocfs2_refresh_inode(struct inode *inode,
spin_unlock(&OCFS2_I(inode)->ip_lock);
}
+
+/*
+ * Check that an already-read buffer holds a usable on-disk inode (dinode).
+ *
+ * The checks run in this order: metadata ecc, dinode signature, i_blkno
+ * matching the block number of the buffer, OCFS2_VALID_FL being set, and
+ * i_fs_generation matching OCFS2_SB(sb)->fs_generation.
+ *
+ * Returns 0 when every check passes. A checksum failure returns whatever
+ * ocfs2_validate_meta_ecc() returned and is only logged; each of the
+ * later checks reports through ocfs2_error() and returns -EINVAL.
+ * The buffer must be uptodate on entry (BUG_ON otherwise).
+ */
+int ocfs2_validate_inode_block(struct super_block *sb,
+ struct buffer_head *bh)
+{
+ int rc;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data;
+
+ mlog(0, "Validating dinode %llu\n",
+ (unsigned long long)bh->b_blocknr);
+
+ BUG_ON(!buffer_uptodate(bh));
+
+ /*
+ * If the ecc fails, we return the error but otherwise
+ * leave the filesystem running. We know any error is
+ * local to this block.
+ */
+ rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check);
+ if (rc) {
+ mlog(ML_ERROR, "Checksum failed for dinode %llu\n",
+ (unsigned long long)bh->b_blocknr);
+ goto bail;
+ }
+
+ /*
+ * Errors after here are fatal.
+ */
+
+ /* Every check below bails out with this value. */
+ rc = -EINVAL;
+
+ if (!OCFS2_IS_VALID_DINODE(di)) {
+ ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n",
+ (unsigned long long)bh->b_blocknr, 7,
+ di->i_signature);
+ goto bail;
+ }
+
+ /* The dinode must record the block it was actually read from. */
+ if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) {
+ ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(di->i_blkno));
+ goto bail;
+ }
+
+ if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) {
+ ocfs2_error(sb,
+ "Invalid dinode #%llu: OCFS2_VALID_FL not set\n",
+ (unsigned long long)bh->b_blocknr);
+ goto bail;
+ }
+
+ if (le32_to_cpu(di->i_fs_generation) !=
+ OCFS2_SB(sb)->fs_generation) {
+ ocfs2_error(sb,
+ "Invalid dinode #%llu: fs_generation is %u\n",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(di->i_fs_generation));
+ goto bail;
+ }
+
+ rc = 0;
+
+bail:
+ return rc;
+}
+
+/*
+ * Read the block at OCFS2_I(inode)->ip_blkno through ocfs2_read_blocks(),
+ * passing 'flags' through and ocfs2_validate_inode_block as the final
+ * argument.
+ *
+ * The read works on a local copy of *bh, so *bh is only written when the
+ * read succeeded and the caller passed in a NULL *bh. Returns the result
+ * of ocfs2_read_blocks().
+ */
+int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
+ int flags)
+{
+ int rc;
+ struct buffer_head *tmp = *bh;
+
+ rc = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, &tmp,
+ flags, ocfs2_validate_inode_block);
+
+ /* If ocfs2_read_blocks() got us a new bh, pass it up. */
+ if (!rc && !*bh)
+ *bh = tmp;
+
+ return rc;
+}
+
+/* Same as ocfs2_read_inode_block_full() with no flags (flags == 0). */
+int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh)
+{
+ return ocfs2_read_inode_block_full(inode, bh, 0);
+}
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 2f37af9bcc4a..eb3c302b38d3 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -128,8 +128,8 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags,
int sysfile_type);
int ocfs2_inode_init_private(struct inode *inode);
int ocfs2_inode_revalidate(struct dentry *dentry);
-int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
- int create_ino);
+void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
+ int create_ino);
void ocfs2_read_inode(struct inode *inode);
void ocfs2_read_inode2(struct inode *inode, void *opaque);
ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf,
@@ -142,6 +142,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle,
struct buffer_head *bh);
int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb);
int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
+struct buffer_head *ocfs2_bread(struct inode *inode,
+ int block, int *err, int reada);
void ocfs2_set_inode_flags(struct inode *inode);
void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi);
@@ -153,4 +155,16 @@ static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode)
return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits);
}
+/* Validate that a bh contains a valid inode */
+int ocfs2_validate_inode_block(struct super_block *sb,
+ struct buffer_head *bh);
+/*
+ * Read an inode block into *bh. If *bh is NULL, a bh will be allocated.
+ * This is a cached read. The inode will be validated with
+ * ocfs2_validate_inode_block().
+ */
+int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh);
+/* The same, but can be passed OCFS2_BH_* flags */
+int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh,
+ int flags);
#endif /* OCFS2_INODE_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 99fe9d584f3c..57d7d25a2b9a 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -35,6 +35,7 @@
#include "ocfs2.h"
#include "alloc.h"
+#include "blockcheck.h"
#include "dir.h"
#include "dlmglue.h"
#include "extent_map.h"
@@ -45,6 +46,7 @@
#include "slot_map.h"
#include "super.h"
#include "sysfile.h"
+#include "quota.h"
#include "buffer_head_io.h"
@@ -52,10 +54,10 @@ DEFINE_SPINLOCK(trans_inc_lock);
static int ocfs2_force_read_journal(struct inode *inode);
static int ocfs2_recover_node(struct ocfs2_super *osb,
- int node_num);
+ int node_num, int slot_num);
static int __ocfs2_recovery_thread(void *arg);
static int ocfs2_commit_cache(struct ocfs2_super *osb);
-static int ocfs2_wait_on_mount(struct ocfs2_super *osb);
+static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota);
static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
int dirty, int replayed);
static int ocfs2_trylock_journal(struct ocfs2_super *osb,
@@ -64,6 +66,17 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
int slot);
static int ocfs2_commit_thread(void *arg);
+/*
+ * Wait on the mount state with quota == 0: per the wait condition in
+ * __ocfs2_wait_on_mount(), VOLUME_MOUNTED, VOLUME_MOUNTED_QUOTAS or
+ * VOLUME_DISABLED all end the wait.
+ */
+static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb)
+{
+ return __ocfs2_wait_on_mount(osb, 0);
+}
+
+/*
+ * Wait on the mount state with quota == 1: VOLUME_MOUNTED alone no longer
+ * ends the wait; only VOLUME_MOUNTED_QUOTAS or VOLUME_DISABLED does.
+ */
+static inline int ocfs2_wait_on_quotas(struct ocfs2_super *osb)
+{
+ return __ocfs2_wait_on_mount(osb, 1);
+}
+
+
/*
* The recovery_list is a simple linked list of node numbers to recover.
@@ -256,11 +269,9 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE);
BUG_ON(max_buffs <= 0);
- /* JBD might support this, but our journalling code doesn't yet. */
- if (journal_current_handle()) {
- mlog(ML_ERROR, "Recursive transaction attempted!\n");
- BUG();
- }
+ /* Nested transaction? Just return the handle... */
+ if (journal_current_handle())
+ return jbd2_journal_start(journal, max_buffs);
down_read(&osb->journal->j_trans_barrier);
@@ -285,16 +296,18 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs)
int ocfs2_commit_trans(struct ocfs2_super *osb,
handle_t *handle)
{
- int ret;
+ int ret, nested;
struct ocfs2_journal *journal = osb->journal;
BUG_ON(!handle);
+ nested = handle->h_ref > 1;
ret = jbd2_journal_stop(handle);
if (ret < 0)
mlog_errno(ret);
- up_read(&journal->j_trans_barrier);
+ if (!nested)
+ up_read(&journal->j_trans_barrier);
return ret;
}
@@ -357,10 +370,137 @@ bail:
return status;
}
-int ocfs2_journal_access(handle_t *handle,
- struct inode *inode,
- struct buffer_head *bh,
- int type)
+/*
+ * A jbd2 trigger set plus the byte offset of the struct ocfs2_block_check
+ * inside the block it is attached to. The jbd2 part is embedded so the
+ * wrapper can be recovered from it with container_of().
+ */
+struct ocfs2_triggers {
+ struct jbd2_buffer_trigger_type ot_triggers;
+ int ot_offset; /* added to the block data pointer by ocfs2_commit_trigger() */
+};
+
+/* Map an embedded jbd2 trigger type back to its enclosing ocfs2_triggers. */
+static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger_type *triggers)
+{
+ return container_of(triggers, struct ocfs2_triggers, ot_triggers);
+}
+
+/*
+ * Commit trigger for block types whose check structure sits at a fixed
+ * offset: calls ocfs2_block_check_compute() over 'size' bytes of 'data',
+ * handing it data + ot_offset as the check structure. 'bh' is not used.
+ */
+static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+ struct buffer_head *bh,
+ void *data, size_t size)
+{
+ struct ocfs2_triggers *ot = to_ocfs2_trigger(triggers);
+
+ /*
+ * We aren't guaranteed to have the superblock here, so we
+ * must unconditionally compute the ecc data.
+ * __ocfs2_journal_access() will only set the triggers if
+ * metaecc is enabled.
+ */
+ ocfs2_block_check_compute(data, size, data + ot->ot_offset);
+}
+
+/*
+ * Quota blocks have their own trigger because the struct ocfs2_block_check
+ * offset depends on the blocksize.
+ *
+ * The check structure is the dq_check member of the trailer returned by
+ * ocfs2_block_dqtrailer(size, data); 'triggers' and 'bh' are not used.
+ */
+static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+ struct buffer_head *bh,
+ void *data, size_t size)
+{
+ struct ocfs2_disk_dqtrailer *dqt =
+ ocfs2_block_dqtrailer(size, data);
+
+ /*
+ * We aren't guaranteed to have the superblock here, so we
+ * must unconditionally compute the ecc data.
+ * __ocfs2_journal_access() will only set the triggers if
+ * metaecc is enabled.
+ */
+ ocfs2_block_check_compute(data, size, &dqt->dq_check);
+}
+
+/*
+ * Directory blocks also have their own trigger because the
+ * struct ocfs2_block_check offset depends on the blocksize.
+ *
+ * The check structure is the db_check member of the trailer returned by
+ * ocfs2_dir_trailer_from_size(size, data); 'triggers' and 'bh' are not
+ * used.
+ */
+static void ocfs2_db_commit_trigger(struct jbd2_buffer_trigger_type *triggers,
+ struct buffer_head *bh,
+ void *data, size_t size)
+{
+ struct ocfs2_dir_block_trailer *trailer =
+ ocfs2_dir_trailer_from_size(size, data);
+
+ /*
+ * We aren't guaranteed to have the superblock here, so we
+ * must unconditionally compute the ecc data.
+ * __ocfs2_journal_access() will only set the triggers if
+ * metaecc is enabled.
+ */
+ ocfs2_block_check_compute(data, size, &trailer->db_check);
+}
+
+/*
+ * Abort trigger used by every ocfs2 trigger set in this file: logs the
+ * buffer pointer and block number, then reports the abort through
+ * ocfs2_error(). The superblock is found via bh->b_assoc_map->host->i_sb;
+ * 'triggers' is not used.
+ */
+static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers,
+ struct buffer_head *bh)
+{
+ mlog(ML_ERROR,
+ "ocfs2_abort_trigger called by JBD2. bh = 0x%lx, "
+ "bh->b_blocknr = %llu\n",
+ (unsigned long)bh,
+ (unsigned long long)bh->b_blocknr);
+
+ /* We aren't guaranteed to have the superblock here - but if we
+ * don't, it'll just crash. */
+ ocfs2_error(bh->b_assoc_map->host->i_sb,
+ "JBD2 has aborted our journal, ocfs2 cannot continue\n");
+}
+
+/*
+ * One trigger set per checksummed metadata block type. All of them share
+ * ocfs2_abort_trigger(). The fixed-offset types use ocfs2_commit_trigger()
+ * with ot_offset naming the block's check member.
+ */
+
+/* struct ocfs2_dinode, check at i_check */
+static struct ocfs2_triggers di_triggers = {
+ .ot_triggers = {
+ .t_commit = ocfs2_commit_trigger,
+ .t_abort = ocfs2_abort_trigger,
+ },
+ .ot_offset = offsetof(struct ocfs2_dinode, i_check),
+};
+
+/* struct ocfs2_extent_block, check at h_check */
+static struct ocfs2_triggers eb_triggers = {
+ .ot_triggers = {
+ .t_commit = ocfs2_commit_trigger,
+ .t_abort = ocfs2_abort_trigger,
+ },
+ .ot_offset = offsetof(struct ocfs2_extent_block, h_check),
+};
+
+/* struct ocfs2_group_desc, check at bg_check */
+static struct ocfs2_triggers gd_triggers = {
+ .ot_triggers = {
+ .t_commit = ocfs2_commit_trigger,
+ .t_abort = ocfs2_abort_trigger,
+ },
+ .ot_offset = offsetof(struct ocfs2_group_desc, bg_check),
+};
+
+/*
+ * Directory blocks: ot_offset is left unset because
+ * ocfs2_db_commit_trigger() locates the check structure itself.
+ */
+static struct ocfs2_triggers db_triggers = {
+ .ot_triggers = {
+ .t_commit = ocfs2_db_commit_trigger,
+ .t_abort = ocfs2_abort_trigger,
+ },
+};
+
+/* struct ocfs2_xattr_block, check at xb_check */
+static struct ocfs2_triggers xb_triggers = {
+ .ot_triggers = {
+ .t_commit = ocfs2_commit_trigger,
+ .t_abort = ocfs2_abort_trigger,
+ },
+ .ot_offset = offsetof(struct ocfs2_xattr_block, xb_check),
+};
+
+/*
+ * Quota blocks: ot_offset is left unset because
+ * ocfs2_dq_commit_trigger() locates the check structure itself.
+ */
+static struct ocfs2_triggers dq_triggers = {
+ .ot_triggers = {
+ .t_commit = ocfs2_dq_commit_trigger,
+ .t_abort = ocfs2_abort_trigger,
+ },
+};
+
+static int __ocfs2_journal_access(handle_t *handle,
+ struct inode *inode,
+ struct buffer_head *bh,
+ struct ocfs2_triggers *triggers,
+ int type)
{
int status;
@@ -406,6 +546,8 @@ int ocfs2_journal_access(handle_t *handle,
status = -EINVAL;
mlog(ML_ERROR, "Uknown access type!\n");
}
+ if (!status && ocfs2_meta_ecc(OCFS2_SB(inode->i_sb)) && triggers)
+ jbd2_journal_set_triggers(bh, &triggers->ot_triggers);
mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
if (status < 0)
@@ -416,6 +558,54 @@ int ocfs2_journal_access(handle_t *handle,
return status;
}
+/*
+ * Typed flavors of journal access. Each one forwards to
+ * __ocfs2_journal_access() with the trigger set for its block type;
+ * that function attaches the triggers to the buffer only when the access
+ * succeeded and metaecc is enabled on the filesystem.
+ */
+
+/* Inode (dinode) blocks. */
+int ocfs2_journal_access_di(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, int type)
+{
+ return __ocfs2_journal_access(handle, inode, bh, &di_triggers,
+ type);
+}
+
+/* Extent blocks. */
+int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, int type)
+{
+ return __ocfs2_journal_access(handle, inode, bh, &eb_triggers,
+ type);
+}
+
+/* Group descriptor blocks. */
+int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, int type)
+{
+ return __ocfs2_journal_access(handle, inode, bh, &gd_triggers,
+ type);
+}
+
+/* Directory blocks. */
+int ocfs2_journal_access_db(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, int type)
+{
+ return __ocfs2_journal_access(handle, inode, bh, &db_triggers,
+ type);
+}
+
+/* Extended attribute blocks. */
+int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, int type)
+{
+ return __ocfs2_journal_access(handle, inode, bh, &xb_triggers,
+ type);
+}
+
+/* Quota blocks. */
+int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, int type)
+{
+ return __ocfs2_journal_access(handle, inode, bh, &dq_triggers,
+ type);
+}
+
+/* Untyped access: passes NULL triggers, so no ecc trigger is attached. */
+int ocfs2_journal_access(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, int type)
+{
+ return __ocfs2_journal_access(handle, inode, bh, NULL, type);
+}
+
int ocfs2_journal_dirty(handle_t *handle,
struct buffer_head *bh)
{
@@ -434,20 +624,6 @@ int ocfs2_journal_dirty(handle_t *handle,
return status;
}
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-int ocfs2_journal_dirty_data(handle_t *handle,
- struct buffer_head *bh)
-{
- int err = journal_dirty_data(handle, bh);
- if (err)
- mlog_errno(err);
- /* TODO: When we can handle it, abort the handle and go RO on
- * error here. */
-
- return err;
-}
-#endif
-
#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE)
void ocfs2_set_journal_params(struct ocfs2_super *osb)
@@ -587,17 +763,11 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
mlog_entry_void();
fe = (struct ocfs2_dinode *)bh->b_data;
- if (!OCFS2_IS_VALID_DINODE(fe)) {
- /* This is called from startup/shutdown which will
- * handle the errors in a specific manner, so no need
- * to call ocfs2_error() here. */
- mlog(ML_ERROR, "Journal dinode %llu has invalid "
- "signature: %.*s",
- (unsigned long long)le64_to_cpu(fe->i_blkno), 7,
- fe->i_signature);
- status = -EIO;
- goto out;
- }
+
+ /* The journal bh on the osb always comes from ocfs2_journal_init()
+ * and was validated there inside ocfs2_inode_lock_full(). It's a
+ * code bug if we mess it up. */
+ BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
flags = le32_to_cpu(fe->id1.journal1.ij_flags);
if (dirty)
@@ -609,11 +779,11 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
if (replayed)
ocfs2_bump_recovery_generation(fe);
+ ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check);
status = ocfs2_write_block(osb, bh, journal->j_inode);
if (status < 0)
mlog_errno(status);
-out:
mlog_exit(status);
return status;
}
@@ -878,6 +1048,7 @@ struct ocfs2_la_recovery_item {
int lri_slot;
struct ocfs2_dinode *lri_la_dinode;
struct ocfs2_dinode *lri_tl_dinode;
+ struct ocfs2_quota_recovery *lri_qrec;
};
/* Does the second half of the recovery process. By this point, the
@@ -898,6 +1069,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
struct ocfs2_super *osb = journal->j_osb;
struct ocfs2_dinode *la_dinode, *tl_dinode;
struct ocfs2_la_recovery_item *item, *n;
+ struct ocfs2_quota_recovery *qrec;
LIST_HEAD(tmp_la_list);
mlog_entry_void();
@@ -913,6 +1085,8 @@ void ocfs2_complete_recovery(struct work_struct *work)
mlog(0, "Complete recovery for slot %d\n", item->lri_slot);
+ ocfs2_wait_on_quotas(osb);
+
la_dinode = item->lri_la_dinode;
if (la_dinode) {
mlog(0, "Clean up local alloc %llu\n",
@@ -943,6 +1117,16 @@ void ocfs2_complete_recovery(struct work_struct *work)
if (ret < 0)
mlog_errno(ret);
+ qrec = item->lri_qrec;
+ if (qrec) {
+ mlog(0, "Recovering quota files");
+ ret = ocfs2_finish_quota_recovery(osb, qrec,
+ item->lri_slot);
+ if (ret < 0)
+ mlog_errno(ret);
+ /* Recovery info is already freed now */
+ }
+
kfree(item);
}
@@ -956,7 +1140,8 @@ void ocfs2_complete_recovery(struct work_struct *work)
static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
int slot_num,
struct ocfs2_dinode *la_dinode,
- struct ocfs2_dinode *tl_dinode)
+ struct ocfs2_dinode *tl_dinode,
+ struct ocfs2_quota_recovery *qrec)
{
struct ocfs2_la_recovery_item *item;
@@ -971,6 +1156,9 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
if (tl_dinode)
kfree(tl_dinode);
+ if (qrec)
+ ocfs2_free_quota_recovery(qrec);
+
mlog_errno(-ENOMEM);
return;
}
@@ -979,6 +1167,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
item->lri_la_dinode = la_dinode;
item->lri_slot = slot_num;
item->lri_tl_dinode = tl_dinode;
+ item->lri_qrec = qrec;
spin_lock(&journal->j_lock);
list_add_tail(&item->lri_list, &journal->j_la_cleanups);
@@ -998,6 +1187,7 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
ocfs2_queue_recovery_completion(journal,
osb->slot_num,
osb->local_alloc_copy,
+ NULL,
NULL);
ocfs2_schedule_truncate_log_flush(osb, 0);
@@ -1006,11 +1196,26 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
}
}
+/*
+ * If quota recovery state is stashed in osb->quota_rec, queue it for
+ * recovery completion against our own slot (no local alloc or truncate
+ * log copies) and clear the pointer. Does nothing when osb->quota_rec is
+ * NULL.
+ */
+void ocfs2_complete_quota_recovery(struct ocfs2_super *osb)
+{
+ if (osb->quota_rec) {
+ ocfs2_queue_recovery_completion(osb->journal,
+ osb->slot_num,
+ NULL,
+ NULL,
+ osb->quota_rec);
+ /* The queued item (or its error path) now owns the state. */
+ osb->quota_rec = NULL;
+ }
+}
+
static int __ocfs2_recovery_thread(void *arg)
{
- int status, node_num;
+ int status, node_num, slot_num;
struct ocfs2_super *osb = arg;
struct ocfs2_recovery_map *rm = osb->recovery_map;
+ int *rm_quota = NULL;
+ int rm_quota_used = 0, i;
+ struct ocfs2_quota_recovery *qrec;
mlog_entry_void();
@@ -1019,6 +1224,11 @@ static int __ocfs2_recovery_thread(void *arg)
goto bail;
}
+ rm_quota = kzalloc(osb->max_slots * sizeof(int), GFP_NOFS);
+ if (!rm_quota) {
+ status = -ENOMEM;
+ goto bail;
+ }
restart:
status = ocfs2_super_lock(osb, 1);
if (status < 0) {
@@ -1032,8 +1242,28 @@ restart:
* clear it until ocfs2_recover_node() has succeeded. */
node_num = rm->rm_entries[0];
spin_unlock(&osb->osb_lock);
-
- status = ocfs2_recover_node(osb, node_num);
+ mlog(0, "checking node %d\n", node_num);
+ slot_num = ocfs2_node_num_to_slot(osb, node_num);
+ if (slot_num == -ENOENT) {
+ status = 0;
+ mlog(0, "no slot for this node, so no recovery"
+ "required.\n");
+ goto skip_recovery;
+ }
+ mlog(0, "node %d was using slot %d\n", node_num, slot_num);
+
+ /* It is a bit subtle with quota recovery. We cannot do it
+ * immediately because we have to obtain cluster locks from
+ * quota files and we also don't want to just skip it because
+ * then quota usage would be out of sync until some node takes
+ * the slot. So we remember which nodes need quota recovery
+ * and when everything else is done, we recover quotas. */
+ for (i = 0; i < rm_quota_used && rm_quota[i] != slot_num; i++);
+ if (i == rm_quota_used)
+ rm_quota[rm_quota_used++] = slot_num;
+
+ status = ocfs2_recover_node(osb, node_num, slot_num);
+skip_recovery:
if (!status) {
ocfs2_recovery_map_clear(osb, node_num);
} else {
@@ -1055,13 +1285,27 @@ restart:
if (status < 0)
mlog_errno(status);
+ /* Now it is the right time to recover quotas... We have to do this under
+ * the superblock lock so that no one can start using the slot (and crash)
+ * before we recover it */
+ for (i = 0; i < rm_quota_used; i++) {
+ qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]);
+ if (IS_ERR(qrec)) {
+ status = PTR_ERR(qrec);
+ mlog_errno(status);
+ continue;
+ }
+ ocfs2_queue_recovery_completion(osb->journal, rm_quota[i],
+ NULL, NULL, qrec);
+ }
+
ocfs2_super_unlock(osb, 1);
/* We always run recovery on our own orphan dir - the dead
* node(s) may have disallowd a previos inode delete. Re-processing
* is therefore required. */
ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
- NULL);
+ NULL, NULL);
bail:
mutex_lock(&osb->recovery_lock);
@@ -1076,6 +1320,9 @@ bail:
mutex_unlock(&osb->recovery_lock);
+ if (rm_quota)
+ kfree(rm_quota);
+
mlog_exit(status);
/* no one is callint kthread_stop() for us so the kthread() api
* requires that we call do_exit(). And it isn't exported, but
@@ -1135,8 +1382,7 @@ static int ocfs2_read_journal_inode(struct ocfs2_super *osb,
}
SET_INODE_JOURNAL(inode);
- status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, bh,
- OCFS2_BH_IGNORE_CACHE);
+ status = ocfs2_read_inode_block_full(inode, bh, OCFS2_BH_IGNORE_CACHE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1268,6 +1514,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
osb->slot_recovery_generations[slot_num] =
ocfs2_get_recovery_generation(fe);
+ ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check);
status = ocfs2_write_block(osb, bh, inode);
if (status < 0)
mlog_errno(status);
@@ -1304,31 +1551,19 @@ done:
* far less concerning.
*/
static int ocfs2_recover_node(struct ocfs2_super *osb,
- int node_num)
+ int node_num, int slot_num)
{
int status = 0;
- int slot_num;
struct ocfs2_dinode *la_copy = NULL;
struct ocfs2_dinode *tl_copy = NULL;
- mlog_entry("(node_num=%d, osb->node_num = %d)\n",
- node_num, osb->node_num);
-
- mlog(0, "checking node %d\n", node_num);
+ mlog_entry("(node_num=%d, slot_num=%d, osb->node_num = %d)\n",
+ node_num, slot_num, osb->node_num);
/* Should not ever be called to recover ourselves -- in that
* case we should've called ocfs2_journal_load instead. */
BUG_ON(osb->node_num == node_num);
- slot_num = ocfs2_node_num_to_slot(osb, node_num);
- if (slot_num == -ENOENT) {
- status = 0;
- mlog(0, "no slot for this node, so no recovery required.\n");
- goto done;
- }
-
- mlog(0, "node %d was using slot %d\n", node_num, slot_num);
-
status = ocfs2_replay_journal(osb, node_num, slot_num);
if (status < 0) {
if (status == -EBUSY) {
@@ -1364,7 +1599,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
/* This will kfree the memory pointed to by la_copy and tl_copy */
ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy,
- tl_copy);
+ tl_copy, NULL);
status = 0;
done:
@@ -1659,13 +1894,14 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
return ret;
}
-static int ocfs2_wait_on_mount(struct ocfs2_super *osb)
+static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota)
{
/* This check is good because ocfs2 will wait on our recovery
* thread before changing it to something other than MOUNTED
* or DISABLED. */
wait_event(osb->osb_mount_event,
- atomic_read(&osb->vol_state) == VOLUME_MOUNTED ||
+ (!quota && atomic_read(&osb->vol_state) == VOLUME_MOUNTED) ||
+ atomic_read(&osb->vol_state) == VOLUME_MOUNTED_QUOTAS ||
atomic_read(&osb->vol_state) == VOLUME_DISABLED);
/* If there's an error on mount, then we may never get to the
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index d4d14e9a3cea..3c3532e1307c 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -27,12 +27,7 @@
#define OCFS2_JOURNAL_H
#include <linux/fs.h>
-#ifndef CONFIG_OCFS2_COMPAT_JBD
-# include <linux/jbd2.h>
-#else
-# include <linux/jbd.h>
-# include "ocfs2_jbd_compat.h"
-#endif
+#include <linux/jbd2.h>
enum ocfs2_journal_state {
OCFS2_JOURNAL_FREE = 0,
@@ -173,6 +168,7 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb,
int node_num);
int ocfs2_mark_dead_nodes(struct ocfs2_super *osb);
void ocfs2_complete_mount_recovery(struct ocfs2_super *osb);
+void ocfs2_complete_quota_recovery(struct ocfs2_super *osb);
static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb)
{
@@ -216,9 +212,12 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode)
* ocfs2_extend_trans - Extend a handle by nblocks credits. This may
* commit the handle to disk in the process, but will
* not release any locks taken during the transaction.
- * ocfs2_journal_access - Notify the handle that we want to journal this
+ * ocfs2_journal_access* - Notify the handle that we want to journal this
* buffer. Will have to call ocfs2_journal_dirty once
* we've actually dirtied it. Type is one of . or .
+ * Always call the specific flavor of
+ * ocfs2_journal_access_*() unless you intend to
+ * manage the checksum by hand.
* ocfs2_journal_dirty - Mark a journalled buffer as having dirty data.
* ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before
* the current handle commits.
@@ -248,10 +247,29 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks);
#define OCFS2_JOURNAL_ACCESS_WRITE 1
#define OCFS2_JOURNAL_ACCESS_UNDO 2
-int ocfs2_journal_access(handle_t *handle,
- struct inode *inode,
- struct buffer_head *bh,
- int type);
+
+/* ocfs2_inode */
+int ocfs2_journal_access_di(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, int type);
+/* ocfs2_extent_block */
+int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, int type);
+/* ocfs2_group_desc */
+int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, int type);
+/* ocfs2_xattr_block */
+int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, int type);
+/* quota blocks */
+int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, int type);
+/* dirblock */
+int ocfs2_journal_access_db(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, int type);
+/* Anything that has no ecc */
+int ocfs2_journal_access(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, int type);
+
/*
* A word about the journal_access/journal_dirty "dance". It is
* entirely legal to journal_access a buffer more than once (as long
@@ -273,10 +291,6 @@ int ocfs2_journal_access(handle_t *handle,
*/
int ocfs2_journal_dirty(handle_t *handle,
struct buffer_head *bh);
-#ifdef CONFIG_OCFS2_COMPAT_JBD
-int ocfs2_journal_dirty_data(handle_t *handle,
- struct buffer_head *bh);
-#endif
/*
* Credit Macros:
@@ -293,6 +307,37 @@ int ocfs2_journal_dirty_data(handle_t *handle,
/* extended attribute block update */
#define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1
+/* global quotafile inode update, data block */
+#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
+
+/*
+ * The two writes below can accidentally see global info dirty due
+ * to set_info() quotactl so make them prepared for the writes.
+ */
+/* quota data block, global info */
+/* Write to local quota file */
+#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + 1)
+
+/* global quota data block, local quota data block, global quota inode,
+ * global quota info */
+#define OCFS2_QSYNC_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 3)
+
+/*
+ * Extra journal credits a transaction needs for quota updates on 'sb':
+ * OCFS2_QWRITE_CREDITS for each of the user and group quota RO-compat
+ * features that is set, so 0 when neither is enabled.
+ */
+static inline int ocfs2_quota_trans_credits(struct super_block *sb)
+{
+ int credits = 0;
+
+ if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA))
+ credits += OCFS2_QWRITE_CREDITS;
+ if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA))
+ credits += OCFS2_QWRITE_CREDITS;
+ return credits;
+}
+
+/* Number of credits needed for removing quota structure from file */
+int ocfs2_calc_qdel_credits(struct super_block *sb, int type);
+/* Number of credits needed for initialization of new quota structure */
+int ocfs2_calc_qinit_credits(struct super_block *sb, int type);
+
/* group extend. inode update and last group update. */
#define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1)
@@ -303,8 +348,11 @@ int ocfs2_journal_dirty_data(handle_t *handle,
* prev. group desc. if we relink. */
#define OCFS2_SUBALLOC_ALLOC (3)
-#define OCFS2_INLINE_TO_EXTENTS_CREDITS (OCFS2_SUBALLOC_ALLOC \
- + OCFS2_INODE_UPDATE_CREDITS)
+/*
+ * Credits for the inline-to-extents conversion: a suballocator
+ * allocation and an inode update, plus the quota credits for 'sb'.
+ */
+static inline int ocfs2_inline_to_extents_credits(struct super_block *sb)
+{
+ return OCFS2_SUBALLOC_ALLOC + OCFS2_INODE_UPDATE_CREDITS +
+ ocfs2_quota_trans_credits(sb);
+}
/* dinode + group descriptor update. We don't relink on free yet. */
#define OCFS2_SUBALLOC_FREE (2)
@@ -313,16 +361,23 @@ int ocfs2_journal_dirty_data(handle_t *handle,
#define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \
+ OCFS2_TRUNCATE_LOG_UPDATE)
-#define OCFS2_REMOVE_EXTENT_CREDITS (OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS)
+/*
+ * Credits for removing an extent: a truncate log update and an inode
+ * update, plus the quota credits for 'sb'.
+ */
+static inline int ocfs2_remove_extent_credits(struct super_block *sb)
+{
+ return OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS +
+ ocfs2_quota_trans_credits(sb);
+}
/* data block for new dir/symlink, 2 for bitmap updates (bitmap fe +
* bitmap block for the new bit) */
#define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2)
/* parent fe, parent block, new file entry, inode alloc fe, inode alloc
- * group descriptor + mkdir/symlink blocks */
-#define OCFS2_MKNOD_CREDITS (3 + OCFS2_SUBALLOC_ALLOC \
- + OCFS2_DIR_LINK_ADDITIONAL_CREDITS)
+ * group descriptor + mkdir/symlink blocks + quota update */
+static inline int ocfs2_mknod_credits(struct super_block *sb)
+{
+ /* Fixed mknod cost, plus the quota credits for 'sb'. */
+ return 3 + OCFS2_SUBALLOC_ALLOC + OCFS2_DIR_LINK_ADDITIONAL_CREDITS +
+ ocfs2_quota_trans_credits(sb);
+}
/* local alloc metadata change + main bitmap updates */
#define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS \
@@ -332,13 +387,21 @@ int ocfs2_journal_dirty_data(handle_t *handle,
* for the dinode, one for the new block. */
#define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2)
-/* file update (nlink, etc) + directory mtime/ctime + dir entry block */
-#define OCFS2_LINK_CREDITS (2*OCFS2_INODE_UPDATE_CREDITS + 1)
+/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota
+ * update on dir */
+static inline int ocfs2_link_credits(struct super_block *sb)
+{
+ /* Two inode updates and one more block, plus the quota credits for 'sb'. */
+ return 2*OCFS2_INODE_UPDATE_CREDITS + 1 +
+ ocfs2_quota_trans_credits(sb);
+}
/* inode + dir inode (if we unlink a dir), + dir entry block + orphan
* dir inode link */
-#define OCFS2_UNLINK_CREDITS (2 * OCFS2_INODE_UPDATE_CREDITS + 1 \
- + OCFS2_LINK_CREDITS)
+static inline int ocfs2_unlink_credits(struct super_block *sb)
+{
+ /* The quota update from ocfs2_link_credits is unused here... */
+ /* ...but it is still counted, as part of ocfs2_link_credits(sb). */
+ return 2 * OCFS2_INODE_UPDATE_CREDITS + 1 + ocfs2_link_credits(sb);
+}
/* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry +
* inode alloc group descriptor */
@@ -347,8 +410,10 @@ int ocfs2_journal_dirty_data(handle_t *handle,
/* dinode update, old dir dinode update, new dir dinode update, old
* dir dir entry, new dir dir entry, dir entry update for renaming
* directory + target unlink */
-#define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3 \
- + OCFS2_UNLINK_CREDITS)
+static inline int ocfs2_rename_credits(struct super_block *sb)
+{
+ return 3 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_unlink_credits(sb);
+}
/* global bitmap dinode, group desc., relinked group,
* suballocator dinode, group desc., relinked group,
@@ -386,18 +451,19 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb,
* credit for the dinode there. */
extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth);
- return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks;
+ return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks +
+ ocfs2_quota_trans_credits(sb);
}
static inline int ocfs2_calc_symlink_credits(struct super_block *sb)
{
- int blocks = OCFS2_MKNOD_CREDITS;
+ int blocks = ocfs2_mknod_credits(sb);
/* links can be longer than one block so we may update many
* within our single allocated extent. */
blocks += ocfs2_clusters_to_blocks(sb, 1);
- return blocks;
+ return blocks + ocfs2_quota_trans_credits(sb);
}
static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb,
@@ -434,6 +500,8 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
/* update to the truncate log. */
credits += OCFS2_TRUNCATE_LOG_UPDATE;
+ credits += ocfs2_quota_trans_credits(sb);
+
return credits;
}
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 687b28713c32..ec70cdbe77fc 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -36,6 +36,7 @@
#include "ocfs2.h"
#include "alloc.h"
+#include "blockcheck.h"
#include "dlmglue.h"
#include "inode.h"
#include "journal.h"
@@ -248,8 +249,8 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
goto bail;
}
- status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1,
- &alloc_bh, OCFS2_BH_IGNORE_CACHE);
+ status = ocfs2_read_inode_block_full(inode, &alloc_bh,
+ OCFS2_BH_IGNORE_CACHE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -382,8 +383,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
}
memcpy(alloc_copy, alloc, bh->b_size);
- status = ocfs2_journal_access(handle, local_alloc_inode, bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, local_alloc_inode, bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto out_commit;
@@ -459,8 +460,8 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
mutex_lock(&inode->i_mutex);
- status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1,
- &alloc_bh, OCFS2_BH_IGNORE_CACHE);
+ status = ocfs2_read_inode_block_full(inode, &alloc_bh,
+ OCFS2_BH_IGNORE_CACHE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -476,6 +477,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
alloc = (struct ocfs2_dinode *) alloc_bh->b_data;
ocfs2_clear_local_alloc(alloc);
+ ocfs2_compute_meta_ecc(osb->sb, alloc_bh->b_data, &alloc->i_check);
status = ocfs2_write_block(osb, alloc_bh, inode);
if (status < 0)
mlog_errno(status);
@@ -762,9 +764,9 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
* delete bits from it! */
*num_bits = bits_wanted;
- status = ocfs2_journal_access(handle, local_alloc_inode,
- osb->local_alloc_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, local_alloc_inode,
+ osb->local_alloc_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1240,9 +1242,9 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb,
}
memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size);
- status = ocfs2_journal_access(handle, local_alloc_inode,
- osb->local_alloc_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, local_alloc_inode,
+ osb->local_alloc_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 2545e7402efe..084aba86c3b2 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -40,6 +40,7 @@
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/highmem.h>
+#include <linux/quotaops.h>
#define MLOG_MASK_PREFIX ML_NAMEI
#include <cluster/masklog.h>
@@ -61,17 +62,18 @@
#include "sysfile.h"
#include "uptodate.h"
#include "xattr.h"
+#include "acl.h"
#include "buffer_head_io.h"
static int ocfs2_mknod_locked(struct ocfs2_super *osb,
struct inode *dir,
- struct dentry *dentry, int mode,
+ struct inode *inode,
+ struct dentry *dentry,
dev_t dev,
struct buffer_head **new_fe_bh,
struct buffer_head *parent_fe_bh,
handle_t *handle,
- struct inode **ret_inode,
struct ocfs2_alloc_context *inode_ac);
static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
@@ -186,6 +188,35 @@ bail:
return ret;
}
+static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode)
+{
+ struct inode *inode;
+
+ inode = new_inode(dir->i_sb);
+ if (!inode) {
+ mlog(ML_ERROR, "new_inode failed!\n");
+ return NULL;
+ }
+
+ /* populate as many fields early on as possible - many of
+ * these are used by the support functions here and in
+ * callers. */
+ if (S_ISDIR(mode))
+ inode->i_nlink = 2;
+ else
+ inode->i_nlink = 1;
+ inode->i_uid = current_fsuid();
+ if (dir->i_mode & S_ISGID) {
+ inode->i_gid = dir->i_gid;
+ if (S_ISDIR(mode))
+ mode |= S_ISGID;
+ } else
+ inode->i_gid = current_fsgid();
+ inode->i_mode = mode;
+ vfs_dq_init(inode);
+ return inode;
+}
+
static int ocfs2_mknod(struct inode *dir,
struct dentry *dentry,
int mode,
@@ -201,6 +232,13 @@ static int ocfs2_mknod(struct inode *dir,
struct inode *inode = NULL;
struct ocfs2_alloc_context *inode_ac = NULL;
struct ocfs2_alloc_context *data_ac = NULL;
+ struct ocfs2_alloc_context *xattr_ac = NULL;
+ int want_clusters = 0;
+ int xattr_credits = 0;
+ struct ocfs2_security_xattr_info si = {
+ .enable = 1,
+ };
+ int did_quota_inode = 0;
mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
(unsigned long)dev, dentry->d_name.len,
@@ -250,17 +288,46 @@ static int ocfs2_mknod(struct inode *dir,
goto leave;
}
- /* Reserve a cluster if creating an extent based directory. */
- if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) {
- status = ocfs2_reserve_clusters(osb, 1, &data_ac);
- if (status < 0) {
- if (status != -ENOSPC)
- mlog_errno(status);
+ inode = ocfs2_get_init_inode(dir, mode);
+ if (!inode) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto leave;
+ }
+
+ /* get security xattr */
+ status = ocfs2_init_security_get(inode, dir, &si);
+ if (status) {
+ if (status == -EOPNOTSUPP)
+ si.enable = 0;
+ else {
+ mlog_errno(status);
goto leave;
}
}
- handle = ocfs2_start_trans(osb, OCFS2_MKNOD_CREDITS);
+ /* calculate meta data/clusters for setting security and acl xattr */
+ status = ocfs2_calc_xattr_init(dir, parent_fe_bh, mode,
+ &si, &want_clusters,
+ &xattr_credits, &xattr_ac);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
+ /* Reserve a cluster if creating an extent based directory. */
+ if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb))
+ want_clusters += 1;
+
+ status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac);
+ if (status < 0) {
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ goto leave;
+ }
+
+ handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb) +
+ xattr_credits);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
handle = NULL;
@@ -268,10 +335,19 @@ static int ocfs2_mknod(struct inode *dir,
goto leave;
}
+ /* We don't use standard VFS wrapper because we don't want vfs_dq_init
+ * to be called. */
+ if (sb_any_quota_active(osb->sb) &&
+ osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
+ status = -EDQUOT;
+ goto leave;
+ }
+ did_quota_inode = 1;
+
/* do the real work now. */
- status = ocfs2_mknod_locked(osb, dir, dentry, mode, dev,
+ status = ocfs2_mknod_locked(osb, dir, inode, dentry, dev,
&new_fe_bh, parent_fe_bh, handle,
- &inode, inode_ac);
+ inode_ac);
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -285,8 +361,8 @@ static int ocfs2_mknod(struct inode *dir,
goto leave;
}
- status = ocfs2_journal_access(handle, dir, parent_fe_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, dir, parent_fe_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -300,6 +376,22 @@ static int ocfs2_mknod(struct inode *dir,
inc_nlink(dir);
}
+ status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh,
+ xattr_ac, data_ac);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+
+ if (si.enable) {
+ status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si,
+ xattr_ac, data_ac);
+ if (status < 0) {
+ mlog_errno(status);
+ goto leave;
+ }
+ }
+
status = ocfs2_add_entry(handle, dentry, inode,
OCFS2_I(inode)->ip_blkno, parent_fe_bh,
de_bh);
@@ -320,6 +412,8 @@ static int ocfs2_mknod(struct inode *dir,
d_instantiate(dentry, inode);
status = 0;
leave:
+ if (status < 0 && did_quota_inode)
+ vfs_dq_free_inode(inode);
if (handle)
ocfs2_commit_trans(osb, handle);
@@ -331,9 +425,13 @@ leave:
brelse(new_fe_bh);
brelse(de_bh);
brelse(parent_fe_bh);
+ kfree(si.name);
+ kfree(si.value);
- if ((status < 0) && inode)
+ if ((status < 0) && inode) {
+ clear_nlink(inode);
iput(inode);
+ }
if (inode_ac)
ocfs2_free_alloc_context(inode_ac);
@@ -341,6 +439,9 @@ leave:
if (data_ac)
ocfs2_free_alloc_context(data_ac);
+ if (xattr_ac)
+ ocfs2_free_alloc_context(xattr_ac);
+
mlog_exit(status);
return status;
@@ -348,12 +449,12 @@ leave:
static int ocfs2_mknod_locked(struct ocfs2_super *osb,
struct inode *dir,
- struct dentry *dentry, int mode,
+ struct inode *inode,
+ struct dentry *dentry,
dev_t dev,
struct buffer_head **new_fe_bh,
struct buffer_head *parent_fe_bh,
handle_t *handle,
- struct inode **ret_inode,
struct ocfs2_alloc_context *inode_ac)
{
int status = 0;
@@ -361,14 +462,12 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
struct ocfs2_extent_list *fel;
u64 fe_blkno = 0;
u16 suballoc_bit;
- struct inode *inode = NULL;
- mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode,
- (unsigned long)dev, dentry->d_name.len,
+ mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry,
+ inode->i_mode, (unsigned long)dev, dentry->d_name.len,
dentry->d_name.name);
*new_fe_bh = NULL;
- *ret_inode = NULL;
status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit,
&fe_blkno);
@@ -377,23 +476,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
goto leave;
}
- inode = new_inode(dir->i_sb);
- if (!inode) {
- status = -ENOMEM;
- mlog(ML_ERROR, "new_inode failed!\n");
- goto leave;
- }
-
/* populate as many fields early on as possible - many of
* these are used by the support functions here and in
* callers. */
inode->i_ino = ino_from_blkno(osb->sb, fe_blkno);
OCFS2_I(inode)->ip_blkno = fe_blkno;
- if (S_ISDIR(mode))
- inode->i_nlink = 2;
- else
- inode->i_nlink = 1;
- inode->i_mode = mode;
spin_lock(&osb->osb_lock);
inode->i_generation = osb->s_next_generation++;
spin_unlock(&osb->osb_lock);
@@ -406,8 +493,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
}
ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh);
- status = ocfs2_journal_access(handle, inode, *new_fe_bh,
- OCFS2_JOURNAL_ACCESS_CREATE);
+ status = ocfs2_journal_access_di(handle, inode, *new_fe_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -421,17 +508,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
fe->i_blkno = cpu_to_le64(fe_blkno);
fe->i_suballoc_bit = cpu_to_le16(suballoc_bit);
fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot);
- fe->i_uid = cpu_to_le32(current_fsuid());
- if (dir->i_mode & S_ISGID) {
- fe->i_gid = cpu_to_le32(dir->i_gid);
- if (S_ISDIR(mode))
- mode |= S_ISGID;
- } else
- fe->i_gid = cpu_to_le32(current_fsgid());
- fe->i_mode = cpu_to_le16(mode);
- if (S_ISCHR(mode) || S_ISBLK(mode))
+ fe->i_uid = cpu_to_le32(inode->i_uid);
+ fe->i_gid = cpu_to_le32(inode->i_gid);
+ fe->i_mode = cpu_to_le16(inode->i_mode);
+ if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev));
-
fe->i_links_count = cpu_to_le16(inode->i_nlink);
fe->i_last_eb_blk = 0;
@@ -446,7 +527,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
/*
* If supported, directories start with inline data.
*/
- if (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) {
+ if (S_ISDIR(inode->i_mode) && ocfs2_supports_inline_data(osb)) {
u16 feat = le16_to_cpu(fe->i_dyn_features);
fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL);
@@ -465,15 +546,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
goto leave;
}
- if (ocfs2_populate_inode(inode, fe, 1) < 0) {
- mlog(ML_ERROR, "populate inode failed! bh->b_blocknr=%llu, "
- "i_blkno=%llu, i_ino=%lu\n",
- (unsigned long long)(*new_fe_bh)->b_blocknr,
- (unsigned long long)le64_to_cpu(fe->i_blkno),
- inode->i_ino);
- BUG();
- }
-
+ ocfs2_populate_inode(inode, fe, 1);
ocfs2_inode_set_new(osb, inode);
if (!ocfs2_mount_local(osb)) {
status = ocfs2_create_new_inode_locks(inode);
@@ -484,17 +557,12 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb,
status = 0; /* error in ocfs2_create_new_inode_locks is not
* critical */
- *ret_inode = inode;
leave:
if (status < 0) {
if (*new_fe_bh) {
brelse(*new_fe_bh);
*new_fe_bh = NULL;
}
- if (inode) {
- clear_nlink(inode);
- iput(inode);
- }
}
mlog_exit(status);
@@ -588,7 +656,7 @@ static int ocfs2_link(struct dentry *old_dentry,
goto out_unlock_inode;
}
- handle = ocfs2_start_trans(osb, OCFS2_LINK_CREDITS);
+ handle = ocfs2_start_trans(osb, ocfs2_link_credits(osb->sb));
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
handle = NULL;
@@ -596,8 +664,8 @@ static int ocfs2_link(struct dentry *old_dentry,
goto out_unlock_inode;
}
- err = ocfs2_journal_access(handle, inode, fe_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ err = ocfs2_journal_access_di(handle, inode, fe_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (err < 0) {
mlog_errno(err);
goto out_commit;
@@ -775,7 +843,7 @@ static int ocfs2_unlink(struct inode *dir,
}
}
- handle = ocfs2_start_trans(osb, OCFS2_UNLINK_CREDITS);
+ handle = ocfs2_start_trans(osb, ocfs2_unlink_credits(osb->sb));
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
handle = NULL;
@@ -783,8 +851,8 @@ static int ocfs2_unlink(struct inode *dir,
goto leave;
}
- status = ocfs2_journal_access(handle, inode, fe_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, inode, fe_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -1181,7 +1249,7 @@ static int ocfs2_rename(struct inode *old_dir,
}
}
- handle = ocfs2_start_trans(osb, OCFS2_RENAME_CREDITS);
+ handle = ocfs2_start_trans(osb, ocfs2_rename_credits(osb->sb));
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
handle = NULL;
@@ -1197,8 +1265,8 @@ static int ocfs2_rename(struct inode *old_dir,
goto bail;
}
}
- status = ocfs2_journal_access(handle, new_inode, newfe_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, new_inode, newfe_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1244,8 +1312,8 @@ static int ocfs2_rename(struct inode *old_dir,
old_inode->i_ctime = CURRENT_TIME;
mark_inode_dirty(old_inode);
- status = ocfs2_journal_access(handle, old_inode, old_inode_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, old_inode, old_inode_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status >= 0) {
old_di = (struct ocfs2_dinode *) old_inode_bh->b_data;
@@ -1321,9 +1389,9 @@ static int ocfs2_rename(struct inode *old_dir,
(int)old_dir_nlink, old_dir->i_nlink);
} else {
struct ocfs2_dinode *fe;
- status = ocfs2_journal_access(handle, old_dir,
- old_dir_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, old_dir,
+ old_dir_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
fe = (struct ocfs2_dinode *) old_dir_bh->b_data;
fe->i_links_count = cpu_to_le16(old_dir->i_nlink);
status = ocfs2_journal_dirty(handle, old_dir_bh);
@@ -1496,6 +1564,13 @@ static int ocfs2_symlink(struct inode *dir,
handle_t *handle = NULL;
struct ocfs2_alloc_context *inode_ac = NULL;
struct ocfs2_alloc_context *data_ac = NULL;
+ struct ocfs2_alloc_context *xattr_ac = NULL;
+ int want_clusters = 0;
+ int xattr_credits = 0;
+ struct ocfs2_security_xattr_info si = {
+ .enable = 1,
+ };
+ int did_quota = 0, did_quota_inode = 0;
mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir,
dentry, symname, dentry->d_name.len, dentry->d_name.name);
@@ -1542,17 +1617,46 @@ static int ocfs2_symlink(struct inode *dir,
goto bail;
}
- /* don't reserve bitmap space for fast symlinks. */
- if (l > ocfs2_fast_symlink_chars(sb)) {
- status = ocfs2_reserve_clusters(osb, 1, &data_ac);
+ inode = ocfs2_get_init_inode(dir, S_IFLNK | S_IRWXUGO);
+ if (!inode) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto bail;
+ }
+
+ /* get security xattr */
+ status = ocfs2_init_security_get(inode, dir, &si);
+ if (status) {
+ if (status == -EOPNOTSUPP)
+ si.enable = 0;
+ else {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+
+ /* calculate meta data/clusters for setting security xattr */
+ if (si.enable) {
+ status = ocfs2_calc_security_init(dir, &si, &want_clusters,
+ &xattr_credits, &xattr_ac);
if (status < 0) {
- if (status != -ENOSPC)
- mlog_errno(status);
+ mlog_errno(status);
goto bail;
}
}
- handle = ocfs2_start_trans(osb, credits);
+ /* don't reserve bitmap space for fast symlinks. */
+ if (l > ocfs2_fast_symlink_chars(sb))
+ want_clusters += 1;
+
+ status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac);
+ if (status < 0) {
+ if (status != -ENOSPC)
+ mlog_errno(status);
+ goto bail;
+ }
+
+ handle = ocfs2_start_trans(osb, credits + xattr_credits);
if (IS_ERR(handle)) {
status = PTR_ERR(handle);
handle = NULL;
@@ -1560,10 +1664,18 @@ static int ocfs2_symlink(struct inode *dir,
goto bail;
}
- status = ocfs2_mknod_locked(osb, dir, dentry,
- S_IFLNK | S_IRWXUGO, 0,
- &new_fe_bh, parent_fe_bh, handle,
- &inode, inode_ac);
+ /* We don't use standard VFS wrapper because we don't want vfs_dq_init
+ * to be called. */
+ if (sb_any_quota_active(osb->sb) &&
+ osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) {
+ status = -EDQUOT;
+ goto bail;
+ }
+ did_quota_inode = 1;
+
+ status = ocfs2_mknod_locked(osb, dir, inode, dentry,
+ 0, &new_fe_bh, parent_fe_bh, handle,
+ inode_ac);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1576,6 +1688,12 @@ static int ocfs2_symlink(struct inode *dir,
u32 offset = 0;
inode->i_op = &ocfs2_symlink_inode_operations;
+ if (vfs_dq_alloc_space_nodirty(inode,
+ ocfs2_clusters_to_bytes(osb->sb, 1))) {
+ status = -EDQUOT;
+ goto bail;
+ }
+ did_quota = 1;
status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0,
new_fe_bh,
handle, data_ac, NULL,
@@ -1614,6 +1732,15 @@ static int ocfs2_symlink(struct inode *dir,
}
}
+ if (si.enable) {
+ status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si,
+ xattr_ac, data_ac);
+ if (status < 0) {
+ mlog_errno(status);
+ goto bail;
+ }
+ }
+
status = ocfs2_add_entry(handle, dentry, inode,
le64_to_cpu(fe->i_blkno), parent_fe_bh,
de_bh);
@@ -1632,6 +1759,11 @@ static int ocfs2_symlink(struct inode *dir,
dentry->d_op = &ocfs2_dentry_ops;
d_instantiate(dentry, inode);
bail:
+ if (status < 0 && did_quota)
+ vfs_dq_free_space_nodirty(inode,
+ ocfs2_clusters_to_bytes(osb->sb, 1));
+ if (status < 0 && did_quota_inode)
+ vfs_dq_free_inode(inode);
if (handle)
ocfs2_commit_trans(osb, handle);
@@ -1640,12 +1772,18 @@ bail:
brelse(new_fe_bh);
brelse(parent_fe_bh);
brelse(de_bh);
+ kfree(si.name);
+ kfree(si.value);
if (inode_ac)
ocfs2_free_alloc_context(inode_ac);
if (data_ac)
ocfs2_free_alloc_context(data_ac);
- if ((status < 0) && inode)
+ if (xattr_ac)
+ ocfs2_free_alloc_context(xattr_ac);
+ if ((status < 0) && inode) {
+ clear_nlink(inode);
iput(inode);
+ }
mlog_exit(status);
@@ -1754,16 +1892,14 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino);
- status = ocfs2_read_block(orphan_dir_inode,
- OCFS2_I(orphan_dir_inode)->ip_blkno,
- &orphan_dir_bh);
+ status = ocfs2_read_inode_block(orphan_dir_inode, &orphan_dir_bh);
if (status < 0) {
mlog_errno(status);
goto leave;
}
- status = ocfs2_journal_access(handle, orphan_dir_inode, orphan_dir_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, orphan_dir_inode, orphan_dir_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto leave;
@@ -1850,8 +1986,8 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
goto leave;
}
- status = ocfs2_journal_access(handle,orphan_dir_inode, orphan_dir_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle,orphan_dir_inode, orphan_dir_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto leave;
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 3fed9e3d8992..ad5c24a29edd 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -161,6 +161,7 @@ enum ocfs2_vol_state
{
VOLUME_INIT = 0,
VOLUME_MOUNTED,
+ VOLUME_MOUNTED_QUOTAS,
VOLUME_DISMOUNTED,
VOLUME_DISABLED
};
@@ -195,6 +196,9 @@ enum ocfs2_mount_options
OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */
OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */
OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */
+ OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* POSIX access control lists */
+ OCFS2_MOUNT_USRQUOTA = 1 << 9, /* We support user quotas */
+ OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */
};
#define OCFS2_OSB_SOFT_RO 0x0001
@@ -205,6 +209,7 @@ enum ocfs2_mount_options
struct ocfs2_journal;
struct ocfs2_slot_info;
struct ocfs2_recovery_map;
+struct ocfs2_quota_recovery;
struct ocfs2_super
{
struct task_struct *commit_task;
@@ -286,10 +291,11 @@ struct ocfs2_super
char *local_alloc_debug_buf;
#endif
- /* Next two fields are for local node slot recovery during
+ /* Next three fields are for local node slot recovery during
* mount. */
int dirty;
struct ocfs2_dinode *local_alloc_copy;
+ struct ocfs2_quota_recovery *quota_rec;
struct ocfs2_alloc_stats alloc_stats;
char dev_str[20]; /* "major,minor" of the device */
@@ -333,6 +339,10 @@ struct ocfs2_super
#define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info)
+/* Useful typedef for passing around journal access functions */
+typedef int (*ocfs2_journal_access_func)(handle_t *handle, struct inode *inode,
+ struct buffer_head *bh, int type);
+
static inline int ocfs2_should_order_data(struct inode *inode)
{
if (!S_ISREG(inode->i_mode))
@@ -376,6 +386,13 @@ static inline int ocfs2_supports_xattr(struct ocfs2_super *osb)
return 0;
}
+static inline int ocfs2_meta_ecc(struct ocfs2_super *osb)
+{
+ if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_META_ECC)
+ return 1;
+ return 0;
+}
+
/* set / clear functions because cluster events can make these happen
* in parallel so we want the transitions to be atomic. this also
* means that any future flags osb_flags must be protected by spinlock
@@ -443,39 +460,19 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
#define OCFS2_IS_VALID_DINODE(ptr) \
(!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE))
-#define OCFS2_RO_ON_INVALID_DINODE(__sb, __di) do { \
- typeof(__di) ____di = (__di); \
- ocfs2_error((__sb), \
- "Dinode # %llu has bad signature %.*s", \
- (unsigned long long)le64_to_cpu((____di)->i_blkno), 7, \
- (____di)->i_signature); \
-} while (0)
-
#define OCFS2_IS_VALID_EXTENT_BLOCK(ptr) \
(!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE))
-#define OCFS2_RO_ON_INVALID_EXTENT_BLOCK(__sb, __eb) do { \
- typeof(__eb) ____eb = (__eb); \
- ocfs2_error((__sb), \
- "Extent Block # %llu has bad signature %.*s", \
- (unsigned long long)le64_to_cpu((____eb)->h_blkno), 7, \
- (____eb)->h_signature); \
-} while (0)
-
#define OCFS2_IS_VALID_GROUP_DESC(ptr) \
(!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE))
-#define OCFS2_RO_ON_INVALID_GROUP_DESC(__sb, __gd) do { \
- typeof(__gd) ____gd = (__gd); \
- ocfs2_error((__sb), \
- "Group Descriptor # %llu has bad signature %.*s", \
- (unsigned long long)le64_to_cpu((____gd)->bg_blkno), 7, \
- (____gd)->bg_signature); \
-} while (0)
#define OCFS2_IS_VALID_XATTR_BLOCK(ptr) \
(!strcmp((ptr)->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE))
+#define OCFS2_IS_VALID_DIR_TRAILER(ptr) \
+ (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE))
+
static inline unsigned long ino_from_blkno(struct super_block *sb,
u64 blkno)
{
@@ -632,5 +629,6 @@ static inline s16 ocfs2_get_inode_steal_slot(struct ocfs2_super *osb)
#define ocfs2_clear_bit ext2_clear_bit
#define ocfs2_test_bit ext2_test_bit
#define ocfs2_find_next_zero_bit ext2_find_next_zero_bit
+#define ocfs2_find_next_bit ext2_find_next_bit
#endif /* OCFS2_H */
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 5e0c0d0aef7d..c7ae45aaa36c 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -65,6 +65,7 @@
#define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01"
#define OCFS2_GROUP_DESC_SIGNATURE "GROUP01"
#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01"
+#define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1"
/* Compatibility flags */
#define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \
@@ -93,8 +94,11 @@
| OCFS2_FEATURE_INCOMPAT_INLINE_DATA \
| OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \
| OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \
- | OCFS2_FEATURE_INCOMPAT_XATTR)
-#define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN
+ | OCFS2_FEATURE_INCOMPAT_XATTR \
+ | OCFS2_FEATURE_INCOMPAT_META_ECC)
+#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
+ | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
+ | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
/*
* Heartbeat-only devices are missing journals and other files. The
@@ -147,6 +151,9 @@
/* Support for extended attributes */
#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200
+/* Metadata checksum and error correction */
+#define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800
+
/*
* backup superblock flag is used to indicate that this volume
* has backup superblocks.
@@ -163,6 +170,12 @@
*/
#define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 0x0001
+/*
+ * Maintain quota information for this filesystem
+ */
+#define OCFS2_FEATURE_RO_COMPAT_USRQUOTA 0x0002
+#define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA 0x0004
+
/* The byte offset of the first backup block will be 1G.
* The following will be 4G, 16G, 64G, 256G and 1T.
*/
@@ -192,6 +205,7 @@
#define OCFS2_HEARTBEAT_FL (0x00000200) /* Heartbeat area */
#define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */
#define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */
+#define OCFS2_QUOTA_FL (0x00001000) /* Quota file */
/*
* Flags on ocfs2_dinode.i_dyn_features
@@ -329,13 +343,17 @@ enum {
#define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE
HEARTBEAT_SYSTEM_INODE,
GLOBAL_BITMAP_SYSTEM_INODE,
-#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GLOBAL_BITMAP_SYSTEM_INODE
+ USER_QUOTA_SYSTEM_INODE,
+ GROUP_QUOTA_SYSTEM_INODE,
+#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE
ORPHAN_DIR_SYSTEM_INODE,
EXTENT_ALLOC_SYSTEM_INODE,
INODE_ALLOC_SYSTEM_INODE,
JOURNAL_SYSTEM_INODE,
LOCAL_ALLOC_SYSTEM_INODE,
TRUNCATE_LOG_SYSTEM_INODE,
+ LOCAL_USER_QUOTA_SYSTEM_INODE,
+ LOCAL_GROUP_QUOTA_SYSTEM_INODE,
NUM_SYSTEM_INODES
};
@@ -349,6 +367,8 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
[SLOT_MAP_SYSTEM_INODE] = { "slot_map", 0, S_IFREG | 0644 },
[HEARTBEAT_SYSTEM_INODE] = { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 },
[GLOBAL_BITMAP_SYSTEM_INODE] = { "global_bitmap", 0, S_IFREG | 0644 },
+ [USER_QUOTA_SYSTEM_INODE] = { "aquota.user", OCFS2_QUOTA_FL, S_IFREG | 0644 },
+ [GROUP_QUOTA_SYSTEM_INODE] = { "aquota.group", OCFS2_QUOTA_FL, S_IFREG | 0644 },
/* Slot-specific system inodes (one copy per slot) */
[ORPHAN_DIR_SYSTEM_INODE] = { "orphan_dir:%04d", 0, S_IFDIR | 0755 },
@@ -356,7 +376,9 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = {
[INODE_ALLOC_SYSTEM_INODE] = { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 },
[JOURNAL_SYSTEM_INODE] = { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 },
[LOCAL_ALLOC_SYSTEM_INODE] = { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 },
- [TRUNCATE_LOG_SYSTEM_INODE] = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 }
+ [TRUNCATE_LOG_SYSTEM_INODE] = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 },
+ [LOCAL_USER_QUOTA_SYSTEM_INODE] = { "aquota.user:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 },
+ [LOCAL_GROUP_QUOTA_SYSTEM_INODE] = { "aquota.group:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 },
};
/* Parameter passed from mount.ocfs2 to module */
@@ -410,6 +432,22 @@ static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
#define OCFS2_RAW_SB(dinode) (&((dinode)->id2.i_super))
/*
+ * Block checking structure. This is used in metadata to validate the
+ * contents. If OCFS2_FEATURE_INCOMPAT_META_ECC is not set, it is all
+ * zeros.
+ */
+struct ocfs2_block_check {
+/*00*/ __le32 bc_crc32e; /* 802.3 Ethernet II CRC32 */
+ __le16 bc_ecc; /* Single-error-correction parity vector.
+ This is a simple Hamming code dependent
+ on the blocksize. OCFS2's maximum
+ blocksize, 4K, requires 16 parity bits,
+ so we fit in __le16. */
+ __le16 bc_reserved1;
+/*08*/
+};
+
+/*
* On disk extent record for OCFS2
* It describes a range of clusters on disk.
*
@@ -496,7 +534,7 @@ struct ocfs2_truncate_log {
struct ocfs2_extent_block
{
/*00*/ __u8 h_signature[8]; /* Signature for verification */
- __le64 h_reserved1;
+ struct ocfs2_block_check h_check; /* Error checking */
/*10*/ __le16 h_suballoc_slot; /* Slot suballocator this
extent_header belongs to */
__le16 h_suballoc_bit; /* Bit offset in suballocator
@@ -666,7 +704,8 @@ struct ocfs2_dinode {
was set in i_flags */
__le16 i_dyn_features;
__le64 i_xattr_loc;
-/*80*/ __le64 i_reserved2[7];
+/*80*/ struct ocfs2_block_check i_check; /* Error checking */
+/*88*/ __le64 i_reserved2[6];
/*B8*/ union {
__le64 i_pad1; /* Generic way to refer to this
64bit union */
@@ -715,6 +754,34 @@ struct ocfs2_dir_entry {
} __attribute__ ((packed));
/*
+ * Per-block record for the unindexed directory btree. This is carefully
+ * crafted so that the rec_len and name_len records of an ocfs2_dir_entry are
+ * mirrored. That way, the directory manipulation code needs a minimal amount
+ * of update.
+ *
+ * NOTE: Keep this structure aligned to a multiple of 4 bytes.
+ */
+struct ocfs2_dir_block_trailer {
+/*00*/ __le64 db_compat_inode; /* Always zero. Was inode */
+
+ __le16 db_compat_rec_len; /* Backwards compatible with
+ * ocfs2_dir_entry. */
+ __u8 db_compat_name_len; /* Always zero. Was name_len */
+ __u8 db_reserved0;
+ __le16 db_reserved1;
+ __le16 db_free_rec_len; /* Size of largest empty hole
+ * in this block. (unused) */
+/*10*/ __u8 db_signature[8]; /* Signature for verification */
+ __le64 db_reserved2;
+ __le64 db_free_next; /* Next block in list (unused) */
+/*20*/ __le64 db_blkno; /* Offset on disk, in blocks */
+ __le64 db_parent_dinode; /* dinode which owns me, in
+ blocks */
+/*30*/ struct ocfs2_block_check db_check; /* Error checking */
+/*40*/
+};
+
+/*
* On disk allocator group structure for OCFS2
*/
struct ocfs2_group_desc
@@ -733,7 +800,8 @@ struct ocfs2_group_desc
/*20*/ __le64 bg_parent_dinode; /* dinode which owns me, in
blocks */
__le64 bg_blkno; /* Offset on disk, in blocks */
-/*30*/ __le64 bg_reserved2[2];
+/*30*/ struct ocfs2_block_check bg_check; /* Error checking */
+ __le64 bg_reserved2;
/*40*/ __u8 bg_bitmap[0];
};
@@ -776,7 +844,12 @@ struct ocfs2_xattr_header {
in this extent record,
only valid in the first
bucket. */
- __le64 xh_csum;
+ struct ocfs2_block_check xh_check; /* Error checking
+ (Note, this is only
+ used for xattr
+ buckets. A block uses
+ xb_check and sets
+ this field to zero.) */
struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */
};
@@ -827,7 +900,7 @@ struct ocfs2_xattr_block {
block group */
__le32 xb_fs_generation; /* Must match super block */
/*10*/ __le64 xb_blkno; /* Offset on disk, in blocks */
- __le64 xb_csum;
+ struct ocfs2_block_check xb_check; /* Error checking */
/*20*/ __le16 xb_flags; /* Indicates whether this block contains
real xattr or a xattr tree. */
__le16 xb_reserved0;
@@ -868,6 +941,128 @@ static inline int ocfs2_xattr_get_type(struct ocfs2_xattr_entry *xe)
return xe->xe_type & OCFS2_XATTR_TYPE_MASK;
}
+/*
+ * On disk structures for global quota file
+ */
+
+/* Magic numbers and known versions for global quota files */
+#define OCFS2_GLOBAL_QMAGICS {\
+ 0x0cf52470, /* USRQUOTA */ \
+ 0x0cf52471 /* GRPQUOTA */ \
+}
+
+#define OCFS2_GLOBAL_QVERSIONS {\
+ 0, \
+ 0, \
+}
+
+
+/* Each block of each quota file has a certain fixed number of bytes reserved
+ * for OCFS2 internal use at its end. OCFS2 can use it for things like
+ * checksums, etc. */
+#define OCFS2_QBLK_RESERVED_SPACE 8
+
+/* Generic header of all quota files (global and local); the info header follows it */
+struct ocfs2_disk_dqheader {
+ __le32 dqh_magic; /* Magic number identifying file (see OCFS2_*_QMAGICS) */
+ __le32 dqh_version; /* Quota format version (see OCFS2_*_QVERSIONS) */
+};
+
+#define OCFS2_GLOBAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader))
+
+/* Information header of global quota file (immediately follows the generic
+ * header, i.e. it starts at OCFS2_GLOBAL_INFO_OFF). All fields little-endian. */
+struct ocfs2_global_disk_dqinfo {
+/*00*/ __le32 dqi_bgrace; /* Grace time for space softlimit excess */
+ __le32 dqi_igrace; /* Grace time for inode softlimit excess */
+ __le32 dqi_syncms; /* Time (in milliseconds) after which we sync local changes to
+ * global quota file */
+ __le32 dqi_blocks; /* Number of blocks in quota file */
+/*10*/ __le32 dqi_free_blk; /* First free block in quota file */
+ __le32 dqi_free_entry; /* First block with free dquot entry in quota
+ * file */
+};
+
+/* Structure with global user / group information. We reserve some space
+ * for future use (dqb_pad1, dqb_pad2). All fields are little-endian. */
+struct ocfs2_global_disk_dqblk {
+/*00*/ __le32 dqb_id; /* ID the structure belongs to */
+ __le32 dqb_use_count; /* Number of nodes having reference to this structure */
+ __le64 dqb_ihardlimit; /* absolute limit on allocated inodes */
+/*10*/ __le64 dqb_isoftlimit; /* preferred inode limit */
+ __le64 dqb_curinodes; /* current # allocated inodes */
+/*20*/ __le64 dqb_bhardlimit; /* absolute limit on disk space */
+ __le64 dqb_bsoftlimit; /* preferred limit on disk space */
+/*30*/ __le64 dqb_curspace; /* current space occupied */
+ __le64 dqb_btime; /* time limit for excessive disk use */
+/*40*/ __le64 dqb_itime; /* time limit for excessive inode use */
+ __le64 dqb_pad1; /* reserved for future use */
+/*50*/ __le64 dqb_pad2; /* reserved for future use */
+};
+
+/*
+ * On-disk structures for local quota file
+ */
+
+/* Magic numbers and known versions for local quota files */
+#define OCFS2_LOCAL_QMAGICS {\
+ 0x0cf524c0, /* USRQUOTA */ \
+ 0x0cf524c1 /* GRPQUOTA */ \
+}
+
+#define OCFS2_LOCAL_QVERSIONS {\
+ 0, \
+ 0, \
+}
+
+/* Quota flags in dqinfo header */
+#define OLQF_CLEAN 0x0001 /* Quota file is empty (this should be after\
+ * quota has been cleanly turned off) */
+
+#define OCFS2_LOCAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader))
+
+/* Information header of local quota file (immediately follows the generic
+ * header, i.e. it starts at OCFS2_LOCAL_INFO_OFF). All fields little-endian. */
+struct ocfs2_local_disk_dqinfo {
+ __le32 dqi_flags; /* Flags for quota file (OLQF_* values) */
+ __le32 dqi_chunks; /* Number of chunks of quota structures
+ * with a bitmap */
+ __le32 dqi_blocks; /* Number of blocks allocated for quota file */
+};
+
+/* Header of one chunk of a quota file. The bitmap uses the exported __u8
+ * type, like every other byte field of the on-disk structures in this file. */
+struct ocfs2_local_disk_chunk {
+	__le32 dqc_free;	/* Number of free entries in the bitmap */
+	__u8 dqc_bitmap[0];	/* Bitmap of entries in the corresponding
+				 * chunk of quota file */
+};
+
+/* One entry in local quota file (three little-endian 64-bit fields) */
+struct ocfs2_local_disk_dqblk {
+/*00*/ __le64 dqb_id; /* id this quota applies to */
+ __le64 dqb_spacemod; /* Change in the amount of used space */
+/*10*/ __le64 dqb_inodemod; /* Change in the amount of used inodes */
+};
+
+
+/*
+ * The quota trailer lives at the end of each quota block.
+ */
+
+struct ocfs2_disk_dqtrailer { /* Occupies the reserved tail of a quota block; located by ocfs2_block_dqtrailer() */
+/*00*/ struct ocfs2_block_check dq_check; /* Error checking */
+/*08*/ /* Cannot be larger than OCFS2_QBLK_RESERVED_SPACE */
+};
+
+/*
+ * Return the quota trailer of the block held in @buf: it starts
+ * OCFS2_QBLK_RESERVED_SPACE bytes before the end of a @blocksize-byte block.
+ */
+static inline struct ocfs2_disk_dqtrailer *ocfs2_block_dqtrailer(int blocksize,
+								 void *buf)
+{
+	int trailer_off = blocksize - OCFS2_QBLK_RESERVED_SPACE;
+
+	return (struct ocfs2_disk_dqtrailer *)((char *)buf + trailer_off);
+}
+
#ifdef __KERNEL__
static inline int ocfs2_fast_symlink_chars(struct super_block *sb)
{
diff --git a/fs/ocfs2/ocfs2_jbd_compat.h b/fs/ocfs2/ocfs2_jbd_compat.h
deleted file mode 100644
index b91c78f8f558..000000000000
--- a/fs/ocfs2/ocfs2_jbd_compat.h
+++ /dev/null
@@ -1,82 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * ocfs2_jbd_compat.h
- *
- * Compatibility defines for JBD.
- *
- * Copyright (C) 2008 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- */
-
-#ifndef OCFS2_JBD_COMPAT_H
-#define OCFS2_JBD_COMPAT_H
-
-#ifndef CONFIG_OCFS2_COMPAT_JBD
-# error Should not have been included
-#endif
-
-struct jbd2_inode {
- unsigned int dummy;
-};
-
-#define JBD2_BARRIER JFS_BARRIER
-#define JBD2_DEFAULT_MAX_COMMIT_AGE JBD_DEFAULT_MAX_COMMIT_AGE
-
-#define jbd2_journal_ack_err journal_ack_err
-#define jbd2_journal_clear_err journal_clear_err
-#define jbd2_journal_destroy journal_destroy
-#define jbd2_journal_dirty_metadata journal_dirty_metadata
-#define jbd2_journal_errno journal_errno
-#define jbd2_journal_extend journal_extend
-#define jbd2_journal_flush journal_flush
-#define jbd2_journal_force_commit journal_force_commit
-#define jbd2_journal_get_write_access journal_get_write_access
-#define jbd2_journal_get_undo_access journal_get_undo_access
-#define jbd2_journal_init_inode journal_init_inode
-#define jbd2_journal_invalidatepage journal_invalidatepage
-#define jbd2_journal_load journal_load
-#define jbd2_journal_lock_updates journal_lock_updates
-#define jbd2_journal_restart journal_restart
-#define jbd2_journal_start journal_start
-#define jbd2_journal_start_commit journal_start_commit
-#define jbd2_journal_stop journal_stop
-#define jbd2_journal_try_to_free_buffers journal_try_to_free_buffers
-#define jbd2_journal_unlock_updates journal_unlock_updates
-#define jbd2_journal_wipe journal_wipe
-#define jbd2_log_wait_commit log_wait_commit
-
-static inline int jbd2_journal_file_inode(handle_t *handle,
- struct jbd2_inode *inode)
-{
- return 0;
-}
-
-static inline int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode,
- loff_t new_size)
-{
- return 0;
-}
-
-static inline void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode,
- struct inode *inode)
-{
- return;
-}
-
-static inline void jbd2_journal_release_jbd_inode(journal_t *journal,
- struct jbd2_inode *jinode)
-{
- return;
-}
-
-
-#endif /* OCFS2_JBD_COMPAT_H */
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index 82c200f7a8f1..eb6f50c9ceca 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -46,6 +46,7 @@ enum ocfs2_lock_type {
OCFS2_LOCK_TYPE_DENTRY,
OCFS2_LOCK_TYPE_OPEN,
OCFS2_LOCK_TYPE_FLOCK,
+ OCFS2_LOCK_TYPE_QINFO,
OCFS2_NUM_LOCK_TYPES
};
@@ -77,6 +78,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
case OCFS2_LOCK_TYPE_FLOCK:
c = 'F';
break;
+ case OCFS2_LOCK_TYPE_QINFO:
+ c = 'Q';
+ break;
default:
c = '\0';
}
@@ -95,6 +99,7 @@ static char *ocfs2_lock_type_strings[] = {
[OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
[OCFS2_LOCK_TYPE_OPEN] = "Open",
[OCFS2_LOCK_TYPE_FLOCK] = "Flock",
+ [OCFS2_LOCK_TYPE_QINFO] = "Quota",
};
static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
new file mode 100644
index 000000000000..7365e2e08706
--- /dev/null
+++ b/fs/ocfs2/quota.h
@@ -0,0 +1,119 @@
+/*
+ * quota.h for OCFS2
+ *
+ * On disk quota structures for local and global quota file, in-memory
+ * structures.
+ *
+ */
+
+#ifndef _OCFS2_QUOTA_H
+#define _OCFS2_QUOTA_H
+
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/quota.h>
+#include <linux/list.h>
+#include <linux/dqblk_qtree.h>
+
+#include "ocfs2.h"
+
+/* Common stuff */
+/* id number of quota format */
+#define QFMT_OCFS2 3
+
+/*
+ * In-memory structures
+ */
+struct ocfs2_dquot { /* OCFS2 view of a dquot; obtained from the generic one with OCFS2_DQUOT() */
+ struct dquot dq_dquot; /* Generic VFS dquot (embedded, OCFS2_DQUOT() uses container_of() on it) */
+ loff_t dq_local_off; /* Offset in the local quota file */
+ struct ocfs2_quota_chunk *dq_chunk; /* Chunk dquot is in */
+ unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file (in-memory copy of dqb_use_count) */
+ s64 dq_origspace; /* Last globally synced space usage */
+ s64 dq_originodes; /* Last globally synced inode usage */
+};
+
+/* In-memory description of one quota file chunk that has entries to recover */
+struct ocfs2_recovery_chunk {
+ struct list_head rc_list; /* List of chunks (presumably linked into ocfs2_quota_recovery.r_list[] - confirm) */
+ int rc_chunk; /* Chunk number */
+ unsigned long *rc_bitmap; /* Bitmap of entries to recover */
+};
+
+struct ocfs2_quota_recovery { /* Recovery state handed between ocfs2_begin/finish/free_quota_recovery() */
+ struct list_head r_list[MAXQUOTAS]; /* List of chunks to recover, one list head per quota type */
+};
+
+/* In-memory structure with quota header information (one per quota type, see dqi_type) */
+struct ocfs2_mem_dqinfo {
+ unsigned int dqi_type; /* Quota type this structure describes */
+ unsigned int dqi_chunks; /* Number of chunks in local quota file */
+ unsigned int dqi_blocks; /* Number of blocks allocated for local quota file */
+ unsigned int dqi_syncms; /* How often should we sync with other nodes (in milliseconds) */
+ unsigned int dqi_syncjiff; /* Precomputed dqi_syncms in jiffies */
+ struct list_head dqi_chunk; /* List of chunks */
+ struct inode *dqi_gqinode; /* Global quota file inode */
+ struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */
+ struct buffer_head *dqi_gqi_bh; /* Buffer head with global quota file inode - set only if inode lock is obtained (see ocfs2_lock_global_qf()) */
+ int dqi_gqi_count; /* Number of holders of dqi_gqi_bh; changed under dq_data_lock */
+ struct buffer_head *dqi_lqi_bh; /* Buffer head with local quota file inode */
+ struct buffer_head *dqi_ibh; /* Buffer with information header */
+ struct qtree_mem_dqinfo dqi_gi; /* Info about global file */
+ struct delayed_work dqi_sync_work; /* Work for syncing dquots (runs qsync_work_fn()) */
+ struct ocfs2_quota_recovery *dqi_rec; /* Pointer to recovery
+ * information, in case we
+ * enable quotas on file
+ * needing it */
+};
+
+/* Map a generic dquot to the ocfs2_dquot that embeds it as dq_dquot. */
+static inline struct ocfs2_dquot *OCFS2_DQUOT(struct dquot *dquot)
+{
+	struct ocfs2_dquot *odquot;
+
+	odquot = container_of(dquot, struct ocfs2_dquot, dq_dquot);
+	return odquot;
+}
+
+struct ocfs2_quota_chunk { /* In-memory handle of one chunk of a quota file */
+ struct list_head qc_chunk; /* List of quotafile chunks (presumably linked into ocfs2_mem_dqinfo.dqi_chunk - confirm) */
+ int qc_num; /* Number of quota chunk */
+ struct buffer_head *qc_headerbh; /* Buffer head with chunk header */
+};
+
+extern struct kmem_cache *ocfs2_dquot_cachep;
+extern struct kmem_cache *ocfs2_qf_chunk_cachep;
+
+extern struct qtree_fmt_operations ocfs2_global_ops;
+
+struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
+ struct ocfs2_super *osb, int slot_num);
+int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
+ struct ocfs2_quota_recovery *rec,
+ int slot_num);
+void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec);
+ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
+ size_t len, loff_t off);
+ssize_t ocfs2_quota_write(struct super_block *sb, int type,
+ const char *data, size_t len, loff_t off);
+int ocfs2_global_read_info(struct super_block *sb, int type);
+int ocfs2_global_write_info(struct super_block *sb, int type);
+int ocfs2_global_read_dquot(struct dquot *dquot);
+int __ocfs2_sync_dquot(struct dquot *dquot, int freeing);
+/* Sync @dquot with the global quota file while keeping our reference to it. */
+static inline int ocfs2_sync_dquot(struct dquot *dquot)
+{
+	const int freeing = 0;
+
+	return __ocfs2_sync_dquot(dquot, freeing);
+}
+/* Sync @dquot with the global quota file and drop our reference to it. */
+static inline int ocfs2_global_release_dquot(struct dquot *dquot)
+{
+	const int freeing = 1;
+
+	return __ocfs2_sync_dquot(dquot, freeing);
+}
+
+int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
+void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex);
+int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
+ struct buffer_head **bh);
+
+extern struct dquot_operations ocfs2_quota_operations;
+extern struct quota_format_type ocfs2_quota_format;
+
+int ocfs2_quota_setup(void);
+void ocfs2_quota_shutdown(void);
+
+#endif /* _OCFS2_QUOTA_H */
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
new file mode 100644
index 000000000000..6aff8f2d3e49
--- /dev/null
+++ b/fs/ocfs2/quota_global.c
@@ -0,0 +1,1025 @@
+/*
+ * Implementation of operations over global quota file
+ */
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/quota.h>
+#include <linux/quotaops.h>
+#include <linux/dqblk_qtree.h>
+#include <linux/jiffies.h>
+#include <linux/writeback.h>
+#include <linux/workqueue.h>
+
+#define MLOG_MASK_PREFIX ML_QUOTA
+#include <cluster/masklog.h>
+
+#include "ocfs2_fs.h"
+#include "ocfs2.h"
+#include "alloc.h"
+#include "blockcheck.h"
+#include "inode.h"
+#include "journal.h"
+#include "file.h"
+#include "sysfile.h"
+#include "dlmglue.h"
+#include "uptodate.h"
+#include "quota.h"
+
+static struct workqueue_struct *ocfs2_quota_wq = NULL;
+
+static void qsync_work_fn(struct work_struct *work);
+
+/*
+ * Fill the in-memory quota data of @dquot from the on-disk global entry @dp.
+ * A field group is left alone while its DQ_LASTSET_B bit is set in dq_flags,
+ * so that values set by the admin are not overwritten from disk. The node
+ * use count is always taken from disk.
+ */
+static void ocfs2_global_disk2memdqb(struct dquot *dquot, void *dp)
+{
+	struct mem_dqblk *mem = &dquot->dq_dqb;
+	struct ocfs2_global_disk_dqblk *disk = dp;
+
+	/* Inode limits */
+	if (!test_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags)) {
+		mem->dqb_ihardlimit = le64_to_cpu(disk->dqb_ihardlimit);
+		mem->dqb_isoftlimit = le64_to_cpu(disk->dqb_isoftlimit);
+	}
+	/* Inode usage */
+	if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags)) {
+		mem->dqb_curinodes = le64_to_cpu(disk->dqb_curinodes);
+	}
+	/* Space limits */
+	if (!test_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags)) {
+		mem->dqb_bhardlimit = le64_to_cpu(disk->dqb_bhardlimit);
+		mem->dqb_bsoftlimit = le64_to_cpu(disk->dqb_bsoftlimit);
+	}
+	/* Space usage */
+	if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags)) {
+		mem->dqb_curspace = le64_to_cpu(disk->dqb_curspace);
+	}
+	/* Grace times */
+	if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags)) {
+		mem->dqb_btime = le64_to_cpu(disk->dqb_btime);
+	}
+	if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags)) {
+		mem->dqb_itime = le64_to_cpu(disk->dqb_itime);
+	}
+
+	OCFS2_DQUOT(dquot)->dq_use_count = le32_to_cpu(disk->dqb_use_count);
+}
+
+/*
+ * Encode the in-memory state of @dquot into the little-endian on-disk
+ * global entry @dp. The two pad fields of the entry are not written here.
+ */
+static void ocfs2_global_mem2diskdqb(void *dp, struct dquot *dquot)
+{
+	struct mem_dqblk *mem = &dquot->dq_dqb;
+	struct ocfs2_global_disk_dqblk *disk = dp;
+
+	/* Identity and node reference count */
+	disk->dqb_id = cpu_to_le32(dquot->dq_id);
+	disk->dqb_use_count = cpu_to_le32(OCFS2_DQUOT(dquot)->dq_use_count);
+
+	/* Inode limits and usage */
+	disk->dqb_ihardlimit = cpu_to_le64(mem->dqb_ihardlimit);
+	disk->dqb_isoftlimit = cpu_to_le64(mem->dqb_isoftlimit);
+	disk->dqb_curinodes = cpu_to_le64(mem->dqb_curinodes);
+
+	/* Space limits and usage */
+	disk->dqb_bhardlimit = cpu_to_le64(mem->dqb_bhardlimit);
+	disk->dqb_bsoftlimit = cpu_to_le64(mem->dqb_bsoftlimit);
+	disk->dqb_curspace = cpu_to_le64(mem->dqb_curspace);
+
+	/* Grace times */
+	disk->dqb_btime = cpu_to_le64(mem->dqb_btime);
+	disk->dqb_itime = cpu_to_le64(mem->dqb_itime);
+}
+
+/*
+ * Return nonzero when the on-disk entry @dp is in use and carries the id of
+ * @dquot, 0 otherwise.
+ */
+static int ocfs2_global_is_id(void *dp, struct dquot *dquot)
+{
+	struct mem_dqinfo *minfo = sb_dqinfo(dquot->dq_sb, dquot->dq_type);
+	struct ocfs2_mem_dqinfo *oinfo = minfo->dqi_priv;
+	struct ocfs2_global_disk_dqblk *entry = dp;
+
+	return !qtree_entry_unused(&oinfo->dqi_gi, dp) &&
+	       le32_to_cpu(entry->dqb_id) == dquot->dq_id;
+}
+
+/* Entry conversion and matching callbacks for the global quota file tree */
+struct qtree_fmt_operations ocfs2_global_ops = {
+	.is_id		= ocfs2_global_is_id,
+	.disk2mem_dqblk	= ocfs2_global_disk2memdqb,
+	.mem2disk_dqblk	= ocfs2_global_mem2diskdqb,
+};
+
+/*
+ * Validation callback for quota blocks: check the block in @bh against the
+ * ecc stored in its trailer. The buffer must already be uptodate.
+ */
+static int ocfs2_validate_quota_block(struct super_block *sb,
+				      struct buffer_head *bh)
+{
+	struct ocfs2_disk_dqtrailer *trailer;
+
+	mlog(0, "Validating quota block %llu\n",
+	     (unsigned long long)bh->b_blocknr);
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	trailer = ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data);
+
+	/*
+	 * An ecc failure is reported to the caller, but the filesystem is
+	 * left running: any error is local to this block.
+	 */
+	return ocfs2_validate_meta_ecc(sb, bh->b_data, &trailer->dq_check);
+}
+
+/*
+ * Read virtual block @v_block of the quota file @inode, validating it with
+ * ocfs2_validate_quota_block(). If *@bh is NULL on entry it receives the
+ * buffer head obtained by the read; a caller-supplied buffer head is passed
+ * down and left in place. Returns 0 or the error from the read.
+ */
+int ocfs2_read_quota_block(struct inode *inode, u64 v_block,
+			   struct buffer_head **bh)
+{
+	struct buffer_head *found = *bh;
+	int rc;
+
+	rc = ocfs2_read_virt_blocks(inode, v_block, 1, &found, 0,
+				    ocfs2_validate_quota_block);
+	if (rc) {
+		mlog_errno(rc);
+		return rc;
+	}
+
+	/* Hand a newly obtained bh up to the caller. */
+	if (!*bh)
+		*bh = found;
+
+	return 0;
+}
+
+/*
+ * Map logical block @block of the quota file @inode to its physical block
+ * and get a buffer head for it with sb_getblk(). ip_alloc_sem is held for
+ * reading around the extent map lookup.
+ *
+ * Returns 0 with *@bh set, the error from the extent map lookup, or -EIO
+ * when sb_getblk() fails.
+ */
+static int ocfs2_get_quota_block(struct inode *inode, int block,
+				 struct buffer_head **bh)
+{
+	u64 pblock, pcount;
+	int err;
+
+	down_read(&OCFS2_I(inode)->ip_alloc_sem);
+	err = ocfs2_extent_map_get_blocks(inode, block, &pblock, &pcount, NULL);
+	up_read(&OCFS2_I(inode)->ip_alloc_sem);
+	if (err) {
+		mlog_errno(err);
+		return err;
+	}
+	*bh = sb_getblk(inode->i_sb, pblock);
+	if (!*bh) {
+		err = -EIO;
+		mlog_errno(err);
+	}
+	return err;
+}
+
+/*
+ * Copy up to @len bytes starting at byte offset @off of the global quota
+ * file into @data. Blocks are read one at a time with
+ * ocfs2_read_quota_block() instead of going through the pagecache, because
+ * we cannot afford acquiring the locks. The quota cluster lock serializes
+ * operations; the caller is responsible for acquiring it.
+ *
+ * Returns the number of bytes copied (the request is trimmed at i_size and
+ * is 0 when @off is at or beyond it), or the error from reading a block.
+ */
+ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data,
+			 size_t len, loff_t off)
+{
+	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+	struct inode *gqinode = oinfo->dqi_gqinode;
+	loff_t file_size = i_size_read(gqinode);
+	int in_block = off & (sb->s_blocksize - 1);
+	sector_t cur_blk = off >> sb->s_blocksize_bits;
+	struct buffer_head *bh;
+	size_t remaining, chunk;
+	int err = 0;
+
+	if (off > file_size)
+		return 0;
+	if (off + len > file_size)
+		len = file_size - off;
+
+	for (remaining = len; remaining > 0;
+	     remaining -= chunk, data += chunk, cur_blk++) {
+		chunk = min_t(size_t, (sb->s_blocksize - in_block), remaining);
+		bh = NULL;
+		err = ocfs2_read_quota_block(gqinode, cur_blk, &bh);
+		if (err) {
+			mlog_errno(err);
+			return err;
+		}
+		memcpy(data, bh->b_data + in_block, chunk);
+		brelse(bh);
+		/* Only the first block can start mid-block */
+		in_block = 0;
+	}
+	return len;
+}
+
+/*
+ * Write @len bytes from @data at byte offset @off of the global quota file.
+ * We know the transaction is already started and has enough credits; the
+ * write is refused with -EIO when no journal handle is current.
+ *
+ * A write never touches the last OCFS2_QBLK_RESERVED_SPACE bytes of a block:
+ * an overlong request triggers WARN_ON() and is clamped. If the write ends
+ * beyond i_size, the file is extended first and the target block is zeroed
+ * before @data is copied in.
+ *
+ * Returns the (possibly clamped) number of bytes written, or an error code.
+ * The quota inode's i_mutex is taken here and released on every return path
+ * that follows, including failure to get the target block.
+ */
+ssize_t ocfs2_quota_write(struct super_block *sb, int type,
+			  const char *data, size_t len, loff_t off)
+{
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+	struct inode *gqinode = oinfo->dqi_gqinode;
+	int offset = off & (sb->s_blocksize - 1);
+	sector_t blk = off >> sb->s_blocksize_bits;
+	int err = 0, new = 0, ja_type;
+	struct buffer_head *bh = NULL;
+	handle_t *handle = journal_current_handle();
+
+	if (!handle) {
+		mlog(ML_ERROR, "Quota write (off=%llu, len=%llu) cancelled "
+		     "because transaction was not started.\n",
+		     (unsigned long long)off, (unsigned long long)len);
+		return -EIO;
+	}
+	/* Keep the write out of the reserved tail of the block */
+	if (len > sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset) {
+		WARN_ON(1);
+		len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset;
+	}
+
+	mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA);
+	if (gqinode->i_size < off + len) {
+		down_write(&OCFS2_I(gqinode)->ip_alloc_sem);
+		err = ocfs2_extend_no_holes(gqinode, off + len, off);
+		up_write(&OCFS2_I(gqinode)->ip_alloc_sem);
+		if (err < 0)
+			goto out;
+		err = ocfs2_simple_size_update(gqinode,
+					       oinfo->dqi_gqi_bh,
+					       off + len);
+		if (err < 0)
+			goto out;
+		new = 1;
+	}
+	/* Not rewriting whole block? */
+	if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) &&
+	    !new) {
+		err = ocfs2_read_quota_block(gqinode, blk, &bh);
+		ja_type = OCFS2_JOURNAL_ACCESS_WRITE;
+	} else {
+		err = ocfs2_get_quota_block(gqinode, blk, &bh);
+		ja_type = OCFS2_JOURNAL_ACCESS_CREATE;
+	}
+	if (err) {
+		/*
+		 * i_mutex is held at this point, so leave through the common
+		 * exit path, which unlocks it and logs the error.
+		 */
+		goto out;
+	}
+	lock_buffer(bh);
+	if (new)
+		memset(bh->b_data, 0, sb->s_blocksize);
+	memcpy(bh->b_data + offset, data, len);
+	flush_dcache_page(bh->b_page);
+	set_buffer_uptodate(bh);
+	unlock_buffer(bh);
+	ocfs2_set_buffer_uptodate(gqinode, bh);
+	err = ocfs2_journal_access_dq(handle, gqinode, bh, ja_type);
+	if (err < 0) {
+		brelse(bh);
+		goto out;
+	}
+	err = ocfs2_journal_dirty(handle, bh);
+	brelse(bh);
+out:
+	if (err) {
+		mutex_unlock(&gqinode->i_mutex);
+		mlog_errno(err);
+		return err;
+	}
+	gqinode->i_version++;
+	ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh);
+	mutex_unlock(&gqinode->i_mutex);
+	return len;
+}
+
+/*
+ * Take the cluster lock on the global quota file inode (exclusive when @ex
+ * is set) and count the holder. The first holder records the inode's buffer
+ * head in dqi_gqi_bh; later holders are expected to get that same buffer
+ * head. Returns 0 or the negative status from ocfs2_inode_lock().
+ */
+int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
+{
+	struct buffer_head *inode_bh = NULL;
+	int ret = ocfs2_inode_lock(oinfo->dqi_gqinode, &inode_bh, ex);
+
+	if (ret < 0)
+		return ret;
+
+	spin_lock(&dq_data_lock);
+	if (oinfo->dqi_gqi_count == 0)
+		oinfo->dqi_gqi_bh = inode_bh;
+	else
+		WARN_ON(inode_bh != oinfo->dqi_gqi_bh);
+	oinfo->dqi_gqi_count++;
+	spin_unlock(&dq_data_lock);
+
+	return 0;
+}
+
+/*
+ * Undo ocfs2_lock_global_qf(): drop the inode cluster lock and one
+ * reference to the inode buffer head, and forget dqi_gqi_bh once the last
+ * holder is gone.
+ */
+void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
+{
+	ocfs2_inode_unlock(oinfo->dqi_gqinode, ex);
+	brelse(oinfo->dqi_gqi_bh);
+
+	spin_lock(&dq_data_lock);
+	oinfo->dqi_gqi_count--;
+	if (oinfo->dqi_gqi_count == 0)
+		oinfo->dqi_gqi_bh = NULL;
+	spin_unlock(&dq_data_lock);
+}
+
+/*
+ * Read information header from global quota file.
+ *
+ * Looks up the global quota system inode for @type, initializes the global
+ * part of the ocfs2_mem_dqinfo hanging off sb_dqinfo(sb, type)->dqi_priv,
+ * reads the on-disk info header under a shared global quota file lock and
+ * queues the periodic sync work.
+ *
+ * Returns 0 on success and a negative error code on failure (-EINVAL when
+ * the inode cannot be obtained, -EIO on a short read). On failure after the
+ * inode lookup, dqi_gqinode stays set in the info structure.
+ */
+int ocfs2_global_read_info(struct super_block *sb, int type)
+{
+	struct inode *gqinode = NULL;
+	unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
+					GROUP_QUOTA_SYSTEM_INODE };
+	struct ocfs2_global_disk_dqinfo dinfo;
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+	int status;
+
+	mlog_entry_void();
+
+	/* Read global header */
+	gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
+			OCFS2_INVALID_SLOT);
+	if (!gqinode) {
+		mlog(ML_ERROR, "failed to get global quota inode (type=%d)\n",
+			type);
+		status = -EINVAL;
+		goto out_err;
+	}
+	oinfo->dqi_gi.dqi_sb = sb;
+	oinfo->dqi_gi.dqi_type = type;
+	ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo);
+	oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk);
+	oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops;
+	oinfo->dqi_gqi_bh = NULL;
+	oinfo->dqi_gqi_count = 0;
+	oinfo->dqi_gqinode = gqinode;
+	status = ocfs2_lock_global_qf(oinfo, 0);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_err;
+	}
+	status = sb->s_op->quota_read(sb, type, (char *)&dinfo,
+				      sizeof(struct ocfs2_global_disk_dqinfo),
+				      OCFS2_GLOBAL_INFO_OFF);
+	ocfs2_unlock_global_qf(oinfo, 0);
+	if (status != sizeof(struct ocfs2_global_disk_dqinfo)) {
+		mlog(ML_ERROR, "Cannot read global quota info (%d).\n",
+		     status);
+		if (status >= 0)
+			status = -EIO;
+		mlog_errno(status);
+		goto out_err;
+	}
+	info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
+	info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
+	oinfo->dqi_syncms = le32_to_cpu(dinfo.dqi_syncms);
+	oinfo->dqi_syncjiff = msecs_to_jiffies(oinfo->dqi_syncms);
+	oinfo->dqi_gi.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
+	oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
+	oinfo->dqi_gi.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
+	oinfo->dqi_gi.dqi_blocksize_bits = sb->s_blocksize_bits;
+	/* The tail of every quota block is reserved (checksum trailer) */
+	oinfo->dqi_gi.dqi_usable_bs = sb->s_blocksize -
+						OCFS2_QBLK_RESERVED_SPACE;
+	oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi);
+	INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn);
+	queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work,
+			   oinfo->dqi_syncjiff);
+	/*
+	 * status still holds the byte count returned by quota_read();
+	 * report plain success instead of leaking it to the caller.
+	 */
+	status = 0;
+
+out_err:
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * Write information to global quota file. Expects exclusive lock on quota
+ * file inode and quota info.
+ *
+ * Clears DQF_INFO_DIRTY and snapshots the grace times under dq_data_lock,
+ * then writes the header at OCFS2_GLOBAL_INFO_OFF through ->quota_write().
+ * Returns 0 on success, -EIO on a short write, or the error returned by
+ * ->quota_write().
+ */
+static int __ocfs2_global_write_info(struct super_block *sb, int type)
+{
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+	struct ocfs2_global_disk_dqinfo dinfo;
+	ssize_t written;
+
+	spin_lock(&dq_data_lock);
+	info->dqi_flags &= ~DQF_INFO_DIRTY;
+	dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace);
+	dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace);
+	spin_unlock(&dq_data_lock);
+
+	dinfo.dqi_syncms = cpu_to_le32(oinfo->dqi_syncms);
+	dinfo.dqi_blocks = cpu_to_le32(oinfo->dqi_gi.dqi_blocks);
+	dinfo.dqi_free_blk = cpu_to_le32(oinfo->dqi_gi.dqi_free_blk);
+	dinfo.dqi_free_entry = cpu_to_le32(oinfo->dqi_gi.dqi_free_entry);
+
+	written = sb->s_op->quota_write(sb, type, (char *)&dinfo,
+					sizeof(dinfo), OCFS2_GLOBAL_INFO_OFF);
+	if (written == sizeof(dinfo))
+		return 0;
+
+	mlog(ML_ERROR, "Cannot write global quota info structure\n");
+	return written >= 0 ? -EIO : written;
+}
+
+/*
+ * Write the global quota info header for @type while holding the quota info
+ * lock exclusively. Returns the locking error if the lock cannot be taken,
+ * otherwise the result of __ocfs2_global_write_info().
+ */
+int ocfs2_global_write_info(struct super_block *sb, int type)
+{
+	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+	int ret = ocfs2_qinfo_lock(oinfo, 1);
+
+	if (ret >= 0) {
+		ret = __ocfs2_global_write_info(sb, type);
+		ocfs2_qinfo_unlock(oinfo, 1);
+	}
+	return ret;
+}
+
+/* Read in information from global quota file and acquire a reference to it.
+ * dquot_acquire() has already started the transaction and locked quota file */
+int ocfs2_global_read_dquot(struct dquot *dquot)
+{
+ int err, err2, ex = 0; /* ex: set once the exclusive qinfo lock is also held */
+ struct ocfs2_mem_dqinfo *info =
+ sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+
+ err = ocfs2_qinfo_lock(info, 0); /* shared lock is enough for reading */
+ if (err < 0)
+ goto out;
+ err = qtree_read_dquot(&info->dqi_gi, dquot);
+ if (err < 0)
+ goto out_qlock;
+ OCFS2_DQUOT(dquot)->dq_use_count++; /* this node now references the entry */
+ OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace; /* baseline for later syncs */
+ OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
+ if (!dquot->dq_off) { /* No real quota entry? */
+ /* Upgrade to exclusive lock for allocation */
+ err = ocfs2_qinfo_lock(info, 1);
+ if (err < 0)
+ goto out_qlock;
+ ex = 1;
+ }
+ err = qtree_write_dquot(&info->dqi_gi, dquot); /* writes back the bumped use count */
+ if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) {
+ err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type);
+ if (!err) /* keep the first error */
+ err = err2;
+ }
+out_qlock:
+ if (ex)
+ ocfs2_qinfo_unlock(info, 1);
+ ocfs2_qinfo_unlock(info, 0); /* the shared lock taken first is always dropped here */
+out:
+ if (err < 0)
+ mlog_errno(err);
+ return err;
+}
+
+/* Sync local information about quota modifications with global quota file.
+ * Caller must have started the transaction and obtained exclusive lock for
+ * global quota file inode. If @freeing is set, our reference is dropped too. */
+int __ocfs2_sync_dquot(struct dquot *dquot, int freeing)
+{
+ int err, err2;
+ struct super_block *sb = dquot->dq_sb;
+ int type = dquot->dq_type;
+ struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv;
+ struct ocfs2_global_disk_dqblk dqblk;
+ s64 spacechange, inodechange;
+ time_t olditime, oldbtime;
+
+ err = sb->s_op->quota_read(sb, type, (char *)&dqblk,
+ sizeof(struct ocfs2_global_disk_dqblk),
+ dquot->dq_off); /* fetch the current global entry */
+ if (err != sizeof(struct ocfs2_global_disk_dqblk)) {
+ if (err >= 0) {
+ mlog(ML_ERROR, "Short read from global quota file "
+ "(%u read)\n", err);
+ err = -EIO;
+ }
+ goto out;
+ }
+
+ /* Update space and inode usage. Get also other information from
+ * global quota file so that we don't overwrite any changes there.
+ * All of this is done under dq_data_lock. */
+ spin_lock(&dq_data_lock);
+ spacechange = dquot->dq_dqb.dqb_curspace -
+ OCFS2_DQUOT(dquot)->dq_origspace; /* local change since the last sync */
+ inodechange = dquot->dq_dqb.dqb_curinodes -
+ OCFS2_DQUOT(dquot)->dq_originodes;
+ olditime = dquot->dq_dqb.dqb_itime; /* remember local grace times before they are overwritten */
+ oldbtime = dquot->dq_dqb.dqb_btime;
+ ocfs2_global_disk2memdqb(dquot, &dqblk);
+ mlog(0, "Syncing global dquot %u space %lld+%lld, inodes %lld+%lld\n",
+ dquot->dq_id, dquot->dq_dqb.dqb_curspace, (long long)spacechange,
+ dquot->dq_dqb.dqb_curinodes, (long long)inodechange);
+ if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags))
+ dquot->dq_dqb.dqb_curspace += spacechange; /* global value plus our local change */
+ if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags))
+ dquot->dq_dqb.dqb_curinodes += inodechange;
+ /* Set properly space grace time... */
+ if (dquot->dq_dqb.dqb_bsoftlimit &&
+ dquot->dq_dqb.dqb_curspace > dquot->dq_dqb.dqb_bsoftlimit) {
+ if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags) &&
+ oldbtime > 0) {
+ if (dquot->dq_dqb.dqb_btime > 0)
+ dquot->dq_dqb.dqb_btime =
+ min(dquot->dq_dqb.dqb_btime, oldbtime); /* the earlier deadline wins */
+ else
+ dquot->dq_dqb.dqb_btime = oldbtime;
+ }
+ } else {
+ dquot->dq_dqb.dqb_btime = 0; /* not over the soft limit: no grace time running */
+ clear_bit(DQ_BLKS_B, &dquot->dq_flags);
+ }
+ /* Set properly inode grace time... */
+ if (dquot->dq_dqb.dqb_isoftlimit &&
+ dquot->dq_dqb.dqb_curinodes > dquot->dq_dqb.dqb_isoftlimit) {
+ if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags) &&
+ olditime > 0) {
+ if (dquot->dq_dqb.dqb_itime > 0)
+ dquot->dq_dqb.dqb_itime =
+ min(dquot->dq_dqb.dqb_itime, olditime); /* the earlier deadline wins */
+ else
+ dquot->dq_dqb.dqb_itime = olditime;
+ }
+ } else {
+ dquot->dq_dqb.dqb_itime = 0; /* not over the soft limit: no grace time running */
+ clear_bit(DQ_INODES_B, &dquot->dq_flags);
+ }
+ /* All information is properly updated, clear the flags */
+ __clear_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
+ __clear_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
+ __clear_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
+ __clear_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
+ __clear_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
+ __clear_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
+ OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace; /* new baseline for the next sync */
+ OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes;
+ spin_unlock(&dq_data_lock);
+ err = ocfs2_qinfo_lock(info, freeing); /* exclusive only when freeing */
+ if (err < 0) {
+ mlog(ML_ERROR, "Failed to lock quota info, loosing quota write"
+ " (type=%d, id=%u)\n", dquot->dq_type,
+ (unsigned)dquot->dq_id);
+ goto out;
+ }
+ if (freeing)
+ OCFS2_DQUOT(dquot)->dq_use_count--; /* this node drops its reference */
+ err = qtree_write_dquot(&info->dqi_gi, dquot);
+ if (err < 0)
+ goto out_qlock;
+ if (freeing && !OCFS2_DQUOT(dquot)->dq_use_count) { /* no node references the entry any more */
+ err = qtree_release_dquot(&info->dqi_gi, dquot);
+ if (info_dirty(sb_dqinfo(sb, type))) {
+ err2 = __ocfs2_global_write_info(sb, type);
+ if (!err) /* keep the first error */
+ err = err2;
+ }
+ }
+out_qlock:
+ ocfs2_qinfo_unlock(info, freeing);
+out:
+ if (err < 0)
+ mlog_errno(err);
+ return err;
+}
+
+/*
+ * Functions for periodic syncing of dquots with global file
+ */
+/*
+ * Callback for dquot_scan_active(): sync one dquot of quota type @type with
+ * the global quota file. Dquots of another type are skipped. The work is
+ * done under the exclusive global quota file lock, inside a transaction of
+ * OCFS2_QSYNC_CREDITS credits. A failure of the global sync is only logged;
+ * the returned status is that of the final dquot_commit(), or of the
+ * locking / transaction start if those fail.
+ */
+static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type)
+{
+	struct super_block *sb = dquot->dq_sb;
+	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	handle_t *trans;
+	int ret = 0;
+
+	mlog_entry("id=%u qtype=%u type=%lu device=%s\n", dquot->dq_id,
+		   dquot->dq_type, type, sb->s_id);
+	if (type != dquot->dq_type)
+		goto done;
+
+	ret = ocfs2_lock_global_qf(oinfo, 1);
+	if (ret < 0)
+		goto done;
+
+	trans = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		mlog_errno(ret);
+		goto unlock_qf;
+	}
+
+	mutex_lock(&sb_dqopt(sb)->dqio_mutex);
+	ret = ocfs2_sync_dquot(dquot);
+	mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
+	if (ret < 0)
+		mlog_errno(ret);
+
+	/* The local structure has to be written as well... */
+	dquot_mark_dquot_dirty(dquot);
+	ret = dquot_commit(dquot);
+	if (ret < 0)
+		mlog_errno(ret);
+
+	ocfs2_commit_trans(osb, trans);
+unlock_qf:
+	ocfs2_unlock_global_qf(oinfo, 1);
+done:
+	mlog_exit(ret);
+	return ret;
+}
+
+/*
+ * Delayed-work handler: run ocfs2_sync_dquot_helper() over the active
+ * dquots for this info structure's quota type, then queue the work again
+ * after dqi_syncjiff jiffies.
+ */
+static void qsync_work_fn(struct work_struct *work)
+{
+	struct delayed_work *dwork = container_of(work, struct delayed_work,
+						  work);
+	struct ocfs2_mem_dqinfo *qinfo = container_of(dwork,
+						      struct ocfs2_mem_dqinfo,
+						      dqi_sync_work);
+
+	dquot_scan_active(qinfo->dqi_gqinode->i_sb, ocfs2_sync_dquot_helper,
+			  qinfo->dqi_type);
+	queue_delayed_work(ocfs2_quota_wq, dwork, qinfo->dqi_syncjiff);
+}
+
+/*
+ * Wrappers for generic quota functions
+ */
+
+/*
+ * Commit @dquot with dquot_commit() inside its own transaction of
+ * OCFS2_QWRITE_CREDITS credits. Returns the status of dquot_commit(), or
+ * the error from starting the transaction.
+ */
+static int ocfs2_write_dquot(struct dquot *dquot)
+{
+	struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
+	handle_t *trans;
+	int ret = 0;
+
+	mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
+
+	trans = ocfs2_start_trans(osb, OCFS2_QWRITE_CREDITS);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		mlog_errno(ret);
+	} else {
+		ret = dquot_commit(dquot);
+		ocfs2_commit_trans(osb, trans);
+	}
+
+	mlog_exit(ret);
+	return ret;
+}
+
+/*
+ * Journal credits needed to release a dquot of quota type @type, or 0 when
+ * the filesystem does not have the matching quota feature.
+ */
+int ocfs2_calc_qdel_credits(struct super_block *sb, int type)
+{
+	int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+				    OCFS2_FEATURE_RO_COMPAT_GRPQUOTA };
+	struct ocfs2_mem_dqinfo *oinfo;
+
+	if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type]))
+		return 0;
+
+	oinfo = sb_dqinfo(sb, type)->dqi_priv;
+
+	/*
+	 * What gets modified: the tree, the leaf block, the global info,
+	 * the local chunk header, and both the global and the local inode.
+	 */
+	return oinfo->dqi_gi.dqi_qtree_depth + 2 + 1 +
+	       2 * OCFS2_INODE_UPDATE_CREDITS;
+}
+
+/*
+ * Release a dquot.  Takes the global quota file lock exclusively, starts a
+ * transaction sized by ocfs2_calc_qdel_credits() and calls dquot_release()
+ * inside it.  Returns the first error hit while locking or starting the
+ * transaction, otherwise the dquot_release() result.
+ */
+static int ocfs2_release_dquot(struct dquot *dquot)
+{
+ handle_t *handle;
+ struct ocfs2_mem_dqinfo *oinfo =
+ sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+ struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
+ int status = 0;
+
+ mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
+
+ status = ocfs2_lock_global_qf(oinfo, 1);
+ if (status < 0)
+ goto out;
+ handle = ocfs2_start_trans(osb,
+ ocfs2_calc_qdel_credits(dquot->dq_sb, dquot->dq_type));
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto out_ilock;
+ }
+ status = dquot_release(dquot);
+ /* Commit even if dquot_release() failed; its status is what we return */
+ ocfs2_commit_trans(osb, handle);
+out_ilock:
+ ocfs2_unlock_global_qf(oinfo, 1);
+out:
+ mlog_exit(status);
+ return status;
+}
+
+/*
+ * Journal credits needed to instantiate a dquot of the given type.
+ * Returns 0 when the matching quota feature bit is not set on the
+ * filesystem.
+ */
+int ocfs2_calc_qinit_credits(struct super_block *sb, int type)
+{
+ int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+ OCFS2_FEATURE_RO_COMPAT_GRPQUOTA };
+ struct ocfs2_mem_dqinfo *qinfo;
+ struct ocfs2_dinode *local_di, *global_di;
+ int local_extend, global_extend;
+
+ if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type]))
+ return 0;
+
+ qinfo = sb_dqinfo(sb, type)->dqi_priv;
+ local_di = (struct ocfs2_dinode *)qinfo->dqi_lqi_bh->b_data;
+ global_di = (struct ocfs2_dinode *)qinfo->dqi_gqi_bh->b_data;
+
+ /* Both the local and the global file may have to be extended */
+ local_extend = ocfs2_calc_extend_credits(sb, &local_di->id2.i_list, 0);
+ global_extend = ocfs2_calc_extend_credits(sb, &global_di->id2.i_list,
+ 0);
+
+ /* In local file we can modify info, chunk header block and dquot
+ * block. In global file we can modify info, tree and leaf block */
+ return local_extend + global_extend +
+ 3 + qinfo->dqi_gi.dqi_qtree_depth + 2;
+}
+
+/*
+ * Acquire a dquot.  Takes the global quota file lock exclusively, starts a
+ * transaction sized by ocfs2_calc_qinit_credits() and calls
+ * dquot_acquire() inside it.  Returns the first error hit while locking or
+ * starting the transaction, otherwise the dquot_acquire() result.
+ */
+static int ocfs2_acquire_dquot(struct dquot *dquot)
+{
+ handle_t *handle;
+ struct ocfs2_mem_dqinfo *oinfo =
+ sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+ struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb);
+ int status = 0;
+
+ mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type);
+ /* We need an exclusive lock, because we're going to update use count
+ * and instantiate possibly new dquot structure */
+ status = ocfs2_lock_global_qf(oinfo, 1);
+ if (status < 0)
+ goto out;
+ handle = ocfs2_start_trans(osb,
+ ocfs2_calc_qinit_credits(dquot->dq_sb, dquot->dq_type));
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto out_ilock;
+ }
+ status = dquot_acquire(dquot);
+ /* Commit even if dquot_acquire() failed; its status is what we return */
+ ocfs2_commit_trans(osb, handle);
+out_ilock:
+ ocfs2_unlock_global_qf(oinfo, 1);
+out:
+ mlog_exit(status);
+ return status;
+}
+
+/*
+ * Mark a dquot dirty and write it out.  When none of the DQ_LASTSET bits
+ * collected in 'mask' is set in dq_flags, the dquot is only written through
+ * ocfs2_write_dquot().  Otherwise it is synced with ocfs2_sync_dquot() and
+ * then committed locally, both under the exclusive global quota file lock
+ * and inside one OCFS2_QSYNC_CREDITS transaction.
+ */
+static int ocfs2_mark_dquot_dirty(struct dquot *dquot)
+{
+ unsigned long mask = (1 << (DQ_LASTSET_B + QIF_ILIMITS_B)) |
+ (1 << (DQ_LASTSET_B + QIF_BLIMITS_B)) |
+ (1 << (DQ_LASTSET_B + QIF_INODES_B)) |
+ (1 << (DQ_LASTSET_B + QIF_SPACE_B)) |
+ (1 << (DQ_LASTSET_B + QIF_BTIME_B)) |
+ (1 << (DQ_LASTSET_B + QIF_ITIME_B));
+ int sync = 0;
+ int status;
+ struct super_block *sb = dquot->dq_sb;
+ int type = dquot->dq_type;
+ struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+ handle_t *handle;
+ struct ocfs2_super *osb = OCFS2_SB(sb);
+
+ mlog_entry("id=%u, type=%d", dquot->dq_id, type);
+ dquot_mark_dquot_dirty(dquot);
+
+ /* In case user set some limits, sync dquot immediately to global
+ * quota file so that information propagates quicker */
+ spin_lock(&dq_data_lock);
+ if (dquot->dq_flags & mask)
+ sync = 1;
+ spin_unlock(&dq_data_lock);
+ if (!sync) {
+ /* Nothing was explicitly set: a plain local write is enough */
+ status = ocfs2_write_dquot(dquot);
+ goto out;
+ }
+ status = ocfs2_lock_global_qf(oinfo, 1);
+ if (status < 0)
+ goto out;
+ handle = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto out_ilock;
+ }
+ status = ocfs2_sync_dquot(dquot);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_trans;
+ }
+ /* Now write updated local dquot structure */
+ status = dquot_commit(dquot);
+out_trans:
+ ocfs2_commit_trans(osb, handle);
+out_ilock:
+ ocfs2_unlock_global_qf(oinfo, 1);
+out:
+ mlog_exit(status);
+ return status;
+}
+
+/*
+ * Write quota info of the given type: takes the global quota file lock
+ * exclusively and calls dquot_commit_info() inside an
+ * OCFS2_QINFO_WRITE_CREDITS transaction.
+ * This should happen only after set_dqinfo().
+ */
+static int ocfs2_write_info(struct super_block *sb, int type)
+{
+ handle_t *handle;
+ int status = 0;
+ struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+
+ mlog_entry_void();
+
+ status = ocfs2_lock_global_qf(oinfo, 1);
+ if (status < 0)
+ goto out;
+ handle = ocfs2_start_trans(OCFS2_SB(sb), OCFS2_QINFO_WRITE_CREDITS);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto out_ilock;
+ }
+ status = dquot_commit_info(sb, type);
+ /* Commit even on failure; the dquot_commit_info() status is returned */
+ ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out_ilock:
+ ocfs2_unlock_global_qf(oinfo, 1);
+out:
+ mlog_exit(status);
+ return status;
+}
+
+/* This is difficult. We have to lock quota inode and start transaction
+ * in this function but we don't want to take the penalty of exclusive
+ * quota file lock when we are just going to use cached structures. So
+ * we just take read lock check whether we have dquot cached and if so,
+ * we don't have to take the write lock...
+ *
+ * 'type' selects a single quota type, or -1 for all types. Types without
+ * active quota are skipped. Returns 0 on success or the error from
+ * locking / starting the transaction. */
+static int ocfs2_dquot_initialize(struct inode *inode, int type)
+{
+ handle_t *handle = NULL;
+ int status = 0;
+ struct super_block *sb = inode->i_sb;
+ struct ocfs2_mem_dqinfo *oinfo;
+ int exclusive = 0;
+ int cnt;
+ qid_t id;
+
+ mlog_entry_void();
+
+ for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+ if (type != -1 && cnt != type)
+ continue;
+ if (!sb_has_quota_active(sb, cnt))
+ continue;
+ oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
+ status = ocfs2_lock_global_qf(oinfo, 0);
+ if (status < 0)
+ goto out;
+ /* This is just a performance optimization not a reliable test.
+ * Since we hold an inode lock, no one can actually release
+ * the structure until we are finished with initialization. */
+ if (inode->i_dquot[cnt] != NODQUOT) {
+ ocfs2_unlock_global_qf(oinfo, 0);
+ continue;
+ }
+ /* When we have inode lock, we know that no dquot_release() can
+ * run and thus we can safely check whether we need to
+ * read+modify global file to get quota information or whether
+ * our node already has it. */
+ if (cnt == USRQUOTA)
+ id = inode->i_uid;
+ else if (cnt == GRPQUOTA)
+ id = inode->i_gid;
+ else
+ BUG();
+ /* Obtain exclusion from quota off... */
+ down_write(&sb_dqopt(sb)->dqptr_sem);
+ exclusive = !dquot_is_cached(sb, id, cnt);
+ up_write(&sb_dqopt(sb)->dqptr_sem);
+ if (exclusive) {
+ /* Not cached: take the exclusive lock on top of the
+ * shared one and open a transaction for the init */
+ status = ocfs2_lock_global_qf(oinfo, 1);
+ if (status < 0) {
+ /* Only the shared lock is held on this path */
+ exclusive = 0;
+ mlog_errno(status);
+ goto out_ilock;
+ }
+ handle = ocfs2_start_trans(OCFS2_SB(sb),
+ ocfs2_calc_qinit_credits(sb, cnt));
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto out_ilock;
+ }
+ }
+ dquot_initialize(inode, cnt);
+ if (exclusive) {
+ ocfs2_commit_trans(OCFS2_SB(sb), handle);
+ ocfs2_unlock_global_qf(oinfo, 1);
+ }
+ ocfs2_unlock_global_qf(oinfo, 0);
+ }
+ mlog_exit(0);
+ return 0;
+out_ilock:
+ /* Drop the exclusive lock (if taken) before the shared one */
+ if (exclusive)
+ ocfs2_unlock_global_qf(oinfo, 1);
+ ocfs2_unlock_global_qf(oinfo, 0);
+out:
+ mlog_exit(status);
+ return status;
+}
+
+/*
+ * Slow path of ocfs2_dquot_drop(): take the global quota file lock
+ * exclusively for every active quota type, then call dquot_drop() inside a
+ * single transaction sized for both quota types. Locks taken so far are
+ * released on every exit path.
+ */
+static int ocfs2_dquot_drop_slow(struct inode *inode)
+{
+ int status = 0;
+ int cnt;
+ int got_lock[MAXQUOTAS] = {0, 0};
+ handle_t *handle;
+ struct super_block *sb = inode->i_sb;
+ struct ocfs2_mem_dqinfo *oinfo;
+
+ for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+ if (!sb_has_quota_active(sb, cnt))
+ continue;
+ oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
+ status = ocfs2_lock_global_qf(oinfo, 1);
+ if (status < 0)
+ goto out;
+ /* Remember which locks must be dropped at 'out' */
+ got_lock[cnt] = 1;
+ }
+ handle = ocfs2_start_trans(OCFS2_SB(sb),
+ ocfs2_calc_qinit_credits(sb, USRQUOTA) +
+ ocfs2_calc_qinit_credits(sb, GRPQUOTA));
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto out;
+ }
+ dquot_drop(inode);
+ ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out:
+ for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+ if (got_lock[cnt]) {
+ oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
+ ocfs2_unlock_global_qf(oinfo, 1);
+ }
+ return status;
+}
+
+/* See the comment before ocfs2_dquot_initialize.
+ *
+ * Takes the global quota file lock in shared mode for every active quota
+ * type and, under dqptr_sem, either drops the inode's dquots directly with
+ * dquot_drop_locked() or defers to ocfs2_dquot_drop_slow() once the shared
+ * locks have been released. */
+static int ocfs2_dquot_drop(struct inode *inode)
+{
+ int status = 0;
+ struct super_block *sb = inode->i_sb;
+ struct ocfs2_mem_dqinfo *oinfo;
+ int exclusive = 0;
+ int cnt;
+ int got_lock[MAXQUOTAS] = {0, 0};
+
+ mlog_entry_void();
+ for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+ if (!sb_has_quota_active(sb, cnt))
+ continue;
+ oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
+ status = ocfs2_lock_global_qf(oinfo, 0);
+ if (status < 0)
+ goto out;
+ got_lock[cnt] = 1;
+ }
+ /* Lock against anyone releasing references so that when we check
+ * we know we are not going to be last ones to release dquot */
+ down_write(&sb_dqopt(sb)->dqptr_sem);
+ /* Urgh, this is a terrible hack :( */
+ /* NOTE(review): the comment above talks about NOT being the last
+ * holder, yet the slow (exclusive) path is chosen when
+ * dq_count > 1 - confirm the intended polarity of this test. */
+ for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+ if (inode->i_dquot[cnt] != NODQUOT &&
+ atomic_read(&inode->i_dquot[cnt]->dq_count) > 1) {
+ exclusive = 1;
+ break;
+ }
+ }
+ if (!exclusive)
+ dquot_drop_locked(inode);
+ up_write(&sb_dqopt(sb)->dqptr_sem);
+out:
+ for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+ if (got_lock[cnt]) {
+ oinfo = sb_dqinfo(sb, cnt)->dqi_priv;
+ ocfs2_unlock_global_qf(oinfo, 0);
+ }
+ /* In case we bailed out because we had to do expensive locking
+ * do it now... */
+ if (exclusive)
+ status = ocfs2_dquot_drop_slow(inode);
+ mlog_exit(status);
+ return status;
+}
+
+/*
+ * Allocate a zeroed struct ocfs2_dquot from ocfs2_dquot_cachep and hand
+ * back its embedded generic dquot, or NULL when the allocation fails.
+ */
+static struct dquot *ocfs2_alloc_dquot(struct super_block *sb, int type)
+{
+ struct ocfs2_dquot *od;
+
+ od = kmem_cache_zalloc(ocfs2_dquot_cachep, GFP_NOFS);
+ return od ? &od->dq_dquot : NULL;
+}
+
+/*
+ * Give a dquot back to ocfs2_dquot_cachep.
+ * NOTE(review): the struct dquot pointer is passed to kmem_cache_free()
+ * as is, while objects in this cache are struct ocfs2_dquot; presumably
+ * dq_dquot is the first member of that structure - confirm.
+ */
+static void ocfs2_destroy_dquot(struct dquot *dquot)
+{
+ kmem_cache_free(ocfs2_dquot_cachep, dquot);
+}
+
+/*
+ * Quota operations table for ocfs2. Space/inode accounting and transfer
+ * use the generic dquot_* helpers directly; everything else goes through
+ * the ocfs2 wrappers defined in this file.
+ */
+struct dquot_operations ocfs2_quota_operations = {
+ .initialize = ocfs2_dquot_initialize,
+ .drop = ocfs2_dquot_drop,
+ .alloc_space = dquot_alloc_space,
+ .alloc_inode = dquot_alloc_inode,
+ .free_space = dquot_free_space,
+ .free_inode = dquot_free_inode,
+ .transfer = dquot_transfer,
+ .write_dquot = ocfs2_write_dquot,
+ .acquire_dquot = ocfs2_acquire_dquot,
+ .release_dquot = ocfs2_release_dquot,
+ .mark_dirty = ocfs2_mark_dquot_dirty,
+ .write_info = ocfs2_write_info,
+ .alloc_dquot = ocfs2_alloc_dquot,
+ .destroy_dquot = ocfs2_destroy_dquot,
+};
+
+/*
+ * Create the "o2quot" workqueue. Returns 0 on success and -ENOMEM when
+ * the workqueue cannot be created.
+ */
+int ocfs2_quota_setup(void)
+{
+ ocfs2_quota_wq = create_workqueue("o2quot");
+
+ return ocfs2_quota_wq ? 0 : -ENOMEM;
+}
+
+/*
+ * Flush and destroy the quota workqueue, if one was created.
+ */
+void ocfs2_quota_shutdown(void)
+{
+ if (!ocfs2_quota_wq)
+ return;
+
+ flush_workqueue(ocfs2_quota_wq);
+ destroy_workqueue(ocfs2_quota_wq);
+ ocfs2_quota_wq = NULL;
+}
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
new file mode 100644
index 000000000000..07deec5e9721
--- /dev/null
+++ b/fs/ocfs2/quota_local.c
@@ -0,0 +1,1253 @@
+/*
+ * Implementation of operations over local quota file
+ */
+
+#include <linux/fs.h>
+#include <linux/quota.h>
+#include <linux/quotaops.h>
+#include <linux/module.h>
+
+#define MLOG_MASK_PREFIX ML_QUOTA
+#include <cluster/masklog.h>
+
+#include "ocfs2_fs.h"
+#include "ocfs2.h"
+#include "inode.h"
+#include "alloc.h"
+#include "file.h"
+#include "buffer_head_io.h"
+#include "journal.h"
+#include "sysfile.h"
+#include "dlmglue.h"
+#include "quota.h"
+
+/* How many local quota structures fit into one block once the reserved
+ * space is taken away */
+static inline unsigned int ol_quota_entries_per_block(struct super_block *sb)
+{
+ const size_t entry_size = sizeof(struct ocfs2_local_disk_dqblk);
+
+ return (sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) / entry_size;
+}
+
+/* How many entry blocks one chunk covers: the bits left in the chunk
+ * header block divided by the entries per block */
+static inline unsigned int ol_chunk_blocks(struct super_block *sb)
+{
+ const unsigned int per_block = ol_quota_entries_per_block(sb);
+
+ /* << 3 turns the remaining header bytes into bitmap bits */
+ return ((sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
+ OCFS2_QBLK_RESERVED_SPACE) << 3) / per_block;
+}
+
+/* How many entries the bitmap of one chunk describes */
+static unsigned int ol_chunk_entries(struct super_block *sb)
+{
+ unsigned int blocks = ol_chunk_blocks(sb);
+ unsigned int per_block = ol_quota_entries_per_block(sb);
+
+ return blocks * per_block;
+}
+
+/* Block number of the header of chunk c in the quota file */
+static unsigned int ol_quota_chunk_block(struct super_block *sb, int c)
+{
+ /* Every chunk takes its entry blocks plus 1 block of chunk info */
+ unsigned int stride = ol_chunk_blocks(sb) + 1;
+
+ /* The leading 1 is the block with local quota file info */
+ return 1 + stride * c;
+}
+
+/* Block number holding entry 'off' of chunk c */
+static unsigned int ol_dqblk_block(struct super_block *sb, int c, int off)
+{
+ int per_block = ol_quota_entries_per_block(sb);
+ int blk_in_chunk = off / per_block;
+
+ /* + 1 skips the chunk header block */
+ return ol_quota_chunk_block(sb, c) + 1 + blk_in_chunk;
+}
+
+/* Byte offset of entry 'off' inside the block that holds it */
+static unsigned int ol_dqblk_block_off(struct super_block *sb, int c, int off)
+{
+ int per_block = ol_quota_entries_per_block(sb);
+ int slot = off % per_block;
+
+ return slot * sizeof(struct ocfs2_local_disk_dqblk);
+}
+
+/* Offset of the dquot structure in the quota file: byte position of the
+ * block holding entry 'off' of chunk c plus the offset inside that block */
+static loff_t ol_dqblk_off(struct super_block *sb, int c, int off)
+{
+ /* Widen before shifting so that the byte offset is computed in
+ * loff_t and does not wrap at the width of unsigned int */
+ return ((loff_t)ol_dqblk_block(sb, c, off) << sb->s_blocksize_bits) +
+ ol_dqblk_block_off(sb, c, off);
+}
+
+/* Block number that contains byte offset 'off' of the quota file */
+static inline unsigned int ol_dqblk_file_block(struct super_block *sb, loff_t off)
+{
+ const unsigned int bits = sb->s_blocksize_bits;
+
+ return off >> bits;
+}
+
+/* Position of byte offset 'off' inside its block */
+static inline unsigned int ol_dqblk_block_offset(struct super_block *sb, loff_t off)
+{
+ const int in_block_mask = (1 << sb->s_blocksize_bits) - 1;
+
+ return off & in_block_mask;
+}
+
+/* Index within chunk c of the structure stored at file offset 'off' */
+static int ol_dqblk_chunk_off(struct super_block *sb, int c, loff_t off)
+{
+ int per_block = ol_quota_entries_per_block(sb);
+ loff_t blk = off >> sb->s_blocksize_bits;
+ unsigned int in_blk =
+ (unsigned int)(off & ((1 << sb->s_blocksize_bits) - 1));
+
+ /* Entries in the preceding blocks of the chunk (- 1 skips the chunk
+ * header) plus the slot inside this block */
+ return (blk - ol_quota_chunk_block(sb, c) - 1) * per_block
+ + in_blk / sizeof(struct ocfs2_local_disk_dqblk);
+}
+
+/* Write bufferhead into the fs.
+ *
+ * Runs modify(bh, private) on the locked buffer inside a one-credit
+ * transaction: journal write access is requested first, the buffer is
+ * marked journal-dirty afterwards. The transaction is committed on every
+ * path once it has been started. Returns 0 or the first error hit. */
+static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh,
+ void (*modify)(struct buffer_head *, void *), void *private)
+{
+ struct super_block *sb = inode->i_sb;
+ handle_t *handle;
+ int status;
+
+ handle = ocfs2_start_trans(OCFS2_SB(sb), 1);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ return status;
+ }
+ status = ocfs2_journal_access_dq(handle, inode, bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ ocfs2_commit_trans(OCFS2_SB(sb), handle);
+ return status;
+ }
+ /* The callback edits bh->b_data while the buffer is locked */
+ lock_buffer(bh);
+ modify(bh, private);
+ unlock_buffer(bh);
+ status = ocfs2_journal_dirty(handle, bh);
+ if (status < 0) {
+ mlog_errno(status);
+ ocfs2_commit_trans(OCFS2_SB(sb), handle);
+ return status;
+ }
+ status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
+ if (status < 0) {
+ mlog_errno(status);
+ return status;
+ }
+ return 0;
+}
+
+/* Check whether we understand format of quota files.
+ *
+ * Reads block 0 of the local quota file and of the global quota file of
+ * the given type and compares magic and version in both headers with the
+ * expected values. Returns 1 when everything matches and 0 otherwise
+ * (including read failures and a missing global inode). */
+static int ocfs2_local_check_quota_file(struct super_block *sb, int type)
+{
+ unsigned int lmagics[MAXQUOTAS] = OCFS2_LOCAL_QMAGICS;
+ unsigned int lversions[MAXQUOTAS] = OCFS2_LOCAL_QVERSIONS;
+ unsigned int gmagics[MAXQUOTAS] = OCFS2_GLOBAL_QMAGICS;
+ unsigned int gversions[MAXQUOTAS] = OCFS2_GLOBAL_QVERSIONS;
+ unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE,
+ GROUP_QUOTA_SYSTEM_INODE };
+ struct buffer_head *bh = NULL;
+ struct inode *linode = sb_dqopt(sb)->files[type];
+ struct inode *ginode = NULL;
+ struct ocfs2_disk_dqheader *dqhead;
+ int status, ret = 0;
+
+ /* First check whether we understand local quota file */
+ status = ocfs2_read_quota_block(linode, 0, &bh);
+ if (status) {
+ mlog_errno(status);
+ mlog(ML_ERROR, "failed to read quota file header (type=%d)\n",
+ type);
+ goto out_err;
+ }
+ dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data);
+ if (le32_to_cpu(dqhead->dqh_magic) != lmagics[type]) {
+ mlog(ML_ERROR, "quota file magic does not match (%u != %u),"
+ " type=%d\n", le32_to_cpu(dqhead->dqh_magic),
+ lmagics[type], type);
+ goto out_err;
+ }
+ if (le32_to_cpu(dqhead->dqh_version) != lversions[type]) {
+ mlog(ML_ERROR, "quota file version does not match (%u != %u),"
+ " type=%d\n", le32_to_cpu(dqhead->dqh_version),
+ lversions[type], type);
+ goto out_err;
+ }
+ /* Done with the local header; bh is reused for the global one */
+ brelse(bh);
+ bh = NULL;
+
+ /* Next check whether we understand global quota file */
+ ginode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type],
+ OCFS2_INVALID_SLOT);
+ if (!ginode) {
+ mlog(ML_ERROR, "cannot get global quota file inode "
+ "(type=%d)\n", type);
+ goto out_err;
+ }
+ /* Since the header is read only, we don't care about locking */
+ status = ocfs2_read_quota_block(ginode, 0, &bh);
+ if (status) {
+ mlog_errno(status);
+ mlog(ML_ERROR, "failed to read global quota file header "
+ "(type=%d)\n", type);
+ goto out_err;
+ }
+ dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data);
+ if (le32_to_cpu(dqhead->dqh_magic) != gmagics[type]) {
+ mlog(ML_ERROR, "global quota file magic does not match "
+ "(%u != %u), type=%d\n",
+ le32_to_cpu(dqhead->dqh_magic), gmagics[type], type);
+ goto out_err;
+ }
+ if (le32_to_cpu(dqhead->dqh_version) != gversions[type]) {
+ mlog(ML_ERROR, "global quota file version does not match "
+ "(%u != %u), type=%d\n",
+ le32_to_cpu(dqhead->dqh_version), gversions[type],
+ type);
+ goto out_err;
+ }
+
+ ret = 1;
+out_err:
+ /* Shared exit for success and failure; bh and ginode may be NULL */
+ brelse(bh);
+ iput(ginode);
+ return ret;
+}
+
+/* Tear down a list of quota file chunks: unlink every chunk, drop the
+ * reference to its header buffer and free the chunk itself */
+static void ocfs2_release_local_quota_bitmaps(struct list_head *head)
+{
+ struct ocfs2_quota_chunk *chunk;
+ struct ocfs2_quota_chunk *tmp;
+
+ list_for_each_entry_safe(chunk, tmp, head, qc_chunk) {
+ list_del(&chunk->qc_chunk);
+ brelse(chunk->qc_headerbh);
+ kmem_cache_free(ocfs2_qf_chunk_cachep, chunk);
+ }
+}
+
+/* Read the header block of every chunk listed in ldinfo and queue an
+ * in-memory chunk for it on 'head'. On failure everything queued so far
+ * is released and the error is returned; 0 on success. */
+static int ocfs2_load_local_quota_bitmaps(struct inode *inode,
+ struct ocfs2_local_disk_dqinfo *ldinfo,
+ struct list_head *head)
+{
+ struct ocfs2_quota_chunk *chunk;
+ int idx, err;
+
+ INIT_LIST_HEAD(head);
+ for (idx = 0; idx < le32_to_cpu(ldinfo->dqi_chunks); idx++) {
+ chunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS);
+ if (!chunk) {
+ err = -ENOMEM;
+ goto fail;
+ }
+ chunk->qc_num = idx;
+ chunk->qc_headerbh = NULL;
+ err = ocfs2_read_quota_block(inode,
+ ol_quota_chunk_block(inode->i_sb, idx),
+ &chunk->qc_headerbh);
+ if (err) {
+ mlog_errno(err);
+ /* Not on the list yet, so free it by hand */
+ kmem_cache_free(ocfs2_qf_chunk_cachep, chunk);
+ goto fail;
+ }
+ list_add_tail(&chunk->qc_chunk, head);
+ }
+ return 0;
+fail:
+ ocfs2_release_local_quota_bitmaps(head);
+ return err;
+}
+
+/* Buffer-modify callback: copy flags (masked with DQF_MASK), chunk count
+ * and block count from the in-memory info passed in 'private' into the
+ * on-disk local info at OCFS2_LOCAL_INFO_OFF, under dq_data_lock */
+static void olq_update_info(struct buffer_head *bh, void *private)
+{
+ struct mem_dqinfo *minfo = private;
+ struct ocfs2_mem_dqinfo *qinfo = minfo->dqi_priv;
+ struct ocfs2_local_disk_dqinfo *disk;
+
+ disk = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
+ OCFS2_LOCAL_INFO_OFF);
+
+ spin_lock(&dq_data_lock);
+ disk->dqi_flags = cpu_to_le32(minfo->dqi_flags & DQF_MASK);
+ disk->dqi_chunks = cpu_to_le32(qinfo->dqi_chunks);
+ disk->dqi_blocks = cpu_to_le32(qinfo->dqi_blocks);
+ spin_unlock(&dq_data_lock);
+}
+
+/* Remember chunk number 'chunk' together with a private copy of its
+ * on-disk bitmap and append the record to 'head'. Returns 0 or -ENOMEM. */
+static int ocfs2_add_recovery_chunk(struct super_block *sb,
+ struct ocfs2_local_disk_chunk *dchunk,
+ int chunk,
+ struct list_head *head)
+{
+ struct ocfs2_recovery_chunk *entry;
+
+ entry = kmalloc(sizeof(*entry), GFP_NOFS);
+ if (!entry)
+ return -ENOMEM;
+
+ entry->rc_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS);
+ if (!entry->rc_bitmap) {
+ kfree(entry);
+ return -ENOMEM;
+ }
+
+ entry->rc_chunk = chunk;
+ /* Copy as many bytes as are needed to hold one bit per entry */
+ memcpy(entry->rc_bitmap, dchunk->dqc_bitmap,
+ (ol_chunk_entries(sb) + 7) >> 3);
+ list_add_tail(&entry->rc_list, head);
+ return 0;
+}
+
+/* Unlink and free every recovery chunk (and its bitmap) on the list */
+static void free_recovery_list(struct list_head *head)
+{
+ struct ocfs2_recovery_chunk *cur, *tmp;
+
+ list_for_each_entry_safe(cur, tmp, head, rc_list) {
+ list_del(&cur->rc_list);
+ kfree(cur->rc_bitmap);
+ kfree(cur);
+ }
+}
+
+/* Free a quota recovery structure including the chunk lists of all
+ * quota types */
+void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec)
+{
+ int qtype = 0;
+
+ while (qtype < MAXQUOTAS) {
+ free_recovery_list(&rec->r_list[qtype]);
+ qtype++;
+ }
+ kfree(rec);
+}
+
+/* Load entries in our quota file we have to recover: walk all chunk
+ * headers and queue on 'head' every chunk that has fewer free entries
+ * than a chunk can hold. A negative status empties the list again. */
+static int ocfs2_recovery_load_quota(struct inode *lqinode,
+ struct ocfs2_local_disk_dqinfo *ldinfo,
+ int type,
+ struct list_head *head)
+{
+ struct super_block *sb = lqinode->i_sb;
+ int nr_chunks = le32_to_cpu(ldinfo->dqi_chunks);
+ int ret = 0;
+ int idx;
+
+ for (idx = 0; idx < nr_chunks; idx++) {
+ struct buffer_head *header_bh = NULL;
+ struct ocfs2_local_disk_chunk *disk_chunk;
+
+ ret = ocfs2_read_quota_block(lqinode,
+ ol_quota_chunk_block(sb, idx),
+ &header_bh);
+ if (ret) {
+ mlog_errno(ret);
+ break;
+ }
+ disk_chunk = (struct ocfs2_local_disk_chunk *)header_bh->b_data;
+ /* Chunk with some entry in use needs recovery */
+ if (le32_to_cpu(disk_chunk->dqc_free) < ol_chunk_entries(sb))
+ ret = ocfs2_add_recovery_chunk(sb, disk_chunk, idx,
+ head);
+ brelse(header_bh);
+ if (ret < 0)
+ break;
+ }
+ if (ret < 0)
+ free_recovery_list(head);
+ return ret;
+}
+
+/* Allocate a quota recovery structure with an empty chunk list for every
+ * quota type. Returns NULL when the allocation fails. */
+static struct ocfs2_quota_recovery *ocfs2_alloc_quota_recovery(void)
+{
+ struct ocfs2_quota_recovery *rec;
+ int qtype;
+
+ rec = kmalloc(sizeof(*rec), GFP_NOFS);
+ if (rec) {
+ for (qtype = 0; qtype < MAXQUOTAS; qtype++)
+ INIT_LIST_HEAD(&rec->r_list[qtype]);
+ }
+ return rec;
+}
+
+/* Load information we need for quota recovery into memory.
+ *
+ * For every quota type whose feature bit is set, locks the local quota
+ * file of slot 'slot_num', reads its info header and collects the chunks
+ * that need recovery. Returns the filled recovery structure, or an
+ * ERR_PTR() when allocation or any step fails (nothing is left allocated
+ * in that case). */
+struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
+ struct ocfs2_super *osb,
+ int slot_num)
+{
+ unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+ OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+ unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
+ LOCAL_GROUP_QUOTA_SYSTEM_INODE };
+ struct super_block *sb = osb->sb;
+ struct ocfs2_local_disk_dqinfo *ldinfo;
+ struct inode *lqinode;
+ struct buffer_head *bh;
+ int type;
+ int status = 0;
+ struct ocfs2_quota_recovery *rec;
+
+ mlog(ML_NOTICE, "Beginning quota recovery in slot %u\n", slot_num);
+ rec = ocfs2_alloc_quota_recovery();
+ if (!rec)
+ return ERR_PTR(-ENOMEM);
+ /* First init... */
+
+ for (type = 0; type < MAXQUOTAS; type++) {
+ if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
+ continue;
+ /* At this point, journal of the slot is already replayed so
+ * we can trust metadata and data of the quota file */
+ lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num);
+ if (!lqinode) {
+ status = -ENOENT;
+ goto out;
+ }
+ status = ocfs2_inode_lock_full(lqinode, NULL, 1,
+ OCFS2_META_LOCK_RECOVERY);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_put;
+ }
+ /* Now read local header */
+ bh = NULL;
+ status = ocfs2_read_quota_block(lqinode, 0, &bh);
+ if (status) {
+ mlog_errno(status);
+ mlog(ML_ERROR, "failed to read quota file info header "
+ "(slot=%d type=%d)\n", slot_num, type);
+ goto out_lock;
+ }
+ ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
+ OCFS2_LOCAL_INFO_OFF);
+ status = ocfs2_recovery_load_quota(lqinode, ldinfo, type,
+ &rec->r_list[type]);
+ brelse(bh);
+ /* The labels below sit inside the loop body: they are the
+ * per-type cleanup, reached by fall-through on success too */
+out_lock:
+ ocfs2_inode_unlock(lqinode, 1);
+out_put:
+ iput(lqinode);
+ if (status < 0)
+ break;
+ }
+out:
+ if (status < 0) {
+ ocfs2_free_quota_recovery(rec);
+ rec = ERR_PTR(status);
+ }
+ return rec;
+}
+
+/* Sync changes in local quota file into global quota file and
+ * reinitialize local quota file.
+ * The function expects local quota file to be already locked and
+ * dqonoff_mutex locked.
+ *
+ * Walks the recovery chunks queued in rec->r_list[type]. For every bit set
+ * in a chunk's saved bitmap the entry's space/inode changes are added to
+ * the in-memory dquot, the reference of the crashed node is dropped with
+ * ocfs2_global_release_dquot() and the entry is freed in the on-disk chunk
+ * header. Processed chunks are removed from the list; on error the rest of
+ * the list is freed as well. Returns 0 or the first error hit. */
+static int ocfs2_recover_local_quota_file(struct inode *lqinode,
+ int type,
+ struct ocfs2_quota_recovery *rec)
+{
+ struct super_block *sb = lqinode->i_sb;
+ struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+ struct ocfs2_local_disk_chunk *dchunk;
+ struct ocfs2_local_disk_dqblk *dqblk;
+ struct dquot *dquot;
+ handle_t *handle;
+ struct buffer_head *hbh = NULL, *qbh = NULL;
+ int status = 0;
+ int bit, chunk;
+ struct ocfs2_recovery_chunk *rchunk, *next;
+ qsize_t spacechange, inodechange;
+
+ mlog_entry("ino=%lu type=%u", (unsigned long)lqinode->i_ino, type);
+
+ status = ocfs2_lock_global_qf(oinfo, 1);
+ if (status < 0)
+ goto out;
+
+ list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) {
+ chunk = rchunk->rc_chunk;
+ hbh = NULL;
+ status = ocfs2_read_quota_block(lqinode,
+ ol_quota_chunk_block(sb, chunk),
+ &hbh);
+ if (status) {
+ mlog_errno(status);
+ break;
+ }
+ /* dchunk (bitmap + free count) lives in the header buffer */
+ dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data;
+ for_each_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) {
+ qbh = NULL;
+ status = ocfs2_read_quota_block(lqinode,
+ ol_dqblk_block(sb, chunk, bit),
+ &qbh);
+ if (status) {
+ mlog_errno(status);
+ break;
+ }
+ /* dqblk lives in the entry buffer and is only read */
+ dqblk = (struct ocfs2_local_disk_dqblk *)(qbh->b_data +
+ ol_dqblk_block_off(sb, chunk, bit));
+ dquot = dqget(sb, le64_to_cpu(dqblk->dqb_id), type);
+ if (!dquot) {
+ status = -EIO;
+ mlog(ML_ERROR, "Failed to get quota structure "
+ "for id %u, type %d. Cannot finish quota "
+ "file recovery.\n",
+ (unsigned)le64_to_cpu(dqblk->dqb_id),
+ type);
+ goto out_put_bh;
+ }
+ handle = ocfs2_start_trans(OCFS2_SB(sb),
+ OCFS2_QSYNC_CREDITS);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto out_put_dquot;
+ }
+ mutex_lock(&sb_dqopt(sb)->dqio_mutex);
+ spin_lock(&dq_data_lock);
+ /* Add usage from quota entry into quota changes
+ * of our node. Auxiliary variables are important
+ * due to signedness */
+ spacechange = le64_to_cpu(dqblk->dqb_spacemod);
+ inodechange = le64_to_cpu(dqblk->dqb_inodemod);
+ dquot->dq_dqb.dqb_curspace += spacechange;
+ dquot->dq_dqb.dqb_curinodes += inodechange;
+ spin_unlock(&dq_data_lock);
+ /* We want to drop reference held by the crashed
+ * node. Since we have our own reference we know
+ * global structure actually won't be freed. */
+ status = ocfs2_global_release_dquot(dquot);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_commit;
+ }
+ /* Release local quota file entry. The bitmap and the
+ * free count being changed are in the chunk header,
+ * so hbh is the buffer that has to be journaled,
+ * locked and dirtied. */
+ status = ocfs2_journal_access_dq(handle, lqinode,
+ hbh, OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_commit;
+ }
+ lock_buffer(hbh);
+ WARN_ON(!ocfs2_test_bit(bit, dchunk->dqc_bitmap));
+ ocfs2_clear_bit(bit, dchunk->dqc_bitmap);
+ le32_add_cpu(&dchunk->dqc_free, 1);
+ unlock_buffer(hbh);
+ status = ocfs2_journal_dirty(handle, hbh);
+ if (status < 0)
+ mlog_errno(status);
+ /* Per-entry cleanup; also reached by fall-through */
+out_commit:
+ mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
+ ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out_put_dquot:
+ dqput(dquot);
+out_put_bh:
+ brelse(qbh);
+ if (status < 0)
+ break;
+ }
+ brelse(hbh);
+ /* This chunk is done with, whether it succeeded or not */
+ list_del(&rchunk->rc_list);
+ kfree(rchunk->rc_bitmap);
+ kfree(rchunk);
+ if (status < 0)
+ break;
+ }
+ ocfs2_unlock_global_qf(oinfo, 1);
+out:
+ if (status < 0)
+ free_recovery_list(&(rec->r_list[type]));
+ mlog_exit(status);
+ return status;
+}
+
+/* Recover local quota files for given node different from us.
+ *
+ * For every quota type that has chunks queued in 'rec', tries to lock the
+ * local quota file of slot 'slot_num' without queueing, recovers it if it
+ * is not marked clean and, unless the slot is our own, marks it clean.
+ * 'rec' is consumed: it is freed on every path, together with any chunks
+ * still queued in it. Returns 0 or the first error hit. */
+int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
+ struct ocfs2_quota_recovery *rec,
+ int slot_num)
+{
+ unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
+ LOCAL_GROUP_QUOTA_SYSTEM_INODE };
+ struct super_block *sb = osb->sb;
+ struct ocfs2_local_disk_dqinfo *ldinfo;
+ struct buffer_head *bh;
+ handle_t *handle;
+ int type;
+ int status = 0;
+ struct inode *lqinode;
+ unsigned int flags;
+
+ mlog(ML_NOTICE, "Finishing quota recovery in slot %u\n", slot_num);
+ mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
+ for (type = 0; type < MAXQUOTAS; type++) {
+ if (list_empty(&(rec->r_list[type])))
+ continue;
+ mlog(0, "Recovering quota in slot %d\n", slot_num);
+ lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num);
+ if (!lqinode) {
+ status = -ENOENT;
+ goto out;
+ }
+ status = ocfs2_inode_lock_full(lqinode, NULL, 1,
+ OCFS2_META_LOCK_NOQUEUE);
+ /* Someone else is holding the lock? Then he must be
+ * doing the recovery. Just skip the file... */
+ if (status == -EAGAIN) {
+ mlog(ML_NOTICE, "skipping quota recovery for slot %d "
+ "because quota file is locked.\n", slot_num);
+ status = 0;
+ goto out_put;
+ } else if (status < 0) {
+ mlog_errno(status);
+ goto out_put;
+ }
+ /* Now read local header */
+ bh = NULL;
+ status = ocfs2_read_quota_block(lqinode, 0, &bh);
+ if (status) {
+ mlog_errno(status);
+ mlog(ML_ERROR, "failed to read quota file info header "
+ "(slot=%d type=%d)\n", slot_num, type);
+ goto out_lock;
+ }
+ ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
+ OCFS2_LOCAL_INFO_OFF);
+ /* Is recovery still needed? */
+ flags = le32_to_cpu(ldinfo->dqi_flags);
+ if (!(flags & OLQF_CLEAN))
+ status = ocfs2_recover_local_quota_file(lqinode,
+ type,
+ rec);
+ /* We don't want to mark file as clean when it is actually
+ * active */
+ if (slot_num == osb->slot_num)
+ goto out_bh;
+ /* Mark quota file as clean if we are recovering quota file of
+ * some other node. */
+ handle = ocfs2_start_trans(osb, 1);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto out_bh;
+ }
+ status = ocfs2_journal_access_dq(handle, lqinode, bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_trans;
+ }
+ lock_buffer(bh);
+ ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN);
+ unlock_buffer(bh);
+ status = ocfs2_journal_dirty(handle, bh);
+ if (status < 0)
+ mlog_errno(status);
+ /* Per-type cleanup; also reached by fall-through */
+out_trans:
+ ocfs2_commit_trans(osb, handle);
+out_bh:
+ brelse(bh);
+out_lock:
+ ocfs2_inode_unlock(lqinode, 1);
+out_put:
+ iput(lqinode);
+ if (status < 0)
+ break;
+ }
+out:
+ mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
+ /* Chunks can still be queued here (file skipped, already clean, or an
+ * error before a later type was reached), so free the lists too and
+ * not just the container */
+ ocfs2_free_quota_recovery(rec);
+ return status;
+}
+
+/* Read information header from quota file.
+ *
+ * Allocates the ocfs2 private quota info for 'type', reads the global
+ * info, locks the local quota file, reads its info header, queues chunks
+ * for recovery if the file is not marked clean, loads the chunk bitmaps
+ * and finally clears OLQF_CLEAN on disk to mark the file as in use.
+ * Returns 0 on success and -1 on any failure.
+ * On success the local quota file stays locked and both dqi_ibh and
+ * dqi_lqi_bh keep their buffer references. */
+static int ocfs2_local_read_info(struct super_block *sb, int type)
+{
+ struct ocfs2_local_disk_dqinfo *ldinfo;
+ struct mem_dqinfo *info = sb_dqinfo(sb, type);
+ struct ocfs2_mem_dqinfo *oinfo;
+ struct inode *lqinode = sb_dqopt(sb)->files[type];
+ int status;
+ struct buffer_head *bh = NULL;
+ struct ocfs2_quota_recovery *rec;
+ int locked = 0;
+
+ info->dqi_maxblimit = 0x7fffffffffffffffLL;
+ info->dqi_maxilimit = 0x7fffffffffffffffLL;
+ oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS);
+ if (!oinfo) {
+ mlog(ML_ERROR, "failed to allocate memory for ocfs2 quota"
+ " info.");
+ goto out_err;
+ }
+ info->dqi_priv = oinfo;
+ oinfo->dqi_type = type;
+ INIT_LIST_HEAD(&oinfo->dqi_chunk);
+ oinfo->dqi_rec = NULL;
+ oinfo->dqi_lqi_bh = NULL;
+ oinfo->dqi_ibh = NULL;
+
+ /* NOTE(review): oinfo is not zeroed and dqi_gqinode / dqi_gqlock are
+ * not set up in this function; if ocfs2_global_read_info() can fail
+ * before initializing them, out_err touches garbage - confirm. */
+ status = ocfs2_global_read_info(sb, type);
+ if (status < 0)
+ goto out_err;
+
+ status = ocfs2_inode_lock(lqinode, &oinfo->dqi_lqi_bh, 1);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_err;
+ }
+ locked = 1;
+
+ /* Now read local header */
+ status = ocfs2_read_quota_block(lqinode, 0, &bh);
+ if (status) {
+ mlog_errno(status);
+ mlog(ML_ERROR, "failed to read quota file info header "
+ "(type=%d)\n", type);
+ goto out_err;
+ }
+ ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
+ OCFS2_LOCAL_INFO_OFF);
+ info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags);
+ oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks);
+ oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks);
+ /* oinfo keeps the header buffer; its reference is the one held in bh */
+ oinfo->dqi_ibh = bh;
+
+ /* We crashed when using local quota file? */
+ if (!(info->dqi_flags & OLQF_CLEAN)) {
+ /* Reuse the superblock-wide recovery structure if it is
+ * already there, otherwise create it */
+ rec = OCFS2_SB(sb)->quota_rec;
+ if (!rec) {
+ rec = ocfs2_alloc_quota_recovery();
+ if (!rec) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto out_err;
+ }
+ OCFS2_SB(sb)->quota_rec = rec;
+ }
+
+ status = ocfs2_recovery_load_quota(lqinode, ldinfo, type,
+ &rec->r_list[type]);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_err;
+ }
+ }
+
+ status = ocfs2_load_local_quota_bitmaps(lqinode,
+ ldinfo,
+ &oinfo->dqi_chunk);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_err;
+ }
+
+ /* Now mark quota file as used */
+ info->dqi_flags &= ~OLQF_CLEAN;
+ status = ocfs2_modify_bh(lqinode, bh, olq_update_info, info);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_err;
+ }
+
+ return 0;
+out_err:
+ /* oinfo is NULL here only when its allocation failed */
+ if (oinfo) {
+ iput(oinfo->dqi_gqinode);
+ ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock);
+ ocfs2_lock_res_free(&oinfo->dqi_gqlock);
+ brelse(oinfo->dqi_lqi_bh);
+ if (locked)
+ ocfs2_inode_unlock(lqinode, 1);
+ ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk);
+ /* NOTE(review): info->dqi_priv still points at oinfo after
+ * this kfree() - confirm no caller looks at it on failure. */
+ kfree(oinfo);
+ }
+ brelse(bh);
+ /* The specific error in 'status' is not propagated */
+ return -1;
+}
+
+/* Write the in-memory local info of the given type into the info header
+ * buffer of the local quota file. Returns 0 on success, -1 on failure. */
+static int ocfs2_local_write_info(struct super_block *sb, int type)
+{
+ struct mem_dqinfo *minfo = sb_dqinfo(sb, type);
+ struct ocfs2_mem_dqinfo *qinfo = minfo->dqi_priv;
+ int err;
+
+ err = ocfs2_modify_bh(sb_dqopt(sb)->files[type], qinfo->dqi_ibh,
+ olq_update_info, minfo);
+ if (err >= 0)
+ return 0;
+
+ mlog_errno(err);
+ return -1;
+}
+
+/* Release info from memory.
+ *
+ * Cancels the sync work, drops the global quota inode and lock resource,
+ * checks that every chunk has all of its entries free, releases the chunk
+ * list and - only when nothing was found in use and no recovery is
+ * pending - sets OLQF_CLEAN on disk. The local quota file is then
+ * unlocked and the private info freed. Always returns 0. */
+static int ocfs2_local_free_info(struct super_block *sb, int type)
+{
+ struct mem_dqinfo *info = sb_dqinfo(sb, type);
+ struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+ struct ocfs2_quota_chunk *chunk;
+ struct ocfs2_local_disk_chunk *dchunk;
+ int mark_clean = 1, len;
+ int status;
+
+ /* At this point we know there are no more dquots and thus
+ * even if there's some sync in the pdflush queue, it won't
+ * find any dquots and return without doing anything */
+ cancel_delayed_work_sync(&oinfo->dqi_sync_work);
+ iput(oinfo->dqi_gqinode);
+ ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock);
+ ocfs2_lock_res_free(&oinfo->dqi_gqlock);
+ list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) {
+ dchunk = (struct ocfs2_local_disk_chunk *)
+ (chunk->qc_headerbh->b_data);
+ /* Every chunk but the last is full-sized; the last one only
+ * covers the blocks up to dqi_blocks */
+ if (chunk->qc_num < oinfo->dqi_chunks - 1) {
+ len = ol_chunk_entries(sb);
+ } else {
+ len = (oinfo->dqi_blocks -
+ ol_quota_chunk_block(sb, chunk->qc_num) - 1)
+ * ol_quota_entries_per_block(sb);
+ }
+ /* Not all entries free? Bug! */
+ if (le32_to_cpu(dchunk->dqc_free) != len) {
+ mlog(ML_ERROR, "releasing quota file with used "
+ "entries (type=%d)\n", type);
+ mark_clean = 0;
+ }
+ }
+ ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk);
+
+ /* dqonoff_mutex protects us against racing with recovery thread... */
+ if (oinfo->dqi_rec) {
+ ocfs2_free_quota_recovery(oinfo->dqi_rec);
+ mark_clean = 0;
+ }
+
+ if (!mark_clean)
+ goto out;
+
+ /* Mark local file as clean */
+ info->dqi_flags |= OLQF_CLEAN;
+ status = ocfs2_modify_bh(sb_dqopt(sb)->files[type],
+ oinfo->dqi_ibh,
+ olq_update_info,
+ info);
+ if (status < 0) {
+ /* Only logged: teardown continues and 0 is still returned */
+ mlog_errno(status);
+ goto out;
+ }
+
+out:
+ ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1);
+ brelse(oinfo->dqi_ibh);
+ brelse(oinfo->dqi_lqi_bh);
+ kfree(oinfo);
+ return 0;
+}
+
+/*
+ * ocfs2_modify_bh() callback: fill the on-disk local dquot slot in 'bh'
+ * with the dquot id and the space / inode deltas accumulated since the
+ * original values were recorded.  'private' is the struct ocfs2_dquot.
+ */
+static void olq_set_dquot(struct buffer_head *bh, void *private)
+{
+	struct ocfs2_dquot *odq = private;
+	struct super_block *sb = odq->dq_dquot.dq_sb;
+	struct ocfs2_local_disk_dqblk *slot;
+
+	slot = (struct ocfs2_local_disk_dqblk *)
+		(bh->b_data + ol_dqblk_block_offset(sb, odq->dq_local_off));
+
+	slot->dqb_id = cpu_to_le64(odq->dq_dquot.dq_id);
+
+	/* The usage counters are read under dq_data_lock */
+	spin_lock(&dq_data_lock);
+	slot->dqb_spacemod =
+		cpu_to_le64(odq->dq_dquot.dq_dqb.dqb_curspace -
+			    odq->dq_origspace);
+	slot->dqb_inodemod =
+		cpu_to_le64(odq->dq_dquot.dq_dqb.dqb_curinodes -
+			    odq->dq_originodes);
+	spin_unlock(&dq_data_lock);
+
+	mlog(0, "Writing local dquot %u space %lld inodes %lld\n",
+	     odq->dq_dquot.dq_id,
+	     (long long)le64_to_cpu(slot->dqb_spacemod),
+	     (long long)le64_to_cpu(slot->dqb_inodemod));
+}
+
+/*
+ * Write one dquot into its slot of the local quota file.
+ *
+ * Reads the block that holds the slot and rewrites the slot through
+ * ocfs2_modify_bh() with olq_set_dquot().  Returns 0 or the error from
+ * the read / modify step.
+ */
+static int ocfs2_local_write_dquot(struct dquot *dquot)
+{
+	struct super_block *sb = dquot->dq_sb;
+	struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
+	struct inode *lqinode = sb_dqopt(sb)->files[dquot->dq_type];
+	struct buffer_head *blk = NULL;
+	int ret;
+
+	ret = ocfs2_read_quota_block(lqinode,
+				     ol_dqblk_file_block(sb, od->dq_local_off),
+				     &blk);
+	if (ret) {
+		mlog_errno(ret);
+	} else {
+		ret = ocfs2_modify_bh(lqinode, blk, olq_set_dquot, od);
+		if (ret < 0)
+			mlog_errno(ret);
+	}
+
+	/* blk is still NULL if the read failed; brelse() is called either way */
+	brelse(blk);
+	return ret;
+}
+
+/*
+ * Look for a free dquot slot in the chunks already present in the local
+ * quota file.
+ *
+ * Returns the first chunk whose header reports free entries and stores the
+ * index of its first clear bitmap bit in *offset.  Returns NULL when no
+ * chunk has a free entry, or ERR_PTR(-EIO) when the header claims free
+ * entries but the bitmap has none.
+ */
+static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb,
+						       int type,
+						       int *offset)
+{
+	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;
+	struct ocfs2_quota_chunk *cur, *chunk = NULL;
+	struct ocfs2_local_disk_chunk *dchunk = NULL;
+	int entries, bit;
+
+	list_for_each_entry(cur, &oinfo->dqi_chunk, qc_chunk) {
+		dchunk = (struct ocfs2_local_disk_chunk *)
+				cur->qc_headerbh->b_data;
+		if (le32_to_cpu(dchunk->dqc_free) > 0) {
+			chunk = cur;
+			break;
+		}
+	}
+	if (!chunk)
+		return NULL;
+
+	/* The last chunk may be only partially populated with blocks */
+	if (chunk->qc_num < oinfo->dqi_chunks - 1)
+		entries = ol_chunk_entries(sb);
+	else
+		entries = (oinfo->dqi_blocks -
+			   ol_quota_chunk_block(sb, chunk->qc_num) - 1)
+			  * ol_quota_entries_per_block(sb);
+
+	bit = ocfs2_find_next_zero_bit(dchunk->dqc_bitmap, entries, 0);
+	/* Header and bitmap disagree */
+	if (bit == entries) {
+		mlog(ML_ERROR, "Did not find empty entry in chunk %d with %u"
+		     " entries free (type=%d)\n", chunk->qc_num,
+		     le32_to_cpu(dchunk->dqc_free), type);
+		return ERR_PTR(-EIO);
+	}
+
+	*offset = bit;
+	return chunk;
+}
+
+/*
+ * Add a new chunk (a header block plus one data block) to the end of the
+ * local quota file.
+ *
+ * The file is extended by two blocks, the new chunk header is initialised
+ * inside a transaction, and the in-memory block / chunk counts are bumped
+ * and written out with ocfs2_local_write_info().
+ *
+ * On success the chunk is linked at the tail of oinfo->dqi_chunk, *offset
+ * is set to 0 (first entry of the new chunk) and the chunk is returned.
+ * On failure an ERR_PTR() is returned.
+ */
+static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk(
+							struct super_block *sb,
+							int type,
+							int *offset)
+{
+	struct mem_dqinfo *info = sb_dqinfo(sb, type);
+	struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+	struct inode *lqinode = sb_dqopt(sb)->files[type];
+	struct ocfs2_quota_chunk *chunk = NULL;
+	struct ocfs2_local_disk_chunk *dchunk;
+	int status;
+	handle_t *handle;
+	struct buffer_head *bh = NULL;
+	u64 p_blkno;
+
+	/* We are protected by dqio_sem so no locking needed */
+	status = ocfs2_extend_no_holes(lqinode,
+				       lqinode->i_size + 2 * sb->s_blocksize,
+				       lqinode->i_size);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+	status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh,
+					  lqinode->i_size + 2 * sb->s_blocksize);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	chunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS);
+	if (!chunk) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto out;
+	}
+
+	/* Map the first new block; it becomes the chunk header */
+	down_read(&OCFS2_I(lqinode)->ip_alloc_sem);
+	status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks,
+					     &p_blkno, NULL, NULL);
+	up_read(&OCFS2_I(lqinode)->ip_alloc_sem);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+	bh = sb_getblk(sb, p_blkno);
+	if (!bh) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto out;
+	}
+	dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data;
+
+	handle = ocfs2_start_trans(OCFS2_SB(sb), 2);
+	if (IS_ERR(handle)) {
+		status = PTR_ERR(handle);
+		mlog_errno(status);
+		goto out;
+	}
+
+	status = ocfs2_journal_access_dq(handle, lqinode, bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+	/* Fresh header: one data block worth of entries, all of them free */
+	lock_buffer(bh);
+	dchunk->dqc_free = cpu_to_le32(ol_quota_entries_per_block(sb));
+	memset(dchunk->dqc_bitmap, 0,
+	       sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) -
+	       OCFS2_QBLK_RESERVED_SPACE);
+	set_buffer_uptodate(bh);
+	unlock_buffer(bh);
+	status = ocfs2_journal_dirty(handle, bh);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+
+	oinfo->dqi_blocks += 2;
+	oinfo->dqi_chunks++;
+	status = ocfs2_local_write_info(sb, type);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_trans;
+	}
+	status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	/*
+	 * Number the chunk before linking it.  With an empty list there is
+	 * no predecessor to read qc_num from (the list head is not embedded
+	 * in a chunk), so the first chunk gets number 0.
+	 */
+	if (list_empty(&oinfo->dqi_chunk))
+		chunk->qc_num = 0;
+	else
+		chunk->qc_num = list_entry(oinfo->dqi_chunk.prev,
+					   struct ocfs2_quota_chunk,
+					   qc_chunk)->qc_num + 1;
+	list_add_tail(&chunk->qc_chunk, &oinfo->dqi_chunk);
+	chunk->qc_headerbh = bh;
+	*offset = 0;
+	return chunk;
+out_trans:
+	ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out:
+	brelse(bh);
+	/* chunk is still NULL when we failed before or at the allocation */
+	if (chunk)
+		kmem_cache_free(ocfs2_qf_chunk_cachep, chunk);
+	return ERR_PTR(status);
+}
+
+/*
+ * Make room for one more dquot entry by growing the local quota file.
+ *
+ * If there is no chunk yet, or the last chunk already spans
+ * ol_chunk_blocks(sb) data blocks, a whole new chunk is added via
+ * ocfs2_local_quota_add_chunk().  Otherwise the file is extended by one
+ * block, the last chunk's free count is raised by one block worth of
+ * entries and the updated info header is written out.
+ *
+ * Returns the chunk that now has free space, with *offset set to the first
+ * entry of the newly added block, or an ERR_PTR() on failure.
+ */
+static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file(
+ struct super_block *sb,
+ int type,
+ int *offset)
+{
+ struct mem_dqinfo *info = sb_dqinfo(sb, type);
+ struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv;
+ struct ocfs2_quota_chunk *chunk;
+ struct inode *lqinode = sb_dqopt(sb)->files[type];
+ struct ocfs2_local_disk_chunk *dchunk;
+ int epb = ol_quota_entries_per_block(sb);
+ unsigned int chunk_blocks;
+ int status;
+ handle_t *handle;
+
+ if (list_empty(&oinfo->dqi_chunk))
+ return ocfs2_local_quota_add_chunk(sb, type, offset);
+ /* Is the last chunk full? */
+ chunk = list_entry(oinfo->dqi_chunk.prev,
+ struct ocfs2_quota_chunk, qc_chunk);
+ /* Data blocks currently in the last chunk (its header is excluded) */
+ chunk_blocks = oinfo->dqi_blocks -
+ ol_quota_chunk_block(sb, chunk->qc_num) - 1;
+ if (ol_chunk_blocks(sb) == chunk_blocks)
+ return ocfs2_local_quota_add_chunk(sb, type, offset);
+
+ /* We are protected by dqio_sem so no locking needed */
+ status = ocfs2_extend_no_holes(lqinode,
+ lqinode->i_size + sb->s_blocksize,
+ lqinode->i_size);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+ status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh,
+ lqinode->i_size + sb->s_blocksize);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+ handle = ocfs2_start_trans(OCFS2_SB(sb), 2);
+ if (IS_ERR(handle)) {
+ status = PTR_ERR(handle);
+ mlog_errno(status);
+ goto out;
+ }
+ status = ocfs2_journal_access_dq(handle, lqinode, chunk->qc_headerbh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_trans;
+ }
+
+ /* Account for the new block's entries in the chunk header */
+ dchunk = (struct ocfs2_local_disk_chunk *)chunk->qc_headerbh->b_data;
+ lock_buffer(chunk->qc_headerbh);
+ le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb));
+ unlock_buffer(chunk->qc_headerbh);
+ status = ocfs2_journal_dirty(handle, chunk->qc_headerbh);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_trans;
+ }
+ oinfo->dqi_blocks++;
+ status = ocfs2_local_write_info(sb, type);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out_trans;
+ }
+
+ status = ocfs2_commit_trans(OCFS2_SB(sb), handle);
+ if (status < 0) {
+ mlog_errno(status);
+ goto out;
+ }
+ /* First entry of the block just added: skip the entries that existed */
+ *offset = chunk_blocks * epb;
+ return chunk;
+out_trans:
+ ocfs2_commit_trans(OCFS2_SB(sb), handle);
+out:
+ return ERR_PTR(status);
+}
+
+/*
+ * ocfs2_modify_bh() callback: mark one entry of a chunk as allocated.
+ * 'private' points to the int index of the entry within the chunk; its
+ * bitmap bit is set and the chunk's free count is decremented.
+ */
+static void olq_alloc_dquot(struct buffer_head *bh, void *private)
+{
+	struct ocfs2_local_disk_chunk *hdr =
+		(struct ocfs2_local_disk_chunk *)bh->b_data;
+	int *bit = private;
+
+	ocfs2_set_bit(*bit, hdr->dqc_bitmap);
+	le32_add_cpu(&hdr->dqc_free, -1);
+}
+
+/*
+ * Allocate a slot in the local quota file for 'dquot' and initialise it.
+ *
+ * A free entry is searched for in the existing chunks; if none is found the
+ * file is extended.  The dquot is then written to its slot and the slot is
+ * marked as allocated in the chunk header.  Returns 0 or a negative error.
+ */
+static int ocfs2_create_local_dquot(struct dquot *dquot)
+{
+	struct super_block *sb = dquot->dq_sb;
+	int type = dquot->dq_type;
+	struct inode *lqinode = sb_dqopt(sb)->files[type];
+	struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
+	struct ocfs2_quota_chunk *chunk;
+	int offset;
+	int err;
+
+	chunk = ocfs2_find_free_entry(sb, type, &offset);
+	/* NULL means no existing chunk has room: grow the file instead */
+	if (!chunk)
+		chunk = ocfs2_extend_local_quota_file(sb, type, &offset);
+	if (IS_ERR(chunk))
+		return PTR_ERR(chunk);
+
+	od->dq_local_off = ol_dqblk_off(sb, chunk->qc_num, offset);
+	od->dq_chunk = chunk;
+
+	/* Initialize dquot structure on disk */
+	err = ocfs2_local_write_dquot(dquot);
+	if (err < 0) {
+		mlog_errno(err);
+		return err;
+	}
+
+	/* Mark structure as allocated */
+	err = ocfs2_modify_bh(lqinode, chunk->qc_headerbh, olq_alloc_dquot,
+			      &offset);
+	if (err < 0)
+		mlog_errno(err);
+
+	return err;
+}
+
+/*
+ * Read a dquot: load its data from the global quota file, then create the
+ * matching entry in the local quota file.  Returns 0 on success or the
+ * negative error of the step that failed.
+ */
+static int ocfs2_local_read_dquot(struct dquot *dquot)
+{
+	int status;
+
+	mlog_entry("id=%u, type=%d\n", dquot->dq_id, dquot->dq_type);
+
+	status = ocfs2_global_read_dquot(dquot);
+	/* Only create the local entry once the global read has succeeded */
+	if (status >= 0)
+		status = ocfs2_create_local_dquot(dquot);
+
+	if (status < 0) {
+		mlog_errno(status);
+		mlog_exit(status);
+		return status;
+	}
+
+	mlog_exit(0);
+	return 0;
+}
+
+/*
+ * Release a dquot's slot in the local quota file.
+ *
+ * ocfs2_release_dquot() has already started a transaction and obtained the
+ * exclusive lock for the global quota file.  The local changes are first
+ * pushed to the global file, then the slot's bitmap bit is cleared and the
+ * chunk's free count is incremented under the running transaction.
+ *
+ * DQ_READ_B is cleared on every path, success or failure.
+ */
+static int ocfs2_local_release_dquot(struct dquot *dquot)
+{
+	struct ocfs2_dquot *od = OCFS2_DQUOT(dquot);
+	struct super_block *sb = dquot->dq_sb;
+	handle_t *handle = journal_current_handle();
+	struct ocfs2_local_disk_chunk *hdr;
+	struct buffer_head *hdr_bh;
+	int bit;
+	int status;
+
+	BUG_ON(!handle);
+	/* First write all local changes to global file */
+	status = ocfs2_global_release_dquot(dquot);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	hdr_bh = od->dq_chunk->qc_headerbh;
+	status = ocfs2_journal_access_dq(handle,
+					 sb_dqopt(sb)->files[dquot->dq_type],
+					 hdr_bh, OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out;
+	}
+
+	bit = ol_dqblk_chunk_off(sb, od->dq_chunk->qc_num, od->dq_local_off);
+	hdr = (struct ocfs2_local_disk_chunk *)hdr_bh->b_data;
+
+	/* Mark structure as freed */
+	lock_buffer(hdr_bh);
+	ocfs2_clear_bit(bit, hdr->dqc_bitmap);
+	le32_add_cpu(&hdr->dqc_free, 1);
+	unlock_buffer(hdr_bh);
+
+	status = ocfs2_journal_dirty(handle, hdr_bh);
+	if (status < 0)
+		mlog_errno(status);
+	else
+		status = 0;
+out:
+	/* Clear the read bit so that next time someone uses this
+	 * dquot he reads fresh info from disk and allocates local
+	 * dquot structure */
+	clear_bit(DQ_READ_B, &dquot->dq_flags);
+	return status;
+}
+
+/*
+ * Quota format operations for OCFS2.  Most hooks operate on the local quota
+ * file; note that .write_file_info is wired to ocfs2_global_write_info, not
+ * to ocfs2_local_write_info.
+ */
+static struct quota_format_ops ocfs2_format_ops = {
+ .check_quota_file = ocfs2_local_check_quota_file,
+ .read_file_info = ocfs2_local_read_info,
+ .write_file_info = ocfs2_global_write_info,
+ .free_file_info = ocfs2_local_free_info,
+ .read_dqblk = ocfs2_local_read_dquot,
+ .commit_dqblk = ocfs2_local_write_dquot,
+ .release_dqblk = ocfs2_local_release_dquot,
+};
+
+/* Quota format descriptor binding the QFMT_OCFS2 id to ocfs2_format_ops */
+struct quota_format_type ocfs2_quota_format = {
+ .qf_fmt_id = QFMT_OCFS2,
+ .qf_ops = &ocfs2_format_ops,
+ .qf_owner = THIS_MODULE
+};
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index ffd48db229a7..424adaa5f900 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -106,8 +106,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n",
new_clusters, first_new_cluster);
- ret = ocfs2_journal_access(handle, bm_inode, group_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_gd(handle, bm_inode, group_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -141,8 +141,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle,
}
/* update the inode accordingly. */
- ret = ocfs2_journal_access(handle, bm_inode, bm_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_di(handle, bm_inode, bm_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
goto out_rollback;
@@ -314,6 +314,10 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
fe = (struct ocfs2_dinode *)main_bm_bh->b_data;
+ /* main_bm_bh is validated by inode read inside ocfs2_inode_lock(),
+ * so any corruption is a code bug. */
+ BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
+
if (le16_to_cpu(fe->id2.i_chain.cl_cpg) !=
ocfs2_group_bitmap_size(osb->sb) * 8) {
mlog(ML_ERROR, "The disk is too old and small. "
@@ -322,30 +326,18 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
goto out_unlock;
}
- if (!OCFS2_IS_VALID_DINODE(fe)) {
- OCFS2_RO_ON_INVALID_DINODE(main_bm_inode->i_sb, fe);
- ret = -EIO;
- goto out_unlock;
- }
-
first_new_cluster = le32_to_cpu(fe->i_clusters);
lgd_blkno = ocfs2_which_cluster_group(main_bm_inode,
first_new_cluster - 1);
- ret = ocfs2_read_block(main_bm_inode, lgd_blkno, &group_bh);
+ ret = ocfs2_read_group_descriptor(main_bm_inode, fe, lgd_blkno,
+ &group_bh);
if (ret < 0) {
mlog_errno(ret);
goto out_unlock;
}
-
group = (struct ocfs2_group_desc *)group_bh->b_data;
- ret = ocfs2_check_group_descriptor(inode->i_sb, fe, group);
- if (ret) {
- mlog_errno(ret);
- goto out_unlock;
- }
-
cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc);
if (le16_to_cpu(group->bg_bits) / cl_bpc + new_clusters >
le16_to_cpu(fe->id2.i_chain.cl_cpg)) {
@@ -398,41 +390,16 @@ static int ocfs2_check_new_group(struct inode *inode,
struct buffer_head *group_bh)
{
int ret;
- struct ocfs2_group_desc *gd;
+ struct ocfs2_group_desc *gd =
+ (struct ocfs2_group_desc *)group_bh->b_data;
u16 cl_bpc = le16_to_cpu(di->id2.i_chain.cl_bpc);
- unsigned int max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) *
- le16_to_cpu(di->id2.i_chain.cl_bpc);
-
- gd = (struct ocfs2_group_desc *)group_bh->b_data;
+ ret = ocfs2_check_group_descriptor(inode->i_sb, di, group_bh);
+ if (ret)
+ goto out;
- ret = -EIO;
- if (!OCFS2_IS_VALID_GROUP_DESC(gd))
- mlog(ML_ERROR, "Group descriptor # %llu isn't valid.\n",
- (unsigned long long)le64_to_cpu(gd->bg_blkno));
- else if (di->i_blkno != gd->bg_parent_dinode)
- mlog(ML_ERROR, "Group descriptor # %llu has bad parent "
- "pointer (%llu, expected %llu)\n",
- (unsigned long long)le64_to_cpu(gd->bg_blkno),
- (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
- (unsigned long long)le64_to_cpu(di->i_blkno));
- else if (le16_to_cpu(gd->bg_bits) > max_bits)
- mlog(ML_ERROR, "Group descriptor # %llu has bit count of %u\n",
- (unsigned long long)le64_to_cpu(gd->bg_blkno),
- le16_to_cpu(gd->bg_bits));
- else if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits))
- mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
- "claims that %u are free\n",
- (unsigned long long)le64_to_cpu(gd->bg_blkno),
- le16_to_cpu(gd->bg_bits),
- le16_to_cpu(gd->bg_free_bits_count));
- else if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size)))
- mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but "
- "max bitmap bits of %u\n",
- (unsigned long long)le64_to_cpu(gd->bg_blkno),
- le16_to_cpu(gd->bg_bits),
- 8 * le16_to_cpu(gd->bg_size));
- else if (le16_to_cpu(gd->bg_chain) != input->chain)
+ ret = -EINVAL;
+ if (le16_to_cpu(gd->bg_chain) != input->chain)
mlog(ML_ERROR, "Group descriptor # %llu has bad chain %u "
"while input has %u set.\n",
(unsigned long long)le64_to_cpu(gd->bg_blkno),
@@ -451,6 +418,7 @@ static int ocfs2_check_new_group(struct inode *inode,
else
ret = 0;
+out:
return ret;
}
@@ -568,8 +536,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
cl = &fe->id2.i_chain;
cr = &cl->cl_recs[input->chain];
- ret = ocfs2_journal_access(handle, main_bm_inode, group_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_gd(handle, main_bm_inode, group_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
goto out_commit;
@@ -584,8 +552,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
goto out_commit;
}
- ret = ocfs2_journal_access(handle, main_bm_inode, main_bm_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_di(handle, main_bm_inode, main_bm_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
goto out_commit;
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index bdda2d8f8508..40661e7824e9 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -151,7 +151,7 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb)
* this is not true, the read of -1 (UINT64_MAX) will fail.
*/
ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh,
- OCFS2_BH_IGNORE_CACHE);
+ OCFS2_BH_IGNORE_CACHE, NULL);
if (ret == 0) {
spin_lock(&osb->osb_lock);
ocfs2_update_slot_info(si);
@@ -405,7 +405,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb,
bh = NULL; /* Acquire a fresh bh */
status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh,
- OCFS2_BH_IGNORE_CACHE);
+ OCFS2_BH_IGNORE_CACHE, NULL);
if (status < 0) {
mlog_errno(status);
goto bail;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index c5ff18b46b57..a69628603e18 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -35,6 +35,7 @@
#include "ocfs2.h"
#include "alloc.h"
+#include "blockcheck.h"
#include "dlmglue.h"
#include "inode.h"
#include "journal.h"
@@ -145,62 +146,183 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl)
return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc);
}
-/* somewhat more expensive than our other checks, so use sparingly. */
-int ocfs2_check_group_descriptor(struct super_block *sb,
- struct ocfs2_dinode *di,
- struct ocfs2_group_desc *gd)
+/*
+ * Report a group descriptor validation error.  Expands in a scope that
+ * provides 'clean_error' and 'sb': when clean_error is set the message is
+ * only logged with mlog(ML_ERROR), otherwise it is passed to ocfs2_error().
+ */
+#define do_error(fmt, ...) \
+ do{ \
+ if (clean_error) \
+ mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \
+ else \
+ ocfs2_error(sb, fmt, ##__VA_ARGS__); \
+ } while (0)
+
+static int ocfs2_validate_gd_self(struct super_block *sb,
+ struct buffer_head *bh,
+ int clean_error)
{
- unsigned int max_bits;
+ struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
- OCFS2_RO_ON_INVALID_GROUP_DESC(sb, gd);
- return -EIO;
+ do_error("Group descriptor #%llu has bad signature %.*s",
+ (unsigned long long)bh->b_blocknr, 7,
+ gd->bg_signature);
+ return -EINVAL;
}
+ if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) {
+ do_error("Group descriptor #%llu has an invalid bg_blkno "
+ "of %llu",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(gd->bg_blkno));
+ return -EINVAL;
+ }
+
+ if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) {
+ do_error("Group descriptor #%llu has an invalid "
+ "fs_generation of #%u",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(gd->bg_generation));
+ return -EINVAL;
+ }
+
+ if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
+ do_error("Group descriptor #%llu has bit count %u but "
+ "claims that %u are free",
+ (unsigned long long)bh->b_blocknr,
+ le16_to_cpu(gd->bg_bits),
+ le16_to_cpu(gd->bg_free_bits_count));
+ return -EINVAL;
+ }
+
+ if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
+ do_error("Group descriptor #%llu has bit count %u but "
+ "max bitmap bits of %u",
+ (unsigned long long)bh->b_blocknr,
+ le16_to_cpu(gd->bg_bits),
+ 8 * le16_to_cpu(gd->bg_size));
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int ocfs2_validate_gd_parent(struct super_block *sb,
+ struct ocfs2_dinode *di,
+ struct buffer_head *bh,
+ int clean_error)
+{
+ unsigned int max_bits;
+ struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data;
+
if (di->i_blkno != gd->bg_parent_dinode) {
- ocfs2_error(sb, "Group descriptor # %llu has bad parent "
- "pointer (%llu, expected %llu)",
- (unsigned long long)le64_to_cpu(gd->bg_blkno),
- (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
- (unsigned long long)le64_to_cpu(di->i_blkno));
- return -EIO;
+ do_error("Group descriptor #%llu has bad parent "
+ "pointer (%llu, expected %llu)",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(gd->bg_parent_dinode),
+ (unsigned long long)le64_to_cpu(di->i_blkno));
+ return -EINVAL;
}
max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc);
if (le16_to_cpu(gd->bg_bits) > max_bits) {
- ocfs2_error(sb, "Group descriptor # %llu has bit count of %u",
- (unsigned long long)le64_to_cpu(gd->bg_blkno),
- le16_to_cpu(gd->bg_bits));
- return -EIO;
+ do_error("Group descriptor #%llu has bit count of %u",
+ (unsigned long long)bh->b_blocknr,
+ le16_to_cpu(gd->bg_bits));
+ return -EINVAL;
}
if (le16_to_cpu(gd->bg_chain) >=
le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) {
- ocfs2_error(sb, "Group descriptor # %llu has bad chain %u",
- (unsigned long long)le64_to_cpu(gd->bg_blkno),
- le16_to_cpu(gd->bg_chain));
- return -EIO;
+ do_error("Group descriptor #%llu has bad chain %u",
+ (unsigned long long)bh->b_blocknr,
+ le16_to_cpu(gd->bg_chain));
+ return -EINVAL;
}
- if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) {
- ocfs2_error(sb, "Group descriptor # %llu has bit count %u but "
- "claims that %u are free",
- (unsigned long long)le64_to_cpu(gd->bg_blkno),
- le16_to_cpu(gd->bg_bits),
- le16_to_cpu(gd->bg_free_bits_count));
- return -EIO;
- }
+ return 0;
+}
- if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) {
- ocfs2_error(sb, "Group descriptor # %llu has bit count %u but "
- "max bitmap bits of %u",
- (unsigned long long)le64_to_cpu(gd->bg_blkno),
- le16_to_cpu(gd->bg_bits),
- 8 * le16_to_cpu(gd->bg_size));
- return -EIO;
+#undef do_error
+
+/*
+ * Validate a group descriptor, reporting problems only through the log.
+ * This variant does not fail the filesystem and exists only for resize.
+ *
+ * The checks run in order: metadata ecc, the descriptor's own fields, then
+ * its relation to the parent dinode 'di'.  The first failure is returned.
+ */
+int ocfs2_check_group_descriptor(struct super_block *sb,
+				 struct ocfs2_dinode *di,
+				 struct buffer_head *bh)
+{
+	struct ocfs2_group_desc *desc = (struct ocfs2_group_desc *)bh->b_data;
+	int err;
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	/*
+	 * If the ecc fails, we return the error but otherwise
+	 * leave the filesystem running.  We know any error is
+	 * local to this block.
+	 */
+	err = ocfs2_validate_meta_ecc(sb, bh->b_data, &desc->bg_check);
+	if (err) {
+		mlog(ML_ERROR,
+		     "Checksum failed for group descriptor %llu\n",
+		     (unsigned long long)bh->b_blocknr);
+		return err;
+	}
+
+	err = ocfs2_validate_gd_self(sb, bh, 1);
+	if (err)
+		return err;
+
+	return ocfs2_validate_gd_parent(sb, di, bh, 1);
+}
+
+/*
+ * Validation callback handed to ocfs2_read_block() for group descriptors.
+ * Checks the metadata ecc first, then the descriptor's own fields.
+ */
+static int ocfs2_validate_group_descriptor(struct super_block *sb,
+					   struct buffer_head *bh)
+{
+	struct ocfs2_group_desc *desc = (struct ocfs2_group_desc *)bh->b_data;
+	int err;
+
+	mlog(0, "Validating group descriptor %llu\n",
+	     (unsigned long long)bh->b_blocknr);
+
+	BUG_ON(!buffer_uptodate(bh));
+
+	/*
+	 * If the ecc fails, we return the error but otherwise
+	 * leave the filesystem running.  We know any error is
+	 * local to this block.
+	 */
+	err = ocfs2_validate_meta_ecc(sb, bh->b_data, &desc->bg_check);
+
+	/* Errors after here are fatal. */
+	if (!err)
+		err = ocfs2_validate_gd_self(sb, bh, 0);
+
+	return err;
+}
+
+int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
+ u64 gd_blkno, struct buffer_head **bh)
+{
+ int rc;
+ struct buffer_head *tmp = *bh;
+
+ rc = ocfs2_read_block(inode, gd_blkno, &tmp,
+ ocfs2_validate_group_descriptor);
+ if (rc)
+ goto out;
+
+ rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0);
+ if (rc) {
+ brelse(tmp);
+ goto out;
}
- return 0;
+ /* If ocfs2_read_block() got us a new bh, pass it up. */
+ if (!*bh)
+ *bh = tmp;
+
+out:
+ return rc;
}
static int ocfs2_block_group_fill(handle_t *handle,
@@ -225,10 +347,10 @@ static int ocfs2_block_group_fill(handle_t *handle,
goto bail;
}
- status = ocfs2_journal_access(handle,
- alloc_inode,
- bg_bh,
- OCFS2_JOURNAL_ACCESS_CREATE);
+ status = ocfs2_journal_access_gd(handle,
+ alloc_inode,
+ bg_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -358,8 +480,8 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
bg = (struct ocfs2_group_desc *) bg_bh->b_data;
- status = ocfs2_journal_access(handle, alloc_inode,
- bh, OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, alloc_inode,
+ bh, OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -441,11 +563,11 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
ac->ac_alloc_slot = slot;
fe = (struct ocfs2_dinode *) bh->b_data;
- if (!OCFS2_IS_VALID_DINODE(fe)) {
- OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
- status = -EIO;
- goto bail;
- }
+
+ /* The bh was validated by the inode read inside
+ * ocfs2_inode_lock(). Any corruption is a code bug. */
+ BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
+
if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) {
ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu",
(unsigned long long)le64_to_cpu(fe->i_blkno));
@@ -790,10 +912,9 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb,
int offset, start, found, status = 0;
struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
- if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
- OCFS2_RO_ON_INVALID_GROUP_DESC(osb->sb, bg);
- return -EIO;
- }
+ /* Callers got this descriptor from
+ * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
+ BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
found = start = best_offset = best_size = 0;
bitmap = bg->bg_bitmap;
@@ -858,11 +979,9 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
mlog_entry_void();
- if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
- OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
- status = -EIO;
- goto bail;
- }
+ /* All callers get the descriptor via
+ * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
+ BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits);
mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off,
@@ -871,10 +990,10 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle,
if (ocfs2_is_cluster_bitmap(alloc_inode))
journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
- status = ocfs2_journal_access(handle,
- alloc_inode,
- group_bh,
- journal_type);
+ status = ocfs2_journal_access_gd(handle,
+ alloc_inode,
+ group_bh,
+ journal_type);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -931,21 +1050,10 @@ static int ocfs2_relink_block_group(handle_t *handle,
struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data;
struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data;
- if (!OCFS2_IS_VALID_DINODE(fe)) {
- OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
- status = -EIO;
- goto out;
- }
- if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
- OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
- status = -EIO;
- goto out;
- }
- if (!OCFS2_IS_VALID_GROUP_DESC(prev_bg)) {
- OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, prev_bg);
- status = -EIO;
- goto out;
- }
+ /* The caller got these descriptors from
+ * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
+ BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
+ BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg));
mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n",
(unsigned long long)le64_to_cpu(fe->i_blkno), chain,
@@ -956,8 +1064,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
bg_ptr = le64_to_cpu(bg->bg_next_group);
prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group);
- status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_gd(handle, alloc_inode, prev_bg_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto out_rollback;
@@ -971,8 +1079,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
goto out_rollback;
}
- status = ocfs2_journal_access(handle, alloc_inode, bg_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_gd(handle, alloc_inode, bg_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto out_rollback;
@@ -986,8 +1094,8 @@ static int ocfs2_relink_block_group(handle_t *handle,
goto out_rollback;
}
- status = ocfs2_journal_access(handle, alloc_inode, fe_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, alloc_inode, fe_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto out_rollback;
@@ -1008,7 +1116,7 @@ out_rollback:
bg->bg_next_group = cpu_to_le64(bg_ptr);
prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr);
}
-out:
+
mlog_exit(status);
return status;
}
@@ -1138,8 +1246,8 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode,
struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data;
struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain;
- ret = ocfs2_journal_access(handle, inode, di_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_di(handle, inode, di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
goto out;
@@ -1170,21 +1278,17 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac,
u16 found;
struct buffer_head *group_bh = NULL;
struct ocfs2_group_desc *gd;
+ struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data;
struct inode *alloc_inode = ac->ac_inode;
- ret = ocfs2_read_block(alloc_inode, gd_blkno, &group_bh);
+ ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno,
+ &group_bh);
if (ret < 0) {
mlog_errno(ret);
return ret;
}
gd = (struct ocfs2_group_desc *) group_bh->b_data;
- if (!OCFS2_IS_VALID_GROUP_DESC(gd)) {
- OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, gd);
- ret = -EIO;
- goto out;
- }
-
ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits,
ac->ac_max_block, bit_off, &found);
if (ret < 0) {
@@ -1241,19 +1345,14 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
bits_wanted, chain,
(unsigned long long)OCFS2_I(alloc_inode)->ip_blkno);
- status = ocfs2_read_block(alloc_inode,
- le64_to_cpu(cl->cl_recs[chain].c_blkno),
- &group_bh);
+ status = ocfs2_read_group_descriptor(alloc_inode, fe,
+ le64_to_cpu(cl->cl_recs[chain].c_blkno),
+ &group_bh);
if (status < 0) {
mlog_errno(status);
goto bail;
}
bg = (struct ocfs2_group_desc *) group_bh->b_data;
- status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg);
- if (status) {
- mlog_errno(status);
- goto bail;
- }
status = -ENOSPC;
/* for now, the chain search is a bit simplistic. We just use
@@ -1271,18 +1370,13 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
next_group = le64_to_cpu(bg->bg_next_group);
prev_group_bh = group_bh;
group_bh = NULL;
- status = ocfs2_read_block(alloc_inode,
- next_group, &group_bh);
+ status = ocfs2_read_group_descriptor(alloc_inode, fe,
+ next_group, &group_bh);
if (status < 0) {
mlog_errno(status);
goto bail;
}
bg = (struct ocfs2_group_desc *) group_bh->b_data;
- status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg);
- if (status) {
- mlog_errno(status);
- goto bail;
- }
}
if (status < 0) {
if (status != -ENOSPC)
@@ -1324,10 +1418,10 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac,
/* Ok, claim our bits now: set the info on dinode, chainlist
* and then the group */
- status = ocfs2_journal_access(handle,
- alloc_inode,
- ac->ac_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle,
+ alloc_inode,
+ ac->ac_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1392,11 +1486,11 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb,
BUG_ON(!ac->ac_bh);
fe = (struct ocfs2_dinode *) ac->ac_bh->b_data;
- if (!OCFS2_IS_VALID_DINODE(fe)) {
- OCFS2_RO_ON_INVALID_DINODE(osb->sb, fe);
- status = -EIO;
- goto bail;
- }
+
+ /* The bh was validated by the inode read during
+ * ocfs2_reserve_suballoc_bits(). Any corruption is a code bug. */
+ BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
+
if (le32_to_cpu(fe->id1.bitmap1.i_used) >=
le32_to_cpu(fe->id1.bitmap1.i_total)) {
ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used "
@@ -1725,19 +1819,17 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle,
mlog_entry_void();
- if (!OCFS2_IS_VALID_GROUP_DESC(bg)) {
- OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg);
- status = -EIO;
- goto bail;
- }
+ /* The caller got this descriptor from
+ * ocfs2_read_group_descriptor(). Any corruption is a code bug. */
+ BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg));
mlog(0, "off = %u, num = %u\n", bit_off, num_bits);
if (ocfs2_is_cluster_bitmap(alloc_inode))
journal_type = OCFS2_JOURNAL_ACCESS_UNDO;
- status = ocfs2_journal_access(handle, alloc_inode, group_bh,
- journal_type);
+ status = ocfs2_journal_access_gd(handle, alloc_inode, group_bh,
+ journal_type);
if (status < 0) {
mlog_errno(status);
goto bail;
@@ -1782,29 +1874,26 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
mlog_entry_void();
- if (!OCFS2_IS_VALID_DINODE(fe)) {
- OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe);
- status = -EIO;
- goto bail;
- }
+ /* The alloc_bh comes from ocfs2_free_dinode() or
+ * ocfs2_free_clusters(). The callers have all locked the
+ * allocator and gotten alloc_bh from the lock call. This
+ * validates the dinode buffer. Any corruption that has happened
+ * is a code bug. */
+ BUG_ON(!OCFS2_IS_VALID_DINODE(fe));
BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl));
mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n",
(unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count,
(unsigned long long)bg_blkno, start_bit);
- status = ocfs2_read_block(alloc_inode, bg_blkno, &group_bh);
+ status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno,
+ &group_bh);
if (status < 0) {
mlog_errno(status);
goto bail;
}
-
group = (struct ocfs2_group_desc *) group_bh->b_data;
- status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, group);
- if (status) {
- mlog_errno(status);
- goto bail;
- }
+
BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits));
status = ocfs2_block_group_clear_bits(handle, alloc_inode,
@@ -1815,8 +1904,8 @@ int ocfs2_free_suballoc_bits(handle_t *handle,
goto bail;
}
- status = ocfs2_journal_access(handle, alloc_inode, alloc_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = ocfs2_journal_access_di(handle, alloc_inode, alloc_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto bail;
diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h
index 4df159d8f450..e3c13c77f9e8 100644
--- a/fs/ocfs2/suballoc.h
+++ b/fs/ocfs2/suballoc.h
@@ -164,10 +164,24 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac);
* and return that block offset. */
u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster);
-/* somewhat more expensive than our other checks, so use sparingly. */
+/*
+ * By default, ocfs2_read_group_descriptor() calls ocfs2_error() when it
+ * finds a problem. A caller that wants to check a group descriptor
+ * without going readonly should read the block with ocfs2_read_block[s]()
+ * and then check it with this function. This is only resize, really.
+ * Everyone else should be using ocfs2_read_group_descriptor().
+ */
int ocfs2_check_group_descriptor(struct super_block *sb,
struct ocfs2_dinode *di,
- struct ocfs2_group_desc *gd);
+ struct buffer_head *bh);
+/*
+ * Read a group descriptor block into *bh. If *bh is NULL, a bh will be
+ * allocated. This is a cached read. The descriptor will be validated with
+ * ocfs2_validate_group_descriptor().
+ */
+int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di,
+ u64 gd_blkno, struct buffer_head **bh);
+
int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et,
u32 clusters_to_add, u32 extents_to_split,
struct ocfs2_alloc_context **data_ac,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 304b63ac78cf..43ed11345b59 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -41,6 +41,7 @@
#include <linux/debugfs.h>
#include <linux/mount.h>
#include <linux/seq_file.h>
+#include <linux/quotaops.h>
#define MLOG_MASK_PREFIX ML_SUPER
#include <cluster/masklog.h>
@@ -51,6 +52,7 @@
#include "ocfs1_fs_compat.h"
#include "alloc.h"
+#include "blockcheck.h"
#include "dlmglue.h"
#include "export.h"
#include "extent_map.h"
@@ -65,10 +67,13 @@
#include "uptodate.h"
#include "ver.h"
#include "xattr.h"
+#include "quota.h"
#include "buffer_head_io.h"
static struct kmem_cache *ocfs2_inode_cachep = NULL;
+struct kmem_cache *ocfs2_dquot_cachep;
+struct kmem_cache *ocfs2_qf_chunk_cachep;
/* OCFS2 needs to schedule several differnt types of work which
* require cluster locking, disk I/O, recovery waits, etc. Since these
@@ -124,6 +129,9 @@ static int ocfs2_get_sector(struct super_block *sb,
static void ocfs2_write_super(struct super_block *sb);
static struct inode *ocfs2_alloc_inode(struct super_block *sb);
static void ocfs2_destroy_inode(struct inode *inode);
+static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend);
+static int ocfs2_enable_quotas(struct ocfs2_super *osb);
+static void ocfs2_disable_quotas(struct ocfs2_super *osb);
static const struct super_operations ocfs2_sops = {
.statfs = ocfs2_statfs,
@@ -137,6 +145,8 @@ static const struct super_operations ocfs2_sops = {
.put_super = ocfs2_put_super,
.remount_fs = ocfs2_remount,
.show_options = ocfs2_show_options,
+ .quota_read = ocfs2_quota_read,
+ .quota_write = ocfs2_quota_write,
};
enum {
@@ -158,6 +168,10 @@ enum {
Opt_user_xattr,
Opt_nouser_xattr,
Opt_inode64,
+ Opt_acl,
+ Opt_noacl,
+ Opt_usrquota,
+ Opt_grpquota,
Opt_err,
};
@@ -180,6 +194,10 @@ static const match_table_t tokens = {
{Opt_user_xattr, "user_xattr"},
{Opt_nouser_xattr, "nouser_xattr"},
{Opt_inode64, "inode64"},
+ {Opt_acl, "acl"},
+ {Opt_noacl, "noacl"},
+ {Opt_usrquota, "usrquota"},
+ {Opt_grpquota, "grpquota"},
{Opt_err, NULL}
};
@@ -221,6 +239,19 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait)
return 0;
}
+static int ocfs2_need_system_inode(struct ocfs2_super *osb, int ino)
+{
+ if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA)
+ && (ino == USER_QUOTA_SYSTEM_INODE
+ || ino == LOCAL_USER_QUOTA_SYSTEM_INODE))
+ return 0;
+ if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)
+ && (ino == GROUP_QUOTA_SYSTEM_INODE
+ || ino == LOCAL_GROUP_QUOTA_SYSTEM_INODE))
+ return 0;
+ return 1;
+}
+
static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
{
struct inode *new = NULL;
@@ -247,6 +278,8 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb)
for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE;
i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) {
+ if (!ocfs2_need_system_inode(osb, i))
+ continue;
new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
if (!new) {
ocfs2_release_system_inodes(osb);
@@ -277,6 +310,8 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb)
for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1;
i < NUM_SYSTEM_INODES;
i++) {
+ if (!ocfs2_need_system_inode(osb, i))
+ continue;
new = ocfs2_get_system_file_inode(osb, i, osb->slot_num);
if (!new) {
ocfs2_release_system_inodes(osb);
@@ -426,6 +461,12 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
/* We're going to/from readonly mode. */
if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
+ /* Disable quota accounting before remounting RO */
+ if (*flags & MS_RDONLY) {
+ ret = ocfs2_susp_quotas(osb, 0);
+ if (ret < 0)
+ goto out;
+ }
/* Lock here so the check of HARD_RO and the potential
* setting of SOFT_RO is atomic. */
spin_lock(&osb->osb_lock);
@@ -461,11 +502,28 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data)
}
unlock_osb:
spin_unlock(&osb->osb_lock);
+ /* Enable quota accounting after remounting RW */
+ if (!ret && !(*flags & MS_RDONLY)) {
+ if (sb_any_quota_suspended(sb))
+ ret = ocfs2_susp_quotas(osb, 1);
+ else
+ ret = ocfs2_enable_quotas(osb);
+ if (ret < 0) {
+ /* Roll back the changes... */
+ spin_lock(&osb->osb_lock);
+ sb->s_flags |= MS_RDONLY;
+ osb->osb_flags |= OCFS2_OSB_SOFT_RO;
+ spin_unlock(&osb->osb_lock);
+ goto out;
+ }
+ }
}
if (!ret) {
/* Only save off the new mount options in case of a successful
* remount. */
+ if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR))
+ parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
osb->s_mount_opt = parsed_options.mount_opt;
osb->s_atime_quantum = parsed_options.atime_quantum;
osb->preferred_slot = parsed_options.slot;
@@ -619,6 +677,131 @@ static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb,
return 0;
}
+static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend)
+{
+ int type;
+ struct super_block *sb = osb->sb;
+ unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+ OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+ int status = 0;
+
+ for (type = 0; type < MAXQUOTAS; type++) {
+ if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
+ continue;
+ if (unsuspend)
+ status = vfs_quota_enable(
+ sb_dqopt(sb)->files[type],
+ type, QFMT_OCFS2,
+ DQUOT_SUSPENDED);
+ else
+ status = vfs_quota_disable(sb, type,
+ DQUOT_SUSPENDED);
+ if (status < 0)
+ break;
+ }
+ if (status < 0)
+ mlog(ML_ERROR, "Failed to suspend/unsuspend quotas on "
+ "remount (error = %d).\n", status);
+ return status;
+}
+
+static int ocfs2_enable_quotas(struct ocfs2_super *osb)
+{
+ struct inode *inode[MAXQUOTAS] = { NULL, NULL };
+ struct super_block *sb = osb->sb;
+ unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+ OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+ unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE,
+ LOCAL_GROUP_QUOTA_SYSTEM_INODE };
+ int status;
+ int type;
+
+ sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NEGATIVE_USAGE;
+ for (type = 0; type < MAXQUOTAS; type++) {
+ if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
+ continue;
+ inode[type] = ocfs2_get_system_file_inode(osb, ino[type],
+ osb->slot_num);
+ if (!inode[type]) {
+ status = -ENOENT;
+ goto out_quota_off;
+ }
+ status = vfs_quota_enable(inode[type], type, QFMT_OCFS2,
+ DQUOT_USAGE_ENABLED);
+ if (status < 0)
+ goto out_quota_off;
+ }
+
+ for (type = 0; type < MAXQUOTAS; type++)
+ iput(inode[type]);
+ return 0;
+out_quota_off:
+ ocfs2_disable_quotas(osb);
+ for (type = 0; type < MAXQUOTAS; type++)
+ iput(inode[type]);
+ mlog_errno(status);
+ return status;
+}
+
+static void ocfs2_disable_quotas(struct ocfs2_super *osb)
+{
+ int type;
+ struct inode *inode;
+ struct super_block *sb = osb->sb;
+
+ /* We mostly ignore errors in this function because there's not much
+ * we can do when we see them */
+ for (type = 0; type < MAXQUOTAS; type++) {
+ if (!sb_has_quota_loaded(sb, type))
+ continue;
+ inode = igrab(sb->s_dquot.files[type]);
+ /* Turn off quotas. This will remove all dquot structures from
+ * memory and so they will be automatically synced to global
+ * quota files */
+ vfs_quota_disable(sb, type, DQUOT_USAGE_ENABLED |
+ DQUOT_LIMITS_ENABLED);
+ if (!inode)
+ continue;
+ iput(inode);
+ }
+}
+
+/* Handle quota on quotactl */
+static int ocfs2_quota_on(struct super_block *sb, int type, int format_id,
+ char *path, int remount)
+{
+ unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
+ OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
+
+ if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
+ return -EINVAL;
+
+ if (remount)
+ return 0; /* Just ignore it; this has been handled in
+ * ocfs2_remount() */
+ return vfs_quota_enable(sb_dqopt(sb)->files[type], type,
+ format_id, DQUOT_LIMITS_ENABLED);
+}
+
+/* Handle quota off quotactl */
+static int ocfs2_quota_off(struct super_block *sb, int type, int remount)
+{
+ if (remount)
+ return 0; /* Ignore now and handle later in
+ * ocfs2_remount() */
+ return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED);
+}
+
+static struct quotactl_ops ocfs2_quotactl_ops = {
+ .quota_on = ocfs2_quota_on,
+ .quota_off = ocfs2_quota_off,
+ .quota_sync = vfs_quota_sync,
+ .get_info = vfs_get_dqinfo,
+ .set_info = vfs_set_dqinfo,
+ .get_dqblk = vfs_get_dqblk,
+ .set_dqblk = vfs_set_dqblk,
+};
+
static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
{
struct dentry *root;
@@ -651,12 +834,32 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
}
brelse(bh);
bh = NULL;
+
+ if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR))
+ parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
+
osb->s_mount_opt = parsed_options.mount_opt;
osb->s_atime_quantum = parsed_options.atime_quantum;
osb->preferred_slot = parsed_options.slot;
osb->osb_commit_interval = parsed_options.commit_interval;
osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt);
osb->local_alloc_bits = osb->local_alloc_default_bits;
+ if (osb->s_mount_opt & OCFS2_MOUNT_USRQUOTA &&
+ !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+ OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
+ status = -EINVAL;
+ mlog(ML_ERROR, "User quotas were requested, but this "
+ "filesystem does not have the feature enabled.\n");
+ goto read_super_error;
+ }
+ if (osb->s_mount_opt & OCFS2_MOUNT_GRPQUOTA &&
+ !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+ OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
+ status = -EINVAL;
+ mlog(ML_ERROR, "Group quotas were requested, but this "
+ "filesystem does not have the feature enabled.\n");
+ goto read_super_error;
+ }
status = ocfs2_verify_userspace_stack(osb, &parsed_options);
if (status)
@@ -664,6 +867,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
sb->s_magic = OCFS2_SUPER_MAGIC;
+ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
+ ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
+
/* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
* heartbeat=none */
if (bdev_read_only(sb->s_bdev)) {
@@ -758,6 +964,28 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
atomic_set(&osb->vol_state, VOLUME_MOUNTED);
wake_up(&osb->osb_mount_event);
+ /* Now we can initialize quotas because we can afford to wait
+ * for cluster locks recovery now. That also means that truncation
+ * log recovery can happen but that waits for proper quota setup */
+ if (!(sb->s_flags & MS_RDONLY)) {
+ status = ocfs2_enable_quotas(osb);
+ if (status < 0) {
+ /* We have to err-out specially here because
+ * s_root is already set */
+ mlog_errno(status);
+ atomic_set(&osb->vol_state, VOLUME_DISABLED);
+ wake_up(&osb->osb_mount_event);
+ mlog_exit(status);
+ return status;
+ }
+ }
+
+ ocfs2_complete_quota_recovery(osb);
+
+ /* Now we wake up again for processes waiting for quotas */
+ atomic_set(&osb->vol_state, VOLUME_MOUNTED_QUOTAS);
+ wake_up(&osb->osb_mount_event);
+
mlog_exit(status);
return status;
@@ -945,6 +1173,41 @@ static int ocfs2_parse_options(struct super_block *sb,
case Opt_inode64:
mopt->mount_opt |= OCFS2_MOUNT_INODE64;
break;
+ case Opt_usrquota:
+ /* We check only on remount, otherwise features
+ * aren't yet initialized. */
+ if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+ OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) {
+ mlog(ML_ERROR, "User quota requested but "
+ "filesystem feature is not set\n");
+ status = 0;
+ goto bail;
+ }
+ mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA;
+ break;
+ case Opt_grpquota:
+ if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb,
+ OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) {
+ mlog(ML_ERROR, "Group quota requested but "
+ "filesystem feature is not set\n");
+ status = 0;
+ goto bail;
+ }
+ mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA;
+ break;
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+ case Opt_acl:
+ mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL;
+ break;
+ case Opt_noacl:
+ mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL;
+ break;
+#else
+ case Opt_acl:
+ case Opt_noacl:
+ printk(KERN_INFO "ocfs2 (no)acl options not supported\n");
+ break;
+#endif
default:
mlog(ML_ERROR,
"Unrecognized mount option \"%s\" "
@@ -1008,6 +1271,10 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
if (osb->osb_cluster_stack[0])
seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN,
osb->osb_cluster_stack);
+ if (opts & OCFS2_MOUNT_USRQUOTA)
+ seq_printf(s, ",usrquota");
+ if (opts & OCFS2_MOUNT_GRPQUOTA)
+ seq_printf(s, ",grpquota");
if (opts & OCFS2_MOUNT_NOUSERXATTR)
seq_printf(s, ",nouser_xattr");
@@ -1017,6 +1284,13 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt)
if (opts & OCFS2_MOUNT_INODE64)
seq_printf(s, ",inode64");
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+ if (opts & OCFS2_MOUNT_POSIX_ACL)
+ seq_printf(s, ",acl");
+ else
+ seq_printf(s, ",noacl");
+#endif
+
return 0;
}
@@ -1052,10 +1326,16 @@ static int __init ocfs2_init(void)
mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n");
}
+ status = ocfs2_quota_setup();
+ if (status)
+ goto leave;
+
ocfs2_set_locking_protocol();
+ status = register_quota_format(&ocfs2_quota_format);
leave:
if (status < 0) {
+ ocfs2_quota_shutdown();
ocfs2_free_mem_caches();
exit_ocfs2_uptodate_cache();
}
@@ -1072,11 +1352,15 @@ static void __exit ocfs2_exit(void)
{
mlog_entry_void();
+ ocfs2_quota_shutdown();
+
if (ocfs2_wq) {
flush_workqueue(ocfs2_wq);
destroy_workqueue(ocfs2_wq);
}
+ unregister_quota_format(&ocfs2_quota_format);
+
debugfs_remove(ocfs2_debugfs_root);
ocfs2_free_mem_caches();
@@ -1192,8 +1476,27 @@ static int ocfs2_initialize_mem_caches(void)
(SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
SLAB_MEM_SPREAD),
ocfs2_inode_init_once);
- if (!ocfs2_inode_cachep)
+ ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache",
+ sizeof(struct ocfs2_dquot),
+ 0,
+ (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
+ SLAB_MEM_SPREAD),
+ NULL);
+ ocfs2_qf_chunk_cachep = kmem_cache_create("ocfs2_qf_chunk_cache",
+ sizeof(struct ocfs2_quota_chunk),
+ 0,
+ (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
+ NULL);
+ if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep ||
+ !ocfs2_qf_chunk_cachep) {
+ if (ocfs2_inode_cachep)
+ kmem_cache_destroy(ocfs2_inode_cachep);
+ if (ocfs2_dquot_cachep)
+ kmem_cache_destroy(ocfs2_dquot_cachep);
+ if (ocfs2_qf_chunk_cachep)
+ kmem_cache_destroy(ocfs2_qf_chunk_cachep);
return -ENOMEM;
+ }
return 0;
}
@@ -1202,8 +1505,15 @@ static void ocfs2_free_mem_caches(void)
{
if (ocfs2_inode_cachep)
kmem_cache_destroy(ocfs2_inode_cachep);
-
ocfs2_inode_cachep = NULL;
+
+ if (ocfs2_dquot_cachep)
+ kmem_cache_destroy(ocfs2_dquot_cachep);
+ ocfs2_dquot_cachep = NULL;
+
+ if (ocfs2_qf_chunk_cachep)
+ kmem_cache_destroy(ocfs2_qf_chunk_cachep);
+ ocfs2_qf_chunk_cachep = NULL;
}
static int ocfs2_get_sector(struct super_block *sb,
@@ -1303,6 +1613,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
osb = OCFS2_SB(sb);
BUG_ON(!osb);
+ ocfs2_disable_quotas(osb);
+
ocfs2_shutdown_local_alloc(osb);
ocfs2_truncate_log_shutdown(osb);
@@ -1413,6 +1725,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
sb->s_fs_info = osb;
sb->s_op = &ocfs2_sops;
sb->s_export_op = &ocfs2_export_ops;
+ sb->s_qcop = &ocfs2_quotactl_ops;
+ sb->dq_op = &ocfs2_quota_operations;
sb->s_xattr = ocfs2_xattr_handlers;
sb->s_time_gran = 1;
sb->s_flags |= MS_NOATIME;
@@ -1676,6 +1990,15 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE,
strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) {
+ /* We have to do a raw check of the feature here */
+ if (le32_to_cpu(di->id2.i_super.s_feature_incompat) &
+ OCFS2_FEATURE_INCOMPAT_META_ECC) {
+ status = ocfs2_block_check_validate(bh->b_data,
+ bh->b_size,
+ &di->i_check);
+ if (status)
+ goto out;
+ }
status = -EINVAL;
if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) {
mlog(ML_ERROR, "found superblock with incorrect block "
@@ -1717,6 +2040,7 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di,
}
}
+out:
mlog_exit(status);
return status;
}
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index cbd03dfdc7b9..ed0a0cfd68d2 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -84,7 +84,7 @@ static char *ocfs2_fast_symlink_getlink(struct inode *inode,
mlog_entry_void();
- status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, bh);
+ status = ocfs2_read_inode_block(inode, bh);
if (status < 0) {
mlog_errno(status);
link = ERR_PTR(status);
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 74d7367ade13..e1d638af6ac3 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -35,12 +35,14 @@
#include <linux/init.h>
#include <linux/module.h>
#include <linux/string.h>
+#include <linux/security.h>
#define MLOG_MASK_PREFIX ML_XATTR
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "alloc.h"
+#include "blockcheck.h"
#include "dlmglue.h"
#include "file.h"
#include "symlink.h"
@@ -61,12 +63,32 @@ struct ocfs2_xattr_def_value_root {
};
struct ocfs2_xattr_bucket {
- struct buffer_head *bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET];
- struct ocfs2_xattr_header *xh;
+ /* The inode these xattrs are associated with */
+ struct inode *bu_inode;
+
+ /* The actual buffers that make up the bucket */
+ struct buffer_head *bu_bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET];
+
+ /* How many blocks make up one bucket for this filesystem */
+ int bu_blocks;
+};
+
+struct ocfs2_xattr_set_ctxt {
+ handle_t *handle;
+ struct ocfs2_alloc_context *meta_ac;
+ struct ocfs2_alloc_context *data_ac;
+ struct ocfs2_cached_dealloc_ctxt dealloc;
};
#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root))
#define OCFS2_XATTR_INLINE_SIZE 80
+#define OCFS2_XATTR_FREE_IN_IBODY (OCFS2_MIN_XATTR_INLINE_SIZE \
+ - sizeof(struct ocfs2_xattr_header) \
+ - sizeof(__u32))
+#define OCFS2_XATTR_FREE_IN_BLOCK(ptr) ((ptr)->i_sb->s_blocksize \
+ - sizeof(struct ocfs2_xattr_block) \
+ - sizeof(struct ocfs2_xattr_header) \
+ - sizeof(__u32))
static struct ocfs2_xattr_def_value_root def_xv = {
.xv.xr_list.l_count = cpu_to_le16(1),
@@ -74,13 +96,25 @@ static struct ocfs2_xattr_def_value_root def_xv = {
struct xattr_handler *ocfs2_xattr_handlers[] = {
&ocfs2_xattr_user_handler,
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+ &ocfs2_xattr_acl_access_handler,
+ &ocfs2_xattr_acl_default_handler,
+#endif
&ocfs2_xattr_trusted_handler,
+ &ocfs2_xattr_security_handler,
NULL
};
static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = {
[OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler,
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+ [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS]
+ = &ocfs2_xattr_acl_access_handler,
+ [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT]
+ = &ocfs2_xattr_acl_default_handler,
+#endif
[OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler,
+ [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler,
};
struct ocfs2_xattr_info {
@@ -98,7 +132,7 @@ struct ocfs2_xattr_search {
*/
struct buffer_head *xattr_bh;
struct ocfs2_xattr_header *header;
- struct ocfs2_xattr_bucket bucket;
+ struct ocfs2_xattr_bucket *bucket;
void *base;
void *end;
struct ocfs2_xattr_entry *here;
@@ -127,14 +161,20 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode,
size_t buffer_size);
static int ocfs2_xattr_create_index_block(struct inode *inode,
- struct ocfs2_xattr_search *xs);
+ struct ocfs2_xattr_search *xs,
+ struct ocfs2_xattr_set_ctxt *ctxt);
static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
struct ocfs2_xattr_info *xi,
- struct ocfs2_xattr_search *xs);
+ struct ocfs2_xattr_search *xs,
+ struct ocfs2_xattr_set_ctxt *ctxt);
static int ocfs2_delete_xattr_index_block(struct inode *inode,
struct buffer_head *xb_bh);
+static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
+ u64 src_blk, u64 last_blk, u64 to_blk,
+ unsigned int start_bucket,
+ u32 *first_hash);
static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb)
{
@@ -154,6 +194,216 @@ static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb)
return len / sizeof(struct ocfs2_xattr_entry);
}
+#define bucket_blkno(_b) ((_b)->bu_bhs[0]->b_blocknr)
+#define bucket_block(_b, _n) ((_b)->bu_bhs[(_n)]->b_data)
+#define bucket_xh(_b) ((struct ocfs2_xattr_header *)bucket_block((_b), 0))
+
+static struct ocfs2_xattr_bucket *ocfs2_xattr_bucket_new(struct inode *inode)
+{
+ struct ocfs2_xattr_bucket *bucket;
+ int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+ BUG_ON(blks > OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET);
+
+ bucket = kzalloc(sizeof(struct ocfs2_xattr_bucket), GFP_NOFS);
+ if (bucket) {
+ bucket->bu_inode = inode;
+ bucket->bu_blocks = blks;
+ }
+
+ return bucket;
+}
+
+static void ocfs2_xattr_bucket_relse(struct ocfs2_xattr_bucket *bucket)
+{
+ int i;
+
+ for (i = 0; i < bucket->bu_blocks; i++) {
+ brelse(bucket->bu_bhs[i]);
+ bucket->bu_bhs[i] = NULL;
+ }
+}
+
+static void ocfs2_xattr_bucket_free(struct ocfs2_xattr_bucket *bucket)
+{
+ if (bucket) {
+ ocfs2_xattr_bucket_relse(bucket);
+ bucket->bu_inode = NULL;
+ kfree(bucket);
+ }
+}
+
+/*
+ * A bucket that has never been written to disk doesn't need to be
+ * read. We just need the buffer_heads. Don't call this for
+ * buckets that are already on disk. ocfs2_read_xattr_bucket() initializes
+ * them fully.
+ */
+static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
+ u64 xb_blkno)
+{
+ int i, rc = 0;
+
+ for (i = 0; i < bucket->bu_blocks; i++) {
+ bucket->bu_bhs[i] = sb_getblk(bucket->bu_inode->i_sb,
+ xb_blkno + i);
+ if (!bucket->bu_bhs[i]) {
+ rc = -EIO;
+ mlog_errno(rc);
+ break;
+ }
+
+ if (!ocfs2_buffer_uptodate(bucket->bu_inode,
+ bucket->bu_bhs[i]))
+ ocfs2_set_new_buffer_uptodate(bucket->bu_inode,
+ bucket->bu_bhs[i]);
+ }
+
+ if (rc)
+ ocfs2_xattr_bucket_relse(bucket);
+ return rc;
+}
+
+/* Read the xattr bucket at xb_blkno */
+static int ocfs2_read_xattr_bucket(struct ocfs2_xattr_bucket *bucket,
+ u64 xb_blkno)
+{
+ int rc;
+
+ rc = ocfs2_read_blocks(bucket->bu_inode, xb_blkno,
+ bucket->bu_blocks, bucket->bu_bhs, 0,
+ NULL);
+ if (!rc) {
+ rc = ocfs2_validate_meta_ecc_bhs(bucket->bu_inode->i_sb,
+ bucket->bu_bhs,
+ bucket->bu_blocks,
+ &bucket_xh(bucket)->xh_check);
+ if (rc)
+ mlog_errno(rc);
+ }
+
+ if (rc)
+ ocfs2_xattr_bucket_relse(bucket);
+ return rc;
+}
+
+static int ocfs2_xattr_bucket_journal_access(handle_t *handle,
+ struct ocfs2_xattr_bucket *bucket,
+ int type)
+{
+ int i, rc = 0;
+
+ for (i = 0; i < bucket->bu_blocks; i++) {
+ rc = ocfs2_journal_access(handle, bucket->bu_inode,
+ bucket->bu_bhs[i], type);
+ if (rc) {
+ mlog_errno(rc);
+ break;
+ }
+ }
+
+ return rc;
+}
+
+static void ocfs2_xattr_bucket_journal_dirty(handle_t *handle,
+ struct ocfs2_xattr_bucket *bucket)
+{
+ int i;
+
+ ocfs2_compute_meta_ecc_bhs(bucket->bu_inode->i_sb,
+ bucket->bu_bhs, bucket->bu_blocks,
+ &bucket_xh(bucket)->xh_check);
+
+ for (i = 0; i < bucket->bu_blocks; i++)
+ ocfs2_journal_dirty(handle, bucket->bu_bhs[i]);
+}
+
+static void ocfs2_xattr_bucket_copy_data(struct ocfs2_xattr_bucket *dest,
+ struct ocfs2_xattr_bucket *src)
+{
+ int i;
+ int blocksize = src->bu_inode->i_sb->s_blocksize;
+
+ BUG_ON(dest->bu_blocks != src->bu_blocks);
+ BUG_ON(dest->bu_inode != src->bu_inode);
+
+ for (i = 0; i < src->bu_blocks; i++) {
+ memcpy(bucket_block(dest, i), bucket_block(src, i),
+ blocksize);
+ }
+}
+
+static int ocfs2_validate_xattr_block(struct super_block *sb,
+ struct buffer_head *bh)
+{
+ int rc;
+ struct ocfs2_xattr_block *xb =
+ (struct ocfs2_xattr_block *)bh->b_data;
+
+ mlog(0, "Validating xattr block %llu\n",
+ (unsigned long long)bh->b_blocknr);
+
+ BUG_ON(!buffer_uptodate(bh));
+
+ /*
+ * If the ecc fails, we return the error but otherwise
+ * leave the filesystem running. We know any error is
+ * local to this block.
+ */
+ rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &xb->xb_check);
+ if (rc)
+ return rc;
+
+ /*
+ * Errors after here are fatal
+ */
+
+ if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
+ ocfs2_error(sb,
+ "Extended attribute block #%llu has bad "
+ "signature %.*s",
+ (unsigned long long)bh->b_blocknr, 7,
+ xb->xb_signature);
+ return -EINVAL;
+ }
+
+ if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) {
+ ocfs2_error(sb,
+ "Extended attribute block #%llu has an "
+ "invalid xb_blkno of %llu",
+ (unsigned long long)bh->b_blocknr,
+ (unsigned long long)le64_to_cpu(xb->xb_blkno));
+ return -EINVAL;
+ }
+
+ if (le32_to_cpu(xb->xb_fs_generation) != OCFS2_SB(sb)->fs_generation) {
+ ocfs2_error(sb,
+ "Extended attribute block #%llu has an invalid "
+ "xb_fs_generation of #%u",
+ (unsigned long long)bh->b_blocknr,
+ le32_to_cpu(xb->xb_fs_generation));
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno,
+ struct buffer_head **bh)
+{
+ int rc;
+ struct buffer_head *tmp = *bh;
+
+ rc = ocfs2_read_block(inode, xb_blkno, &tmp,
+ ocfs2_validate_xattr_block);
+
+ /* If ocfs2_read_block() got us a new bh, pass it up. */
+ if (!rc && !*bh)
+ *bh = tmp;
+
+ return rc;
+}
+
static inline const char *ocfs2_xattr_prefix(int name_index)
{
struct xattr_handler *handler = NULL;
@@ -200,54 +450,163 @@ static void ocfs2_xattr_hash_entry(struct inode *inode,
return;
}
+/*
+ * Space one xattr consumes where its entry is stored: the entry
+ * structure, the rounded-up name, and either the rounded-up value (when
+ * it is at most OCFS2_XATTR_INLINE_SIZE bytes) or OCFS2_XATTR_ROOT_SIZE
+ * for a larger value.
+ */
+static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len)
+{
+	size_t value_space;
+
+	if (value_len > OCFS2_XATTR_INLINE_SIZE)
+		value_space = OCFS2_XATTR_ROOT_SIZE;
+	else
+		value_space = OCFS2_XATTR_SIZE(value_len);
+
+	return OCFS2_XATTR_SIZE(name_len) + value_space +
+	       sizeof(struct ocfs2_xattr_entry);
+}
+
+/*
+ * Reserve what is needed to store the security xattr described by @si.
+ *
+ * @dir:           inode whose super block supplies block/cluster geometry
+ * @si:            name and value length of the security xattr
+ * @want_clusters: incremented by the clusters needed for a large value
+ * @xattr_credits: incremented by the journal credits needed
+ * @xattr_ac:      handed to ocfs2_reserve_new_metadata_blocks() when a
+ *                 metadata block has to be reserved
+ *
+ * Returns 0 on success or the error from
+ * ocfs2_reserve_new_metadata_blocks().
+ */
+int ocfs2_calc_security_init(struct inode *dir,
+			     struct ocfs2_security_xattr_info *si,
+			     int *want_clusters,
+			     int *xattr_credits,
+			     struct ocfs2_alloc_context **xattr_ac)
+{
+	struct super_block *sb = dir->i_sb;
+	struct ocfs2_super *osb = OCFS2_SB(sb);
+	int entry_size = ocfs2_xattr_entry_real_size(strlen(si->name),
+						     si->value_len);
+	int value_clusters;
+	int status;
+
+	/*
+	 * Inline, a security xattr takes at most
+	 * 256(name) + 80(value) + 16(entry) = 352 bytes,
+	 * so a single reserved metadata block is enough for it.
+	 */
+	if (sb->s_blocksize == OCFS2_MIN_BLOCKSIZE ||
+	    entry_size > OCFS2_XATTR_FREE_IN_IBODY) {
+		status = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac);
+		if (status) {
+			mlog_errno(status);
+			return status;
+		}
+		*xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
+	}
+
+	/* A value that fits inline needs no clusters of its own. */
+	if (si->value_len <= OCFS2_XATTR_INLINE_SIZE)
+		return 0;
+
+	/* Reserve clusters for an xattr value which will be set in a B tree. */
+	value_clusters = ocfs2_clusters_for_bytes(sb, si->value_len);
+	*xattr_credits += ocfs2_clusters_to_blocks(sb, value_clusters);
+	*want_clusters += value_clusters;
+
+	return 0;
+}
+
+/*
+ * Reserve what is needed to store the initial xattrs sized from @si and
+ * from the default POSIX ACL found on @dir.
+ *
+ * @dir:           inode whose default ACL length and super block geometry
+ *                 are consulted
+ * @dir_bh:        buffer head passed to ocfs2_xattr_get_nolock() for @dir
+ * @mode:          file mode; for a directory the ACL space is doubled
+ * @si:            security xattr description; only used when si->enable
+ * @want_clusters: incremented by the clusters that must be reserved
+ * @xattr_credits: incremented by the journal credits that are needed
+ * @xattr_ac:      handed to ocfs2_reserve_new_metadata_blocks() when a
+ *                 metadata block has to be reserved
+ *
+ * Returns 0 on success (including when there is nothing to reserve), the
+ * error from looking up the default ACL (other than -ENODATA, which just
+ * means there is no default ACL), or the error from
+ * ocfs2_reserve_new_metadata_blocks().
+ */
+int ocfs2_calc_xattr_init(struct inode *dir,
+			  struct buffer_head *dir_bh,
+			  int mode,
+			  struct ocfs2_security_xattr_info *si,
+			  int *want_clusters,
+			  int *xattr_credits,
+			  struct ocfs2_alloc_context **xattr_ac)
+{
+	int ret = 0;
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+	int s_size = 0, a_size = 0, acl_len = 0, new_clusters;
+
+	if (si->enable)
+		s_size = ocfs2_xattr_entry_real_size(strlen(si->name),
+						     si->value_len);
+
+	if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) {
+		/* A NULL buffer asks only for the length of the value. */
+		acl_len = ocfs2_xattr_get_nolock(dir, dir_bh,
+					OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT,
+					"", NULL, 0);
+		if (acl_len > 0) {
+			a_size = ocfs2_xattr_entry_real_size(0, acl_len);
+			if (S_ISDIR(mode))
+				a_size <<= 1;
+		} else if (acl_len != 0 && acl_len != -ENODATA) {
+			/*
+			 * Propagate the lookup failure.  Without this
+			 * assignment ret would still be 0 and the error
+			 * would be reported to the caller as success.
+			 */
+			ret = acl_len;
+			mlog_errno(ret);
+			return ret;
+		}
+	}
+
+	/* Neither a security xattr nor a default ACL: nothing to reserve. */
+	if (!(s_size + a_size))
+		return ret;
+
+	/*
+	 * The max space of security xattr taken inline is
+	 * 256(name) + 80(value) + 16(entry) = 352 bytes,
+	 * The max space of acl xattr taken inline is
+	 * (80(value) + 16(entry)) * 2(if directory) = 192 bytes,
+	 * when blocksize = 512, may reserve one more cluster for
+	 * xattr bucket, otherwise reserve one metadata block
+	 * for them is ok.
+	 */
+	if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE ||
+	    (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) {
+		ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac);
+		if (ret) {
+			mlog_errno(ret);
+			return ret;
+		}
+		*xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
+	}
+
+	if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE &&
+	    (s_size + a_size) > OCFS2_XATTR_FREE_IN_BLOCK(dir)) {
+		*want_clusters += 1;
+		*xattr_credits += ocfs2_blocks_per_xattr_bucket(dir->i_sb);
+	}
+
+	/*
+	 * Reserve credits and clusters for xattrs which have a large value
+	 * and have to be set outside.
+	 */
+	if (si->enable && si->value_len > OCFS2_XATTR_INLINE_SIZE) {
+		new_clusters = ocfs2_clusters_for_bytes(dir->i_sb,
+							si->value_len);
+		*xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb,
+							   new_clusters);
+		*want_clusters += new_clusters;
+	}
+	if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) &&
+	    acl_len > OCFS2_XATTR_INLINE_SIZE) {
+		/* for directory, it has DEFAULT and ACCESS two types of acls */
+		new_clusters = (S_ISDIR(mode) ? 2 : 1) *
+				ocfs2_clusters_for_bytes(dir->i_sb, acl_len);
+		*xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb,
+							   new_clusters);
+		*want_clusters += new_clusters;
+	}
+
+	return ret;
+}
+
static int ocfs2_xattr_extend_allocation(struct inode *inode,
u32 clusters_to_add,
- struct buffer_head *xattr_bh,
- struct ocfs2_xattr_value_root *xv)
+ struct ocfs2_xattr_value_buf *vb,
+ struct ocfs2_xattr_set_ctxt *ctxt)
{
int status = 0;
- int restart_func = 0;
- int credits = 0;
- handle_t *handle = NULL;
- struct ocfs2_alloc_context *data_ac = NULL;
- struct ocfs2_alloc_context *meta_ac = NULL;
+ handle_t *handle = ctxt->handle;
enum ocfs2_alloc_restarted why;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- u32 prev_clusters, logical_start = le32_to_cpu(xv->xr_clusters);
+ u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters);
struct ocfs2_extent_tree et;
mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add);
- ocfs2_init_xattr_value_extent_tree(&et, inode, xattr_bh, xv);
-
-restart_all:
-
- status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
- &data_ac, &meta_ac);
- if (status) {
- mlog_errno(status);
- goto leave;
- }
-
- credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el,
- clusters_to_add);
- handle = ocfs2_start_trans(osb, credits);
- if (IS_ERR(handle)) {
- status = PTR_ERR(handle);
- handle = NULL;
- mlog_errno(status);
- goto leave;
- }
+ ocfs2_init_xattr_value_extent_tree(&et, inode, vb);
-restarted_transaction:
- status = ocfs2_journal_access(handle, inode, xattr_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ status = vb->vb_access(handle, inode, vb->vb_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (status < 0) {
mlog_errno(status);
goto leave;
}
- prev_clusters = le32_to_cpu(xv->xr_clusters);
+ prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
status = ocfs2_add_clusters_in_btree(osb,
inode,
&logical_start,
@@ -255,157 +614,84 @@ restarted_transaction:
0,
&et,
handle,
- data_ac,
- meta_ac,
+ ctxt->data_ac,
+ ctxt->meta_ac,
&why);
- if ((status < 0) && (status != -EAGAIN)) {
- if (status != -ENOSPC)
- mlog_errno(status);
+ if (status < 0) {
+ mlog_errno(status);
goto leave;
}
- status = ocfs2_journal_dirty(handle, xattr_bh);
+ status = ocfs2_journal_dirty(handle, vb->vb_bh);
if (status < 0) {
mlog_errno(status);
goto leave;
}
- clusters_to_add -= le32_to_cpu(xv->xr_clusters) - prev_clusters;
+ clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters;
- if (why != RESTART_NONE && clusters_to_add) {
- if (why == RESTART_META) {
- mlog(0, "restarting function.\n");
- restart_func = 1;
- } else {
- BUG_ON(why != RESTART_TRANS);
-
- mlog(0, "restarting transaction.\n");
- /* TODO: This can be more intelligent. */
- credits = ocfs2_calc_extend_credits(osb->sb,
- et.et_root_el,
- clusters_to_add);
- status = ocfs2_extend_trans(handle, credits);
- if (status < 0) {
- /* handle still has to be committed at
- * this point. */
- status = -ENOMEM;
- mlog_errno(status);
- goto leave;
- }
- goto restarted_transaction;
- }
- }
+ /*
+ * We should have already allocated enough space before the transaction,
+ * so no need to restart.
+ */
+ BUG_ON(why != RESTART_NONE || clusters_to_add);
leave:
- if (handle) {
- ocfs2_commit_trans(osb, handle);
- handle = NULL;
- }
- if (data_ac) {
- ocfs2_free_alloc_context(data_ac);
- data_ac = NULL;
- }
- if (meta_ac) {
- ocfs2_free_alloc_context(meta_ac);
- meta_ac = NULL;
- }
- if ((!status) && restart_func) {
- restart_func = 0;
- goto restart_all;
- }
return status;
}
static int __ocfs2_remove_xattr_range(struct inode *inode,
- struct buffer_head *root_bh,
- struct ocfs2_xattr_value_root *xv,
+ struct ocfs2_xattr_value_buf *vb,
u32 cpos, u32 phys_cpos, u32 len,
- struct ocfs2_cached_dealloc_ctxt *dealloc)
+ struct ocfs2_xattr_set_ctxt *ctxt)
{
int ret;
u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos);
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct inode *tl_inode = osb->osb_tl_inode;
- handle_t *handle;
- struct ocfs2_alloc_context *meta_ac = NULL;
+ handle_t *handle = ctxt->handle;
struct ocfs2_extent_tree et;
- ocfs2_init_xattr_value_extent_tree(&et, inode, root_bh, xv);
+ ocfs2_init_xattr_value_extent_tree(&et, inode, vb);
- ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac);
+ ret = vb->vb_access(handle, inode, vb->vb_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
- return ret;
- }
-
- mutex_lock(&tl_inode->i_mutex);
-
- if (ocfs2_truncate_log_needs_flush(osb)) {
- ret = __ocfs2_flush_truncate_log(osb);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
- }
-
- handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- mlog_errno(ret);
goto out;
}
- ret = ocfs2_journal_access(handle, inode, root_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (ret) {
- mlog_errno(ret);
- goto out_commit;
- }
-
- ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac,
- dealloc);
+ ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, ctxt->meta_ac,
+ &ctxt->dealloc);
if (ret) {
mlog_errno(ret);
- goto out_commit;
+ goto out;
}
- le32_add_cpu(&xv->xr_clusters, -len);
+ le32_add_cpu(&vb->vb_xv->xr_clusters, -len);
- ret = ocfs2_journal_dirty(handle, root_bh);
+ ret = ocfs2_journal_dirty(handle, vb->vb_bh);
if (ret) {
mlog_errno(ret);
- goto out_commit;
+ goto out;
}
- ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len);
+ ret = ocfs2_cache_cluster_dealloc(&ctxt->dealloc, phys_blkno, len);
if (ret)
mlog_errno(ret);
-out_commit:
- ocfs2_commit_trans(osb, handle);
out:
- mutex_unlock(&tl_inode->i_mutex);
-
- if (meta_ac)
- ocfs2_free_alloc_context(meta_ac);
-
return ret;
}
static int ocfs2_xattr_shrink_size(struct inode *inode,
u32 old_clusters,
u32 new_clusters,
- struct buffer_head *root_bh,
- struct ocfs2_xattr_value_root *xv)
+ struct ocfs2_xattr_value_buf *vb,
+ struct ocfs2_xattr_set_ctxt *ctxt)
{
int ret = 0;
u32 trunc_len, cpos, phys_cpos, alloc_size;
u64 block;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- struct ocfs2_cached_dealloc_ctxt dealloc;
-
- ocfs2_init_dealloc_ctxt(&dealloc);
if (old_clusters <= new_clusters)
return 0;
@@ -414,7 +700,8 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
trunc_len = old_clusters - new_clusters;
while (trunc_len) {
ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos,
- &alloc_size, &xv->xr_list);
+ &alloc_size,
+ &vb->vb_xv->xr_list);
if (ret) {
mlog_errno(ret);
goto out;
@@ -423,9 +710,9 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
if (alloc_size > trunc_len)
alloc_size = trunc_len;
- ret = __ocfs2_remove_xattr_range(inode, root_bh, xv, cpos,
+ ret = __ocfs2_remove_xattr_range(inode, vb, cpos,
phys_cpos, alloc_size,
- &dealloc);
+ ctxt);
if (ret) {
mlog_errno(ret);
goto out;
@@ -439,20 +726,17 @@ static int ocfs2_xattr_shrink_size(struct inode *inode,
}
out:
- ocfs2_schedule_truncate_log_flush(osb, 1);
- ocfs2_run_deallocs(osb, &dealloc);
-
return ret;
}
static int ocfs2_xattr_value_truncate(struct inode *inode,
- struct buffer_head *root_bh,
- struct ocfs2_xattr_value_root *xv,
- int len)
+ struct ocfs2_xattr_value_buf *vb,
+ int len,
+ struct ocfs2_xattr_set_ctxt *ctxt)
{
int ret;
u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, len);
- u32 old_clusters = le32_to_cpu(xv->xr_clusters);
+ u32 old_clusters = le32_to_cpu(vb->vb_xv->xr_clusters);
if (new_clusters == old_clusters)
return 0;
@@ -460,11 +744,11 @@ static int ocfs2_xattr_value_truncate(struct inode *inode,
if (new_clusters > old_clusters)
ret = ocfs2_xattr_extend_allocation(inode,
new_clusters - old_clusters,
- root_bh, xv);
+ vb, ctxt);
else
ret = ocfs2_xattr_shrink_size(inode,
old_clusters, new_clusters,
- root_bh, xv);
+ vb, ctxt);
return ret;
}
@@ -554,18 +838,14 @@ static int ocfs2_xattr_block_list(struct inode *inode,
if (!di->i_xattr_loc)
return ret;
- ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
+ ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc),
+ &blk_bh);
if (ret < 0) {
mlog_errno(ret);
return ret;
}
xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
- if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
- ret = -EIO;
- goto cleanup;
- }
-
if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header;
ret = ocfs2_xattr_list_entries(inode, header,
@@ -575,7 +855,7 @@ static int ocfs2_xattr_block_list(struct inode *inode,
ret = ocfs2_xattr_tree_list_index_block(inode, xt,
buffer, buffer_size);
}
-cleanup:
+
brelse(blk_bh);
return ret;
@@ -685,7 +965,7 @@ static int ocfs2_xattr_get_value_outside(struct inode *inode,
blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
/* Copy ocfs2_xattr_value */
for (i = 0; i < num_clusters * bpc; i++, blkno++) {
- ret = ocfs2_read_block(inode, blkno, &bh);
+ ret = ocfs2_read_block(inode, blkno, &bh, NULL);
if (ret) {
mlog_errno(ret);
goto out;
@@ -769,7 +1049,12 @@ static int ocfs2_xattr_block_get(struct inode *inode,
size_t size;
int ret = -ENODATA, name_offset, name_len, block_off, i;
- memset(&xs->bucket, 0, sizeof(xs->bucket));
+ xs->bucket = ocfs2_xattr_bucket_new(inode);
+ if (!xs->bucket) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto cleanup;
+ }
ret = ocfs2_xattr_block_find(inode, name_index, name, xs);
if (ret) {
@@ -795,11 +1080,11 @@ static int ocfs2_xattr_block_get(struct inode *inode,
if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
ret = ocfs2_xattr_bucket_get_name_value(inode,
- xs->bucket.xh,
+ bucket_xh(xs->bucket),
i,
&block_off,
&name_offset);
- xs->base = xs->bucket.bhs[block_off]->b_data;
+ xs->base = bucket_block(xs->bucket, block_off);
}
if (ocfs2_xattr_is_local(xs->here)) {
memcpy(buffer, (void *)xs->base +
@@ -817,21 +1102,15 @@ static int ocfs2_xattr_block_get(struct inode *inode,
}
ret = size;
cleanup:
- for (i = 0; i < OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET; i++)
- brelse(xs->bucket.bhs[i]);
- memset(&xs->bucket, 0, sizeof(xs->bucket));
+ ocfs2_xattr_bucket_free(xs->bucket);
brelse(xs->xattr_bh);
xs->xattr_bh = NULL;
return ret;
}
-/* ocfs2_xattr_get()
- *
- * Copy an extended attribute into the buffer provided.
- * Buffer is NULL to compute the size of buffer required.
- */
-static int ocfs2_xattr_get(struct inode *inode,
+int ocfs2_xattr_get_nolock(struct inode *inode,
+ struct buffer_head *di_bh,
int name_index,
const char *name,
void *buffer,
@@ -839,7 +1118,6 @@ static int ocfs2_xattr_get(struct inode *inode,
{
int ret;
struct ocfs2_dinode *di = NULL;
- struct buffer_head *di_bh = NULL;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_xattr_search xis = {
.not_found = -ENODATA,
@@ -854,11 +1132,6 @@ static int ocfs2_xattr_get(struct inode *inode,
if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL))
ret = -ENODATA;
- ret = ocfs2_inode_lock(inode, &di_bh, 0);
- if (ret < 0) {
- mlog_errno(ret);
- return ret;
- }
xis.inode_bh = xbs.inode_bh = di_bh;
di = (struct ocfs2_dinode *)di_bh->b_data;
@@ -869,6 +1142,32 @@ static int ocfs2_xattr_get(struct inode *inode,
ret = ocfs2_xattr_block_get(inode, name_index, name, buffer,
buffer_size, &xbs);
up_read(&oi->ip_xattr_sem);
+
+ return ret;
+}
+
+/* ocfs2_xattr_get()
+ *
+ * Copy an extended attribute into the buffer provided.
+ * Buffer is NULL to compute the size of buffer required.
+ */
+static int ocfs2_xattr_get(struct inode *inode,
+ int name_index,
+ const char *name,
+ void *buffer,
+ size_t buffer_size)
+{
+ int ret;
+ struct buffer_head *di_bh = NULL;
+
+ ret = ocfs2_inode_lock(inode, &di_bh, 0);
+ if (ret < 0) {
+ mlog_errno(ret);
+ return ret;
+ }
+ ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index,
+ name, buffer, buffer_size);
+
ocfs2_inode_unlock(inode, 0);
brelse(di_bh);
@@ -877,44 +1176,36 @@ static int ocfs2_xattr_get(struct inode *inode,
}
static int __ocfs2_xattr_set_value_outside(struct inode *inode,
+ handle_t *handle,
struct ocfs2_xattr_value_root *xv,
const void *value,
int value_len)
{
- int ret = 0, i, cp_len, credits;
+ int ret = 0, i, cp_len;
u16 blocksize = inode->i_sb->s_blocksize;
u32 p_cluster, num_clusters;
u32 cpos = 0, bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len);
u64 blkno;
struct buffer_head *bh = NULL;
- handle_t *handle;
BUG_ON(clusters > le32_to_cpu(xv->xr_clusters));
- credits = clusters * bpc;
- handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- mlog_errno(ret);
- goto out;
- }
-
while (cpos < clusters) {
ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
&num_clusters, &xv->xr_list);
if (ret) {
mlog_errno(ret);
- goto out_commit;
+ goto out;
}
blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
for (i = 0; i < num_clusters * bpc; i++, blkno++) {
- ret = ocfs2_read_block(inode, blkno, &bh);
+ ret = ocfs2_read_block(inode, blkno, &bh, NULL);
if (ret) {
mlog_errno(ret);
- goto out_commit;
+ goto out;
}
ret = ocfs2_journal_access(handle,
@@ -923,7 +1214,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
- goto out_commit;
+ goto out;
}
cp_len = value_len > blocksize ? blocksize : value_len;
@@ -937,7 +1228,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
ret = ocfs2_journal_dirty(handle, bh);
if (ret < 0) {
mlog_errno(ret);
- goto out_commit;
+ goto out;
}
brelse(bh);
bh = NULL;
@@ -951,8 +1242,6 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode,
}
cpos += num_clusters;
}
-out_commit:
- ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
out:
brelse(bh);
@@ -960,28 +1249,22 @@ out:
}
static int ocfs2_xattr_cleanup(struct inode *inode,
+ handle_t *handle,
struct ocfs2_xattr_info *xi,
struct ocfs2_xattr_search *xs,
+ struct ocfs2_xattr_value_buf *vb,
size_t offs)
{
- handle_t *handle = NULL;
int ret = 0;
size_t name_len = strlen(xi->name);
void *val = xs->base + offs;
size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE;
- handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
- OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- mlog_errno(ret);
- goto out;
- }
- ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = vb->vb_access(handle, inode, vb->vb_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
- goto out_commit;
+ goto out;
}
/* Decrease xattr count */
le16_add_cpu(&xs->header->xh_count, -1);
@@ -989,35 +1272,27 @@ static int ocfs2_xattr_cleanup(struct inode *inode,
memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry));
memset(val, 0, size);
- ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
+ ret = ocfs2_journal_dirty(handle, vb->vb_bh);
if (ret < 0)
mlog_errno(ret);
-out_commit:
- ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
out:
return ret;
}
static int ocfs2_xattr_update_entry(struct inode *inode,
+ handle_t *handle,
struct ocfs2_xattr_info *xi,
struct ocfs2_xattr_search *xs,
+ struct ocfs2_xattr_value_buf *vb,
size_t offs)
{
- handle_t *handle = NULL;
- int ret = 0;
+ int ret;
- handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
- OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- mlog_errno(ret);
- goto out;
- }
- ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = vb->vb_access(handle, inode, vb->vb_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
- goto out_commit;
+ goto out;
}
xs->here->xe_name_offset = cpu_to_le16(offs);
@@ -1028,11 +1303,9 @@ static int ocfs2_xattr_update_entry(struct inode *inode,
ocfs2_xattr_set_local(xs->here, 0);
ocfs2_xattr_hash_entry(inode, xs->header, xs->here);
- ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
+ ret = ocfs2_journal_dirty(handle, vb->vb_bh);
if (ret < 0)
mlog_errno(ret);
-out_commit:
- ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
out:
return ret;
}
@@ -1045,6 +1318,8 @@ out:
static int ocfs2_xattr_set_value_outside(struct inode *inode,
struct ocfs2_xattr_info *xi,
struct ocfs2_xattr_search *xs,
+ struct ocfs2_xattr_set_ctxt *ctxt,
+ struct ocfs2_xattr_value_buf *vb,
size_t offs)
{
size_t name_len = strlen(xi->name);
@@ -1062,20 +1337,20 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode,
xv->xr_list.l_tree_depth = 0;
xv->xr_list.l_count = cpu_to_le16(1);
xv->xr_list.l_next_free_rec = 0;
+ vb->vb_xv = xv;
- ret = ocfs2_xattr_value_truncate(inode, xs->xattr_bh, xv,
- xi->value_len);
+ ret = ocfs2_xattr_value_truncate(inode, vb, xi->value_len, ctxt);
if (ret < 0) {
mlog_errno(ret);
return ret;
}
- ret = __ocfs2_xattr_set_value_outside(inode, xv, xi->value,
- xi->value_len);
+ ret = ocfs2_xattr_update_entry(inode, ctxt->handle, xi, xs, vb, offs);
if (ret < 0) {
mlog_errno(ret);
return ret;
}
- ret = ocfs2_xattr_update_entry(inode, xi, xs, offs);
+ ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb->vb_xv,
+ xi->value, xi->value_len);
if (ret < 0)
mlog_errno(ret);
@@ -1195,6 +1470,7 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode,
static int ocfs2_xattr_set_entry(struct inode *inode,
struct ocfs2_xattr_info *xi,
struct ocfs2_xattr_search *xs,
+ struct ocfs2_xattr_set_ctxt *ctxt,
int flag)
{
struct ocfs2_xattr_entry *last;
@@ -1202,7 +1478,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name);
size_t size_l = 0;
- handle_t *handle = NULL;
+ handle_t *handle = ctxt->handle;
int free, i, ret;
struct ocfs2_xattr_info xi_l = {
.name_index = xi->name_index,
@@ -1210,6 +1486,16 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
.value = xi->value,
.value_len = xi->value_len,
};
+ struct ocfs2_xattr_value_buf vb = {
+ .vb_bh = xs->xattr_bh,
+ .vb_access = ocfs2_journal_access_di,
+ };
+
+ if (!(flag & OCFS2_INLINE_XATTR_FL)) {
+ BUG_ON(xs->xattr_bh == xs->inode_bh);
+ vb.vb_access = ocfs2_journal_access_xb;
+ } else
+ BUG_ON(xs->xattr_bh != xs->inode_bh);
/* Compute min_offs, last and free space. */
last = xs->header->xh_entries;
@@ -1265,15 +1551,14 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
if (ocfs2_xattr_is_local(xs->here) && size == size_l) {
/* Replace existing local xattr with tree root */
ret = ocfs2_xattr_set_value_outside(inode, xi, xs,
- offs);
+ ctxt, &vb, offs);
if (ret < 0)
mlog_errno(ret);
goto out;
} else if (!ocfs2_xattr_is_local(xs->here)) {
/* For existing xattr which has value outside */
- struct ocfs2_xattr_value_root *xv = NULL;
- xv = (struct ocfs2_xattr_value_root *)(val +
- OCFS2_XATTR_SIZE(name_len));
+ vb.vb_xv = (struct ocfs2_xattr_value_root *)
+ (val + OCFS2_XATTR_SIZE(name_len));
if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
/*
@@ -1282,27 +1567,30 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
* then set new value with set_value_outside().
*/
ret = ocfs2_xattr_value_truncate(inode,
- xs->xattr_bh,
- xv,
- xi->value_len);
+ &vb,
+ xi->value_len,
+ ctxt);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
- ret = __ocfs2_xattr_set_value_outside(inode,
- xv,
- xi->value,
- xi->value_len);
+ ret = ocfs2_xattr_update_entry(inode,
+ handle,
+ xi,
+ xs,
+ &vb,
+ offs);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_xattr_update_entry(inode,
- xi,
- xs,
- offs);
+ ret = __ocfs2_xattr_set_value_outside(inode,
+ handle,
+ vb.vb_xv,
+ xi->value,
+ xi->value_len);
if (ret < 0)
mlog_errno(ret);
goto out;
@@ -1312,44 +1600,28 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
* just trucate old value to zero.
*/
ret = ocfs2_xattr_value_truncate(inode,
- xs->xattr_bh,
- xv,
- 0);
+ &vb,
+ 0,
+ ctxt);
if (ret < 0)
mlog_errno(ret);
}
}
}
- handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)),
- OCFS2_INODE_UPDATE_CREDITS);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- mlog_errno(ret);
- goto out;
- }
-
- ret = ocfs2_journal_access(handle, inode, xs->inode_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
- goto out_commit;
+ goto out;
}
if (!(flag & OCFS2_INLINE_XATTR_FL)) {
- /* set extended attribute in external block. */
- ret = ocfs2_extend_trans(handle,
- OCFS2_INODE_UPDATE_CREDITS +
- OCFS2_XATTR_BLOCK_UPDATE_CREDITS);
- if (ret) {
- mlog_errno(ret);
- goto out_commit;
- }
- ret = ocfs2_journal_access(handle, inode, xs->xattr_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = vb.vb_access(handle, inode, vb.vb_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
- goto out_commit;
+ goto out;
}
}
@@ -1363,7 +1635,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
ret = ocfs2_journal_dirty(handle, xs->xattr_bh);
if (ret < 0) {
mlog_errno(ret);
- goto out_commit;
+ goto out;
}
}
@@ -1391,25 +1663,19 @@ static int ocfs2_xattr_set_entry(struct inode *inode,
oi->ip_dyn_features |= flag;
di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
spin_unlock(&oi->ip_lock);
- /* Update inode ctime */
- inode->i_ctime = CURRENT_TIME;
- di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
- di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
ret = ocfs2_journal_dirty(handle, xs->inode_bh);
if (ret < 0)
mlog_errno(ret);
-out_commit:
- ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
-
if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
/*
* Set value outside in B tree.
* This is the second step for value size > INLINE_SIZE.
*/
size_t offs = le16_to_cpu(xs->here->xe_name_offset);
- ret = ocfs2_xattr_set_value_outside(inode, xi, xs, offs);
+ ret = ocfs2_xattr_set_value_outside(inode, xi, xs, ctxt,
+ &vb, offs);
if (ret < 0) {
int ret2;
@@ -1418,41 +1684,56 @@ out_commit:
* If set value outside failed, we have to clean
* the junk tree root we have already set in local.
*/
- ret2 = ocfs2_xattr_cleanup(inode, xi, xs, offs);
+ ret2 = ocfs2_xattr_cleanup(inode, ctxt->handle,
+ xi, xs, &vb, offs);
if (ret2 < 0)
mlog_errno(ret2);
}
}
out:
return ret;
-
}
static int ocfs2_remove_value_outside(struct inode*inode,
- struct buffer_head *bh,
+ struct ocfs2_xattr_value_buf *vb,
struct ocfs2_xattr_header *header)
{
int ret = 0, i;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
+
+ ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
+
+ ctxt.handle = ocfs2_start_trans(osb,
+ ocfs2_remove_extent_credits(osb->sb));
+ if (IS_ERR(ctxt.handle)) {
+ ret = PTR_ERR(ctxt.handle);
+ mlog_errno(ret);
+ goto out;
+ }
for (i = 0; i < le16_to_cpu(header->xh_count); i++) {
struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
if (!ocfs2_xattr_is_local(entry)) {
- struct ocfs2_xattr_value_root *xv;
void *val;
val = (void *)header +
le16_to_cpu(entry->xe_name_offset);
- xv = (struct ocfs2_xattr_value_root *)
+ vb->vb_xv = (struct ocfs2_xattr_value_root *)
(val + OCFS2_XATTR_SIZE(entry->xe_name_len));
- ret = ocfs2_xattr_value_truncate(inode, bh, xv, 0);
+ ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt);
if (ret < 0) {
mlog_errno(ret);
- return ret;
+ break;
}
}
}
+ ocfs2_commit_trans(osb, ctxt.handle);
+ ocfs2_schedule_truncate_log_flush(osb, 1);
+ ocfs2_run_deallocs(osb, &ctxt.dealloc);
+out:
return ret;
}
@@ -1463,12 +1744,16 @@ static int ocfs2_xattr_ibody_remove(struct inode *inode,
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
struct ocfs2_xattr_header *header;
int ret;
+ struct ocfs2_xattr_value_buf vb = {
+ .vb_bh = di_bh,
+ .vb_access = ocfs2_journal_access_di,
+ };
header = (struct ocfs2_xattr_header *)
((void *)di + inode->i_sb->s_blocksize -
le16_to_cpu(di->i_xattr_inline_size));
- ret = ocfs2_remove_value_outside(inode, di_bh, header);
+ ret = ocfs2_remove_value_outside(inode, &vb, header);
return ret;
}
@@ -1478,11 +1763,15 @@ static int ocfs2_xattr_block_remove(struct inode *inode,
{
struct ocfs2_xattr_block *xb;
int ret = 0;
+ struct ocfs2_xattr_value_buf vb = {
+ .vb_bh = blk_bh,
+ .vb_access = ocfs2_journal_access_xb,
+ };
xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header);
- ret = ocfs2_remove_value_outside(inode, blk_bh, header);
+ ret = ocfs2_remove_value_outside(inode, &vb, header);
} else
ret = ocfs2_delete_xattr_index_block(inode, blk_bh);
@@ -1502,24 +1791,19 @@ static int ocfs2_xattr_free_block(struct inode *inode,
u64 blk, bg_blkno;
u16 bit;
- ret = ocfs2_read_block(inode, block, &blk_bh);
+ ret = ocfs2_read_xattr_block(inode, block, &blk_bh);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
- xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
- if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
- ret = -EIO;
- goto out;
- }
-
ret = ocfs2_xattr_block_remove(inode, blk_bh);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
+ xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
blk = le64_to_cpu(xb->xb_blkno);
bit = le16_to_cpu(xb->xb_suballoc_bit);
bg_blkno = ocfs2_which_suballoc_group(blk, bit);
@@ -1606,8 +1890,8 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh)
mlog_errno(ret);
goto out;
}
- ret = ocfs2_journal_access(handle, inode, di_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_di(handle, inode, di_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out_commit;
@@ -1714,7 +1998,8 @@ static int ocfs2_xattr_ibody_find(struct inode *inode,
*/
static int ocfs2_xattr_ibody_set(struct inode *inode,
struct ocfs2_xattr_info *xi,
- struct ocfs2_xattr_search *xs)
+ struct ocfs2_xattr_search *xs,
+ struct ocfs2_xattr_set_ctxt *ctxt)
{
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
@@ -1731,7 +2016,7 @@ static int ocfs2_xattr_ibody_set(struct inode *inode,
}
}
- ret = ocfs2_xattr_set_entry(inode, xi, xs,
+ ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt,
(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL));
out:
up_write(&oi->ip_alloc_sem);
@@ -1758,19 +2043,15 @@ static int ocfs2_xattr_block_find(struct inode *inode,
if (!di->i_xattr_loc)
return ret;
- ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh);
+ ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc),
+ &blk_bh);
if (ret < 0) {
mlog_errno(ret);
return ret;
}
- xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
- if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) {
- ret = -EIO;
- goto cleanup;
- }
-
xs->xattr_bh = blk_bh;
+ xb = (struct ocfs2_xattr_block *)blk_bh->b_data;
if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) {
xs->header = &xb->xb_attrs.xb_header;
@@ -1804,13 +2085,13 @@ cleanup:
*/
static int ocfs2_xattr_block_set(struct inode *inode,
struct ocfs2_xattr_info *xi,
- struct ocfs2_xattr_search *xs)
+ struct ocfs2_xattr_search *xs,
+ struct ocfs2_xattr_set_ctxt *ctxt)
{
struct buffer_head *new_bh = NULL;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data;
- struct ocfs2_alloc_context *meta_ac = NULL;
- handle_t *handle = NULL;
+ handle_t *handle = ctxt->handle;
struct ocfs2_xattr_block *xblk = NULL;
u16 suballoc_bit_start;
u32 num_got;
@@ -1818,45 +2099,29 @@ static int ocfs2_xattr_block_set(struct inode *inode,
int ret;
if (!xs->xattr_bh) {
- /*
- * Alloc one external block for extended attribute
- * outside of inode.
- */
- ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
+ ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
if (ret < 0) {
mlog_errno(ret);
- goto out;
- }
- handle = ocfs2_start_trans(osb,
- OCFS2_XATTR_BLOCK_CREATE_CREDITS);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- mlog_errno(ret);
- goto out;
- }
- ret = ocfs2_journal_access(handle, inode, xs->inode_bh,
- OCFS2_JOURNAL_ACCESS_CREATE);
- if (ret < 0) {
- mlog_errno(ret);
- goto out_commit;
+ goto end;
}
- ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1,
+ ret = ocfs2_claim_metadata(osb, handle, ctxt->meta_ac, 1,
&suballoc_bit_start, &num_got,
&first_blkno);
if (ret < 0) {
mlog_errno(ret);
- goto out_commit;
+ goto end;
}
new_bh = sb_getblk(inode->i_sb, first_blkno);
ocfs2_set_new_buffer_uptodate(inode, new_bh);
- ret = ocfs2_journal_access(handle, inode, new_bh,
- OCFS2_JOURNAL_ACCESS_CREATE);
+ ret = ocfs2_journal_access_xb(handle, inode, new_bh,
+ OCFS2_JOURNAL_ACCESS_CREATE);
if (ret < 0) {
mlog_errno(ret);
- goto out_commit;
+ goto end;
}
/* Initialize ocfs2_xattr_block */
@@ -1874,44 +2139,555 @@ static int ocfs2_xattr_block_set(struct inode *inode,
xs->end = (void *)xblk + inode->i_sb->s_blocksize;
xs->here = xs->header->xh_entries;
-
ret = ocfs2_journal_dirty(handle, new_bh);
if (ret < 0) {
mlog_errno(ret);
- goto out_commit;
+ goto end;
}
di->i_xattr_loc = cpu_to_le64(first_blkno);
- ret = ocfs2_journal_dirty(handle, xs->inode_bh);
- if (ret < 0)
- mlog_errno(ret);
-out_commit:
- ocfs2_commit_trans(osb, handle);
-out:
- if (meta_ac)
- ocfs2_free_alloc_context(meta_ac);
- if (ret < 0)
- return ret;
+ ocfs2_journal_dirty(handle, xs->inode_bh);
} else
xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data;
if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) {
/* Set extended attribute into external block */
- ret = ocfs2_xattr_set_entry(inode, xi, xs, OCFS2_HAS_XATTR_FL);
+ ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt,
+ OCFS2_HAS_XATTR_FL);
if (!ret || ret != -ENOSPC)
goto end;
- ret = ocfs2_xattr_create_index_block(inode, xs);
+ ret = ocfs2_xattr_create_index_block(inode, xs, ctxt);
if (ret)
goto end;
}
- ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs);
+ ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt);
end:
return ret;
}
+/*
+ * Decide whether the new xattr described by @xi fits into the xattr area
+ * described by @xs.
+ *
+ * The free room is the gap between the end of the entry array and the
+ * lowest name offset used by an existing entry, minus sizeof(__u32).  The
+ * new xattr needs one entry, its padded name, and either its padded value
+ * or, for values larger than OCFS2_XATTR_INLINE_SIZE, a value root.
+ *
+ * Returns 1 if it fits, 0 if it does not or if @xs has no header.  Once the
+ * free-room check has passed, the xattr must not already exist in @xs
+ * (BUG_ON(!xs->not_found)).
+ */
+static int ocfs2_xattr_can_be_in_inode(struct inode *inode,
+				       struct ocfs2_xattr_info *xi,
+				       struct ocfs2_xattr_search *xs)
+{
+	struct ocfs2_xattr_entry *entry;
+	size_t lowest = xs->end - xs->base;
+	u64 needed;
+	int idx, room;
+
+	if (!xs->header)
+		return 0;
+
+	/* Find the lowest name offset used by any existing entry. */
+	entry = xs->header->xh_entries;
+	for (idx = 0; idx < le16_to_cpu(xs->header->xh_count); idx++) {
+		size_t off = le16_to_cpu(entry->xe_name_offset);
+
+		if (off < lowest)
+			lowest = off;
+		entry++;
+	}
+
+	/* 'entry' now points just past the last existing entry. */
+	room = lowest - ((void *)entry - xs->base) - sizeof(__u32);
+	if (room < 0)
+		return 0;
+
+	BUG_ON(!xs->not_found);
+
+	needed = sizeof(struct ocfs2_xattr_entry) +
+		 OCFS2_XATTR_SIZE(strlen(xi->name));
+	if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
+		needed += OCFS2_XATTR_ROOT_SIZE;
+	else
+		needed += OCFS2_XATTR_SIZE(xi->value_len);
+
+	return room >= needed ? 1 : 0;
+}
+
+/*
+ * Estimate what setting (or removing) the xattr described by @xi will need.
+ *
+ * @inode:         inode owning the xattr
+ * @di:            on-disk inode; only di->i_xattr_loc is consulted here
+ * @xi:            new name/value; xi->value == NULL means removal
+ * @xis:           result of the in-inode lookup
+ * @xbs:           result of the xattr block/bucket lookup
+ * @clusters_need: out, clusters to reserve (may be NULL)
+ * @meta_need:     out, metadata blocks to reserve (may be NULL)
+ * @credits_need:  out, journal credits to reserve (may be NULL)
+ *
+ * Returns 0 on success, or a negative errno if the existing entry cannot be
+ * located inside its bucket or the xattr block cannot be read.  The out
+ * parameters are written on every path, but are only meaningful when 0 is
+ * returned.
+ */
+static int ocfs2_calc_xattr_set_need(struct inode *inode,
+				     struct ocfs2_dinode *di,
+				     struct ocfs2_xattr_info *xi,
+				     struct ocfs2_xattr_search *xis,
+				     struct ocfs2_xattr_search *xbs,
+				     int *clusters_need,
+				     int *meta_need,
+				     int *credits_need)
+{
+	int ret = 0, old_in_xb = 0;
+	int clusters_add = 0, meta_add = 0, credits = 0;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_xattr_block *xb = NULL;
+	struct ocfs2_xattr_entry *xe = NULL;
+	struct ocfs2_xattr_value_root *xv = NULL;
+	char *base = NULL;
+	int name_offset, name_len = 0;
+	u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb,
+						    xi->value_len);
+	u64 value_size;
+
+	/*
+	 * Calculate the clusters we need to write.
+	 * No matter whether we replace an old one or add a new one,
+	 * we need this for writing.
+	 */
+	if (xi->value_len > OCFS2_XATTR_INLINE_SIZE)
+		credits += new_clusters *
+			   ocfs2_clusters_to_blocks(inode->i_sb, 1);
+
+	if (xis->not_found && xbs->not_found) {
+		/* Brand new xattr: nothing old to inspect. */
+		credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+		if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
+			clusters_add += new_clusters;
+			credits += ocfs2_calc_extend_credits(inode->i_sb,
+							&def_xv.xv.xr_list,
+							new_clusters);
+		}
+
+		goto meta_guess;
+	}
+
+	/* Locate the existing entry, preferring the copy in the inode. */
+	if (!xis->not_found) {
+		xe = xis->here;
+		name_offset = le16_to_cpu(xe->xe_name_offset);
+		name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+		base = xis->base;
+		credits += OCFS2_INODE_UPDATE_CREDITS;
+	} else {
+		int i, block_off = 0;
+		xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data;
+		xe = xbs->here;
+		name_offset = le16_to_cpu(xe->xe_name_offset);
+		name_len = OCFS2_XATTR_SIZE(xe->xe_name_len);
+		i = xbs->here - xbs->header->xh_entries;
+		old_in_xb = 1;
+
+		if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
+			ret = ocfs2_xattr_bucket_get_name_value(inode,
+							bucket_xh(xbs->bucket),
+							i, &block_off,
+							&name_offset);
+			if (ret) {
+				/*
+				 * block_off/name_offset are not valid, so
+				 * do not derive 'base' (and later 'xv')
+				 * from them; report the error right away.
+				 */
+				mlog_errno(ret);
+				goto out;
+			}
+			base = bucket_block(xbs->bucket, block_off);
+			credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+		} else {
+			base = xbs->base;
+			credits += OCFS2_XATTR_BLOCK_UPDATE_CREDITS;
+		}
+	}
+
+	/*
+	 * Deleting an xattr needs no metadata or cluster allocation,
+	 * so just calculate the credits and return.
+	 *
+	 * The credits for removing the value tree will be extended
+	 * by ocfs2_remove_extent itself.
+	 */
+	if (!xi->value) {
+		if (!ocfs2_xattr_is_local(xe))
+			credits += ocfs2_remove_extent_credits(inode->i_sb);
+
+		goto out;
+	}
+
+	/* do cluster allocation guess first. */
+	value_size = le64_to_cpu(xe->xe_value_size);
+
+	if (old_in_xb) {
+		/*
+		 * In xattr set, we always try to set the xe in inode first,
+		 * so if it can be inserted into inode successfully, the old
+		 * one will be removed from the xattr block, and this xattr
+		 * will be inserted into inode as a new xattr in inode.
+		 */
+		if (ocfs2_xattr_can_be_in_inode(inode, xi, xis)) {
+			clusters_add += new_clusters;
+			credits += ocfs2_remove_extent_credits(inode->i_sb) +
+				   OCFS2_INODE_UPDATE_CREDITS;
+			if (!ocfs2_xattr_is_local(xe))
+				credits += ocfs2_calc_extend_credits(
+							inode->i_sb,
+							&def_xv.xv.xr_list,
+							new_clusters);
+			goto out;
+		}
+	}
+
+	if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) {
+		/* the new values will be stored outside. */
+		u32 old_clusters = 0;
+
+		if (!ocfs2_xattr_is_local(xe)) {
+			old_clusters = ocfs2_clusters_for_bytes(inode->i_sb,
+								value_size);
+			xv = (struct ocfs2_xattr_value_root *)
+			     (base + name_offset + name_len);
+			value_size = OCFS2_XATTR_ROOT_SIZE;
+		} else
+			xv = &def_xv.xv;
+
+		if (old_clusters >= new_clusters) {
+			credits += ocfs2_remove_extent_credits(inode->i_sb);
+			goto out;
+		} else {
+			meta_add += ocfs2_extend_meta_needed(&xv->xr_list);
+			clusters_add += new_clusters - old_clusters;
+			credits += ocfs2_calc_extend_credits(inode->i_sb,
+							     &xv->xr_list,
+							     new_clusters -
+							     old_clusters);
+			if (value_size >= OCFS2_XATTR_ROOT_SIZE)
+				goto out;
+		}
+	} else {
+		/*
+		 * Now the new value will be stored inside. So if the new
+		 * value is smaller than the size of value root or the old
+		 * value, we don't need any allocation, otherwise we have
+		 * to guess metadata allocation.
+		 */
+		if ((ocfs2_xattr_is_local(xe) && value_size >= xi->value_len) ||
+		    (!ocfs2_xattr_is_local(xe) &&
+		     OCFS2_XATTR_ROOT_SIZE >= xi->value_len))
+			goto out;
+	}
+
+meta_guess:
+	/* calculate metadata allocation. */
+	if (di->i_xattr_loc) {
+		if (!xbs->xattr_bh) {
+			ret = ocfs2_read_xattr_block(inode,
+						     le64_to_cpu(di->i_xattr_loc),
+						     &bh);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			xb = (struct ocfs2_xattr_block *)bh->b_data;
+		} else
+			xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data;
+
+		/*
+		 * If there is already an xattr tree, good, we can calculate
+		 * like other b-trees. Otherwise we may have the chance of
+		 * create a tree, the credit calculation is borrowed from
+		 * ocfs2_calc_extend_credits with root_el = NULL. And the
+		 * new tree will be cluster based, so no meta is needed.
+		 */
+		if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) {
+			struct ocfs2_extent_list *el =
+				 &xb->xb_attrs.xb_root.xt_list;
+			meta_add += ocfs2_extend_meta_needed(el);
+			credits += ocfs2_calc_extend_credits(inode->i_sb,
+							     el, 1);
+		} else
+			credits += OCFS2_SUBALLOC_ALLOC + 1;
+
+		/*
+		 * This cluster will be used either for new bucket or for
+		 * new xattr block.
+		 * If the cluster size is the same as the bucket size, one
+		 * more is needed since we may need to extend the bucket
+		 * also.
+		 */
+		clusters_add += 1;
+		credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+		if (OCFS2_XATTR_BUCKET_SIZE ==
+			OCFS2_SB(inode->i_sb)->s_clustersize) {
+			credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+			clusters_add += 1;
+		}
+	} else {
+		/* No xattr block yet: one will have to be created. */
+		meta_add += 1;
+		credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS;
+	}
+out:
+	if (clusters_need)
+		*clusters_need = clusters_add;
+	if (meta_need)
+		*meta_need = meta_add;
+	if (credits_need)
+		*credits_need = credits;
+	brelse(bh);
+	return ret;
+}
+
+/*
+ * Prepare @ctxt for an xattr set: zero it, initialise its dealloc context,
+ * estimate what the operation needs and reserve the metadata blocks and
+ * clusters up front.  The journal credit estimate is handed back through
+ * @credits; no transaction is started here.
+ *
+ * Returns 0 on success or a negative errno.  On failure ctxt->meta_ac is
+ * released and reset to NULL.
+ */
+static int ocfs2_init_xattr_set_ctxt(struct inode *inode,
+				     struct ocfs2_dinode *di,
+				     struct ocfs2_xattr_info *xi,
+				     struct ocfs2_xattr_search *xis,
+				     struct ocfs2_xattr_search *xbs,
+				     struct ocfs2_xattr_set_ctxt *ctxt,
+				     int *credits)
+{
+	int ret, want_clusters, want_meta;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	memset(ctxt, 0, sizeof(struct ocfs2_xattr_set_ctxt));
+	ocfs2_init_dealloc_ctxt(&ctxt->dealloc);
+
+	ret = ocfs2_calc_xattr_set_need(inode, di, xi, xis, xbs,
+					&want_clusters, &want_meta, credits);
+	if (ret) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, "
+	     "credits = %d\n", xi->name, want_meta, want_clusters, *credits);
+
+	if (want_meta) {
+		ret = ocfs2_reserve_new_metadata_blocks(osb, want_meta,
+							&ctxt->meta_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto undo;
+		}
+	}
+
+	if (want_clusters) {
+		ret = ocfs2_reserve_clusters(osb, want_clusters,
+					     &ctxt->data_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto undo;
+		}
+	}
+
+	return 0;
+
+undo:
+	/*
+	 * The cluster reservation is the last step, so we cannot have an
+	 * error and a non null ctxt->data_ac; only meta_ac may need undoing.
+	 */
+	if (ctxt->meta_ac) {
+		ocfs2_free_alloc_context(ctxt->meta_ac);
+		ctxt->meta_ac = NULL;
+	}
+
+	return ret;
+}
+/*
+ * Core of xattr set/remove, running inside the transaction ctxt->handle.
+ *
+ * xi->value == NULL: remove the xattr from wherever the earlier lookups
+ * found it (the inode body if found there, otherwise the xattr block).
+ *
+ * Otherwise the value is first written into the inode body:
+ *  - if that succeeds and an old copy exists in the xattr block, the old
+ *    copy is removed;
+ *  - if it fails with -ENOSPC, the value is written into the xattr block
+ *    instead and any old copy in the inode body is removed.
+ *
+ * Before each follow-up step the credits are re-estimated with
+ * ocfs2_calc_xattr_set_need() and the handle is extended by that amount.
+ * For the xattr-block steps xis->not_found is temporarily forced to
+ * -ENODATA (and restored afterwards) so the estimate is made against the
+ * block copy rather than the inode copy.
+ *
+ * Side effects on the arguments: xi->value and xi->value_len are cleared
+ * when a stale copy has to be removed, and xbs->not_found is overwritten
+ * with -ENODATA (not restored) before removing the stale inode copy.
+ *
+ * On success the inode ctime is updated and the inode buffer journalled.
+ * Returns 0 on success or a negative errno.
+ */
+static int __ocfs2_xattr_set_handle(struct inode *inode,
+ struct ocfs2_dinode *di,
+ struct ocfs2_xattr_info *xi,
+ struct ocfs2_xattr_search *xis,
+ struct ocfs2_xattr_search *xbs,
+ struct ocfs2_xattr_set_ctxt *ctxt)
+{
+ int ret = 0, credits, old_found;
+
+ if (!xi->value) {
+ /* Remove the existing extended attribute, if one was found. */
+ if (!xis->not_found)
+ ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt);
+ else if (!xbs->not_found)
+ ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
+ } else {
+ /* We always try to set the extended attribute in the inode first. */
+ ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt);
+ if (!ret && !xbs->not_found) {
+ /*
+ * The set succeeded and the extended attribute also
+ * exists in the external block, so remove that old
+ * copy (a NULL value means "remove").
+ */
+ xi->value = NULL;
+ xi->value_len = 0;
+
+ old_found = xis->not_found;
+ xis->not_found = -ENODATA;
+ ret = ocfs2_calc_xattr_set_need(inode,
+ di,
+ xi,
+ xis,
+ xbs,
+ NULL,
+ NULL,
+ &credits);
+ xis->not_found = old_found;
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_extend_trans(ctxt->handle, credits +
+ ctxt->handle->h_buffer_credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
+ } else if (ret == -ENOSPC) {
+ if (di->i_xattr_loc && !xbs->xattr_bh) {
+ ret = ocfs2_xattr_block_find(inode,
+ xi->name_index,
+ xi->name, xbs);
+ if (ret)
+ goto out;
+
+ old_found = xis->not_found;
+ xis->not_found = -ENODATA;
+ ret = ocfs2_calc_xattr_set_need(inode,
+ di,
+ xi,
+ xis,
+ xbs,
+ NULL,
+ NULL,
+ &credits);
+ xis->not_found = old_found;
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_extend_trans(ctxt->handle, credits +
+ ctxt->handle->h_buffer_credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ }
+ /*
+ * There is no space in the inode, so set the extended
+ * attribute in the external block instead.
+ */
+ ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt);
+ if (ret)
+ goto out;
+ if (!xis->not_found) {
+ /*
+ * The set succeeded and an old copy of the
+ * extended attribute exists in the inode, so
+ * remove it (a NULL value means "remove").
+ */
+ xi->value = NULL;
+ xi->value_len = 0;
+ xbs->not_found = -ENODATA;
+ ret = ocfs2_calc_xattr_set_need(inode,
+ di,
+ xi,
+ xis,
+ xbs,
+ NULL,
+ NULL,
+ &credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_extend_trans(ctxt->handle, credits +
+ ctxt->handle->h_buffer_credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+ ret = ocfs2_xattr_ibody_set(inode, xi,
+ xis, ctxt);
+ }
+ }
+ }
+
+ if (!ret) {
+ /*
+ * Update inode ctime.
+ * NOTE(review): ocfs2_xattr_block_set() in this file accesses
+ * the inode buffer through ocfs2_journal_access_di(); confirm
+ * whether this call was meant to use it as well.
+ */
+ ret = ocfs2_journal_access(ctxt->handle, inode, xis->inode_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
+
+ inode->i_ctime = CURRENT_TIME;
+ di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
+ di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+ ocfs2_journal_dirty(ctxt->handle, xis->inode_bh);
+ }
+out:
+ return ret;
+}
+
+/*
+ * Set an xattr inside a transaction the caller has already started.
+ *
+ * This function is only called during inode creation, to initialise the
+ * security/acl xattrs of the new inode.  All transaction credits have
+ * been reserved in mknod, and the caller supplies the allocation
+ * contexts in @meta_ac and @data_ac.  @flags is not used here.
+ *
+ * Returns 0 on success, -EOPNOTSUPP if the filesystem does not support
+ * xattrs, -ENOMEM if the bucket cannot be allocated, or the negative
+ * errno from the lookup or set step.
+ */
+int ocfs2_xattr_set_handle(handle_t *handle,
+			   struct inode *inode,
+			   struct buffer_head *di_bh,
+			   int name_index,
+			   const char *name,
+			   const void *value,
+			   size_t value_len,
+			   int flags,
+			   struct ocfs2_alloc_context *meta_ac,
+			   struct ocfs2_alloc_context *data_ac)
+{
+	int status;
+	struct ocfs2_dinode *dinode;
+	struct ocfs2_xattr_info info = {
+		.name_index = name_index,
+		.name = name,
+		.value = value,
+		.value_len = value_len,
+	};
+	struct ocfs2_xattr_search in_inode = {
+		.inode_bh = di_bh,
+		.not_found = -ENODATA,
+	};
+	struct ocfs2_xattr_search in_block = {
+		.inode_bh = di_bh,
+		.not_found = -ENODATA,
+	};
+	struct ocfs2_xattr_set_ctxt ctxt = {
+		.handle = handle,
+		.meta_ac = meta_ac,
+		.data_ac = data_ac,
+	};
+
+	if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
+		return -EOPNOTSUPP;
+
+	/*
+	 * In extreme situation, may need xattr bucket when
+	 * block size is too small. And we have already reserved
+	 * the credits for bucket in mknod.
+	 */
+	if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) {
+		in_block.bucket = ocfs2_xattr_bucket_new(inode);
+		if (!in_block.bucket) {
+			mlog_errno(-ENOMEM);
+			return -ENOMEM;
+		}
+	}
+
+	dinode = (struct ocfs2_dinode *)di_bh->b_data;
+
+	down_write(&OCFS2_I(inode)->ip_xattr_sem);
+
+	/* Look in the inode body first, then in the xattr block. */
+	status = ocfs2_xattr_ibody_find(inode, name_index, name, &in_inode);
+	if (!status && in_inode.not_found)
+		status = ocfs2_xattr_block_find(inode, name_index, name,
+						&in_block);
+	if (!status)
+		status = __ocfs2_xattr_set_handle(inode, dinode, &info,
+						  &in_inode, &in_block, &ctxt);
+
+	up_write(&OCFS2_I(inode)->ip_xattr_sem);
+	brelse(in_block.xattr_bh);
+	/* bucket is NULL unless the block size is OCFS2_MIN_BLOCKSIZE. */
+	ocfs2_xattr_bucket_free(in_block.bucket);
+
+	return status;
+}
+
/*
* ocfs2_xattr_set()
*
@@ -1928,8 +2704,10 @@ int ocfs2_xattr_set(struct inode *inode,
{
struct buffer_head *di_bh = NULL;
struct ocfs2_dinode *di;
- int ret;
- u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+ int ret, credits;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct inode *tl_inode = osb->osb_tl_inode;
+ struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, };
struct ocfs2_xattr_info xi = {
.name_index = name_index,
@@ -1949,10 +2727,20 @@ int ocfs2_xattr_set(struct inode *inode,
if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb)))
return -EOPNOTSUPP;
+ /*
+ * Only xbs will be used on indexed trees. xis doesn't need a
+ * bucket.
+ */
+ xbs.bucket = ocfs2_xattr_bucket_new(inode);
+ if (!xbs.bucket) {
+ mlog_errno(-ENOMEM);
+ return -ENOMEM;
+ }
+
ret = ocfs2_inode_lock(inode, &di_bh, 1);
if (ret < 0) {
mlog_errno(ret);
- return ret;
+ goto cleanup_nolock;
}
xis.inode_bh = xbs.inode_bh = di_bh;
di = (struct ocfs2_dinode *)di_bh->b_data;
@@ -1984,55 +2772,53 @@ int ocfs2_xattr_set(struct inode *inode,
goto cleanup;
}
- if (!value) {
- /* Remove existing extended attribute */
- if (!xis.not_found)
- ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
- else if (!xbs.not_found)
- ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
- } else {
- /* We always try to set extended attribute into inode first*/
- ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
- if (!ret && !xbs.not_found) {
- /*
- * If succeed and that extended attribute existing in
- * external block, then we will remove it.
- */
- xi.value = NULL;
- xi.value_len = 0;
- ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
- } else if (ret == -ENOSPC) {
- if (di->i_xattr_loc && !xbs.xattr_bh) {
- ret = ocfs2_xattr_block_find(inode, name_index,
- name, &xbs);
- if (ret)
- goto cleanup;
- }
- /*
- * If no space in inode, we will set extended attribute
- * into external block.
- */
- ret = ocfs2_xattr_block_set(inode, &xi, &xbs);
- if (ret)
- goto cleanup;
- if (!xis.not_found) {
- /*
- * If succeed and that extended attribute
- * existing in inode, we will remove it.
- */
- xi.value = NULL;
- xi.value_len = 0;
- ret = ocfs2_xattr_ibody_set(inode, &xi, &xis);
- }
+
+ mutex_lock(&tl_inode->i_mutex);
+
+ if (ocfs2_truncate_log_needs_flush(osb)) {
+ ret = __ocfs2_flush_truncate_log(osb);
+ if (ret < 0) {
+ mutex_unlock(&tl_inode->i_mutex);
+ mlog_errno(ret);
+ goto cleanup;
}
}
+ mutex_unlock(&tl_inode->i_mutex);
+
+ ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis,
+ &xbs, &ctxt, &credits);
+ if (ret) {
+ mlog_errno(ret);
+ goto cleanup;
+ }
+
+ /* we need to update inode's ctime field, so add credit for it. */
+ credits += OCFS2_INODE_UPDATE_CREDITS;
+ ctxt.handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(ctxt.handle)) {
+ ret = PTR_ERR(ctxt.handle);
+ mlog_errno(ret);
+ goto cleanup;
+ }
+
+ ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt);
+
+ ocfs2_commit_trans(osb, ctxt.handle);
+
+ if (ctxt.data_ac)
+ ocfs2_free_alloc_context(ctxt.data_ac);
+ if (ctxt.meta_ac)
+ ocfs2_free_alloc_context(ctxt.meta_ac);
+ if (ocfs2_dealloc_has_cluster(&ctxt.dealloc))
+ ocfs2_schedule_truncate_log_flush(osb, 1);
+ ocfs2_run_deallocs(osb, &ctxt.dealloc);
cleanup:
up_write(&OCFS2_I(inode)->ip_xattr_sem);
ocfs2_inode_unlock(inode, 1);
+cleanup_nolock:
brelse(di_bh);
brelse(xbs.xattr_bh);
- for (i = 0; i < blk_per_bucket; i++)
- brelse(xbs.bucket.bhs[i]);
+ ocfs2_xattr_bucket_free(xbs.bucket);
return ret;
}
@@ -2107,7 +2893,7 @@ typedef int (xattr_bucket_func)(struct inode *inode,
void *para);
static int ocfs2_find_xe_in_bucket(struct inode *inode,
- struct buffer_head *header_bh,
+ struct ocfs2_xattr_bucket *bucket,
int name_index,
const char *name,
u32 name_hash,
@@ -2115,11 +2901,9 @@ static int ocfs2_find_xe_in_bucket(struct inode *inode,
int *found)
{
int i, ret = 0, cmp = 1, block_off, new_offset;
- struct ocfs2_xattr_header *xh =
- (struct ocfs2_xattr_header *)header_bh->b_data;
+ struct ocfs2_xattr_header *xh = bucket_xh(bucket);
size_t name_len = strlen(name);
struct ocfs2_xattr_entry *xe = NULL;
- struct buffer_head *name_bh = NULL;
char *xe_name;
/*
@@ -2150,19 +2934,9 @@ static int ocfs2_find_xe_in_bucket(struct inode *inode,
break;
}
- ret = ocfs2_read_block(inode, header_bh->b_blocknr + block_off,
- &name_bh);
- if (ret) {
- mlog_errno(ret);
- break;
- }
- xe_name = name_bh->b_data + new_offset;
- cmp = memcmp(name, xe_name, name_len);
- brelse(name_bh);
- name_bh = NULL;
-
- if (cmp == 0) {
+ xe_name = bucket_block(bucket, block_off) + new_offset;
+ if (!memcmp(name, xe_name, name_len)) {
*xe_index = i;
*found = 1;
ret = 0;
@@ -2192,39 +2966,42 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
struct ocfs2_xattr_search *xs)
{
int ret, found = 0;
- struct buffer_head *bh = NULL;
- struct buffer_head *lower_bh = NULL;
struct ocfs2_xattr_header *xh = NULL;
struct ocfs2_xattr_entry *xe = NULL;
u16 index = 0;
u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
int low_bucket = 0, bucket, high_bucket;
+ struct ocfs2_xattr_bucket *search;
u32 last_hash;
- u64 blkno;
+ u64 blkno, lower_blkno = 0;
- ret = ocfs2_read_block(inode, p_blkno, &bh);
+ search = ocfs2_xattr_bucket_new(inode);
+ if (!search) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
+
+ ret = ocfs2_read_xattr_bucket(search, p_blkno);
if (ret) {
mlog_errno(ret);
goto out;
}
- xh = (struct ocfs2_xattr_header *)bh->b_data;
+ xh = bucket_xh(search);
high_bucket = le16_to_cpu(xh->xh_num_buckets) - 1;
-
while (low_bucket <= high_bucket) {
- brelse(bh);
- bh = NULL;
- bucket = (low_bucket + high_bucket) / 2;
+ ocfs2_xattr_bucket_relse(search);
+ bucket = (low_bucket + high_bucket) / 2;
blkno = p_blkno + bucket * blk_per_bucket;
-
- ret = ocfs2_read_block(inode, blkno, &bh);
+ ret = ocfs2_read_xattr_bucket(search, blkno);
if (ret) {
mlog_errno(ret);
goto out;
}
- xh = (struct ocfs2_xattr_header *)bh->b_data;
+ xh = bucket_xh(search);
xe = &xh->xh_entries[0];
if (name_hash < le32_to_cpu(xe->xe_name_hash)) {
high_bucket = bucket - 1;
@@ -2241,10 +3018,8 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
last_hash = le32_to_cpu(xe->xe_name_hash);
- /* record lower_bh which may be the insert place. */
- brelse(lower_bh);
- lower_bh = bh;
- bh = NULL;
+ /* record lower_blkno which may be the insert place. */
+ lower_blkno = blkno;
if (name_hash > le32_to_cpu(xe->xe_name_hash)) {
low_bucket = bucket + 1;
@@ -2252,7 +3027,7 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
}
/* the searched xattr should reside in this bucket if exists. */
- ret = ocfs2_find_xe_in_bucket(inode, lower_bh,
+ ret = ocfs2_find_xe_in_bucket(inode, search,
name_index, name, name_hash,
&index, &found);
if (ret) {
@@ -2267,46 +3042,29 @@ static int ocfs2_xattr_bucket_find(struct inode *inode,
* When the xattr's hash value is in the gap of 2 buckets, we will
* always set it to the previous bucket.
*/
- if (!lower_bh) {
- /*
- * We can't find any bucket whose first name_hash is less
- * than the find name_hash.
- */
- BUG_ON(bh->b_blocknr != p_blkno);
- lower_bh = bh;
- bh = NULL;
+ if (!lower_blkno)
+ lower_blkno = p_blkno;
+
+ /* This should be in cache - we just read it during the search */
+ ret = ocfs2_read_xattr_bucket(xs->bucket, lower_blkno);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
}
- xs->bucket.bhs[0] = lower_bh;
- xs->bucket.xh = (struct ocfs2_xattr_header *)
- xs->bucket.bhs[0]->b_data;
- lower_bh = NULL;
- xs->header = xs->bucket.xh;
- xs->base = xs->bucket.bhs[0]->b_data;
+ xs->header = bucket_xh(xs->bucket);
+ xs->base = bucket_block(xs->bucket, 0);
xs->end = xs->base + inode->i_sb->s_blocksize;
if (found) {
- /*
- * If we have found the xattr enty, read all the blocks in
- * this bucket.
- */
- ret = ocfs2_read_blocks(inode, xs->bucket.bhs[0]->b_blocknr + 1,
- blk_per_bucket - 1, &xs->bucket.bhs[1],
- 0);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
-
xs->here = &xs->header->xh_entries[index];
mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name,
- (unsigned long long)xs->bucket.bhs[0]->b_blocknr, index);
+ (unsigned long long)bucket_blkno(xs->bucket), index);
} else
ret = -ENODATA;
out:
- brelse(bh);
- brelse(lower_bh);
+ ocfs2_xattr_bucket_free(search);
return ret;
}
@@ -2357,53 +3115,50 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode,
xattr_bucket_func *func,
void *para)
{
- int i, j, ret = 0;
- int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+ int i, ret = 0;
u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb));
u32 num_buckets = clusters * bpc;
- struct ocfs2_xattr_bucket bucket;
+ struct ocfs2_xattr_bucket *bucket;
- memset(&bucket, 0, sizeof(bucket));
+ bucket = ocfs2_xattr_bucket_new(inode);
+ if (!bucket) {
+ mlog_errno(-ENOMEM);
+ return -ENOMEM;
+ }
mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n",
clusters, (unsigned long long)blkno);
- for (i = 0; i < num_buckets; i++, blkno += blk_per_bucket) {
- ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket,
- bucket.bhs, 0);
+ for (i = 0; i < num_buckets; i++, blkno += bucket->bu_blocks) {
+ ret = ocfs2_read_xattr_bucket(bucket, blkno);
if (ret) {
mlog_errno(ret);
- goto out;
+ break;
}
- bucket.xh = (struct ocfs2_xattr_header *)bucket.bhs[0]->b_data;
/*
* The real bucket num in this series of blocks is stored
* in the 1st bucket.
*/
if (i == 0)
- num_buckets = le16_to_cpu(bucket.xh->xh_num_buckets);
+ num_buckets = le16_to_cpu(bucket_xh(bucket)->xh_num_buckets);
mlog(0, "iterating xattr bucket %llu, first hash %u\n",
(unsigned long long)blkno,
- le32_to_cpu(bucket.xh->xh_entries[0].xe_name_hash));
+ le32_to_cpu(bucket_xh(bucket)->xh_entries[0].xe_name_hash));
if (func) {
- ret = func(inode, &bucket, para);
- if (ret) {
+ ret = func(inode, bucket, para);
+ if (ret)
mlog_errno(ret);
- break;
- }
+ /* Fall through to bucket_relse() */
}
- for (j = 0; j < blk_per_bucket; j++)
- brelse(bucket.bhs[j]);
- memset(&bucket, 0, sizeof(bucket));
+ ocfs2_xattr_bucket_relse(bucket);
+ if (ret)
+ break;
}
-out:
- for (j = 0; j < blk_per_bucket; j++)
- brelse(bucket.bhs[j]);
-
+ ocfs2_xattr_bucket_free(bucket);
return ret;
}
@@ -2441,21 +3196,21 @@ static int ocfs2_list_xattr_bucket(struct inode *inode,
int i, block_off, new_offset;
const char *prefix, *name;
- for (i = 0 ; i < le16_to_cpu(bucket->xh->xh_count); i++) {
- struct ocfs2_xattr_entry *entry = &bucket->xh->xh_entries[i];
+ for (i = 0 ; i < le16_to_cpu(bucket_xh(bucket)->xh_count); i++) {
+ struct ocfs2_xattr_entry *entry = &bucket_xh(bucket)->xh_entries[i];
type = ocfs2_xattr_get_type(entry);
prefix = ocfs2_xattr_prefix(type);
if (prefix) {
ret = ocfs2_xattr_bucket_get_name_value(inode,
- bucket->xh,
+ bucket_xh(bucket),
i,
&block_off,
&new_offset);
if (ret)
break;
- name = (const char *)bucket->bhs[block_off]->b_data +
+ name = (const char *)bucket_block(bucket, block_off) +
new_offset;
ret = ocfs2_xattr_list_entry(xl->buffer,
xl->buffer_size,
@@ -2540,32 +3295,34 @@ static void swap_xe(void *a, void *b, int size)
/*
* When the ocfs2_xattr_block is filled up, new bucket will be created
* and all the xattr entries will be moved to the new bucket.
+ * The header goes at the start of the bucket, and the names+values are
+ * filled from the end. This is why *target starts as the last buffer.
* Note: we need to sort the entries since they are not saved in order
* in the ocfs2_xattr_block.
*/
static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
struct buffer_head *xb_bh,
- struct buffer_head *xh_bh,
- struct buffer_head *data_bh)
+ struct ocfs2_xattr_bucket *bucket)
{
int i, blocksize = inode->i_sb->s_blocksize;
+ int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
u16 offset, size, off_change;
struct ocfs2_xattr_entry *xe;
struct ocfs2_xattr_block *xb =
(struct ocfs2_xattr_block *)xb_bh->b_data;
struct ocfs2_xattr_header *xb_xh = &xb->xb_attrs.xb_header;
- struct ocfs2_xattr_header *xh =
- (struct ocfs2_xattr_header *)xh_bh->b_data;
+ struct ocfs2_xattr_header *xh = bucket_xh(bucket);
u16 count = le16_to_cpu(xb_xh->xh_count);
- char *target = xh_bh->b_data, *src = xb_bh->b_data;
+ char *src = xb_bh->b_data;
+ char *target = bucket_block(bucket, blks - 1);
mlog(0, "cp xattr from block %llu to bucket %llu\n",
(unsigned long long)xb_bh->b_blocknr,
- (unsigned long long)xh_bh->b_blocknr);
+ (unsigned long long)bucket_blkno(bucket));
+
+ for (i = 0; i < blks; i++)
+ memset(bucket_block(bucket, i), 0, blocksize);
- memset(xh_bh->b_data, 0, blocksize);
- if (data_bh)
- memset(data_bh->b_data, 0, blocksize);
/*
* Since the xe_name_offset is based on ocfs2_xattr_header,
* there is a offset change corresponding to the change of
@@ -2577,8 +3334,6 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
size = blocksize - offset;
/* copy all the names and values. */
- if (data_bh)
- target = data_bh->b_data;
memcpy(target + offset, src + offset, size);
/* Init new header now. */
@@ -2588,7 +3343,7 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE - size);
/* copy all the entries. */
- target = xh_bh->b_data;
+ target = bucket_block(bucket, 0);
offset = offsetof(struct ocfs2_xattr_header, xh_entries);
size = count * sizeof(struct ocfs2_xattr_entry);
memcpy(target + offset, (char *)xb_xh + offset, size);
@@ -2614,73 +3369,47 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode,
* While if the entry is in index b-tree, "bucket" indicates the
* real place of the xattr.
*/
-static int ocfs2_xattr_update_xattr_search(struct inode *inode,
- struct ocfs2_xattr_search *xs,
- struct buffer_head *old_bh,
- struct buffer_head *new_bh)
+static void ocfs2_xattr_update_xattr_search(struct inode *inode,
+ struct ocfs2_xattr_search *xs,
+ struct buffer_head *old_bh)
{
- int ret = 0;
char *buf = old_bh->b_data;
struct ocfs2_xattr_block *old_xb = (struct ocfs2_xattr_block *)buf;
struct ocfs2_xattr_header *old_xh = &old_xb->xb_attrs.xb_header;
- int i, blocksize = inode->i_sb->s_blocksize;
- u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-
- xs->bucket.bhs[0] = new_bh;
- get_bh(new_bh);
- xs->bucket.xh = (struct ocfs2_xattr_header *)xs->bucket.bhs[0]->b_data;
- xs->header = xs->bucket.xh;
+ int i;
- xs->base = new_bh->b_data;
+ xs->header = bucket_xh(xs->bucket);
+ xs->base = bucket_block(xs->bucket, 0);
xs->end = xs->base + inode->i_sb->s_blocksize;
- if (!xs->not_found) {
- if (OCFS2_XATTR_BUCKET_SIZE != blocksize) {
- ret = ocfs2_read_blocks(inode,
- xs->bucket.bhs[0]->b_blocknr + 1,
- blk_per_bucket - 1, &xs->bucket.bhs[1],
- 0);
- if (ret) {
- mlog_errno(ret);
- return ret;
- }
-
- }
- i = xs->here - old_xh->xh_entries;
- xs->here = &xs->header->xh_entries[i];
- }
+ if (xs->not_found)
+ return;
- return ret;
+ i = xs->here - old_xh->xh_entries;
+ xs->here = &xs->header->xh_entries[i];
}
static int ocfs2_xattr_create_index_block(struct inode *inode,
- struct ocfs2_xattr_search *xs)
+ struct ocfs2_xattr_search *xs,
+ struct ocfs2_xattr_set_ctxt *ctxt)
{
- int ret, credits = OCFS2_SUBALLOC_ALLOC;
+ int ret;
u32 bit_off, len;
u64 blkno;
- handle_t *handle;
+ handle_t *handle = ctxt->handle;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_inode_info *oi = OCFS2_I(inode);
- struct ocfs2_alloc_context *data_ac;
- struct buffer_head *xh_bh = NULL, *data_bh = NULL;
struct buffer_head *xb_bh = xs->xattr_bh;
struct ocfs2_xattr_block *xb =
(struct ocfs2_xattr_block *)xb_bh->b_data;
struct ocfs2_xattr_tree_root *xr;
u16 xb_flags = le16_to_cpu(xb->xb_flags);
- u16 bpb = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
mlog(0, "create xattr index block for %llu\n",
(unsigned long long)xb_bh->b_blocknr);
BUG_ON(xb_flags & OCFS2_XATTR_INDEXED);
-
- ret = ocfs2_reserve_clusters(osb, 1, &data_ac);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
+ BUG_ON(!xs->bucket);
/*
* XXX:
@@ -2689,29 +3418,18 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
*/
down_write(&oi->ip_alloc_sem);
- /*
- * 3 more credits, one for xattr block update, one for the 1st block
- * of the new xattr bucket and one for the value/data.
- */
- credits += 3;
- handle = ocfs2_start_trans(osb, credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- mlog_errno(ret);
- goto out_sem;
- }
-
- ret = ocfs2_journal_access(handle, inode, xb_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_xb(handle, inode, xb_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
- goto out_commit;
+ goto out;
}
- ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len);
+ ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac,
+ 1, 1, &bit_off, &len);
if (ret) {
mlog_errno(ret);
- goto out_commit;
+ goto out;
}
/*
@@ -2724,51 +3442,23 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
mlog(0, "allocate 1 cluster from %llu to xattr block\n",
(unsigned long long)blkno);
- xh_bh = sb_getblk(inode->i_sb, blkno);
- if (!xh_bh) {
- ret = -EIO;
+ ret = ocfs2_init_xattr_bucket(xs->bucket, blkno);
+ if (ret) {
mlog_errno(ret);
- goto out_commit;
+ goto out;
}
- ocfs2_set_new_buffer_uptodate(inode, xh_bh);
-
- ret = ocfs2_journal_access(handle, inode, xh_bh,
- OCFS2_JOURNAL_ACCESS_CREATE);
+ ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
+ OCFS2_JOURNAL_ACCESS_CREATE);
if (ret) {
mlog_errno(ret);
- goto out_commit;
- }
-
- if (bpb > 1) {
- data_bh = sb_getblk(inode->i_sb, blkno + bpb - 1);
- if (!data_bh) {
- ret = -EIO;
- mlog_errno(ret);
- goto out_commit;
- }
-
- ocfs2_set_new_buffer_uptodate(inode, data_bh);
-
- ret = ocfs2_journal_access(handle, inode, data_bh,
- OCFS2_JOURNAL_ACCESS_CREATE);
- if (ret) {
- mlog_errno(ret);
- goto out_commit;
- }
+ goto out;
}
- ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xh_bh, data_bh);
+ ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xs->bucket);
+ ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
- ocfs2_journal_dirty(handle, xh_bh);
- if (data_bh)
- ocfs2_journal_dirty(handle, data_bh);
-
- ret = ocfs2_xattr_update_xattr_search(inode, xs, xb_bh, xh_bh);
- if (ret) {
- mlog_errno(ret);
- goto out_commit;
- }
+ ocfs2_xattr_update_xattr_search(inode, xs, xb_bh);
/* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */
memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize -
@@ -2787,24 +3477,10 @@ static int ocfs2_xattr_create_index_block(struct inode *inode,
xb->xb_flags = cpu_to_le16(xb_flags | OCFS2_XATTR_INDEXED);
- ret = ocfs2_journal_dirty(handle, xb_bh);
- if (ret) {
- mlog_errno(ret);
- goto out_commit;
- }
-
-out_commit:
- ocfs2_commit_trans(osb, handle);
-
-out_sem:
- up_write(&oi->ip_alloc_sem);
+ ocfs2_journal_dirty(handle, xb_bh);
out:
- if (data_ac)
- ocfs2_free_alloc_context(data_ac);
-
- brelse(xh_bh);
- brelse(data_bh);
+ up_write(&oi->ip_alloc_sem);
return ret;
}
@@ -2829,29 +3505,18 @@ static int cmp_xe_offset(const void *a, const void *b)
* so that we can spare some space for insertion.
*/
static int ocfs2_defrag_xattr_bucket(struct inode *inode,
+ handle_t *handle,
struct ocfs2_xattr_bucket *bucket)
{
int ret, i;
size_t end, offset, len, value_len;
struct ocfs2_xattr_header *xh;
char *entries, *buf, *bucket_buf = NULL;
- u64 blkno = bucket->bhs[0]->b_blocknr;
- u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+ u64 blkno = bucket_blkno(bucket);
u16 xh_free_start;
size_t blocksize = inode->i_sb->s_blocksize;
- handle_t *handle;
- struct buffer_head **bhs;
struct ocfs2_xattr_entry *xe;
- bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
- GFP_NOFS);
- if (!bhs)
- return -ENOMEM;
-
- ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket, bhs, 0);
- if (ret)
- goto out;
-
/*
* In order to make the operation more efficient and generic,
* we copy all the blocks into a contiguous memory and do the
@@ -2865,26 +3530,16 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
}
buf = bucket_buf;
- for (i = 0; i < blk_per_bucket; i++, buf += blocksize)
- memcpy(buf, bhs[i]->b_data, blocksize);
+ for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize)
+ memcpy(buf, bucket_block(bucket, i), blocksize);
- handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), blk_per_bucket);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- handle = NULL;
+ ret = ocfs2_xattr_bucket_journal_access(handle, bucket,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret < 0) {
mlog_errno(ret);
goto out;
}
- for (i = 0; i < blk_per_bucket; i++) {
- ret = ocfs2_journal_access(handle, inode, bhs[i],
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (ret < 0) {
- mlog_errno(ret);
- goto commit;
- }
- }
-
xh = (struct ocfs2_xattr_header *)bucket_buf;
entries = (char *)xh->xh_entries;
xh_free_start = le16_to_cpu(xh->xh_free_start);
@@ -2940,7 +3595,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
"bucket %llu\n", (unsigned long long)blkno);
if (xh_free_start == end)
- goto commit;
+ goto out;
memset(bucket_buf + xh_free_start, 0, end - xh_free_start);
xh->xh_free_start = cpu_to_le16(end);
@@ -2951,169 +3606,94 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode,
cmp_xe, swap_xe);
buf = bucket_buf;
- for (i = 0; i < blk_per_bucket; i++, buf += blocksize) {
- memcpy(bhs[i]->b_data, buf, blocksize);
- ocfs2_journal_dirty(handle, bhs[i]);
- }
+ for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize)
+ memcpy(bucket_block(bucket, i), buf, blocksize);
+ ocfs2_xattr_bucket_journal_dirty(handle, bucket);
-commit:
- ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
out:
-
- if (bhs) {
- for (i = 0; i < blk_per_bucket; i++)
- brelse(bhs[i]);
- }
- kfree(bhs);
-
kfree(bucket_buf);
return ret;
}
/*
- * Move half nums of the xattr bucket in the previous cluster to this new
- * cluster. We only touch the last cluster of the previous extend record.
+ * prev_blkno points to the start of an existing extent. new_blkno
+ * points to a newly allocated extent. Because we know each of our
+ * clusters contains more than one bucket, we can easily split one cluster
+ * at a bucket boundary. So we take the last cluster of the existing
+ * extent and split it down the middle. We move the last half of the
+ * buckets in the last cluster of the existing extent over to the new
+ * extent.
+ *
+ * first is the bucket at the start of the existing extent so we can
+ * update that extent's bucket count. target is the bucket where we
+ * were hoping to insert our xattr. If the bucket move places the
+ * target in the new extent, we'll update first and target after
+ * modifying the old extent.
*
- * first_bh is the first buffer_head of a series of bucket in the same
- * extent rec and header_bh is the header of one bucket in this cluster.
- * They will be updated if we move the data header_bh contains to the new
- * cluster. first_hash will be set as the 1st xe's name_hash of the new cluster.
+ * first_hash will be set as the 1st xe's name_hash in the new extent.
*/
static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode,
handle_t *handle,
- struct buffer_head **first_bh,
- struct buffer_head **header_bh,
+ struct ocfs2_xattr_bucket *first,
+ struct ocfs2_xattr_bucket *target,
u64 new_blkno,
- u64 prev_blkno,
u32 num_clusters,
u32 *first_hash)
{
- int i, ret, credits;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
- int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
- int blocksize = inode->i_sb->s_blocksize;
- struct buffer_head *old_bh, *new_bh, *prev_bh, *new_first_bh = NULL;
- struct ocfs2_xattr_header *new_xh;
- struct ocfs2_xattr_header *xh =
- (struct ocfs2_xattr_header *)((*first_bh)->b_data);
-
- BUG_ON(le16_to_cpu(xh->xh_num_buckets) < num_buckets);
- BUG_ON(OCFS2_XATTR_BUCKET_SIZE == osb->s_clustersize);
-
- prev_bh = *first_bh;
- get_bh(prev_bh);
- xh = (struct ocfs2_xattr_header *)prev_bh->b_data;
+ int ret;
+ struct super_block *sb = inode->i_sb;
+ int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(sb);
+ int num_buckets = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb));
+ int to_move = num_buckets / 2;
+ u64 src_blkno;
+ u64 last_cluster_blkno = bucket_blkno(first) +
+ ((num_clusters - 1) * ocfs2_clusters_to_blocks(sb, 1));
- prev_blkno += (num_clusters - 1) * bpc + bpc / 2;
+ BUG_ON(le16_to_cpu(bucket_xh(first)->xh_num_buckets) < num_buckets);
+ BUG_ON(OCFS2_XATTR_BUCKET_SIZE == OCFS2_SB(sb)->s_clustersize);
mlog(0, "move half of xattrs in cluster %llu to %llu\n",
- (unsigned long long)prev_blkno, (unsigned long long)new_blkno);
+ (unsigned long long)last_cluster_blkno, (unsigned long long)new_blkno);
- /*
- * We need to update the 1st half of the new cluster and
- * 1 more for the update of the 1st bucket of the previous
- * extent record.
- */
- credits = bpc / 2 + 1;
- ret = ocfs2_extend_trans(handle, credits);
+ ret = ocfs2_mv_xattr_buckets(inode, handle, bucket_blkno(first),
+ last_cluster_blkno, new_blkno,
+ to_move, first_hash);
if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_journal_access(handle, inode, prev_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
+ /* This is the first bucket that got moved */
+ src_blkno = last_cluster_blkno + (to_move * blks_per_bucket);
- for (i = 0; i < bpc / 2; i++, prev_blkno++, new_blkno++) {
- old_bh = new_bh = NULL;
- new_bh = sb_getblk(inode->i_sb, new_blkno);
- if (!new_bh) {
- ret = -EIO;
- mlog_errno(ret);
- goto out;
- }
+ /*
+ * If the target bucket was part of the moved buckets, we need to
+ * update first and target.
+ */
+ if (bucket_blkno(target) >= src_blkno) {
+ /* Find the block for the new target bucket */
+ src_blkno = new_blkno +
+ (bucket_blkno(target) - src_blkno);
- ocfs2_set_new_buffer_uptodate(inode, new_bh);
+ ocfs2_xattr_bucket_relse(first);
+ ocfs2_xattr_bucket_relse(target);
- ret = ocfs2_journal_access(handle, inode, new_bh,
- OCFS2_JOURNAL_ACCESS_CREATE);
- if (ret < 0) {
+ /*
+ * These shouldn't fail - the buffers are in the
+ * journal from ocfs2_cp_xattr_bucket().
+ */
+ ret = ocfs2_read_xattr_bucket(first, new_blkno);
+ if (ret) {
mlog_errno(ret);
- brelse(new_bh);
goto out;
}
-
- ret = ocfs2_read_block(inode, prev_blkno, &old_bh);
- if (ret < 0) {
+ ret = ocfs2_read_xattr_bucket(target, src_blkno);
+ if (ret)
mlog_errno(ret);
- brelse(new_bh);
- goto out;
- }
- memcpy(new_bh->b_data, old_bh->b_data, blocksize);
-
- if (i == 0) {
- new_xh = (struct ocfs2_xattr_header *)new_bh->b_data;
- new_xh->xh_num_buckets = cpu_to_le16(num_buckets / 2);
-
- if (first_hash)
- *first_hash = le32_to_cpu(
- new_xh->xh_entries[0].xe_name_hash);
- new_first_bh = new_bh;
- get_bh(new_first_bh);
- }
-
- ocfs2_journal_dirty(handle, new_bh);
-
- if (*header_bh == old_bh) {
- brelse(*header_bh);
- *header_bh = new_bh;
- get_bh(*header_bh);
-
- brelse(*first_bh);
- *first_bh = new_first_bh;
- get_bh(*first_bh);
- }
- brelse(new_bh);
- brelse(old_bh);
}
- le16_add_cpu(&xh->xh_num_buckets, -(num_buckets / 2));
-
- ocfs2_journal_dirty(handle, prev_bh);
out:
- brelse(prev_bh);
- brelse(new_first_bh);
- return ret;
-}
-
-static int ocfs2_read_xattr_bucket(struct inode *inode,
- u64 blkno,
- struct buffer_head **bhs,
- int new)
-{
- int ret = 0;
- u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
-
- if (!new)
- return ocfs2_read_blocks(inode, blkno,
- blk_per_bucket, bhs, 0);
-
- for (i = 0; i < blk_per_bucket; i++) {
- bhs[i] = sb_getblk(inode->i_sb, blkno + i);
- if (bhs[i] == NULL) {
- ret = -EIO;
- mlog_errno(ret);
- break;
- }
- ocfs2_set_new_buffer_uptodate(inode, bhs[i]);
- }
-
return ret;
}
@@ -3178,8 +3758,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
{
int ret, i;
int count, start, len, name_value_len = 0, xe_len, name_offset = 0;
- u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
- struct buffer_head **s_bhs, **t_bhs = NULL;
+ struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL;
struct ocfs2_xattr_header *xh;
struct ocfs2_xattr_entry *xe;
int blocksize = inode->i_sb->s_blocksize;
@@ -3187,47 +3766,52 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
mlog(0, "move some of xattrs from bucket %llu to %llu\n",
(unsigned long long)blk, (unsigned long long)new_blk);
- s_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS);
- if (!s_bhs)
- return -ENOMEM;
-
- ret = ocfs2_read_xattr_bucket(inode, blk, s_bhs, 0);
- if (ret) {
+ s_bucket = ocfs2_xattr_bucket_new(inode);
+ t_bucket = ocfs2_xattr_bucket_new(inode);
+ if (!s_bucket || !t_bucket) {
+ ret = -ENOMEM;
mlog_errno(ret);
goto out;
}
- ret = ocfs2_journal_access(handle, inode, s_bhs[0],
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_read_xattr_bucket(s_bucket, blk);
if (ret) {
mlog_errno(ret);
goto out;
}
- t_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS);
- if (!t_bhs) {
- ret = -ENOMEM;
+ ret = ocfs2_xattr_bucket_journal_access(handle, s_bucket,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
goto out;
}
- ret = ocfs2_read_xattr_bucket(inode, new_blk, t_bhs, new_bucket_head);
+ /*
+ * Even if !new_bucket_head, we're overwriting t_bucket. Thus,
+ * there's no need to read it.
+ */
+ ret = ocfs2_init_xattr_bucket(t_bucket, new_blk);
if (ret) {
mlog_errno(ret);
goto out;
}
- for (i = 0; i < blk_per_bucket; i++) {
- ret = ocfs2_journal_access(handle, inode, t_bhs[i],
- new_bucket_head ?
- OCFS2_JOURNAL_ACCESS_CREATE :
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
+ /*
+ * Hey, if we're overwriting t_bucket, what difference does
+ * ACCESS_CREATE vs ACCESS_WRITE make? See the comment in the
+ * same part of ocfs2_cp_xattr_bucket().
+ */
+ ret = ocfs2_xattr_bucket_journal_access(handle, t_bucket,
+ new_bucket_head ?
+ OCFS2_JOURNAL_ACCESS_CREATE :
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
}
- xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data;
+ xh = bucket_xh(s_bucket);
count = le16_to_cpu(xh->xh_count);
start = ocfs2_xattr_find_divide_pos(xh);
@@ -3239,10 +3823,10 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
* The hash value is set as one larger than
* that of the last entry in the previous bucket.
*/
- for (i = 0; i < blk_per_bucket; i++)
- memset(t_bhs[i]->b_data, 0, blocksize);
+ for (i = 0; i < t_bucket->bu_blocks; i++)
+ memset(bucket_block(t_bucket, i), 0, blocksize);
- xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data;
+ xh = bucket_xh(t_bucket);
xh->xh_free_start = cpu_to_le16(blocksize);
xh->xh_entries[0].xe_name_hash = xe->xe_name_hash;
le32_add_cpu(&xh->xh_entries[0].xe_name_hash, 1);
@@ -3251,11 +3835,10 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode,
}
/* copy the whole bucket to the new first. */
- for (i = 0; i < blk_per_bucket; i++)
- memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
+ ocfs2_xattr_bucket_copy_data(t_bucket, s_bucket);
/* update the new bucket. */
- xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data;
+ xh = bucket_xh(t_bucket);
/*
* Calculate the total name/value len and xh_free_start for
@@ -3319,11 +3902,7 @@ set_num_buckets:
else
xh->xh_num_buckets = 0;
- for (i = 0; i < blk_per_bucket; i++) {
- ocfs2_journal_dirty(handle, t_bhs[i]);
- if (ret)
- mlog_errno(ret);
- }
+ ocfs2_xattr_bucket_journal_dirty(handle, t_bucket);
/* store the first_hash of the new bucket. */
if (first_hash)
@@ -3337,29 +3916,18 @@ set_num_buckets:
if (start == count)
goto out;
- xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data;
+ xh = bucket_xh(s_bucket);
memset(&xh->xh_entries[start], 0,
sizeof(struct ocfs2_xattr_entry) * (count - start));
xh->xh_count = cpu_to_le16(start);
xh->xh_free_start = cpu_to_le16(name_offset);
xh->xh_name_value_len = cpu_to_le16(name_value_len);
- ocfs2_journal_dirty(handle, s_bhs[0]);
- if (ret)
- mlog_errno(ret);
+ ocfs2_xattr_bucket_journal_dirty(handle, s_bucket);
out:
- if (s_bhs) {
- for (i = 0; i < blk_per_bucket; i++)
- brelse(s_bhs[i]);
- }
- kfree(s_bhs);
-
- if (t_bhs) {
- for (i = 0; i < blk_per_bucket; i++)
- brelse(t_bhs[i]);
- }
- kfree(t_bhs);
+ ocfs2_xattr_bucket_free(s_bucket);
+ ocfs2_xattr_bucket_free(t_bucket);
return ret;
}
@@ -3376,10 +3944,8 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
u64 t_blkno,
int t_is_new)
{
- int ret, i;
- int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
- int blocksize = inode->i_sb->s_blocksize;
- struct buffer_head **s_bhs, **t_bhs = NULL;
+ int ret;
+ struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL;
BUG_ON(s_blkno == t_blkno);
@@ -3387,92 +3953,115 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode,
(unsigned long long)s_blkno, (unsigned long long)t_blkno,
t_is_new);
- s_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
- GFP_NOFS);
- if (!s_bhs)
- return -ENOMEM;
+ s_bucket = ocfs2_xattr_bucket_new(inode);
+ t_bucket = ocfs2_xattr_bucket_new(inode);
+ if (!s_bucket || !t_bucket) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
- ret = ocfs2_read_xattr_bucket(inode, s_blkno, s_bhs, 0);
+ ret = ocfs2_read_xattr_bucket(s_bucket, s_blkno);
if (ret)
goto out;
- t_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket,
- GFP_NOFS);
- if (!t_bhs) {
- ret = -ENOMEM;
+ /*
+ * Even if !t_is_new, we're overwriting t_bucket. Thus,
+ * there's no need to read it.
+ */
+ ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno);
+ if (ret)
goto out;
- }
- ret = ocfs2_read_xattr_bucket(inode, t_blkno, t_bhs, t_is_new);
+ /*
+ * Hey, if we're overwriting t_bucket, what difference does
+ * ACCESS_CREATE vs ACCESS_WRITE make? Well, if we allocated a new
+ * cluster to fill, we came here from
+ * ocfs2_mv_xattr_buckets(), and it is really new -
+ * ACCESS_CREATE is required. But we also might have moved data
+ * out of t_bucket before extending back into it.
+ * ocfs2_add_new_xattr_bucket() can do this - its call to
+ * ocfs2_add_new_xattr_cluster() may have created a new extent
+ * and copied out the end of the old extent. Then it re-extends
+ * the old extent back to create space for new xattrs. That's
+ * how we get here, and the bucket isn't really new.
+ */
+ ret = ocfs2_xattr_bucket_journal_access(handle, t_bucket,
+ t_is_new ?
+ OCFS2_JOURNAL_ACCESS_CREATE :
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret)
goto out;
- for (i = 0; i < blk_per_bucket; i++) {
- ret = ocfs2_journal_access(handle, inode, t_bhs[i],
- t_is_new ?
- OCFS2_JOURNAL_ACCESS_CREATE :
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (ret)
- goto out;
- }
-
- for (i = 0; i < blk_per_bucket; i++) {
- memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize);
- ocfs2_journal_dirty(handle, t_bhs[i]);
- }
+ ocfs2_xattr_bucket_copy_data(t_bucket, s_bucket);
+ ocfs2_xattr_bucket_journal_dirty(handle, t_bucket);
out:
- if (s_bhs) {
- for (i = 0; i < blk_per_bucket; i++)
- brelse(s_bhs[i]);
- }
- kfree(s_bhs);
-
- if (t_bhs) {
- for (i = 0; i < blk_per_bucket; i++)
- brelse(t_bhs[i]);
- }
- kfree(t_bhs);
+ ocfs2_xattr_bucket_free(t_bucket);
+ ocfs2_xattr_bucket_free(s_bucket);
return ret;
}
/*
- * Copy one xattr cluster from src_blk to to_blk.
- * The to_blk will become the first bucket header of the cluster, so its
- * xh_num_buckets will be initialized as the bucket num in the cluster.
+ * src_blk points to the start of an existing extent. last_blk points to
+ * last cluster in that extent. to_blk points to a newly allocated
+ * extent. We copy the buckets from the cluster at last_blk to the new
+ * extent. If start_bucket is non-zero, we skip that many buckets before
+ * we start copying. The new extent's xh_num_buckets gets set to the
+ * number of buckets we copied. The old extent's xh_num_buckets shrinks
+ * by the same amount.
*/
-static int ocfs2_cp_xattr_cluster(struct inode *inode,
- handle_t *handle,
- struct buffer_head *first_bh,
- u64 src_blk,
- u64 to_blk,
+static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle,
+ u64 src_blk, u64 last_blk, u64 to_blk,
+ unsigned int start_bucket,
u32 *first_hash)
{
int i, ret, credits;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+ int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
int num_buckets = ocfs2_xattr_buckets_per_cluster(osb);
- struct buffer_head *bh = NULL;
- struct ocfs2_xattr_header *xh;
- u64 to_blk_start = to_blk;
+ struct ocfs2_xattr_bucket *old_first, *new_first;
+
+ mlog(0, "mv xattrs from cluster %llu to %llu\n",
+ (unsigned long long)last_blk, (unsigned long long)to_blk);
+
+ BUG_ON(start_bucket >= num_buckets);
+ if (start_bucket) {
+ num_buckets -= start_bucket;
+ last_blk += (start_bucket * blks_per_bucket);
+ }
+
+ /* The first bucket of the original extent */
+ old_first = ocfs2_xattr_bucket_new(inode);
+ /* The first bucket of the new extent */
+ new_first = ocfs2_xattr_bucket_new(inode);
+ if (!old_first || !new_first) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
- mlog(0, "cp xattrs from cluster %llu to %llu\n",
- (unsigned long long)src_blk, (unsigned long long)to_blk);
+ ret = ocfs2_read_xattr_bucket(old_first, src_blk);
+ if (ret) {
+ mlog_errno(ret);
+ goto out;
+ }
/*
- * We need to update the new cluster and 1 more for the update of
- * the 1st bucket of the previous extent rec.
+ * We need to update the first bucket of the old extent and all
+ * the buckets going to the new extent.
*/
- credits = bpc + 1;
+ credits = ((num_buckets + 1) * blks_per_bucket) +
+ handle->h_buffer_credits;
ret = ocfs2_extend_trans(handle, credits);
if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_journal_access(handle, inode, first_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_xattr_bucket_journal_access(handle, old_first,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out;
@@ -3480,45 +4069,45 @@ static int ocfs2_cp_xattr_cluster(struct inode *inode,
for (i = 0; i < num_buckets; i++) {
ret = ocfs2_cp_xattr_bucket(inode, handle,
- src_blk, to_blk, 1);
+ last_blk + (i * blks_per_bucket),
+ to_blk + (i * blks_per_bucket),
+ 1);
if (ret) {
mlog_errno(ret);
goto out;
}
-
- src_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
- to_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb);
}
- /* update the old bucket header. */
- xh = (struct ocfs2_xattr_header *)first_bh->b_data;
- le16_add_cpu(&xh->xh_num_buckets, -num_buckets);
-
- ocfs2_journal_dirty(handle, first_bh);
-
- /* update the new bucket header. */
- ret = ocfs2_read_block(inode, to_blk_start, &bh);
- if (ret < 0) {
+ /*
+ * Get the new bucket ready before we dirty anything
+ * (This actually shouldn't fail, because we already dirtied
+ * it once in ocfs2_cp_xattr_bucket()).
+ */
+ ret = ocfs2_read_xattr_bucket(new_first, to_blk);
+ if (ret) {
mlog_errno(ret);
goto out;
}
-
- ret = ocfs2_journal_access(handle, inode, bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_xattr_bucket_journal_access(handle, new_first,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out;
}
- xh = (struct ocfs2_xattr_header *)bh->b_data;
- xh->xh_num_buckets = cpu_to_le16(num_buckets);
+ /* Now update the headers */
+ le16_add_cpu(&bucket_xh(old_first)->xh_num_buckets, -num_buckets);
+ ocfs2_xattr_bucket_journal_dirty(handle, old_first);
- ocfs2_journal_dirty(handle, bh);
+ bucket_xh(new_first)->xh_num_buckets = cpu_to_le16(num_buckets);
+ ocfs2_xattr_bucket_journal_dirty(handle, new_first);
if (first_hash)
- *first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
+ *first_hash = le32_to_cpu(bucket_xh(new_first)->xh_entries[0].xe_name_hash);
+
out:
- brelse(bh);
+ ocfs2_xattr_bucket_free(new_first);
+ ocfs2_xattr_bucket_free(old_first);
return ret;
}
@@ -3534,7 +4123,7 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode,
u32 *first_hash)
{
u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
- int ret, credits = 2 * blk_per_bucket;
+ int ret, credits = 2 * blk_per_bucket + handle->h_buffer_credits;
BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize);
@@ -3577,43 +4166,49 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode,
*/
static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
handle_t *handle,
- struct buffer_head **first_bh,
- struct buffer_head **header_bh,
+ struct ocfs2_xattr_bucket *first,
+ struct ocfs2_xattr_bucket *target,
u64 new_blk,
- u64 prev_blk,
u32 prev_clusters,
u32 *v_start,
int *extend)
{
- int ret = 0;
- int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+ int ret;
mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n",
- (unsigned long long)prev_blk, prev_clusters,
+ (unsigned long long)bucket_blkno(first), prev_clusters,
(unsigned long long)new_blk);
- if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1)
+ if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) {
ret = ocfs2_mv_xattr_bucket_cross_cluster(inode,
handle,
- first_bh,
- header_bh,
+ first, target,
new_blk,
- prev_blk,
prev_clusters,
v_start);
- else {
- u64 last_blk = prev_blk + bpc * (prev_clusters - 1);
-
- if (prev_clusters > 1 && (*header_bh)->b_blocknr != last_blk)
- ret = ocfs2_cp_xattr_cluster(inode, handle, *first_bh,
- last_blk, new_blk,
+ if (ret)
+ mlog_errno(ret);
+ } else {
+ /* The start of the last cluster in the first extent */
+ u64 last_blk = bucket_blkno(first) +
+ ((prev_clusters - 1) *
+ ocfs2_clusters_to_blocks(inode->i_sb, 1));
+
+ if (prev_clusters > 1 && bucket_blkno(target) != last_blk) {
+ ret = ocfs2_mv_xattr_buckets(inode, handle,
+ bucket_blkno(first),
+ last_blk, new_blk, 0,
v_start);
- else {
+ if (ret)
+ mlog_errno(ret);
+ } else {
ret = ocfs2_divide_xattr_cluster(inode, handle,
last_blk, new_blk,
v_start);
+ if (ret)
+ mlog_errno(ret);
- if ((*header_bh)->b_blocknr == last_blk && extend)
+ if ((bucket_blkno(target) == last_blk) && extend)
*extend = 0;
}
}
@@ -3639,56 +4234,37 @@ static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode,
*/
static int ocfs2_add_new_xattr_cluster(struct inode *inode,
struct buffer_head *root_bh,
- struct buffer_head **first_bh,
- struct buffer_head **header_bh,
+ struct ocfs2_xattr_bucket *first,
+ struct ocfs2_xattr_bucket *target,
u32 *num_clusters,
u32 prev_cpos,
- u64 prev_blkno,
- int *extend)
+ int *extend,
+ struct ocfs2_xattr_set_ctxt *ctxt)
{
- int ret, credits;
+ int ret;
u16 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
u32 prev_clusters = *num_clusters;
u32 clusters_to_add = 1, bit_off, num_bits, v_start = 0;
u64 block;
- handle_t *handle = NULL;
- struct ocfs2_alloc_context *data_ac = NULL;
- struct ocfs2_alloc_context *meta_ac = NULL;
+ handle_t *handle = ctxt->handle;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_extent_tree et;
mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, "
"previous xattr blkno = %llu\n",
(unsigned long long)OCFS2_I(inode)->ip_blkno,
- prev_cpos, (unsigned long long)prev_blkno);
+ prev_cpos, (unsigned long long)bucket_blkno(first));
ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh);
- ret = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
- &data_ac, &meta_ac);
- if (ret) {
- mlog_errno(ret);
- goto leave;
- }
-
- credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el,
- clusters_to_add);
- handle = ocfs2_start_trans(osb, credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- handle = NULL;
- mlog_errno(ret);
- goto leave;
- }
-
- ret = ocfs2_journal_access(handle, inode, root_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_xb(handle, inode, root_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
goto leave;
}
- ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1,
+ ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac, 1,
clusters_to_add, &bit_off, &num_bits);
if (ret < 0) {
if (ret != -ENOSPC)
@@ -3702,7 +4278,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
mlog(0, "Allocating %u clusters at block %u for xattr in inode %llu\n",
num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
- if (prev_blkno + prev_clusters * bpc == block &&
+ if (bucket_blkno(first) + (prev_clusters * bpc) == block &&
(prev_clusters + num_bits) << osb->s_clustersize_bits <=
OCFS2_MAX_XATTR_TREE_LEAF_SIZE) {
/*
@@ -3721,10 +4297,9 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
} else {
ret = ocfs2_adjust_xattr_cross_cluster(inode,
handle,
- first_bh,
- header_bh,
+ first,
+ target,
block,
- prev_blkno,
prev_clusters,
&v_start,
extend);
@@ -3734,149 +4309,137 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode,
}
}
- if (handle->h_buffer_credits < credits) {
- /*
- * The journal has been restarted before, and don't
- * have enough space for the insertion, so extend it
- * here.
- */
- ret = ocfs2_extend_trans(handle, credits);
- if (ret) {
- mlog_errno(ret);
- goto leave;
- }
- }
mlog(0, "Insert %u clusters at block %llu for xattr at %u\n",
num_bits, (unsigned long long)block, v_start);
ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block,
- num_bits, 0, meta_ac);
+ num_bits, 0, ctxt->meta_ac);
if (ret < 0) {
mlog_errno(ret);
goto leave;
}
ret = ocfs2_journal_dirty(handle, root_bh);
- if (ret < 0) {
+ if (ret < 0)
mlog_errno(ret);
- goto leave;
- }
leave:
- if (handle)
- ocfs2_commit_trans(osb, handle);
- if (data_ac)
- ocfs2_free_alloc_context(data_ac);
- if (meta_ac)
- ocfs2_free_alloc_context(meta_ac);
-
return ret;
}
/*
- * Extend a new xattr bucket and move xattrs to the end one by one until
- * We meet with start_bh. Only move half of the xattrs to the bucket after it.
+ * We are given an extent. 'first' is the bucket at the very front of
+ * the extent. The extent has space for an additional bucket past
+ * bucket_xh(first)->xh_num_buckets. 'target_blk' is the block number
+ * of the target bucket. We wish to shift every bucket past the target
+ * down one, filling in that additional space. When we get back to the
+ * target, we split the target between itself and the now-empty bucket
+ * at target+1 (aka, target_blk + blk_per_bucket).
*/
static int ocfs2_extend_xattr_bucket(struct inode *inode,
- struct buffer_head *first_bh,
- struct buffer_head *start_bh,
+ handle_t *handle,
+ struct ocfs2_xattr_bucket *first,
+ u64 target_blk,
u32 num_clusters)
{
int ret, credits;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
- u64 start_blk = start_bh->b_blocknr, end_blk;
- u32 num_buckets = num_clusters * ocfs2_xattr_buckets_per_cluster(osb);
- handle_t *handle;
- struct ocfs2_xattr_header *first_xh =
- (struct ocfs2_xattr_header *)first_bh->b_data;
- u16 bucket = le16_to_cpu(first_xh->xh_num_buckets);
+ u64 end_blk;
+ u16 new_bucket = le16_to_cpu(bucket_xh(first)->xh_num_buckets);
mlog(0, "extend xattr bucket in %llu, xattr extend rec starting "
- "from %llu, len = %u\n", (unsigned long long)start_blk,
- (unsigned long long)first_bh->b_blocknr, num_clusters);
+ "from %llu, len = %u\n", (unsigned long long)target_blk,
+ (unsigned long long)bucket_blkno(first), num_clusters);
- BUG_ON(bucket >= num_buckets);
+ /* The extent must have room for an additional bucket */
+ BUG_ON(new_bucket >=
+ (num_clusters * ocfs2_xattr_buckets_per_cluster(osb)));
- end_blk = first_bh->b_blocknr + (bucket - 1) * blk_per_bucket;
+ /* end_blk points to the last existing bucket */
+ end_blk = bucket_blkno(first) + ((new_bucket - 1) * blk_per_bucket);
/*
- * We will touch all the buckets after the start_bh(include it).
- * Add one more bucket and modify the first_bh.
+ * end_blk is the start of the last existing bucket.
+ * Thus, (end_blk - target_blk) covers the target bucket and
+ * every bucket after it up to, but not including, the last
+ * existing bucket. Then we add the last existing bucket, the
+ * new bucket, and the first bucket (3 * blk_per_bucket).
*/
- credits = end_blk - start_blk + 2 * blk_per_bucket + 1;
- handle = ocfs2_start_trans(osb, credits);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- handle = NULL;
+ credits = (end_blk - target_blk) + (3 * blk_per_bucket) +
+ handle->h_buffer_credits;
+ ret = ocfs2_extend_trans(handle, credits);
+ if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_journal_access(handle, inode, first_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_xattr_bucket_journal_access(handle, first,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
- goto commit;
+ goto out;
}
- while (end_blk != start_blk) {
+ while (end_blk != target_blk) {
ret = ocfs2_cp_xattr_bucket(inode, handle, end_blk,
end_blk + blk_per_bucket, 0);
if (ret)
- goto commit;
+ goto out;
end_blk -= blk_per_bucket;
}
- /* Move half of the xattr in start_blk to the next bucket. */
- ret = ocfs2_divide_xattr_bucket(inode, handle, start_blk,
- start_blk + blk_per_bucket, NULL, 0);
+ /* Move half of the xattr in target_blkno to the next bucket. */
+ ret = ocfs2_divide_xattr_bucket(inode, handle, target_blk,
+ target_blk + blk_per_bucket, NULL, 0);
- le16_add_cpu(&first_xh->xh_num_buckets, 1);
- ocfs2_journal_dirty(handle, first_bh);
+ le16_add_cpu(&bucket_xh(first)->xh_num_buckets, 1);
+ ocfs2_xattr_bucket_journal_dirty(handle, first);
-commit:
- ocfs2_commit_trans(osb, handle);
out:
return ret;
}
/*
- * Add new xattr bucket in an extent record and adjust the buckets accordingly.
- * xb_bh is the ocfs2_xattr_block.
- * We will move all the buckets starting from header_bh to the next place. As
- * for this one, half num of its xattrs will be moved to the next one.
+ * Add new xattr bucket in an extent record and adjust the buckets
+ * accordingly. xb_bh is the ocfs2_xattr_block, and target is the
+ * bucket we want to insert into.
+ *
+ * In the easy case, we will move all the buckets after target down by
+ * one. Half of target's xattrs will be moved to the next bucket.
*
- * We will allocate a new cluster if current cluster is full and adjust
- * header_bh and first_bh if the insert place is moved to the new cluster.
+ * If current cluster is full, we'll allocate a new one. This may not
+ * be contiguous. The underlying calls will make sure that there is
+ * space for the insert, shifting buckets around if necessary.
+ * 'target' may be moved by those calls.
*/
static int ocfs2_add_new_xattr_bucket(struct inode *inode,
struct buffer_head *xb_bh,
- struct buffer_head *header_bh)
+ struct ocfs2_xattr_bucket *target,
+ struct ocfs2_xattr_set_ctxt *ctxt)
{
- struct ocfs2_xattr_header *first_xh = NULL;
- struct buffer_head *first_bh = NULL;
struct ocfs2_xattr_block *xb =
(struct ocfs2_xattr_block *)xb_bh->b_data;
struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root;
struct ocfs2_extent_list *el = &xb_root->xt_list;
- struct ocfs2_xattr_header *xh =
- (struct ocfs2_xattr_header *)header_bh->b_data;
- u32 name_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash);
- struct super_block *sb = inode->i_sb;
- struct ocfs2_super *osb = OCFS2_SB(sb);
+ u32 name_hash =
+ le32_to_cpu(bucket_xh(target)->xh_entries[0].xe_name_hash);
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
int ret, num_buckets, extend = 1;
u64 p_blkno;
u32 e_cpos, num_clusters;
+ /* The bucket at the front of the extent */
+ struct ocfs2_xattr_bucket *first;
- mlog(0, "Add new xattr bucket starting form %llu\n",
- (unsigned long long)header_bh->b_blocknr);
+ mlog(0, "Add new xattr bucket starting from %llu\n",
+ (unsigned long long)bucket_blkno(target));
- /*
- * Add refrence for header_bh here because it may be
- * changed in ocfs2_add_new_xattr_cluster and we need
- * to free it in the end.
- */
- get_bh(header_bh);
+ /* The first bucket of the original extent */
+ first = ocfs2_xattr_bucket_new(inode);
+ if (!first) {
+ ret = -ENOMEM;
+ mlog_errno(ret);
+ goto out;
+ }
ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &e_cpos,
&num_clusters, el);
@@ -3885,40 +4448,45 @@ static int ocfs2_add_new_xattr_bucket(struct inode *inode,
goto out;
}
- ret = ocfs2_read_block(inode, p_blkno, &first_bh);
+ ret = ocfs2_read_xattr_bucket(first, p_blkno);
if (ret) {
mlog_errno(ret);
goto out;
}
num_buckets = ocfs2_xattr_buckets_per_cluster(osb) * num_clusters;
- first_xh = (struct ocfs2_xattr_header *)first_bh->b_data;
-
- if (num_buckets == le16_to_cpu(first_xh->xh_num_buckets)) {
+ if (num_buckets == le16_to_cpu(bucket_xh(first)->xh_num_buckets)) {
+ /*
+ * This can move first+target if the target bucket moves
+ * to the new extent.
+ */
ret = ocfs2_add_new_xattr_cluster(inode,
xb_bh,
- &first_bh,
- &header_bh,
+ first,
+ target,
&num_clusters,
e_cpos,
- p_blkno,
- &extend);
+ &extend,
+ ctxt);
if (ret) {
mlog_errno(ret);
goto out;
}
}
- if (extend)
+ if (extend) {
ret = ocfs2_extend_xattr_bucket(inode,
- first_bh,
- header_bh,
+ ctxt->handle,
+ first,
+ bucket_blkno(target),
num_clusters);
- if (ret)
- mlog_errno(ret);
+ if (ret)
+ mlog_errno(ret);
+ }
+
out:
- brelse(first_bh);
- brelse(header_bh);
+ ocfs2_xattr_bucket_free(first);
+
return ret;
}
@@ -3929,7 +4497,7 @@ static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode,
int block_off = offs >> inode->i_sb->s_blocksize_bits;
offs = offs % inode->i_sb->s_blocksize;
- return bucket->bhs[block_off]->b_data + offs;
+ return bucket_block(bucket, block_off) + offs;
}
/*
@@ -3984,7 +4552,7 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode,
xe->xe_value_size = 0;
val = ocfs2_xattr_bucket_get_val(inode,
- &xs->bucket, offs);
+ xs->bucket, offs);
memset(val + OCFS2_XATTR_SIZE(name_len), 0,
size - OCFS2_XATTR_SIZE(name_len));
if (OCFS2_XATTR_SIZE(xi->value_len) > 0)
@@ -4062,8 +4630,7 @@ set_new_name_value:
xh->xh_free_start = cpu_to_le16(offs);
}
- val = ocfs2_xattr_bucket_get_val(inode,
- &xs->bucket, offs - size);
+ val = ocfs2_xattr_bucket_get_val(inode, xs->bucket, offs - size);
xe->xe_name_offset = cpu_to_le16(offs - size);
memset(val, 0, size);
@@ -4079,125 +4646,45 @@ set_new_name_value:
return;
}
-static int ocfs2_xattr_bucket_handle_journal(struct inode *inode,
- handle_t *handle,
- struct ocfs2_xattr_search *xs,
- struct buffer_head **bhs,
- u16 bh_num)
-{
- int ret = 0, off, block_off;
- struct ocfs2_xattr_entry *xe = xs->here;
-
- /*
- * First calculate all the blocks we should journal_access
- * and journal_dirty. The first block should always be touched.
- */
- ret = ocfs2_journal_dirty(handle, bhs[0]);
- if (ret)
- mlog_errno(ret);
-
- /* calc the data. */
- off = le16_to_cpu(xe->xe_name_offset);
- block_off = off >> inode->i_sb->s_blocksize_bits;
- ret = ocfs2_journal_dirty(handle, bhs[block_off]);
- if (ret)
- mlog_errno(ret);
-
- return ret;
-}
-
/*
* Set the xattr entry in the specified bucket.
* The bucket is indicated by xs->bucket and it should have the enough
* space for the xattr insertion.
*/
static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode,
+ handle_t *handle,
struct ocfs2_xattr_info *xi,
struct ocfs2_xattr_search *xs,
u32 name_hash,
int local)
{
- int i, ret;
- handle_t *handle = NULL;
- u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ int ret;
+ u64 blkno;
mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n",
(unsigned long)xi->value_len, xi->name_index,
- (unsigned long long)xs->bucket.bhs[0]->b_blocknr);
+ (unsigned long long)bucket_blkno(xs->bucket));
- if (!xs->bucket.bhs[1]) {
- ret = ocfs2_read_blocks(inode,
- xs->bucket.bhs[0]->b_blocknr + 1,
- blk_per_bucket - 1, &xs->bucket.bhs[1],
- 0);
+ if (!xs->bucket->bu_bhs[1]) {
+ blkno = bucket_blkno(xs->bucket);
+ ocfs2_xattr_bucket_relse(xs->bucket);
+ ret = ocfs2_read_xattr_bucket(xs->bucket, blkno);
if (ret) {
mlog_errno(ret);
goto out;
}
}
- handle = ocfs2_start_trans(osb, blk_per_bucket);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- handle = NULL;
+ ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
+ OCFS2_JOURNAL_ACCESS_WRITE);
+ if (ret < 0) {
mlog_errno(ret);
goto out;
}
- for (i = 0; i < blk_per_bucket; i++) {
- ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[i],
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (ret < 0) {
- mlog_errno(ret);
- goto out;
- }
- }
-
ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local);
+ ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
- /*Only dirty the blocks we have touched in set xattr. */
- ret = ocfs2_xattr_bucket_handle_journal(inode, handle, xs,
- xs->bucket.bhs, blk_per_bucket);
- if (ret)
- mlog_errno(ret);
-out:
- ocfs2_commit_trans(osb, handle);
-
- return ret;
-}
-
-static int ocfs2_xattr_value_update_size(struct inode *inode,
- struct buffer_head *xe_bh,
- struct ocfs2_xattr_entry *xe,
- u64 new_size)
-{
- int ret;
- struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
- handle_t *handle = NULL;
-
- handle = ocfs2_start_trans(osb, 1);
- if (IS_ERR(handle)) {
- ret = -ENOMEM;
- mlog_errno(ret);
- goto out;
- }
-
- ret = ocfs2_journal_access(handle, inode, xe_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
- if (ret < 0) {
- mlog_errno(ret);
- goto out_commit;
- }
-
- xe->xe_value_size = cpu_to_le64(new_size);
-
- ret = ocfs2_journal_dirty(handle, xe_bh);
- if (ret < 0)
- mlog_errno(ret);
-
-out_commit:
- ocfs2_commit_trans(osb, handle);
out:
return ret;
}
@@ -4210,18 +4697,19 @@ out:
* Copy the new updated xe and xe_value_root to new_xe and new_xv if needed.
*/
static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
- struct buffer_head *header_bh,
+ struct ocfs2_xattr_bucket *bucket,
int xe_off,
- int len)
+ int len,
+ struct ocfs2_xattr_set_ctxt *ctxt)
{
int ret, offset;
u64 value_blk;
- struct buffer_head *value_bh = NULL;
- struct ocfs2_xattr_value_root *xv;
struct ocfs2_xattr_entry *xe;
- struct ocfs2_xattr_header *xh =
- (struct ocfs2_xattr_header *)header_bh->b_data;
+ struct ocfs2_xattr_header *xh = bucket_xh(bucket);
size_t blocksize = inode->i_sb->s_blocksize;
+ struct ocfs2_xattr_value_buf vb = {
+ .vb_access = ocfs2_journal_access,
+ };
xe = &xh->xh_entries[xe_off];
@@ -4234,49 +4722,58 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode,
/* We don't allow ocfs2_xattr_value to be stored in different block. */
BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize);
- value_blk += header_bh->b_blocknr;
- ret = ocfs2_read_block(inode, value_blk, &value_bh);
- if (ret) {
- mlog_errno(ret);
- goto out;
- }
+ vb.vb_bh = bucket->bu_bhs[value_blk];
+ BUG_ON(!vb.vb_bh);
- xv = (struct ocfs2_xattr_value_root *)
- (value_bh->b_data + offset % blocksize);
+ vb.vb_xv = (struct ocfs2_xattr_value_root *)
+ (vb.vb_bh->b_data + offset % blocksize);
- mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n",
- xe_off, (unsigned long long)header_bh->b_blocknr, len);
- ret = ocfs2_xattr_value_truncate(inode, value_bh, xv, len);
+ ret = ocfs2_xattr_bucket_journal_access(ctxt->handle, bucket,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out;
}
- ret = ocfs2_xattr_value_update_size(inode, header_bh, xe, len);
+ /*
+ * From here on out we have to dirty the bucket. The generic
+ * value calls only modify one of the bucket's bhs, but we need
+ * to send the bucket at once. So if they error, they *could* have
+ * modified something. We have to assume they did, and dirty
+ * the whole bucket. This leaves us in a consistent state.
+ */
+ mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n",
+ xe_off, (unsigned long long)bucket_blkno(bucket), len);
+ ret = ocfs2_xattr_value_truncate(inode, &vb, len, ctxt);
if (ret) {
mlog_errno(ret);
- goto out;
+ goto out_dirty;
}
+ xe->xe_value_size = cpu_to_le64(len);
+
+out_dirty:
+ ocfs2_xattr_bucket_journal_dirty(ctxt->handle, bucket);
+
out:
- brelse(value_bh);
return ret;
}
static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
- struct ocfs2_xattr_search *xs,
- int len)
+ struct ocfs2_xattr_search *xs,
+ int len,
+ struct ocfs2_xattr_set_ctxt *ctxt)
{
int ret, offset;
struct ocfs2_xattr_entry *xe = xs->here;
struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base;
- BUG_ON(!xs->bucket.bhs[0] || !xe || ocfs2_xattr_is_local(xe));
+ BUG_ON(!xs->bucket->bu_bhs[0] || !xe || ocfs2_xattr_is_local(xe));
offset = xe - xh->xh_entries;
- ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket.bhs[0],
- offset, len);
+ ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket,
+ offset, len, ctxt);
if (ret)
mlog_errno(ret);
@@ -4284,6 +4781,7 @@ static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode,
}
static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
+ handle_t *handle,
struct ocfs2_xattr_search *xs,
char *val,
int value_len)
@@ -4299,7 +4797,8 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode,
xv = (struct ocfs2_xattr_value_root *)(xs->base + offset);
- return __ocfs2_xattr_set_value_outside(inode, xv, val, value_len);
+ return __ocfs2_xattr_set_value_outside(inode, handle,
+ xv, val, value_len);
}
static int ocfs2_rm_xattr_cluster(struct inode *inode,
@@ -4343,15 +4842,15 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
}
}
- handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS);
+ handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb));
if (IS_ERR(handle)) {
ret = -ENOMEM;
mlog_errno(ret);
goto out;
}
- ret = ocfs2_journal_access(handle, inode, root_bh,
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_journal_access_xb(handle, inode, root_bh,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out_commit;
@@ -4392,26 +4891,19 @@ out:
}
static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
+ handle_t *handle,
struct ocfs2_xattr_search *xs)
{
- handle_t *handle = NULL;
- struct ocfs2_xattr_header *xh = xs->bucket.xh;
+ struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket);
struct ocfs2_xattr_entry *last = &xh->xh_entries[
le16_to_cpu(xh->xh_count) - 1];
int ret = 0;
- handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), 1);
- if (IS_ERR(handle)) {
- ret = PTR_ERR(handle);
- mlog_errno(ret);
- return;
- }
-
- ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[0],
- OCFS2_JOURNAL_ACCESS_WRITE);
+ ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket,
+ OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
- goto out_commit;
+ return;
}
/* Remove the old entry. */
@@ -4420,11 +4912,7 @@ static void ocfs2_xattr_bucket_remove_xs(struct inode *inode,
memset(last, 0, sizeof(struct ocfs2_xattr_entry));
le16_add_cpu(&xh->xh_count, -1);
- ret = ocfs2_journal_dirty(handle, xs->bucket.bhs[0]);
- if (ret < 0)
- mlog_errno(ret);
-out_commit:
- ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
+ ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket);
}
/*
@@ -4440,7 +4928,8 @@ out_commit:
*/
static int ocfs2_xattr_set_in_bucket(struct inode *inode,
struct ocfs2_xattr_info *xi,
- struct ocfs2_xattr_search *xs)
+ struct ocfs2_xattr_search *xs,
+ struct ocfs2_xattr_set_ctxt *ctxt)
{
int ret, local = 1;
size_t value_len;
@@ -4468,7 +4957,8 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
value_len = 0;
ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
- value_len);
+ value_len,
+ ctxt);
if (ret)
goto out;
@@ -4488,7 +4978,8 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
xi->value_len = OCFS2_XATTR_ROOT_SIZE;
}
- ret = ocfs2_xattr_set_entry_in_bucket(inode, xi, xs, name_hash, local);
+ ret = ocfs2_xattr_set_entry_in_bucket(inode, ctxt->handle, xi, xs,
+ name_hash, local);
if (ret) {
mlog_errno(ret);
goto out;
@@ -4499,7 +4990,7 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
/* allocate the space now for the outside block storage. */
ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs,
- value_len);
+ value_len, ctxt);
if (ret) {
mlog_errno(ret);
@@ -4509,13 +5000,14 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode,
* storage and we have allocated xattr already,
* so need to remove it.
*/
- ocfs2_xattr_bucket_remove_xs(inode, xs);
+ ocfs2_xattr_bucket_remove_xs(inode, ctxt->handle, xs);
}
goto out;
}
set_value_outside:
- ret = ocfs2_xattr_bucket_set_value_outside(inode, xs, val, value_len);
+ ret = ocfs2_xattr_bucket_set_value_outside(inode, ctxt->handle,
+ xs, val, value_len);
out:
return ret;
}
@@ -4530,7 +5022,7 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
struct ocfs2_xattr_bucket *bucket,
const char *name)
{
- struct ocfs2_xattr_header *xh = bucket->xh;
+ struct ocfs2_xattr_header *xh = bucket_xh(bucket);
u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name));
if (name_hash != le32_to_cpu(xh->xh_entries[0].xe_name_hash))
@@ -4540,7 +5032,7 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
xh->xh_entries[0].xe_name_hash) {
mlog(ML_ERROR, "Too much hash collision in xattr bucket %llu, "
"hash = %u\n",
- (unsigned long long)bucket->bhs[0]->b_blocknr,
+ (unsigned long long)bucket_blkno(bucket),
le32_to_cpu(xh->xh_entries[0].xe_name_hash));
return -ENOSPC;
}
@@ -4550,16 +5042,16 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode,
static int ocfs2_xattr_set_entry_index_block(struct inode *inode,
struct ocfs2_xattr_info *xi,
- struct ocfs2_xattr_search *xs)
+ struct ocfs2_xattr_search *xs,
+ struct ocfs2_xattr_set_ctxt *ctxt)
{
struct ocfs2_xattr_header *xh;
struct ocfs2_xattr_entry *xe;
u16 count, header_size, xh_free_start;
- int i, free, max_free, need, old;
+ int free, max_free, need, old;
size_t value_size = 0, name_len = strlen(xi->name);
size_t blocksize = inode->i_sb->s_blocksize;
int ret, allocation = 0;
- u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb);
mlog_entry("Set xattr %s in xattr index block\n", xi->name);
@@ -4574,7 +5066,7 @@ try_again:
mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size "
"of %u which exceed block size\n",
- (unsigned long long)xs->bucket.bhs[0]->b_blocknr,
+ (unsigned long long)bucket_blkno(xs->bucket),
header_size);
if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE)
@@ -4614,11 +5106,13 @@ try_again:
mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, "
"need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len ="
" %u\n", xs->not_found,
- (unsigned long long)xs->bucket.bhs[0]->b_blocknr,
+ (unsigned long long)bucket_blkno(xs->bucket),
free, need, max_free, le16_to_cpu(xh->xh_free_start),
le16_to_cpu(xh->xh_name_value_len));
- if (free < need || count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
+ if (free < need ||
+ (xs->not_found &&
+ count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb))) {
if (need <= max_free &&
count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) {
/*
@@ -4626,7 +5120,8 @@ try_again:
* name/value will be moved, the xe shouldn't be changed
* in xs.
*/
- ret = ocfs2_defrag_xattr_bucket(inode, &xs->bucket);
+ ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle,
+ xs->bucket);
if (ret) {
mlog_errno(ret);
goto out;
@@ -4658,7 +5153,7 @@ try_again:
* add a new bucket for the insert.
*/
ret = ocfs2_check_xattr_bucket_collision(inode,
- &xs->bucket,
+ xs->bucket,
xi->name);
if (ret) {
mlog_errno(ret);
@@ -4667,17 +5162,21 @@ try_again:
ret = ocfs2_add_new_xattr_bucket(inode,
xs->xattr_bh,
- xs->bucket.bhs[0]);
+ xs->bucket,
+ ctxt);
if (ret) {
mlog_errno(ret);
goto out;
}
- for (i = 0; i < blk_per_bucket; i++)
- brelse(xs->bucket.bhs[i]);
-
- memset(&xs->bucket, 0, sizeof(xs->bucket));
-
+ /*
+ * ocfs2_add_new_xattr_bucket() will have updated
+ * xs->bucket if it moved, but it will not have updated
+ * any of the other search fields. Thus, we drop it and
+ * re-search. Everything should be cached, so it'll be
+ * quick.
+ */
+ ocfs2_xattr_bucket_relse(xs->bucket);
ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh,
xi->name_index,
xi->name, xs);
@@ -4689,7 +5188,7 @@ try_again:
}
xattr_set:
- ret = ocfs2_xattr_set_in_bucket(inode, xi, xs);
+ ret = ocfs2_xattr_set_in_bucket(inode, xi, xs, ctxt);
out:
mlog_exit(ret);
return ret;
@@ -4700,24 +5199,41 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode,
void *para)
{
int ret = 0;
- struct ocfs2_xattr_header *xh = bucket->xh;
+ struct ocfs2_xattr_header *xh = bucket_xh(bucket);
u16 i;
struct ocfs2_xattr_entry *xe;
+ struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+ struct ocfs2_xattr_set_ctxt ctxt = {NULL, NULL,};
+ int credits = ocfs2_remove_extent_credits(osb->sb) +
+ ocfs2_blocks_per_xattr_bucket(inode->i_sb);
+
+
+ ocfs2_init_dealloc_ctxt(&ctxt.dealloc);
for (i = 0; i < le16_to_cpu(xh->xh_count); i++) {
xe = &xh->xh_entries[i];
if (ocfs2_xattr_is_local(xe))
continue;
- ret = ocfs2_xattr_bucket_value_truncate(inode,
- bucket->bhs[0],
- i, 0);
+ ctxt.handle = ocfs2_start_trans(osb, credits);
+ if (IS_ERR(ctxt.handle)) {
+ ret = PTR_ERR(ctxt.handle);
+ mlog_errno(ret);
+ break;
+ }
+
+ ret = ocfs2_xattr_bucket_value_truncate(inode, bucket,
+ i, 0, &ctxt);
+
+ ocfs2_commit_trans(osb, ctxt.handle);
if (ret) {
mlog_errno(ret);
break;
}
}
+ ocfs2_schedule_truncate_log_flush(osb, 1);
+ ocfs2_run_deallocs(osb, &ctxt.dealloc);
return ret;
}
@@ -4768,6 +5284,74 @@ out:
}
/*
+ * 'security' attributes support
+ */
+/*
+ * Emit "<security prefix><name>" followed by a terminating NUL into @list.
+ *
+ * The copy is done only when @list is non-NULL and @list_size can hold the
+ * whole entry.  The number of bytes the entry needs is returned either way.
+ */
+static size_t ocfs2_xattr_security_list(struct inode *inode, char *list,
+					size_t list_size, const char *name,
+					size_t name_len)
+{
+	const size_t needed = XATTR_SECURITY_PREFIX_LEN + name_len + 1;
+	char *dst = list;
+
+	/* No buffer, or one that is too small: only report the size. */
+	if (!list || needed > list_size)
+		return needed;
+
+	memcpy(dst, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN);
+	dst += XATTR_SECURITY_PREFIX_LEN;
+	memcpy(dst, name, name_len);
+	dst[name_len] = '\0';
+
+	return needed;
+}
+
+/*
+ * Read a 'security' xattr.  An empty attribute name is rejected with
+ * -EINVAL; any other name is passed to ocfs2_xattr_get() under
+ * OCFS2_XATTR_INDEX_SECURITY and that call's result is returned.
+ */
+static int ocfs2_xattr_security_get(struct inode *inode, const char *name,
+				    void *buffer, size_t size)
+{
+	if (name[0] == '\0')
+		return -EINVAL;
+
+	return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_SECURITY, name,
+			       buffer, size);
+}
+
+/*
+ * Write a 'security' xattr.  An empty attribute name is rejected with
+ * -EINVAL; any other name is passed to ocfs2_xattr_set() under
+ * OCFS2_XATTR_INDEX_SECURITY and that call's result is returned.
+ */
+static int ocfs2_xattr_security_set(struct inode *inode, const char *name,
+				    const void *value, size_t size, int flags)
+{
+	if (name[0] == '\0')
+		return -EINVAL;
+
+	return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, name, value,
+			       size, flags);
+}
+
+/*
+ * Fetch the initial security xattr for @inode, which is being created
+ * in @dir.
+ *
+ * Returns -EOPNOTSUPP when ocfs2_supports_xattr() says the superblock of
+ * @dir has no xattr support.  Otherwise returns the result of
+ * security_inode_init_security(), which is handed the name, value and
+ * value_len members of @si to fill in.
+ */
+int ocfs2_init_security_get(struct inode *inode,
+			    struct inode *dir,
+			    struct ocfs2_security_xattr_info *si)
+{
+	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
+
+	if (ocfs2_supports_xattr(osb))
+		return security_inode_init_security(inode, dir, &si->name,
+						    &si->value,
+						    &si->value_len);
+
+	return -EOPNOTSUPP;
+}
+
+/*
+ * Store the security xattr described by @si on @inode inside the
+ * caller's transaction @handle.
+ *
+ * Thin wrapper around ocfs2_xattr_set_handle(): the attribute goes under
+ * OCFS2_XATTR_INDEX_SECURITY with no flags, and @xattr_ac / @data_ac are
+ * forwarded unchanged.  Returns that call's result.
+ */
+int ocfs2_init_security_set(handle_t *handle,
+			    struct inode *inode,
+			    struct buffer_head *di_bh,
+			    struct ocfs2_security_xattr_info *si,
+			    struct ocfs2_alloc_context *xattr_ac,
+			    struct ocfs2_alloc_context *data_ac)
+{
+	const int name_index = OCFS2_XATTR_INDEX_SECURITY;
+	const int flags = 0;
+
+	return ocfs2_xattr_set_handle(handle, inode, di_bh, name_index,
+				      si->name, si->value, si->value_len,
+				      flags, xattr_ac, data_ac);
+}
+
+struct xattr_handler ocfs2_xattr_security_handler = { /* xattr_handler for names carrying the security prefix */
+ .prefix = XATTR_SECURITY_PREFIX, /* same prefix ocfs2_xattr_security_list() writes before each name */
+ .list = ocfs2_xattr_security_list, /* formats one prefixed name into a listing buffer */
+ .get = ocfs2_xattr_security_get, /* rejects an empty name, else ocfs2_xattr_get() */
+ .set = ocfs2_xattr_security_set, /* rejects an empty name, else ocfs2_xattr_set() */
+};
+
+/*
* 'trusted' attributes support
*/
static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list,
diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h
index 1d8314c7656d..5a1ebc789f7e 100644
--- a/fs/ocfs2/xattr.h
+++ b/fs/ocfs2/xattr.h
@@ -30,13 +30,58 @@ enum ocfs2_xattr_type {
OCFS2_XATTR_MAX
};
+struct ocfs2_security_xattr_info { /* initial security xattr carried from ocfs2_init_security_get() to ocfs2_init_security_set() */
+ int enable; /* NOTE(review): not referenced in the code shown here - presumably marks whether a security xattr is to be set; confirm against callers */
+ char *name; /* attribute name, filled in via security_inode_init_security() */
+ void *value; /* attribute value, filled in via security_inode_init_security() */
+ size_t value_len; /* length of value, filled in alongside it */
+};
+
extern struct xattr_handler ocfs2_xattr_user_handler;
extern struct xattr_handler ocfs2_xattr_trusted_handler;
+extern struct xattr_handler ocfs2_xattr_security_handler;
+#ifdef CONFIG_OCFS2_FS_POSIX_ACL
+extern struct xattr_handler ocfs2_xattr_acl_access_handler;
+extern struct xattr_handler ocfs2_xattr_acl_default_handler;
+#endif
extern struct xattr_handler *ocfs2_xattr_handlers[];
ssize_t ocfs2_listxattr(struct dentry *, char *, size_t);
+int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int,
+ const char *, void *, size_t);
int ocfs2_xattr_set(struct inode *, int, const char *, const void *,
size_t, int);
+int ocfs2_xattr_set_handle(handle_t *, struct inode *, struct buffer_head *,
+ int, const char *, const void *, size_t, int,
+ struct ocfs2_alloc_context *,
+ struct ocfs2_alloc_context *);
int ocfs2_xattr_remove(struct inode *, struct buffer_head *);
+int ocfs2_init_security_get(struct inode *, struct inode *,
+ struct ocfs2_security_xattr_info *);
+int ocfs2_init_security_set(handle_t *, struct inode *,
+ struct buffer_head *,
+ struct ocfs2_security_xattr_info *,
+ struct ocfs2_alloc_context *,
+ struct ocfs2_alloc_context *);
+int ocfs2_calc_security_init(struct inode *,
+ struct ocfs2_security_xattr_info *,
+ int *, int *, struct ocfs2_alloc_context **);
+int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *,
+ int, struct ocfs2_security_xattr_info *,
+ int *, int *, struct ocfs2_alloc_context **);
+
+/*
+ * xattrs can live inside an inode, as part of an external xattr block,
+ * or inside an xattr bucket, which is the leaf of a tree rooted in an
+ * xattr block. Some of the xattr calls, especially the value setting
+ * functions, want to treat each of these locations as equal. Let's wrap
+ * them in a structure that we can pass around instead of raw buffer_heads.
+ */
+struct ocfs2_xattr_value_buf { /* one xattr value root plus the buffer that contains it */
+ struct buffer_head *vb_bh; /* buffer whose b_data contains the value root (vb_xv) */
+ ocfs2_journal_access_func vb_access; /* journal access function; presumably the one to call for vb_bh - confirm in the value code */
+ struct ocfs2_xattr_value_root *vb_xv; /* the value root, located inside vb_bh->b_data */
+};
+
#endif /* OCFS2_XATTR_H */
diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c
index 6afe57c84f84..633e9dc972bb 100644
--- a/fs/omfs/inode.c
+++ b/fs/omfs/inode.c
@@ -39,7 +39,6 @@ struct inode *omfs_new_inode(struct inode *dir, int mode)
inode->i_mode = mode;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
- inode->i_blocks = 0;
inode->i_mapping->a_ops = &omfs_aops;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
diff --git a/fs/open.c b/fs/open.c
index c0a426d5766c..d882fd2351d6 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -272,6 +272,8 @@ static long do_sys_truncate(const char __user *pathname, loff_t length)
goto put_write_and_out;
error = locks_verify_truncate(inode, NULL, length);
+ if (!error)
+ error = security_path_truncate(&path, length, 0);
if (!error) {
DQUOT_INIT(inode);
error = do_truncate(path.dentry, length, 0, NULL);
@@ -329,6 +331,9 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
error = locks_verify_truncate(inode, file, length);
if (!error)
+ error = security_path_truncate(&file->f_path, length,
+ ATTR_MTIME|ATTR_CTIME);
+ if (!error)
error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, file);
out_putf:
fput(file);
@@ -407,7 +412,7 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len)
if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0))
goto out_fput;
- if (inode->i_op && inode->i_op->fallocate)
+ if (inode->i_op->fallocate)
ret = inode->i_op->fallocate(inode, mode, offset, len);
else
ret = -EOPNOTSUPP;
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index d41bdc784de4..ffcd04f0012c 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -256,9 +256,6 @@ found:
break;
}
- inode->i_gid = 0;
- inode->i_uid = 0;
-
d_add(dentry, inode);
return NULL;
}
diff --git a/fs/pipe.c b/fs/pipe.c
index aaf797bd57b9..891697112f66 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -1016,10 +1016,7 @@ int do_pipe_flags(int *fd, int flags)
goto err_fdr;
fdw = error;
- error = audit_fd_pair(fdr, fdw);
- if (error < 0)
- goto err_fdw;
-
+ audit_fd_pair(fdr, fdw);
fd_install(fdr, fr);
fd_install(fdw, fw);
fd[0] = fdr;
@@ -1027,8 +1024,6 @@ int do_pipe_flags(int *fd, int flags)
return 0;
- err_fdw:
- put_unused_fd(fdw);
err_fdr:
put_unused_fd(fdr);
err_read_pipe:
diff --git a/fs/proc/base.c b/fs/proc/base.c
index cad92c1ac2b3..10fd5223d600 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1426,8 +1426,6 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
if (!ei->pid)
goto out_unlock;
- inode->i_uid = 0;
- inode->i_gid = 0;
if (task_dumpable(task)) {
rcu_read_lock();
cred = __task_cred(task);
@@ -2349,8 +2347,6 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
if (!ei->pid)
goto out_iput;
- inode->i_uid = 0;
- inode->i_gid = 0;
inode->i_mode = p->mode;
if (S_ISDIR(inode->i_mode))
inode->i_nlink = 2;
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index 06ed10b7da9e..94fcfff6863a 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -31,7 +31,6 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
inode->i_flags |= S_PRIVATE; /* tell selinux to ignore this inode */
inode->i_mode = table->mode;
- inode->i_uid = inode->i_gid = 0;
if (!table->child) {
inode->i_mode |= S_IFREG;
inode->i_op = &proc_sys_inode_operations;
diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 81904f07679d..f75efa22df5e 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -9,6 +9,7 @@
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/time.h>
+#include <linux/irqnr.h>
#include <asm/cputime.h>
#ifndef arch_irq_stat_cpu
@@ -44,10 +45,9 @@ static int show_stat(struct seq_file *p, void *v)
softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
-
- for_each_irq_nr(j)
+ for_each_irq_nr(j) {
sum += kstat_irqs_cpu(j, i);
-
+ }
sum += arch_irq_stat_cpu(i);
}
sum += arch_irq_stat();
@@ -92,7 +92,6 @@ static int show_stat(struct seq_file *p, void *v)
/* sum again ? it could be updated? */
for_each_irq_nr(j) {
per_irq_sum = 0;
-
for_each_possible_cpu(i)
per_irq_sum += kstat_irqs_cpu(j, i);
diff --git a/fs/quota.c b/fs/quota.c
index b7fe44e01618..4a8c94f05f76 100644
--- a/fs/quota.c
+++ b/fs/quota.c
@@ -73,7 +73,7 @@ static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid
case Q_SETQUOTA:
case Q_GETQUOTA:
/* This is just informative test so we are satisfied without a lock */
- if (!sb_has_quota_enabled(sb, type))
+ if (!sb_has_quota_active(sb, type))
return -ESRCH;
}
@@ -160,6 +160,9 @@ static void quota_sync_sb(struct super_block *sb, int type)
int cnt;
sb->s_qcop->quota_sync(sb, type);
+
+ if (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE)
+ return;
/* This is not very clever (and fast) but currently I don't know about
* any other simple way of getting quota data to disk and we must get
* them there for userspace to be visible... */
@@ -175,7 +178,7 @@ static void quota_sync_sb(struct super_block *sb, int type)
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
if (type != -1 && cnt != type)
continue;
- if (!sb_has_quota_enabled(sb, cnt))
+ if (!sb_has_quota_active(sb, cnt))
continue;
mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex, I_MUTEX_QUOTA);
truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0);
@@ -201,7 +204,7 @@ restart:
for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
if (type != -1 && type != cnt)
continue;
- if (!sb_has_quota_enabled(sb, cnt))
+ if (!sb_has_quota_active(sb, cnt))
continue;
if (!info_dirty(&sb_dqopt(sb)->info[cnt]) &&
list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list))
@@ -245,7 +248,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void
__u32 fmt;
down_read(&sb_dqopt(sb)->dqptr_sem);
- if (!sb_has_quota_enabled(sb, type)) {
+ if (!sb_has_quota_active(sb, type)) {
up_read(&sb_dqopt(sb)->dqptr_sem);
return -ESRCH;
}
diff --git a/fs/quota_tree.c b/fs/quota_tree.c
new file mode 100644
index 000000000000..953404c95b17
--- /dev/null
+++ b/fs/quota_tree.c
@@ -0,0 +1,645 @@
+/*
+ * vfsv0 quota IO operations on file
+ */
+
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/dqblk_v2.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/quotaops.h>
+
+#include <asm/byteorder.h>
+
+#include "quota_tree.h"
+
+MODULE_AUTHOR("Jan Kara");
+MODULE_DESCRIPTION("Quota trie support");
+MODULE_LICENSE("GPL");
+
+#define __QUOTA_QT_PARANOIA
+
+typedef char *dqbuf_t;
+
+/*
+ * Return the slot inside a tree block at level @depth that leads towards @id.
+ * A tree block holds dqi_usable_bs / 4 32-bit references, so the id is read
+ * as a number in that base; depth 0 consumes the most significant digit.
+ */
+static int get_index(struct qtree_mem_dqinfo *info, qid_t id, int depth)
+{
+	unsigned int refs_per_blk = info->dqi_usable_bs >> 2;
+	int levels_below;
+
+	/* Strip one digit for every tree level that lies below @depth. */
+	for (levels_below = info->dqi_qtree_depth - depth - 1;
+	     levels_below != 0; levels_below--)
+		id = id / refs_per_blk;
+	return id % refs_per_blk;
+}
+
+/* Number of entries that fit in one data block */
+static inline int qtree_dqstr_in_blk(struct qtree_mem_dqinfo *info)
+{
+	/* Every data block starts with a header; entries fill the remainder. */
+	size_t payload = info->dqi_usable_bs - sizeof(struct qt_disk_dqdbheader);
+
+	return payload / info->dqi_entry_size;
+}
+
+/*
+ * Allocate a @size byte scratch buffer with GFP_NOFS.
+ * Logs a warning and returns NULL when the allocation fails.
+ */
+static dqbuf_t getdqbuf(size_t size)
+{
+	dqbuf_t mem = kmalloc(size, GFP_NOFS);
+
+	if (mem)
+		return mem;
+	printk(KERN_WARNING "VFS: Not enough memory for quota buffers.\n");
+	return NULL;
+}
+
+/* Release a buffer obtained from getdqbuf(); it is handed straight to kfree(). */
+static inline void freedqbuf(dqbuf_t buf)
+{
+ kfree(buf);
+}
+
+/*
+ * Read block number @blk of the quota file into @buf.
+ *
+ * @buf must hold at least info->dqi_usable_bs bytes. It is zeroed first, so
+ * any part the filesystem does not fill in reads back as zeroes.
+ * Returns whatever the filesystem's ->quota_read() returns (callers treat a
+ * negative value as an error).
+ */
+static inline ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, dqbuf_t buf)
+{
+	struct super_block *sb = info->dqi_sb;
+
+	memset(buf, 0, info->dqi_usable_bs);
+	/*
+	 * Widen to loff_t before shifting: shifting the 32-bit block number
+	 * directly would wrap the byte offset for large quota files.
+	 */
+	return sb->s_op->quota_read(sb, info->dqi_type, (char *)buf,
+	       info->dqi_usable_bs, (loff_t)blk << info->dqi_blocksize_bits);
+}
+
+/*
+ * Write @buf (info->dqi_usable_bs bytes) to block number @blk of the quota
+ * file. Returns whatever the filesystem's ->quota_write() returns (callers
+ * treat a negative value as an error).
+ */
+static inline ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, dqbuf_t buf)
+{
+	struct super_block *sb = info->dqi_sb;
+
+	/*
+	 * Widen to loff_t before shifting: shifting the 32-bit block number
+	 * directly would wrap the byte offset for large quota files.
+	 */
+	return sb->s_op->quota_write(sb, info->dqi_type, (char *)buf,
+	       info->dqi_usable_bs, (loff_t)blk << info->dqi_blocksize_bits);
+}
+
+/*
+ * Remove empty block from list and return it.
+ *
+ * If the free-block list is non-empty its head is unlinked; otherwise the
+ * file is extended by one zeroed block. Returns the block number, -ENOMEM
+ * if no buffer could be allocated, or the negative error from
+ * read_blk()/write_blk().
+ */
+static int get_free_dqblk(struct qtree_mem_dqinfo *info)
+{
+ dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+ struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
+ int ret, blk;
+
+ if (!buf)
+ return -ENOMEM;
+ if (info->dqi_free_blk) {
+ /* Reuse the head of the free list; its header names the next one */
+ blk = info->dqi_free_blk;
+ ret = read_blk(info, blk, buf);
+ if (ret < 0)
+ goto out_buf;
+ info->dqi_free_blk = le32_to_cpu(dh->dqdh_next_free);
+ }
+ else {
+ memset(buf, 0, info->dqi_usable_bs);
+ /* Assure block allocation... */
+ ret = write_blk(info, info->dqi_blocks, buf);
+ if (ret < 0)
+ goto out_buf;
+ /* Block count grows only after the write has succeeded */
+ blk = info->dqi_blocks++;
+ }
+ mark_info_dirty(info->dqi_sb, info->dqi_type);
+ ret = blk;
+out_buf:
+ freedqbuf(buf);
+ return ret;
+}
+
+/*
+ * Insert empty block to the list.
+ *
+ * Rewrites the header in @buf so that block @blk becomes the new head of the
+ * free-block list and writes it out. The in-memory list head is updated only
+ * after the write succeeded. Returns 0 or the negative error from write_blk().
+ */
+static int put_free_dqblk(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk)
+{
+ struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
+ int err;
+
+ dh->dqdh_next_free = cpu_to_le32(info->dqi_free_blk);
+ dh->dqdh_prev_free = cpu_to_le32(0);
+ dh->dqdh_entries = cpu_to_le16(0);
+ err = write_blk(info, blk, buf);
+ if (err < 0)
+ return err;
+ info->dqi_free_blk = blk;
+ mark_info_dirty(info->dqi_sb, info->dqi_type);
+ return 0;
+}
+
+/*
+ * Remove given block from the list of blocks with free entries.
+ *
+ * @buf holds the current contents of block @blk. Its neighbours in the
+ * doubly linked list are re-pointed at each other, then @blk's own links are
+ * cleared and the block is written back. Returns 0 on success (even if the
+ * final write of @blk fails, which is only logged), -ENOMEM, or the negative
+ * error from reading/writing a neighbour.
+ */
+static int remove_free_dqentry(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk)
+{
+ dqbuf_t tmpbuf = getdqbuf(info->dqi_usable_bs);
+ struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
+ uint nextblk = le32_to_cpu(dh->dqdh_next_free);
+ uint prevblk = le32_to_cpu(dh->dqdh_prev_free);
+ int err;
+
+ if (!tmpbuf)
+ return -ENOMEM;
+ if (nextblk) {
+ /* Successor's back link now skips over @blk */
+ err = read_blk(info, nextblk, tmpbuf);
+ if (err < 0)
+ goto out_buf;
+ ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free =
+ dh->dqdh_prev_free;
+ err = write_blk(info, nextblk, tmpbuf);
+ if (err < 0)
+ goto out_buf;
+ }
+ if (prevblk) {
+ /* Predecessor's forward link now skips over @blk */
+ err = read_blk(info, prevblk, tmpbuf);
+ if (err < 0)
+ goto out_buf;
+ ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_next_free =
+ dh->dqdh_next_free;
+ err = write_blk(info, prevblk, tmpbuf);
+ if (err < 0)
+ goto out_buf;
+ } else {
+ /* @blk was the list head, so the head moves to its successor */
+ info->dqi_free_entry = nextblk;
+ mark_info_dirty(info->dqi_sb, info->dqi_type);
+ }
+ freedqbuf(tmpbuf);
+ dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
+ /* No matter whether write succeeds block is out of list */
+ if (write_blk(info, blk, buf) < 0)
+ printk(KERN_ERR "VFS: Can't write block (%u) with free entries.\n", blk);
+ return 0;
+out_buf:
+ freedqbuf(tmpbuf);
+ return err;
+}
+
+/*
+ * Insert given block to the beginning of list with free entries.
+ *
+ * @buf holds the contents of block @blk; its header links are set and the
+ * block is written by this function. The old list head, if any, gets its
+ * back link pointed at @blk. Returns 0, -ENOMEM, or the negative error from
+ * read_blk()/write_blk().
+ */
+static int insert_free_dqentry(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk)
+{
+ dqbuf_t tmpbuf = getdqbuf(info->dqi_usable_bs);
+ struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf;
+ int err;
+
+ if (!tmpbuf)
+ return -ENOMEM;
+ dh->dqdh_next_free = cpu_to_le32(info->dqi_free_entry);
+ dh->dqdh_prev_free = cpu_to_le32(0);
+ err = write_blk(info, blk, buf);
+ if (err < 0)
+ goto out_buf;
+ if (info->dqi_free_entry) {
+ /* Fix up the back link of the previous list head */
+ err = read_blk(info, info->dqi_free_entry, tmpbuf);
+ if (err < 0)
+ goto out_buf;
+ ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free =
+ cpu_to_le32(blk);
+ err = write_blk(info, info->dqi_free_entry, tmpbuf);
+ if (err < 0)
+ goto out_buf;
+ }
+ freedqbuf(tmpbuf);
+ /* In-memory head is switched only after both writes succeeded */
+ info->dqi_free_entry = blk;
+ mark_info_dirty(info->dqi_sb, info->dqi_type);
+ return 0;
+out_buf:
+ freedqbuf(tmpbuf);
+ return err;
+}
+
+/*
+ * Is the entry in the block free?
+ * An entry counts as unused when all info->dqi_entry_size bytes at @disk are
+ * zero. Returns 1 for an unused entry, 0 otherwise.
+ */
+int qtree_entry_unused(struct qtree_mem_dqinfo *info, char *disk)
+{
+	int pos = 0;
+
+	while (pos < info->dqi_entry_size) {
+		if (disk[pos])
+			return 0;
+		pos++;
+	}
+	return 1;
+}
+EXPORT_SYMBOL(qtree_entry_unused);
+
+/*
+ * Find space for dquot.
+ *
+ * Takes the first block on the free-entry list (allocating a fresh block if
+ * the list is empty), claims the first unused entry in it and stores the
+ * entry's byte offset in dquot->dq_off.
+ *
+ * Returns the data block number; on failure returns 0 with *err set to a
+ * negative error. On success *err holds the non-negative result of the last
+ * block write.
+ */
+static uint find_free_dqentry(struct qtree_mem_dqinfo *info,
+			      struct dquot *dquot, int *err)
+{
+	uint blk, i;
+	struct qt_disk_dqdbheader *dh;
+	dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+	char *ddquot;
+
+	*err = 0;
+	if (!buf) {
+		*err = -ENOMEM;
+		return 0;
+	}
+	dh = (struct qt_disk_dqdbheader *)buf;
+	if (info->dqi_free_entry) {
+		blk = info->dqi_free_entry;
+		*err = read_blk(info, blk, buf);
+		if (*err < 0)
+			goto out_buf;
+	} else {
+		blk = get_free_dqblk(info);
+		if ((int)blk < 0) {
+			*err = blk;
+			freedqbuf(buf);
+			return 0;
+		}
+		memset(buf, 0, info->dqi_usable_bs);
+		/* This is enough as block is already zeroed and entry list is empty... */
+		info->dqi_free_entry = blk;
+		mark_info_dirty(dquot->dq_sb, dquot->dq_type);
+	}
+	/* Block will be full? */
+	if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) {
+		*err = remove_free_dqentry(info, buf, blk);
+		if (*err < 0) {
+			printk(KERN_ERR "VFS: find_free_dqentry(): Can't "
+			       "remove block (%u) from entry free list.\n",
+			       blk);
+			goto out_buf;
+		}
+	}
+	le16_add_cpu(&dh->dqdh_entries, 1);
+	/* Find free structure in block */
+	for (i = 0, ddquot = ((char *)buf) + sizeof(struct qt_disk_dqdbheader);
+	     i < qtree_dqstr_in_blk(info) && !qtree_entry_unused(info, ddquot);
+	     i++, ddquot += info->dqi_entry_size);
+#ifdef __QUOTA_QT_PARANOIA
+	if (i == qtree_dqstr_in_blk(info)) {
+		printk(KERN_ERR "VFS: find_free_dqentry(): Data block full "
+		       "but it shouldn't.\n");
+		*err = -EIO;
+		goto out_buf;
+	}
+#endif
+	*err = write_blk(info, blk, buf);
+	if (*err < 0) {
+		printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota "
+		       "data block %u.\n", blk);
+		goto out_buf;
+	}
+	/*
+	 * Widen the block number before shifting so the byte offset does not
+	 * wrap at 32 bits for large quota files.
+	 */
+	dquot->dq_off = ((loff_t)blk << info->dqi_blocksize_bits) +
+			sizeof(struct qt_disk_dqdbheader) +
+			i * info->dqi_entry_size;
+	freedqbuf(buf);
+	return blk;
+out_buf:
+	freedqbuf(buf);
+	return 0;
+}
+
+/*
+ * Insert reference to structure into the trie.
+ *
+ * Recursively walks from tree block *@treeblk at level @depth towards the
+ * leaf level, allocating missing tree blocks on the way (in which case
+ * *@treeblk is set to the new block number). At the last level a free data
+ * entry is claimed via find_free_dqentry(). Returns a negative error, or a
+ * non-negative value on success.
+ */
+static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
+ uint *treeblk, int depth)
+{
+ dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+ int ret = 0, newson = 0, newact = 0;
+ __le32 *ref;
+ uint newblk;
+
+ if (!buf)
+ return -ENOMEM;
+ if (!*treeblk) {
+ /* No block at this level yet: allocate one and start from zeroes */
+ ret = get_free_dqblk(info);
+ if (ret < 0)
+ goto out_buf;
+ *treeblk = ret;
+ memset(buf, 0, info->dqi_usable_bs);
+ newact = 1;
+ } else {
+ ret = read_blk(info, *treeblk, buf);
+ if (ret < 0) {
+ printk(KERN_ERR "VFS: Can't read tree quota block "
+ "%u.\n", *treeblk);
+ goto out_buf;
+ }
+ }
+ ref = (__le32 *)buf;
+ newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
+ /* newson: the child reference has to be filled in after descending */
+ if (!newblk)
+ newson = 1;
+ if (depth == info->dqi_qtree_depth - 1) {
+#ifdef __QUOTA_QT_PARANOIA
+ if (newblk) {
+ printk(KERN_ERR "VFS: Inserting already present quota "
+ "entry (block %u).\n",
+ le32_to_cpu(ref[get_index(info,
+ dquot->dq_id, depth)]));
+ ret = -EIO;
+ goto out_buf;
+ }
+#endif
+ newblk = find_free_dqentry(info, dquot, &ret);
+ } else {
+ ret = do_insert_tree(info, dquot, &newblk, depth+1);
+ }
+ if (newson && ret >= 0) {
+ ref[get_index(info, dquot->dq_id, depth)] =
+ cpu_to_le32(newblk);
+ ret = write_blk(info, *treeblk, buf);
+ } else if (newact && ret < 0) {
+ /* Give back the block allocated above; its result is not checked */
+ put_free_dqblk(info, buf, *treeblk);
+ }
+out_buf:
+ freedqbuf(buf);
+ return ret;
+}
+
+/*
+ * Wrapper for inserting quota structure into tree: starts the recursive
+ * insert at the root tree block (QT_TREEOFF), depth 0.
+ */
+static inline int dq_insert_tree(struct qtree_mem_dqinfo *info,
+				 struct dquot *dquot)
+{
+	/* do_insert_tree() takes a uint *, so the root must be a uint too */
+	uint tmp = QT_TREEOFF;
+
+	return do_insert_tree(info, dquot, &tmp, 0);
+}
+
+/*
+ * Write the in-memory dquot to its entry in the quota file, inserting it
+ * into the tree first if it has no offset yet.
+ *
+ * We don't have to be afraid of deadlocks as we never have quotas on quota files...
+ *
+ * Returns 0 on success, -ENOMEM, the error from the tree insert, the negative
+ * error from ->quota_write(), or -ENOSPC on a short write.
+ */
+int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
+{
+ int type = dquot->dq_type;
+ struct super_block *sb = dquot->dq_sb;
+ ssize_t ret;
+ dqbuf_t ddquot = getdqbuf(info->dqi_entry_size);
+
+ if (!ddquot)
+ return -ENOMEM;
+
+ /* dq_off is guarded by dqio_mutex */
+ if (!dquot->dq_off) {
+ ret = dq_insert_tree(info, dquot);
+ if (ret < 0) {
+ printk(KERN_ERR "VFS: Error %zd occurred while "
+ "creating quota.\n", ret);
+ freedqbuf(ddquot);
+ return ret;
+ }
+ }
+ /* Convert to the on-disk layout under dq_data_lock */
+ spin_lock(&dq_data_lock);
+ info->dqi_ops->mem2disk_dqblk(ddquot, dquot);
+ spin_unlock(&dq_data_lock);
+ ret = sb->s_op->quota_write(sb, type, (char *)ddquot,
+ info->dqi_entry_size, dquot->dq_off);
+ if (ret != info->dqi_entry_size) {
+ printk(KERN_WARNING "VFS: dquota write failed on dev %s\n",
+ sb->s_id);
+ /* A short but non-negative write is reported as -ENOSPC */
+ if (ret >= 0)
+ ret = -ENOSPC;
+ } else {
+ ret = 0;
+ }
+ dqstats.writes++;
+ freedqbuf(ddquot);
+
+ return ret;
+}
+EXPORT_SYMBOL(qtree_write_dquot);
+
+/*
+ * Free dquot entry in data block.
+ *
+ * Decrements the entry count of data block @blk. If the block becomes empty
+ * it is moved to the free-block list; otherwise the entry is zeroed and, if
+ * the block was full before, it is put on the free-entry list. On success
+ * dquot->dq_off is reset to 0.
+ *
+ * Returns a negative error on failure (-ENOMEM, -EIO when dquot->dq_off does
+ * not lie in @blk, or an I/O error), a non-negative value otherwise.
+ */
+static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot,
+			uint blk)
+{
+	struct qt_disk_dqdbheader *dh;
+	dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+	int ret = 0;
+
+	if (!buf)
+		return -ENOMEM;
+	if (dquot->dq_off >> info->dqi_blocksize_bits != blk) {
+		printk(KERN_ERR "VFS: Quota structure has offset to other "
+		  "block (%u) than it should (%u).\n", blk,
+		  (uint)(dquot->dq_off >> info->dqi_blocksize_bits));
+		/*
+		 * Nothing was freed: report the inconsistency so the caller
+		 * does not drop the tree reference to a live entry.
+		 */
+		ret = -EIO;
+		goto out_buf;
+	}
+	ret = read_blk(info, blk, buf);
+	if (ret < 0) {
+		printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk);
+		goto out_buf;
+	}
+	dh = (struct qt_disk_dqdbheader *)buf;
+	le16_add_cpu(&dh->dqdh_entries, -1);
+	if (!le16_to_cpu(dh->dqdh_entries)) {	/* Block got free? */
+		ret = remove_free_dqentry(info, buf, blk);
+		if (ret >= 0)
+			ret = put_free_dqblk(info, buf, blk);
+		if (ret < 0) {
+			printk(KERN_ERR "VFS: Can't move quota data block (%u) "
+			  "to free list.\n", blk);
+			goto out_buf;
+		}
+	} else {
+		/* Wipe just this entry; offset within the block is the low bits */
+		memset(buf +
+		       (dquot->dq_off & ((1 << info->dqi_blocksize_bits) - 1)),
+		       0, info->dqi_entry_size);
+		if (le16_to_cpu(dh->dqdh_entries) ==
+		    qtree_dqstr_in_blk(info) - 1) {
+			/* Insert will write block itself */
+			ret = insert_free_dqentry(info, buf, blk);
+			if (ret < 0) {
+				printk(KERN_ERR "VFS: Can't insert quota data "
+				  "block (%u) to free entry list.\n", blk);
+				goto out_buf;
+			}
+		} else {
+			ret = write_blk(info, blk, buf);
+			if (ret < 0) {
+				printk(KERN_ERR "VFS: Can't write quota data "
+				  "block %u\n", blk);
+				goto out_buf;
+			}
+		}
+	}
+	dquot->dq_off = 0;	/* Quota is now unattached */
+out_buf:
+	freedqbuf(buf);
+	return ret;
+}
+
+/*
+ * Remove reference to dquot from tree.
+ *
+ * Recursively descends from tree block *@blk at level @depth. At the last
+ * level the data entry is released via free_dqentry(). On the way back up,
+ * a reference whose child went away is cleared; a tree block left with no
+ * references is returned to the free-block list and *@blk is set to 0 so the
+ * parent clears its reference as well. Returns a negative error on failure.
+ */
+static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot,
+ uint *blk, int depth)
+{
+ dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+ int ret = 0;
+ uint newblk;
+ __le32 *ref = (__le32 *)buf;
+
+ if (!buf)
+ return -ENOMEM;
+ ret = read_blk(info, *blk, buf);
+ if (ret < 0) {
+ printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk);
+ goto out_buf;
+ }
+ newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
+ if (depth == info->dqi_qtree_depth - 1) {
+ ret = free_dqentry(info, dquot, newblk);
+ /* The leaf reference is always dropped once the entry is freed */
+ newblk = 0;
+ } else {
+ ret = remove_tree(info, dquot, &newblk, depth+1);
+ }
+ if (ret >= 0 && !newblk) {
+ int i;
+ ref[get_index(info, dquot->dq_id, depth)] = cpu_to_le32(0);
+ /* Block got empty? */
+ for (i = 0;
+ i < (info->dqi_usable_bs >> 2) && !ref[i];
+ i++);
+ /* Don't put the root block into the free block list */
+ if (i == (info->dqi_usable_bs >> 2)
+ && *blk != QT_TREEOFF) {
+ /* Result of put_free_dqblk() is not checked here */
+ put_free_dqblk(info, buf, *blk);
+ *blk = 0;
+ } else {
+ ret = write_blk(info, *blk, buf);
+ if (ret < 0)
+ printk(KERN_ERR "VFS: Can't write quota tree "
+ "block %u.\n", *blk);
+ }
+ }
+out_buf:
+ freedqbuf(buf);
+ return ret;
+}
+
+/*
+ * Delete dquot from tree.
+ * A dquot without an on-disk offset was never stored, so there is nothing to
+ * remove and 0 is returned; otherwise the result of remove_tree() starting
+ * at the root block is returned.
+ */
+int qtree_delete_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
+{
+	if (dquot->dq_off) {
+		uint root = QT_TREEOFF;
+
+		return remove_tree(info, dquot, &root, 0);
+	}
+	/* Even not allocated? */
+	return 0;
+}
+EXPORT_SYMBOL(qtree_delete_dquot);
+
+/*
+ * Find entry in block.
+ *
+ * Scans data block @blk for the entry that the format's ->is_id() callback
+ * recognises as belonging to @dquot. Returns the entry's byte offset in the
+ * quota file, -ENOMEM, the negative error from read_blk(), or -EIO when the
+ * block holds no matching entry.
+ */
+static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info,
+				 struct dquot *dquot, uint blk)
+{
+	dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+	loff_t ret = 0;
+	int i;
+	char *ddquot;
+
+	if (!buf)
+		return -ENOMEM;
+	ret = read_blk(info, blk, buf);
+	if (ret < 0) {
+		printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
+		goto out_buf;
+	}
+	for (i = 0, ddquot = ((char *)buf) + sizeof(struct qt_disk_dqdbheader);
+	     i < qtree_dqstr_in_blk(info) && !info->dqi_ops->is_id(ddquot, dquot);
+	     i++, ddquot += info->dqi_entry_size);
+	if (i == qtree_dqstr_in_blk(info)) {
+		printk(KERN_ERR "VFS: Quota for id %u referenced "
+		  "but not present.\n", dquot->dq_id);
+		ret = -EIO;
+		goto out_buf;
+	} else {
+		/*
+		 * Widen the block number before shifting so the byte offset
+		 * does not wrap at 32 bits for large quota files.
+		 */
+		ret = ((loff_t)blk << info->dqi_blocksize_bits) + sizeof(struct
+		  qt_disk_dqdbheader) + i * info->dqi_entry_size;
+	}
+out_buf:
+	freedqbuf(buf);
+	return ret;
+}
+
+/*
+ * Find entry for given id in the tree.
+ *
+ * Follows the reference for dquot->dq_id in tree block @blk at level @depth,
+ * recursing until the last tree level, where the data block is searched.
+ * Returns the entry's byte offset, 0 when the tree holds no reference for
+ * the id, or a negative error.
+ */
+static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info,
+ struct dquot *dquot, uint blk, int depth)
+{
+ dqbuf_t buf = getdqbuf(info->dqi_usable_bs);
+ loff_t ret = 0;
+ __le32 *ref = (__le32 *)buf;
+
+ if (!buf)
+ return -ENOMEM;
+ ret = read_blk(info, blk, buf);
+ if (ret < 0) {
+ printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
+ goto out_buf;
+ }
+ /* Discard the byte count from read_blk(); 0 now means "not found" */
+ ret = 0;
+ blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]);
+ if (!blk) /* No reference? */
+ goto out_buf;
+ if (depth < info->dqi_qtree_depth - 1)
+ ret = find_tree_dqentry(info, dquot, blk, depth+1);
+ else
+ ret = find_block_dqentry(info, dquot, blk);
+out_buf:
+ freedqbuf(buf);
+ return ret;
+}
+
+/*
+ * Find entry for given id in the tree - wrapper function.
+ * Starts the lookup at the root tree block (QT_TREEOFF), depth 0.
+ */
+static inline loff_t find_dqentry(struct qtree_mem_dqinfo *info,
+ struct dquot *dquot)
+{
+ return find_tree_dqentry(info, dquot, QT_TREEOFF, 0);
+}
+
+/*
+ * Read the on-disk entry for @dquot into dquot->dq_dqb.
+ *
+ * If the dquot has no known offset the tree is searched first. When no entry
+ * exists (or the lookup fails) the dquot is zeroed and flagged DQ_FAKE_B.
+ *
+ * Returns 0 when no entry exists, a negative error on failure, and otherwise
+ * the byte count returned by ->quota_read() (equal to info->dqi_entry_size).
+ */
+int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
+{
+ int type = dquot->dq_type;
+ struct super_block *sb = dquot->dq_sb;
+ loff_t offset;
+ dqbuf_t ddquot;
+ int ret = 0;
+
+#ifdef __QUOTA_QT_PARANOIA
+ /* Invalidated quota? */
+ if (!sb_dqopt(dquot->dq_sb)->files[type]) {
+ printk(KERN_ERR "VFS: Quota invalidated while reading!\n");
+ return -EIO;
+ }
+#endif
+ /* Do we know offset of the dquot entry in the quota file? */
+ if (!dquot->dq_off) {
+ offset = find_dqentry(info, dquot);
+ if (offset <= 0) { /* Entry not present? */
+ if (offset < 0)
+ printk(KERN_ERR "VFS: Can't read quota "
+ "structure for id %u.\n", dquot->dq_id);
+ dquot->dq_off = 0;
+ set_bit(DQ_FAKE_B, &dquot->dq_flags);
+ memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
+ ret = offset;
+ goto out;
+ }
+ dquot->dq_off = offset;
+ }
+ ddquot = getdqbuf(info->dqi_entry_size);
+ /* NOTE: this early return bypasses the dqstats.reads++ at "out" */
+ if (!ddquot)
+ return -ENOMEM;
+ ret = sb->s_op->quota_read(sb, type, (char *)ddquot,
+ info->dqi_entry_size, dquot->dq_off);
+ if (ret != info->dqi_entry_size) {
+ /* A short but non-negative read is reported as -EIO */
+ if (ret >= 0)
+ ret = -EIO;
+ printk(KERN_ERR "VFS: Error while reading quota "
+ "structure for id %u.\n", dquot->dq_id);
+ set_bit(DQ_FAKE_B, &dquot->dq_flags);
+ memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
+ freedqbuf(ddquot);
+ goto out;
+ }
+ spin_lock(&dq_data_lock);
+ info->dqi_ops->disk2mem_dqblk(dquot, ddquot);
+ /* An entry with no limits set at all is also flagged DQ_FAKE_B */
+ if (!dquot->dq_dqb.dqb_bhardlimit &&
+ !dquot->dq_dqb.dqb_bsoftlimit &&
+ !dquot->dq_dqb.dqb_ihardlimit &&
+ !dquot->dq_dqb.dqb_isoftlimit)
+ set_bit(DQ_FAKE_B, &dquot->dq_flags);
+ spin_unlock(&dq_data_lock);
+ freedqbuf(ddquot);
+out:
+ dqstats.reads++;
+ return ret;
+}
+EXPORT_SYMBOL(qtree_read_dquot);
+
+/*
+ * Check whether dquot should not be deleted. We know we are
+ * the only one operating on dquot (thanks to dq_lock).
+ *
+ * The on-disk entry is removed only for a dquot flagged DQ_FAKE_B that has
+ * no inode and no space usage recorded; otherwise 0 is returned.
+ */
+int qtree_release_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot)
+{
+	if (!test_bit(DQ_FAKE_B, &dquot->dq_flags))
+		return 0;
+	if (dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace)
+		return 0;
+	return qtree_delete_dquot(info, dquot);
+}
+EXPORT_SYMBOL(qtree_release_dquot);
diff --git a/fs/quota_tree.h b/fs/quota_tree.h
new file mode 100644
index 000000000000..a1ab8db81a51
--- /dev/null
+++ b/fs/quota_tree.h
@@ -0,0 +1,25 @@
+/*
+ * Definitions of structures for vfsv0 quota format
+ */
+
+#ifndef _LINUX_QUOTA_TREE_H
+#define _LINUX_QUOTA_TREE_H
+
+#include <linux/types.h>
+#include <linux/quota.h>
+
+/*
+ * Structure of header of block with quota structures. It is padded to 16 bytes so
+ * there will be space for exactly 21 quota-entries in a block
+ */
+struct qt_disk_dqdbheader {
+ __le32 dqdh_next_free; /* Number of next block with free entry */
+ __le32 dqdh_prev_free; /* Number of previous block with free entry */
+ __le16 dqdh_entries; /* Number of valid entries in block */
+ __le16 dqdh_pad1; /* Padding; with dqdh_pad2 rounds the header up to 16 bytes */
+ __le32 dqdh_pad2;
+};
+
+#define QT_TREEOFF 1 /* Offset of tree in file in blocks */
+
+#endif /* _LINUX_QUOTA_TREE_H */
diff --git a/fs/quota_v1.c b/fs/quota_v1.c
index 5ae15b13eeb0..b4af1c69ad16 100644
--- a/fs/quota_v1.c
+++ b/fs/quota_v1.c
@@ -3,25 +3,39 @@
#include <linux/quota.h>
#include <linux/quotaops.h>
#include <linux/dqblk_v1.h>
-#include <linux/quotaio_v1.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <asm/byteorder.h>
+#include "quotaio_v1.h"
+
MODULE_AUTHOR("Jan Kara");
MODULE_DESCRIPTION("Old quota format support");
MODULE_LICENSE("GPL");
+#define QUOTABLOCK_BITS 10
+#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS)
+
+static inline qsize_t v1_stoqb(qsize_t space)
+{
+ return (space + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS;
+}
+
+static inline qsize_t v1_qbtos(qsize_t blocks)
+{
+ return blocks << QUOTABLOCK_BITS;
+}
+
static void v1_disk2mem_dqblk(struct mem_dqblk *m, struct v1_disk_dqblk *d)
{
m->dqb_ihardlimit = d->dqb_ihardlimit;
m->dqb_isoftlimit = d->dqb_isoftlimit;
m->dqb_curinodes = d->dqb_curinodes;
- m->dqb_bhardlimit = d->dqb_bhardlimit;
- m->dqb_bsoftlimit = d->dqb_bsoftlimit;
- m->dqb_curspace = ((qsize_t)d->dqb_curblocks) << QUOTABLOCK_BITS;
+ m->dqb_bhardlimit = v1_qbtos(d->dqb_bhardlimit);
+ m->dqb_bsoftlimit = v1_qbtos(d->dqb_bsoftlimit);
+ m->dqb_curspace = v1_qbtos(d->dqb_curblocks);
m->dqb_itime = d->dqb_itime;
m->dqb_btime = d->dqb_btime;
}
@@ -31,9 +45,9 @@ static void v1_mem2disk_dqblk(struct v1_disk_dqblk *d, struct mem_dqblk *m)
d->dqb_ihardlimit = m->dqb_ihardlimit;
d->dqb_isoftlimit = m->dqb_isoftlimit;
d->dqb_curinodes = m->dqb_curinodes;
- d->dqb_bhardlimit = m->dqb_bhardlimit;
- d->dqb_bsoftlimit = m->dqb_bsoftlimit;
- d->dqb_curblocks = toqb(m->dqb_curspace);
+ d->dqb_bhardlimit = v1_stoqb(m->dqb_bhardlimit);
+ d->dqb_bsoftlimit = v1_stoqb(m->dqb_bsoftlimit);
+ d->dqb_curblocks = v1_stoqb(m->dqb_curspace);
d->dqb_itime = m->dqb_itime;
d->dqb_btime = m->dqb_btime;
}
diff --git a/fs/quota_v2.c b/fs/quota_v2.c
index b53827dc02d9..b618b563635c 100644
--- a/fs/quota_v2.c
+++ b/fs/quota_v2.c
@@ -6,7 +6,6 @@
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/dqblk_v2.h>
-#include <linux/quotaio_v2.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
@@ -15,16 +14,37 @@
#include <asm/byteorder.h>
+#include "quota_tree.h"
+#include "quotaio_v2.h"
+
MODULE_AUTHOR("Jan Kara");
MODULE_DESCRIPTION("Quota format v2 support");
MODULE_LICENSE("GPL");
#define __QUOTA_V2_PARANOIA
-typedef char *dqbuf_t;
+static void v2_mem2diskdqb(void *dp, struct dquot *dquot);
+static void v2_disk2memdqb(struct dquot *dquot, void *dp);
+static int v2_is_id(void *dp, struct dquot *dquot);
+
+static struct qtree_fmt_operations v2_qtree_ops = {
+ .mem2disk_dqblk = v2_mem2diskdqb,
+ .disk2mem_dqblk = v2_disk2memdqb,
+ .is_id = v2_is_id,
+};
+
+#define QUOTABLOCK_BITS 10
+#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS)
-#define GETIDINDEX(id, depth) (((id) >> ((V2_DQTREEDEPTH-(depth)-1)*8)) & 0xff)
-#define GETENTRIES(buf) ((struct v2_disk_dqblk *)(((char *)buf)+sizeof(struct v2_disk_dqdbheader)))
+static inline qsize_t v2_stoqb(qsize_t space)
+{
+ return (space + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS;
+}
+
+static inline qsize_t v2_qbtos(qsize_t blocks)
+{
+ return blocks << QUOTABLOCK_BITS;
+}
/* Check whether given file is really vfsv0 quotafile */
static int v2_check_quota_file(struct super_block *sb, int type)
@@ -50,7 +70,8 @@ static int v2_check_quota_file(struct super_block *sb, int type)
static int v2_read_file_info(struct super_block *sb, int type)
{
struct v2_disk_dqinfo dinfo;
- struct mem_dqinfo *info = sb_dqopt(sb)->info+type;
+ struct mem_dqinfo *info = sb_dqinfo(sb, type);
+ struct qtree_mem_dqinfo *qinfo;
ssize_t size;
size = sb->s_op->quota_read(sb, type, (char *)&dinfo,
@@ -60,15 +81,29 @@ static int v2_read_file_info(struct super_block *sb, int type)
sb->s_id);
return -1;
}
+ info->dqi_priv = kmalloc(sizeof(struct qtree_mem_dqinfo), GFP_NOFS);
+ if (!info->dqi_priv) {
+ printk(KERN_WARNING
+ "Not enough memory for quota information structure.\n");
+ return -1;
+ }
+ qinfo = info->dqi_priv;
/* limits are stored as unsigned 32-bit data */
info->dqi_maxblimit = 0xffffffff;
info->dqi_maxilimit = 0xffffffff;
info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
info->dqi_flags = le32_to_cpu(dinfo.dqi_flags);
- info->u.v2_i.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
- info->u.v2_i.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
- info->u.v2_i.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
+ qinfo->dqi_sb = sb;
+ qinfo->dqi_type = type;
+ qinfo->dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
+ qinfo->dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk);
+ qinfo->dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry);
+ qinfo->dqi_blocksize_bits = V2_DQBLKSIZE_BITS;
+ qinfo->dqi_usable_bs = 1 << V2_DQBLKSIZE_BITS;
+ qinfo->dqi_qtree_depth = qtree_depth(qinfo);
+ qinfo->dqi_entry_size = sizeof(struct v2_disk_dqblk);
+ qinfo->dqi_ops = &v2_qtree_ops;
return 0;
}
@@ -76,7 +111,8 @@ static int v2_read_file_info(struct super_block *sb, int type)
static int v2_write_file_info(struct super_block *sb, int type)
{
struct v2_disk_dqinfo dinfo;
- struct mem_dqinfo *info = sb_dqopt(sb)->info+type;
+ struct mem_dqinfo *info = sb_dqinfo(sb, type);
+ struct qtree_mem_dqinfo *qinfo = info->dqi_priv;
ssize_t size;
spin_lock(&dq_data_lock);
@@ -85,9 +121,9 @@ static int v2_write_file_info(struct super_block *sb, int type)
dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace);
dinfo.dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK);
spin_unlock(&dq_data_lock);
- dinfo.dqi_blocks = cpu_to_le32(info->u.v2_i.dqi_blocks);
- dinfo.dqi_free_blk = cpu_to_le32(info->u.v2_i.dqi_free_blk);
- dinfo.dqi_free_entry = cpu_to_le32(info->u.v2_i.dqi_free_entry);
+ dinfo.dqi_blocks = cpu_to_le32(qinfo->dqi_blocks);
+ dinfo.dqi_free_blk = cpu_to_le32(qinfo->dqi_free_blk);
+ dinfo.dqi_free_entry = cpu_to_le32(qinfo->dqi_free_entry);
size = sb->s_op->quota_write(sb, type, (char *)&dinfo,
sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF);
if (size != sizeof(struct v2_disk_dqinfo)) {
@@ -98,574 +134,75 @@ static int v2_write_file_info(struct super_block *sb, int type)
return 0;
}
-static void disk2memdqb(struct mem_dqblk *m, struct v2_disk_dqblk *d)
+static void v2_disk2memdqb(struct dquot *dquot, void *dp)
{
+ struct v2_disk_dqblk *d = dp, empty;
+ struct mem_dqblk *m = &dquot->dq_dqb;
+
m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit);
m->dqb_isoftlimit = le32_to_cpu(d->dqb_isoftlimit);
m->dqb_curinodes = le32_to_cpu(d->dqb_curinodes);
m->dqb_itime = le64_to_cpu(d->dqb_itime);
- m->dqb_bhardlimit = le32_to_cpu(d->dqb_bhardlimit);
- m->dqb_bsoftlimit = le32_to_cpu(d->dqb_bsoftlimit);
+ m->dqb_bhardlimit = v2_qbtos(le32_to_cpu(d->dqb_bhardlimit));
+ m->dqb_bsoftlimit = v2_qbtos(le32_to_cpu(d->dqb_bsoftlimit));
m->dqb_curspace = le64_to_cpu(d->dqb_curspace);
m->dqb_btime = le64_to_cpu(d->dqb_btime);
+ /* We need to escape back all-zero structure */
+ memset(&empty, 0, sizeof(struct v2_disk_dqblk));
+ empty.dqb_itime = cpu_to_le64(1);
+ if (!memcmp(&empty, dp, sizeof(struct v2_disk_dqblk)))
+ m->dqb_itime = 0;
}
-static void mem2diskdqb(struct v2_disk_dqblk *d, struct mem_dqblk *m, qid_t id)
+static void v2_mem2diskdqb(void *dp, struct dquot *dquot)
{
+ struct v2_disk_dqblk *d = dp;
+ struct mem_dqblk *m = &dquot->dq_dqb;
+ struct qtree_mem_dqinfo *info =
+ sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
+
d->dqb_ihardlimit = cpu_to_le32(m->dqb_ihardlimit);
d->dqb_isoftlimit = cpu_to_le32(m->dqb_isoftlimit);
d->dqb_curinodes = cpu_to_le32(m->dqb_curinodes);
d->dqb_itime = cpu_to_le64(m->dqb_itime);
- d->dqb_bhardlimit = cpu_to_le32(m->dqb_bhardlimit);
- d->dqb_bsoftlimit = cpu_to_le32(m->dqb_bsoftlimit);
+ d->dqb_bhardlimit = cpu_to_le32(v2_stoqb(m->dqb_bhardlimit));
+ d->dqb_bsoftlimit = cpu_to_le32(v2_stoqb(m->dqb_bsoftlimit));
d->dqb_curspace = cpu_to_le64(m->dqb_curspace);
d->dqb_btime = cpu_to_le64(m->dqb_btime);
- d->dqb_id = cpu_to_le32(id);
-}
-
-static dqbuf_t getdqbuf(void)
-{
- dqbuf_t buf = kmalloc(V2_DQBLKSIZE, GFP_NOFS);
- if (!buf)
- printk(KERN_WARNING "VFS: Not enough memory for quota buffers.\n");
- return buf;
-}
-
-static inline void freedqbuf(dqbuf_t buf)
-{
- kfree(buf);
-}
-
-static inline ssize_t read_blk(struct super_block *sb, int type, uint blk, dqbuf_t buf)
-{
- memset(buf, 0, V2_DQBLKSIZE);
- return sb->s_op->quota_read(sb, type, (char *)buf,
- V2_DQBLKSIZE, blk << V2_DQBLKSIZE_BITS);
-}
-
-static inline ssize_t write_blk(struct super_block *sb, int type, uint blk, dqbuf_t buf)
-{
- return sb->s_op->quota_write(sb, type, (char *)buf,
- V2_DQBLKSIZE, blk << V2_DQBLKSIZE_BITS);
-}
-
-/* Remove empty block from list and return it */
-static int get_free_dqblk(struct super_block *sb, int type)
-{
- dqbuf_t buf = getdqbuf();
- struct mem_dqinfo *info = sb_dqinfo(sb, type);
- struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf;
- int ret, blk;
-
- if (!buf)
- return -ENOMEM;
- if (info->u.v2_i.dqi_free_blk) {
- blk = info->u.v2_i.dqi_free_blk;
- if ((ret = read_blk(sb, type, blk, buf)) < 0)
- goto out_buf;
- info->u.v2_i.dqi_free_blk = le32_to_cpu(dh->dqdh_next_free);
- }
- else {
- memset(buf, 0, V2_DQBLKSIZE);
- /* Assure block allocation... */
- if ((ret = write_blk(sb, type, info->u.v2_i.dqi_blocks, buf)) < 0)
- goto out_buf;
- blk = info->u.v2_i.dqi_blocks++;
- }
- mark_info_dirty(sb, type);
- ret = blk;
-out_buf:
- freedqbuf(buf);
- return ret;
-}
-
-/* Insert empty block to the list */
-static int put_free_dqblk(struct super_block *sb, int type, dqbuf_t buf, uint blk)
-{
- struct mem_dqinfo *info = sb_dqinfo(sb, type);
- struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf;
- int err;
-
- dh->dqdh_next_free = cpu_to_le32(info->u.v2_i.dqi_free_blk);
- dh->dqdh_prev_free = cpu_to_le32(0);
- dh->dqdh_entries = cpu_to_le16(0);
- info->u.v2_i.dqi_free_blk = blk;
- mark_info_dirty(sb, type);
- /* Some strange block. We had better leave it... */
- if ((err = write_blk(sb, type, blk, buf)) < 0)
- return err;
- return 0;
+ d->dqb_id = cpu_to_le32(dquot->dq_id);
+ if (qtree_entry_unused(info, dp))
+ d->dqb_itime = cpu_to_le64(1);
}
-/* Remove given block from the list of blocks with free entries */
-static int remove_free_dqentry(struct super_block *sb, int type, dqbuf_t buf, uint blk)
+static int v2_is_id(void *dp, struct dquot *dquot)
{
- dqbuf_t tmpbuf = getdqbuf();
- struct mem_dqinfo *info = sb_dqinfo(sb, type);
- struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf;
- uint nextblk = le32_to_cpu(dh->dqdh_next_free), prevblk = le32_to_cpu(dh->dqdh_prev_free);
- int err;
+ struct v2_disk_dqblk *d = dp;
+ struct qtree_mem_dqinfo *info =
+ sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv;
- if (!tmpbuf)
- return -ENOMEM;
- if (nextblk) {
- if ((err = read_blk(sb, type, nextblk, tmpbuf)) < 0)
- goto out_buf;
- ((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = dh->dqdh_prev_free;
- if ((err = write_blk(sb, type, nextblk, tmpbuf)) < 0)
- goto out_buf;
- }
- if (prevblk) {
- if ((err = read_blk(sb, type, prevblk, tmpbuf)) < 0)
- goto out_buf;
- ((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_next_free = dh->dqdh_next_free;
- if ((err = write_blk(sb, type, prevblk, tmpbuf)) < 0)
- goto out_buf;
- }
- else {
- info->u.v2_i.dqi_free_entry = nextblk;
- mark_info_dirty(sb, type);
- }
- freedqbuf(tmpbuf);
- dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0);
- /* No matter whether write succeeds block is out of list */
- if (write_blk(sb, type, blk, buf) < 0)
- printk(KERN_ERR "VFS: Can't write block (%u) with free entries.\n", blk);
- return 0;
-out_buf:
- freedqbuf(tmpbuf);
- return err;
-}
-
-/* Insert given block to the beginning of list with free entries */
-static int insert_free_dqentry(struct super_block *sb, int type, dqbuf_t buf, uint blk)
-{
- dqbuf_t tmpbuf = getdqbuf();
- struct mem_dqinfo *info = sb_dqinfo(sb, type);
- struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf;
- int err;
-
- if (!tmpbuf)
- return -ENOMEM;
- dh->dqdh_next_free = cpu_to_le32(info->u.v2_i.dqi_free_entry);
- dh->dqdh_prev_free = cpu_to_le32(0);
- if ((err = write_blk(sb, type, blk, buf)) < 0)
- goto out_buf;
- if (info->u.v2_i.dqi_free_entry) {
- if ((err = read_blk(sb, type, info->u.v2_i.dqi_free_entry, tmpbuf)) < 0)
- goto out_buf;
- ((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = cpu_to_le32(blk);
- if ((err = write_blk(sb, type, info->u.v2_i.dqi_free_entry, tmpbuf)) < 0)
- goto out_buf;
- }
- freedqbuf(tmpbuf);
- info->u.v2_i.dqi_free_entry = blk;
- mark_info_dirty(sb, type);
- return 0;
-out_buf:
- freedqbuf(tmpbuf);
- return err;
-}
-
-/* Find space for dquot */
-static uint find_free_dqentry(struct dquot *dquot, int *err)
-{
- struct super_block *sb = dquot->dq_sb;
- struct mem_dqinfo *info = sb_dqopt(sb)->info+dquot->dq_type;
- uint blk, i;
- struct v2_disk_dqdbheader *dh;
- struct v2_disk_dqblk *ddquot;
- struct v2_disk_dqblk fakedquot;
- dqbuf_t buf;
-
- *err = 0;
- if (!(buf = getdqbuf())) {
- *err = -ENOMEM;
+ if (qtree_entry_unused(info, dp))
return 0;
- }
- dh = (struct v2_disk_dqdbheader *)buf;
- ddquot = GETENTRIES(buf);
- if (info->u.v2_i.dqi_free_entry) {
- blk = info->u.v2_i.dqi_free_entry;
- if ((*err = read_blk(sb, dquot->dq_type, blk, buf)) < 0)
- goto out_buf;
- }
- else {
- blk = get_free_dqblk(sb, dquot->dq_type);
- if ((int)blk < 0) {
- *err = blk;
- freedqbuf(buf);
- return 0;
- }
- memset(buf, 0, V2_DQBLKSIZE);
- /* This is enough as block is already zeroed and entry list is empty... */
- info->u.v2_i.dqi_free_entry = blk;
- mark_info_dirty(sb, dquot->dq_type);
- }
- if (le16_to_cpu(dh->dqdh_entries)+1 >= V2_DQSTRINBLK) /* Block will be full? */
- if ((*err = remove_free_dqentry(sb, dquot->dq_type, buf, blk)) < 0) {
- printk(KERN_ERR "VFS: find_free_dqentry(): Can't remove block (%u) from entry free list.\n", blk);
- goto out_buf;
- }
- le16_add_cpu(&dh->dqdh_entries, 1);
- memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk));
- /* Find free structure in block */
- for (i = 0; i < V2_DQSTRINBLK && memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk)); i++);
-#ifdef __QUOTA_V2_PARANOIA
- if (i == V2_DQSTRINBLK) {
- printk(KERN_ERR "VFS: find_free_dqentry(): Data block full but it shouldn't.\n");
- *err = -EIO;
- goto out_buf;
- }
-#endif
- if ((*err = write_blk(sb, dquot->dq_type, blk, buf)) < 0) {
- printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota data block %u.\n", blk);
- goto out_buf;
- }
- dquot->dq_off = (blk<<V2_DQBLKSIZE_BITS)+sizeof(struct v2_disk_dqdbheader)+i*sizeof(struct v2_disk_dqblk);
- freedqbuf(buf);
- return blk;
-out_buf:
- freedqbuf(buf);
- return 0;
-}
-
-/* Insert reference to structure into the trie */
-static int do_insert_tree(struct dquot *dquot, uint *treeblk, int depth)
-{
- struct super_block *sb = dquot->dq_sb;
- dqbuf_t buf;
- int ret = 0, newson = 0, newact = 0;
- __le32 *ref;
- uint newblk;
-
- if (!(buf = getdqbuf()))
- return -ENOMEM;
- if (!*treeblk) {
- ret = get_free_dqblk(sb, dquot->dq_type);
- if (ret < 0)
- goto out_buf;
- *treeblk = ret;
- memset(buf, 0, V2_DQBLKSIZE);
- newact = 1;
- }
- else {
- if ((ret = read_blk(sb, dquot->dq_type, *treeblk, buf)) < 0) {
- printk(KERN_ERR "VFS: Can't read tree quota block %u.\n", *treeblk);
- goto out_buf;
- }
- }
- ref = (__le32 *)buf;
- newblk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]);
- if (!newblk)
- newson = 1;
- if (depth == V2_DQTREEDEPTH-1) {
-#ifdef __QUOTA_V2_PARANOIA
- if (newblk) {
- printk(KERN_ERR "VFS: Inserting already present quota entry (block %u).\n", le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]));
- ret = -EIO;
- goto out_buf;
- }
-#endif
- newblk = find_free_dqentry(dquot, &ret);
- }
- else
- ret = do_insert_tree(dquot, &newblk, depth+1);
- if (newson && ret >= 0) {
- ref[GETIDINDEX(dquot->dq_id, depth)] = cpu_to_le32(newblk);
- ret = write_blk(sb, dquot->dq_type, *treeblk, buf);
- }
- else if (newact && ret < 0)
- put_free_dqblk(sb, dquot->dq_type, buf, *treeblk);
-out_buf:
- freedqbuf(buf);
- return ret;
+ return le32_to_cpu(d->dqb_id) == dquot->dq_id;
}
-/* Wrapper for inserting quota structure into tree */
-static inline int dq_insert_tree(struct dquot *dquot)
+static int v2_read_dquot(struct dquot *dquot)
{
- int tmp = V2_DQTREEOFF;
- return do_insert_tree(dquot, &tmp, 0);
+ return qtree_read_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot);
}
-/*
- * We don't have to be afraid of deadlocks as we never have quotas on quota files...
- */
static int v2_write_dquot(struct dquot *dquot)
{
- int type = dquot->dq_type;
- ssize_t ret;
- struct v2_disk_dqblk ddquot, empty;
-
- /* dq_off is guarded by dqio_mutex */
- if (!dquot->dq_off)
- if ((ret = dq_insert_tree(dquot)) < 0) {
- printk(KERN_ERR "VFS: Error %zd occurred while creating quota.\n", ret);
- return ret;
- }
- spin_lock(&dq_data_lock);
- mem2diskdqb(&ddquot, &dquot->dq_dqb, dquot->dq_id);
- /* Argh... We may need to write structure full of zeroes but that would be
- * treated as an empty place by the rest of the code. Format change would
- * be definitely cleaner but the problems probably are not worth it */
- memset(&empty, 0, sizeof(struct v2_disk_dqblk));
- if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk)))
- ddquot.dqb_itime = cpu_to_le64(1);
- spin_unlock(&dq_data_lock);
- ret = dquot->dq_sb->s_op->quota_write(dquot->dq_sb, type,
- (char *)&ddquot, sizeof(struct v2_disk_dqblk), dquot->dq_off);
- if (ret != sizeof(struct v2_disk_dqblk)) {
- printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", dquot->dq_sb->s_id);
- if (ret >= 0)
- ret = -ENOSPC;
- }
- else
- ret = 0;
- dqstats.writes++;
-
- return ret;
+ return qtree_write_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot);
}
-/* Free dquot entry in data block */
-static int free_dqentry(struct dquot *dquot, uint blk)
-{
- struct super_block *sb = dquot->dq_sb;
- int type = dquot->dq_type;
- struct v2_disk_dqdbheader *dh;
- dqbuf_t buf = getdqbuf();
- int ret = 0;
-
- if (!buf)
- return -ENOMEM;
- if (dquot->dq_off >> V2_DQBLKSIZE_BITS != blk) {
- printk(KERN_ERR "VFS: Quota structure has offset to other "
- "block (%u) than it should (%u).\n", blk,
- (uint)(dquot->dq_off >> V2_DQBLKSIZE_BITS));
- goto out_buf;
- }
- if ((ret = read_blk(sb, type, blk, buf)) < 0) {
- printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk);
- goto out_buf;
- }
- dh = (struct v2_disk_dqdbheader *)buf;
- le16_add_cpu(&dh->dqdh_entries, -1);
- if (!le16_to_cpu(dh->dqdh_entries)) { /* Block got free? */
- if ((ret = remove_free_dqentry(sb, type, buf, blk)) < 0 ||
- (ret = put_free_dqblk(sb, type, buf, blk)) < 0) {
- printk(KERN_ERR "VFS: Can't move quota data block (%u) "
- "to free list.\n", blk);
- goto out_buf;
- }
- }
- else {
- memset(buf+(dquot->dq_off & ((1 << V2_DQBLKSIZE_BITS)-1)), 0,
- sizeof(struct v2_disk_dqblk));
- if (le16_to_cpu(dh->dqdh_entries) == V2_DQSTRINBLK-1) {
- /* Insert will write block itself */
- if ((ret = insert_free_dqentry(sb, type, buf, blk)) < 0) {
- printk(KERN_ERR "VFS: Can't insert quota data block (%u) to free entry list.\n", blk);
- goto out_buf;
- }
- }
- else
- if ((ret = write_blk(sb, type, blk, buf)) < 0) {
- printk(KERN_ERR "VFS: Can't write quota data "
- "block %u\n", blk);
- goto out_buf;
- }
- }
- dquot->dq_off = 0; /* Quota is now unattached */
-out_buf:
- freedqbuf(buf);
- return ret;
-}
-
-/* Remove reference to dquot from tree */
-static int remove_tree(struct dquot *dquot, uint *blk, int depth)
-{
- struct super_block *sb = dquot->dq_sb;
- int type = dquot->dq_type;
- dqbuf_t buf = getdqbuf();
- int ret = 0;
- uint newblk;
- __le32 *ref = (__le32 *)buf;
-
- if (!buf)
- return -ENOMEM;
- if ((ret = read_blk(sb, type, *blk, buf)) < 0) {
- printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk);
- goto out_buf;
- }
- newblk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]);
- if (depth == V2_DQTREEDEPTH-1) {
- ret = free_dqentry(dquot, newblk);
- newblk = 0;
- }
- else
- ret = remove_tree(dquot, &newblk, depth+1);
- if (ret >= 0 && !newblk) {
- int i;
- ref[GETIDINDEX(dquot->dq_id, depth)] = cpu_to_le32(0);
- for (i = 0; i < V2_DQBLKSIZE && !buf[i]; i++); /* Block got empty? */
- /* Don't put the root block into the free block list */
- if (i == V2_DQBLKSIZE && *blk != V2_DQTREEOFF) {
- put_free_dqblk(sb, type, buf, *blk);
- *blk = 0;
- }
- else
- if ((ret = write_blk(sb, type, *blk, buf)) < 0)
- printk(KERN_ERR "VFS: Can't write quota tree "
- "block %u.\n", *blk);
- }
-out_buf:
- freedqbuf(buf);
- return ret;
-}
-
-/* Delete dquot from tree */
-static int v2_delete_dquot(struct dquot *dquot)
-{
- uint tmp = V2_DQTREEOFF;
-
- if (!dquot->dq_off) /* Even not allocated? */
- return 0;
- return remove_tree(dquot, &tmp, 0);
-}
-
-/* Find entry in block */
-static loff_t find_block_dqentry(struct dquot *dquot, uint blk)
-{
- dqbuf_t buf = getdqbuf();
- loff_t ret = 0;
- int i;
- struct v2_disk_dqblk *ddquot = GETENTRIES(buf);
-
- if (!buf)
- return -ENOMEM;
- if ((ret = read_blk(dquot->dq_sb, dquot->dq_type, blk, buf)) < 0) {
- printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
- goto out_buf;
- }
- if (dquot->dq_id)
- for (i = 0; i < V2_DQSTRINBLK &&
- le32_to_cpu(ddquot[i].dqb_id) != dquot->dq_id; i++);
- else { /* ID 0 as a bit more complicated searching... */
- struct v2_disk_dqblk fakedquot;
-
- memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk));
- for (i = 0; i < V2_DQSTRINBLK; i++)
- if (!le32_to_cpu(ddquot[i].dqb_id) &&
- memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk)))
- break;
- }
- if (i == V2_DQSTRINBLK) {
- printk(KERN_ERR "VFS: Quota for id %u referenced "
- "but not present.\n", dquot->dq_id);
- ret = -EIO;
- goto out_buf;
- }
- else
- ret = (blk << V2_DQBLKSIZE_BITS) + sizeof(struct
- v2_disk_dqdbheader) + i * sizeof(struct v2_disk_dqblk);
-out_buf:
- freedqbuf(buf);
- return ret;
-}
-
-/* Find entry for given id in the tree */
-static loff_t find_tree_dqentry(struct dquot *dquot, uint blk, int depth)
-{
- dqbuf_t buf = getdqbuf();
- loff_t ret = 0;
- __le32 *ref = (__le32 *)buf;
-
- if (!buf)
- return -ENOMEM;
- if ((ret = read_blk(dquot->dq_sb, dquot->dq_type, blk, buf)) < 0) {
- printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk);
- goto out_buf;
- }
- ret = 0;
- blk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]);
- if (!blk) /* No reference? */
- goto out_buf;
- if (depth < V2_DQTREEDEPTH-1)
- ret = find_tree_dqentry(dquot, blk, depth+1);
- else
- ret = find_block_dqentry(dquot, blk);
-out_buf:
- freedqbuf(buf);
- return ret;
-}
-
-/* Find entry for given id in the tree - wrapper function */
-static inline loff_t find_dqentry(struct dquot *dquot)
-{
- return find_tree_dqentry(dquot, V2_DQTREEOFF, 0);
-}
-
-static int v2_read_dquot(struct dquot *dquot)
+static int v2_release_dquot(struct dquot *dquot)
{
- int type = dquot->dq_type;
- loff_t offset;
- struct v2_disk_dqblk ddquot, empty;
- int ret = 0;
-
-#ifdef __QUOTA_V2_PARANOIA
- /* Invalidated quota? */
- if (!dquot->dq_sb || !sb_dqopt(dquot->dq_sb)->files[type]) {
- printk(KERN_ERR "VFS: Quota invalidated while reading!\n");
- return -EIO;
- }
-#endif
- offset = find_dqentry(dquot);
- if (offset <= 0) { /* Entry not present? */
- if (offset < 0)
- printk(KERN_ERR "VFS: Can't read quota "
- "structure for id %u.\n", dquot->dq_id);
- dquot->dq_off = 0;
- set_bit(DQ_FAKE_B, &dquot->dq_flags);
- memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk));
- ret = offset;
- }
- else {
- dquot->dq_off = offset;
- if ((ret = dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type,
- (char *)&ddquot, sizeof(struct v2_disk_dqblk), offset))
- != sizeof(struct v2_disk_dqblk)) {
- if (ret >= 0)
- ret = -EIO;
- printk(KERN_ERR "VFS: Error while reading quota "
- "structure for id %u.\n", dquot->dq_id);
- memset(&ddquot, 0, sizeof(struct v2_disk_dqblk));
- }
- else {
- ret = 0;
- /* We need to escape back all-zero structure */
- memset(&empty, 0, sizeof(struct v2_disk_dqblk));
- empty.dqb_itime = cpu_to_le64(1);
- if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk)))
- ddquot.dqb_itime = 0;
- }
- disk2memdqb(&dquot->dq_dqb, &ddquot);
- if (!dquot->dq_dqb.dqb_bhardlimit &&
- !dquot->dq_dqb.dqb_bsoftlimit &&
- !dquot->dq_dqb.dqb_ihardlimit &&
- !dquot->dq_dqb.dqb_isoftlimit)
- set_bit(DQ_FAKE_B, &dquot->dq_flags);
- }
- dqstats.reads++;
-
- return ret;
+ return qtree_release_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot);
}
-/* Check whether dquot should not be deleted. We know we are
- * the only one operating on dquot (thanks to dq_lock) */
-static int v2_release_dquot(struct dquot *dquot)
+static int v2_free_file_info(struct super_block *sb, int type)
{
- if (test_bit(DQ_FAKE_B, &dquot->dq_flags) && !(dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace))
- return v2_delete_dquot(dquot);
+ kfree(sb_dqinfo(sb, type)->dqi_priv);
return 0;
}
@@ -673,7 +210,7 @@ static struct quota_format_ops v2_format_ops = {
.check_quota_file = v2_check_quota_file,
.read_file_info = v2_read_file_info,
.write_file_info = v2_write_file_info,
- .free_file_info = NULL,
+ .free_file_info = v2_free_file_info,
.read_dqblk = v2_read_dquot,
.commit_dqblk = v2_write_dquot,
.release_dqblk = v2_release_dquot,
diff --git a/fs/quotaio_v1.h b/fs/quotaio_v1.h
new file mode 100644
index 000000000000..746654b5de70
--- /dev/null
+++ b/fs/quotaio_v1.h
@@ -0,0 +1,33 @@
+#ifndef _LINUX_QUOTAIO_V1_H
+#define _LINUX_QUOTAIO_V1_H
+
+#include <linux/types.h>
+
+/*
+ * The following constants define the amount of time given a user
+ * before the soft limits are treated as hard limits (usually resulting
+ * in an allocation failure). The timer is started when the user crosses
+ * their soft limit; it is reset when they go below their soft limit.
+ */
+#define MAX_IQ_TIME 604800 /* (7*24*60*60) 1 week */
+#define MAX_DQ_TIME 604800 /* (7*24*60*60) 1 week */
+
+/*
+ * The following structure defines the format of the disk quota file
+ * (as it appears on disk) - the file is an array of these structures
+ * indexed by user or group number.
+ */
+struct v1_disk_dqblk {
+ __u32 dqb_bhardlimit; /* absolute limit on disk blks alloc */
+ __u32 dqb_bsoftlimit; /* preferred limit on disk blks */
+ __u32 dqb_curblocks; /* current block count */
+ __u32 dqb_ihardlimit; /* absolute limit on allocated inodes */
+ __u32 dqb_isoftlimit; /* preferred inode limit */
+ __u32 dqb_curinodes; /* current # allocated inodes */
+ time_t dqb_btime; /* time limit for excessive disk use */
+ time_t dqb_itime; /* time limit for excessive inode use */
+};
+
+#define v1_dqoff(UID) ((loff_t)((UID) * sizeof (struct v1_disk_dqblk)))
+
+#endif /* _LINUX_QUOTAIO_V1_H */
diff --git a/fs/quotaio_v2.h b/fs/quotaio_v2.h
new file mode 100644
index 000000000000..530fe580685c
--- /dev/null
+++ b/fs/quotaio_v2.h
@@ -0,0 +1,60 @@
+/*
+ * Definitions of structures for vfsv0 quota format
+ */
+
+#ifndef _LINUX_QUOTAIO_V2_H
+#define _LINUX_QUOTAIO_V2_H
+
+#include <linux/types.h>
+#include <linux/quota.h>
+
+/*
+ * Definitions of magics and versions of current quota files
+ */
+#define V2_INITQMAGICS {\
+ 0xd9c01f11, /* USRQUOTA */\
+ 0xd9c01927 /* GRPQUOTA */\
+}
+
+#define V2_INITQVERSIONS {\
+ 0, /* USRQUOTA */\
+ 0 /* GRPQUOTA */\
+}
+
+/* First generic header */
+struct v2_disk_dqheader {
+ __le32 dqh_magic; /* Magic number identifying file */
+ __le32 dqh_version; /* File version */
+};
+
+/*
+ * The following structure defines the format of the disk quota file
+ * (as it appears on disk) - the file is a radix tree whose leaves point
+ * to blocks of these structures.
+ */
+struct v2_disk_dqblk {
+ __le32 dqb_id; /* id this quota applies to */
+ __le32 dqb_ihardlimit; /* absolute limit on allocated inodes */
+ __le32 dqb_isoftlimit; /* preferred inode limit */
+ __le32 dqb_curinodes; /* current # allocated inodes */
+ __le32 dqb_bhardlimit; /* absolute limit on disk space (in QUOTABLOCK_SIZE) */
+ __le32 dqb_bsoftlimit; /* preferred limit on disk space (in QUOTABLOCK_SIZE) */
+ __le64 dqb_curspace; /* current space occupied (in bytes) */
+ __le64 dqb_btime; /* time limit for excessive disk use */
+ __le64 dqb_itime; /* time limit for excessive inode use */
+};
+
+/* Header with type and version specific information */
+struct v2_disk_dqinfo {
+ __le32 dqi_bgrace; /* Time before block soft limit becomes hard limit */
+ __le32 dqi_igrace; /* Time before inode soft limit becomes hard limit */
+ __le32 dqi_flags; /* Flags for quotafile (DQF_*) */
+ __le32 dqi_blocks; /* Number of blocks in file */
+ __le32 dqi_free_blk; /* Number of first free block in the list */
+ __le32 dqi_free_entry; /* Number of block with at least one free entry */
+};
+
+#define V2_DQINFOOFF sizeof(struct v2_disk_dqheader) /* Offset of info header in file */
+#define V2_DQBLKSIZE_BITS 10 /* log2 of size of leaf block in tree (block is 1 << 10 bytes) */
+
+#endif /* _LINUX_QUOTAIO_V2_H */
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a83a3518ae33..b7e6ac706b87 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -57,7 +57,6 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev)
inode->i_mode = mode;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
- inode->i_blocks = 0;
inode->i_mapping->a_ops = &ramfs_aops;
inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
diff --git a/fs/read_write.c b/fs/read_write.c
index 969a6d9c020b..5cc6924eb158 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -50,6 +50,14 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin)
offset += inode->i_size;
break;
case SEEK_CUR:
+ /*
+ * Here we special-case the lseek(fd, 0, SEEK_CUR)
+ * position-querying operation. Avoid rewriting the "same"
+ * f_pos value back to the file because a concurrent read(),
+ * write() or lseek() might have altered it
+ */
+ if (offset == 0)
+ return file->f_pos;
offset += file->f_pos;
break;
}
@@ -105,6 +113,10 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
offset += i_size_read(file->f_path.dentry->d_inode);
break;
case SEEK_CUR:
+ if (offset == 0) {
+ retval = file->f_pos;
+ goto out;
+ }
offset += file->f_pos;
}
retval = -EINVAL;
@@ -115,6 +127,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin)
}
retval = offset;
}
+out:
unlock_kernel();
return retval;
}
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 6c4c2c69449f..55fce92cdf18 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1753,6 +1753,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
struct inode *inode)
{
struct super_block *sb;
+ struct reiserfs_iget_args args;
INITIALIZE_PATH(path_to_key);
struct cpu_key key;
struct item_head ih;
@@ -1780,6 +1781,20 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
err = -ENOMEM;
goto out_bad_inode;
}
+ args.objectid = inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid);
+ if (old_format_only(sb))
+ make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET,
+ TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
+ else
+ make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET,
+ TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
+ memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE);
+ args.dirid = le32_to_cpu(ih.ih_key.k_dir_id);
+ if (insert_inode_locked4(inode, args.objectid,
+ reiserfs_find_actor, &args) < 0) {
+ err = -EINVAL;
+ goto out_bad_inode;
+ }
if (old_format_only(sb))
/* not a perfect generation count, as object ids can be reused, but
** this is as good as reiserfs can do right now.
@@ -1825,13 +1840,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
reiserfs_init_acl_default(inode);
reiserfs_init_xattr_rwsem(inode);
- if (old_format_only(sb))
- make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET,
- TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT);
- else
- make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET,
- TYPE_STAT_DATA, SD_SIZE, MAX_US_INT);
-
/* key to search for correct place for new stat data */
_make_cpu_key(&key, KEY_FORMAT_3_6, le32_to_cpu(ih.ih_key.k_dir_id),
le32_to_cpu(ih.ih_key.k_objectid), SD_OFFSET,
@@ -1859,13 +1867,9 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
} else {
inode2sd(&sd, inode, inode->i_size);
}
- // these do not go to on-disk stat data
- inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid);
-
// store in in-core inode the key of stat data and version all
// object items will have (directory items will have old offset
// format, other new objects will consist of new items)
- memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE);
if (old_format_only(sb) || S_ISDIR(mode) || S_ISLNK(mode))
set_inode_item_key_version(inode, KEY_FORMAT_3_5);
else
@@ -1929,7 +1933,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
reiserfs_mark_inode_private(inode);
}
- insert_inode_hash(inode);
reiserfs_update_sd(th, inode);
reiserfs_check_path(&path_to_key);
@@ -1956,6 +1959,7 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
out_inserted_sd:
inode->i_nlink = 0;
th->t_trans_id = 0; /* so the caller can't use this handle later */
+ unlock_new_inode(inode); /* OK to do even if we hadn't locked it */
/* If we were inheriting an ACL, we need to release the lock so that
* iput doesn't deadlock in reiserfs_delete_xattrs. The locking
@@ -2556,7 +2560,7 @@ static int reiserfs_write_begin(struct file *file,
}
index = pos >> PAGE_CACHE_SHIFT;
- page = __grab_cache_page(mapping, index);
+ page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
*pagep = page;
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 4f322e5ed840..738967f6c8ee 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -646,6 +646,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
err = journal_end(&th, dir->i_sb, jbegin_count);
if (err)
retval = err;
+ unlock_new_inode(inode);
iput(inode);
goto out_failed;
}
@@ -653,6 +654,7 @@ static int reiserfs_create(struct inode *dir, struct dentry *dentry, int mode,
reiserfs_update_inode_transaction(dir);
d_instantiate(dentry, inode);
+ unlock_new_inode(inode);
retval = journal_end(&th, dir->i_sb, jbegin_count);
out_failed:
@@ -727,11 +729,13 @@ static int reiserfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
err = journal_end(&th, dir->i_sb, jbegin_count);
if (err)
retval = err;
+ unlock_new_inode(inode);
iput(inode);
goto out_failed;
}
d_instantiate(dentry, inode);
+ unlock_new_inode(inode);
retval = journal_end(&th, dir->i_sb, jbegin_count);
out_failed:
@@ -812,6 +816,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
err = journal_end(&th, dir->i_sb, jbegin_count);
if (err)
retval = err;
+ unlock_new_inode(inode);
iput(inode);
goto out_failed;
}
@@ -819,6 +824,7 @@ static int reiserfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
reiserfs_update_sd(&th, dir);
d_instantiate(dentry, inode);
+ unlock_new_inode(inode);
retval = journal_end(&th, dir->i_sb, jbegin_count);
out_failed:
if (locked)
@@ -1096,11 +1102,13 @@ static int reiserfs_symlink(struct inode *parent_dir,
err = journal_end(&th, parent_dir->i_sb, jbegin_count);
if (err)
retval = err;
+ unlock_new_inode(inode);
iput(inode);
goto out_failed;
}
d_instantiate(dentry, inode);
+ unlock_new_inode(inode);
retval = journal_end(&th, parent_dir->i_sb, jbegin_count);
out_failed:
reiserfs_write_unlock(parent_dir->i_sb);
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 663a91f5dce8..c55651f1407c 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -649,6 +649,8 @@ static struct dquot_operations reiserfs_quota_operations = {
.release_dquot = reiserfs_release_dquot,
.mark_dirty = reiserfs_mark_dquot_dirty,
.write_info = reiserfs_write_info,
+ .alloc_dquot = dquot_alloc,
+ .destroy_dquot = dquot_destroy,
};
static struct quotactl_ops reiserfs_qctl_operations = {
@@ -994,8 +996,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
if (c == 'u' || c == 'g') {
int qtype = c == 'u' ? USRQUOTA : GRPQUOTA;
- if ((sb_any_quota_enabled(s) ||
- sb_any_quota_suspended(s)) &&
+ if (sb_any_quota_loaded(s) &&
(!*arg != !REISERFS_SB(s)->s_qf_names[qtype])) {
reiserfs_warning(s,
"reiserfs_parse_options: cannot change journaled quota options when quota turned on.");
@@ -1041,8 +1042,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
"reiserfs_parse_options: unknown quota format specified.");
return 0;
}
- if ((sb_any_quota_enabled(s) ||
- sb_any_quota_suspended(s)) &&
+ if (sb_any_quota_loaded(s) &&
*qfmt != REISERFS_SB(s)->s_jquota_fmt) {
reiserfs_warning(s,
"reiserfs_parse_options: cannot change journaled quota options when quota turned on.");
@@ -1067,7 +1067,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin
}
/* This checking is not precise wrt the quota type but for our purposes it is sufficient */
if (!(*mount_options & (1 << REISERFS_QUOTA))
- && sb_any_quota_enabled(s)) {
+ && sb_any_quota_loaded(s)) {
reiserfs_warning(s,
"reiserfs_parse_options: quota options must be present when quota is turned on.");
return 0;
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
index 60d2f822e87b..c97d4c931715 100644
--- a/fs/romfs/inode.c
+++ b/fs/romfs/inode.c
@@ -524,7 +524,6 @@ romfs_iget(struct super_block *sb, unsigned long ino)
i->i_size = be32_to_cpu(ri.size);
i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0;
i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
- i->i_uid = i->i_gid = 0;
/* Precalculate the data offset */
ino = romfs_strnlen(i, ino+ROMFH_SIZE, ROMFS_MAXFN);
diff --git a/fs/seq_file.c b/fs/seq_file.c
index 16c211558c22..b569ff1c4dc8 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -389,8 +389,14 @@ char *mangle_path(char *s, char *p, char *esc)
}
EXPORT_SYMBOL(mangle_path);
-/*
- * return the absolute path of 'dentry' residing in mount 'mnt'.
+/**
+ * seq_path - seq_file interface to print a pathname
+ * @m: the seq_file handle
+ * @path: the struct path to print
+ * @esc: set of characters to escape in the output
+ *
+ * return the absolute path of 'path', as represented by the
+ * dentry / mnt pair in the path parameter.
*/
int seq_path(struct seq_file *m, struct path *path, char *esc)
{
@@ -462,7 +468,8 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc)
return -1;
}
-int seq_bitmap(struct seq_file *m, unsigned long *bits, unsigned int nr_bits)
+int seq_bitmap(struct seq_file *m, const unsigned long *bits,
+ unsigned int nr_bits)
{
if (m->count < m->size) {
int len = bitmap_scnprintf(m->buf + m->count,
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index e4f8d51a5553..92d5e8ffb639 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -297,7 +297,7 @@ static int smb_write_begin(struct file *file, struct address_space *mapping,
struct page **pagep, void **fsdata)
{
pgoff_t index = pos >> PAGE_CACHE_SHIFT;
- *pagep = __grab_cache_page(mapping, index);
+ *pagep = grab_cache_page_write_begin(mapping, index, flags);
if (!*pagep)
return -ENOMEM;
return 0;
diff --git a/fs/stat.c b/fs/stat.c
index 7c46fbeb8b76..7e12a6f82795 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -305,7 +305,7 @@ asmlinkage long sys_readlinkat(int dfd, const char __user *pathname,
struct inode *inode = path.dentry->d_inode;
error = -EINVAL;
- if (inode->i_op && inode->i_op->readlink) {
+ if (inode->i_op->readlink) {
error = security_inode_readlink(path.dentry);
if (!error) {
touch_atime(path.mnt, path.dentry);
diff --git a/fs/sync.c b/fs/sync.c
index 2967562d416f..0921d6d4b5e6 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -75,14 +75,39 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
return ret;
}
-long do_fsync(struct file *file, int datasync)
+/**
+ * vfs_fsync - perform a fsync or fdatasync on a file
+ * @file: file to sync
+ * @dentry: dentry of @file
+ * @datasync: only perform a fdatasync operation
+ *
+ * Write back data and metadata for @file to disk. If @datasync is
+ * set only metadata needed to access modified file data is written.
+ *
+ * In case this function is called from nfsd @file may be %NULL and
+ * only @dentry is set. This can only happen when the filesystem
+ * implements the export_operations API.
+ */
+int vfs_fsync(struct file *file, struct dentry *dentry, int datasync)
{
- int ret;
- int err;
- struct address_space *mapping = file->f_mapping;
+ const struct file_operations *fop;
+ struct address_space *mapping;
+ int err, ret;
+
+ /*
+ * Get mapping and operations from the file in case we have
+ * a file, or get the default values for them in case we
+ * don't have a struct file available. Damn nfsd..
+ */
+ if (file) {
+ mapping = file->f_mapping;
+ fop = file->f_op;
+ } else {
+ mapping = dentry->d_inode->i_mapping;
+ fop = dentry->d_inode->i_fop;
+ }
- if (!file->f_op || !file->f_op->fsync) {
- /* Why? We can still call filemap_fdatawrite */
+ if (!fop || !fop->fsync) {
ret = -EINVAL;
goto out;
}
@@ -94,7 +119,7 @@ long do_fsync(struct file *file, int datasync)
* livelocks in fsync_buffers_list().
*/
mutex_lock(&mapping->host->i_mutex);
- err = file->f_op->fsync(file, file->f_path.dentry, datasync);
+ err = fop->fsync(file, dentry, datasync);
if (!ret)
ret = err;
mutex_unlock(&mapping->host->i_mutex);
@@ -104,15 +129,16 @@ long do_fsync(struct file *file, int datasync)
out:
return ret;
}
+EXPORT_SYMBOL(vfs_fsync);
-static long __do_fsync(unsigned int fd, int datasync)
+static int do_fsync(unsigned int fd, int datasync)
{
struct file *file;
int ret = -EBADF;
file = fget(fd);
if (file) {
- ret = do_fsync(file, datasync);
+ ret = vfs_fsync(file, file->f_path.dentry, datasync);
fput(file);
}
return ret;
@@ -120,12 +146,12 @@ static long __do_fsync(unsigned int fd, int datasync)
asmlinkage long sys_fsync(unsigned int fd)
{
- return __do_fsync(fd, 0);
+ return do_fsync(fd, 0);
}
asmlinkage long sys_fdatasync(unsigned int fd)
{
- return __do_fsync(fd, 1);
+ return do_fsync(fd, 1);
}
/*
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index eb53c632f856..dfa3d94cfc74 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -107,8 +107,6 @@ int sysfs_setattr(struct dentry * dentry, struct iattr * iattr)
static inline void set_default_inode_attr(struct inode * inode, mode_t mode)
{
inode->i_mode = mode;
- inode->i_uid = 0;
- inode->i_gid = 0;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
}
@@ -149,7 +147,6 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode)
{
struct bin_attribute *bin_attr;
- inode->i_blocks = 0;
inode->i_mapping->a_ops = &sysfs_aops;
inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info;
inode->i_op = &sysfs_inode_operations;
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index df0d435baa48..3d81bf58dae2 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -27,6 +27,7 @@
#include <linux/init.h>
#include <linux/buffer_head.h>
#include <linux/vfs.h>
+#include <linux/namei.h>
#include <asm/byteorder.h>
#include "sysv.h"
@@ -163,8 +164,11 @@ void sysv_set_inode(struct inode *inode, dev_t rdev)
if (inode->i_blocks) {
inode->i_op = &sysv_symlink_inode_operations;
inode->i_mapping->a_ops = &sysv_aops;
- } else
+ } else {
inode->i_op = &sysv_fast_symlink_inode_operations;
+ nd_terminate_link(SYSV_I(inode)->i_data, inode->i_size,
+ sizeof(SYSV_I(inode)->i_data) - 1);
+ }
} else
init_special_inode(inode, inode->i_mode, rdev);
}
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index 4a18f084cc42..0e5e54d82924 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -32,18 +32,15 @@
#include "ubifs.h"
#include <linux/writeback.h>
-#include <asm/div64.h>
+#include <linux/math64.h>
/*
* When pessimistic budget calculations say that there is no enough space,
* UBIFS starts writing back dirty inodes and pages, doing garbage collection,
- * or committing. The below constants define maximum number of times UBIFS
+ * or committing. The below constant defines maximum number of times UBIFS
* repeats the operations.
*/
-#define MAX_SHRINK_RETRIES 8
-#define MAX_GC_RETRIES 4
-#define MAX_CMT_RETRIES 2
-#define MAX_NOSPC_RETRIES 1
+#define MAX_MKSPC_RETRIES 3
/*
* The below constant defines amount of dirty pages which should be written
@@ -52,30 +49,6 @@
#define NR_TO_WRITE 16
/**
- * struct retries_info - information about re-tries while making free space.
- * @prev_liability: previous liability
- * @shrink_cnt: how many times the liability was shrinked
- * @shrink_retries: count of liability shrink re-tries (increased when
- * liability does not shrink)
- * @try_gc: GC should be tried first
- * @gc_retries: how many times GC was run
- * @cmt_retries: how many times commit has been done
- * @nospc_retries: how many times GC returned %-ENOSPC
- *
- * Since we consider budgeting to be the fast-path, and this structure has to
- * be allocated on stack and zeroed out, we make it smaller using bit-fields.
- */
-struct retries_info {
- long long prev_liability;
- unsigned int shrink_cnt;
- unsigned int shrink_retries:5;
- unsigned int try_gc:1;
- unsigned int gc_retries:4;
- unsigned int cmt_retries:3;
- unsigned int nospc_retries:1;
-};
-
-/**
* shrink_liability - write-back some dirty pages/inodes.
* @c: UBIFS file-system description object
* @nr_to_write: how many dirty pages to write-back
@@ -147,9 +120,25 @@ static int run_gc(struct ubifs_info *c)
}
/**
+ * get_liability - calculate current liability.
+ * @c: UBIFS file-system description object
+ *
+ * This function calculates and returns current UBIFS liability, i.e. the
+ * amount of bytes UBIFS has "promised" to write to the media.
+ */
+static long long get_liability(struct ubifs_info *c)
+{
+ long long liab;
+
+ spin_lock(&c->space_lock);
+ liab = c->budg_idx_growth + c->budg_data_growth + c->budg_dd_growth;
+ spin_unlock(&c->space_lock);
+ return liab;
+}
+
+/**
* make_free_space - make more free space on the file-system.
* @c: UBIFS file-system description object
- * @ri: information about previous invocations of this function
*
* This function is called when an operation cannot be budgeted because there
* is supposedly no free space. But in most cases there is some free space:
@@ -165,87 +154,42 @@ static int run_gc(struct ubifs_info *c)
* Returns %-ENOSPC if it couldn't do more free space, and other negative error
* codes on failures.
*/
-static int make_free_space(struct ubifs_info *c, struct retries_info *ri)
+static int make_free_space(struct ubifs_info *c)
{
- int err;
-
- /*
- * If we have some dirty pages and inodes (liability), try to write
- * them back unless this was tried too many times without effect
- * already.
- */
- if (ri->shrink_retries < MAX_SHRINK_RETRIES && !ri->try_gc) {
- long long liability;
-
- spin_lock(&c->space_lock);
- liability = c->budg_idx_growth + c->budg_data_growth +
- c->budg_dd_growth;
- spin_unlock(&c->space_lock);
+ int err, retries = 0;
+ long long liab1, liab2;
- if (ri->prev_liability >= liability) {
- /* Liability does not shrink, next time try GC then */
- ri->shrink_retries += 1;
- if (ri->gc_retries < MAX_GC_RETRIES)
- ri->try_gc = 1;
- dbg_budg("liability did not shrink: retries %d of %d",
- ri->shrink_retries, MAX_SHRINK_RETRIES);
- }
+ do {
+ liab1 = get_liability(c);
+ /*
+ * We probably have some dirty pages or inodes (liability), try
+ * to write them back.
+ */
+ dbg_budg("liability %lld, run write-back", liab1);
+ shrink_liability(c, NR_TO_WRITE);
- dbg_budg("force write-back (count %d)", ri->shrink_cnt);
- shrink_liability(c, NR_TO_WRITE + ri->shrink_cnt);
+ liab2 = get_liability(c);
+ if (liab2 < liab1)
+ return -EAGAIN;
- ri->prev_liability = liability;
- ri->shrink_cnt += 1;
- return -EAGAIN;
- }
+ dbg_budg("new liability %lld (not shrinked)", liab2);
- /*
- * Try to run garbage collector unless it was already tried too many
- * times.
- */
- if (ri->gc_retries < MAX_GC_RETRIES) {
- ri->gc_retries += 1;
- dbg_budg("run GC, retries %d of %d",
- ri->gc_retries, MAX_GC_RETRIES);
-
- ri->try_gc = 0;
+ /* Liability did not shrink again, try GC */
+ dbg_budg("Run GC");
err = run_gc(c);
if (!err)
return -EAGAIN;
- if (err == -EAGAIN) {
- dbg_budg("GC asked to commit");
- err = ubifs_run_commit(c);
- if (err)
- return err;
- return -EAGAIN;
- }
-
- if (err != -ENOSPC)
- return err;
-
- /*
- * GC could not make any progress. If this is the first time,
- * then it makes sense to try to commit, because it might make
- * some dirty space.
- */
- dbg_budg("GC returned -ENOSPC, retries %d",
- ri->nospc_retries);
- if (ri->nospc_retries >= MAX_NOSPC_RETRIES)
+ if (err != -EAGAIN && err != -ENOSPC)
+ /* Some real error happened */
return err;
- ri->nospc_retries += 1;
- }
- /* Neither GC nor write-back helped, try to commit */
- if (ri->cmt_retries < MAX_CMT_RETRIES) {
- ri->cmt_retries += 1;
- dbg_budg("run commit, retries %d of %d",
- ri->cmt_retries, MAX_CMT_RETRIES);
+ dbg_budg("Run commit (retries %d)", retries);
err = ubifs_run_commit(c);
if (err)
return err;
- return -EAGAIN;
- }
+ } while (retries++ < MAX_MKSPC_RETRIES);
+
return -ENOSPC;
}
@@ -258,8 +202,8 @@ static int make_free_space(struct ubifs_info *c, struct retries_info *ri)
*/
int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
{
- int ret;
- uint64_t idx_size;
+ int idx_lebs, eff_leb_size = c->leb_size - c->max_idx_node_sz;
+ long long idx_size;
idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx;
@@ -271,23 +215,16 @@ int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
* pair, nor similarly the two variables for the new index size, so we
* have to do this costly 64-bit division on fast-path.
*/
- if (do_div(idx_size, c->leb_size - c->max_idx_node_sz))
- ret = idx_size + 1;
- else
- ret = idx_size;
+ idx_size += eff_leb_size - 1;
+ idx_lebs = div_u64(idx_size, eff_leb_size);
/*
* The index head is not available for the in-the-gaps method, so add an
* extra LEB to compensate.
*/
- ret += 1;
- /*
- * At present the index needs at least 2 LEBs: one for the index head
- * and one for in-the-gaps method (which currently does not cater for
- * the index head and so excludes it from consideration).
- */
- if (ret < 2)
- ret = 2;
- return ret;
+ idx_lebs += 1;
+ if (idx_lebs < MIN_INDEX_LEBS)
+ idx_lebs = MIN_INDEX_LEBS;
+ return idx_lebs;
}
/**
@@ -530,8 +467,7 @@ static int calc_dd_growth(const struct ubifs_info *c,
int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req)
{
int uninitialized_var(cmt_retries), uninitialized_var(wb_retries);
- int err, idx_growth, data_growth, dd_growth;
- struct retries_info ri;
+ int err, idx_growth, data_growth, dd_growth, retried = 0;
ubifs_assert(req->new_page <= 1);
ubifs_assert(req->dirtied_page <= 1);
@@ -549,7 +485,6 @@ int ubifs_budget_space(struct ubifs_info *c, struct ubifs_budget_req *req)
if (!data_growth && !dd_growth)
return 0;
idx_growth = calc_idx_growth(c, req);
- memset(&ri, 0, sizeof(struct retries_info));
again:
spin_lock(&c->space_lock);
@@ -587,12 +522,17 @@ again:
return err;
}
- err = make_free_space(c, &ri);
+ err = make_free_space(c);
+ cond_resched();
if (err == -EAGAIN) {
dbg_budg("try again");
- cond_resched();
goto again;
} else if (err == -ENOSPC) {
+ if (!retried) {
+ retried = 1;
+ dbg_budg("-ENOSPC, but anyway try once again");
+ goto again;
+ }
dbg_budg("FS is full, -ENOSPC");
c->nospace = 1;
if (can_use_rp(c) || c->rp_size == 0)
@@ -712,9 +652,9 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
* user-space. User-space application tend to expect that if the file-system
* (e.g., via the 'statfs()' call) reports that it has N bytes available, they
* are able to write a file of size N. UBIFS attaches node headers to each data
- * node and it has to write indexind nodes as well. This introduces additional
- * overhead, and UBIFS it has to report sligtly less free space to meet the
- * above expectetion.
+ * node and it has to write indexing nodes as well. This introduces additional
+ * overhead, and UBIFS has to report slightly less free space to meet the above
+ * expectations.
*
* This function assumes free space is made up of uncompressed data nodes and
* full index nodes (one per data node, tripled because we always allow enough
@@ -723,7 +663,7 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c,
* Note, the calculation is pessimistic, which means that most of the time
* UBIFS reports less space than it actually has.
*/
-long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free)
+long long ubifs_reported_space(const struct ubifs_info *c, long long free)
{
int divisor, factor, f;
@@ -737,7 +677,7 @@ long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free)
* of data nodes, f - fanout. Because effective UBIFS fanout is twice
* as less than maximum fanout, we assume that each data node
* introduces 3 * @c->max_idx_node_sz / (@c->fanout/2 - 1) bytes.
- * Note, the multiplier 3 is because UBIFS reseves thrice as more space
+ * Note, the multiplier 3 is because UBIFS reserves three times as much space
* for the index.
*/
f = c->fanout > 3 ? c->fanout >> 1 : 2;
@@ -745,8 +685,7 @@ long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free)
divisor = UBIFS_MAX_DATA_NODE_SZ;
divisor += (c->max_idx_node_sz * 3) / (f - 1);
free *= factor;
- do_div(free, divisor);
- return free;
+ return div_u64(free, divisor);
}
/**
@@ -756,10 +695,10 @@ long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free)
* This function calculates amount of free space to report to user-space.
*
* Because UBIFS may introduce substantial overhead (the index, node headers,
- * alighment, wastage at the end of eraseblocks, etc), it cannot report real
+ * alignment, wastage at the end of eraseblocks, etc), it cannot report real
* amount of free flash space it has (well, because not all dirty space is
- * reclamable, UBIFS does not actually know the real amount). If UBIFS did so,
- * it would bread user expectetion about what free space is. Users seem to
+ * reclaimable, UBIFS does not actually know the real amount). If UBIFS did so,
+ * it would break user expectations about what free space is. Users seem to
* accustomed to assume that if the file-system reports N bytes of free space,
* they would be able to fit a file of N bytes to the FS. This almost works for
* traditional file-systems, because they have way less overhead than UBIFS.
@@ -771,18 +710,9 @@ long long ubifs_get_free_space(struct ubifs_info *c)
long long available, outstanding, free;
spin_lock(&c->space_lock);
- min_idx_lebs = ubifs_calc_min_idx_lebs(c);
+ min_idx_lebs = c->min_idx_lebs;
+ ubifs_assert(min_idx_lebs == ubifs_calc_min_idx_lebs(c));
outstanding = c->budg_data_growth + c->budg_dd_growth;
-
- /*
- * Force the amount available to the total size reported if the used
- * space is zero.
- */
- if (c->lst.total_used <= UBIFS_INO_NODE_SZ && !outstanding) {
- spin_unlock(&c->space_lock);
- return (long long)c->block_cnt << UBIFS_BLOCK_SHIFT;
- }
-
available = ubifs_calc_available(c, min_idx_lebs);
/*
diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c
index b49884c8c10e..f3a7945527fb 100644
--- a/fs/ubifs/commit.c
+++ b/fs/ubifs/commit.c
@@ -470,12 +470,12 @@ int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot)
{
struct ubifs_idx_node *idx;
int lnum, offs, len, err = 0;
+ struct ubifs_debug_info *d = c->dbg;
- c->old_zroot = *zroot;
-
- lnum = c->old_zroot.lnum;
- offs = c->old_zroot.offs;
- len = c->old_zroot.len;
+ d->old_zroot = *zroot;
+ lnum = d->old_zroot.lnum;
+ offs = d->old_zroot.offs;
+ len = d->old_zroot.len;
idx = kmalloc(c->max_idx_node_sz, GFP_NOFS);
if (!idx)
@@ -485,8 +485,8 @@ int dbg_old_index_check_init(struct ubifs_info *c, struct ubifs_zbranch *zroot)
if (err)
goto out;
- c->old_zroot_level = le16_to_cpu(idx->level);
- c->old_zroot_sqnum = le64_to_cpu(idx->ch.sqnum);
+ d->old_zroot_level = le16_to_cpu(idx->level);
+ d->old_zroot_sqnum = le64_to_cpu(idx->ch.sqnum);
out:
kfree(idx);
return err;
@@ -509,6 +509,7 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot)
{
int lnum, offs, len, err = 0, uninitialized_var(last_level), child_cnt;
int first = 1, iip;
+ struct ubifs_debug_info *d = c->dbg;
union ubifs_key lower_key, upper_key, l_key, u_key;
unsigned long long uninitialized_var(last_sqnum);
struct ubifs_idx_node *idx;
@@ -525,9 +526,9 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot)
UBIFS_IDX_NODE_SZ;
/* Start at the old zroot */
- lnum = c->old_zroot.lnum;
- offs = c->old_zroot.offs;
- len = c->old_zroot.len;
+ lnum = d->old_zroot.lnum;
+ offs = d->old_zroot.offs;
+ len = d->old_zroot.len;
iip = 0;
/*
@@ -560,11 +561,11 @@ int dbg_check_old_index(struct ubifs_info *c, struct ubifs_zbranch *zroot)
if (first) {
first = 0;
/* Check root level and sqnum */
- if (le16_to_cpu(idx->level) != c->old_zroot_level) {
+ if (le16_to_cpu(idx->level) != d->old_zroot_level) {
err = 2;
goto out_dump;
}
- if (le64_to_cpu(idx->ch.sqnum) != c->old_zroot_sqnum) {
+ if (le64_to_cpu(idx->ch.sqnum) != d->old_zroot_sqnum) {
err = 3;
goto out_dump;
}
diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c
index a0ada596b17c..11e4132f314a 100644
--- a/fs/ubifs/compress.c
+++ b/fs/ubifs/compress.c
@@ -33,7 +33,7 @@
/* Fake description object for the "none" compressor */
static struct ubifs_compressor none_compr = {
.compr_type = UBIFS_COMPR_NONE,
- .name = "no compression",
+ .name = "none",
.capi_name = "",
};
@@ -43,13 +43,13 @@ static DEFINE_MUTEX(lzo_mutex);
static struct ubifs_compressor lzo_compr = {
.compr_type = UBIFS_COMPR_LZO,
.comp_mutex = &lzo_mutex,
- .name = "LZO",
+ .name = "lzo",
.capi_name = "lzo",
};
#else
static struct ubifs_compressor lzo_compr = {
.compr_type = UBIFS_COMPR_LZO,
- .name = "LZO",
+ .name = "lzo",
};
#endif
@@ -108,7 +108,7 @@ void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len,
if (compr->comp_mutex)
mutex_lock(compr->comp_mutex);
err = crypto_comp_compress(compr->cc, in_buf, in_len, out_buf,
- out_len);
+ (unsigned int *)out_len);
if (compr->comp_mutex)
mutex_unlock(compr->comp_mutex);
if (unlikely(err)) {
@@ -119,10 +119,10 @@ void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len,
}
/*
- * Presently, we just require that compression results in less data,
- * rather than any defined minimum compression ratio or amount.
+ * If the data compressed only slightly, it is better to leave it
+ * uncompressed to improve read speed.
*/
- if (ALIGN(*out_len, 8) >= ALIGN(in_len, 8))
+ if (in_len - *out_len < UBIFS_MIN_COMPRESS_DIFF)
goto no_compr;
return;
@@ -172,7 +172,7 @@ int ubifs_decompress(const void *in_buf, int in_len, void *out_buf,
if (compr->decomp_mutex)
mutex_lock(compr->decomp_mutex);
err = crypto_comp_decompress(compr->cc, in_buf, in_len, out_buf,
- out_len);
+ (unsigned int *)out_len);
if (compr->decomp_mutex)
mutex_unlock(compr->decomp_mutex);
if (err)
@@ -244,7 +244,7 @@ out_lzo:
/**
* ubifs_compressors_exit - de-initialize UBIFS compressors.
*/
-void __exit ubifs_compressors_exit(void)
+void ubifs_compressors_exit(void)
{
compr_exit(&lzo_compr);
compr_exit(&zlib_compr);
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 510ffa0bbda4..792c5a16c182 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -32,6 +32,8 @@
#include "ubifs.h"
#include <linux/module.h>
#include <linux/moduleparam.h>
+#include <linux/debugfs.h>
+#include <linux/math64.h>
#ifdef CONFIG_UBIFS_FS_DEBUG
@@ -596,7 +598,9 @@ void dbg_dump_budg(struct ubifs_info *c)
struct rb_node *rb;
struct ubifs_bud *bud;
struct ubifs_gced_idx_leb *idx_gc;
+ long long available, outstanding, free;
+ ubifs_assert(spin_is_locked(&c->space_lock));
spin_lock(&dbg_lock);
printk(KERN_DEBUG "(pid %d) Budgeting info: budg_data_growth %lld, "
"budg_dd_growth %lld, budg_idx_growth %lld\n", current->pid,
@@ -629,6 +633,17 @@ void dbg_dump_budg(struct ubifs_info *c)
printk(KERN_DEBUG "\tGC'ed idx LEB %d unmap %d\n",
idx_gc->lnum, idx_gc->unmap);
printk(KERN_DEBUG "\tcommit state %d\n", c->cmt_state);
+
+ /* Print budgeting predictions */
+ available = ubifs_calc_available(c, c->min_idx_lebs);
+ outstanding = c->budg_data_growth + c->budg_dd_growth;
+ if (available > outstanding)
+ free = ubifs_reported_space(c, available - outstanding);
+ else
+ free = 0;
+ printk(KERN_DEBUG "Budgeting predictions:\n");
+ printk(KERN_DEBUG "\tavailable: %lld, outstanding %lld, free %lld\n",
+ available, outstanding, free);
spin_unlock(&dbg_lock);
}
@@ -645,7 +660,8 @@ void dbg_dump_lprops(struct ubifs_info *c)
struct ubifs_lprops lp;
struct ubifs_lp_stats lst;
- printk(KERN_DEBUG "(pid %d) Dumping LEB properties\n", current->pid);
+ printk(KERN_DEBUG "(pid %d) start dumping LEB properties\n",
+ current->pid);
ubifs_get_lp_stats(c, &lst);
dbg_dump_lstats(&lst);
@@ -656,6 +672,8 @@ void dbg_dump_lprops(struct ubifs_info *c)
dbg_dump_lprop(c, &lp);
}
+ printk(KERN_DEBUG "(pid %d) finish dumping LEB properties\n",
+ current->pid);
}
void dbg_dump_lpt_info(struct ubifs_info *c)
@@ -663,6 +681,7 @@ void dbg_dump_lpt_info(struct ubifs_info *c)
int i;
spin_lock(&dbg_lock);
+ printk(KERN_DEBUG "(pid %d) dumping LPT information\n", current->pid);
printk(KERN_DEBUG "\tlpt_sz: %lld\n", c->lpt_sz);
printk(KERN_DEBUG "\tpnode_sz: %d\n", c->pnode_sz);
printk(KERN_DEBUG "\tnnode_sz: %d\n", c->nnode_sz);
@@ -684,7 +703,8 @@ void dbg_dump_lpt_info(struct ubifs_info *c)
printk(KERN_DEBUG "\tLPT root is at %d:%d\n", c->lpt_lnum, c->lpt_offs);
printk(KERN_DEBUG "\tLPT head is at %d:%d\n",
c->nhead_lnum, c->nhead_offs);
- printk(KERN_DEBUG "\tLPT ltab is at %d:%d\n", c->ltab_lnum, c->ltab_offs);
+ printk(KERN_DEBUG "\tLPT ltab is at %d:%d\n",
+ c->ltab_lnum, c->ltab_offs);
if (c->big_lpt)
printk(KERN_DEBUG "\tLPT lsave is at %d:%d\n",
c->lsave_lnum, c->lsave_offs);
@@ -703,9 +723,9 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
if (dbg_failure_mode)
return;
- printk(KERN_DEBUG "(pid %d) Dumping LEB %d\n", current->pid, lnum);
-
- sleb = ubifs_scan(c, lnum, 0, c->dbg_buf);
+ printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n",
+ current->pid, lnum);
+ sleb = ubifs_scan(c, lnum, 0, c->dbg->buf);
if (IS_ERR(sleb)) {
ubifs_err("scan error %d", (int)PTR_ERR(sleb));
return;
@@ -721,6 +741,8 @@ void dbg_dump_leb(const struct ubifs_info *c, int lnum)
dbg_dump_node(c, snod->node);
}
+ printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n",
+ current->pid, lnum);
ubifs_scan_destroy(sleb);
return;
}
@@ -768,7 +790,7 @@ void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat)
{
int i;
- printk(KERN_DEBUG "(pid %d) Dumping heap cat %d (%d elements)\n",
+ printk(KERN_DEBUG "(pid %d) start dumping heap cat %d (%d elements)\n",
current->pid, cat, heap->cnt);
for (i = 0; i < heap->cnt; i++) {
struct ubifs_lprops *lprops = heap->arr[i];
@@ -777,6 +799,7 @@ void dbg_dump_heap(struct ubifs_info *c, struct ubifs_lpt_heap *heap, int cat)
"flags %d\n", i, lprops->lnum, lprops->hpos,
lprops->free, lprops->dirty, lprops->flags);
}
+ printk(KERN_DEBUG "(pid %d) finish dumping heap\n", current->pid);
}
void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
@@ -784,7 +807,7 @@ void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
{
int i;
- printk(KERN_DEBUG "(pid %d) Dumping pnode:\n", current->pid);
+ printk(KERN_DEBUG "(pid %d) dumping pnode:\n", current->pid);
printk(KERN_DEBUG "\taddress %zx parent %zx cnext %zx\n",
(size_t)pnode, (size_t)parent, (size_t)pnode->cnext);
printk(KERN_DEBUG "\tflags %lu iip %d level %d num %d\n",
@@ -803,7 +826,7 @@ void dbg_dump_tnc(struct ubifs_info *c)
int level;
printk(KERN_DEBUG "\n");
- printk(KERN_DEBUG "(pid %d) Dumping the TNC tree\n", current->pid);
+ printk(KERN_DEBUG "(pid %d) start dumping TNC tree\n", current->pid);
znode = ubifs_tnc_levelorder_next(c->zroot.znode, NULL);
level = znode->level;
printk(KERN_DEBUG "== Level %d ==\n", level);
@@ -815,8 +838,7 @@ void dbg_dump_tnc(struct ubifs_info *c)
dbg_dump_znode(c, znode);
znode = ubifs_tnc_levelorder_next(c->zroot.znode, znode);
}
-
- printk(KERN_DEBUG "\n");
+ printk(KERN_DEBUG "(pid %d) finish dumping TNC tree\n", current->pid);
}
static int dump_znode(struct ubifs_info *c, struct ubifs_znode *znode,
@@ -992,8 +1014,8 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
zbr1->offs, DBGKEY(&key));
dbg_err("but it should have key %s according to tnc",
DBGKEY(&zbr1->key));
- dbg_dump_node(c, dent1);
- goto out_free;
+ dbg_dump_node(c, dent1);
+ goto out_free;
}
key_read(c, &dent2->key, &key);
@@ -1002,8 +1024,8 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
zbr1->offs, DBGKEY(&key));
dbg_err("but it should have key %s according to tnc",
DBGKEY(&zbr2->key));
- dbg_dump_node(c, dent2);
- goto out_free;
+ dbg_dump_node(c, dent2);
+ goto out_free;
}
nlen1 = le16_to_cpu(dent1->nlen);
@@ -1020,9 +1042,9 @@ static int dbg_check_key_order(struct ubifs_info *c, struct ubifs_zbranch *zbr1,
dbg_err("bad order of colliding key %s",
DBGKEY(&key));
- dbg_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs);
+ ubifs_msg("first node at %d:%d\n", zbr1->lnum, zbr1->offs);
dbg_dump_node(c, dent1);
- dbg_msg("second node at %d:%d\n", zbr2->lnum, zbr2->offs);
+ ubifs_msg("second node at %d:%d\n", zbr2->lnum, zbr2->offs);
dbg_dump_node(c, dent2);
out_free:
@@ -2097,13 +2119,13 @@ static int simple_rand(void)
return (next >> 16) & 32767;
}
-void dbg_failure_mode_registration(struct ubifs_info *c)
+static void failure_mode_init(struct ubifs_info *c)
{
struct failure_mode_info *fmi;
fmi = kmalloc(sizeof(struct failure_mode_info), GFP_NOFS);
if (!fmi) {
- dbg_err("Failed to register failure mode - no memory");
+ ubifs_err("Failed to register failure mode - no memory");
return;
}
fmi->c = c;
@@ -2112,7 +2134,7 @@ void dbg_failure_mode_registration(struct ubifs_info *c)
spin_unlock(&fmi_lock);
}
-void dbg_failure_mode_deregistration(struct ubifs_info *c)
+static void failure_mode_exit(struct ubifs_info *c)
{
struct failure_mode_info *fmi, *tmp;
@@ -2146,42 +2168,44 @@ static int in_failure_mode(struct ubi_volume_desc *desc)
struct ubifs_info *c = dbg_find_info(desc);
if (c && dbg_failure_mode)
- return c->failure_mode;
+ return c->dbg->failure_mode;
return 0;
}
static int do_fail(struct ubi_volume_desc *desc, int lnum, int write)
{
struct ubifs_info *c = dbg_find_info(desc);
+ struct ubifs_debug_info *d;
if (!c || !dbg_failure_mode)
return 0;
- if (c->failure_mode)
+ d = c->dbg;
+ if (d->failure_mode)
return 1;
- if (!c->fail_cnt) {
+ if (!d->fail_cnt) {
/* First call - decide delay to failure */
if (chance(1, 2)) {
unsigned int delay = 1 << (simple_rand() >> 11);
if (chance(1, 2)) {
- c->fail_delay = 1;
- c->fail_timeout = jiffies +
+ d->fail_delay = 1;
+ d->fail_timeout = jiffies +
msecs_to_jiffies(delay);
dbg_rcvry("failing after %ums", delay);
} else {
- c->fail_delay = 2;
- c->fail_cnt_max = delay;
+ d->fail_delay = 2;
+ d->fail_cnt_max = delay;
dbg_rcvry("failing after %u calls", delay);
}
}
- c->fail_cnt += 1;
+ d->fail_cnt += 1;
}
/* Determine if failure delay has expired */
- if (c->fail_delay == 1) {
- if (time_before(jiffies, c->fail_timeout))
+ if (d->fail_delay == 1) {
+ if (time_before(jiffies, d->fail_timeout))
return 0;
- } else if (c->fail_delay == 2)
- if (c->fail_cnt++ < c->fail_cnt_max)
+ } else if (d->fail_delay == 2)
+ if (d->fail_cnt++ < d->fail_cnt_max)
return 0;
if (lnum == UBIFS_SB_LNUM) {
if (write) {
@@ -2239,7 +2263,7 @@ static int do_fail(struct ubi_volume_desc *desc, int lnum, int write)
dbg_rcvry("failing in bud LEB %d commit not running", lnum);
}
ubifs_err("*** SETTING FAILURE MODE ON (LEB %d) ***", lnum);
- c->failure_mode = 1;
+ d->failure_mode = 1;
dump_stack();
return 1;
}
@@ -2344,4 +2368,181 @@ int dbg_leb_map(struct ubi_volume_desc *desc, int lnum, int dtype)
return 0;
}
+/**
+ * ubifs_debugging_init - initialize UBIFS debugging.
+ * @c: UBIFS file-system description object
+ *
+ * This function allocates the per-file-system debugging object (@c->dbg) and
+ * its LEB-sized buffer, then sets up failure mode testing for @c. Returns zero
+ * in case of success and %-ENOMEM if one of the allocations fails, in which
+ * case @c->dbg is left %NULL.
+ *
+ * Note, 'failure_mode_init()' returns nothing, so a failure there does not
+ * fail this function.
+ */
+int ubifs_debugging_init(struct ubifs_info *c)
+{
+	c->dbg = kzalloc(sizeof(struct ubifs_debug_info), GFP_KERNEL);
+	if (!c->dbg)
+		return -ENOMEM;
+
+	c->dbg->buf = vmalloc(c->leb_size);
+	if (!c->dbg->buf)
+		goto out;
+
+	failure_mode_init(c);
+	return 0;
+
+out:
+	kfree(c->dbg);
+	/* Do not leave a dangling pointer behind for later clean-up paths */
+	c->dbg = NULL;
+	return -ENOMEM;
+}
+
+/**
+ * ubifs_debugging_exit - release per-file-system debugging data.
+ * @c: UBIFS file-system description object
+ *
+ * Tears down failure mode testing for @c first, then frees the debugging
+ * buffer and the debugging object itself.
+ */
+void ubifs_debugging_exit(struct ubifs_info *c)
+{
+	struct ubifs_debug_info *d;
+
+	failure_mode_exit(c);
+
+	d = c->dbg;
+	vfree(d->buf);
+	kfree(d);
+}
+
+/*
+ * Root directory for UBIFS stuff in debugfs. Contains sub-directories which
+ * contain the stuff specific to particular file-system mounts.
+ */
+static struct dentry *debugfs_rootdir;
+
+/**
+ * dbg_debugfs_init - initialize debugfs file-system.
+ *
+ * UBIFS uses debugfs file-system to expose various debugging knobs to
+ * user-space. This function creates "ubifs" directory in the debugfs
+ * file-system. Returns zero in case of success and a negative error code in
+ * case of failure.
+ */
+int dbg_debugfs_init(void)
+{
+	debugfs_rootdir = debugfs_create_dir("ubifs", NULL);
+	/*
+	 * 'debugfs_create_dir()' may report failure with %NULL rather than
+	 * with an error pointer, so both have to be checked. %NULL carries no
+	 * error code, hence %-ENODEV is reported for it.
+	 */
+	if (!debugfs_rootdir || IS_ERR(debugfs_rootdir)) {
+		int err = -ENODEV;
+
+		if (debugfs_rootdir)
+			err = PTR_ERR(debugfs_rootdir);
+		ubifs_err("cannot create \"ubifs\" debugfs directory, "
+			  "error %d\n", err);
+		return err;
+	}
+
+	return 0;
+}
+
+/**
+ * dbg_debugfs_exit - remove the "ubifs" directory from debugfs file-system.
+ *
+ * Hands the root directory dentry stored by 'dbg_debugfs_init()' to
+ * 'debugfs_remove()'.
+ */
+void dbg_debugfs_exit(void)
+{
+ debugfs_remove(debugfs_rootdir);
+}
+
+/*
+ * Open handler shared by the UBIFS debugfs knobs: pass the pointer kept in the
+ * inode's private field on to the file, where the write handler picks it up.
+ */
+static int open_debugfs_file(struct inode *inode, struct file *file)
+{
+	void *priv = inode->i_private;
+
+	file->private_data = priv;
+	return 0;
+}
+
+/*
+ * Write handler shared by the UBIFS debugfs knobs. The written data itself is
+ * not looked at: the knob is identified by comparing the file's dentry with
+ * the dentries saved in the debugging object, and the corresponding dump
+ * function is run. Returns @count (after advancing @ppos by it) on success
+ * and %-EINVAL if the dentry matches none of the known knobs.
+ */
+static ssize_t write_debugfs_file(struct file *file, const char __user *buf,
+				  size_t count, loff_t *ppos)
+{
+	struct ubifs_info *c = file->private_data;
+	struct ubifs_debug_info *d = c->dbg;
+	struct dentry *knob = file->f_path.dentry;
+
+	if (knob == d->dump_lprops) {
+		dbg_dump_lprops(c);
+	} else if (knob == d->dump_budg) {
+		/* 'dbg_dump_budg()' asserts that @c->space_lock is held */
+		spin_lock(&c->space_lock);
+		dbg_dump_budg(c);
+		spin_unlock(&c->space_lock);
+	} else if (knob == d->dump_tnc) {
+		mutex_lock(&c->tnc_mutex);
+		dbg_dump_tnc(c);
+		mutex_unlock(&c->tnc_mutex);
+	} else {
+		return -EINVAL;
+	}
+
+	*ppos += count;
+	return count;
+}
+
+/* File operations used for every UBIFS debugfs knob; no read method is set */
+static const struct file_operations debugfs_fops = {
+	.owner = THIS_MODULE,
+	.open = open_debugfs_file,
+	.write = write_debugfs_file,
+};
+
+/**
+ * dbg_debugfs_init_fs - initialize debugfs for UBIFS instance.
+ * @c: UBIFS file-system description object
+ *
+ * This function creates all debugfs files for this instance of UBIFS: a
+ * "ubi<ubi_num>_<vol_id>" directory under the UBIFS debugfs root directory,
+ * containing the "dump_lprops", "dump_budg" and "dump_tnc" knobs. Returns
+ * zero in case of success and a negative error code in case of failure. If
+ * creating one of the knobs fails, the whole directory is removed again.
+ *
+ * Note, the only reason we have not merged this function with the
+ * 'ubifs_debugging_init()' function is because it is better to initialize
+ * debugfs interfaces at the very end of the mount process, and remove them at
+ * the very beginning of the unmount process.
+ */
+int dbg_debugfs_init_fs(struct ubifs_info *c)
+{
+	int err;
+	const char *fname;
+	struct dentry *dent;
+	struct ubifs_debug_info *d = c->dbg;
+
+	sprintf(d->debugfs_dir_name, "ubi%d_%d", c->vi.ubi_num, c->vi.vol_id);
+	d->debugfs_dir = debugfs_create_dir(d->debugfs_dir_name,
+					    debugfs_rootdir);
+	/*
+	 * The debugfs creation helpers may report failure with %NULL rather
+	 * than with an error pointer, so both are checked here and below.
+	 * %NULL carries no error code, hence %-ENODEV is reported for it.
+	 */
+	if (!d->debugfs_dir || IS_ERR(d->debugfs_dir)) {
+		err = d->debugfs_dir ? PTR_ERR(d->debugfs_dir) : -ENODEV;
+		ubifs_err("cannot create \"%s\" debugfs directory, error %d\n",
+			  d->debugfs_dir_name, err);
+		goto out;
+	}
+
+	/*
+	 * NOTE(review): S_IWUGO makes the knobs writable by every user, and
+	 * each write takes FS locks and prints to the kernel log - confirm
+	 * this is intended, S_IWUSR may be the safer mode.
+	 */
+	fname = "dump_lprops";
+	dent = debugfs_create_file(fname, S_IWUGO, d->debugfs_dir, c,
+				   &debugfs_fops);
+	if (!dent || IS_ERR(dent))
+		goto out_remove;
+	d->dump_lprops = dent;
+
+	fname = "dump_budg";
+	dent = debugfs_create_file(fname, S_IWUGO, d->debugfs_dir, c,
+				   &debugfs_fops);
+	if (!dent || IS_ERR(dent))
+		goto out_remove;
+	d->dump_budg = dent;
+
+	fname = "dump_tnc";
+	dent = debugfs_create_file(fname, S_IWUGO, d->debugfs_dir, c,
+				   &debugfs_fops);
+	if (!dent || IS_ERR(dent))
+		goto out_remove;
+	d->dump_tnc = dent;
+
+	return 0;
+
+out_remove:
+	/* 'PTR_ERR(NULL)' would be zero, i.e. "success", so map it by hand */
+	err = dent ? PTR_ERR(dent) : -ENODEV;
+	ubifs_err("cannot create \"%s\" debugfs file, error %d\n",
+		  fname, err);
+	debugfs_remove_recursive(d->debugfs_dir);
+out:
+	return err;
+}
+
+/**
+ * dbg_debugfs_exit_fs - remove all debugfs files.
+ * @c: UBIFS file-system description object
+ *
+ * Hands the per-file-system debugfs directory recorded in @c->dbg to
+ * 'debugfs_remove_recursive()'.
+ */
+void dbg_debugfs_exit_fs(struct ubifs_info *c)
+{
+ debugfs_remove_recursive(c->dbg->debugfs_dir);
+}
+
#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/debug.h b/fs/ubifs/debug.h
index 33d6b95071e4..9820d6999f7e 100644
--- a/fs/ubifs/debug.h
+++ b/fs/ubifs/debug.h
@@ -25,7 +25,56 @@
#ifdef CONFIG_UBIFS_FS_DEBUG
-#define UBIFS_DBG(op) op
+/**
+ * struct ubifs_debug_info - per-FS debugging information.
+ * @buf: a buffer of LEB size, used for various purposes
+ * @old_zroot: old index root - used by 'dbg_check_old_index()'
+ * @old_zroot_level: old index root level - used by 'dbg_check_old_index()'
+ * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()'
+ * @failure_mode: failure mode for recovery testing
+ * @fail_delay: 0=>don't delay, 1=>delay a time, 2=>delay a number of calls
+ * @fail_timeout: time in jiffies when delay of failure mode expires
+ * @fail_cnt: current number of calls to failure mode I/O functions
+ * @fail_cnt_max: number of calls by which to delay failure mode
+ * @chk_lpt_sz: used by LPT tree size checker
+ * @chk_lpt_sz2: used by LPT tree size checker
+ * @chk_lpt_wastage: used by LPT tree size checker
+ * @chk_lpt_lebs: used by LPT tree size checker
+ * @new_nhead_offs: used by LPT tree size checker
+ * @new_ihead_lnum: used by debugging to check ihead_lnum
+ * @new_ihead_offs: used by debugging to check ihead_offs
+ *
+ * @debugfs_dir_name: name of debugfs directory containing this file-system's
+ * files
+ * @debugfs_dir: dentry object of the file-system debugfs directory
+ * @dump_lprops: "dump lprops" debugfs knob
+ * @dump_budg: "dump budgeting information" debugfs knob
+ * @dump_tnc: "dump TNC" debugfs knob
+ */
+struct ubifs_debug_info {
+ void *buf;
+ struct ubifs_zbranch old_zroot;
+ int old_zroot_level;
+ unsigned long long old_zroot_sqnum;
+ int failure_mode;
+ int fail_delay;
+ unsigned long fail_timeout;
+ unsigned int fail_cnt;
+ unsigned int fail_cnt_max;
+ long long chk_lpt_sz;
+ long long chk_lpt_sz2;
+ long long chk_lpt_wastage;
+ int chk_lpt_lebs;
+ int new_nhead_offs;
+ int new_ihead_lnum;
+ int new_ihead_offs;
+
+ char debugfs_dir_name[100];
+ struct dentry *debugfs_dir;
+ struct dentry *dump_lprops;
+ struct dentry *dump_budg;
+ struct dentry *dump_tnc;
+};
#define ubifs_assert(expr) do { \
if (unlikely(!(expr))) { \
@@ -211,14 +260,18 @@ extern unsigned int ubifs_msg_flags;
extern unsigned int ubifs_chk_flags;
extern unsigned int ubifs_tst_flags;
-/* Dump functions */
+int ubifs_debugging_init(struct ubifs_info *c);
+void ubifs_debugging_exit(struct ubifs_info *c);
+/* Dump functions */
const char *dbg_ntype(int type);
const char *dbg_cstate(int cmt_state);
const char *dbg_get_key_dump(const struct ubifs_info *c,
const union ubifs_key *key);
void dbg_dump_inode(const struct ubifs_info *c, const struct inode *inode);
void dbg_dump_node(const struct ubifs_info *c, const void *node);
+void dbg_dump_lpt_node(const struct ubifs_info *c, void *node, int lnum,
+ int offs);
void dbg_dump_budget_req(const struct ubifs_budget_req *req);
void dbg_dump_lstats(const struct ubifs_lp_stats *lst);
void dbg_dump_budg(struct ubifs_info *c);
@@ -233,9 +286,9 @@ void dbg_dump_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
struct ubifs_nnode *parent, int iip);
void dbg_dump_tnc(struct ubifs_info *c);
void dbg_dump_index(struct ubifs_info *c);
+void dbg_dump_lpt_lebs(const struct ubifs_info *c);
/* Checking helper functions */
-
typedef int (*dbg_leaf_callback)(struct ubifs_info *c,
struct ubifs_zbranch *zbr, void *priv);
typedef int (*dbg_znode_callback)(struct ubifs_info *c,
@@ -274,9 +327,6 @@ int dbg_force_in_the_gaps(void);
#define dbg_failure_mode (ubifs_tst_flags & UBIFS_TST_RCVRY)
-void dbg_failure_mode_registration(struct ubifs_info *c);
-void dbg_failure_mode_deregistration(struct ubifs_info *c);
-
#ifndef UBIFS_DBG_PRESERVE_UBI
#define ubi_leb_read dbg_leb_read
@@ -318,9 +368,13 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum,
return dbg_leb_change(desc, lnum, buf, len, UBI_UNKNOWN);
}
-#else /* !CONFIG_UBIFS_FS_DEBUG */
+/* Debugfs-related stuff */
+int dbg_debugfs_init(void);
+void dbg_debugfs_exit(void);
+int dbg_debugfs_init_fs(struct ubifs_info *c);
+void dbg_debugfs_exit_fs(struct ubifs_info *c);
-#define UBIFS_DBG(op)
+#else /* !CONFIG_UBIFS_FS_DEBUG */
/* Use "if (0)" to make compiler check arguments even if debugging is off */
#define ubifs_assert(expr) do { \
@@ -360,23 +414,28 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum,
#define DBGKEY(key) ((char *)(key))
#define DBGKEY1(key) ((char *)(key))
-#define dbg_ntype(type) ""
-#define dbg_cstate(cmt_state) ""
-#define dbg_get_key_dump(c, key) ({})
-#define dbg_dump_inode(c, inode) ({})
-#define dbg_dump_node(c, node) ({})
-#define dbg_dump_budget_req(req) ({})
-#define dbg_dump_lstats(lst) ({})
-#define dbg_dump_budg(c) ({})
-#define dbg_dump_lprop(c, lp) ({})
-#define dbg_dump_lprops(c) ({})
-#define dbg_dump_lpt_info(c) ({})
-#define dbg_dump_leb(c, lnum) ({})
-#define dbg_dump_znode(c, znode) ({})
-#define dbg_dump_heap(c, heap, cat) ({})
-#define dbg_dump_pnode(c, pnode, parent, iip) ({})
-#define dbg_dump_tnc(c) ({})
-#define dbg_dump_index(c) ({})
+#define ubifs_debugging_init(c) 0
+#define ubifs_debugging_exit(c) ({})
+
+#define dbg_ntype(type) ""
+#define dbg_cstate(cmt_state) ""
+#define dbg_get_key_dump(c, key) ({})
+#define dbg_dump_inode(c, inode) ({})
+#define dbg_dump_node(c, node) ({})
+#define dbg_dump_lpt_node(c, node, lnum, offs) ({})
+#define dbg_dump_budget_req(req) ({})
+#define dbg_dump_lstats(lst) ({})
+#define dbg_dump_budg(c) ({})
+#define dbg_dump_lprop(c, lp) ({})
+#define dbg_dump_lprops(c) ({})
+#define dbg_dump_lpt_info(c) ({})
+#define dbg_dump_leb(c, lnum) ({})
+#define dbg_dump_znode(c, znode) ({})
+#define dbg_dump_heap(c, heap, cat) ({})
+#define dbg_dump_pnode(c, pnode, parent, iip) ({})
+#define dbg_dump_tnc(c) ({})
+#define dbg_dump_index(c) ({})
+#define dbg_dump_lpt_lebs(c) ({})
#define dbg_walk_index(c, leaf_cb, znode_cb, priv) 0
#define dbg_old_index_check_init(c, zroot) 0
@@ -396,9 +455,11 @@ static inline int dbg_change(struct ubi_volume_desc *desc, int lnum,
#define dbg_force_in_the_gaps_enabled 0
#define dbg_force_in_the_gaps() 0
#define dbg_failure_mode 0
-#define dbg_failure_mode_registration(c) ({})
-#define dbg_failure_mode_deregistration(c) ({})
-#endif /* !CONFIG_UBIFS_FS_DEBUG */
+/*
+ * Debugfs stubs for non-debugging builds: the init helpers report success,
+ * the exit helpers are void in the debugging build and so expand to an empty
+ * statement expression, like the other void stubs here.
+ */
+#define dbg_debugfs_init() 0
+#define dbg_debugfs_exit() ({})
+#define dbg_debugfs_init_fs(c) 0
+#define dbg_debugfs_exit_fs(c) ({})
+#endif /* !CONFIG_UBIFS_FS_DEBUG */
#endif /* !__UBIFS_DEBUG_H__ */
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 2624411d9758..bf37374567fa 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -72,8 +72,8 @@ static int read_block(struct inode *inode, void *addr, unsigned int block,
return err;
}
- ubifs_assert(le64_to_cpu(dn->ch.sqnum) > ubifs_inode(inode)->creat_sqnum);
-
+ ubifs_assert(le64_to_cpu(dn->ch.sqnum) >
+ ubifs_inode(inode)->creat_sqnum);
len = le32_to_cpu(dn->size);
if (len <= 0 || len > UBIFS_BLOCK_SIZE)
goto dump;
@@ -219,7 +219,8 @@ static void release_existing_page_budget(struct ubifs_info *c)
}
static int write_begin_slow(struct address_space *mapping,
- loff_t pos, unsigned len, struct page **pagep)
+ loff_t pos, unsigned len, struct page **pagep,
+ unsigned flags)
{
struct inode *inode = mapping->host;
struct ubifs_info *c = inode->i_sb->s_fs_info;
@@ -247,14 +248,14 @@ static int write_begin_slow(struct address_space *mapping,
if (unlikely(err))
return err;
- page = __grab_cache_page(mapping, index);
+ page = grab_cache_page_write_begin(mapping, index, flags);
if (unlikely(!page)) {
ubifs_release_budget(c, &req);
return -ENOMEM;
}
if (!PageUptodate(page)) {
- if (!(pos & PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
+ if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
SetPageChecked(page);
else {
err = do_readpage(page);
@@ -438,13 +439,13 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
return -EROFS;
/* Try out the fast-path part first */
- page = __grab_cache_page(mapping, index);
+ page = grab_cache_page_write_begin(mapping, index, flags);
if (unlikely(!page))
return -ENOMEM;
if (!PageUptodate(page)) {
/* The page is not loaded from the flash */
- if (!(pos & PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
+ if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
/*
* We change whole page so no need to load it. But we
* have to set the @PG_checked flag to make the further
@@ -483,7 +484,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
unlock_page(page);
page_cache_release(page);
- return write_begin_slow(mapping, pos, len, pagep);
+ return write_begin_slow(mapping, pos, len, pagep, flags);
}
/*
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 5e82cffe9695..6db7a6be6c97 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -154,6 +154,7 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case FS_IOC_GETFLAGS:
flags = ubifs2ioctl(ubifs_inode(inode)->flags);
+ dbg_gen("get flags: %#x, i_flags %#x", flags, inode->i_flags);
return put_user(flags, (int __user *) arg);
case FS_IOC_SETFLAGS: {
@@ -176,6 +177,7 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
err = mnt_want_write(file->f_path.mnt);
if (err)
return err;
+ dbg_gen("set flags: %#x, i_flags %#x", flags, inode->i_flags);
err = setflags(inode, flags);
mnt_drop_write(file->f_path.mnt);
return err;
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index f91b745908ea..10ae25b7d1db 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -704,7 +704,7 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode,
data->size = cpu_to_le32(len);
zero_data_node_unused(data);
- if (!(ui->flags && UBIFS_COMPR_FL))
+ if (!(ui->flags & UBIFS_COMPR_FL))
/* Compression is disabled for this inode */
compr_type = UBIFS_COMPR_NONE;
else
@@ -1220,7 +1220,7 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode,
data_key_init(c, &key, inum, blk);
bit = old_size & (UBIFS_BLOCK_SIZE - 1);
- blk = (old_size >> UBIFS_BLOCK_SHIFT) - (bit ? 0: 1);
+ blk = (old_size >> UBIFS_BLOCK_SHIFT) - (bit ? 0 : 1);
data_key_init(c, &to_key, inum, blk);
err = ubifs_tnc_remove_range(c, &key, &to_key);
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index 3f1f16bc25c9..efb3430a2581 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -38,6 +38,22 @@
#define __UBIFS_KEY_H__
/**
+ * key_mask_hash - mask a valid hash value.
+ * @hash: hash value to be masked
+ *
+ * We use hash values as offset in directories, so values %0 and %1 are
+ * reserved for "." and "..". %2 is reserved for "end of readdir" marker. This
+ * function makes sure the reserved values are not used.
+ */
+static inline uint32_t key_mask_hash(uint32_t hash)
+{
+ hash &= UBIFS_S_KEY_HASH_MASK;
+ if (unlikely(hash <= 2))
+ hash += 3;
+ return hash;
+}
+
+/**
* key_r5_hash - R5 hash function (borrowed from reiserfs).
* @s: direntry name
* @len: name length
@@ -54,16 +70,7 @@ static inline uint32_t key_r5_hash(const char *s, int len)
str++;
}
- a &= UBIFS_S_KEY_HASH_MASK;
-
- /*
- * We use hash values as offset in directories, so values %0 and %1 are
- * reserved for "." and "..". %2 is reserved for "end of readdir"
- * marker.
- */
- if (unlikely(a >= 0 && a <= 2))
- a += 3;
- return a;
+ return key_mask_hash(a);
}
/**
@@ -77,10 +84,7 @@ static inline uint32_t key_test_hash(const char *str, int len)
len = min_t(uint32_t, len, 4);
memcpy(&a, str, len);
- a &= UBIFS_S_KEY_HASH_MASK;
- if (unlikely(a >= 0 && a <= 2))
- a += 3;
- return a;
+ return key_mask_hash(a);
}
/**
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index f27176e9b70d..dfd2bcece27a 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -520,13 +520,13 @@ static int is_lprops_dirty(struct ubifs_info *c, struct ubifs_lprops *lprops)
* @flags: new flags
* @idx_gc_cnt: change to the count of idx_gc list
*
- * This function changes LEB properties. This function does not change a LEB
- * property (@free, @dirty or @flag) if the value passed is %LPROPS_NC.
+ * This function changes LEB properties (@free, @dirty or @flag). However, the
+ * property which has the %LPROPS_NC value is not changed. Returns a pointer to
+ * the updated LEB properties on success and a negative error code on failure.
*
- * This function returns a pointer to the updated LEB properties on success
- * and a negative error code on failure. N.B. the LEB properties may have had to
- * be copied (due to COW) and consequently the pointer returned may not be the
- * same as the pointer passed.
+ * Note, the LEB properties may have had to be copied (due to COW) and
+ * consequently the pointer returned may not be the same as the pointer
+ * passed.
*/
const struct ubifs_lprops *ubifs_change_lp(struct ubifs_info *c,
const struct ubifs_lprops *lp,
@@ -1088,7 +1088,7 @@ static int scan_check_cb(struct ubifs_info *c,
}
}
- sleb = ubifs_scan(c, lnum, 0, c->dbg_buf);
+ sleb = ubifs_scan(c, lnum, 0, c->dbg->buf);
if (IS_ERR(sleb)) {
/*
* After an unclean unmount, empty and freeable LEBs
diff --git a/fs/ubifs/lpt.c b/fs/ubifs/lpt.c
index db8bd0e518b2..b2792e84d245 100644
--- a/fs/ubifs/lpt.c
+++ b/fs/ubifs/lpt.c
@@ -36,15 +36,16 @@
* can be written into a single eraseblock. In that case, garbage collection
* consists of just writing the whole table, which therefore makes all other
* eraseblocks reusable. In the case of the big model, dirty eraseblocks are
- * selected for garbage collection, which consists are marking the nodes in
+ * selected for garbage collection, which consists of marking the clean nodes in
* that LEB as dirty, and then only the dirty nodes are written out. Also, in
* the case of the big model, a table of LEB numbers is saved so that the entire
* LPT does not to be scanned looking for empty eraseblocks when UBIFS is first
* mounted.
*/
-#include <linux/crc16.h>
#include "ubifs.h"
+#include <linux/crc16.h>
+#include <linux/math64.h>
/**
* do_calc_lpt_geom - calculate sizes for the LPT area.
@@ -135,15 +136,13 @@ static void do_calc_lpt_geom(struct ubifs_info *c)
int ubifs_calc_lpt_geom(struct ubifs_info *c)
{
int lebs_needed;
- uint64_t sz;
+ long long sz;
do_calc_lpt_geom(c);
/* Verify that lpt_lebs is big enough */
sz = c->lpt_sz * 2; /* Must have at least 2 times the size */
- sz += c->leb_size - 1;
- do_div(sz, c->leb_size);
- lebs_needed = sz;
+ lebs_needed = div_u64(sz + c->leb_size - 1, c->leb_size);
if (lebs_needed > c->lpt_lebs) {
ubifs_err("too few LPT LEBs");
return -EINVAL;
@@ -156,7 +155,6 @@ int ubifs_calc_lpt_geom(struct ubifs_info *c)
}
c->check_lpt_free = c->big_lpt;
-
return 0;
}
@@ -176,7 +174,7 @@ static int calc_dflt_lpt_geom(struct ubifs_info *c, int *main_lebs,
int *big_lpt)
{
int i, lebs_needed;
- uint64_t sz;
+ long long sz;
/* Start by assuming the minimum number of LPT LEBs */
c->lpt_lebs = UBIFS_MIN_LPT_LEBS;
@@ -203,9 +201,7 @@ static int calc_dflt_lpt_geom(struct ubifs_info *c, int *main_lebs,
/* Now check there are enough LPT LEBs */
for (i = 0; i < 64 ; i++) {
sz = c->lpt_sz * 4; /* Allow 4 times the size */
- sz += c->leb_size - 1;
- do_div(sz, c->leb_size);
- lebs_needed = sz;
+ lebs_needed = div_u64(sz + c->leb_size - 1, c->leb_size);
if (lebs_needed > c->lpt_lebs) {
/* Not enough LPT LEBs so try again with more */
c->lpt_lebs = lebs_needed;
@@ -558,7 +554,7 @@ static int calc_nnode_num(int row, int col)
* This function calculates and returns the nnode number based on the parent's
* nnode number and the index in parent.
*/
-static int calc_nnode_num_from_parent(struct ubifs_info *c,
+static int calc_nnode_num_from_parent(const struct ubifs_info *c,
struct ubifs_nnode *parent, int iip)
{
int num, shft;
@@ -583,7 +579,7 @@ static int calc_nnode_num_from_parent(struct ubifs_info *c,
* This function calculates and returns the pnode number based on the parent's
* nnode number and the index in parent.
*/
-static int calc_pnode_num_from_parent(struct ubifs_info *c,
+static int calc_pnode_num_from_parent(const struct ubifs_info *c,
struct ubifs_nnode *parent, int iip)
{
int i, n = c->lpt_hght - 1, pnum = parent->num, num = 0;
@@ -966,7 +962,7 @@ static int check_lpt_type(uint8_t **addr, int *pos, int type)
*
* This function returns %0 on success and a negative error code on failure.
*/
-static int unpack_pnode(struct ubifs_info *c, void *buf,
+static int unpack_pnode(const struct ubifs_info *c, void *buf,
struct ubifs_pnode *pnode)
{
uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
@@ -996,15 +992,15 @@ static int unpack_pnode(struct ubifs_info *c, void *buf,
}
/**
- * unpack_nnode - unpack a nnode.
+ * ubifs_unpack_nnode - unpack a nnode.
* @c: UBIFS file-system description object
* @buf: buffer containing packed nnode to unpack
* @nnode: nnode structure to fill
*
* This function returns %0 on success and a negative error code on failure.
*/
-static int unpack_nnode(struct ubifs_info *c, void *buf,
- struct ubifs_nnode *nnode)
+int ubifs_unpack_nnode(const struct ubifs_info *c, void *buf,
+ struct ubifs_nnode *nnode)
{
uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
int i, pos = 0, err;
@@ -1036,7 +1032,7 @@ static int unpack_nnode(struct ubifs_info *c, void *buf,
*
* This function returns %0 on success and a negative error code on failure.
*/
-static int unpack_ltab(struct ubifs_info *c, void *buf)
+static int unpack_ltab(const struct ubifs_info *c, void *buf)
{
uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
int i, pos = 0, err;
@@ -1068,7 +1064,7 @@ static int unpack_ltab(struct ubifs_info *c, void *buf)
*
* This function returns %0 on success and a negative error code on failure.
*/
-static int unpack_lsave(struct ubifs_info *c, void *buf)
+static int unpack_lsave(const struct ubifs_info *c, void *buf)
{
uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
int i, pos = 0, err;
@@ -1096,7 +1092,7 @@ static int unpack_lsave(struct ubifs_info *c, void *buf)
*
* This function returns %0 on success and a negative error code on failure.
*/
-static int validate_nnode(struct ubifs_info *c, struct ubifs_nnode *nnode,
+static int validate_nnode(const struct ubifs_info *c, struct ubifs_nnode *nnode,
struct ubifs_nnode *parent, int iip)
{
int i, lvl, max_offs;
@@ -1140,7 +1136,7 @@ static int validate_nnode(struct ubifs_info *c, struct ubifs_nnode *nnode,
*
* This function returns %0 on success and a negative error code on failure.
*/
-static int validate_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
+static int validate_pnode(const struct ubifs_info *c, struct ubifs_pnode *pnode,
struct ubifs_nnode *parent, int iip)
{
int i;
@@ -1174,7 +1170,8 @@ static int validate_pnode(struct ubifs_info *c, struct ubifs_pnode *pnode,
* This function calculates the LEB numbers for the LEB properties it contains
* based on the pnode number.
*/
-static void set_pnode_lnum(struct ubifs_info *c, struct ubifs_pnode *pnode)
+static void set_pnode_lnum(const struct ubifs_info *c,
+ struct ubifs_pnode *pnode)
{
int i, lnum;
@@ -1227,7 +1224,7 @@ int ubifs_read_nnode(struct ubifs_info *c, struct ubifs_nnode *parent, int iip)
err = ubi_read(c->ubi, lnum, buf, offs, c->nnode_sz);
if (err)
goto out;
- err = unpack_nnode(c, buf, nnode);
+ err = ubifs_unpack_nnode(c, buf, nnode);
if (err)
goto out;
}
@@ -1816,7 +1813,7 @@ static struct ubifs_nnode *scan_get_nnode(struct ubifs_info *c,
c->nnode_sz);
if (err)
return ERR_PTR(err);
- err = unpack_nnode(c, buf, nnode);
+ err = ubifs_unpack_nnode(c, buf, nnode);
if (err)
return ERR_PTR(err);
}
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index a41434b42785..96ca95707175 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -320,6 +320,8 @@ no_space:
dbg_err("LPT out of space at LEB %d:%d needing %d, done_ltab %d, "
"done_lsave %d", lnum, offs, len, done_ltab, done_lsave);
dbg_dump_lpt_info(c);
+ dbg_dump_lpt_lebs(c);
+ dump_stack();
return err;
}
@@ -546,8 +548,10 @@ static int write_cnodes(struct ubifs_info *c)
no_space:
ubifs_err("LPT out of space mismatch");
dbg_err("LPT out of space mismatch at LEB %d:%d needing %d, done_ltab "
- "%d, done_lsave %d", lnum, offs, len, done_ltab, done_lsave);
+ "%d, done_lsave %d", lnum, offs, len, done_ltab, done_lsave);
dbg_dump_lpt_info(c);
+ dbg_dump_lpt_lebs(c);
+ dump_stack();
return err;
}
@@ -749,7 +753,7 @@ static void lpt_tgc_start(struct ubifs_info *c)
* LPT trivial garbage collection is where a LPT LEB contains only dirty and
* free space and so may be reused as soon as the next commit is completed.
* This function is called after the commit is completed (master node has been
- * written) and unmaps LPT LEBs that were marked for trivial GC.
+ * written) and un-maps LPT LEBs that were marked for trivial GC.
*/
static int lpt_tgc_end(struct ubifs_info *c)
{
@@ -1025,7 +1029,7 @@ static int make_node_dirty(struct ubifs_info *c, int node_type, int node_num,
* @c: UBIFS file-system description object
* @node_type: LPT node type
*/
-static int get_lpt_node_len(struct ubifs_info *c, int node_type)
+static int get_lpt_node_len(const struct ubifs_info *c, int node_type)
{
switch (node_type) {
case UBIFS_LPT_NNODE:
@@ -1046,7 +1050,7 @@ static int get_lpt_node_len(struct ubifs_info *c, int node_type)
* @buf: buffer
* @len: length of buffer
*/
-static int get_pad_len(struct ubifs_info *c, uint8_t *buf, int len)
+static int get_pad_len(const struct ubifs_info *c, uint8_t *buf, int len)
{
int offs, pad_len;
@@ -1063,7 +1067,8 @@ static int get_pad_len(struct ubifs_info *c, uint8_t *buf, int len)
* @buf: buffer
* @node_num: node number is returned here
*/
-static int get_lpt_node_type(struct ubifs_info *c, uint8_t *buf, int *node_num)
+static int get_lpt_node_type(const struct ubifs_info *c, uint8_t *buf,
+ int *node_num)
{
uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
int pos = 0, node_type;
@@ -1081,7 +1086,7 @@ static int get_lpt_node_type(struct ubifs_info *c, uint8_t *buf, int *node_num)
*
* This function returns %1 if the buffer contains a node or %0 if it does not.
*/
-static int is_a_node(struct ubifs_info *c, uint8_t *buf, int len)
+static int is_a_node(const struct ubifs_info *c, uint8_t *buf, int len)
{
uint8_t *addr = buf + UBIFS_LPT_CRC_BYTES;
int pos = 0, node_type, node_len;
@@ -1105,7 +1110,6 @@ static int is_a_node(struct ubifs_info *c, uint8_t *buf, int len)
return 1;
}
-
/**
* lpt_gc_lnum - garbage collect a LPT LEB.
* @c: UBIFS file-system description object
@@ -1463,7 +1467,7 @@ void ubifs_lpt_free(struct ubifs_info *c, int wr_only)
#ifdef CONFIG_UBIFS_FS_DEBUG
/**
- * dbg_is_all_ff - determine if a buffer contains only 0xff bytes.
+ * dbg_is_all_ff - determine if a buffer contains only 0xFF bytes.
* @buf: buffer
* @len: buffer length
*/
@@ -1488,7 +1492,7 @@ static int dbg_is_nnode_dirty(struct ubifs_info *c, int lnum, int offs)
struct ubifs_nnode *nnode;
int hght;
- /* Entire tree is in memory so first_nnode / next_nnode are ok */
+ /* Entire tree is in memory so first_nnode / next_nnode are OK */
nnode = first_nnode(c, &hght);
for (; nnode; nnode = next_nnode(c, nnode, &hght)) {
struct ubifs_nbranch *branch;
@@ -1602,7 +1606,10 @@ static int dbg_check_ltab_lnum(struct ubifs_info *c, int lnum)
{
int err, len = c->leb_size, dirty = 0, node_type, node_num, node_len;
int ret;
- void *buf = c->dbg_buf;
+ void *buf = c->dbg->buf;
+
+ if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
+ return 0;
dbg_lp("LEB %d", lnum);
err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size);
@@ -1704,6 +1711,9 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c)
long long free = 0;
int i;
+ if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
+ return 0;
+
for (i = 0; i < c->lpt_lebs; i++) {
if (c->ltab[i].tgc || c->ltab[i].cmt)
continue;
@@ -1716,6 +1726,8 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c)
dbg_err("LPT space error: free %lld lpt_sz %lld",
free, c->lpt_sz);
dbg_dump_lpt_info(c);
+ dbg_dump_lpt_lebs(c);
+ dump_stack();
return -EINVAL;
}
return 0;
@@ -1731,15 +1743,19 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c)
*/
int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
{
+ struct ubifs_debug_info *d = c->dbg;
long long chk_lpt_sz, lpt_sz;
int err = 0;
+ if (!(ubifs_chk_flags & UBIFS_CHK_LPROPS))
+ return 0;
+
switch (action) {
case 0:
- c->chk_lpt_sz = 0;
- c->chk_lpt_sz2 = 0;
- c->chk_lpt_lebs = 0;
- c->chk_lpt_wastage = 0;
+ d->chk_lpt_sz = 0;
+ d->chk_lpt_sz2 = 0;
+ d->chk_lpt_lebs = 0;
+ d->chk_lpt_wastage = 0;
if (c->dirty_pn_cnt > c->pnode_cnt) {
dbg_err("dirty pnodes %d exceed max %d",
c->dirty_pn_cnt, c->pnode_cnt);
@@ -1752,35 +1768,35 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
}
return err;
case 1:
- c->chk_lpt_sz += len;
+ d->chk_lpt_sz += len;
return 0;
case 2:
- c->chk_lpt_sz += len;
- c->chk_lpt_wastage += len;
- c->chk_lpt_lebs += 1;
+ d->chk_lpt_sz += len;
+ d->chk_lpt_wastage += len;
+ d->chk_lpt_lebs += 1;
return 0;
case 3:
chk_lpt_sz = c->leb_size;
- chk_lpt_sz *= c->chk_lpt_lebs;
+ chk_lpt_sz *= d->chk_lpt_lebs;
chk_lpt_sz += len - c->nhead_offs;
- if (c->chk_lpt_sz != chk_lpt_sz) {
+ if (d->chk_lpt_sz != chk_lpt_sz) {
dbg_err("LPT wrote %lld but space used was %lld",
- c->chk_lpt_sz, chk_lpt_sz);
+ d->chk_lpt_sz, chk_lpt_sz);
err = -EINVAL;
}
- if (c->chk_lpt_sz > c->lpt_sz) {
+ if (d->chk_lpt_sz > c->lpt_sz) {
dbg_err("LPT wrote %lld but lpt_sz is %lld",
- c->chk_lpt_sz, c->lpt_sz);
+ d->chk_lpt_sz, c->lpt_sz);
err = -EINVAL;
}
- if (c->chk_lpt_sz2 && c->chk_lpt_sz != c->chk_lpt_sz2) {
+ if (d->chk_lpt_sz2 && d->chk_lpt_sz != d->chk_lpt_sz2) {
dbg_err("LPT layout size %lld but wrote %lld",
- c->chk_lpt_sz, c->chk_lpt_sz2);
+ d->chk_lpt_sz, d->chk_lpt_sz2);
err = -EINVAL;
}
- if (c->chk_lpt_sz2 && c->new_nhead_offs != len) {
+ if (d->chk_lpt_sz2 && d->new_nhead_offs != len) {
dbg_err("LPT new nhead offs: expected %d was %d",
- c->new_nhead_offs, len);
+ d->new_nhead_offs, len);
err = -EINVAL;
}
lpt_sz = (long long)c->pnode_cnt * c->pnode_sz;
@@ -1788,26 +1804,146 @@ int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
lpt_sz += c->ltab_sz;
if (c->big_lpt)
lpt_sz += c->lsave_sz;
- if (c->chk_lpt_sz - c->chk_lpt_wastage > lpt_sz) {
+ if (d->chk_lpt_sz - d->chk_lpt_wastage > lpt_sz) {
dbg_err("LPT chk_lpt_sz %lld + waste %lld exceeds %lld",
- c->chk_lpt_sz, c->chk_lpt_wastage, lpt_sz);
+ d->chk_lpt_sz, d->chk_lpt_wastage, lpt_sz);
err = -EINVAL;
}
- if (err)
+ if (err) {
dbg_dump_lpt_info(c);
- c->chk_lpt_sz2 = c->chk_lpt_sz;
- c->chk_lpt_sz = 0;
- c->chk_lpt_wastage = 0;
- c->chk_lpt_lebs = 0;
- c->new_nhead_offs = len;
+ dbg_dump_lpt_lebs(c);
+ dump_stack();
+ }
+ d->chk_lpt_sz2 = d->chk_lpt_sz;
+ d->chk_lpt_sz = 0;
+ d->chk_lpt_wastage = 0;
+ d->chk_lpt_lebs = 0;
+ d->new_nhead_offs = len;
return err;
case 4:
- c->chk_lpt_sz += len;
- c->chk_lpt_wastage += len;
+ d->chk_lpt_sz += len;
+ d->chk_lpt_wastage += len;
return 0;
default:
return -EINVAL;
}
}
+/**
+ * dump_lpt_leb - dump an LPT LEB.
+ * @c: UBIFS file-system description object
+ * @lnum: LEB number to dump
+ *
+ * This function dumps an LEB from LPT area. Nodes in this area are very
+ * different to nodes in the main area (e.g., no common headers, no 8-byte
+ * alignment), so LPT LEBs have a separate dump function. An nnode which fails
+ * to unpack is reported, not printed. LPT has to be locked by the caller.
+ */
+static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
+{
+ int err, len = c->leb_size, node_type, node_num, node_len, offs;
+ void *buf = c->dbg->buf;
+
+ printk(KERN_DEBUG "(pid %d) start dumping LEB %d\n",
+ current->pid, lnum);
+ err = ubi_read(c->ubi, lnum, buf, 0, c->leb_size);
+ if (err) {
+ ubifs_err("cannot read LEB %d, error %d", lnum, err);
+ return;
+ }
+ while (1) {
+ offs = c->leb_size - len;
+ if (!is_a_node(c, buf, len)) {
+ int pad_len = get_pad_len(c, buf, len);
+
+ if (pad_len) {
+ printk(KERN_DEBUG "LEB %d:%d, pad %d bytes\n",
+ lnum, offs, pad_len);
+ buf += pad_len;
+ len -= pad_len;
+ continue;
+ }
+ if (len)
+ printk(KERN_DEBUG "LEB %d:%d, free %d bytes\n",
+ lnum, offs, len);
+ break;
+ }
+
+ node_type = get_lpt_node_type(c, buf, &node_num);
+ switch (node_type) {
+ case UBIFS_LPT_PNODE:
+ node_len = c->pnode_sz;
+ if (c->big_lpt)
+ printk(KERN_DEBUG "LEB %d:%d, pnode num %d\n",
+ lnum, offs, node_num);
+ else
+ printk(KERN_DEBUG "LEB %d:%d, pnode\n",
+ lnum, offs);
+ break;
+ case UBIFS_LPT_NNODE:
+ {
+ int i;
+ struct ubifs_nnode nnode;
+
+ node_len = c->nnode_sz;
+ if (c->big_lpt)
+ printk(KERN_DEBUG "LEB %d:%d, nnode num %d, ",
+ lnum, offs, node_num);
+ else
+ printk(KERN_DEBUG "LEB %d:%d, nnode, ",
+ lnum, offs);
+ err = ubifs_unpack_nnode(c, buf, &nnode);
+ if (err) {
+ printk("failed to unpack nnode, error %d\n", err);
+ break;
+ }
+ for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
+ printk("%d:%d", nnode.nbranch[i].lnum,
+ nnode.nbranch[i].offs);
+ if (i != UBIFS_LPT_FANOUT - 1)
+ printk(", ");
+ }
+ printk("\n");
+ break;
+ }
+ case UBIFS_LPT_LTAB:
+ node_len = c->ltab_sz;
+ printk(KERN_DEBUG "LEB %d:%d, ltab\n", lnum, offs);
+ break;
+ case UBIFS_LPT_LSAVE:
+ node_len = c->lsave_sz;
+ printk(KERN_DEBUG "LEB %d:%d, lsave len\n", lnum, offs);
+ break;
+ default:
+ ubifs_err("LPT node type %d not recognized", node_type);
+ return;
+ }
+
+ buf += node_len;
+ len -= node_len;
+ }
+
+ printk(KERN_DEBUG "(pid %d) finish dumping LEB %d\n",
+ current->pid, lnum);
+}
+
+/**
+ * dbg_dump_lpt_lebs - dump all LPT LEBs.
+ * @c: UBIFS file-system description object
+ *
+ * This function dumps the @c->lpt_lebs LEBs of the LPT area, starting at LEB
+ * @c->lpt_first. The caller has to make sure the LPT is locked.
+ */
+void dbg_dump_lpt_lebs(const struct ubifs_info *c)
+{
+ int i;
+
+ printk(KERN_DEBUG "(pid %d) start dumping all LPT LEBs\n",
+ current->pid);
+ for (i = 0; i < c->lpt_lebs; i++)
+ dump_lpt_leb(c, i + c->lpt_first);
+ printk(KERN_DEBUG "(pid %d) finish dumping all LPT LEBs\n",
+ current->pid);
+}
+
#endif /* CONFIG_UBIFS_FS_DEBUG */
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index 9bd5a43d4526..9e6f403f170e 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -899,7 +899,7 @@ static int dbg_scan_orphans(struct ubifs_info *c, struct check_info *ci)
for (lnum = c->orph_first; lnum <= c->orph_last; lnum++) {
struct ubifs_scan_leb *sleb;
- sleb = ubifs_scan(c, lnum, 0, c->dbg_buf);
+ sleb = ubifs_scan(c, lnum, 0, c->dbg->buf);
if (IS_ERR(sleb)) {
err = PTR_ERR(sleb);
break;
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 21f7d047c306..ce42a7b0ca5a 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -144,7 +144,7 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
/*
* If the replay order was perfect the dirty space would now be
* zero. The order is not perfect because the the journal heads
- * race with eachother. This is not a problem but is does mean
+ * race with each other. This is not a problem but it does mean
* that the dirty space may temporarily exceed c->leb_size
* during the replay.
*/
@@ -656,7 +656,7 @@ out_dump:
* @dirty: amount of dirty space from padding and deletion nodes
*
* This function inserts a reference node to the replay tree and returns zero
- * in case of success ort a negative error code in case of failure.
+ * in case of success or a negative error code in case of failure.
*/
static int insert_ref_node(struct ubifs_info *c, int lnum, int offs,
unsigned long long sqnum, int free, int dirty)
@@ -883,7 +883,7 @@ static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
* This means that we reached end of log and now
* look to the older log data, which was already
* committed but the eraseblock was not erased (UBIFS
- * only unmaps it). So this basically means we have to
+ * only un-maps it). So this basically means we have to
* exit with "end of log" code.
*/
err = 1;
@@ -1062,6 +1062,15 @@ int ubifs_replay_journal(struct ubifs_info *c)
if (err)
goto out;
+ /*
+ * UBIFS budgeting calculations use @c->budg_uncommitted_idx variable
+ * to roughly estimate index growth. Things like @c->min_idx_lebs
+ * depend on it. This means we have to initialize it to make sure
+ * budgeting works properly.
+ */
+ c->budg_uncommitted_idx = atomic_long_read(&c->dirty_zn_cnt);
+ c->budg_uncommitted_idx *= c->max_idx_node_sz;
+
ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery);
dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, "
"highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum,
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index 0f392351dc5a..e070c643d1bb 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -28,6 +28,7 @@
#include "ubifs.h"
#include <linux/random.h>
+#include <linux/math64.h>
/*
* Default journal size in logical eraseblocks as a percent of total
@@ -80,7 +81,7 @@ static int create_default_filesystem(struct ubifs_info *c)
int err, tmp, jnl_lebs, log_lebs, max_buds, main_lebs, main_first;
int lpt_lebs, lpt_first, orph_lebs, big_lpt, ino_waste, sup_flags = 0;
int min_leb_cnt = UBIFS_MIN_LEB_CNT;
- uint64_t tmp64, main_bytes;
+ long long tmp64, main_bytes;
__le64 tmp_le64;
/* Some functions called from here depend on the @c->key_len filed */
@@ -160,7 +161,7 @@ static int create_default_filesystem(struct ubifs_info *c)
if (!sup)
return -ENOMEM;
- tmp64 = (uint64_t)max_buds * c->leb_size;
+ tmp64 = (long long)max_buds * c->leb_size;
if (big_lpt)
sup_flags |= UBIFS_FLG_BIGLPT;
@@ -179,14 +180,16 @@ static int create_default_filesystem(struct ubifs_info *c)
sup->fanout = cpu_to_le32(DEFAULT_FANOUT);
sup->lsave_cnt = cpu_to_le32(c->lsave_cnt);
sup->fmt_version = cpu_to_le32(UBIFS_FORMAT_VERSION);
- sup->default_compr = cpu_to_le16(UBIFS_COMPR_LZO);
sup->time_gran = cpu_to_le32(DEFAULT_TIME_GRAN);
+ if (c->mount_opts.override_compr)
+ sup->default_compr = cpu_to_le16(c->mount_opts.compr_type);
+ else
+ sup->default_compr = cpu_to_le16(UBIFS_COMPR_LZO);
generate_random_uuid(sup->uuid);
- main_bytes = (uint64_t)main_lebs * c->leb_size;
- tmp64 = main_bytes * DEFAULT_RP_PERCENT;
- do_div(tmp64, 100);
+ main_bytes = (long long)main_lebs * c->leb_size;
+ tmp64 = div_u64(main_bytes * DEFAULT_RP_PERCENT, 100);
if (tmp64 > DEFAULT_MAX_RP_SIZE)
tmp64 = DEFAULT_MAX_RP_SIZE;
sup->rp_size = cpu_to_le64(tmp64);
@@ -582,16 +585,15 @@ int ubifs_read_superblock(struct ubifs_info *c)
c->jhead_cnt = le32_to_cpu(sup->jhead_cnt) + NONDATA_JHEADS_CNT;
c->fanout = le32_to_cpu(sup->fanout);
c->lsave_cnt = le32_to_cpu(sup->lsave_cnt);
- c->default_compr = le16_to_cpu(sup->default_compr);
c->rp_size = le64_to_cpu(sup->rp_size);
c->rp_uid = le32_to_cpu(sup->rp_uid);
c->rp_gid = le32_to_cpu(sup->rp_gid);
sup_flags = le32_to_cpu(sup->flags);
+ if (!c->mount_opts.override_compr)
+ c->default_compr = le16_to_cpu(sup->default_compr);
c->vfs_sb->s_time_gran = le32_to_cpu(sup->time_gran);
-
memcpy(&c->uuid, &sup->uuid, 16);
-
c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT);
/* Automatically increase file system size to the maximum size */
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index d80b2aef42b6..0d7564b95f8e 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -34,6 +34,8 @@
#include <linux/parser.h>
#include <linux/seq_file.h>
#include <linux/mount.h>
+#include <linux/math64.h>
+#include <linux/writeback.h>
#include "ubifs.h"
/*
@@ -417,39 +419,54 @@ static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt)
else if (c->mount_opts.chk_data_crc == 1)
seq_printf(s, ",no_chk_data_crc");
+ /* The compressor name is passed as an argument, never as the format */
+ if (c->mount_opts.override_compr)
+ seq_printf(s, ",compr=%s",
+ ubifs_compr_name(c->mount_opts.compr_type));
+
return 0;
}
static int ubifs_sync_fs(struct super_block *sb, int wait)
{
+ int i, err;
struct ubifs_info *c = sb->s_fs_info;
- int i, ret = 0, err;
- long long bud_bytes;
-
- if (c->jheads) {
- for (i = 0; i < c->jhead_cnt; i++) {
- err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
- if (err && !ret)
- ret = err;
- }
+ struct writeback_control wbc = {
+ .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_HOLD,
+ .range_start = 0,
+ .range_end = LLONG_MAX,
+ .nr_to_write = LONG_MAX,
+ };
+
+ if (sb->s_flags & MS_RDONLY)
+ return 0;
- /* Commit the journal unless it has too little data */
- spin_lock(&c->buds_lock);
- bud_bytes = c->bud_bytes;
- spin_unlock(&c->buds_lock);
- if (bud_bytes > c->leb_size) {
- err = ubifs_run_commit(c);
- if (err)
- return err;
- }
+ /*
+ * Synchronize write buffers, because 'ubifs_run_commit()' does not
+ * do this if it waits for an already running commit.
+ */
+ for (i = 0; i < c->jhead_cnt; i++) {
+ err = ubifs_wbuf_sync(&c->jheads[i].wbuf);
+ if (err)
+ return err;
}
/*
- * We ought to call sync for c->ubi but it does not have one. If it had
- * it would in turn call mtd->sync, however mtd operations are
- * synchronous anyway, so we don't lose any sleep here.
+ * VFS calls '->sync_fs()' before synchronizing all dirty inodes and
+ * pages, so synchronize them first, then commit the journal. Strictly
+ * speaking, it is not necessary to commit the journal here,
+ * synchronizing write-buffers would be enough. But committing makes
+ * UBIFS free space predictions much more accurate, so we want to let
+ * the user be able to get more accurate results of 'statfs()' after
+ * they synchronize the file system.
*/
- return ret;
+ generic_sync_sb_inodes(sb, &wbc);
+
+ err = ubifs_run_commit(c);
+ if (err)
+ return err;
+
+ return ubi_sync(c->vi.ubi_num);
}
/**
@@ -596,7 +613,7 @@ static int bud_wbuf_callback(struct ubifs_info *c, int lnum, int free, int pad)
}
/*
- * init_constants_late - initialize UBIFS constants.
+ * init_constants_sb - initialize UBIFS constants.
* @c: UBIFS file-system description object
*
* This is a helper function which initializes various UBIFS constants after
@@ -604,10 +621,10 @@ static int bud_wbuf_callback(struct ubifs_info *c, int lnum, int free, int pad)
* makes sure they are all right. Returns zero in case of success and a
* negative error code in case of failure.
*/
-static int init_constants_late(struct ubifs_info *c)
+static int init_constants_sb(struct ubifs_info *c)
{
int tmp, err;
- uint64_t tmp64;
+ long long tmp64;
c->main_bytes = (long long)c->main_lebs * c->leb_size;
c->max_znode_sz = sizeof(struct ubifs_znode) +
@@ -634,9 +651,8 @@ static int init_constants_late(struct ubifs_info *c)
* Make sure that the log is large enough to fit reference nodes for
* all buds plus one reserved LEB.
*/
- tmp64 = c->max_bud_bytes;
- tmp = do_div(tmp64, c->leb_size);
- c->max_bud_cnt = tmp64 + !!tmp;
+ tmp64 = c->max_bud_bytes + c->leb_size - 1;
+ c->max_bud_cnt = div_u64(tmp64, c->leb_size);
tmp = (c->ref_node_alsz * c->max_bud_cnt + c->leb_size - 1);
tmp /= c->leb_size;
tmp += 1;
@@ -672,7 +688,7 @@ static int init_constants_late(struct ubifs_info *c)
* Consequently, if the journal is too small, UBIFS will treat it as
* always full.
*/
- tmp64 = (uint64_t)(c->jhead_cnt + 1) * c->leb_size + 1;
+ tmp64 = (long long)(c->jhead_cnt + 1) * c->leb_size + 1;
if (c->bg_bud_bytes < tmp64)
c->bg_bud_bytes = tmp64;
if (c->max_bud_bytes < tmp64 + c->leb_size)
@@ -682,6 +698,21 @@ static int init_constants_late(struct ubifs_info *c)
if (err)
return err;
+ return 0;
+}
+
+/*
+ * init_constants_master - initialize UBIFS constants.
+ * @c: UBIFS file-system description object
+ *
+ * This is a helper function which initializes various UBIFS constants after
+ * the master node has been read. It also checks various UBIFS parameters and
+ * makes sure they are all right.
+ */
+static void init_constants_master(struct ubifs_info *c)
+{
+ long long tmp64;
+
c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
/*
@@ -690,14 +721,13 @@ static int init_constants_late(struct ubifs_info *c)
* necessary to report something for the 'statfs()' call.
*
* Subtract the LEB reserved for GC, the LEB which is reserved for
- * deletions, and assume only one journal head is available.
+ * deletions, minimum LEBs for the index, and assume only one journal
+ * head is available.
*/
- tmp64 = c->main_lebs - 2 - c->jhead_cnt + 1;
- tmp64 *= (uint64_t)c->leb_size - c->leb_overhead;
+ tmp64 = c->main_lebs - 1 - 1 - MIN_INDEX_LEBS - c->jhead_cnt + 1;
+ tmp64 *= (long long)c->leb_size - c->leb_overhead;
tmp64 = ubifs_reported_space(c, tmp64);
c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT;
-
- return 0;
}
/**
@@ -878,6 +908,7 @@ static int check_volume_empty(struct ubifs_info *c)
* Opt_no_bulk_read: disable bulk-reads
* Opt_chk_data_crc: check CRCs when reading data nodes
* Opt_no_chk_data_crc: do not check CRCs when reading data nodes
+ * Opt_override_compr: override default compressor
* Opt_err: just end of array marker
*/
enum {
@@ -887,6 +918,7 @@ enum {
Opt_no_bulk_read,
Opt_chk_data_crc,
Opt_no_chk_data_crc,
+ Opt_override_compr,
Opt_err,
};
@@ -897,6 +929,7 @@ static const match_table_t tokens = {
{Opt_no_bulk_read, "no_bulk_read"},
{Opt_chk_data_crc, "chk_data_crc"},
{Opt_no_chk_data_crc, "no_chk_data_crc"},
+ {Opt_override_compr, "compr=%s"},
{Opt_err, NULL},
};
@@ -950,6 +983,28 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options,
c->mount_opts.chk_data_crc = 1;
c->no_chk_data_crc = 1;
break;
+ case Opt_override_compr:
+ {
+ char *name = match_strdup(&args[0]);
+
+ if (!name)
+ return -ENOMEM;
+ if (!strcmp(name, "none"))
+ c->mount_opts.compr_type = UBIFS_COMPR_NONE;
+ else if (!strcmp(name, "lzo"))
+ c->mount_opts.compr_type = UBIFS_COMPR_LZO;
+ else if (!strcmp(name, "zlib"))
+ c->mount_opts.compr_type = UBIFS_COMPR_ZLIB;
+ else {
+ ubifs_err("unknown compressor \"%s\"", name);
+ kfree(name);
+ return -EINVAL;
+ }
+ kfree(name);
+ c->mount_opts.override_compr = 1;
+ c->default_compr = c->mount_opts.compr_type;
+ break;
+ }
default:
ubifs_err("unrecognized mount option \"%s\" "
"or missing value", p);
@@ -1019,6 +1074,30 @@ again:
}
/**
+ * check_free_space - check if there is enough free space to mount.
+ * @c: UBIFS file-system description object
+ *
+ * This function makes sure UBIFS has enough free space to be mounted in
+ * read/write mode. UBIFS must always have some free space to allow deletions.
+ */
+static int check_free_space(struct ubifs_info *c)
+{
+ ubifs_assert(c->dark_wm > 0);
+ if (c->lst.total_free + c->lst.total_dirty < c->dark_wm) {
+ ubifs_err("insufficient free space to mount in read/write mode");
+ dbg_dump_budg(c);
+ dbg_dump_lprops(c);
+ /*
+ * We return %-EINVAL instead of %-ENOSPC because it seems to
+ * be the closest error code mentioned in the mount function
+ * documentation.
+ */
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/**
* mount_ubifs - mount UBIFS file-system.
* @c: UBIFS file-system description object
*
@@ -1039,11 +1118,9 @@ static int mount_ubifs(struct ubifs_info *c)
if (err)
return err;
-#ifdef CONFIG_UBIFS_FS_DEBUG
- c->dbg_buf = vmalloc(c->leb_size);
- if (!c->dbg_buf)
- return -ENOMEM;
-#endif
+ err = ubifs_debugging_init(c);
+ if (err)
+ return err;
err = check_volume_empty(c);
if (err)
@@ -1100,27 +1177,25 @@ static int mount_ubifs(struct ubifs_info *c)
goto out_free;
/*
- * Make sure the compressor which is set as the default on in the
- * superblock was actually compiled in.
+ * Make sure the compressor which is set as default in the superblock
+ * or overridden by mount options is actually compiled in.
*/
if (!ubifs_compr_present(c->default_compr)) {
- ubifs_warn("'%s' compressor is set by superblock, but not "
- "compiled in", ubifs_compr_name(c->default_compr));
- c->default_compr = UBIFS_COMPR_NONE;
+ ubifs_err("'compressor \"%s\" is not compiled in",
+ ubifs_compr_name(c->default_compr));
+ goto out_free;
}
- dbg_failure_mode_registration(c);
-
- err = init_constants_late(c);
+ err = init_constants_sb(c);
if (err)
- goto out_dereg;
+ goto out_free;
sz = ALIGN(c->max_idx_node_sz, c->min_io_size);
sz = ALIGN(sz + c->max_idx_node_sz, c->min_io_size);
c->cbuf = kmalloc(sz, GFP_NOFS);
if (!c->cbuf) {
err = -ENOMEM;
- goto out_dereg;
+ goto out_free;
}
sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id);
@@ -1145,6 +1220,8 @@ static int mount_ubifs(struct ubifs_info *c)
if (err)
goto out_master;
+ init_constants_master(c);
+
if ((c->mst_node->flags & cpu_to_le32(UBIFS_MST_DIRTY)) != 0) {
ubifs_msg("recovery needed");
c->need_recovery = 1;
@@ -1183,12 +1260,9 @@ static int mount_ubifs(struct ubifs_info *c)
if (!mounted_read_only) {
int lnum;
- /* Check for enough free space */
- if (ubifs_calc_available(c, c->min_idx_lebs) <= 0) {
- ubifs_err("insufficient available space");
- err = -EINVAL;
+ err = check_free_space(c);
+ if (err)
goto out_orphans;
- }
/* Check for enough log space */
lnum = c->lhead_lnum + 1;
@@ -1232,6 +1306,10 @@ static int mount_ubifs(struct ubifs_info *c)
}
}
+ err = dbg_debugfs_init_fs(c);
+ if (err)
+ goto out_infos;
+
err = dbg_check_filesystem(c);
if (err)
goto out_infos;
@@ -1283,8 +1361,20 @@ static int mount_ubifs(struct ubifs_info *c)
dbg_msg("tree fanout: %d", c->fanout);
dbg_msg("reserved GC LEB: %d", c->gc_lnum);
dbg_msg("first main LEB: %d", c->main_first);
+ dbg_msg("max. znode size %d", c->max_znode_sz);
+ dbg_msg("max. index node size %d", c->max_idx_node_sz);
+ dbg_msg("node sizes: data %zu, inode %zu, dentry %zu",
+ UBIFS_DATA_NODE_SZ, UBIFS_INO_NODE_SZ, UBIFS_DENT_NODE_SZ);
+ dbg_msg("node sizes: trun %zu, sb %zu, master %zu",
+ UBIFS_TRUN_NODE_SZ, UBIFS_SB_NODE_SZ, UBIFS_MST_NODE_SZ);
+ dbg_msg("node sizes: ref %zu, cmt. start %zu, orph %zu",
+ UBIFS_REF_NODE_SZ, UBIFS_CS_NODE_SZ, UBIFS_ORPH_NODE_SZ);
+ dbg_msg("max. node sizes: data %zu, inode %zu dentry %zu",
+ UBIFS_MAX_DATA_NODE_SZ, UBIFS_MAX_INO_NODE_SZ,
+ UBIFS_MAX_DENT_NODE_SZ);
dbg_msg("dead watermark: %d", c->dead_wm);
dbg_msg("dark watermark: %d", c->dark_wm);
+ dbg_msg("LEB overhead: %d", c->leb_overhead);
x = (long long)c->main_lebs * c->dark_wm;
dbg_msg("max. dark space: %lld (%lld KiB, %lld MiB)",
x, x >> 10, x >> 20);
@@ -1320,14 +1410,12 @@ out_wbufs:
free_wbufs(c);
out_cbuf:
kfree(c->cbuf);
-out_dereg:
- dbg_failure_mode_deregistration(c);
out_free:
kfree(c->bu.buf);
vfree(c->ileb_buf);
vfree(c->sbuf);
kfree(c->bottom_up_buf);
- UBIFS_DBG(vfree(c->dbg_buf));
+ ubifs_debugging_exit(c);
return err;
}
@@ -1345,6 +1433,7 @@ static void ubifs_umount(struct ubifs_info *c)
dbg_gen("un-mounting UBI device %d, volume %d", c->vi.ubi_num,
c->vi.vol_id);
+ dbg_debugfs_exit_fs(c);
spin_lock(&ubifs_infos_lock);
list_del(&c->infos_list);
spin_unlock(&ubifs_infos_lock);
@@ -1364,8 +1453,7 @@ static void ubifs_umount(struct ubifs_info *c)
vfree(c->ileb_buf);
vfree(c->sbuf);
kfree(c->bottom_up_buf);
- UBIFS_DBG(vfree(c->dbg_buf));
- dbg_failure_mode_deregistration(c);
+ ubifs_debugging_exit(c);
}
/**
@@ -1387,12 +1475,9 @@ static int ubifs_remount_rw(struct ubifs_info *c)
c->remounting_rw = 1;
c->always_chk_crc = 1;
- /* Check for enough free space */
- if (ubifs_calc_available(c, c->min_idx_lebs) <= 0) {
- ubifs_err("insufficient available space");
- err = -EINVAL;
+ err = check_free_space(c);
+ if (err)
goto out;
- }
if (c->old_leb_cnt != c->leb_cnt) {
struct ubifs_sb_node *sup;
@@ -1515,20 +1600,24 @@ out:
* @c: UBIFS file-system description object
*
* This function is called during un-mounting and re-mounting, and it commits
- * the journal unless the "fast unmount" mode is enabled. It also avoids
- * committing the journal if it contains too few data.
+ * the journal unless the "fast unmount" mode is enabled.
*/
static void commit_on_unmount(struct ubifs_info *c)
{
- if (!c->fast_unmount) {
- long long bud_bytes;
+ struct super_block *sb = c->vfs_sb;
+ long long bud_bytes;
- spin_lock(&c->buds_lock);
- bud_bytes = c->bud_bytes;
- spin_unlock(&c->buds_lock);
- if (bud_bytes > c->leb_size)
- ubifs_run_commit(c);
- }
+ /*
+ * This function is called before the background thread is stopped, so
+ * we may race with ongoing commit, which means we have to take
+ * @c->buds_lock to access @c->bud_bytes.
+ */
+ spin_lock(&c->buds_lock);
+ bud_bytes = c->bud_bytes;
+ spin_unlock(&c->buds_lock);
+
+ if (!c->fast_unmount && !(sb->s_flags & MS_RDONLY) && bud_bytes)
+ ubifs_run_commit(c);
}
/**
@@ -1849,7 +1938,6 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
goto out_iput;
mutex_unlock(&c->umount_mutex);
-
return 0;
out_iput:
@@ -1955,7 +2043,7 @@ static void ubifs_kill_sb(struct super_block *sb)
* We do 'commit_on_unmount()' here instead of 'ubifs_put_super()'
* in order to be outside BKL.
*/
- if (sb->s_root && !(sb->s_flags & MS_RDONLY))
+ if (sb->s_root)
commit_on_unmount(c);
/* The un-mount routine is actually done in put_super() */
generic_shutdown_super(sb);
@@ -2021,6 +2109,14 @@ static int __init ubifs_init(void)
BUILD_BUG_ON(UBIFS_REF_NODE_SZ != 64);
/*
+ * We use 2 bit wide bit-fields to store compression type, which should
+ * be amended if more compressors are added. The bit-fields are:
+ * @compr_type in 'struct ubifs_inode', @default_compr in
+ * 'struct ubifs_info' and @compr_type in 'struct ubifs_mount_opts'.
+ */
+ BUILD_BUG_ON(UBIFS_COMPR_TYPES_CNT > 4);
+
+ /*
* We require that PAGE_CACHE_SIZE is greater-than-or-equal-to
* UBIFS_BLOCK_SIZE. It is assumed that both are powers of 2.
*/
@@ -2049,11 +2145,17 @@ static int __init ubifs_init(void)
err = ubifs_compressors_init();
if (err)
+ goto out_shrinker;
+
+ err = dbg_debugfs_init();
+ if (err)
goto out_compr;
return 0;
out_compr:
+ ubifs_compressors_exit();
+out_shrinker:
unregister_shrinker(&ubifs_shrinker_info);
kmem_cache_destroy(ubifs_inode_slab);
out_reg:
@@ -2068,6 +2170,7 @@ static void __exit ubifs_exit(void)
ubifs_assert(list_empty(&ubifs_infos));
ubifs_assert(atomic_long_read(&ubifs_clean_zn_cnt) == 0);
+ dbg_debugfs_exit();
ubifs_compressors_exit();
unregister_shrinker(&ubifs_shrinker_info);
kmem_cache_destroy(ubifs_inode_slab);
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index 6eef5344a145..f7e36f545527 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -2245,12 +2245,11 @@ int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key,
if (found) {
/* Ensure the znode is dirtied */
if (znode->cnext || !ubifs_zn_dirty(znode)) {
- znode = dirty_cow_bottom_up(c,
- znode);
- if (IS_ERR(znode)) {
- err = PTR_ERR(znode);
- goto out_unlock;
- }
+ znode = dirty_cow_bottom_up(c, znode);
+ if (IS_ERR(znode)) {
+ err = PTR_ERR(znode);
+ goto out_unlock;
+ }
}
zbr = &znode->zbranch[n];
lnc_free(zbr);
@@ -2317,11 +2316,11 @@ int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key,
/* Ensure the znode is dirtied */
if (znode->cnext || !ubifs_zn_dirty(znode)) {
- znode = dirty_cow_bottom_up(c, znode);
- if (IS_ERR(znode)) {
- err = PTR_ERR(znode);
- goto out_unlock;
- }
+ znode = dirty_cow_bottom_up(c, znode);
+ if (IS_ERR(znode)) {
+ err = PTR_ERR(znode);
+ goto out_unlock;
+ }
}
if (found == 1) {
@@ -2627,11 +2626,11 @@ int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key,
/* Ensure the znode is dirtied */
if (znode->cnext || !ubifs_zn_dirty(znode)) {
- znode = dirty_cow_bottom_up(c, znode);
- if (IS_ERR(znode)) {
- err = PTR_ERR(znode);
- goto out_unlock;
- }
+ znode = dirty_cow_bottom_up(c, znode);
+ if (IS_ERR(znode)) {
+ err = PTR_ERR(znode);
+ goto out_unlock;
+ }
}
/* Remove all keys in range except the first */
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index 8ac76b1c2d55..fde8d127c768 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -553,8 +553,8 @@ static int layout_in_empty_space(struct ubifs_info *c)
}
#ifdef CONFIG_UBIFS_FS_DEBUG
- c->new_ihead_lnum = lnum;
- c->new_ihead_offs = buf_offs;
+ c->dbg->new_ihead_lnum = lnum;
+ c->dbg->new_ihead_offs = buf_offs;
#endif
return 0;
@@ -802,8 +802,10 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot)
* budgeting subsystem to assume the index is already committed,
* even though it is not.
*/
+ ubifs_assert(c->min_idx_lebs == ubifs_calc_min_idx_lebs(c));
c->old_idx_sz = c->calc_idx_sz;
c->budg_uncommitted_idx = 0;
+ c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
spin_unlock(&c->space_lock);
mutex_unlock(&c->tnc_mutex);
@@ -1002,7 +1004,8 @@ static int write_index(struct ubifs_info *c)
}
#ifdef CONFIG_UBIFS_FS_DEBUG
- if (lnum != c->new_ihead_lnum || buf_offs != c->new_ihead_offs) {
+ if (lnum != c->dbg->new_ihead_lnum ||
+ buf_offs != c->dbg->new_ihead_offs) {
ubifs_err("inconsistent ihead");
return -EINVAL;
}
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index 0b378042a3a2..b25fc36cf72f 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -51,6 +51,13 @@
*/
#define UBIFS_MIN_COMPR_LEN 128
+/*
+ * If compressed data length is less than %UBIFS_MIN_COMPRESS_DIFF bytes
+ * shorter than uncompressed data length, UBIFS prefers to leave this data
+ * node uncompressed, because it'll be read faster.
+ */
+#define UBIFS_MIN_COMPRESS_DIFF 64
+
/* Root inode number */
#define UBIFS_ROOT_INO 1
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 46b172560a06..fc2a4cc66d03 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -63,6 +63,14 @@
#define SQNUM_WARN_WATERMARK 0xFFFFFFFF00000000ULL
#define SQNUM_WATERMARK 0xFFFFFFFFFF000000ULL
+/*
+ * Minimum amount of LEBs reserved for the index. At present the index needs at
+ * least 2 LEBs: one for the index head and one for in-the-gaps method (which
+ * currently does not cater for the index head and so excludes it from
+ * consideration).
+ */
+#define MIN_INDEX_LEBS 2
+
/* Minimum amount of data UBIFS writes to the flash */
#define MIN_WRITE_SZ (UBIFS_DATA_NODE_SZ + 8)
@@ -386,12 +394,12 @@ struct ubifs_inode {
unsigned int dirty:1;
unsigned int xattr:1;
unsigned int bulk_read:1;
+ unsigned int compr_type:2;
struct mutex ui_mutex;
spinlock_t ui_lock;
loff_t synced_i_size;
loff_t ui_size;
int flags;
- int compr_type;
pgoff_t last_page_read;
pgoff_t read_in_a_row;
int data_len;
@@ -419,7 +427,7 @@ struct ubifs_unclean_leb {
*
* LPROPS_UNCAT: not categorized
* LPROPS_DIRTY: dirty > 0, not index
- * LPROPS_DIRTY_IDX: dirty + free > UBIFS_CH_SZ and index
+ * LPROPS_DIRTY_IDX: dirty + free > @c->min_idx_node_sz and index
* LPROPS_FREE: free > 0, not empty, not index
* LPROPS_HEAP_CNT: number of heaps used for storing categorized LEBs
* LPROPS_EMPTY: LEB is empty, not taken
@@ -473,8 +481,8 @@ struct ubifs_lprops {
struct ubifs_lpt_lprops {
int free;
int dirty;
- unsigned tgc : 1;
- unsigned cmt : 1;
+ unsigned tgc:1;
+ unsigned cmt:1;
};
/**
@@ -482,24 +490,26 @@ struct ubifs_lpt_lprops {
* @empty_lebs: number of empty LEBs
* @taken_empty_lebs: number of taken LEBs
* @idx_lebs: number of indexing LEBs
- * @total_free: total free space in bytes
- * @total_dirty: total dirty space in bytes
- * @total_used: total used space in bytes (includes only data LEBs)
- * @total_dead: total dead space in bytes (includes only data LEBs)
- * @total_dark: total dark space in bytes (includes only data LEBs)
+ * @total_free: total free space in bytes (includes all LEBs)
+ * @total_dirty: total dirty space in bytes (includes all LEBs)
+ * @total_used: total used space in bytes (does not include index LEBs)
+ * @total_dead: total dead space in bytes (does not include index LEBs)
+ * @total_dark: total dark space in bytes (does not include index LEBs)
+ *
+ * The @taken_empty_lebs field counts the LEBs that are in the transient state
+ * of having been "taken" for use but not yet written to. @taken_empty_lebs is
+ * needed to account correctly for @gc_lnum, otherwise @empty_lebs could be
+ * used by itself (in which case 'unused_lebs' would be a better name). In the
+ * case of @gc_lnum, it is "taken" at mount time or whenever a LEB is retained
+ * by GC, but unlike other empty LEBs that are "taken", it may not be written
+ * straight away (i.e. before the next commit start or unmount), so either
+ * @gc_lnum must be specially accounted for, or the current approach followed
+ * i.e. count it under @taken_empty_lebs.
*
- * N.B. total_dirty and total_used are different to other total_* fields,
- * because they account _all_ LEBs, not just data LEBs.
+ * @empty_lebs includes @taken_empty_lebs.
*
- * 'taken_empty_lebs' counts the LEBs that are in the transient state of having
- * been 'taken' for use but not yet written to. 'taken_empty_lebs' is needed
- * to account correctly for gc_lnum, otherwise 'empty_lebs' could be used
- * by itself (in which case 'unused_lebs' would be a better name). In the case
- * of gc_lnum, it is 'taken' at mount time or whenever a LEB is retained by GC,
- * but unlike other empty LEBs that are 'taken', it may not be written straight
- * away (i.e. before the next commit start or unmount), so either gc_lnum must
- * be specially accounted for, or the current approach followed i.e. count it
- * under 'taken_empty_lebs'.
+ * @total_used, @total_dead and @total_dark fields do not account indexing
+ * LEBs.
*/
struct ubifs_lp_stats {
int empty_lebs;
@@ -893,15 +903,25 @@ struct ubifs_orphan {
/**
* struct ubifs_mount_opts - UBIFS-specific mount options information.
* @unmount_mode: selected unmount mode (%0 default, %1 normal, %2 fast)
- * @bulk_read: enable bulk-reads
- * @chk_data_crc: check CRCs when reading data nodes
+ * @bulk_read: enable/disable bulk-reads (%0 default, %1 disable, %2 enable)
+ * @chk_data_crc: enable/disable CRC data checking when reading data nodes
+ * (%0 default, %1 disable, %2 enable)
+ * @override_compr: override default compressor (%0 - do not override and use
+ * superblock compressor, %1 - override and use compressor
+ * specified in @compr_type)
+ * @compr_type: compressor type to override the superblock compressor with
+ * (%UBIFS_COMPR_NONE, etc)
*/
struct ubifs_mount_opts {
unsigned int unmount_mode:2;
unsigned int bulk_read:2;
unsigned int chk_data_crc:2;
+ unsigned int override_compr:1;
+ unsigned int compr_type:2;
};
+struct ubifs_debug_info;
+
/**
* struct ubifs_info - UBIFS file-system description data structure
* (per-superblock).
@@ -946,6 +966,7 @@ struct ubifs_mount_opts {
* @no_chk_data_crc: do not check CRCs when reading data nodes (except during
* recovery)
* @bulk_read: enable bulk-reads
+ * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc)
*
* @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and
* @calc_idx_sz
@@ -963,8 +984,6 @@ struct ubifs_mount_opts {
* @ileb_nxt: next pre-allocated index LEBs
* @old_idx: tree of index nodes obsoleted since the last commit start
* @bottom_up_buf: a buffer which is used by 'dirty_cow_bottom_up()' in tnc.c
- * @new_ihead_lnum: used by debugging to check ihead_lnum
- * @new_ihead_offs: used by debugging to check ihead_offs
*
* @mst_node: master node
* @mst_offs: offset of valid master node
@@ -986,7 +1005,6 @@ struct ubifs_mount_opts {
* @main_lebs: count of LEBs in the main area
* @main_first: first LEB of the main area
* @main_bytes: main area size in bytes
- * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc)
*
* @key_hash_type: type of the key hash
* @key_hash: direntry key hash function
@@ -1149,15 +1167,7 @@ struct ubifs_mount_opts {
* @always_chk_crc: always check CRCs (while mounting and remounting rw)
* @mount_opts: UBIFS-specific mount options
*
- * @dbg_buf: a buffer of LEB size used for debugging purposes
- * @old_zroot: old index root - used by 'dbg_check_old_index()'
- * @old_zroot_level: old index root level - used by 'dbg_check_old_index()'
- * @old_zroot_sqnum: old index root sqnum - used by 'dbg_check_old_index()'
- * @failure_mode: failure mode for recovery testing
- * @fail_delay: 0=>don't delay, 1=>delay a time, 2=>delay a number of calls
- * @fail_timeout: time in jiffies when delay of failure mode expires
- * @fail_cnt: current number of calls to failure mode I/O functions
- * @fail_cnt_max: number of calls by which to delay failure mode
+ * @dbg: debugging-related information
*/
struct ubifs_info {
struct super_block *vfs_sb;
@@ -1196,6 +1206,7 @@ struct ubifs_info {
unsigned int big_lpt:1;
unsigned int no_chk_data_crc:1;
unsigned int bulk_read:1;
+ unsigned int default_compr:2;
struct mutex tnc_mutex;
struct ubifs_zbranch zroot;
@@ -1212,10 +1223,6 @@ struct ubifs_info {
int ileb_nxt;
struct rb_root old_idx;
int *bottom_up_buf;
-#ifdef CONFIG_UBIFS_FS_DEBUG
- int new_ihead_lnum;
- int new_ihead_offs;
-#endif
struct ubifs_mst_node *mst_node;
int mst_offs;
@@ -1237,7 +1244,6 @@ struct ubifs_info {
int main_lebs;
int main_first;
long long main_bytes;
- int default_compr;
uint8_t key_hash_type;
uint32_t (*key_hash)(const char *str, int len);
@@ -1315,8 +1321,8 @@ struct ubifs_info {
void *sbuf;
struct list_head idx_gc;
int idx_gc_cnt;
- volatile int gc_seq;
- volatile int gced_lnum;
+ int gc_seq;
+ int gced_lnum;
struct list_head infos_list;
struct mutex umount_mutex;
@@ -1391,21 +1397,7 @@ struct ubifs_info {
struct ubifs_mount_opts mount_opts;
#ifdef CONFIG_UBIFS_FS_DEBUG
- void *dbg_buf;
- struct ubifs_zbranch old_zroot;
- int old_zroot_level;
- unsigned long long old_zroot_sqnum;
- int failure_mode;
- int fail_delay;
- unsigned long fail_timeout;
- unsigned int fail_cnt;
- unsigned int fail_cnt_max;
- long long chk_lpt_sz;
- long long chk_lpt_sz2;
- long long chk_lpt_wastage;
- int chk_lpt_lebs;
- int new_nhead_lnum;
- int new_nhead_offs;
+ struct ubifs_debug_info *dbg;
#endif
};
@@ -1505,7 +1497,7 @@ void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode,
long long ubifs_get_free_space(struct ubifs_info *c);
int ubifs_calc_min_idx_lebs(struct ubifs_info *c);
void ubifs_convert_page_budget(struct ubifs_info *c);
-long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free);
+long long ubifs_reported_space(const struct ubifs_info *c, long long free);
long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs);
/* find.c */
@@ -1639,6 +1631,9 @@ void ubifs_add_lpt_dirt(struct ubifs_info *c, int lnum, int dirty);
void ubifs_add_nnode_dirt(struct ubifs_info *c, struct ubifs_nnode *nnode);
uint32_t ubifs_unpack_bits(uint8_t **addr, int *pos, int nrbits);
struct ubifs_nnode *ubifs_first_nnode(struct ubifs_info *c, int *hght);
+/* Needed only in debugging code in lpt_commit.c */
+int ubifs_unpack_nnode(const struct ubifs_info *c, void *buf,
+ struct ubifs_nnode *nnode);
/* lpt_commit.c */
int ubifs_lpt_start_commit(struct ubifs_info *c);
@@ -1714,7 +1709,7 @@ long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
/* compressor.c */
int __init ubifs_compressors_init(void);
-void __exit ubifs_compressors_exit(void);
+void ubifs_compressors_exit(void);
void ubifs_compress(const void *in_buf, int in_len, void *out_buf, int *out_len,
int *compr_type);
int ubifs_decompress(const void *buf, int len, void *out, int *out_len,
diff --git a/fs/xattr.c b/fs/xattr.c
index 468377e66531..237804cd6b56 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -175,7 +175,7 @@ vfs_listxattr(struct dentry *d, char *list, size_t size)
if (error)
return error;
error = -EOPNOTSUPP;
- if (d->d_inode->i_op && d->d_inode->i_op->listxattr) {
+ if (d->d_inode->i_op->listxattr) {
error = d->d_inode->i_op->listxattr(d, list, size);
} else {
error = security_inode_listsecurity(d->d_inode, list, size);
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 737c9a425361..c3dc491fff89 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -85,13 +85,13 @@ xfs-y += xfs_alloc.o \
xfs_trans_inode.o \
xfs_trans_item.o \
xfs_utils.o \
- xfs_vfsops.o \
xfs_vnodeops.o \
xfs_rw.o \
xfs_dmops.o \
xfs_qmops.o
-xfs-$(CONFIG_XFS_TRACE) += xfs_dir2_trace.o
+xfs-$(CONFIG_XFS_TRACE) += xfs_btree_trace.o \
+ xfs_dir2_trace.o
# Objects in linux/
xfs-y += $(addprefix $(XFS_LINUX)/, \
@@ -106,7 +106,7 @@ xfs-y += $(addprefix $(XFS_LINUX)/, \
xfs_iops.o \
xfs_lrw.o \
xfs_super.o \
- xfs_vnode.o \
+ xfs_sync.o \
xfs_xattr.o)
# Objects in support/
diff --git a/fs/xfs/linux-2.6/sv.h b/fs/xfs/linux-2.6/sv.h
index 351a8f454bd1..4dfc7c370819 100644
--- a/fs/xfs/linux-2.6/sv.h
+++ b/fs/xfs/linux-2.6/sv.h
@@ -32,23 +32,15 @@ typedef struct sv_s {
wait_queue_head_t waiters;
} sv_t;
-#define SV_FIFO 0x0 /* sv_t is FIFO type */
-#define SV_LIFO 0x2 /* sv_t is LIFO type */
-#define SV_PRIO 0x4 /* sv_t is PRIO type */
-#define SV_KEYED 0x6 /* sv_t is KEYED type */
-#define SV_DEFAULT SV_FIFO
-
-
-static inline void _sv_wait(sv_t *sv, spinlock_t *lock, int state,
- unsigned long timeout)
+static inline void _sv_wait(sv_t *sv, spinlock_t *lock)
{
DECLARE_WAITQUEUE(wait, current);
add_wait_queue_exclusive(&sv->waiters, &wait);
- __set_current_state(state);
+ __set_current_state(TASK_UNINTERRUPTIBLE);
spin_unlock(lock);
- schedule_timeout(timeout);
+ schedule();
remove_wait_queue(&sv->waiters, &wait);
}
@@ -58,13 +50,7 @@ static inline void _sv_wait(sv_t *sv, spinlock_t *lock, int state,
#define sv_destroy(sv) \
/*NOTHING*/
#define sv_wait(sv, pri, lock, s) \
- _sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT)
-#define sv_wait_sig(sv, pri, lock, s) \
- _sv_wait(sv, lock, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT)
-#define sv_timedwait(sv, pri, lock, s, svf, ts, rts) \
- _sv_wait(sv, lock, TASK_UNINTERRUPTIBLE, timespec_to_jiffies(ts))
-#define sv_timedwait_sig(sv, pri, lock, s, svf, ts, rts) \
- _sv_wait(sv, lock, TASK_INTERRUPTIBLE, timespec_to_jiffies(ts))
+ _sv_wait(sv, lock)
#define sv_signal(sv) \
wake_up(&(sv)->waiters)
#define sv_broadcast(sv) \
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index a44d68eb50b5..de3a198f771e 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -42,6 +42,40 @@
#include <linux/pagevec.h>
#include <linux/writeback.h>
+
+/*
+ * Prime number of hash buckets since address is used as the key.
+ */
+#define NVSYNC 37
+#define to_ioend_wq(v) (&xfs_ioend_wq[((unsigned long)v) % NVSYNC])
+static wait_queue_head_t xfs_ioend_wq[NVSYNC];
+
+void __init
+xfs_ioend_init(void)
+{
+ int i;
+
+ for (i = 0; i < NVSYNC; i++)
+ init_waitqueue_head(&xfs_ioend_wq[i]);
+}
+
+void
+xfs_ioend_wait(
+ xfs_inode_t *ip)
+{
+ wait_queue_head_t *wq = to_ioend_wq(ip);
+
+ wait_event(*wq, (atomic_read(&ip->i_iocount) == 0));
+}
+
+STATIC void
+xfs_ioend_wake(
+ xfs_inode_t *ip)
+{
+ if (atomic_dec_and_test(&ip->i_iocount))
+ wake_up(to_ioend_wq(ip));
+}
+
STATIC void
xfs_count_page_state(
struct page *page,
@@ -146,16 +180,25 @@ xfs_destroy_ioend(
xfs_ioend_t *ioend)
{
struct buffer_head *bh, *next;
+ struct xfs_inode *ip = XFS_I(ioend->io_inode);
for (bh = ioend->io_buffer_head; bh; bh = next) {
next = bh->b_private;
bh->b_end_io(bh, !ioend->io_error);
}
- if (unlikely(ioend->io_error)) {
- vn_ioerror(XFS_I(ioend->io_inode), ioend->io_error,
- __FILE__,__LINE__);
+
+ /*
+ * Volume managers supporting multiple paths can send back ENODEV
+ * when the final path disappears. In this case continuing to fill
+ * the page cache with dirty data which cannot be written out is
+ * evil, so prevent that.
+ */
+ if (unlikely(ioend->io_error == -ENODEV)) {
+ xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ,
+ __FILE__, __LINE__);
}
- vn_iowake(XFS_I(ioend->io_inode));
+
+ xfs_ioend_wake(ip);
mempool_free(ioend, xfs_ioend_pool);
}
@@ -191,7 +234,7 @@ xfs_setfilesize(
ip->i_d.di_size = isize;
ip->i_update_core = 1;
ip->i_update_size = 1;
- mark_inode_dirty_sync(ioend->io_inode);
+ xfs_mark_inode_dirty_sync(ip);
}
xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -317,14 +360,9 @@ xfs_map_blocks(
xfs_iomap_t *mapp,
int flags)
{
- xfs_inode_t *ip = XFS_I(inode);
- int error, nmaps = 1;
-
- error = xfs_iomap(ip, offset, count,
- flags, mapp, &nmaps);
- if (!error && (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)))
- xfs_iflags_set(ip, XFS_IMODIFIED);
- return -error;
+ int nmaps = 1;
+
+ return -xfs_iomap(XFS_I(inode), offset, count, flags, mapp, &nmaps);
}
STATIC_INLINE int
@@ -512,7 +550,7 @@ xfs_cancel_ioend(
unlock_buffer(bh);
} while ((bh = next_bh) != NULL);
- vn_iowake(XFS_I(ioend->io_inode));
+ xfs_ioend_wake(XFS_I(ioend->io_inode));
mempool_free(ioend, xfs_ioend_pool);
} while ((ioend = next) != NULL);
}
diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h
index 3ba0631a3818..7b26f5ff9692 100644
--- a/fs/xfs/linux-2.6/xfs_aops.h
+++ b/fs/xfs/linux-2.6/xfs_aops.h
@@ -43,4 +43,7 @@ typedef struct xfs_ioend {
extern const struct address_space_operations xfs_address_space_operations;
extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int);
+extern void xfs_ioend_init(void);
+extern void xfs_ioend_wait(struct xfs_inode *);
+
#endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 36d5fcd3f593..cb329edc925b 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -630,6 +630,29 @@ xfs_buf_get_flags(
return NULL;
}
+STATIC int
+_xfs_buf_read(
+ xfs_buf_t *bp,
+ xfs_buf_flags_t flags)
+{
+ int status;
+
+ XB_TRACE(bp, "_xfs_buf_read", (unsigned long)flags);
+
+ ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE)));
+ ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
+
+ bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \
+ XBF_READ_AHEAD | _XBF_RUN_QUEUES);
+ bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | \
+ XBF_READ_AHEAD | _XBF_RUN_QUEUES);
+
+ status = xfs_buf_iorequest(bp);
+ if (!status && !(flags & XBF_ASYNC))
+ status = xfs_buf_iowait(bp);
+ return status;
+}
+
xfs_buf_t *
xfs_buf_read_flags(
xfs_buftarg_t *target,
@@ -646,7 +669,7 @@ xfs_buf_read_flags(
if (!XFS_BUF_ISDONE(bp)) {
XB_TRACE(bp, "read", (unsigned long)flags);
XFS_STATS_INC(xb_get_read);
- xfs_buf_iostart(bp, flags);
+ _xfs_buf_read(bp, flags);
} else if (flags & XBF_ASYNC) {
XB_TRACE(bp, "read_async", (unsigned long)flags);
/*
@@ -1048,50 +1071,39 @@ xfs_buf_ioerror(
XB_TRACE(bp, "ioerror", (unsigned long)error);
}
-/*
- * Initiate I/O on a buffer, based on the flags supplied.
- * The b_iodone routine in the buffer supplied will only be called
- * when all of the subsidiary I/O requests, if any, have been completed.
- */
int
-xfs_buf_iostart(
- xfs_buf_t *bp,
- xfs_buf_flags_t flags)
+xfs_bawrite(
+ void *mp,
+ struct xfs_buf *bp)
{
- int status = 0;
+ XB_TRACE(bp, "bawrite", 0);
- XB_TRACE(bp, "iostart", (unsigned long)flags);
+ ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
- if (flags & XBF_DELWRI) {
- bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC);
- bp->b_flags |= flags & (XBF_DELWRI | XBF_ASYNC);
- xfs_buf_delwri_queue(bp, 1);
- return 0;
- }
+ xfs_buf_delwri_dequeue(bp);
- bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_ASYNC | XBF_DELWRI | \
- XBF_READ_AHEAD | _XBF_RUN_QUEUES);
- bp->b_flags |= flags & (XBF_READ | XBF_WRITE | XBF_ASYNC | \
- XBF_READ_AHEAD | _XBF_RUN_QUEUES);
+ bp->b_flags &= ~(XBF_READ | XBF_DELWRI | XBF_READ_AHEAD);
+ bp->b_flags |= (XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES);
+
+ bp->b_mount = mp;
+ bp->b_strat = xfs_bdstrat_cb;
+ return xfs_bdstrat_cb(bp);
+}
- BUG_ON(bp->b_bn == XFS_BUF_DADDR_NULL);
+void
+xfs_bdwrite(
+ void *mp,
+ struct xfs_buf *bp)
+{
+ XB_TRACE(bp, "bdwrite", 0);
- /* For writes allow an alternate strategy routine to precede
- * the actual I/O request (which may not be issued at all in
- * a shutdown situation, for example).
- */
- status = (flags & XBF_WRITE) ?
- xfs_buf_iostrategy(bp) : xfs_buf_iorequest(bp);
+ bp->b_strat = xfs_bdstrat_cb;
+ bp->b_mount = mp;
- /* Wait for I/O if we are not an async request.
- * Note: async I/O request completion will release the buffer,
- * and that can already be done by this point. So using the
- * buffer pointer from here on, after async I/O, is invalid.
- */
- if (!status && !(flags & XBF_ASYNC))
- status = xfs_buf_iowait(bp);
+ bp->b_flags &= ~XBF_READ;
+ bp->b_flags |= (XBF_DELWRI | XBF_ASYNC);
- return status;
+ xfs_buf_delwri_queue(bp, 1);
}
STATIC_INLINE void
@@ -1114,8 +1126,7 @@ xfs_buf_bio_end_io(
unsigned int blocksize = bp->b_target->bt_bsize;
struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
- if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
- bp->b_error = EIO;
+ xfs_buf_ioerror(bp, -error);
do {
struct page *page = bvec->bv_page;
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index 456519a088c7..288ae7c4c800 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -168,7 +168,7 @@ typedef struct xfs_buf {
struct completion b_iowait; /* queue for I/O waiters */
void *b_fspriv;
void *b_fspriv2;
- void *b_fspriv3;
+ struct xfs_mount *b_mount;
unsigned short b_error; /* error code on I/O */
unsigned int b_page_count; /* size of page array */
unsigned int b_offset; /* page offset in first page */
@@ -214,9 +214,10 @@ extern void xfs_buf_lock(xfs_buf_t *);
extern void xfs_buf_unlock(xfs_buf_t *);
/* Buffer Read and Write Routines */
+extern int xfs_bawrite(void *mp, xfs_buf_t *bp);
+extern void xfs_bdwrite(void *mp, xfs_buf_t *bp);
extern void xfs_buf_ioend(xfs_buf_t *, int);
extern void xfs_buf_ioerror(xfs_buf_t *, int);
-extern int xfs_buf_iostart(xfs_buf_t *, xfs_buf_flags_t);
extern int xfs_buf_iorequest(xfs_buf_t *);
extern int xfs_buf_iowait(xfs_buf_t *);
extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, xfs_caddr_t,
@@ -311,10 +312,6 @@ extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *);
#define XFS_BUF_UNORDERED(bp) ((bp)->b_flags &= ~XBF_ORDERED)
#define XFS_BUF_ISORDERED(bp) ((bp)->b_flags & XBF_ORDERED)
-#define XFS_BUF_SHUT(bp) do { } while (0)
-#define XFS_BUF_UNSHUT(bp) do { } while (0)
-#define XFS_BUF_ISSHUT(bp) (0)
-
#define XFS_BUF_HOLD(bp) xfs_buf_hold(bp)
#define XFS_BUF_READ(bp) ((bp)->b_flags |= XBF_READ)
#define XFS_BUF_UNREAD(bp) ((bp)->b_flags &= ~XBF_READ)
@@ -334,8 +331,6 @@ extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *);
#define XFS_BUF_SET_FSPRIVATE(bp, val) ((bp)->b_fspriv = (void*)(val))
#define XFS_BUF_FSPRIVATE2(bp, type) ((type)(bp)->b_fspriv2)
#define XFS_BUF_SET_FSPRIVATE2(bp, val) ((bp)->b_fspriv2 = (void*)(val))
-#define XFS_BUF_FSPRIVATE3(bp, type) ((type)(bp)->b_fspriv3)
-#define XFS_BUF_SET_FSPRIVATE3(bp, val) ((bp)->b_fspriv3 = (void*)(val))
#define XFS_BUF_SET_START(bp) do { } while (0)
#define XFS_BUF_SET_BRELSE_FUNC(bp, func) ((bp)->b_relse = (func))
@@ -366,14 +361,6 @@ extern void xfs_buf_trace(xfs_buf_t *, char *, void *, void *);
#define XFS_BUF_TARGET(bp) ((bp)->b_target)
#define XFS_BUFTARG_NAME(target) xfs_buf_target_name(target)
-static inline int xfs_bawrite(void *mp, xfs_buf_t *bp)
-{
- bp->b_fspriv3 = mp;
- bp->b_strat = xfs_bdstrat_cb;
- xfs_buf_delwri_dequeue(bp);
- return xfs_buf_iostart(bp, XBF_WRITE | XBF_ASYNC | _XBF_RUN_QUEUES);
-}
-
static inline void xfs_buf_relse(xfs_buf_t *bp)
{
if (!bp->b_relse)
@@ -414,17 +401,6 @@ static inline int XFS_bwrite(xfs_buf_t *bp)
return error;
}
-/*
- * No error can be returned from xfs_buf_iostart for delwri
- * buffers as they are queued and no I/O is issued.
- */
-static inline void xfs_bdwrite(void *mp, xfs_buf_t *bp)
-{
- bp->b_strat = xfs_bdstrat_cb;
- bp->b_fspriv3 = mp;
- (void)xfs_buf_iostart(bp, XBF_DELWRI | XBF_ASYNC);
-}
-
#define XFS_bdstrat(bp) xfs_buf_iorequest(bp)
#define xfs_iowait(bp) xfs_buf_iowait(bp)
diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h
index 8c022cd0ad67..55bddf3b6091 100644
--- a/fs/xfs/linux-2.6/xfs_cred.h
+++ b/fs/xfs/linux-2.6/xfs_cred.h
@@ -25,12 +25,4 @@
*/
typedef const struct cred cred_t;
-extern cred_t *sys_cred;
-
-/* this is a hack.. (assumes sys_cred is the only cred_t in the system) */
-static inline int capable_cred(cred_t *cr, int cid)
-{
- return (cr == sys_cred) ? 1 : capable(cid);
-}
-
#endif /* __XFS_CRED_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c
index 7f7abec25e14..595751f78350 100644
--- a/fs/xfs/linux-2.6/xfs_export.c
+++ b/fs/xfs/linux-2.6/xfs_export.c
@@ -29,7 +29,6 @@
#include "xfs_vnodeops.h"
#include "xfs_bmap_btree.h"
#include "xfs_inode.h"
-#include "xfs_vfsops.h"
/*
* Note that we only accept fileids which are long enough rather than allow
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index 3fee790f138b..e14c4e3aea0c 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -36,89 +36,54 @@
#include "xfs_inode.h"
#include "xfs_error.h"
#include "xfs_rw.h"
-#include "xfs_ioctl32.h"
#include "xfs_vnodeops.h"
+#include "xfs_da_btree.h"
+#include "xfs_ioctl.h"
#include <linux/dcache.h>
#include <linux/smp_lock.h>
static struct vm_operations_struct xfs_file_vm_ops;
-STATIC_INLINE ssize_t
-__xfs_file_read(
+STATIC ssize_t
+xfs_file_aio_read(
struct kiocb *iocb,
const struct iovec *iov,
unsigned long nr_segs,
- int ioflags,
loff_t pos)
{
struct file *file = iocb->ki_filp;
+ int ioflags = IO_ISAIO;
BUG_ON(iocb->ki_pos != pos);
if (unlikely(file->f_flags & O_DIRECT))
ioflags |= IO_ISDIRECT;
+ if (file->f_mode & FMODE_NOCMTIME)
+ ioflags |= IO_INVIS;
return xfs_read(XFS_I(file->f_path.dentry->d_inode), iocb, iov,
nr_segs, &iocb->ki_pos, ioflags);
}
STATIC ssize_t
-xfs_file_aio_read(
- struct kiocb *iocb,
- const struct iovec *iov,
- unsigned long nr_segs,
- loff_t pos)
-{
- return __xfs_file_read(iocb, iov, nr_segs, IO_ISAIO, pos);
-}
-
-STATIC ssize_t
-xfs_file_aio_read_invis(
- struct kiocb *iocb,
- const struct iovec *iov,
- unsigned long nr_segs,
- loff_t pos)
-{
- return __xfs_file_read(iocb, iov, nr_segs, IO_ISAIO|IO_INVIS, pos);
-}
-
-STATIC_INLINE ssize_t
-__xfs_file_write(
+xfs_file_aio_write(
struct kiocb *iocb,
const struct iovec *iov,
unsigned long nr_segs,
- int ioflags,
loff_t pos)
{
- struct file *file = iocb->ki_filp;
+ struct file *file = iocb->ki_filp;
+ int ioflags = IO_ISAIO;
BUG_ON(iocb->ki_pos != pos);
if (unlikely(file->f_flags & O_DIRECT))
ioflags |= IO_ISDIRECT;
+ if (file->f_mode & FMODE_NOCMTIME)
+ ioflags |= IO_INVIS;
return xfs_write(XFS_I(file->f_mapping->host), iocb, iov, nr_segs,
&iocb->ki_pos, ioflags);
}
STATIC ssize_t
-xfs_file_aio_write(
- struct kiocb *iocb,
- const struct iovec *iov,
- unsigned long nr_segs,
- loff_t pos)
-{
- return __xfs_file_write(iocb, iov, nr_segs, IO_ISAIO, pos);
-}
-
-STATIC ssize_t
-xfs_file_aio_write_invis(
- struct kiocb *iocb,
- const struct iovec *iov,
- unsigned long nr_segs,
- loff_t pos)
-{
- return __xfs_file_write(iocb, iov, nr_segs, IO_ISAIO|IO_INVIS, pos);
-}
-
-STATIC ssize_t
xfs_file_splice_read(
struct file *infilp,
loff_t *ppos,
@@ -126,20 +91,13 @@ xfs_file_splice_read(
size_t len,
unsigned int flags)
{
- return xfs_splice_read(XFS_I(infilp->f_path.dentry->d_inode),
- infilp, ppos, pipe, len, flags, 0);
-}
+ int ioflags = 0;
+
+ if (infilp->f_mode & FMODE_NOCMTIME)
+ ioflags |= IO_INVIS;
-STATIC ssize_t
-xfs_file_splice_read_invis(
- struct file *infilp,
- loff_t *ppos,
- struct pipe_inode_info *pipe,
- size_t len,
- unsigned int flags)
-{
return xfs_splice_read(XFS_I(infilp->f_path.dentry->d_inode),
- infilp, ppos, pipe, len, flags, IO_INVIS);
+ infilp, ppos, pipe, len, flags, ioflags);
}
STATIC ssize_t
@@ -150,30 +108,49 @@ xfs_file_splice_write(
size_t len,
unsigned int flags)
{
- return xfs_splice_write(XFS_I(outfilp->f_path.dentry->d_inode),
- pipe, outfilp, ppos, len, flags, 0);
-}
+ int ioflags = 0;
+
+ if (outfilp->f_mode & FMODE_NOCMTIME)
+ ioflags |= IO_INVIS;
-STATIC ssize_t
-xfs_file_splice_write_invis(
- struct pipe_inode_info *pipe,
- struct file *outfilp,
- loff_t *ppos,
- size_t len,
- unsigned int flags)
-{
return xfs_splice_write(XFS_I(outfilp->f_path.dentry->d_inode),
- pipe, outfilp, ppos, len, flags, IO_INVIS);
+ pipe, outfilp, ppos, len, flags, ioflags);
}
STATIC int
xfs_file_open(
struct inode *inode,
- struct file *filp)
+ struct file *file)
{
- if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
+ if (!(file->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS)
return -EFBIG;
- return -xfs_open(XFS_I(inode));
+ if (XFS_FORCED_SHUTDOWN(XFS_M(inode->i_sb)))
+ return -EIO;
+ return 0;
+}
+
+STATIC int
+xfs_dir_open(
+ struct inode *inode,
+ struct file *file)
+{
+ struct xfs_inode *ip = XFS_I(inode);
+ int mode;
+ int error;
+
+ error = xfs_file_open(inode, file);
+ if (error)
+ return error;
+
+ /*
+ * If there are any blocks, read-ahead block 0 as we're almost
+ * certain to have the next operation be a read there.
+ */
+ mode = xfs_ilock_map_shared(ip);
+ if (ip->i_d.di_nextents > 0)
+ xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
+ xfs_iunlock(ip, mode);
+ return 0;
}
STATIC int
@@ -227,7 +204,7 @@ xfs_file_readdir(
* point we can change the ->readdir prototype to include the
* buffer size.
*/
- bufsize = (size_t)min_t(loff_t, PAGE_SIZE, inode->i_size);
+ bufsize = (size_t)min_t(loff_t, PAGE_SIZE, ip->i_d.di_size);
error = xfs_readdir(ip, dirent, bufsize,
(xfs_off_t *)&filp->f_pos, filldir);
@@ -248,48 +225,6 @@ xfs_file_mmap(
return 0;
}
-STATIC long
-xfs_file_ioctl(
- struct file *filp,
- unsigned int cmd,
- unsigned long p)
-{
- int error;
- struct inode *inode = filp->f_path.dentry->d_inode;
-
- error = xfs_ioctl(XFS_I(inode), filp, 0, cmd, (void __user *)p);
- xfs_iflags_set(XFS_I(inode), XFS_IMODIFIED);
-
- /* NOTE: some of the ioctl's return positive #'s as a
- * byte count indicating success, such as
- * readlink_by_handle. So we don't "sign flip"
- * like most other routines. This means true
- * errors need to be returned as a negative value.
- */
- return error;
-}
-
-STATIC long
-xfs_file_ioctl_invis(
- struct file *filp,
- unsigned int cmd,
- unsigned long p)
-{
- int error;
- struct inode *inode = filp->f_path.dentry->d_inode;
-
- error = xfs_ioctl(XFS_I(inode), filp, IO_INVIS, cmd, (void __user *)p);
- xfs_iflags_set(XFS_I(inode), XFS_IMODIFIED);
-
- /* NOTE: some of the ioctl's return positive #'s as a
- * byte count indicating success, such as
- * readlink_by_handle. So we don't "sign flip"
- * like most other routines. This means true
- * errors need to be returned as a negative value.
- */
- return error;
-}
-
/*
* mmap()d file has taken write protection fault and is being made
* writable. We can set the page state up correctly for a writable
@@ -325,26 +260,8 @@ const struct file_operations xfs_file_operations = {
#endif
};
-const struct file_operations xfs_invis_file_operations = {
- .llseek = generic_file_llseek,
- .read = do_sync_read,
- .write = do_sync_write,
- .aio_read = xfs_file_aio_read_invis,
- .aio_write = xfs_file_aio_write_invis,
- .splice_read = xfs_file_splice_read_invis,
- .splice_write = xfs_file_splice_write_invis,
- .unlocked_ioctl = xfs_file_ioctl_invis,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = xfs_file_compat_invis_ioctl,
-#endif
- .mmap = xfs_file_mmap,
- .open = xfs_file_open,
- .release = xfs_file_release,
- .fsync = xfs_file_fsync,
-};
-
-
const struct file_operations xfs_dir_file_operations = {
+ .open = xfs_dir_open,
.read = generic_read_dir,
.readdir = xfs_file_readdir,
.llseek = generic_file_llseek,
diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c
index 36caa6d957df..5aeb77776961 100644
--- a/fs/xfs/linux-2.6/xfs_fs_subr.c
+++ b/fs/xfs/linux-2.6/xfs_fs_subr.c
@@ -24,6 +24,10 @@ int fs_noerr(void) { return 0; }
int fs_nosys(void) { return ENOSYS; }
void fs_noval(void) { return; }
+/*
+ * note: all filemap functions return negative error codes. These
+ * need to be inverted before returning to the xfs core functions.
+ */
void
xfs_tosspages(
xfs_inode_t *ip,
@@ -53,7 +57,7 @@ xfs_flushinval_pages(
if (!ret)
truncate_inode_pages(mapping, first);
}
- return ret;
+ return -ret;
}
int
@@ -72,10 +76,23 @@ xfs_flush_pages(
xfs_iflags_clear(ip, XFS_ITRUNCATED);
ret = filemap_fdatawrite(mapping);
if (flags & XFS_B_ASYNC)
- return ret;
+ return -ret;
ret2 = filemap_fdatawait(mapping);
if (!ret)
ret = ret2;
}
- return ret;
+ return -ret;
+}
+
+int
+xfs_wait_on_pages(
+ xfs_inode_t *ip,
+ xfs_off_t first,
+ xfs_off_t last)
+{
+ struct address_space *mapping = VFS_I(ip)->i_mapping;
+
+ if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK))
+ return -filemap_fdatawait(mapping);
+ return 0;
}
diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/linux-2.6/xfs_globals.c
index ef90e64641e6..2ae8b1ccb02e 100644
--- a/fs/xfs/linux-2.6/xfs_globals.c
+++ b/fs/xfs/linux-2.6/xfs_globals.c
@@ -26,7 +26,6 @@
*/
xfs_param_t xfs_params = {
/* MIN DFLT MAX */
- .restrict_chown = { 0, 1, 1 },
.sgid_inherit = { 0, 0, 1 },
.symlink_mode = { 0, 0, 1 },
.panic_mask = { 0, 0, 255 },
@@ -43,10 +42,3 @@ xfs_param_t xfs_params = {
.inherit_nodfrg = { 0, 1, 1 },
.fstrm_timer = { 1, 30*100, 3600*100},
};
-
-/*
- * Global system credential structure.
- */
-static cred_t sys_cred_val;
-cred_t *sys_cred = &sys_cred_val;
-
diff --git a/fs/xfs/linux-2.6/xfs_globals.h b/fs/xfs/linux-2.6/xfs_globals.h
index 6eda8a3eb6f1..69f71caf061c 100644
--- a/fs/xfs/linux-2.6/xfs_globals.h
+++ b/fs/xfs/linux-2.6/xfs_globals.h
@@ -19,6 +19,5 @@
#define __XFS_GLOBALS_H__
extern uint64_t xfs_panic_mask; /* set to cause more panics */
-extern cred_t *sys_cred;
#endif /* __XFS_GLOBALS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c
index 281cbd5a25cf..67205f6198ba 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl.c
@@ -68,26 +68,22 @@
* XFS_IOC_PATH_TO_HANDLE
* returns full handle for a path
*/
-STATIC int
+int
xfs_find_handle(
unsigned int cmd,
- void __user *arg)
+ xfs_fsop_handlereq_t *hreq)
{
int hsize;
xfs_handle_t handle;
- xfs_fsop_handlereq_t hreq;
struct inode *inode;
- if (copy_from_user(&hreq, arg, sizeof(hreq)))
- return -XFS_ERROR(EFAULT);
-
memset((char *)&handle, 0, sizeof(handle));
switch (cmd) {
case XFS_IOC_PATH_TO_FSHANDLE:
case XFS_IOC_PATH_TO_HANDLE: {
struct path path;
- int error = user_lpath((const char __user *)hreq.path, &path);
+ int error = user_lpath((const char __user *)hreq->path, &path);
if (error)
return error;
@@ -101,7 +97,7 @@ xfs_find_handle(
case XFS_IOC_FD_TO_HANDLE: {
struct file *file;
- file = fget(hreq.fd);
+ file = fget(hreq->fd);
if (!file)
return -EBADF;
@@ -158,8 +154,8 @@ xfs_find_handle(
}
/* now copy our handle into the user buffer & write out the size */
- if (copy_to_user(hreq.ohandle, &handle, hsize) ||
- copy_to_user(hreq.ohandlen, &hsize, sizeof(__s32))) {
+ if (copy_to_user(hreq->ohandle, &handle, hsize) ||
+ copy_to_user(hreq->ohandlen, &hsize, sizeof(__s32))) {
iput(inode);
return -XFS_ERROR(EFAULT);
}
@@ -249,10 +245,10 @@ xfs_vget_fsop_handlereq(
return 0;
}
-STATIC int
+int
xfs_open_by_handle(
xfs_mount_t *mp,
- void __user *arg,
+ xfs_fsop_handlereq_t *hreq,
struct file *parfilp,
struct inode *parinode)
{
@@ -263,14 +259,11 @@ xfs_open_by_handle(
struct file *filp;
struct inode *inode;
struct dentry *dentry;
- xfs_fsop_handlereq_t hreq;
if (!capable(CAP_SYS_ADMIN))
return -XFS_ERROR(EPERM);
- if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
- return -XFS_ERROR(EFAULT);
- error = xfs_vget_fsop_handlereq(mp, parinode, &hreq, &inode);
+ error = xfs_vget_fsop_handlereq(mp, parinode, hreq, &inode);
if (error)
return -error;
@@ -281,10 +274,10 @@ xfs_open_by_handle(
}
#if BITS_PER_LONG != 32
- hreq.oflags |= O_LARGEFILE;
+ hreq->oflags |= O_LARGEFILE;
#endif
/* Put open permission in namei format. */
- permflag = hreq.oflags;
+ permflag = hreq->oflags;
if ((permflag+1) & O_ACCMODE)
permflag++;
if (permflag & O_TRUNC)
@@ -322,15 +315,16 @@ xfs_open_by_handle(
mntget(parfilp->f_path.mnt);
/* Create file pointer. */
- filp = dentry_open(dentry, parfilp->f_path.mnt, hreq.oflags, cred);
+ filp = dentry_open(dentry, parfilp->f_path.mnt, hreq->oflags, cred);
if (IS_ERR(filp)) {
put_unused_fd(new_fd);
return -XFS_ERROR(-PTR_ERR(filp));
}
+
if (inode->i_mode & S_IFREG) {
/* invisible operation should not change atime */
filp->f_flags |= O_NOATIME;
- filp->f_op = &xfs_invis_file_operations;
+ filp->f_mode |= FMODE_NOCMTIME;
}
fd_install(new_fd, filp);
@@ -363,24 +357,21 @@ do_readlink(
}
-STATIC int
+int
xfs_readlink_by_handle(
xfs_mount_t *mp,
- void __user *arg,
+ xfs_fsop_handlereq_t *hreq,
struct inode *parinode)
{
struct inode *inode;
- xfs_fsop_handlereq_t hreq;
__u32 olen;
void *link;
int error;
if (!capable(CAP_SYS_ADMIN))
return -XFS_ERROR(EPERM);
- if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
- return -XFS_ERROR(EFAULT);
- error = xfs_vget_fsop_handlereq(mp, parinode, &hreq, &inode);
+ error = xfs_vget_fsop_handlereq(mp, parinode, hreq, &inode);
if (error)
return -error;
@@ -390,7 +381,7 @@ xfs_readlink_by_handle(
goto out_iput;
}
- if (copy_from_user(&olen, hreq.ohandlen, sizeof(__u32))) {
+ if (copy_from_user(&olen, hreq->ohandlen, sizeof(__u32))) {
error = -XFS_ERROR(EFAULT);
goto out_iput;
}
@@ -402,7 +393,7 @@ xfs_readlink_by_handle(
error = -xfs_readlink(XFS_I(inode), link);
if (error)
goto out_kfree;
- error = do_readlink(hreq.ohandle, olen, link);
+ error = do_readlink(hreq->ohandle, olen, link);
if (error)
goto out_kfree;
@@ -501,7 +492,7 @@ xfs_attrlist_by_handle(
return -error;
}
-STATIC int
+int
xfs_attrmulti_attr_get(
struct inode *inode,
char *name,
@@ -530,7 +521,7 @@ xfs_attrmulti_attr_get(
return error;
}
-STATIC int
+int
xfs_attrmulti_attr_set(
struct inode *inode,
char *name,
@@ -560,7 +551,7 @@ xfs_attrmulti_attr_set(
return error;
}
-STATIC int
+int
xfs_attrmulti_attr_remove(
struct inode *inode,
char *name,
@@ -662,19 +653,26 @@ xfs_attrmulti_by_handle(
return -error;
}
-STATIC int
+int
xfs_ioc_space(
struct xfs_inode *ip,
struct inode *inode,
struct file *filp,
int ioflags,
unsigned int cmd,
- void __user *arg)
+ xfs_flock64_t *bf)
{
- xfs_flock64_t bf;
int attr_flags = 0;
int error;
+ /*
+ * Only allow the sys admin to reserve space unless
+ * unwritten extents are enabled.
+ */
+ if (!xfs_sb_version_hasextflgbit(&ip->i_mount->m_sb) &&
+ !capable(CAP_SYS_ADMIN))
+ return -XFS_ERROR(EPERM);
+
if (inode->i_flags & (S_IMMUTABLE|S_APPEND))
return -XFS_ERROR(EPERM);
@@ -684,16 +682,12 @@ xfs_ioc_space(
if (!S_ISREG(inode->i_mode))
return -XFS_ERROR(EINVAL);
- if (copy_from_user(&bf, arg, sizeof(bf)))
- return -XFS_ERROR(EFAULT);
-
if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
attr_flags |= XFS_ATTR_NONBLOCK;
if (ioflags & IO_INVIS)
attr_flags |= XFS_ATTR_DMI;
- error = xfs_change_file_space(ip, cmd, &bf, filp->f_pos,
- NULL, attr_flags);
+ error = xfs_change_file_space(ip, cmd, bf, filp->f_pos, attr_flags);
return -error;
}
@@ -1105,10 +1099,6 @@ xfs_ioctl_setattr(
/*
* Change file ownership. Must be the owner or privileged.
- * If the system was configured with the "restricted_chown"
- * option, the owner is not permitted to give away the file,
- * and can change the group id only to a group of which he
- * or she is a member.
*/
if (mask & FSX_PROJID) {
/*
@@ -1137,7 +1127,7 @@ xfs_ioctl_setattr(
* the superblock version number since projids didn't
* exist before DINODE_VERSION_2 and SB_VERSION_NLINK.
*/
- if (ip->i_d.di_version == XFS_DINODE_VERSION_1)
+ if (ip->i_d.di_version == 1)
xfs_bump_ino_vers2(tp, ip);
}
@@ -1256,43 +1246,67 @@ xfs_ioc_setxflags(
}
STATIC int
+xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full)
+{
+ struct getbmap __user *base = *ap;
+
+ /* copy only getbmap portion (not getbmapx) */
+ if (copy_to_user(base, bmv, sizeof(struct getbmap)))
+ return XFS_ERROR(EFAULT);
+
+ *ap += sizeof(struct getbmap);
+ return 0;
+}
+
+STATIC int
xfs_ioc_getbmap(
struct xfs_inode *ip,
int ioflags,
unsigned int cmd,
void __user *arg)
{
- struct getbmap bm;
- int iflags;
+ struct getbmapx bmx = { 0 }; /* only the getbmap-sized header is read in */
int error;
- if (copy_from_user(&bm, arg, sizeof(bm)))
+ if (copy_from_user(&bmx, arg, sizeof(struct getbmap)))
return -XFS_ERROR(EFAULT);
- if (bm.bmv_count < 2)
+ if (bmx.bmv_count < 2)
return -XFS_ERROR(EINVAL);
- iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0);
+ bmx.bmv_iflags = (cmd == XFS_IOC_GETBMAPA ? BMV_IF_ATTRFORK : 0);
if (ioflags & IO_INVIS)
- iflags |= BMV_IF_NO_DMAPI_READ;
+ bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ;
- error = xfs_getbmap(ip, &bm, (struct getbmap __user *)arg+1, iflags);
+ error = xfs_getbmap(ip, &bmx, xfs_getbmap_format,
+ (struct getbmap *)arg+1);
if (error)
return -error;
- if (copy_to_user(arg, &bm, sizeof(bm)))
+ /* copy back header - only size of getbmap */
+ if (copy_to_user(arg, &bmx, sizeof(struct getbmap)))
return -XFS_ERROR(EFAULT);
return 0;
}
STATIC int
+xfs_getbmapx_format(void **ap, struct getbmapx *bmv, int *full)
+{
+ struct getbmapx __user *base = *ap;
+
+ if (copy_to_user(base, bmv, sizeof(struct getbmapx)))
+ return XFS_ERROR(EFAULT);
+
+ *ap += sizeof(struct getbmapx);
+ return 0;
+}
+
+STATIC int
xfs_ioc_getbmapx(
struct xfs_inode *ip,
void __user *arg)
{
struct getbmapx bmx;
- struct getbmap bm;
- int iflags;
int error;
if (copy_from_user(&bmx, arg, sizeof(bmx)))
@@ -1301,46 +1315,46 @@ xfs_ioc_getbmapx(
if (bmx.bmv_count < 2)
return -XFS_ERROR(EINVAL);
- /*
- * Map input getbmapx structure to a getbmap
- * structure for xfs_getbmap.
- */
- GETBMAP_CONVERT(bmx, bm);
-
- iflags = bmx.bmv_iflags;
-
- if (iflags & (~BMV_IF_VALID))
+ if (bmx.bmv_iflags & (~BMV_IF_VALID))
return -XFS_ERROR(EINVAL);
- iflags |= BMV_IF_EXTENDED;
-
- error = xfs_getbmap(ip, &bm, (struct getbmapx __user *)arg+1, iflags);
+ error = xfs_getbmap(ip, &bmx, xfs_getbmapx_format,
+ (struct getbmapx *)arg+1);
if (error)
return -error;
- GETBMAP_CONVERT(bm, bmx);
-
- if (copy_to_user(arg, &bmx, sizeof(bmx)))
+ /* copy back header */
+ if (copy_to_user(arg, &bmx, sizeof(struct getbmapx)))
return -XFS_ERROR(EFAULT);
return 0;
}
-int
-xfs_ioctl(
- xfs_inode_t *ip,
+/*
+ * Note: some of the ioctls return a positive number (a byte count) to
+ * indicate success, such as readlink_by_handle, so we don't "sign flip"
+ * like most other routines. This means true errors need to be returned
+ * as a negative value.
+ */
+long
+xfs_file_ioctl(
struct file *filp,
- int ioflags,
unsigned int cmd,
- void __user *arg)
+ unsigned long p)
{
struct inode *inode = filp->f_path.dentry->d_inode;
- xfs_mount_t *mp = ip->i_mount;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ void __user *arg = (void __user *)p;
+ int ioflags = 0;
int error;
- xfs_itrace_entry(XFS_I(inode));
- switch (cmd) {
+ if (filp->f_mode & FMODE_NOCMTIME)
+ ioflags |= IO_INVIS;
+ xfs_itrace_entry(ip);
+
+ switch (cmd) {
case XFS_IOC_ALLOCSP:
case XFS_IOC_FREESP:
case XFS_IOC_RESVSP:
@@ -1348,17 +1362,13 @@ xfs_ioctl(
case XFS_IOC_ALLOCSP64:
case XFS_IOC_FREESP64:
case XFS_IOC_RESVSP64:
- case XFS_IOC_UNRESVSP64:
- /*
- * Only allow the sys admin to reserve space unless
- * unwritten extents are enabled.
- */
- if (!xfs_sb_version_hasextflgbit(&mp->m_sb) &&
- !capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- return xfs_ioc_space(ip, inode, filp, ioflags, cmd, arg);
+ case XFS_IOC_UNRESVSP64: {
+ xfs_flock64_t bf;
+ if (copy_from_user(&bf, arg, sizeof(bf)))
+ return -XFS_ERROR(EFAULT);
+ return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
+ }
case XFS_IOC_DIOINFO: {
struct dioattr da;
xfs_buftarg_t *target =
@@ -1418,18 +1428,30 @@ xfs_ioctl(
case XFS_IOC_FD_TO_HANDLE:
case XFS_IOC_PATH_TO_HANDLE:
- case XFS_IOC_PATH_TO_FSHANDLE:
- return xfs_find_handle(cmd, arg);
+ case XFS_IOC_PATH_TO_FSHANDLE: {
+ xfs_fsop_handlereq_t hreq;
- case XFS_IOC_OPEN_BY_HANDLE:
- return xfs_open_by_handle(mp, arg, filp, inode);
+ if (copy_from_user(&hreq, arg, sizeof(hreq)))
+ return -XFS_ERROR(EFAULT);
+ return xfs_find_handle(cmd, &hreq);
+ }
+ case XFS_IOC_OPEN_BY_HANDLE: {
+ xfs_fsop_handlereq_t hreq;
+ if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
+ return -XFS_ERROR(EFAULT);
+ return xfs_open_by_handle(mp, &hreq, filp, inode);
+ }
case XFS_IOC_FSSETDM_BY_HANDLE:
return xfs_fssetdm_by_handle(mp, arg, inode);
- case XFS_IOC_READLINK_BY_HANDLE:
- return xfs_readlink_by_handle(mp, arg, inode);
+ case XFS_IOC_READLINK_BY_HANDLE: {
+ xfs_fsop_handlereq_t hreq;
+ if (copy_from_user(&hreq, arg, sizeof(xfs_fsop_handlereq_t)))
+ return -XFS_ERROR(EFAULT);
+ return xfs_readlink_by_handle(mp, &hreq, inode);
+ }
case XFS_IOC_ATTRLIST_BY_HANDLE:
return xfs_attrlist_by_handle(mp, arg, inode);
@@ -1437,7 +1459,11 @@ xfs_ioctl(
return xfs_attrmulti_by_handle(mp, arg, filp, inode);
case XFS_IOC_SWAPEXT: {
- error = xfs_swapext((struct xfs_swapext __user *)arg);
+ struct xfs_swapext sxp;
+
+ if (copy_from_user(&sxp, arg, sizeof(xfs_swapext_t)))
+ return -XFS_ERROR(EFAULT);
+ error = xfs_swapext(&sxp);
return -error;
}
@@ -1493,9 +1519,6 @@ xfs_ioctl(
case XFS_IOC_FSGROWFSDATA: {
xfs_growfs_data_t in;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
if (copy_from_user(&in, arg, sizeof(in)))
return -XFS_ERROR(EFAULT);
@@ -1506,9 +1529,6 @@ xfs_ioctl(
case XFS_IOC_FSGROWFSLOG: {
xfs_growfs_log_t in;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
if (copy_from_user(&in, arg, sizeof(in)))
return -XFS_ERROR(EFAULT);
@@ -1519,9 +1539,6 @@ xfs_ioctl(
case XFS_IOC_FSGROWFSRT: {
xfs_growfs_rt_t in;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
if (copy_from_user(&in, arg, sizeof(in)))
return -XFS_ERROR(EFAULT);
diff --git a/fs/xfs/linux-2.6/xfs_ioctl.h b/fs/xfs/linux-2.6/xfs_ioctl.h
new file mode 100644
index 000000000000..8c16bf2d7e03
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_ioctl.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2008 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_IOCTL_H__
+#define __XFS_IOCTL_H__
+
+extern int
+xfs_ioc_space(
+ struct xfs_inode *ip,
+ struct inode *inode,
+ struct file *filp,
+ int ioflags,
+ unsigned int cmd,
+ xfs_flock64_t *bf);
+
+extern int
+xfs_find_handle(
+ unsigned int cmd,
+ xfs_fsop_handlereq_t *hreq);
+
+extern int
+xfs_open_by_handle(
+ xfs_mount_t *mp,
+ xfs_fsop_handlereq_t *hreq,
+ struct file *parfilp,
+ struct inode *parinode);
+
+extern int
+xfs_readlink_by_handle(
+ xfs_mount_t *mp,
+ xfs_fsop_handlereq_t *hreq,
+ struct inode *parinode);
+
+extern int
+xfs_attrmulti_attr_get(
+ struct inode *inode,
+ char *name,
+ char __user *ubuf,
+ __uint32_t *len,
+ __uint32_t flags);
+
+extern int
+ xfs_attrmulti_attr_set(
+ struct inode *inode,
+ char *name,
+ const char __user *ubuf,
+ __uint32_t len,
+ __uint32_t flags);
+
+extern int
+xfs_attrmulti_attr_remove(
+ struct inode *inode,
+ char *name,
+ __uint32_t flags);
+
+extern long
+xfs_file_ioctl(
+ struct file *filp,
+ unsigned int cmd,
+ unsigned long p);
+
+extern long
+xfs_file_compat_ioctl(
+ struct file *file,
+ unsigned int cmd,
+ unsigned long arg);
+
+#endif
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c
index a4b254eb43b2..0504cece9f66 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.c
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.c
@@ -16,11 +16,7 @@
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include <linux/compat.h>
-#include <linux/init.h>
#include <linux/ioctl.h>
-#include <linux/syscalls.h>
-#include <linux/types.h>
-#include <linux/fs.h>
#include <asm/uaccess.h>
#include "xfs.h"
#include "xfs_fs.h"
@@ -36,7 +32,6 @@
#include "xfs_bmap_btree.h"
#include "xfs_attr_sf.h"
#include "xfs_dir2_sf.h"
-#include "xfs_vfs.h"
#include "xfs_vnode.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
@@ -44,221 +39,219 @@
#include "xfs_error.h"
#include "xfs_dfrag.h"
#include "xfs_vnodeops.h"
+#include "xfs_fsops.h"
+#include "xfs_alloc.h"
+#include "xfs_rtalloc.h"
+#include "xfs_attr.h"
+#include "xfs_ioctl.h"
#include "xfs_ioctl32.h"
#define _NATIVE_IOC(cmd, type) \
_IOC(_IOC_DIR(cmd), _IOC_TYPE(cmd), _IOC_NR(cmd), sizeof(type))
-#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
-#define BROKEN_X86_ALIGNMENT
-#define _PACKED __attribute__((packed))
-/* on ia32 l_start is on a 32-bit boundary */
-typedef struct xfs_flock64_32 {
- __s16 l_type;
- __s16 l_whence;
- __s64 l_start __attribute__((packed));
- /* len == 0 means until end of file */
- __s64 l_len __attribute__((packed));
- __s32 l_sysid;
- __u32 l_pid;
- __s32 l_pad[4]; /* reserve area */
-} xfs_flock64_32_t;
-
-#define XFS_IOC_ALLOCSP_32 _IOW ('X', 10, struct xfs_flock64_32)
-#define XFS_IOC_FREESP_32 _IOW ('X', 11, struct xfs_flock64_32)
-#define XFS_IOC_ALLOCSP64_32 _IOW ('X', 36, struct xfs_flock64_32)
-#define XFS_IOC_FREESP64_32 _IOW ('X', 37, struct xfs_flock64_32)
-#define XFS_IOC_RESVSP_32 _IOW ('X', 40, struct xfs_flock64_32)
-#define XFS_IOC_UNRESVSP_32 _IOW ('X', 41, struct xfs_flock64_32)
-#define XFS_IOC_RESVSP64_32 _IOW ('X', 42, struct xfs_flock64_32)
-#define XFS_IOC_UNRESVSP64_32 _IOW ('X', 43, struct xfs_flock64_32)
-
-/* just account for different alignment */
-STATIC unsigned long
-xfs_ioctl32_flock(
- unsigned long arg)
+#ifdef BROKEN_X86_ALIGNMENT
+STATIC int
+xfs_compat_flock64_copyin(
+ xfs_flock64_t *bf,
+ compat_xfs_flock64_t __user *arg32)
{
- xfs_flock64_32_t __user *p32 = (void __user *)arg;
- xfs_flock64_t __user *p = compat_alloc_user_space(sizeof(*p));
-
- if (copy_in_user(&p->l_type, &p32->l_type, sizeof(s16)) ||
- copy_in_user(&p->l_whence, &p32->l_whence, sizeof(s16)) ||
- copy_in_user(&p->l_start, &p32->l_start, sizeof(s64)) ||
- copy_in_user(&p->l_len, &p32->l_len, sizeof(s64)) ||
- copy_in_user(&p->l_sysid, &p32->l_sysid, sizeof(s32)) ||
- copy_in_user(&p->l_pid, &p32->l_pid, sizeof(u32)) ||
- copy_in_user(&p->l_pad, &p32->l_pad, 4*sizeof(u32)))
- return -EFAULT;
-
- return (unsigned long)p;
+ if (get_user(bf->l_type, &arg32->l_type) ||
+ get_user(bf->l_whence, &arg32->l_whence) ||
+ get_user(bf->l_start, &arg32->l_start) ||
+ get_user(bf->l_len, &arg32->l_len) ||
+ get_user(bf->l_sysid, &arg32->l_sysid) ||
+ get_user(bf->l_pid, &arg32->l_pid) ||
+ copy_from_user(bf->l_pad, &arg32->l_pad, 4*sizeof(u32)))
+ return -XFS_ERROR(EFAULT);
+ return 0;
}
-typedef struct compat_xfs_fsop_geom_v1 {
- __u32 blocksize; /* filesystem (data) block size */
- __u32 rtextsize; /* realtime extent size */
- __u32 agblocks; /* fsblocks in an AG */
- __u32 agcount; /* number of allocation groups */
- __u32 logblocks; /* fsblocks in the log */
- __u32 sectsize; /* (data) sector size, bytes */
- __u32 inodesize; /* inode size in bytes */
- __u32 imaxpct; /* max allowed inode space(%) */
- __u64 datablocks; /* fsblocks in data subvolume */
- __u64 rtblocks; /* fsblocks in realtime subvol */
- __u64 rtextents; /* rt extents in realtime subvol*/
- __u64 logstart; /* starting fsblock of the log */
- unsigned char uuid[16]; /* unique id of the filesystem */
- __u32 sunit; /* stripe unit, fsblocks */
- __u32 swidth; /* stripe width, fsblocks */
- __s32 version; /* structure version */
- __u32 flags; /* superblock version flags */
- __u32 logsectsize; /* log sector size, bytes */
- __u32 rtsectsize; /* realtime sector size, bytes */
- __u32 dirblocksize; /* directory block size, bytes */
-} __attribute__((packed)) compat_xfs_fsop_geom_v1_t;
-
-#define XFS_IOC_FSGEOMETRY_V1_32 \
- _IOR ('X', 100, struct compat_xfs_fsop_geom_v1)
-
-STATIC unsigned long xfs_ioctl32_geom_v1(unsigned long arg)
+STATIC int
+xfs_compat_ioc_fsgeometry_v1(
+ struct xfs_mount *mp,
+ compat_xfs_fsop_geom_v1_t __user *arg32)
{
- compat_xfs_fsop_geom_v1_t __user *p32 = (void __user *)arg;
- xfs_fsop_geom_v1_t __user *p = compat_alloc_user_space(sizeof(*p));
+ xfs_fsop_geom_t fsgeo;
+ int error;
- if (copy_in_user(p, p32, sizeof(*p32)))
- return -EFAULT;
- return (unsigned long)p;
+ error = xfs_fs_geometry(mp, &fsgeo, 3);
+ if (error)
+ return -error;
+ /* The 32-bit variant simply has some padding at the end */
+ if (copy_to_user(arg32, &fsgeo, sizeof(struct compat_xfs_fsop_geom_v1)))
+ return -XFS_ERROR(EFAULT);
+ return 0;
}
-typedef struct compat_xfs_inogrp {
- __u64 xi_startino; /* starting inode number */
- __s32 xi_alloccount; /* # bits set in allocmask */
- __u64 xi_allocmask; /* mask of allocated inodes */
-} __attribute__((packed)) compat_xfs_inogrp_t;
-
-STATIC int xfs_inumbers_fmt_compat(
- void __user *ubuffer,
- const xfs_inogrp_t *buffer,
- long count,
- long *written)
+STATIC int
+xfs_compat_growfs_data_copyin(
+ struct xfs_growfs_data *in,
+ compat_xfs_growfs_data_t __user *arg32)
{
- compat_xfs_inogrp_t __user *p32 = ubuffer;
- long i;
+ if (get_user(in->newblocks, &arg32->newblocks) ||
+ get_user(in->imaxpct, &arg32->imaxpct))
+ return -XFS_ERROR(EFAULT);
+ return 0;
+}
+
+STATIC int
+xfs_compat_growfs_rt_copyin(
+ struct xfs_growfs_rt *in,
+ compat_xfs_growfs_rt_t __user *arg32)
+{
+ if (get_user(in->newblocks, &arg32->newblocks) ||
+ get_user(in->extsize, &arg32->extsize))
+ return -XFS_ERROR(EFAULT);
+ return 0;
+}
+
+STATIC int
+xfs_inumbers_fmt_compat(
+ void __user *ubuffer,
+ const xfs_inogrp_t *buffer,
+ long count,
+ long *written)
+{
+ compat_xfs_inogrp_t __user *p32 = ubuffer;
+ long i;
for (i = 0; i < count; i++) {
if (put_user(buffer[i].xi_startino, &p32[i].xi_startino) ||
put_user(buffer[i].xi_alloccount, &p32[i].xi_alloccount) ||
put_user(buffer[i].xi_allocmask, &p32[i].xi_allocmask))
- return -EFAULT;
+ return -XFS_ERROR(EFAULT);
}
*written = count * sizeof(*p32);
return 0;
}
#else
-
#define xfs_inumbers_fmt_compat xfs_inumbers_fmt
-#define _PACKED
+#endif /* BROKEN_X86_ALIGNMENT */
-#endif
+STATIC int
+xfs_ioctl32_bstime_copyin(
+ xfs_bstime_t *bstime,
+ compat_xfs_bstime_t __user *bstime32)
+{
+ compat_time_t sec32; /* tv_sec differs on 64 vs. 32 */
-/* XFS_IOC_FSBULKSTAT and friends */
+ if (get_user(sec32, &bstime32->tv_sec) ||
+ get_user(bstime->tv_nsec, &bstime32->tv_nsec))
+ return -XFS_ERROR(EFAULT);
+ bstime->tv_sec = sec32;
+ return 0;
+}
+
+/*
+ * xfs_bstat_t has differing alignment on intel, & bstime_t sizes everywhere.
+ *
+ * Copy a 32-bit bulkstat record from userspace (@bstat32) into the native
+ * structure @bstat, one field at a time.  The three timestamps go through
+ * xfs_ioctl32_bstime_copyin(); bs_pad is not copied.
+ *
+ * Returns 0 on success or -XFS_ERROR(EFAULT) if any user-space read fails.
+ */
+STATIC int
+xfs_ioctl32_bstat_copyin(
+	xfs_bstat_t		*bstat,
+	compat_xfs_bstat_t	__user *bstat32)
+{
+	if (get_user(bstat->bs_ino,	&bstat32->bs_ino)	||
+	    get_user(bstat->bs_mode,	&bstat32->bs_mode)	||
+	    get_user(bstat->bs_nlink,	&bstat32->bs_nlink)	||
+	    get_user(bstat->bs_uid,	&bstat32->bs_uid)	||
+	    get_user(bstat->bs_gid,	&bstat32->bs_gid)	||
+	    get_user(bstat->bs_rdev,	&bstat32->bs_rdev)	||
+	    get_user(bstat->bs_blksize,	&bstat32->bs_blksize)	||
+	    get_user(bstat->bs_size,	&bstat32->bs_size)	||
+	    xfs_ioctl32_bstime_copyin(&bstat->bs_atime, &bstat32->bs_atime) ||
+	    xfs_ioctl32_bstime_copyin(&bstat->bs_mtime, &bstat32->bs_mtime) ||
+	    xfs_ioctl32_bstime_copyin(&bstat->bs_ctime, &bstat32->bs_ctime) ||
+	    /* each field must be read from its own source member */
+	    get_user(bstat->bs_blocks,	&bstat32->bs_blocks)	||
+	    get_user(bstat->bs_xflags,	&bstat32->bs_xflags)	||
+	    get_user(bstat->bs_extsize,	&bstat32->bs_extsize)	||
+	    get_user(bstat->bs_extents,	&bstat32->bs_extents)	||
+	    get_user(bstat->bs_gen,	&bstat32->bs_gen)	||
+	    get_user(bstat->bs_projid,	&bstat32->bs_projid)	||
+	    get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask)	||
+	    get_user(bstat->bs_dmstate,	&bstat32->bs_dmstate)	||
+	    get_user(bstat->bs_aextents, &bstat32->bs_aextents))
+		return -XFS_ERROR(EFAULT);
+	return 0;
+}
-typedef struct compat_xfs_bstime {
- __s32 tv_sec; /* seconds */
- __s32 tv_nsec; /* and nanoseconds */
-} compat_xfs_bstime_t;
+/* XFS_IOC_FSBULKSTAT and friends */
-STATIC int xfs_bstime_store_compat(
- compat_xfs_bstime_t __user *p32,
- const xfs_bstime_t *p)
+STATIC int
+xfs_bstime_store_compat(
+ compat_xfs_bstime_t __user *p32,
+ const xfs_bstime_t *p)
{
- __s32 sec32;
+ __s32 sec32;
sec32 = p->tv_sec;
if (put_user(sec32, &p32->tv_sec) ||
put_user(p->tv_nsec, &p32->tv_nsec))
- return -EFAULT;
+ return -XFS_ERROR(EFAULT);
return 0;
}
-typedef struct compat_xfs_bstat {
- __u64 bs_ino; /* inode number */
- __u16 bs_mode; /* type and mode */
- __u16 bs_nlink; /* number of links */
- __u32 bs_uid; /* user id */
- __u32 bs_gid; /* group id */
- __u32 bs_rdev; /* device value */
- __s32 bs_blksize; /* block size */
- __s64 bs_size; /* file size */
- compat_xfs_bstime_t bs_atime; /* access time */
- compat_xfs_bstime_t bs_mtime; /* modify time */
- compat_xfs_bstime_t bs_ctime; /* inode change time */
- int64_t bs_blocks; /* number of blocks */
- __u32 bs_xflags; /* extended flags */
- __s32 bs_extsize; /* extent size */
- __s32 bs_extents; /* number of extents */
- __u32 bs_gen; /* generation count */
- __u16 bs_projid; /* project id */
- unsigned char bs_pad[14]; /* pad space, unused */
- __u32 bs_dmevmask; /* DMIG event mask */
- __u16 bs_dmstate; /* DMIG state info */
- __u16 bs_aextents; /* attribute number of extents */
-} _PACKED compat_xfs_bstat_t;
-
-STATIC int xfs_bulkstat_one_fmt_compat(
+/* Return 0 on success or positive error (to xfs_bulkstat()) */
+STATIC int
+xfs_bulkstat_one_fmt_compat(
void __user *ubuffer,
+ int ubsize,
+ int *ubused,
const xfs_bstat_t *buffer)
{
- compat_xfs_bstat_t __user *p32 = ubuffer;
-
- if (put_user(buffer->bs_ino, &p32->bs_ino) ||
- put_user(buffer->bs_mode, &p32->bs_mode) ||
- put_user(buffer->bs_nlink, &p32->bs_nlink) ||
- put_user(buffer->bs_uid, &p32->bs_uid) ||
- put_user(buffer->bs_gid, &p32->bs_gid) ||
- put_user(buffer->bs_rdev, &p32->bs_rdev) ||
- put_user(buffer->bs_blksize, &p32->bs_blksize) ||
- put_user(buffer->bs_size, &p32->bs_size) ||
+ compat_xfs_bstat_t __user *p32 = ubuffer;
+
+ if (ubsize < sizeof(*p32))
+ return XFS_ERROR(ENOMEM);
+
+ if (put_user(buffer->bs_ino, &p32->bs_ino) ||
+ put_user(buffer->bs_mode, &p32->bs_mode) ||
+ put_user(buffer->bs_nlink, &p32->bs_nlink) ||
+ put_user(buffer->bs_uid, &p32->bs_uid) ||
+ put_user(buffer->bs_gid, &p32->bs_gid) ||
+ put_user(buffer->bs_rdev, &p32->bs_rdev) ||
+ put_user(buffer->bs_blksize, &p32->bs_blksize) ||
+ put_user(buffer->bs_size, &p32->bs_size) ||
xfs_bstime_store_compat(&p32->bs_atime, &buffer->bs_atime) ||
xfs_bstime_store_compat(&p32->bs_mtime, &buffer->bs_mtime) ||
xfs_bstime_store_compat(&p32->bs_ctime, &buffer->bs_ctime) ||
- put_user(buffer->bs_blocks, &p32->bs_blocks) ||
- put_user(buffer->bs_xflags, &p32->bs_xflags) ||
- put_user(buffer->bs_extsize, &p32->bs_extsize) ||
- put_user(buffer->bs_extents, &p32->bs_extents) ||
- put_user(buffer->bs_gen, &p32->bs_gen) ||
- put_user(buffer->bs_projid, &p32->bs_projid) ||
- put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) ||
- put_user(buffer->bs_dmstate, &p32->bs_dmstate) ||
+ put_user(buffer->bs_blocks, &p32->bs_blocks) ||
+ put_user(buffer->bs_xflags, &p32->bs_xflags) ||
+ put_user(buffer->bs_extsize, &p32->bs_extsize) ||
+ put_user(buffer->bs_extents, &p32->bs_extents) ||
+ put_user(buffer->bs_gen, &p32->bs_gen) ||
+ put_user(buffer->bs_projid, &p32->bs_projid) ||
+ put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) ||
+ put_user(buffer->bs_dmstate, &p32->bs_dmstate) ||
put_user(buffer->bs_aextents, &p32->bs_aextents))
- return -EFAULT;
- return sizeof(*p32);
+ return XFS_ERROR(EFAULT);
+ if (ubused)
+ *ubused = sizeof(*p32);
+ return 0;
}
-
-
-typedef struct compat_xfs_fsop_bulkreq {
- compat_uptr_t lastip; /* last inode # pointer */
- __s32 icount; /* count of entries in buffer */
- compat_uptr_t ubuffer; /* user buffer for inode desc. */
- compat_uptr_t ocount; /* output count pointer */
-} compat_xfs_fsop_bulkreq_t;
-
-#define XFS_IOC_FSBULKSTAT_32 \
- _IOWR('X', 101, struct compat_xfs_fsop_bulkreq)
-#define XFS_IOC_FSBULKSTAT_SINGLE_32 \
- _IOWR('X', 102, struct compat_xfs_fsop_bulkreq)
-#define XFS_IOC_FSINUMBERS_32 \
- _IOWR('X', 103, struct compat_xfs_fsop_bulkreq)
+/*
+ * Thin wrapper around xfs_bulkstat_one_int() that supplies
+ * xfs_bulkstat_one_fmt_compat for the 32-bit record layout.
+ * @private_data is accepted but not forwarded.
+ */
+STATIC int
+xfs_bulkstat_one_compat(
+	xfs_mount_t	*mp,		/* mount point for filesystem */
+	xfs_ino_t	ino,		/* inode number to get data for */
+	void		__user *buffer,	/* buffer to place output in */
+	int		ubsize,		/* size of buffer */
+	void		*private_data,	/* my private data */
+	xfs_daddr_t	bno,		/* starting bno of inode cluster */
+	int		*ubused,	/* bytes used by me */
+	void		*dibuff,	/* on-disk inode buffer */
+	int		*stat)		/* BULKSTAT_RV_... */
+{
+	int		error;
+
+	error = xfs_bulkstat_one_int(mp, ino, buffer, ubsize,
+				     xfs_bulkstat_one_fmt_compat, bno,
+				     ubused, dibuff, stat);
+	return error;
+}
/* copied from xfs_ioctl.c */
STATIC int
-xfs_ioc_bulkstat_compat(
- xfs_mount_t *mp,
- unsigned int cmd,
- void __user *arg)
+xfs_compat_ioc_bulkstat(
+ xfs_mount_t *mp,
+ unsigned int cmd,
+ compat_xfs_fsop_bulkreq_t __user *p32)
{
- compat_xfs_fsop_bulkreq_t __user *p32 = (void __user *)arg;
u32 addr;
xfs_fsop_bulkreq_t bulkreq;
int count; /* # of records returned */
@@ -270,20 +263,20 @@ xfs_ioc_bulkstat_compat(
/* should be called again (unused here, but used in dmapi) */
if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
+ return -XFS_ERROR(EPERM);
if (XFS_FORCED_SHUTDOWN(mp))
return -XFS_ERROR(EIO);
if (get_user(addr, &p32->lastip))
- return -EFAULT;
+ return -XFS_ERROR(EFAULT);
bulkreq.lastip = compat_ptr(addr);
if (get_user(bulkreq.icount, &p32->icount) ||
get_user(addr, &p32->ubuffer))
- return -EFAULT;
+ return -XFS_ERROR(EFAULT);
bulkreq.ubuffer = compat_ptr(addr);
if (get_user(addr, &p32->ocount))
- return -EFAULT;
+ return -XFS_ERROR(EFAULT);
bulkreq.ocount = compat_ptr(addr);
if (copy_from_user(&inlast, bulkreq.lastip, sizeof(__s64)))
@@ -295,17 +288,22 @@ xfs_ioc_bulkstat_compat(
if (bulkreq.ubuffer == NULL)
return -XFS_ERROR(EINVAL);
- if (cmd == XFS_IOC_FSINUMBERS)
+ if (cmd == XFS_IOC_FSINUMBERS_32) {
error = xfs_inumbers(mp, &inlast, &count,
bulkreq.ubuffer, xfs_inumbers_fmt_compat);
- else {
- /* declare a var to get a warning in case the type changes */
- bulkstat_one_fmt_pf formatter = xfs_bulkstat_one_fmt_compat;
+ } else if (cmd == XFS_IOC_FSBULKSTAT_SINGLE_32) {
+ int res;
+
+ error = xfs_bulkstat_one_compat(mp, inlast, bulkreq.ubuffer,
+ sizeof(compat_xfs_bstat_t),
+ NULL, 0, NULL, NULL, &res);
+ } else if (cmd == XFS_IOC_FSBULKSTAT_32) {
error = xfs_bulkstat(mp, &inlast, &count,
- xfs_bulkstat_one, formatter,
+ xfs_bulkstat_one_compat, NULL,
sizeof(compat_xfs_bstat_t), bulkreq.ubuffer,
BULKSTAT_FG_QUICK, &done);
- }
+ } else
+ error = XFS_ERROR(EINVAL);
if (error)
return -error;
@@ -321,63 +319,306 @@ xfs_ioc_bulkstat_compat(
return 0;
}
+/*
+ * Read a 32-bit handle request from @arg32 and widen it into the native
+ * @hreq: integer members are assigned directly, user pointers are
+ * converted with compat_ptr().
+ *
+ * Returns 0, or -XFS_ERROR(EFAULT) if the request cannot be copied in.
+ */
+STATIC int
+xfs_compat_handlereq_copyin(
+	xfs_fsop_handlereq_t		*hreq,
+	compat_xfs_fsop_handlereq_t	__user *arg32)
+{
+	compat_xfs_fsop_handlereq_t	req32;
+
+	if (copy_from_user(&req32, arg32, sizeof(req32)))
+		return -XFS_ERROR(EFAULT);
+
+	/* plain integer members */
+	hreq->fd = req32.fd;
+	hreq->oflags = req32.oflags;
+	hreq->ihandlen = req32.ihandlen;
+
+	/* 32-bit user pointers */
+	hreq->path = compat_ptr(req32.path);
+	hreq->ihandle = compat_ptr(req32.ihandle);
+	hreq->ohandle = compat_ptr(req32.ohandle);
+	hreq->ohandlen = compat_ptr(req32.ohandlen);
+	return 0;
+}
-typedef struct compat_xfs_fsop_handlereq {
- __u32 fd; /* fd for FD_TO_HANDLE */
- compat_uptr_t path; /* user pathname */
- __u32 oflags; /* open flags */
- compat_uptr_t ihandle; /* user supplied handle */
- __u32 ihandlen; /* user supplied length */
- compat_uptr_t ohandle; /* user buffer for handle */
- compat_uptr_t ohandlen; /* user buffer length */
-} compat_xfs_fsop_handlereq_t;
-
-#define XFS_IOC_PATH_TO_FSHANDLE_32 \
- _IOWR('X', 104, struct compat_xfs_fsop_handlereq)
-#define XFS_IOC_PATH_TO_HANDLE_32 \
- _IOWR('X', 105, struct compat_xfs_fsop_handlereq)
-#define XFS_IOC_FD_TO_HANDLE_32 \
- _IOWR('X', 106, struct compat_xfs_fsop_handlereq)
-#define XFS_IOC_OPEN_BY_HANDLE_32 \
- _IOWR('X', 107, struct compat_xfs_fsop_handlereq)
-#define XFS_IOC_READLINK_BY_HANDLE_32 \
- _IOWR('X', 108, struct compat_xfs_fsop_handlereq)
-
-STATIC unsigned long xfs_ioctl32_fshandle(unsigned long arg)
+/*
+ * Convert userspace handle data into inode.
+ *
+ * We use the fact that all the fsop_handlereq ioctl calls have a data
+ * structure argument whose first component is always a xfs_fsop_handlereq_t,
+ * so we can pass that sub structure into this handy, shared routine.
+ *
+ * If no error, caller must always iput the returned inode.
+ */
+STATIC int
+xfs_vget_fsop_handlereq_compat(
+ xfs_mount_t *mp,
+ struct inode *parinode, /* parent inode pointer */
+ compat_xfs_fsop_handlereq_t *hreq,
+ struct inode **inode)
{
- compat_xfs_fsop_handlereq_t __user *p32 = (void __user *)arg;
- xfs_fsop_handlereq_t __user *p = compat_alloc_user_space(sizeof(*p));
- u32 addr;
-
- if (copy_in_user(&p->fd, &p32->fd, sizeof(__u32)) ||
- get_user(addr, &p32->path) ||
- put_user(compat_ptr(addr), &p->path) ||
- copy_in_user(&p->oflags, &p32->oflags, sizeof(__u32)) ||
- get_user(addr, &p32->ihandle) ||
- put_user(compat_ptr(addr), &p->ihandle) ||
- copy_in_user(&p->ihandlen, &p32->ihandlen, sizeof(__u32)) ||
- get_user(addr, &p32->ohandle) ||
- put_user(compat_ptr(addr), &p->ohandle) ||
- get_user(addr, &p32->ohandlen) ||
- put_user(compat_ptr(addr), &p->ohandlen))
- return -EFAULT;
-
- return (unsigned long)p;
+ void __user *hanp;
+ size_t hlen;
+ xfs_fid_t *xfid;
+ xfs_handle_t *handlep;
+ xfs_handle_t handle;
+ xfs_inode_t *ip;
+ xfs_ino_t ino;
+ __u32 igen;
+ int error;
+
+ /*
+ * Only allow handle opens under a directory.
+ */
+ if (!S_ISDIR(parinode->i_mode))
+ return XFS_ERROR(ENOTDIR);
+
+ hanp = compat_ptr(hreq->ihandle);
+ hlen = hreq->ihandlen;
+ handlep = &handle;
+
+ if (hlen < sizeof(handlep->ha_fsid) || hlen > sizeof(*handlep))
+ return XFS_ERROR(EINVAL);
+ if (copy_from_user(handlep, hanp, hlen))
+ return XFS_ERROR(EFAULT);
+ if (hlen < sizeof(*handlep))
+ memset(((char *)handlep) + hlen, 0, sizeof(*handlep) - hlen);
+ if (hlen > sizeof(handlep->ha_fsid)) {
+ if (handlep->ha_fid.fid_len !=
+ (hlen - sizeof(handlep->ha_fsid) -
+ sizeof(handlep->ha_fid.fid_len)) ||
+ handlep->ha_fid.fid_pad)
+ return XFS_ERROR(EINVAL);
+ }
+
+ /*
+ * Crack the handle, obtain the inode # & generation #
+ */
+ xfid = (struct xfs_fid *)&handlep->ha_fid;
+ if (xfid->fid_len == sizeof(*xfid) - sizeof(xfid->fid_len)) {
+ ino = xfid->fid_ino;
+ igen = xfid->fid_gen;
+ } else {
+ return XFS_ERROR(EINVAL);
+ }
+
+ /*
+ * Get the XFS inode, building a Linux inode to go with it.
+ */
+ error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip, 0);
+ if (error)
+ return error;
+ if (ip == NULL)
+ return XFS_ERROR(EIO);
+ if (ip->i_d.di_gen != igen) {
+ xfs_iput_new(ip, XFS_ILOCK_SHARED);
+ return XFS_ERROR(ENOENT);
+ }
+
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+ *inode = VFS_I(ip);
+ return 0;
}
+/*
+ * Compat attrlist-by-handle: copy in the 32-bit request from @arg, look up
+ * the inode named by the embedded handle, list its attributes into a
+ * kernel buffer and copy that buffer out to the user's 32-bit pointer.
+ *
+ * Requires CAP_SYS_ADMIN.  buflen is capped at XATTR_LIST_MAX and only the
+ * ATTR_ROOT/ATTR_SECURE namespace flags are accepted.
+ *
+ * 'error' holds a positive errno throughout and is negated once on return,
+ * so the function returns 0 or a negative errno.
+ */
+STATIC int
+xfs_compat_attrlist_by_handle(
+	xfs_mount_t		*mp,
+	void			__user *arg,
+	struct inode		*parinode)
+{
+	int			error;
+	attrlist_cursor_kern_t	*cursor;
+	compat_xfs_fsop_attrlist_handlereq_t al_hreq;
+	struct inode		*inode;
+	char			*kbuf;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -XFS_ERROR(EPERM);
+	if (copy_from_user(&al_hreq, arg,
+			   sizeof(compat_xfs_fsop_attrlist_handlereq_t)))
+		return -XFS_ERROR(EFAULT);
+	if (al_hreq.buflen > XATTR_LIST_MAX)
+		return -XFS_ERROR(EINVAL);
+
+	/*
+	 * Reject flags, only allow namespaces.
+	 */
+	if (al_hreq.flags & ~(ATTR_ROOT | ATTR_SECURE))
+		return -XFS_ERROR(EINVAL);
+
+	error = xfs_vget_fsop_handlereq_compat(mp, parinode, &al_hreq.hreq,
+					       &inode);
+	if (error)
+		goto out;
+
+	kbuf = kmalloc(al_hreq.buflen, GFP_KERNEL);
+	if (!kbuf) {
+		/* error is still 0 here; do not report success on OOM */
+		error = XFS_ERROR(ENOMEM);
+		goto out_vn_rele;
+	}
+
+	cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
+	error = xfs_attr_list(XFS_I(inode), kbuf, al_hreq.buflen,
+			      al_hreq.flags, cursor);
+	if (error)
+		goto out_kfree;
+
+	/* keep the errno positive: it is negated at the single return */
+	if (copy_to_user(compat_ptr(al_hreq.buffer), kbuf, al_hreq.buflen))
+		error = XFS_ERROR(EFAULT);
+
+ out_kfree:
+	kfree(kbuf);
+ out_vn_rele:
+	iput(inode);
+ out:
+	return -error;
+}
-STATIC long
-xfs_compat_ioctl(
- int mode,
- struct file *file,
- unsigned cmd,
- unsigned long arg)
+/*
+ * Compat attrmulti-by-handle: copy in the 32-bit request from @arg, look up
+ * the inode named by the embedded handle, then run each get/set/remove
+ * operation in the user-supplied array.  The per-operation result is stored
+ * in ops[i].am_error and the whole array is copied back to userspace.
+ *
+ * Requires CAP_SYS_ADMIN.  'error' holds a positive errno throughout and is
+ * negated once on return, so the function returns 0 or a negative errno.
+ */
+STATIC int
+xfs_compat_attrmulti_by_handle(
+	xfs_mount_t		*mp,
+	void			__user *arg,
+	struct inode		*parinode)
+{
+	int			error;
+	compat_xfs_attr_multiop_t *ops;
+	compat_xfs_fsop_attrmulti_handlereq_t am_hreq;
+	struct inode		*inode;
+	unsigned int		i, size;
+	char			*attr_name;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -XFS_ERROR(EPERM);
+	if (copy_from_user(&am_hreq, arg,
+			   sizeof(compat_xfs_fsop_attrmulti_handlereq_t)))
+		return -XFS_ERROR(EFAULT);
+
+	error = xfs_vget_fsop_handlereq_compat(mp, parinode, &am_hreq.hreq,
+					       &inode);
+	if (error)
+		goto out;
+
+	/*
+	 * Bound opcount before multiplying: the product is stored in an
+	 * unsigned int, so a huge opcount could otherwise wrap to a small
+	 * size, pass the limit check, and let the loop below run past the
+	 * end of ops[].
+	 */
+	error = E2BIG;
+	if (am_hreq.opcount >
+	    (16 * PAGE_SIZE) / sizeof(compat_xfs_attr_multiop_t))
+		goto out_vn_rele;
+	size = am_hreq.opcount * sizeof(compat_xfs_attr_multiop_t);
+	if (!size)
+		goto out_vn_rele;
+
+	error = ENOMEM;
+	ops = kmalloc(size, GFP_KERNEL);
+	if (!ops)
+		goto out_vn_rele;
+
+	error = EFAULT;
+	if (copy_from_user(ops, compat_ptr(am_hreq.ops), size))
+		goto out_kfree_ops;
+
+	/* an allocation failure is ENOMEM, not the EFAULT set above */
+	error = ENOMEM;
+	attr_name = kmalloc(MAXNAMELEN, GFP_KERNEL);
+	if (!attr_name)
+		goto out_kfree_ops;
+
+	error = 0;
+	for (i = 0; i < am_hreq.opcount; i++) {
+		ops[i].am_error = strncpy_from_user(attr_name,
+					compat_ptr(ops[i].am_attrname),
+					MAXNAMELEN);
+		/* positive errno: it is negated at the single return */
+		if (ops[i].am_error == 0 || ops[i].am_error == MAXNAMELEN)
+			error = ERANGE;
+		if (ops[i].am_error < 0)
+			break;
+
+		switch (ops[i].am_opcode) {
+		case ATTR_OP_GET:
+			ops[i].am_error = xfs_attrmulti_attr_get(inode,
+					attr_name,
+					compat_ptr(ops[i].am_attrvalue),
+					&ops[i].am_length, ops[i].am_flags);
+			break;
+		case ATTR_OP_SET:
+			ops[i].am_error = xfs_attrmulti_attr_set(inode,
+					attr_name,
+					compat_ptr(ops[i].am_attrvalue),
+					ops[i].am_length, ops[i].am_flags);
+			break;
+		case ATTR_OP_REMOVE:
+			ops[i].am_error = xfs_attrmulti_attr_remove(inode,
+					attr_name, ops[i].am_flags);
+			break;
+		default:
+			ops[i].am_error = EINVAL;
+		}
+	}
+
+	if (copy_to_user(compat_ptr(am_hreq.ops), ops, size))
+		error = XFS_ERROR(EFAULT);
+
+	kfree(attr_name);
+ out_kfree_ops:
+	kfree(ops);
+ out_vn_rele:
+	iput(inode);
+ out:
+	return -error;
+}
+
+/*
+ * Compat fssetdm-by-handle: copy in the 32-bit request from @arg, look up
+ * the inode named by the embedded handle, read the fsdmidata the request
+ * points at and pass its event mask and state to xfs_set_dmattrs().
+ *
+ * Requires CAP_MKNOD; immutable and append-only inodes are refused with
+ * EPERM.  Returns 0 or a negative errno.
+ */
+STATIC int
+xfs_compat_fssetdm_by_handle(
+	xfs_mount_t		*mp,
+	void			__user *arg,
+	struct inode		*parinode)
+{
+	int			error;
+	struct fsdmidata	fsd;
+	compat_xfs_fsop_setdm_handlereq_t dmhreq;
+	struct inode		*inode;
+
+	if (!capable(CAP_MKNOD))
+		return -XFS_ERROR(EPERM);
+	if (copy_from_user(&dmhreq, arg,
+			   sizeof(compat_xfs_fsop_setdm_handlereq_t)))
+		return -XFS_ERROR(EFAULT);
+
+	error = xfs_vget_fsop_handlereq_compat(mp, parinode, &dmhreq.hreq,
+					       &inode);
+	if (error)
+		return -error;
+
+	/* from here on the inode reference is dropped on every path */
+	if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
+		error = -XFS_ERROR(EPERM);
+	else if (copy_from_user(&fsd, compat_ptr(dmhreq.data), sizeof(fsd)))
+		error = -XFS_ERROR(EFAULT);
+	else
+		error = -xfs_set_dmattrs(XFS_I(inode), fsd.fsd_dmevmask,
+					 fsd.fsd_dmstate);
+
+	iput(inode);
+	return error;
+}
+
+long
+xfs_file_compat_ioctl(
+ struct file *filp,
+ unsigned cmd,
+ unsigned long p)
{
- struct inode *inode = file->f_path.dentry->d_inode;
- int error;
+ struct inode *inode = filp->f_path.dentry->d_inode;
+ struct xfs_inode *ip = XFS_I(inode);
+ struct xfs_mount *mp = ip->i_mount;
+ void __user *arg = (void __user *)p;
+ int ioflags = 0;
+ int error;
+
+ if (filp->f_mode & FMODE_NOCMTIME)
+ ioflags |= IO_INVIS;
+
+ xfs_itrace_entry(ip);
switch (cmd) {
+ /* No size or alignment issues on any arch */
case XFS_IOC_DIOINFO:
case XFS_IOC_FSGEOMETRY:
case XFS_IOC_FSGETXATTR:
@@ -387,48 +628,18 @@ xfs_compat_ioctl(
case XFS_IOC_GETBMAP:
case XFS_IOC_GETBMAPA:
case XFS_IOC_GETBMAPX:
-/* not handled
- case XFS_IOC_FSSETDM_BY_HANDLE:
- case XFS_IOC_ATTRLIST_BY_HANDLE:
- case XFS_IOC_ATTRMULTI_BY_HANDLE:
-*/
case XFS_IOC_FSCOUNTS:
case XFS_IOC_SET_RESBLKS:
case XFS_IOC_GET_RESBLKS:
- case XFS_IOC_FSGROWFSDATA:
case XFS_IOC_FSGROWFSLOG:
- case XFS_IOC_FSGROWFSRT:
case XFS_IOC_FREEZE:
case XFS_IOC_THAW:
case XFS_IOC_GOINGDOWN:
case XFS_IOC_ERROR_INJECTION:
case XFS_IOC_ERROR_CLEARALL:
- break;
-
- case XFS_IOC32_GETXFLAGS:
- case XFS_IOC32_SETXFLAGS:
- case XFS_IOC32_GETVERSION:
- cmd = _NATIVE_IOC(cmd, long);
- break;
-#ifdef BROKEN_X86_ALIGNMENT
- /* xfs_flock_t has wrong u32 vs u64 alignment */
- case XFS_IOC_ALLOCSP_32:
- case XFS_IOC_FREESP_32:
- case XFS_IOC_ALLOCSP64_32:
- case XFS_IOC_FREESP64_32:
- case XFS_IOC_RESVSP_32:
- case XFS_IOC_UNRESVSP_32:
- case XFS_IOC_RESVSP64_32:
- case XFS_IOC_UNRESVSP64_32:
- arg = xfs_ioctl32_flock(arg);
- cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
- break;
- case XFS_IOC_FSGEOMETRY_V1_32:
- arg = xfs_ioctl32_geom_v1(arg);
- cmd = _NATIVE_IOC(cmd, struct xfs_fsop_geom_v1);
- break;
-
-#else /* These are handled fine if no alignment issues */
+ return xfs_file_ioctl(filp, cmd, p);
+#ifndef BROKEN_X86_ALIGNMENT
+ /* These are handled fine if no alignment issues */
case XFS_IOC_ALLOCSP:
case XFS_IOC_FREESP:
case XFS_IOC_RESVSP:
@@ -438,51 +649,97 @@ xfs_compat_ioctl(
case XFS_IOC_RESVSP64:
case XFS_IOC_UNRESVSP64:
case XFS_IOC_FSGEOMETRY_V1:
- break;
+ case XFS_IOC_FSGROWFSDATA:
+ case XFS_IOC_FSGROWFSRT:
+ return xfs_file_ioctl(filp, cmd, p);
+#else
+ case XFS_IOC_ALLOCSP_32:
+ case XFS_IOC_FREESP_32:
+ case XFS_IOC_ALLOCSP64_32:
+ case XFS_IOC_FREESP64_32:
+ case XFS_IOC_RESVSP_32:
+ case XFS_IOC_UNRESVSP_32:
+ case XFS_IOC_RESVSP64_32:
+ case XFS_IOC_UNRESVSP64_32: {
+ struct xfs_flock64 bf;
- /* xfs_bstat_t still has wrong u32 vs u64 alignment */
- case XFS_IOC_SWAPEXT:
- break;
+ if (xfs_compat_flock64_copyin(&bf, arg))
+ return -XFS_ERROR(EFAULT);
+ cmd = _NATIVE_IOC(cmd, struct xfs_flock64);
+ return xfs_ioc_space(ip, inode, filp, ioflags, cmd, &bf);
+ }
+ case XFS_IOC_FSGEOMETRY_V1_32:
+ return xfs_compat_ioc_fsgeometry_v1(mp, arg);
+ case XFS_IOC_FSGROWFSDATA_32: {
+ struct xfs_growfs_data in;
+
+ if (xfs_compat_growfs_data_copyin(&in, arg))
+ return -XFS_ERROR(EFAULT);
+ error = xfs_growfs_data(mp, &in);
+ return -error;
+ }
+ case XFS_IOC_FSGROWFSRT_32: {
+ struct xfs_growfs_rt in;
+ if (xfs_compat_growfs_rt_copyin(&in, arg))
+ return -XFS_ERROR(EFAULT);
+ error = xfs_growfs_rt(mp, &in);
+ return -error;
+ }
#endif
+ /* long changes size, but xfs only copies out 32 bits */
+ case XFS_IOC_GETXFLAGS_32:
+ case XFS_IOC_SETXFLAGS_32:
+ case XFS_IOC_GETVERSION_32:
+ cmd = _NATIVE_IOC(cmd, long);
+ return xfs_file_ioctl(filp, cmd, p);
+ case XFS_IOC_SWAPEXT: {
+ struct xfs_swapext sxp;
+ struct compat_xfs_swapext __user *sxu = arg;
+
+ /* Bulk copy in up to the sx_stat field, then copy bstat */
+ if (copy_from_user(&sxp, sxu,
+ offsetof(struct xfs_swapext, sx_stat)) ||
+ xfs_ioctl32_bstat_copyin(&sxp.sx_stat, &sxu->sx_stat))
+ return -XFS_ERROR(EFAULT);
+ error = xfs_swapext(&sxp);
+ return -error;
+ }
case XFS_IOC_FSBULKSTAT_32:
case XFS_IOC_FSBULKSTAT_SINGLE_32:
case XFS_IOC_FSINUMBERS_32:
- cmd = _NATIVE_IOC(cmd, struct xfs_fsop_bulkreq);
- return xfs_ioc_bulkstat_compat(XFS_I(inode)->i_mount,
- cmd, (void __user*)arg);
+ return xfs_compat_ioc_bulkstat(mp, cmd, arg);
case XFS_IOC_FD_TO_HANDLE_32:
case XFS_IOC_PATH_TO_HANDLE_32:
- case XFS_IOC_PATH_TO_FSHANDLE_32:
- case XFS_IOC_OPEN_BY_HANDLE_32:
- case XFS_IOC_READLINK_BY_HANDLE_32:
- arg = xfs_ioctl32_fshandle(arg);
+ case XFS_IOC_PATH_TO_FSHANDLE_32: {
+ struct xfs_fsop_handlereq hreq;
+
+ if (xfs_compat_handlereq_copyin(&hreq, arg))
+ return -XFS_ERROR(EFAULT);
cmd = _NATIVE_IOC(cmd, struct xfs_fsop_handlereq);
- break;
- default:
- return -ENOIOCTLCMD;
+ return xfs_find_handle(cmd, &hreq);
}
+ case XFS_IOC_OPEN_BY_HANDLE_32: {
+ struct xfs_fsop_handlereq hreq;
- error = xfs_ioctl(XFS_I(inode), file, mode, cmd, (void __user *)arg);
- xfs_iflags_set(XFS_I(inode), XFS_IMODIFIED);
-
- return error;
-}
-
-long
-xfs_file_compat_ioctl(
- struct file *file,
- unsigned cmd,
- unsigned long arg)
-{
- return xfs_compat_ioctl(0, file, cmd, arg);
-}
+ if (xfs_compat_handlereq_copyin(&hreq, arg))
+ return -XFS_ERROR(EFAULT);
+ return xfs_open_by_handle(mp, &hreq, filp, inode);
+ }
+ case XFS_IOC_READLINK_BY_HANDLE_32: {
+ struct xfs_fsop_handlereq hreq;
-long
-xfs_file_compat_invis_ioctl(
- struct file *file,
- unsigned cmd,
- unsigned long arg)
-{
- return xfs_compat_ioctl(IO_INVIS, file, cmd, arg);
+ if (xfs_compat_handlereq_copyin(&hreq, arg))
+ return -XFS_ERROR(EFAULT);
+ return xfs_readlink_by_handle(mp, &hreq, inode);
+ }
+ case XFS_IOC_ATTRLIST_BY_HANDLE_32:
+ return xfs_compat_attrlist_by_handle(mp, arg, inode);
+ case XFS_IOC_ATTRMULTI_BY_HANDLE_32:
+ return xfs_compat_attrmulti_by_handle(mp, arg, inode);
+ case XFS_IOC_FSSETDM_BY_HANDLE_32:
+ return xfs_compat_fssetdm_by_handle(mp, arg, inode);
+ default:
+ return -XFS_ERROR(ENOIOCTLCMD);
+ }
}
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/linux-2.6/xfs_ioctl32.h
index 02de6e62ee37..1024c4f8ba0d 100644
--- a/fs/xfs/linux-2.6/xfs_ioctl32.h
+++ b/fs/xfs/linux-2.6/xfs_ioctl32.h
@@ -18,7 +18,217 @@
#ifndef __XFS_IOCTL32_H__
#define __XFS_IOCTL32_H__
-extern long xfs_file_compat_ioctl(struct file *, unsigned, unsigned long);
-extern long xfs_file_compat_invis_ioctl(struct file *, unsigned, unsigned long);
+#include <linux/compat.h>
+
+/*
+ * on 32-bit arches, ioctl argument structures may have different sizes
+ * and/or alignment. We define compat structures which match the
+ * 32-bit sizes/alignments here, and their associated ioctl numbers.
+ *
+ * xfs_ioctl32.c contains routines to copy these structures in and out.
+ */
+
+/* stock kernel-level ioctls we support */
+#define XFS_IOC_GETXFLAGS_32 FS_IOC32_GETFLAGS
+#define XFS_IOC_SETXFLAGS_32 FS_IOC32_SETFLAGS
+#define XFS_IOC_GETVERSION_32 FS_IOC32_GETVERSION
+
+/*
+ * On intel, even if sizes match, alignment and/or padding may differ.
+ */
+#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
+#define BROKEN_X86_ALIGNMENT
+#define __compat_packed __attribute__((packed))
+#else
+#define __compat_packed
+#endif
+
+typedef struct compat_xfs_bstime {
+ compat_time_t tv_sec; /* seconds */
+ __s32 tv_nsec; /* and nanoseconds */
+} compat_xfs_bstime_t;
+
+typedef struct compat_xfs_bstat {
+ __u64 bs_ino; /* inode number */
+ __u16 bs_mode; /* type and mode */
+ __u16 bs_nlink; /* number of links */
+ __u32 bs_uid; /* user id */
+ __u32 bs_gid; /* group id */
+ __u32 bs_rdev; /* device value */
+ __s32 bs_blksize; /* block size */
+ __s64 bs_size; /* file size */
+ compat_xfs_bstime_t bs_atime; /* access time */
+ compat_xfs_bstime_t bs_mtime; /* modify time */
+ compat_xfs_bstime_t bs_ctime; /* inode change time */
+ int64_t bs_blocks; /* number of blocks */
+ __u32 bs_xflags; /* extended flags */
+ __s32 bs_extsize; /* extent size */
+ __s32 bs_extents; /* number of extents */
+ __u32 bs_gen; /* generation count */
+ __u16 bs_projid; /* project id */
+ unsigned char bs_pad[14]; /* pad space, unused */
+ __u32 bs_dmevmask; /* DMIG event mask */
+ __u16 bs_dmstate; /* DMIG state info */
+ __u16 bs_aextents; /* attribute number of extents */
+} __compat_packed compat_xfs_bstat_t;
+
+typedef struct compat_xfs_fsop_bulkreq {
+ compat_uptr_t lastip; /* last inode # pointer */
+ __s32 icount; /* count of entries in buffer */
+ compat_uptr_t ubuffer; /* user buffer for inode desc. */
+ compat_uptr_t ocount; /* output count pointer */
+} compat_xfs_fsop_bulkreq_t;
+
+#define XFS_IOC_FSBULKSTAT_32 \
+ _IOWR('X', 101, struct compat_xfs_fsop_bulkreq)
+#define XFS_IOC_FSBULKSTAT_SINGLE_32 \
+ _IOWR('X', 102, struct compat_xfs_fsop_bulkreq)
+#define XFS_IOC_FSINUMBERS_32 \
+ _IOWR('X', 103, struct compat_xfs_fsop_bulkreq)
+
+typedef struct compat_xfs_fsop_handlereq {
+ __u32 fd; /* fd for FD_TO_HANDLE */
+ compat_uptr_t path; /* user pathname */
+ __u32 oflags; /* open flags */
+ compat_uptr_t ihandle; /* user supplied handle */
+ __u32 ihandlen; /* user supplied length */
+ compat_uptr_t ohandle; /* user buffer for handle */
+ compat_uptr_t ohandlen; /* user buffer length */
+} compat_xfs_fsop_handlereq_t;
+
+#define XFS_IOC_PATH_TO_FSHANDLE_32 \
+ _IOWR('X', 104, struct compat_xfs_fsop_handlereq)
+#define XFS_IOC_PATH_TO_HANDLE_32 \
+ _IOWR('X', 105, struct compat_xfs_fsop_handlereq)
+#define XFS_IOC_FD_TO_HANDLE_32 \
+ _IOWR('X', 106, struct compat_xfs_fsop_handlereq)
+#define XFS_IOC_OPEN_BY_HANDLE_32 \
+ _IOWR('X', 107, struct compat_xfs_fsop_handlereq)
+#define XFS_IOC_READLINK_BY_HANDLE_32 \
+ _IOWR('X', 108, struct compat_xfs_fsop_handlereq)
+
+/* The bstat field in the swapext struct needs translation */
+typedef struct compat_xfs_swapext {
+ __int64_t sx_version; /* version */
+ __int64_t sx_fdtarget; /* fd of target file */
+ __int64_t sx_fdtmp; /* fd of tmp file */
+ xfs_off_t sx_offset; /* offset into file */
+ xfs_off_t sx_length; /* length from offset */
+ char sx_pad[16]; /* pad space, unused */
+ compat_xfs_bstat_t sx_stat; /* stat of target before copy */
+} __compat_packed compat_xfs_swapext_t;
+
+#define XFS_IOC_SWAPEXT_32 _IOWR('X', 109, struct compat_xfs_swapext)
+
+typedef struct compat_xfs_fsop_attrlist_handlereq {
+ struct compat_xfs_fsop_handlereq hreq; /* handle interface structure */
+ struct xfs_attrlist_cursor pos; /* opaque cookie, list offset */
+ __u32 flags; /* which namespace to use */
+ __u32 buflen; /* length of buffer supplied */
+ compat_uptr_t buffer; /* returned names */
+} __compat_packed compat_xfs_fsop_attrlist_handlereq_t;
+
+/* Note: actually this is read/write */
+#define XFS_IOC_ATTRLIST_BY_HANDLE_32 \
+ _IOW('X', 122, struct compat_xfs_fsop_attrlist_handlereq)
+
+/* am_opcodes defined in xfs_fs.h */
+typedef struct compat_xfs_attr_multiop {
+ __u32 am_opcode;
+ __s32 am_error;
+ compat_uptr_t am_attrname;
+ compat_uptr_t am_attrvalue;
+ __u32 am_length;
+ __u32 am_flags;
+} compat_xfs_attr_multiop_t;
+
+typedef struct compat_xfs_fsop_attrmulti_handlereq {
+ struct compat_xfs_fsop_handlereq hreq; /* handle interface structure */
+ __u32 opcount;/* count of following multiop */
+ /* ptr to compat_xfs_attr_multiop */
+ compat_uptr_t ops; /* attr_multi data */
+} compat_xfs_fsop_attrmulti_handlereq_t;
+
+#define XFS_IOC_ATTRMULTI_BY_HANDLE_32 \
+ _IOW('X', 123, struct compat_xfs_fsop_attrmulti_handlereq)
+
+typedef struct compat_xfs_fsop_setdm_handlereq {
+ struct compat_xfs_fsop_handlereq hreq; /* handle information */
+ /* ptr to struct fsdmidata */
+ compat_uptr_t data; /* DMAPI data */
+} compat_xfs_fsop_setdm_handlereq_t;
+
+#define XFS_IOC_FSSETDM_BY_HANDLE_32 \
+ _IOW('X', 121, struct compat_xfs_fsop_setdm_handlereq)
+
+#ifdef BROKEN_X86_ALIGNMENT
+/* on ia32 l_start is on a 32-bit boundary */
+typedef struct compat_xfs_flock64 {
+ __s16 l_type;
+ __s16 l_whence;
+ __s64 l_start __attribute__((packed));
+ /* len == 0 means until end of file */
+ __s64 l_len __attribute__((packed));
+ __s32 l_sysid;
+ __u32 l_pid;
+ __s32 l_pad[4]; /* reserve area */
+} compat_xfs_flock64_t;
+
+#define XFS_IOC_ALLOCSP_32 _IOW('X', 10, struct compat_xfs_flock64)
+#define XFS_IOC_FREESP_32 _IOW('X', 11, struct compat_xfs_flock64)
+#define XFS_IOC_ALLOCSP64_32 _IOW('X', 36, struct compat_xfs_flock64)
+#define XFS_IOC_FREESP64_32 _IOW('X', 37, struct compat_xfs_flock64)
+#define XFS_IOC_RESVSP_32 _IOW('X', 40, struct compat_xfs_flock64)
+#define XFS_IOC_UNRESVSP_32 _IOW('X', 41, struct compat_xfs_flock64)
+#define XFS_IOC_RESVSP64_32 _IOW('X', 42, struct compat_xfs_flock64)
+#define XFS_IOC_UNRESVSP64_32 _IOW('X', 43, struct compat_xfs_flock64)
+
+typedef struct compat_xfs_fsop_geom_v1 {
+ __u32 blocksize; /* filesystem (data) block size */
+ __u32 rtextsize; /* realtime extent size */
+ __u32 agblocks; /* fsblocks in an AG */
+ __u32 agcount; /* number of allocation groups */
+ __u32 logblocks; /* fsblocks in the log */
+ __u32 sectsize; /* (data) sector size, bytes */
+ __u32 inodesize; /* inode size in bytes */
+ __u32 imaxpct; /* max allowed inode space(%) */
+ __u64 datablocks; /* fsblocks in data subvolume */
+ __u64 rtblocks; /* fsblocks in realtime subvol */
+ __u64 rtextents; /* rt extents in realtime subvol*/
+ __u64 logstart; /* starting fsblock of the log */
+ unsigned char uuid[16]; /* unique id of the filesystem */
+ __u32 sunit; /* stripe unit, fsblocks */
+ __u32 swidth; /* stripe width, fsblocks */
+ __s32 version; /* structure version */
+ __u32 flags; /* superblock version flags */
+ __u32 logsectsize; /* log sector size, bytes */
+ __u32 rtsectsize; /* realtime sector size, bytes */
+ __u32 dirblocksize; /* directory block size, bytes */
+} __attribute__((packed)) compat_xfs_fsop_geom_v1_t;
+
+#define XFS_IOC_FSGEOMETRY_V1_32 \
+ _IOR('X', 100, struct compat_xfs_fsop_geom_v1)
+
+typedef struct compat_xfs_inogrp {
+ __u64 xi_startino; /* starting inode number */
+ __s32 xi_alloccount; /* # bits set in allocmask */
+ __u64 xi_allocmask; /* mask of allocated inodes */
+} __attribute__((packed)) compat_xfs_inogrp_t;
+
+/* These growfs input structures have padding on the end, so must translate */
+typedef struct compat_xfs_growfs_data {
+ __u64 newblocks; /* new data subvol size, fsblocks */
+ __u32 imaxpct; /* new inode space percentage limit */
+} __attribute__((packed)) compat_xfs_growfs_data_t;
+
+typedef struct compat_xfs_growfs_rt {
+ __u64 newblocks; /* new realtime size, fsblocks */
+ __u32 extsize; /* new realtime extent size, fsblocks */
+} __attribute__((packed)) compat_xfs_growfs_rt_t;
+
+#define XFS_IOC_FSGROWFSDATA_32 _IOW('X', 110, struct compat_xfs_growfs_data)
+#define XFS_IOC_FSGROWFSRT_32 _IOW('X', 112, struct compat_xfs_growfs_rt)
+
+#endif /* BROKEN_X86_ALIGNMENT */
#endif /* __XFS_IOCTL32_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 095d271f3434..7aa53fefc67f 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -53,6 +53,7 @@
#include <linux/namei.h>
#include <linux/security.h>
#include <linux/falloc.h>
+#include <linux/fiemap.h>
/*
* Bring the atime in the XFS inode uptodate.
@@ -64,14 +65,14 @@ xfs_synchronize_atime(
{
struct inode *inode = VFS_I(ip);
- if (inode) {
+ if (!(inode->i_state & I_CLEAR)) {
ip->i_d.di_atime.t_sec = (__int32_t)inode->i_atime.tv_sec;
ip->i_d.di_atime.t_nsec = (__int32_t)inode->i_atime.tv_nsec;
}
}
/*
- * If the linux inode exists, mark it dirty.
+ * If the linux inode is valid, mark it dirty.
* Used when commiting a dirty inode into a transaction so that
* the inode will get written back by the linux code
*/
@@ -81,7 +82,7 @@ xfs_mark_inode_dirty_sync(
{
struct inode *inode = VFS_I(ip);
- if (inode)
+ if (!(inode->i_state & (I_WILL_FREE|I_FREEING|I_CLEAR)))
mark_inode_dirty_sync(inode);
}
@@ -128,7 +129,7 @@ xfs_ichgtime(
if (sync_it) {
SYNCHRONIZE();
ip->i_update_core = 1;
- mark_inode_dirty_sync(inode);
+ xfs_mark_inode_dirty_sync(ip);
}
}
@@ -158,8 +159,6 @@ xfs_init_security(
}
error = xfs_attr_set(ip, name, value, length, ATTR_SECURE);
- if (!error)
- xfs_iflags_set(ip, XFS_IMODIFIED);
kfree(name);
kfree(value);
@@ -260,7 +259,6 @@ xfs_vn_mknod(
error = _ACL_INHERIT(inode, mode, default_acl);
if (unlikely(error))
goto out_cleanup_inode;
- xfs_iflags_set(ip, XFS_IMODIFIED);
_ACL_FREE(default_acl);
}
@@ -366,21 +364,17 @@ xfs_vn_link(
struct inode *dir,
struct dentry *dentry)
{
- struct inode *inode; /* inode of guy being linked to */
+ struct inode *inode = old_dentry->d_inode;
struct xfs_name name;
int error;
- inode = old_dentry->d_inode;
xfs_dentry_to_name(&name, dentry);
- igrab(inode);
error = xfs_link(XFS_I(dir), XFS_I(inode), &name);
- if (unlikely(error)) {
- iput(inode);
+ if (unlikely(error))
return -error;
- }
- xfs_iflags_set(XFS_I(dir), XFS_IMODIFIED);
+ atomic_inc(&inode->i_count);
d_instantiate(dentry, inode);
return 0;
}
@@ -601,7 +595,7 @@ xfs_vn_setattr(
struct dentry *dentry,
struct iattr *iattr)
{
- return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0, NULL);
+ return -xfs_setattr(XFS_I(dentry->d_inode), iattr, 0);
}
/*
@@ -642,7 +636,7 @@ xfs_vn_fallocate(
xfs_ilock(ip, XFS_IOLOCK_EXCL);
error = xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
- 0, NULL, XFS_ATTR_NOLOCK);
+ 0, XFS_ATTR_NOLOCK);
if (!error && !(mode & FALLOC_FL_KEEP_SIZE) &&
offset + len > i_size_read(inode))
new_size = offset + len;
@@ -653,7 +647,7 @@ xfs_vn_fallocate(
iattr.ia_valid = ATTR_SIZE;
iattr.ia_size = new_size;
- error = xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK, NULL);
+ error = xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
}
xfs_iunlock(ip, XFS_IOLOCK_EXCL);
@@ -661,6 +655,88 @@ out_error:
return error;
}
+#define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR)
+
+/*
+ * Call fiemap helper to fill in user data.
+ * Returns positive errors to xfs_getbmap.
+ */
+STATIC int
+xfs_fiemap_format(
+ void **arg,
+ struct getbmapx *bmv,
+ int *full)
+{
+ int error;
+ struct fiemap_extent_info *fieinfo = *arg;
+ u32 fiemap_flags = 0;
+ u64 logical, physical, length;
+
+ /* Do nothing for a hole */
+ if (bmv->bmv_block == -1LL)
+ return 0;
+
+ logical = BBTOB(bmv->bmv_offset);
+ physical = BBTOB(bmv->bmv_block);
+ length = BBTOB(bmv->bmv_length);
+
+ if (bmv->bmv_oflags & BMV_OF_PREALLOC)
+ fiemap_flags |= FIEMAP_EXTENT_UNWRITTEN;
+ else if (bmv->bmv_oflags & BMV_OF_DELALLOC) {
+ fiemap_flags |= FIEMAP_EXTENT_DELALLOC;
+ physical = 0; /* no block yet */
+ }
+ if (bmv->bmv_oflags & BMV_OF_LAST)
+ fiemap_flags |= FIEMAP_EXTENT_LAST;
+
+ error = fiemap_fill_next_extent(fieinfo, logical, physical,
+ length, fiemap_flags);
+ if (error > 0) {
+ error = 0;
+ *full = 1; /* user array now full */
+ }
+
+ return -error;
+}
+
+STATIC int
+xfs_vn_fiemap(
+ struct inode *inode,
+ struct fiemap_extent_info *fieinfo,
+ u64 start,
+ u64 length)
+{
+ xfs_inode_t *ip = XFS_I(inode);
+ struct getbmapx bm;
+ int error;
+
+ error = fiemap_check_flags(fieinfo, XFS_FIEMAP_FLAGS);
+ if (error)
+ return error;
+
+ /* Set up bmap header for xfs internal routine */
+ bm.bmv_offset = BTOBB(start);
+ /* Special case for whole file */
+ if (length == FIEMAP_MAX_OFFSET)
+ bm.bmv_length = -1LL;
+ else
+ bm.bmv_length = BTOBB(length);
+
+ /* our formatter will tell xfs_getbmap when to stop. */
+ bm.bmv_count = MAXEXTNUM;
+ bm.bmv_iflags = BMV_IF_PREALLOC;
+ if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR)
+ bm.bmv_iflags |= BMV_IF_ATTRFORK;
+ if (!(fieinfo->fi_flags & FIEMAP_FLAG_SYNC))
+ bm.bmv_iflags |= BMV_IF_DELALLOC;
+
+ error = xfs_getbmap(ip, &bm, xfs_fiemap_format, fieinfo);
+ if (error)
+ return -error;
+
+ return 0;
+}
+
static const struct inode_operations xfs_inode_operations = {
.permission = xfs_vn_permission,
.truncate = xfs_vn_truncate,
@@ -671,6 +747,7 @@ static const struct inode_operations xfs_inode_operations = {
.removexattr = generic_removexattr,
.listxattr = xfs_vn_listxattr,
.fallocate = xfs_vn_fallocate,
+ .fiemap = xfs_vn_fiemap,
};
static const struct inode_operations xfs_dir_inode_operations = {
@@ -766,12 +843,20 @@ xfs_diflags_to_iflags(
* When reading existing inodes from disk this is called directly
* from xfs_iget, when creating a new inode it is called from
* xfs_ialloc after setting up the inode.
+ *
+ * We are always called with an uninitialised linux inode here.
+ * We need to initialise the necessary fields and take a reference
+ * on it.
*/
void
xfs_setup_inode(
struct xfs_inode *ip)
{
- struct inode *inode = ip->i_vnode;
+ struct inode *inode = &ip->i_vnode;
+
+ inode->i_ino = ip->i_ino;
+ inode->i_state = I_NEW|I_LOCK;
+ inode_add_to_lists(ip->i_mount->m_super, inode);
inode->i_mode = ip->i_d.di_mode;
inode->i_nlink = ip->i_d.di_nlink;
@@ -799,7 +884,6 @@ xfs_setup_inode(
inode->i_ctime.tv_sec = ip->i_d.di_ctime.t_sec;
inode->i_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec;
xfs_diflags_to_iflags(inode, ip);
- xfs_iflags_clear(ip, XFS_IMODIFIED);
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
diff --git a/fs/xfs/linux-2.6/xfs_iops.h b/fs/xfs/linux-2.6/xfs_iops.h
index 8b1a1e31dc21..ef41c92ce66e 100644
--- a/fs/xfs/linux-2.6/xfs_iops.h
+++ b/fs/xfs/linux-2.6/xfs_iops.h
@@ -22,7 +22,6 @@ struct xfs_inode;
extern const struct file_operations xfs_file_operations;
extern const struct file_operations xfs_dir_file_operations;
-extern const struct file_operations xfs_invis_file_operations;
extern ssize_t xfs_vn_listxattr(struct dentry *, char *data, size_t size);
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index cc0f7b3a9795..507492d6dccd 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -21,18 +21,12 @@
#include <linux/types.h>
/*
- * Some types are conditional depending on the target system.
* XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits.
- * XFS_BIG_INUMS needs the VFS inode number to be 64 bits, as well
- * as requiring XFS_BIG_BLKNOS to be set.
+ * XFS_BIG_INUMS requires XFS_BIG_BLKNOS to be set.
*/
#if defined(CONFIG_LBD) || (BITS_PER_LONG == 64)
# define XFS_BIG_BLKNOS 1
-# if BITS_PER_LONG == 64
-# define XFS_BIG_INUMS 1
-# else
-# define XFS_BIG_INUMS 0
-# endif
+# define XFS_BIG_INUMS 1
#else
# define XFS_BIG_BLKNOS 0
# define XFS_BIG_INUMS 0
@@ -77,6 +71,7 @@
#include <linux/spinlock.h>
#include <linux/random.h>
#include <linux/ctype.h>
+#include <linux/writeback.h>
#include <asm/page.h>
#include <asm/div64.h>
@@ -85,7 +80,6 @@
#include <asm/byteorder.h>
#include <asm/unaligned.h>
-#include <xfs_vfs.h>
#include <xfs_cred.h>
#include <xfs_vnode.h>
#include <xfs_stats.h>
@@ -107,7 +101,6 @@
#undef HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */
#endif
-#define restricted_chown xfs_params.restrict_chown.val
#define irix_sgid_inherit xfs_params.sgid_inherit.val
#define irix_symlink_mode xfs_params.symlink_mode.val
#define xfs_panic_mask xfs_params.panic_mask.val
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c
index 1957e5357d04..7e90daa0d1d1 100644
--- a/fs/xfs/linux-2.6/xfs_lrw.c
+++ b/fs/xfs/linux-2.6/xfs_lrw.c
@@ -51,7 +51,6 @@
#include "xfs_vnodeops.h"
#include <linux/capability.h>
-#include <linux/mount.h>
#include <linux/writeback.h>
@@ -243,7 +242,7 @@ xfs_read(
if (unlikely(ioflags & IO_ISDIRECT)) {
if (inode->i_mapping->nrpages)
- ret = xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK),
+ ret = -xfs_flushinval_pages(ip, (*offset & PAGE_CACHE_MASK),
-1, FI_REMAPF_LOCKED);
mutex_unlock(&inode->i_mutex);
if (ret) {
@@ -668,15 +667,8 @@ start:
if (new_size > xip->i_size)
xip->i_new_size = new_size;
- /*
- * We're not supposed to change timestamps in readonly-mounted
- * filesystems. Throw it away if anyone asks us.
- */
- if (likely(!(ioflags & IO_INVIS) &&
- !mnt_want_write(file->f_path.mnt))) {
+ if (likely(!(ioflags & IO_INVIS)))
xfs_ichgtime(xip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
- mnt_drop_write(file->f_path.mnt);
- }
/*
* If the offset is beyond the size of the file, we have a couple
@@ -715,7 +707,6 @@ start:
}
}
-retry:
/* We can write back this queue in page reclaim */
current->backing_dev_info = mapping->backing_dev_info;
@@ -771,6 +762,17 @@ retry:
if (ret == -EIOCBQUEUED && !(ioflags & IO_ISAIO))
ret = wait_on_sync_kiocb(iocb);
+ isize = i_size_read(inode);
+ if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize))
+ *offset = isize;
+
+ if (*offset > xip->i_size) {
+ xfs_ilock(xip, XFS_ILOCK_EXCL);
+ if (*offset > xip->i_size)
+ xip->i_size = *offset;
+ xfs_iunlock(xip, XFS_ILOCK_EXCL);
+ }
+
if (ret == -ENOSPC &&
DM_EVENT_ENABLED(xip, DM_EVENT_NOSPACE) && !(ioflags & IO_INVIS)) {
xfs_iunlock(xip, iolock);
@@ -784,20 +786,7 @@ retry:
xfs_ilock(xip, iolock);
if (error)
goto out_unlock_internal;
- pos = xip->i_size;
- ret = 0;
- goto retry;
- }
-
- isize = i_size_read(inode);
- if (unlikely(ret < 0 && ret != -EFAULT && *offset > isize))
- *offset = isize;
-
- if (*offset > xip->i_size) {
- xfs_ilock(xip, XFS_ILOCK_EXCL);
- if (*offset > xip->i_size)
- xip->i_size = *offset;
- xfs_iunlock(xip, XFS_ILOCK_EXCL);
+ goto start;
}
error = -ret;
@@ -855,13 +844,7 @@ retry:
int
xfs_bdstrat_cb(struct xfs_buf *bp)
{
- xfs_mount_t *mp;
-
- mp = XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *);
- if (!XFS_FORCED_SHUTDOWN(mp)) {
- xfs_buf_iorequest(bp);
- return 0;
- } else {
+ if (XFS_FORCED_SHUTDOWN(bp->b_mount)) {
xfs_buftrace("XFS__BDSTRAT IOERROR", bp);
/*
* Metadata write that didn't get logged but
@@ -874,6 +857,9 @@ xfs_bdstrat_cb(struct xfs_buf *bp)
else
return (xfs_bioerror(bp));
}
+
+ xfs_buf_iorequest(bp);
+ return 0;
}
/*
diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c
index 3d5b67c075c7..c3526d445f6a 100644
--- a/fs/xfs/linux-2.6/xfs_stats.c
+++ b/fs/xfs/linux-2.6/xfs_stats.c
@@ -53,11 +53,15 @@ xfs_read_xfsstats(
{ "icluster", XFSSTAT_END_INODE_CLUSTER },
{ "vnodes", XFSSTAT_END_VNODE_OPS },
{ "buf", XFSSTAT_END_BUF },
+ { "abtb2", XFSSTAT_END_ABTB_V2 },
+ { "abtc2", XFSSTAT_END_ABTC_V2 },
+ { "bmbt2", XFSSTAT_END_BMBT_V2 },
+ { "ibt2", XFSSTAT_END_IBT_V2 },
};
/* Loop over all stats groups */
for (i=j=len = 0; i < ARRAY_SIZE(xstats); i++) {
- len += sprintf(buffer + len, xstats[i].desc);
+ len += sprintf(buffer + len, "%s", xstats[i].desc);
/* inner loop does each group */
while (j < xstats[i].endpoint) {
val = 0;
diff --git a/fs/xfs/linux-2.6/xfs_stats.h b/fs/xfs/linux-2.6/xfs_stats.h
index e83820febc9f..736854b1ca1a 100644
--- a/fs/xfs/linux-2.6/xfs_stats.h
+++ b/fs/xfs/linux-2.6/xfs_stats.h
@@ -118,6 +118,71 @@ struct xfsstats {
__uint32_t xb_page_retries;
__uint32_t xb_page_found;
__uint32_t xb_get_read;
+/* Version 2 btree counters */
+#define XFSSTAT_END_ABTB_V2 (XFSSTAT_END_BUF+15)
+ __uint32_t xs_abtb_2_lookup;
+ __uint32_t xs_abtb_2_compare;
+ __uint32_t xs_abtb_2_insrec;
+ __uint32_t xs_abtb_2_delrec;
+ __uint32_t xs_abtb_2_newroot;
+ __uint32_t xs_abtb_2_killroot;
+ __uint32_t xs_abtb_2_increment;
+ __uint32_t xs_abtb_2_decrement;
+ __uint32_t xs_abtb_2_lshift;
+ __uint32_t xs_abtb_2_rshift;
+ __uint32_t xs_abtb_2_split;
+ __uint32_t xs_abtb_2_join;
+ __uint32_t xs_abtb_2_alloc;
+ __uint32_t xs_abtb_2_free;
+ __uint32_t xs_abtb_2_moves;
+#define XFSSTAT_END_ABTC_V2 (XFSSTAT_END_ABTB_V2+15)
+ __uint32_t xs_abtc_2_lookup;
+ __uint32_t xs_abtc_2_compare;
+ __uint32_t xs_abtc_2_insrec;
+ __uint32_t xs_abtc_2_delrec;
+ __uint32_t xs_abtc_2_newroot;
+ __uint32_t xs_abtc_2_killroot;
+ __uint32_t xs_abtc_2_increment;
+ __uint32_t xs_abtc_2_decrement;
+ __uint32_t xs_abtc_2_lshift;
+ __uint32_t xs_abtc_2_rshift;
+ __uint32_t xs_abtc_2_split;
+ __uint32_t xs_abtc_2_join;
+ __uint32_t xs_abtc_2_alloc;
+ __uint32_t xs_abtc_2_free;
+ __uint32_t xs_abtc_2_moves;
+#define XFSSTAT_END_BMBT_V2 (XFSSTAT_END_ABTC_V2+15)
+ __uint32_t xs_bmbt_2_lookup;
+ __uint32_t xs_bmbt_2_compare;
+ __uint32_t xs_bmbt_2_insrec;
+ __uint32_t xs_bmbt_2_delrec;
+ __uint32_t xs_bmbt_2_newroot;
+ __uint32_t xs_bmbt_2_killroot;
+ __uint32_t xs_bmbt_2_increment;
+ __uint32_t xs_bmbt_2_decrement;
+ __uint32_t xs_bmbt_2_lshift;
+ __uint32_t xs_bmbt_2_rshift;
+ __uint32_t xs_bmbt_2_split;
+ __uint32_t xs_bmbt_2_join;
+ __uint32_t xs_bmbt_2_alloc;
+ __uint32_t xs_bmbt_2_free;
+ __uint32_t xs_bmbt_2_moves;
+#define XFSSTAT_END_IBT_V2 (XFSSTAT_END_BMBT_V2+15)
+ __uint32_t xs_ibt_2_lookup;
+ __uint32_t xs_ibt_2_compare;
+ __uint32_t xs_ibt_2_insrec;
+ __uint32_t xs_ibt_2_delrec;
+ __uint32_t xs_ibt_2_newroot;
+ __uint32_t xs_ibt_2_killroot;
+ __uint32_t xs_ibt_2_increment;
+ __uint32_t xs_ibt_2_decrement;
+ __uint32_t xs_ibt_2_lshift;
+ __uint32_t xs_ibt_2_rshift;
+ __uint32_t xs_ibt_2_split;
+ __uint32_t xs_ibt_2_join;
+ __uint32_t xs_ibt_2_alloc;
+ __uint32_t xs_ibt_2_free;
+ __uint32_t xs_ibt_2_moves;
/* Extra precision counters */
__uint64_t xs_xstrat_bytes;
__uint64_t xs_write_bytes;
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 37ebe36056eb..36f6cc703ef2 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -18,7 +18,6 @@
#include "xfs.h"
#include "xfs_bit.h"
#include "xfs_log.h"
-#include "xfs_clnt.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
@@ -36,6 +35,7 @@
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
+#include "xfs_btree_trace.h"
#include "xfs_ialloc.h"
#include "xfs_bmap.h"
#include "xfs_rtalloc.h"
@@ -48,7 +48,6 @@
#include "xfs_buf_item.h"
#include "xfs_utils.h"
#include "xfs_vnodeops.h"
-#include "xfs_vfsops.h"
#include "xfs_version.h"
#include "xfs_log_priv.h"
#include "xfs_trans_priv.h"
@@ -58,6 +57,7 @@
#include "xfs_extfree_item.h"
#include "xfs_mru_cache.h"
#include "xfs_inode_item.h"
+#include "xfs_sync.h"
#include <linux/namei.h>
#include <linux/init.h>
@@ -70,36 +70,9 @@
static struct quotactl_ops xfs_quotactl_operations;
static struct super_operations xfs_super_operations;
-static kmem_zone_t *xfs_vnode_zone;
static kmem_zone_t *xfs_ioend_zone;
mempool_t *xfs_ioend_pool;
-STATIC struct xfs_mount_args *
-xfs_args_allocate(
- struct super_block *sb,
- int silent)
-{
- struct xfs_mount_args *args;
-
- args = kzalloc(sizeof(struct xfs_mount_args), GFP_KERNEL);
- if (!args)
- return NULL;
-
- args->logbufs = args->logbufsize = -1;
- strncpy(args->fsname, sb->s_id, MAXNAMELEN);
-
- /* Copy the already-parsed mount(2) flags we're interested in */
- if (sb->s_flags & MS_DIRSYNC)
- args->flags |= XFSMNT_DIRSYNC;
- if (sb->s_flags & MS_SYNCHRONOUS)
- args->flags |= XFSMNT_WSYNC;
- if (silent)
- args->flags |= XFSMNT_QUIET;
- args->flags |= XFSMNT_32BITINODES;
-
- return args;
-}
-
#define MNTOPT_LOGBUFS "logbufs" /* number of XFS log buffers */
#define MNTOPT_LOGBSIZE "logbsize" /* size of XFS log buffers */
#define MNTOPT_LOGDEV "logdev" /* log device */
@@ -188,26 +161,54 @@ suffix_strtoul(char *s, char **endp, unsigned int base)
return simple_strtoul((const char *)s, endp, base) << shift_left_factor;
}
+/*
+ * This function fills in xfs_mount_t fields based on mount args.
+ * Note: the superblock has _not_ yet been read in.
+ *
+ * Note that this function leaks the various device name allocations on
+ * failure. The caller takes care of them.
+ */
STATIC int
xfs_parseargs(
struct xfs_mount *mp,
char *options,
- struct xfs_mount_args *args,
- int update)
+ char **mtpt)
{
+ struct super_block *sb = mp->m_super;
char *this_char, *value, *eov;
- int dsunit, dswidth, vol_dsunit, vol_dswidth;
- int iosize;
+ int dsunit = 0;
+ int dswidth = 0;
+ int iosize = 0;
int dmapi_implies_ikeep = 1;
+ uchar_t iosizelog = 0;
+
+ /*
+ * Copy binary VFS mount flags we are interested in.
+ */
+ if (sb->s_flags & MS_RDONLY)
+ mp->m_flags |= XFS_MOUNT_RDONLY;
+ if (sb->s_flags & MS_DIRSYNC)
+ mp->m_flags |= XFS_MOUNT_DIRSYNC;
+ if (sb->s_flags & MS_SYNCHRONOUS)
+ mp->m_flags |= XFS_MOUNT_WSYNC;
+
+ /*
+ * Set some default flags that could be cleared by the mount option
+ * parsing.
+ */
+ mp->m_flags |= XFS_MOUNT_BARRIER;
+ mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
+ mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
- args->flags |= XFSMNT_BARRIER;
- args->flags2 |= XFSMNT2_COMPAT_IOSIZE;
+ /*
+ * These can be overridden by the mount option parsing.
+ */
+ mp->m_logbufs = -1;
+ mp->m_logbsize = -1;
if (!options)
goto done;
- iosize = dsunit = dswidth = vol_dsunit = vol_dswidth = 0;
-
while ((this_char = strsep(&options, ",")) != NULL) {
if (!*this_char)
continue;
@@ -221,7 +222,7 @@ xfs_parseargs(
this_char);
return EINVAL;
}
- args->logbufs = simple_strtoul(value, &eov, 10);
+ mp->m_logbufs = simple_strtoul(value, &eov, 10);
} else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) {
if (!value || !*value) {
cmn_err(CE_WARN,
@@ -229,7 +230,7 @@ xfs_parseargs(
this_char);
return EINVAL;
}
- args->logbufsize = suffix_strtoul(value, &eov, 10);
+ mp->m_logbsize = suffix_strtoul(value, &eov, 10);
} else if (!strcmp(this_char, MNTOPT_LOGDEV)) {
if (!value || !*value) {
cmn_err(CE_WARN,
@@ -237,7 +238,9 @@ xfs_parseargs(
this_char);
return EINVAL;
}
- strncpy(args->logname, value, MAXNAMELEN);
+ mp->m_logname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
+ if (!mp->m_logname)
+ return ENOMEM;
} else if (!strcmp(this_char, MNTOPT_MTPT)) {
if (!value || !*value) {
cmn_err(CE_WARN,
@@ -245,7 +248,9 @@ xfs_parseargs(
this_char);
return EINVAL;
}
- strncpy(args->mtpt, value, MAXNAMELEN);
+ *mtpt = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
+ if (!*mtpt)
+ return ENOMEM;
} else if (!strcmp(this_char, MNTOPT_RTDEV)) {
if (!value || !*value) {
cmn_err(CE_WARN,
@@ -253,7 +258,9 @@ xfs_parseargs(
this_char);
return EINVAL;
}
- strncpy(args->rtname, value, MAXNAMELEN);
+ mp->m_rtname = kstrndup(value, MAXNAMELEN, GFP_KERNEL);
+ if (!mp->m_rtname)
+ return ENOMEM;
} else if (!strcmp(this_char, MNTOPT_BIOSIZE)) {
if (!value || !*value) {
cmn_err(CE_WARN,
@@ -262,8 +269,7 @@ xfs_parseargs(
return EINVAL;
}
iosize = simple_strtoul(value, &eov, 10);
- args->flags |= XFSMNT_IOSIZE;
- args->iosizelog = (uint8_t) iosize;
+ iosizelog = ffs(iosize) - 1;
} else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) {
if (!value || !*value) {
cmn_err(CE_WARN,
@@ -272,8 +278,7 @@ xfs_parseargs(
return EINVAL;
}
iosize = suffix_strtoul(value, &eov, 10);
- args->flags |= XFSMNT_IOSIZE;
- args->iosizelog = ffs(iosize) - 1;
+ iosizelog = ffs(iosize) - 1;
} else if (!strcmp(this_char, MNTOPT_GRPID) ||
!strcmp(this_char, MNTOPT_BSDGROUPS)) {
mp->m_flags |= XFS_MOUNT_GRPID;
@@ -281,23 +286,25 @@ xfs_parseargs(
!strcmp(this_char, MNTOPT_SYSVGROUPS)) {
mp->m_flags &= ~XFS_MOUNT_GRPID;
} else if (!strcmp(this_char, MNTOPT_WSYNC)) {
- args->flags |= XFSMNT_WSYNC;
+ mp->m_flags |= XFS_MOUNT_WSYNC;
} else if (!strcmp(this_char, MNTOPT_OSYNCISOSYNC)) {
- args->flags |= XFSMNT_OSYNCISOSYNC;
+ mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC;
} else if (!strcmp(this_char, MNTOPT_NORECOVERY)) {
- args->flags |= XFSMNT_NORECOVERY;
+ mp->m_flags |= XFS_MOUNT_NORECOVERY;
} else if (!strcmp(this_char, MNTOPT_INO64)) {
- args->flags |= XFSMNT_INO64;
-#if !XFS_BIG_INUMS
+#if XFS_BIG_INUMS
+ mp->m_flags |= XFS_MOUNT_INO64;
+ mp->m_inoadd = XFS_INO64_OFFSET;
+#else
cmn_err(CE_WARN,
"XFS: %s option not allowed on this system",
this_char);
return EINVAL;
#endif
} else if (!strcmp(this_char, MNTOPT_NOALIGN)) {
- args->flags |= XFSMNT_NOALIGN;
+ mp->m_flags |= XFS_MOUNT_NOALIGN;
} else if (!strcmp(this_char, MNTOPT_SWALLOC)) {
- args->flags |= XFSMNT_SWALLOC;
+ mp->m_flags |= XFS_MOUNT_SWALLOC;
} else if (!strcmp(this_char, MNTOPT_SUNIT)) {
if (!value || !*value) {
cmn_err(CE_WARN,
@@ -315,7 +322,7 @@ xfs_parseargs(
}
dswidth = simple_strtoul(value, &eov, 10);
} else if (!strcmp(this_char, MNTOPT_64BITINODE)) {
- args->flags &= ~XFSMNT_32BITINODES;
+ mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
#if !XFS_BIG_INUMS
cmn_err(CE_WARN,
"XFS: %s option not allowed on this system",
@@ -323,56 +330,61 @@ xfs_parseargs(
return EINVAL;
#endif
} else if (!strcmp(this_char, MNTOPT_NOUUID)) {
- args->flags |= XFSMNT_NOUUID;
+ mp->m_flags |= XFS_MOUNT_NOUUID;
} else if (!strcmp(this_char, MNTOPT_BARRIER)) {
- args->flags |= XFSMNT_BARRIER;
+ mp->m_flags |= XFS_MOUNT_BARRIER;
} else if (!strcmp(this_char, MNTOPT_NOBARRIER)) {
- args->flags &= ~XFSMNT_BARRIER;
+ mp->m_flags &= ~XFS_MOUNT_BARRIER;
} else if (!strcmp(this_char, MNTOPT_IKEEP)) {
- args->flags |= XFSMNT_IKEEP;
+ mp->m_flags |= XFS_MOUNT_IKEEP;
} else if (!strcmp(this_char, MNTOPT_NOIKEEP)) {
dmapi_implies_ikeep = 0;
- args->flags &= ~XFSMNT_IKEEP;
+ mp->m_flags &= ~XFS_MOUNT_IKEEP;
} else if (!strcmp(this_char, MNTOPT_LARGEIO)) {
- args->flags2 &= ~XFSMNT2_COMPAT_IOSIZE;
+ mp->m_flags &= ~XFS_MOUNT_COMPAT_IOSIZE;
} else if (!strcmp(this_char, MNTOPT_NOLARGEIO)) {
- args->flags2 |= XFSMNT2_COMPAT_IOSIZE;
+ mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
} else if (!strcmp(this_char, MNTOPT_ATTR2)) {
- args->flags |= XFSMNT_ATTR2;
+ mp->m_flags |= XFS_MOUNT_ATTR2;
} else if (!strcmp(this_char, MNTOPT_NOATTR2)) {
- args->flags &= ~XFSMNT_ATTR2;
- args->flags |= XFSMNT_NOATTR2;
+ mp->m_flags &= ~XFS_MOUNT_ATTR2;
+ mp->m_flags |= XFS_MOUNT_NOATTR2;
} else if (!strcmp(this_char, MNTOPT_FILESTREAM)) {
- args->flags2 |= XFSMNT2_FILESTREAMS;
+ mp->m_flags |= XFS_MOUNT_FILESTREAMS;
} else if (!strcmp(this_char, MNTOPT_NOQUOTA)) {
- args->flags &= ~(XFSMNT_UQUOTAENF|XFSMNT_UQUOTA);
- args->flags &= ~(XFSMNT_GQUOTAENF|XFSMNT_GQUOTA);
+ mp->m_qflags &= ~(XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE |
+ XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
+ XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
+ XFS_UQUOTA_ENFD | XFS_OQUOTA_ENFD);
} else if (!strcmp(this_char, MNTOPT_QUOTA) ||
!strcmp(this_char, MNTOPT_UQUOTA) ||
!strcmp(this_char, MNTOPT_USRQUOTA)) {
- args->flags |= XFSMNT_UQUOTA | XFSMNT_UQUOTAENF;
+ mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE |
+ XFS_UQUOTA_ENFD);
} else if (!strcmp(this_char, MNTOPT_QUOTANOENF) ||
!strcmp(this_char, MNTOPT_UQUOTANOENF)) {
- args->flags |= XFSMNT_UQUOTA;
- args->flags &= ~XFSMNT_UQUOTAENF;
+ mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
+ mp->m_qflags &= ~XFS_UQUOTA_ENFD;
} else if (!strcmp(this_char, MNTOPT_PQUOTA) ||
!strcmp(this_char, MNTOPT_PRJQUOTA)) {
- args->flags |= XFSMNT_PQUOTA | XFSMNT_PQUOTAENF;
+ mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
+ XFS_OQUOTA_ENFD);
} else if (!strcmp(this_char, MNTOPT_PQUOTANOENF)) {
- args->flags |= XFSMNT_PQUOTA;
- args->flags &= ~XFSMNT_PQUOTAENF;
+ mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
+ mp->m_qflags &= ~XFS_OQUOTA_ENFD;
} else if (!strcmp(this_char, MNTOPT_GQUOTA) ||
!strcmp(this_char, MNTOPT_GRPQUOTA)) {
- args->flags |= XFSMNT_GQUOTA | XFSMNT_GQUOTAENF;
+ mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
+ XFS_OQUOTA_ENFD);
} else if (!strcmp(this_char, MNTOPT_GQUOTANOENF)) {
- args->flags |= XFSMNT_GQUOTA;
- args->flags &= ~XFSMNT_GQUOTAENF;
+ mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
+ mp->m_qflags &= ~XFS_OQUOTA_ENFD;
} else if (!strcmp(this_char, MNTOPT_DMAPI)) {
- args->flags |= XFSMNT_DMAPI;
+ mp->m_flags |= XFS_MOUNT_DMAPI;
} else if (!strcmp(this_char, MNTOPT_XDSM)) {
- args->flags |= XFSMNT_DMAPI;
+ mp->m_flags |= XFS_MOUNT_DMAPI;
} else if (!strcmp(this_char, MNTOPT_DMI)) {
- args->flags |= XFSMNT_DMAPI;
+ mp->m_flags |= XFS_MOUNT_DMAPI;
} else if (!strcmp(this_char, "ihashsize")) {
cmn_err(CE_WARN,
"XFS: ihashsize no longer used, option is deprecated.");
@@ -390,27 +402,29 @@ xfs_parseargs(
}
}
- if (args->flags & XFSMNT_NORECOVERY) {
- if ((mp->m_flags & XFS_MOUNT_RDONLY) == 0) {
- cmn_err(CE_WARN,
- "XFS: no-recovery mounts must be read-only.");
- return EINVAL;
- }
+ /*
+ * no recovery flag requires a read-only mount
+ */
+ if ((mp->m_flags & XFS_MOUNT_NORECOVERY) &&
+ !(mp->m_flags & XFS_MOUNT_RDONLY)) {
+ cmn_err(CE_WARN, "XFS: no-recovery mounts must be read-only.");
+ return EINVAL;
}
- if ((args->flags & XFSMNT_NOALIGN) && (dsunit || dswidth)) {
+ if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) {
cmn_err(CE_WARN,
"XFS: sunit and swidth options incompatible with the noalign option");
return EINVAL;
}
- if ((args->flags & XFSMNT_GQUOTA) && (args->flags & XFSMNT_PQUOTA)) {
+ if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) &&
+ (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) {
cmn_err(CE_WARN,
"XFS: cannot mount with both project and group quota");
return EINVAL;
}
- if ((args->flags & XFSMNT_DMAPI) && *args->mtpt == '\0') {
+ if ((mp->m_flags & XFS_MOUNT_DMAPI) && (!*mtpt || *mtpt[0] == '\0')) {
printk("XFS: %s option needs the mount point option as well\n",
MNTOPT_DMAPI);
return EINVAL;
@@ -438,27 +452,66 @@ xfs_parseargs(
* Note that if "ikeep" or "noikeep" mount options are
* supplied, then they are honored.
*/
- if ((args->flags & XFSMNT_DMAPI) && dmapi_implies_ikeep)
- args->flags |= XFSMNT_IKEEP;
+ if ((mp->m_flags & XFS_MOUNT_DMAPI) && dmapi_implies_ikeep)
+ mp->m_flags |= XFS_MOUNT_IKEEP;
- if ((args->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) {
+done:
+ if (!(mp->m_flags & XFS_MOUNT_NOALIGN)) {
+ /*
+ * At this point the superblock has not been read
+ * in, therefore we do not know the block size.
+ * Before the mount call ends we will convert
+ * these to FSBs.
+ */
if (dsunit) {
- args->sunit = dsunit;
- args->flags |= XFSMNT_RETERR;
- } else {
- args->sunit = vol_dsunit;
+ mp->m_dalign = dsunit;
+ mp->m_flags |= XFS_MOUNT_RETERR;
}
- dswidth ? (args->swidth = dswidth) :
- (args->swidth = vol_dswidth);
- } else {
- args->sunit = args->swidth = 0;
+
+ if (dswidth)
+ mp->m_swidth = dswidth;
+ }
+
+ if (mp->m_logbufs != -1 &&
+ mp->m_logbufs != 0 &&
+ (mp->m_logbufs < XLOG_MIN_ICLOGS ||
+ mp->m_logbufs > XLOG_MAX_ICLOGS)) {
+ cmn_err(CE_WARN,
+ "XFS: invalid logbufs value: %d [not %d-%d]",
+ mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
+ return XFS_ERROR(EINVAL);
+ }
+ if (mp->m_logbsize != -1 &&
+ mp->m_logbsize != 0 &&
+ (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE ||
+ mp->m_logbsize > XLOG_MAX_RECORD_BSIZE ||
+ !is_power_of_2(mp->m_logbsize))) {
+ cmn_err(CE_WARN,
+ "XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
+ mp->m_logbsize);
+ return XFS_ERROR(EINVAL);
+ }
+
+ mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL);
+ if (!mp->m_fsname)
+ return ENOMEM;
+ mp->m_fsname_len = strlen(mp->m_fsname) + 1;
+
+ if (iosizelog) {
+ if (iosizelog > XFS_MAX_IO_LOG ||
+ iosizelog < XFS_MIN_IO_LOG) {
+ cmn_err(CE_WARN,
+ "XFS: invalid log iosize: %d [not %d-%d]",
+ iosizelog, XFS_MIN_IO_LOG,
+ XFS_MAX_IO_LOG);
+ return XFS_ERROR(EINVAL);
+ }
+
+ mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE;
+ mp->m_readio_log = iosizelog;
+ mp->m_writeio_log = iosizelog;
}
-done:
- if (args->flags & XFSMNT_32BITINODES)
- mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
- if (args->flags2)
- args->flags |= XFSMNT_FLAGS2;
return 0;
}
@@ -704,8 +757,7 @@ xfs_close_devices(
*/
STATIC int
xfs_open_devices(
- struct xfs_mount *mp,
- struct xfs_mount_args *args)
+ struct xfs_mount *mp)
{
struct block_device *ddev = mp->m_super->s_bdev;
struct block_device *logdev = NULL, *rtdev = NULL;
@@ -714,14 +766,14 @@ xfs_open_devices(
/*
* Open real time and log devices - order is important.
*/
- if (args->logname[0]) {
- error = xfs_blkdev_get(mp, args->logname, &logdev);
+ if (mp->m_logname) {
+ error = xfs_blkdev_get(mp, mp->m_logname, &logdev);
if (error)
goto out;
}
- if (args->rtname[0]) {
- error = xfs_blkdev_get(mp, args->rtname, &rtdev);
+ if (mp->m_rtname) {
+ error = xfs_blkdev_get(mp, mp->m_rtname, &rtdev);
if (error)
goto out_close_logdev;
@@ -813,18 +865,18 @@ xfs_setup_devices(
*/
void
xfsaild_wakeup(
- xfs_mount_t *mp,
+ struct xfs_ail *ailp,
xfs_lsn_t threshold_lsn)
{
- mp->m_ail.xa_target = threshold_lsn;
- wake_up_process(mp->m_ail.xa_task);
+ ailp->xa_target = threshold_lsn;
+ wake_up_process(ailp->xa_task);
}
int
xfsaild(
void *data)
{
- xfs_mount_t *mp = (xfs_mount_t *)data;
+ struct xfs_ail *ailp = data;
xfs_lsn_t last_pushed_lsn = 0;
long tout = 0;
@@ -836,11 +888,11 @@ xfsaild(
/* swsusp */
try_to_freeze();
- ASSERT(mp->m_log);
- if (XFS_FORCED_SHUTDOWN(mp))
+ ASSERT(ailp->xa_mount->m_log);
+ if (XFS_FORCED_SHUTDOWN(ailp->xa_mount))
continue;
- tout = xfsaild_push(mp, &last_pushed_lsn);
+ tout = xfsaild_push(ailp, &last_pushed_lsn);
}
return 0;
@@ -848,43 +900,82 @@ xfsaild(
int
xfsaild_start(
- xfs_mount_t *mp)
+ struct xfs_ail *ailp)
{
- mp->m_ail.xa_target = 0;
- mp->m_ail.xa_task = kthread_run(xfsaild, mp, "xfsaild");
- if (IS_ERR(mp->m_ail.xa_task))
- return -PTR_ERR(mp->m_ail.xa_task);
+ ailp->xa_target = 0;
+ ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild");
+ if (IS_ERR(ailp->xa_task))
+ return -PTR_ERR(ailp->xa_task);
return 0;
}
void
xfsaild_stop(
- xfs_mount_t *mp)
+ struct xfs_ail *ailp)
{
- kthread_stop(mp->m_ail.xa_task);
+ kthread_stop(ailp->xa_task);
}
-
+/* Catch misguided souls that try to use this interface on XFS */
STATIC struct inode *
xfs_fs_alloc_inode(
struct super_block *sb)
{
- return kmem_zone_alloc(xfs_vnode_zone, KM_SLEEP);
+ BUG();
+ return NULL;
}
+/*
+ * Now that the generic code is guaranteed not to be accessing
+ * the linux inode, we can reclaim the inode.
+ */
STATIC void
xfs_fs_destroy_inode(
- struct inode *inode)
+ struct inode *inode)
{
- kmem_zone_free(xfs_vnode_zone, inode);
+ xfs_inode_t *ip = XFS_I(inode);
+
+ XFS_STATS_INC(vn_reclaim);
+ if (xfs_reclaim(ip))
+ panic("%s: cannot reclaim 0x%p\n", __func__, inode);
}
+/*
+ * Slab object creation initialisation for the XFS inode.
+ * This covers only the idempotent fields in the XFS inode;
+ * all other fields need to be initialised on allocation
+ * from the slab. This avoids the need to repeatedly initialise
+ * fields in the xfs inode that are left in the initialised state
+ * when freeing the inode.
+ */
STATIC void
xfs_fs_inode_init_once(
- void *vnode)
+ void *inode)
{
- inode_init_once((struct inode *)vnode);
+ struct xfs_inode *ip = inode;
+
+ memset(ip, 0, sizeof(struct xfs_inode));
+
+ /* vfs inode */
+ inode_init_once(VFS_I(ip));
+
+ /* xfs inode */
+ atomic_set(&ip->i_iocount, 0);
+ atomic_set(&ip->i_pincount, 0);
+ spin_lock_init(&ip->i_flags_lock);
+ init_waitqueue_head(&ip->i_ipin_wait);
+ /*
+ * Because we want to use a counting completion, complete
+ * the flush completion once to allow a single access to
+ * the flush completion without blocking.
+ */
+ init_completion(&ip->i_flush);
+ complete(&ip->i_flush);
+
+ mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
+ "xfsino", ip->i_ino);
+ mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
}
/*
@@ -898,21 +989,26 @@ xfs_fs_write_inode(
struct inode *inode,
int sync)
{
+ struct xfs_inode *ip = XFS_I(inode);
int error = 0;
int flags = 0;
- xfs_itrace_entry(XFS_I(inode));
+ xfs_itrace_entry(ip);
if (sync) {
- filemap_fdatawait(inode->i_mapping);
+ error = xfs_wait_on_pages(ip, 0, -1);
+ if (error)
+ goto out_error;
flags |= FLUSH_SYNC;
}
- error = xfs_inode_flush(XFS_I(inode), flags);
+ error = xfs_inode_flush(ip, flags);
+
+out_error:
/*
* if we failed to write out the inode then mark
* it dirty again so we'll try again later.
*/
if (error)
- mark_inode_dirty_sync(inode);
+ xfs_mark_inode_dirty_sync(ip);
return -error;
}
@@ -923,164 +1019,12 @@ xfs_fs_clear_inode(
{
xfs_inode_t *ip = XFS_I(inode);
- /*
- * ip can be null when xfs_iget_core calls xfs_idestroy if we
- * find an inode with di_mode == 0 but without IGET_CREATE set.
- */
- if (ip) {
- xfs_itrace_entry(ip);
- XFS_STATS_INC(vn_rele);
- XFS_STATS_INC(vn_remove);
- XFS_STATS_INC(vn_reclaim);
- XFS_STATS_DEC(vn_active);
-
- xfs_inactive(ip);
- xfs_iflags_clear(ip, XFS_IMODIFIED);
- if (xfs_reclaim(ip))
- panic("%s: cannot reclaim 0x%p\n", __func__, inode);
- }
-
- ASSERT(XFS_I(inode) == NULL);
-}
+ xfs_itrace_entry(ip);
+ XFS_STATS_INC(vn_rele);
+ XFS_STATS_INC(vn_remove);
+ XFS_STATS_DEC(vn_active);
-/*
- * Enqueue a work item to be picked up by the vfs xfssyncd thread.
- * Doing this has two advantages:
- * - It saves on stack space, which is tight in certain situations
- * - It can be used (with care) as a mechanism to avoid deadlocks.
- * Flushing while allocating in a full filesystem requires both.
- */
-STATIC void
-xfs_syncd_queue_work(
- struct xfs_mount *mp,
- void *data,
- void (*syncer)(struct xfs_mount *, void *))
-{
- struct bhv_vfs_sync_work *work;
-
- work = kmem_alloc(sizeof(struct bhv_vfs_sync_work), KM_SLEEP);
- INIT_LIST_HEAD(&work->w_list);
- work->w_syncer = syncer;
- work->w_data = data;
- work->w_mount = mp;
- spin_lock(&mp->m_sync_lock);
- list_add_tail(&work->w_list, &mp->m_sync_list);
- spin_unlock(&mp->m_sync_lock);
- wake_up_process(mp->m_sync_task);
-}
-
-/*
- * Flush delayed allocate data, attempting to free up reserved space
- * from existing allocations. At this point a new allocation attempt
- * has failed with ENOSPC and we are in the process of scratching our
- * heads, looking about for more room...
- */
-STATIC void
-xfs_flush_inode_work(
- struct xfs_mount *mp,
- void *arg)
-{
- struct inode *inode = arg;
- filemap_flush(inode->i_mapping);
- iput(inode);
-}
-
-void
-xfs_flush_inode(
- xfs_inode_t *ip)
-{
- struct inode *inode = VFS_I(ip);
-
- igrab(inode);
- xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work);
- delay(msecs_to_jiffies(500));
-}
-
-/*
- * This is the "bigger hammer" version of xfs_flush_inode_work...
- * (IOW, "If at first you don't succeed, use a Bigger Hammer").
- */
-STATIC void
-xfs_flush_device_work(
- struct xfs_mount *mp,
- void *arg)
-{
- struct inode *inode = arg;
- sync_blockdev(mp->m_super->s_bdev);
- iput(inode);
-}
-
-void
-xfs_flush_device(
- xfs_inode_t *ip)
-{
- struct inode *inode = VFS_I(ip);
-
- igrab(inode);
- xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_device_work);
- delay(msecs_to_jiffies(500));
- xfs_log_force(ip->i_mount, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
-}
-
-STATIC void
-xfs_sync_worker(
- struct xfs_mount *mp,
- void *unused)
-{
- int error;
-
- if (!(mp->m_flags & XFS_MOUNT_RDONLY))
- error = xfs_sync(mp, SYNC_FSDATA | SYNC_BDFLUSH | SYNC_ATTR);
- mp->m_sync_seq++;
- wake_up(&mp->m_wait_single_sync_task);
-}
-
-STATIC int
-xfssyncd(
- void *arg)
-{
- struct xfs_mount *mp = arg;
- long timeleft;
- bhv_vfs_sync_work_t *work, *n;
- LIST_HEAD (tmp);
-
- set_freezable();
- timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
- for (;;) {
- timeleft = schedule_timeout_interruptible(timeleft);
- /* swsusp */
- try_to_freeze();
- if (kthread_should_stop() && list_empty(&mp->m_sync_list))
- break;
-
- spin_lock(&mp->m_sync_lock);
- /*
- * We can get woken by laptop mode, to do a sync -
- * that's the (only!) case where the list would be
- * empty with time remaining.
- */
- if (!timeleft || list_empty(&mp->m_sync_list)) {
- if (!timeleft)
- timeleft = xfs_syncd_centisecs *
- msecs_to_jiffies(10);
- INIT_LIST_HEAD(&mp->m_sync_work.w_list);
- list_add_tail(&mp->m_sync_work.w_list,
- &mp->m_sync_list);
- }
- list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list)
- list_move(&work->w_list, &tmp);
- spin_unlock(&mp->m_sync_lock);
-
- list_for_each_entry_safe(work, n, &tmp, w_list) {
- (*work->w_syncer)(mp, work->w_data);
- list_del(&work->w_list);
- if (work == &mp->m_sync_work)
- continue;
- kmem_free(work);
- }
- }
-
- return 0;
+ xfs_inactive(ip);
}
STATIC void
@@ -1099,11 +1043,9 @@ xfs_fs_put_super(
struct xfs_mount *mp = XFS_M(sb);
struct xfs_inode *rip = mp->m_rootip;
int unmount_event_flags = 0;
- int error;
- kthread_stop(mp->m_sync_task);
-
- xfs_sync(mp, SYNC_ATTR | SYNC_DELWRI);
+ xfs_syncd_stop(mp);
+ xfs_sync_inodes(mp, SYNC_ATTR|SYNC_DELWRI);
#ifdef HAVE_DMAPI
if (mp->m_flags & XFS_MOUNT_DMAPI) {
@@ -1128,18 +1070,6 @@ xfs_fs_put_super(
xfs_filestream_unmount(mp);
XFS_bflush(mp->m_ddev_targp);
- error = xfs_unmount_flush(mp, 0);
- WARN_ON(error);
-
- /*
- * If we're forcing a shutdown, typically because of a media error,
- * we want to make sure we invalidate dirty pages that belong to
- * referenced vnodes as well.
- */
- if (XFS_FORCED_SHUTDOWN(mp)) {
- error = xfs_sync(mp, SYNC_WAIT | SYNC_CLOSE);
- ASSERT(error != EFSCORRUPTED);
- }
if (mp->m_flags & XFS_MOUNT_DMAPI) {
XFS_SEND_UNMOUNT(mp, rip, DM_RIGHT_NULL, 0, 0,
@@ -1161,7 +1091,7 @@ xfs_fs_write_super(
struct super_block *sb)
{
if (!(sb->s_flags & MS_RDONLY))
- xfs_sync(XFS_M(sb), SYNC_FSDATA);
+ xfs_sync_fsdata(XFS_M(sb), 0);
sb->s_dirt = 0;
}
@@ -1172,7 +1102,6 @@ xfs_fs_sync_super(
{
struct xfs_mount *mp = XFS_M(sb);
int error;
- int flags;
/*
* Treat a sync operation like a freeze. This is to work
@@ -1186,20 +1115,10 @@ xfs_fs_sync_super(
* dirty the Linux inode until after the transaction I/O
* completes.
*/
- if (wait || unlikely(sb->s_frozen == SB_FREEZE_WRITE)) {
- /*
- * First stage of freeze - no more writers will make progress
- * now we are here, so we flush delwri and delalloc buffers
- * here, then wait for all I/O to complete. Data is frozen at
- * that point. Metadata is not frozen, transactions can still
- * occur here so don't bother flushing the buftarg (i.e
- * SYNC_QUIESCE) because it'll just get dirty again.
- */
- flags = SYNC_DATA_QUIESCE;
- } else
- flags = SYNC_FSDATA;
-
- error = xfs_sync(mp, flags);
+ if (wait || unlikely(sb->s_frozen == SB_FREEZE_WRITE))
+ error = xfs_quiesce_data(mp);
+ else
+ error = xfs_sync_fsdata(mp, 0);
sb->s_dirt = 0;
if (unlikely(laptop_mode)) {
@@ -1337,9 +1256,8 @@ xfs_fs_remount(
/* rw -> ro */
if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
- xfs_filestream_flush(mp);
- xfs_sync(mp, SYNC_DATA_QUIESCE);
- xfs_attr_quiesce(mp);
+ xfs_quiesce_data(mp);
+ xfs_quiesce_attr(mp);
mp->m_flags |= XFS_MOUNT_RDONLY;
}
@@ -1348,7 +1266,7 @@ xfs_fs_remount(
/*
* Second stage of a freeze. The data is already frozen so we only
- * need to take care of themetadata. Once that's done write a dummy
+ * need to take care of the metadata. Once that's done write a dummy
* record to dirty the log in case of a crash while frozen.
*/
STATIC void
@@ -1357,7 +1275,7 @@ xfs_fs_lockfs(
{
struct xfs_mount *mp = XFS_M(sb);
- xfs_attr_quiesce(mp);
+ xfs_quiesce_attr(mp);
xfs_fs_log_dummy(mp);
}
@@ -1422,175 +1340,28 @@ xfs_fs_setxquota(
/*
* This function fills in xfs_mount_t fields based on mount args.
- * Note: the superblock has _not_ yet been read in.
- */
-STATIC int
-xfs_start_flags(
- struct xfs_mount_args *ap,
- struct xfs_mount *mp)
-{
- int error;
-
- /* Values are in BBs */
- if ((ap->flags & XFSMNT_NOALIGN) != XFSMNT_NOALIGN) {
- /*
- * At this point the superblock has not been read
- * in, therefore we do not know the block size.
- * Before the mount call ends we will convert
- * these to FSBs.
- */
- mp->m_dalign = ap->sunit;
- mp->m_swidth = ap->swidth;
- }
-
- if (ap->logbufs != -1 &&
- ap->logbufs != 0 &&
- (ap->logbufs < XLOG_MIN_ICLOGS ||
- ap->logbufs > XLOG_MAX_ICLOGS)) {
- cmn_err(CE_WARN,
- "XFS: invalid logbufs value: %d [not %d-%d]",
- ap->logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS);
- return XFS_ERROR(EINVAL);
- }
- mp->m_logbufs = ap->logbufs;
- if (ap->logbufsize != -1 &&
- ap->logbufsize != 0 &&
- (ap->logbufsize < XLOG_MIN_RECORD_BSIZE ||
- ap->logbufsize > XLOG_MAX_RECORD_BSIZE ||
- !is_power_of_2(ap->logbufsize))) {
- cmn_err(CE_WARN,
- "XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]",
- ap->logbufsize);
- return XFS_ERROR(EINVAL);
- }
-
- error = ENOMEM;
-
- mp->m_logbsize = ap->logbufsize;
- mp->m_fsname_len = strlen(ap->fsname) + 1;
-
- mp->m_fsname = kstrdup(ap->fsname, GFP_KERNEL);
- if (!mp->m_fsname)
- goto out;
-
- if (ap->rtname[0]) {
- mp->m_rtname = kstrdup(ap->rtname, GFP_KERNEL);
- if (!mp->m_rtname)
- goto out_free_fsname;
-
- }
-
- if (ap->logname[0]) {
- mp->m_logname = kstrdup(ap->logname, GFP_KERNEL);
- if (!mp->m_logname)
- goto out_free_rtname;
- }
-
- if (ap->flags & XFSMNT_WSYNC)
- mp->m_flags |= XFS_MOUNT_WSYNC;
-#if XFS_BIG_INUMS
- if (ap->flags & XFSMNT_INO64) {
- mp->m_flags |= XFS_MOUNT_INO64;
- mp->m_inoadd = XFS_INO64_OFFSET;
- }
-#endif
- if (ap->flags & XFSMNT_RETERR)
- mp->m_flags |= XFS_MOUNT_RETERR;
- if (ap->flags & XFSMNT_NOALIGN)
- mp->m_flags |= XFS_MOUNT_NOALIGN;
- if (ap->flags & XFSMNT_SWALLOC)
- mp->m_flags |= XFS_MOUNT_SWALLOC;
- if (ap->flags & XFSMNT_OSYNCISOSYNC)
- mp->m_flags |= XFS_MOUNT_OSYNCISOSYNC;
- if (ap->flags & XFSMNT_32BITINODES)
- mp->m_flags |= XFS_MOUNT_32BITINODES;
-
- if (ap->flags & XFSMNT_IOSIZE) {
- if (ap->iosizelog > XFS_MAX_IO_LOG ||
- ap->iosizelog < XFS_MIN_IO_LOG) {
- cmn_err(CE_WARN,
- "XFS: invalid log iosize: %d [not %d-%d]",
- ap->iosizelog, XFS_MIN_IO_LOG,
- XFS_MAX_IO_LOG);
- return XFS_ERROR(EINVAL);
- }
-
- mp->m_flags |= XFS_MOUNT_DFLT_IOSIZE;
- mp->m_readio_log = mp->m_writeio_log = ap->iosizelog;
- }
-
- if (ap->flags & XFSMNT_IKEEP)
- mp->m_flags |= XFS_MOUNT_IKEEP;
- if (ap->flags & XFSMNT_DIRSYNC)
- mp->m_flags |= XFS_MOUNT_DIRSYNC;
- if (ap->flags & XFSMNT_ATTR2)
- mp->m_flags |= XFS_MOUNT_ATTR2;
- if (ap->flags & XFSMNT_NOATTR2)
- mp->m_flags |= XFS_MOUNT_NOATTR2;
-
- if (ap->flags2 & XFSMNT2_COMPAT_IOSIZE)
- mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE;
-
- /*
- * no recovery flag requires a read-only mount
- */
- if (ap->flags & XFSMNT_NORECOVERY) {
- if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
- cmn_err(CE_WARN,
- "XFS: tried to mount a FS read-write without recovery!");
- return XFS_ERROR(EINVAL);
- }
- mp->m_flags |= XFS_MOUNT_NORECOVERY;
- }
-
- if (ap->flags & XFSMNT_NOUUID)
- mp->m_flags |= XFS_MOUNT_NOUUID;
- if (ap->flags & XFSMNT_BARRIER)
- mp->m_flags |= XFS_MOUNT_BARRIER;
- else
- mp->m_flags &= ~XFS_MOUNT_BARRIER;
-
- if (ap->flags2 & XFSMNT2_FILESTREAMS)
- mp->m_flags |= XFS_MOUNT_FILESTREAMS;
-
- if (ap->flags & XFSMNT_DMAPI)
- mp->m_flags |= XFS_MOUNT_DMAPI;
- return 0;
-
-
- out_free_rtname:
- kfree(mp->m_rtname);
- out_free_fsname:
- kfree(mp->m_fsname);
- out:
- return error;
-}
-
-/*
- * This function fills in xfs_mount_t fields based on mount args.
* Note: the superblock _has_ now been read in.
*/
STATIC int
xfs_finish_flags(
- struct xfs_mount_args *ap,
struct xfs_mount *mp)
{
int ronly = (mp->m_flags & XFS_MOUNT_RDONLY);
/* Fail a mount where the logbuf is smaller then the log stripe */
if (xfs_sb_version_haslogv2(&mp->m_sb)) {
- if ((ap->logbufsize <= 0) &&
- (mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE)) {
+ if (mp->m_logbsize <= 0 &&
+ mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE) {
mp->m_logbsize = mp->m_sb.sb_logsunit;
- } else if (ap->logbufsize > 0 &&
- ap->logbufsize < mp->m_sb.sb_logsunit) {
+ } else if (mp->m_logbsize > 0 &&
+ mp->m_logbsize < mp->m_sb.sb_logsunit) {
cmn_err(CE_WARN,
"XFS: logbuf size must be greater than or equal to log stripe size");
return XFS_ERROR(EINVAL);
}
} else {
/* Fail a mount if the logbuf is larger than 32K */
- if (ap->logbufsize > XLOG_BIG_RECORD_BSIZE) {
+ if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) {
cmn_err(CE_WARN,
"XFS: logbuf size for version 1 logs must be 16K or 32K");
return XFS_ERROR(EINVAL);
@@ -1602,7 +1373,7 @@ xfs_finish_flags(
* told by noattr2 to turn it off
*/
if (xfs_sb_version_hasattr2(&mp->m_sb) &&
- !(ap->flags & XFSMNT_NOATTR2))
+ !(mp->m_flags & XFS_MOUNT_NOATTR2))
mp->m_flags |= XFS_MOUNT_ATTR2;
/*
@@ -1614,48 +1385,6 @@ xfs_finish_flags(
return XFS_ERROR(EROFS);
}
- /*
- * check for shared mount.
- */
- if (ap->flags & XFSMNT_SHARED) {
- if (!xfs_sb_version_hasshared(&mp->m_sb))
- return XFS_ERROR(EINVAL);
-
- /*
- * For IRIX 6.5, shared mounts must have the shared
- * version bit set, have the persistent readonly
- * field set, must be version 0 and can only be mounted
- * read-only.
- */
- if (!ronly || !(mp->m_sb.sb_flags & XFS_SBF_READONLY) ||
- (mp->m_sb.sb_shared_vn != 0))
- return XFS_ERROR(EINVAL);
-
- mp->m_flags |= XFS_MOUNT_SHARED;
-
- /*
- * Shared XFS V0 can't deal with DMI. Return EINVAL.
- */
- if (mp->m_sb.sb_shared_vn == 0 && (ap->flags & XFSMNT_DMAPI))
- return XFS_ERROR(EINVAL);
- }
-
- if (ap->flags & XFSMNT_UQUOTA) {
- mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
- if (ap->flags & XFSMNT_UQUOTAENF)
- mp->m_qflags |= XFS_UQUOTA_ENFD;
- }
-
- if (ap->flags & XFSMNT_GQUOTA) {
- mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
- if (ap->flags & XFSMNT_GQUOTAENF)
- mp->m_qflags |= XFS_OQUOTA_ENFD;
- } else if (ap->flags & XFSMNT_PQUOTA) {
- mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
- if (ap->flags & XFSMNT_PQUOTAENF)
- mp->m_qflags |= XFS_OQUOTA_ENFD;
- }
-
return 0;
}
@@ -1667,19 +1396,14 @@ xfs_fs_fill_super(
{
struct inode *root;
struct xfs_mount *mp = NULL;
- struct xfs_mount_args *args;
int flags = 0, error = ENOMEM;
-
- args = xfs_args_allocate(sb, silent);
- if (!args)
- return -ENOMEM;
+ char *mtpt = NULL;
mp = kzalloc(sizeof(struct xfs_mount), GFP_KERNEL);
if (!mp)
- goto out_free_args;
+ goto out;
spin_lock_init(&mp->m_sb_lock);
- mutex_init(&mp->m_ilock);
mutex_init(&mp->m_growlock);
atomic_set(&mp->m_active_trans, 0);
INIT_LIST_HEAD(&mp->m_sync_list);
@@ -1689,12 +1413,9 @@ xfs_fs_fill_super(
mp->m_super = sb;
sb->s_fs_info = mp;
- if (sb->s_flags & MS_RDONLY)
- mp->m_flags |= XFS_MOUNT_RDONLY;
-
- error = xfs_parseargs(mp, (char *)data, args, 0);
+ error = xfs_parseargs(mp, (char *)data, &mtpt);
if (error)
- goto out_free_mp;
+ goto out_free_fsname;
sb_min_blocksize(sb, BBSIZE);
sb->s_xattr = xfs_xattr_handlers;
@@ -1702,33 +1423,28 @@ xfs_fs_fill_super(
sb->s_qcop = &xfs_quotactl_operations;
sb->s_op = &xfs_super_operations;
- error = xfs_dmops_get(mp, args);
+ error = xfs_dmops_get(mp);
if (error)
- goto out_free_mp;
- error = xfs_qmops_get(mp, args);
+ goto out_free_fsname;
+ error = xfs_qmops_get(mp);
if (error)
goto out_put_dmops;
- if (args->flags & XFSMNT_QUIET)
+ if (silent)
flags |= XFS_MFSI_QUIET;
- error = xfs_open_devices(mp, args);
+ error = xfs_open_devices(mp);
if (error)
goto out_put_qmops;
if (xfs_icsb_init_counters(mp))
mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB;
- /*
- * Setup flags based on mount(2) options and then the superblock
- */
- error = xfs_start_flags(args, mp);
- if (error)
- goto out_free_fsname;
error = xfs_readsb(mp, flags);
if (error)
- goto out_free_fsname;
- error = xfs_finish_flags(args, mp);
+ goto out_destroy_counters;
+
+ error = xfs_finish_flags(mp);
if (error)
goto out_free_sb;
@@ -1747,7 +1463,7 @@ xfs_fs_fill_super(
if (error)
goto out_filestream_unmount;
- XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, args->mtpt, args->fsname);
+ XFS_SEND_MOUNT(mp, DM_RIGHT_NULL, mtpt, mp->m_fsname);
sb->s_dirt = 1;
sb->s_magic = XFS_SB_MAGIC;
@@ -1772,35 +1488,31 @@ xfs_fs_fill_super(
goto fail_vnrele;
}
- mp->m_sync_work.w_syncer = xfs_sync_worker;
- mp->m_sync_work.w_mount = mp;
- mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd");
- if (IS_ERR(mp->m_sync_task)) {
- error = -PTR_ERR(mp->m_sync_task);
+ error = xfs_syncd_init(mp);
+ if (error)
goto fail_vnrele;
- }
- xfs_itrace_exit(XFS_I(sb->s_root->d_inode));
+ kfree(mtpt);
- kfree(args);
+ xfs_itrace_exit(XFS_I(sb->s_root->d_inode));
return 0;
out_filestream_unmount:
xfs_filestream_unmount(mp);
out_free_sb:
xfs_freesb(mp);
- out_free_fsname:
- xfs_free_fsname(mp);
+ out_destroy_counters:
xfs_icsb_destroy_counters(mp);
xfs_close_devices(mp);
out_put_qmops:
xfs_qmops_put(mp);
out_put_dmops:
xfs_dmops_put(mp);
- out_free_mp:
+ out_free_fsname:
+ xfs_free_fsname(mp);
+ kfree(mtpt);
kfree(mp);
- out_free_args:
- kfree(args);
+ out:
return -error;
fail_vnrele:
@@ -1820,8 +1532,6 @@ xfs_fs_fill_super(
xfs_filestream_unmount(mp);
XFS_bflush(mp->m_ddev_targp);
- error = xfs_unmount_flush(mp, 0);
- WARN_ON(error);
xfs_unmountfs(mp);
goto out_free_sb;
@@ -1882,10 +1592,19 @@ xfs_alloc_trace_bufs(void)
if (!xfs_bmap_trace_buf)
goto out_free_alloc_trace;
#endif
-#ifdef XFS_BMBT_TRACE
+#ifdef XFS_BTREE_TRACE
+ xfs_allocbt_trace_buf = ktrace_alloc(XFS_ALLOCBT_TRACE_SIZE,
+ KM_MAYFAIL);
+ if (!xfs_allocbt_trace_buf)
+ goto out_free_bmap_trace;
+
+ xfs_inobt_trace_buf = ktrace_alloc(XFS_INOBT_TRACE_SIZE, KM_MAYFAIL);
+ if (!xfs_inobt_trace_buf)
+ goto out_free_allocbt_trace;
+
xfs_bmbt_trace_buf = ktrace_alloc(XFS_BMBT_TRACE_SIZE, KM_MAYFAIL);
if (!xfs_bmbt_trace_buf)
- goto out_free_bmap_trace;
+ goto out_free_inobt_trace;
#endif
#ifdef XFS_ATTR_TRACE
xfs_attr_trace_buf = ktrace_alloc(XFS_ATTR_TRACE_SIZE, KM_MAYFAIL);
@@ -1907,8 +1626,12 @@ xfs_alloc_trace_bufs(void)
ktrace_free(xfs_attr_trace_buf);
out_free_bmbt_trace:
#endif
-#ifdef XFS_BMBT_TRACE
+#ifdef XFS_BTREE_TRACE
ktrace_free(xfs_bmbt_trace_buf);
+ out_free_inobt_trace:
+ ktrace_free(xfs_inobt_trace_buf);
+ out_free_allocbt_trace:
+ ktrace_free(xfs_allocbt_trace_buf);
out_free_bmap_trace:
#endif
#ifdef XFS_BMAP_TRACE
@@ -1931,8 +1654,10 @@ xfs_free_trace_bufs(void)
#ifdef XFS_ATTR_TRACE
ktrace_free(xfs_attr_trace_buf);
#endif
-#ifdef XFS_BMBT_TRACE
+#ifdef XFS_BTREE_TRACE
ktrace_free(xfs_bmbt_trace_buf);
+ ktrace_free(xfs_inobt_trace_buf);
+ ktrace_free(xfs_allocbt_trace_buf);
#endif
#ifdef XFS_BMAP_TRACE
ktrace_free(xfs_bmap_trace_buf);
@@ -1945,16 +1670,10 @@ xfs_free_trace_bufs(void)
STATIC int __init
xfs_init_zones(void)
{
- xfs_vnode_zone = kmem_zone_init_flags(sizeof(struct inode), "xfs_vnode",
- KM_ZONE_HWALIGN | KM_ZONE_RECLAIM |
- KM_ZONE_SPREAD,
- xfs_fs_inode_init_once);
- if (!xfs_vnode_zone)
- goto out;
xfs_ioend_zone = kmem_zone_init(sizeof(xfs_ioend_t), "xfs_ioend");
if (!xfs_ioend_zone)
- goto out_destroy_vnode_zone;
+ goto out;
xfs_ioend_pool = mempool_create_slab_pool(4 * MAX_BUF_PER_PAGE,
xfs_ioend_zone);
@@ -1970,6 +1689,7 @@ xfs_init_zones(void)
"xfs_bmap_free_item");
if (!xfs_bmap_free_item_zone)
goto out_destroy_log_ticket_zone;
+
xfs_btree_cur_zone = kmem_zone_init(sizeof(xfs_btree_cur_t),
"xfs_btree_cur");
if (!xfs_btree_cur_zone)
@@ -2017,8 +1737,8 @@ xfs_init_zones(void)
xfs_inode_zone =
kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode",
- KM_ZONE_HWALIGN | KM_ZONE_RECLAIM |
- KM_ZONE_SPREAD, NULL);
+ KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD,
+ xfs_fs_inode_init_once);
if (!xfs_inode_zone)
goto out_destroy_efi_zone;
@@ -2066,8 +1786,6 @@ xfs_init_zones(void)
mempool_destroy(xfs_ioend_pool);
out_destroy_ioend_zone:
kmem_zone_destroy(xfs_ioend_zone);
- out_destroy_vnode_zone:
- kmem_zone_destroy(xfs_vnode_zone);
out:
return -ENOMEM;
}
@@ -2092,7 +1810,6 @@ xfs_destroy_zones(void)
kmem_zone_destroy(xfs_log_ticket_zone);
mempool_destroy(xfs_ioend_pool);
kmem_zone_destroy(xfs_ioend_zone);
- kmem_zone_destroy(xfs_vnode_zone);
}
@@ -2100,13 +1817,12 @@ STATIC int __init
init_xfs_fs(void)
{
int error;
- static char message[] __initdata = KERN_INFO \
- XFS_VERSION_STRING " with " XFS_BUILD_OPTIONS " enabled\n";
- printk(message);
+ printk(KERN_INFO XFS_VERSION_STRING " with "
+ XFS_BUILD_OPTIONS " enabled\n");
ktrace_init(64);
- vn_init();
+ xfs_ioend_init();
xfs_dir_startup();
error = xfs_init_zones();
diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h
index fe2ef4e6a0f9..d5d776d4cd67 100644
--- a/fs/xfs/linux-2.6/xfs_super.h
+++ b/fs/xfs/linux-2.6/xfs_super.h
@@ -20,24 +20,12 @@
#include <linux/exportfs.h>
-#ifdef CONFIG_XFS_DMAPI
-# define vfs_insertdmapi(vfs) vfs_insertops(vfsp, &xfs_dmops)
-# define vfs_initdmapi() dmapi_init()
-# define vfs_exitdmapi() dmapi_uninit()
-#else
-# define vfs_insertdmapi(vfs) do { } while (0)
-# define vfs_initdmapi() do { } while (0)
-# define vfs_exitdmapi() do { } while (0)
-#endif
-
#ifdef CONFIG_XFS_QUOTA
-# define vfs_insertquota(vfs) vfs_insertops(vfsp, &xfs_qmops)
extern void xfs_qm_init(void);
extern void xfs_qm_exit(void);
# define vfs_initquota() xfs_qm_init()
# define vfs_exitquota() xfs_qm_exit()
#else
-# define vfs_insertquota(vfs) do { } while (0)
# define vfs_initquota() do { } while (0)
# define vfs_exitquota() do { } while (0)
#endif
@@ -101,9 +89,6 @@ struct block_device;
extern __uint64_t xfs_max_file_offset(unsigned int);
-extern void xfs_flush_inode(struct xfs_inode *);
-extern void xfs_flush_device(struct xfs_inode *);
-
extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
extern const struct export_operations xfs_export_operations;
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
new file mode 100644
index 000000000000..2ed035354c26
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -0,0 +1,762 @@
+/*
+ * Copyright (c) 2000-2005 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_dmapi.h"
+#include "xfs_mount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_btree.h"
+#include "xfs_dir2_sf.h"
+#include "xfs_attr_sf.h"
+#include "xfs_inode.h"
+#include "xfs_dinode.h"
+#include "xfs_error.h"
+#include "xfs_mru_cache.h"
+#include "xfs_filestream.h"
+#include "xfs_vnodeops.h"
+#include "xfs_utils.h"
+#include "xfs_buf_item.h"
+#include "xfs_inode_item.h"
+#include "xfs_rw.h"
+
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+
+/*
+ * Sync all the inodes in the given AG according to the
+ * direction given by the flags.
+ */
+STATIC int
+xfs_sync_inodes_ag(
+ xfs_mount_t *mp,
+ int ag,
+ int flags)
+{
+ xfs_perag_t *pag = &mp->m_perag[ag];
+ int nr_found;
+ uint32_t first_index = 0;
+ int error = 0;
+ int last_error = 0;
+ int fflag = XFS_B_ASYNC;
+
+ if (flags & SYNC_DELWRI)
+ fflag = XFS_B_DELWRI;
+ if (flags & SYNC_WAIT)
+ fflag = 0; /* synchronous overrides all */
+
+ do {
+ struct inode *inode;
+ xfs_inode_t *ip = NULL;
+ int lock_flags = XFS_ILOCK_SHARED;
+
+ /*
+ * use a gang lookup to find the next inode in the tree
+ * as the tree is sparse and a gang lookup walks to find
+ * the number of objects requested.
+ */
+ read_lock(&pag->pag_ici_lock);
+ nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
+ (void**)&ip, first_index, 1);
+
+ if (!nr_found) {
+ read_unlock(&pag->pag_ici_lock);
+ break;
+ }
+
+ /*
+ * Update the index for the next lookup. Catch overflows
+ * into the next AG range which can occur if we have inodes
+ * in the last block of the AG and we are currently
+ * pointing to the last inode.
+ */
+ first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+ if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) {
+ read_unlock(&pag->pag_ici_lock);
+ break;
+ }
+
+ /* nothing to sync during shutdown */
+ if (XFS_FORCED_SHUTDOWN(mp)) {
+ read_unlock(&pag->pag_ici_lock);
+ return 0;
+ }
+
+ /*
+ * If we can't get a reference on the inode, it must be
+ * in reclaim. Leave it for the reclaim code to flush.
+ */
+ inode = VFS_I(ip);
+ if (!igrab(inode)) {
+ read_unlock(&pag->pag_ici_lock);
+ continue;
+ }
+ read_unlock(&pag->pag_ici_lock);
+
+ /* avoid new or bad inodes */
+ if (is_bad_inode(inode) ||
+ xfs_iflags_test(ip, XFS_INEW)) {
+ IRELE(ip);
+ continue;
+ }
+
+ /*
+ * If we have to flush data or wait for I/O completion
+ * we need to hold the iolock.
+ */
+ if ((flags & SYNC_DELWRI) && VN_DIRTY(inode)) {
+ xfs_ilock(ip, XFS_IOLOCK_SHARED);
+ lock_flags |= XFS_IOLOCK_SHARED;
+ error = xfs_flush_pages(ip, 0, -1, fflag, FI_NONE);
+ if (flags & SYNC_IOWAIT)
+ xfs_ioend_wait(ip);
+ }
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+
+ if ((flags & SYNC_ATTR) && !xfs_inode_clean(ip)) {
+ if (flags & SYNC_WAIT) {
+ xfs_iflock(ip);
+ if (!xfs_inode_clean(ip))
+ error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
+ else
+ xfs_ifunlock(ip);
+ } else if (xfs_iflock_nowait(ip)) {
+ if (!xfs_inode_clean(ip))
+ error = xfs_iflush(ip, XFS_IFLUSH_DELWRI);
+ else
+ xfs_ifunlock(ip);
+ }
+ }
+ xfs_iput(ip, lock_flags);
+
+ if (error)
+ last_error = error;
+ /*
+ * bail out if the filesystem is corrupted.
+ */
+ if (error == EFSCORRUPTED)
+ return XFS_ERROR(error);
+
+ } while (nr_found);
+
+ return last_error;
+}
+
+int
+xfs_sync_inodes(
+ xfs_mount_t *mp,
+ int flags)
+{
+ int error;
+ int last_error;
+ int i;
+ int lflags = XFS_LOG_FORCE;
+
+ if (mp->m_flags & XFS_MOUNT_RDONLY)
+ return 0;
+ error = 0;
+ last_error = 0;
+
+ if (flags & SYNC_WAIT)
+ lflags |= XFS_LOG_SYNC;
+
+ for (i = 0; i < mp->m_sb.sb_agcount; i++) {
+ if (!mp->m_perag[i].pag_ici_init)
+ continue;
+ error = xfs_sync_inodes_ag(mp, i, flags);
+ if (error)
+ last_error = error;
+ if (error == EFSCORRUPTED)
+ break;
+ }
+ if (flags & SYNC_DELWRI)
+ xfs_log_force(mp, 0, lflags);
+
+ return XFS_ERROR(last_error);
+}
+
+STATIC int
+xfs_commit_dummy_trans(
+ struct xfs_mount *mp,
+ uint log_flags)
+{
+ struct xfs_inode *ip = mp->m_rootip;
+ struct xfs_trans *tp;
+ int error;
+
+ /*
+ * Put a dummy transaction in the log to tell recovery
+ * that all others are OK.
+ */
+ tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
+ error = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), 0, 0, 0);
+ if (error) {
+ xfs_trans_cancel(tp, 0);
+ return error;
+ }
+
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+ xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+ xfs_trans_ihold(tp, ip);
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ /* XXX(hch): ignoring the error here.. */
+ error = xfs_trans_commit(tp, 0);
+
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+ xfs_log_force(mp, 0, log_flags);
+ return 0;
+}
+
+int
+xfs_sync_fsdata(
+ struct xfs_mount *mp,
+ int flags)
+{
+ struct xfs_buf *bp;
+ struct xfs_buf_log_item *bip;
+ int error = 0;
+
+ /*
+ * If this is xfssyncd() then only sync the superblock if we can
+ * lock it without sleeping and it is not pinned.
+ */
+ if (flags & SYNC_BDFLUSH) {
+ ASSERT(!(flags & SYNC_WAIT));
+
+ bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
+ if (!bp)
+ goto out;
+
+ bip = XFS_BUF_FSPRIVATE(bp, struct xfs_buf_log_item *);
+ if (!bip || !xfs_buf_item_dirty(bip) || XFS_BUF_ISPINNED(bp))
+ goto out_brelse;
+ } else {
+ bp = xfs_getsb(mp, 0);
+
+ /*
+ * If the buffer is pinned then push on the log so we won't
+ * get stuck waiting in the write for someone, maybe
+ * ourselves, to flush the log.
+ *
+ * Even though we just pushed the log above, we did not have
+ * the superblock buffer locked at that point so it can
+ * become pinned in between there and here.
+ */
+ if (XFS_BUF_ISPINNED(bp))
+ xfs_log_force(mp, 0, XFS_LOG_FORCE);
+ }
+
+
+ if (flags & SYNC_WAIT)
+ XFS_BUF_UNASYNC(bp);
+ else
+ XFS_BUF_ASYNC(bp);
+
+ return xfs_bwrite(mp, bp);
+
+ out_brelse:
+ xfs_buf_relse(bp);
+ out:
+ return error;
+}
+
+/*
+ * When remounting a filesystem read-only or freezing the filesystem, we have
+ * two phases to execute. This first phase is syncing the data before we
+ * quiesce the filesystem, and the second is flushing all the inodes out after
+ * we've waited for all the transactions created by the first phase to
+ * complete. The second phase ensures that the inodes are written to their
+ * location on disk rather than just existing in transactions in the log. This
+ * means after a quiesce there is no log replay required to write the inodes to
+ * disk (this is the main difference between a sync and a quiesce).
+ */
+/*
+ * First stage of freeze - no writers will make progress now we are here,
+ * so we flush delwri and delalloc buffers here, then wait for all I/O to
+ * complete. Data is frozen at that point. Metadata is not frozen,
+ * transactions can still occur here so don't bother flushing the buftarg
+ * because it'll just get dirty again.
+ */
+int
+xfs_quiesce_data(
+ struct xfs_mount *mp)
+{
+ int error;
+
+ /* push non-blocking */
+ xfs_sync_inodes(mp, SYNC_DELWRI|SYNC_BDFLUSH);
+ XFS_QM_DQSYNC(mp, SYNC_BDFLUSH);
+ xfs_filestream_flush(mp);
+
+ /* push and block */
+ xfs_sync_inodes(mp, SYNC_DELWRI|SYNC_WAIT|SYNC_IOWAIT);
+ XFS_QM_DQSYNC(mp, SYNC_WAIT);
+
+ /* write superblock and hoover up shutdown errors */
+ error = xfs_sync_fsdata(mp, 0);
+
+ /* flush data-only devices */
+ if (mp->m_rtdev_targp)
+ XFS_bflush(mp->m_rtdev_targp);
+
+ return error;
+}
+
+STATIC void
+xfs_quiesce_fs(
+ struct xfs_mount *mp)
+{
+ int count = 0, pincount;
+
+ xfs_flush_buftarg(mp->m_ddev_targp, 0);
+ xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
+
+ /*
+ * This loop must run at least twice. The first instance of the loop
+ * will flush most meta data but that will generate more meta data
+ * (typically directory updates), which then must be flushed and
+ * logged before we can write the unmount record.
+ */
+ do {
+ xfs_sync_inodes(mp, SYNC_ATTR|SYNC_WAIT);
+ pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
+ if (!pincount) {
+ delay(50);
+ count++;
+ }
+ } while (count < 2);
+}
+
+/*
+ * Second stage of a quiesce. The data is already synced, now we have to take
+ * care of the metadata. New transactions are already blocked, so we need to
+ * wait for any remaining transactions to drain out before proceeding.
+ */
+void
+xfs_quiesce_attr(
+ struct xfs_mount *mp)
+{
+ int error = 0;
+
+ /* wait for all modifications to complete */
+ while (atomic_read(&mp->m_active_trans) > 0)
+ delay(100);
+
+ /* flush inodes and push all remaining buffers out to disk */
+ xfs_quiesce_fs(mp);
+
+ ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0);
+
+ /* Push the superblock and write an unmount record */
+ error = xfs_log_sbcount(mp, 1);
+ if (error)
+ xfs_fs_cmn_err(CE_WARN, mp,
+ "xfs_attr_quiesce: failed to log sb changes. "
+ "Frozen image may not be consistent.");
+ xfs_log_unmount_write(mp);
+ xfs_unmountfs_writesb(mp);
+}
+
+/*
+ * Enqueue a work item to be picked up by the vfs xfssyncd thread.
+ * Doing this has two advantages:
+ * - It saves on stack space, which is tight in certain situations
+ * - It can be used (with care) as a mechanism to avoid deadlocks.
+ * Flushing while allocating in a full filesystem requires both.
+ */
+STATIC void
+xfs_syncd_queue_work(
+ struct xfs_mount *mp,
+ void *data,
+ void (*syncer)(struct xfs_mount *, void *))
+{
+ struct bhv_vfs_sync_work *work;
+
+ work = kmem_alloc(sizeof(struct bhv_vfs_sync_work), KM_SLEEP);
+ INIT_LIST_HEAD(&work->w_list);
+ work->w_syncer = syncer;
+ work->w_data = data;
+ work->w_mount = mp;
+ spin_lock(&mp->m_sync_lock);
+ list_add_tail(&work->w_list, &mp->m_sync_list);
+ spin_unlock(&mp->m_sync_lock);
+ wake_up_process(mp->m_sync_task);
+}
+
+/*
+ * Flush delayed allocate data, attempting to free up reserved space
+ * from existing allocations. At this point a new allocation attempt
+ * has failed with ENOSPC and we are in the process of scratching our
+ * heads, looking about for more room...
+ */
+STATIC void
+xfs_flush_inode_work(
+ struct xfs_mount *mp,
+ void *arg)
+{
+ struct inode *inode = arg;
+ filemap_flush(inode->i_mapping);
+ iput(inode);
+}
+
+void
+xfs_flush_inode(
+ xfs_inode_t *ip)
+{
+ struct inode *inode = VFS_I(ip);
+
+ igrab(inode);
+ xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inode_work);
+ delay(msecs_to_jiffies(500));
+}
+
+/*
+ * The "bigger hammer" counterpart of xfs_flush_inode_work() (IOW, "If at
+ * first you don't succeed, use a Bigger Hammer"): sync the block device
+ * behind the mount's superblock rather than a single inode's mapping,
+ * then drop the inode reference passed in @arg.
+ */
+STATIC void
+xfs_flush_device_work(
+	struct xfs_mount *mp,
+	void		*arg)
+{
+	struct inode	*vi = arg;
+
+	sync_blockdev(mp->m_super->s_bdev);
+	iput(vi);
+}
+
+/*
+ * Ask xfssyncd to sync the whole underlying block device, pause for a
+ * while, and then force the log synchronously.
+ *
+ * As in xfs_flush_inode(), igrab() is called so the queued work item
+ * (xfs_flush_device_work) has an inode reference to release with iput().
+ */
+void
+xfs_flush_device(
+	xfs_inode_t	*ip)
+{
+	struct xfs_mount *mp = ip->i_mount;
+	struct inode	*vi = VFS_I(ip);
+
+	igrab(vi);
+	xfs_syncd_queue_work(mp, vi, xfs_flush_device_work);
+	delay(msecs_to_jiffies(500));
+	xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE|XFS_LOG_SYNC);
+}
+
+/*
+ * Every sync period we need to unpin all items, reclaim inodes, sync
+ * quota and write out the superblock. We might need to cover the log
+ * to indicate it is idle.
+ *
+ * This is the w_syncer callback of the mount's built-in work item
+ * (mp->m_sync_work, set up in xfs_syncd_init()); the second argument is
+ * not used.  On a read-only mount none of the sync steps run.  In all
+ * cases mp->m_sync_seq is incremented and anyone sleeping on
+ * mp->m_wait_single_sync_task is woken.
+ */
+STATIC void
+xfs_sync_worker(
+ struct xfs_mount *mp,
+ void *unused)
+{
+ int error;
+
+ if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
+ xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
+ xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_DELWRI_ELSE_ASYNC);
+ /* dgc: errors ignored here */
+ /* each assignment below overwrites the last; error is never read */
+ error = XFS_QM_DQSYNC(mp, SYNC_BDFLUSH);
+ error = xfs_sync_fsdata(mp, SYNC_BDFLUSH);
+ if (xfs_log_need_covered(mp))
+ error = xfs_commit_dummy_trans(mp, XFS_LOG_FORCE);
+ }
+ mp->m_sync_seq++;
+ wake_up(&mp->m_wait_single_sync_task);
+}
+
+/*
+ * Main loop of the per-mount "xfssyncd" kernel thread (started by
+ * xfs_syncd_init() with the mount as its argument).
+ *
+ * Each iteration sleeps interruptibly for up to the remaining sync period
+ * (xfs_syncd_centisecs, converted to jiffies), then:
+ *  - exits if a stop was requested and mp->m_sync_list is empty;
+ *  - queues the mount's built-in work item (mp->m_sync_work) if the period
+ *    expired or nothing else is queued;
+ *  - moves everything on mp->m_sync_list to a private list under
+ *    mp->m_sync_lock and runs each item's w_syncer with the lock dropped.
+ * Items other than the embedded mp->m_sync_work are freed after running.
+ *
+ * Always returns 0.
+ */
+STATIC int
+xfssyncd(
+ void *arg)
+{
+ struct xfs_mount *mp = arg;
+ long timeleft;
+ bhv_vfs_sync_work_t *work, *n;
+ LIST_HEAD (tmp);
+
+ set_freezable();
+ timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10);
+ for (;;) {
+ /* returns the unexpired part of the timeout; 0 means it ran out */
+ timeleft = schedule_timeout_interruptible(timeleft);
+ /* swsusp */
+ try_to_freeze();
+ /* only stop once all queued work has been drained */
+ if (kthread_should_stop() && list_empty(&mp->m_sync_list))
+ break;
+
+ spin_lock(&mp->m_sync_lock);
+ /*
+ * We can get woken by laptop mode, to do a sync -
+ * that's the (only!) case where the list would be
+ * empty with time remaining.
+ */
+ if (!timeleft || list_empty(&mp->m_sync_list)) {
+ /* restart the period only when it actually expired */
+ if (!timeleft)
+ timeleft = xfs_syncd_centisecs *
+ msecs_to_jiffies(10);
+ INIT_LIST_HEAD(&mp->m_sync_work.w_list);
+ list_add_tail(&mp->m_sync_work.w_list,
+ &mp->m_sync_list);
+ }
+ /* take private ownership of the queue so work runs unlocked */
+ list_for_each_entry_safe(work, n, &mp->m_sync_list, w_list)
+ list_move(&work->w_list, &tmp);
+ spin_unlock(&mp->m_sync_lock);
+
+ list_for_each_entry_safe(work, n, &tmp, w_list) {
+ (*work->w_syncer)(mp, work->w_data);
+ list_del(&work->w_list);
+ /* m_sync_work is embedded in the mount, not allocated */
+ if (work == &mp->m_sync_work)
+ continue;
+ kmem_free(work);
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * Set up the mount's built-in periodic sync work item and start the
+ * "xfssyncd" kernel thread for this mount.
+ *
+ * Returns 0 on success.  If the thread cannot be created, returns the
+ * negated PTR_ERR() of the kthread_run() result; mp->m_sync_task is left
+ * holding that error pointer.
+ */
+int
+xfs_syncd_init(
+	struct xfs_mount *mp)
+{
+	bhv_vfs_sync_work_t *periodic = &mp->m_sync_work;
+
+	periodic->w_mount = mp;
+	periodic->w_syncer = xfs_sync_worker;
+
+	mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd");
+	if (!IS_ERR(mp->m_sync_task))
+		return 0;
+	return -PTR_ERR(mp->m_sync_task);
+}
+
+/*
+ * Stop the xfssyncd thread started by xfs_syncd_init() by calling
+ * kthread_stop() on mp->m_sync_task.  xfssyncd() only leaves its loop
+ * once it sees the stop request with mp->m_sync_list empty.
+ */
+void
+xfs_syncd_stop(
+ struct xfs_mount *mp)
+{
+ kthread_stop(mp->m_sync_task);
+}
+
+/*
+ * Reclaim an inode that has been marked XFS_IRECLAIMABLE.
+ *
+ * ip:        inode to reclaim
+ * locked:    non-zero if the caller already holds the inode's ILOCK
+ *            exclusively and its flush lock; zero if this function must
+ *            take them itself
+ * sync_mode: flush mode passed through to xfs_iflush()
+ *
+ * Returns 0 once the inode has been handed to xfs_ireclaim().  Returns 1
+ * without reclaiming if the inode is already being reclaimed
+ * (XFS_IRECLAIM set) or is not reclaimable; in that case any locks the
+ * caller passed in (locked != 0) are dropped before returning.
+ *
+ * The perag reference taken at entry is released on every return path.
+ */
+int
+xfs_reclaim_inode(
+	xfs_inode_t	*ip,
+	int		locked,
+	int		sync_mode)
+{
+	xfs_perag_t	*pag = xfs_get_perag(ip->i_mount, ip->i_ino);
+
+	/* The hash lock here protects a thread in xfs_iget_core from
+	 * racing with us on linking the inode back with a vnode.
+	 * Once we have the XFS_IRECLAIM flag set it will not touch
+	 * us.
+	 */
+	write_lock(&pag->pag_ici_lock);
+	spin_lock(&ip->i_flags_lock);
+	if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
+	    !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
+		spin_unlock(&ip->i_flags_lock);
+		write_unlock(&pag->pag_ici_lock);
+		/* balance xfs_get_perag() on the skip path as well */
+		xfs_put_perag(ip->i_mount, pag);
+		if (locked) {
+			xfs_ifunlock(ip);
+			xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		}
+		return 1;
+	}
+	__xfs_iflags_set(ip, XFS_IRECLAIM);
+	spin_unlock(&ip->i_flags_lock);
+	write_unlock(&pag->pag_ici_lock);
+	xfs_put_perag(ip->i_mount, pag);
+
+	/*
+	 * If the inode is still dirty, then flush it out.  If the inode
+	 * is not in the AIL, then it will be OK to flush it delwri as
+	 * long as xfs_iflush() does not keep any references to the inode.
+	 * We leave that decision up to xfs_iflush() since it has the
+	 * knowledge of whether it's OK to simply do a delwri flush of
+	 * the inode or whether we need to wait until the inode is
+	 * pulled from the AIL.
+	 * We get the flush lock regardless, though, just to make sure
+	 * we don't free it while it is being flushed.
+	 */
+	if (!locked) {
+		xfs_ilock(ip, XFS_ILOCK_EXCL);
+		xfs_iflock(ip);
+	}
+
+	/*
+	 * In the case of a forced shutdown we rely on xfs_iflush() to
+	 * wait for the inode to be unpinned before returning an error.
+	 */
+	if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) {
+		/* synchronize with xfs_iflush_done */
+		xfs_iflock(ip);
+		xfs_ifunlock(ip);
+	}
+
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	xfs_ireclaim(ip);
+	return 0;
+}
+
+/*
+ * Mark an inode as reclaimable: set XFS_ICI_RECLAIM_TAG on its slot in
+ * the per-AG inode radix tree and set XFS_IRECLAIMABLE in the inode flags.
+ *
+ * We set the inode flag atomically with the radix tree tag.
+ * Once we get tag lookups on the radix tree, this inode flag
+ * can go away.
+ *
+ * The radix tree requires tag updates to be serialised against every
+ * other modification of the tree, so pag_ici_lock is taken in exclusive
+ * (write) mode here; a shared lock would let a tag update race with
+ * insertions or deletions of other inodes in the same tree.
+ */
+void
+xfs_inode_set_reclaim_tag(
+	xfs_inode_t	*ip)
+{
+	xfs_mount_t	*mp = ip->i_mount;
+	xfs_perag_t	*pag = xfs_get_perag(mp, ip->i_ino);
+
+	write_lock(&pag->pag_ici_lock);
+	spin_lock(&ip->i_flags_lock);
+	radix_tree_tag_set(&pag->pag_ici_root,
+			XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
+	__xfs_iflags_set(ip, XFS_IRECLAIMABLE);
+	spin_unlock(&ip->i_flags_lock);
+	write_unlock(&pag->pag_ici_lock);
+	xfs_put_perag(mp, pag);
+}
+
+/*
+ * Clear XFS_ICI_RECLAIM_TAG on the inode's slot in the given per-AG
+ * radix tree.  No locks are taken here; xfs_inode_clear_reclaim_tag()
+ * calls this with pag->pag_ici_lock and ip->i_flags_lock held.
+ * NOTE(review): other callers are presumably expected to provide
+ * equivalent locking - confirm at each call site.
+ */
+void
+__xfs_inode_clear_reclaim_tag(
+ xfs_mount_t *mp,
+ xfs_perag_t *pag,
+ xfs_inode_t *ip)
+{
+ radix_tree_tag_clear(&pag->pag_ici_root,
+ XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG);
+}
+
+/*
+ * Locked wrapper around __xfs_inode_clear_reclaim_tag(): look up the
+ * inode's perag, clear the reclaim tag under pag_ici_lock and the inode's
+ * i_flags_lock, and drop the perag reference again.
+ *
+ * The radix tree requires tag updates to be serialised against every
+ * other modification of the tree, so pag_ici_lock is taken in exclusive
+ * (write) mode rather than shared mode.
+ */
+void
+xfs_inode_clear_reclaim_tag(
+	xfs_inode_t	*ip)
+{
+	xfs_mount_t	*mp = ip->i_mount;
+	xfs_perag_t	*pag = xfs_get_perag(mp, ip->i_ino);
+
+	write_lock(&pag->pag_ici_lock);
+	spin_lock(&ip->i_flags_lock);
+	__xfs_inode_clear_reclaim_tag(mp, pag, ip);
+	spin_unlock(&ip->i_flags_lock);
+	write_unlock(&pag->pag_ici_lock);
+	xfs_put_perag(mp, pag);
+}
+
+
+/*
+ * Walk one allocation group's inode radix tree and try to reclaim every
+ * inode tagged XFS_ICI_RECLAIM_TAG.
+ *
+ * mp:      mount whose inodes are being reclaimed
+ * ag:      index into mp->m_perag[]
+ * noblock: if non-zero, only trylock each inode (ILOCK and flush lock)
+ *          and skip inodes that are pinned or cannot be locked; the
+ *          value is also passed to xfs_reclaim_inode() as its "locked"
+ *          argument
+ * mode:    flush mode handed on to xfs_reclaim_inode()
+ *
+ * If xfs_reclaim_inode() reports that it skipped any inode during a pass,
+ * the whole walk is restarted from index 0 after a delay(1).
+ */
+STATIC void
+xfs_reclaim_inodes_ag(
+ xfs_mount_t *mp,
+ int ag,
+ int noblock,
+ int mode)
+{
+ xfs_inode_t *ip = NULL;
+ xfs_perag_t *pag = &mp->m_perag[ag];
+ int nr_found;
+ uint32_t first_index;
+ int skipped;
+
+restart:
+ first_index = 0;
+ skipped = 0;
+ do {
+ /*
+ * use a gang lookup to find the next inode in the tree
+ * as the tree is sparse and a gang lookup walks to find
+ * the number of objects requested.
+ */
+ read_lock(&pag->pag_ici_lock);
+ nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root,
+ (void**)&ip, first_index, 1,
+ XFS_ICI_RECLAIM_TAG);
+
+ if (!nr_found) {
+ read_unlock(&pag->pag_ici_lock);
+ break;
+ }
+
+ /*
+ * Update the index for the next lookup. Catch overflows
+ * into the next AG range which can occur if we have inodes
+ * in the last block of the AG and we are currently
+ * pointing to the last inode.
+ */
+ first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+ if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) {
+ read_unlock(&pag->pag_ici_lock);
+ break;
+ }
+
+ /* ignore if already under reclaim */
+ if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
+ read_unlock(&pag->pag_ici_lock);
+ continue;
+ }
+
+ /*
+ * Non-blocking mode: take the inode locks here with trylocks so
+ * that xfs_reclaim_inode() can be called with locked != 0.
+ * Inodes we skip at this point are not counted in "skipped".
+ */
+ if (noblock) {
+ if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
+ read_unlock(&pag->pag_ici_lock);
+ continue;
+ }
+ if (xfs_ipincount(ip) ||
+ !xfs_iflock_nowait(ip)) {
+ xfs_iunlock(ip, XFS_ILOCK_EXCL);
+ read_unlock(&pag->pag_ici_lock);
+ continue;
+ }
+ }
+ read_unlock(&pag->pag_ici_lock);
+
+ /*
+ * hmmm - this is an inode already in reclaim. Do
+ * we even bother catching it here?
+ */
+ /* a non-zero return means the inode was not reclaimed this pass */
+ if (xfs_reclaim_inode(ip, noblock, mode))
+ skipped++;
+ } while (nr_found);
+
+ if (skipped) {
+ delay(1);
+ goto restart;
+ }
+ return;
+
+}
+
+/*
+ * Run inode reclaim over every allocation group of the mount.
+ *
+ * AGs whose per-AG structure has not had its inode cache initialised
+ * (pag_ici_init clear) are passed over.  @noblock and @mode are handed
+ * unchanged to xfs_reclaim_inodes_ag().  Always returns 0.
+ */
+int
+xfs_reclaim_inodes(
+	xfs_mount_t	*mp,
+	int		noblock,
+	int		mode)
+{
+	int		agno;
+
+	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+		if (mp->m_perag[agno].pag_ici_init)
+			xfs_reclaim_inodes_ag(mp, agno, noblock, mode);
+	}
+	return 0;
+}
+
+
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h
new file mode 100644
index 000000000000..5f6de1efe1f6
--- /dev/null
+++ b/fs/xfs/linux-2.6/xfs_sync.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2000-2006 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef XFS_SYNC_H
+#define XFS_SYNC_H 1
+
+struct xfs_mount;
+
+typedef struct bhv_vfs_sync_work { /* one unit of work for xfssyncd */
+ struct list_head w_list; /* linkage on the mount's m_sync_list */
+ struct xfs_mount *w_mount; /* mount the work was queued for */
+ void *w_data; /* syncer routine argument */
+ void (*w_syncer)(struct xfs_mount *, void *); /* called as w_syncer(mp, w_data) */
+} bhv_vfs_sync_work_t;
+
+#define SYNC_ATTR 0x0001 /* sync attributes */
+#define SYNC_DELWRI 0x0002 /* look at delayed writes */
+#define SYNC_WAIT 0x0004 /* wait for i/o to complete */
+#define SYNC_BDFLUSH 0x0008 /* BDFLUSH is calling -- don't block */
+#define SYNC_IOWAIT 0x0010 /* wait for all I/O to complete */
+
+int xfs_syncd_init(struct xfs_mount *mp);
+void xfs_syncd_stop(struct xfs_mount *mp);
+
+int xfs_sync_inodes(struct xfs_mount *mp, int flags);
+int xfs_sync_fsdata(struct xfs_mount *mp, int flags);
+
+int xfs_quiesce_data(struct xfs_mount *mp);
+void xfs_quiesce_attr(struct xfs_mount *mp);
+
+void xfs_flush_inode(struct xfs_inode *ip);
+void xfs_flush_device(struct xfs_inode *ip);
+
+int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode);
+int xfs_reclaim_inodes(struct xfs_mount *mp, int noblock, int mode);
+
+void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
+void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip);
+void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
+ struct xfs_inode *ip);
+#endif
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c
index 7dacb5bbde3f..916c0ffb6083 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.c
+++ b/fs/xfs/linux-2.6/xfs_sysctl.c
@@ -56,17 +56,6 @@ xfs_stats_clear_proc_handler(
static ctl_table xfs_table[] = {
{
- .ctl_name = XFS_RESTRICT_CHOWN,
- .procname = "restrict_chown",
- .data = &xfs_params.restrict_chown.val,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
- .strategy = &sysctl_intvec,
- .extra1 = &xfs_params.restrict_chown.min,
- .extra2 = &xfs_params.restrict_chown.max
- },
- {
.ctl_name = XFS_SGID_INHERIT,
.procname = "irix_sgid_inherit",
.data = &xfs_params.sgid_inherit.val,
diff --git a/fs/xfs/linux-2.6/xfs_sysctl.h b/fs/xfs/linux-2.6/xfs_sysctl.h
index 4aadb8056c37..b9937d450f8e 100644
--- a/fs/xfs/linux-2.6/xfs_sysctl.h
+++ b/fs/xfs/linux-2.6/xfs_sysctl.h
@@ -31,7 +31,6 @@ typedef struct xfs_sysctl_val {
} xfs_sysctl_val_t;
typedef struct xfs_param {
- xfs_sysctl_val_t restrict_chown;/* Root/non-root can give away files.*/
xfs_sysctl_val_t sgid_inherit; /* Inherit S_ISGID if process' GID is
* not a member of parent dir GID. */
xfs_sysctl_val_t symlink_mode; /* Link creat mode affected by umask */
@@ -68,7 +67,7 @@ typedef struct xfs_param {
enum {
/* XFS_REFCACHE_SIZE = 1 */
/* XFS_REFCACHE_PURGE = 2 */
- XFS_RESTRICT_CHOWN = 3,
+ /* XFS_RESTRICT_CHOWN = 3 */
XFS_SGID_INHERIT = 4,
XFS_SYMLINK_MODE = 5,
XFS_PANIC_MASK = 6,
diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h
deleted file mode 100644
index 7e60c7776b1c..000000000000
--- a/fs/xfs/linux-2.6/xfs_vfs.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2000-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_VFS_H__
-#define __XFS_VFS_H__
-
-#include <linux/vfs.h>
-#include "xfs_fs.h"
-
-struct inode;
-
-struct fid;
-struct cred;
-struct seq_file;
-struct super_block;
-struct xfs_inode;
-struct xfs_mount;
-struct xfs_mount_args;
-
-typedef struct kstatfs bhv_statvfs_t;
-
-typedef struct bhv_vfs_sync_work {
- struct list_head w_list;
- struct xfs_mount *w_mount;
- void *w_data; /* syncer routine argument */
- void (*w_syncer)(struct xfs_mount *, void *);
-} bhv_vfs_sync_work_t;
-
-#define SYNC_ATTR 0x0001 /* sync attributes */
-#define SYNC_CLOSE 0x0002 /* close file system down */
-#define SYNC_DELWRI 0x0004 /* look at delayed writes */
-#define SYNC_WAIT 0x0008 /* wait for i/o to complete */
-#define SYNC_BDFLUSH 0x0010 /* BDFLUSH is calling -- don't block */
-#define SYNC_FSDATA 0x0020 /* flush fs data (e.g. superblocks) */
-#define SYNC_REFCACHE 0x0040 /* prune some of the nfs ref cache */
-#define SYNC_REMOUNT 0x0080 /* remount readonly, no dummy LRs */
-#define SYNC_IOWAIT 0x0100 /* wait for all I/O to complete */
-
-/*
- * When remounting a filesystem read-only or freezing the filesystem,
- * we have two phases to execute. This first phase is syncing the data
- * before we quiesce the fielsystem, and the second is flushing all the
- * inodes out after we've waited for all the transactions created by
- * the first phase to complete. The second phase uses SYNC_INODE_QUIESCE
- * to ensure that the inodes are written to their location on disk
- * rather than just existing in transactions in the log. This means
- * after a quiesce there is no log replay required to write the inodes
- * to disk (this is the main difference between a sync and a quiesce).
- */
-#define SYNC_DATA_QUIESCE (SYNC_DELWRI|SYNC_FSDATA|SYNC_WAIT|SYNC_IOWAIT)
-#define SYNC_INODE_QUIESCE (SYNC_REMOUNT|SYNC_ATTR|SYNC_WAIT)
-
-#define SHUTDOWN_META_IO_ERROR 0x0001 /* write attempt to metadata failed */
-#define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */
-#define SHUTDOWN_FORCE_UMOUNT 0x0004 /* shutdown from a forced unmount */
-#define SHUTDOWN_CORRUPT_INCORE 0x0008 /* corrupt in-memory data structures */
-#define SHUTDOWN_REMOTE_REQ 0x0010 /* shutdown came from remote cell */
-#define SHUTDOWN_DEVICE_REQ 0x0020 /* failed all paths to the device */
-
-#define xfs_test_for_freeze(mp) ((mp)->m_super->s_frozen)
-#define xfs_wait_for_freeze(mp,l) vfs_check_frozen((mp)->m_super, (l))
-
-#endif /* __XFS_VFS_H__ */
diff --git a/fs/xfs/linux-2.6/xfs_vnode.c b/fs/xfs/linux-2.6/xfs_vnode.c
deleted file mode 100644
index b52528bbbfff..000000000000
--- a/fs/xfs/linux-2.6/xfs_vnode.c
+++ /dev/null
@@ -1,145 +0,0 @@
-/*
- * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-#include "xfs_vnodeops.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_inode.h"
-
-/*
- * And this gunk is needed for xfs_mount.h"
- */
-#include "xfs_log.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_dmapi.h"
-#include "xfs_inum.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-
-
-/*
- * Dedicated vnode inactive/reclaim sync wait queues.
- * Prime number of hash buckets since address is used as the key.
- */
-#define NVSYNC 37
-#define vptosync(v) (&vsync[((unsigned long)v) % NVSYNC])
-static wait_queue_head_t vsync[NVSYNC];
-
-void __init
-vn_init(void)
-{
- int i;
-
- for (i = 0; i < NVSYNC; i++)
- init_waitqueue_head(&vsync[i]);
-}
-
-void
-vn_iowait(
- xfs_inode_t *ip)
-{
- wait_queue_head_t *wq = vptosync(ip);
-
- wait_event(*wq, (atomic_read(&ip->i_iocount) == 0));
-}
-
-void
-vn_iowake(
- xfs_inode_t *ip)
-{
- if (atomic_dec_and_test(&ip->i_iocount))
- wake_up(vptosync(ip));
-}
-
-/*
- * Volume managers supporting multiple paths can send back ENODEV when the
- * final path disappears. In this case continuing to fill the page cache
- * with dirty data which cannot be written out is evil, so prevent that.
- */
-void
-vn_ioerror(
- xfs_inode_t *ip,
- int error,
- char *f,
- int l)
-{
- if (unlikely(error == -ENODEV))
- xfs_do_force_shutdown(ip->i_mount, SHUTDOWN_DEVICE_REQ, f, l);
-}
-
-#ifdef XFS_INODE_TRACE
-
-/*
- * Reference count of Linux inode if present, -1 if the xfs_inode
- * has no associated Linux inode.
- */
-static inline int xfs_icount(struct xfs_inode *ip)
-{
- struct inode *vp = VFS_I(ip);
-
- if (vp)
- return vn_count(vp);
- return -1;
-}
-
-#define KTRACE_ENTER(ip, vk, s, line, ra) \
- ktrace_enter( (ip)->i_trace, \
-/* 0 */ (void *)(__psint_t)(vk), \
-/* 1 */ (void *)(s), \
-/* 2 */ (void *)(__psint_t) line, \
-/* 3 */ (void *)(__psint_t)xfs_icount(ip), \
-/* 4 */ (void *)(ra), \
-/* 5 */ NULL, \
-/* 6 */ (void *)(__psint_t)current_cpu(), \
-/* 7 */ (void *)(__psint_t)current_pid(), \
-/* 8 */ (void *)__return_address, \
-/* 9 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL)
-
-/*
- * Vnode tracing code.
- */
-void
-_xfs_itrace_entry(xfs_inode_t *ip, const char *func, inst_t *ra)
-{
- KTRACE_ENTER(ip, INODE_KTRACE_ENTRY, func, 0, ra);
-}
-
-void
-_xfs_itrace_exit(xfs_inode_t *ip, const char *func, inst_t *ra)
-{
- KTRACE_ENTER(ip, INODE_KTRACE_EXIT, func, 0, ra);
-}
-
-void
-xfs_itrace_hold(xfs_inode_t *ip, char *file, int line, inst_t *ra)
-{
- KTRACE_ENTER(ip, INODE_KTRACE_HOLD, file, line, ra);
-}
-
-void
-_xfs_itrace_ref(xfs_inode_t *ip, char *file, int line, inst_t *ra)
-{
- KTRACE_ENTER(ip, INODE_KTRACE_REF, file, line, ra);
-}
-
-void
-xfs_itrace_rele(xfs_inode_t *ip, char *file, int line, inst_t *ra)
-{
- KTRACE_ENTER(ip, INODE_KTRACE_RELE, file, line, ra);
-}
-#endif /* XFS_INODE_TRACE */
diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h
index 683ce16210ff..f65983a230d3 100644
--- a/fs/xfs/linux-2.6/xfs_vnode.h
+++ b/fs/xfs/linux-2.6/xfs_vnode.h
@@ -18,7 +18,10 @@
#ifndef __XFS_VNODE_H__
#define __XFS_VNODE_H__
+#include "xfs_fs.h"
+
struct file;
+struct xfs_inode;
struct xfs_iomap;
struct attrlist_cursor_kern;
@@ -51,40 +54,6 @@ struct attrlist_cursor_kern;
Prevent VM access to the pages until
the operation completes. */
-
-extern void vn_init(void);
-
-/*
- * Yeah, these don't take vnode anymore at all, all this should be
- * cleaned up at some point.
- */
-extern void vn_iowait(struct xfs_inode *ip);
-extern void vn_iowake(struct xfs_inode *ip);
-extern void vn_ioerror(struct xfs_inode *ip, int error, char *f, int l);
-
-static inline int vn_count(struct inode *vp)
-{
- return atomic_read(&vp->i_count);
-}
-
-#define IHOLD(ip) \
-do { \
- ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
- atomic_inc(&(VFS_I(ip)->i_count)); \
- xfs_itrace_hold((ip), __FILE__, __LINE__, (inst_t *)__return_address); \
-} while (0)
-
-#define IRELE(ip) \
-do { \
- xfs_itrace_rele((ip), __FILE__, __LINE__, (inst_t *)__return_address); \
- iput(VFS_I(ip)); \
-} while (0)
-
-static inline struct inode *vn_grab(struct inode *vp)
-{
- return igrab(vp);
-}
-
/*
* Dealing with bad inodes
*/
@@ -121,39 +90,4 @@ static inline void vn_atime_to_time_t(struct inode *vp, time_t *tt)
PAGECACHE_TAG_DIRTY)
-/*
- * Tracking vnode activity.
- */
-#if defined(XFS_INODE_TRACE)
-
-#define INODE_TRACE_SIZE 16 /* number of trace entries */
-#define INODE_KTRACE_ENTRY 1
-#define INODE_KTRACE_EXIT 2
-#define INODE_KTRACE_HOLD 3
-#define INODE_KTRACE_REF 4
-#define INODE_KTRACE_RELE 5
-
-extern void _xfs_itrace_entry(struct xfs_inode *, const char *, inst_t *);
-extern void _xfs_itrace_exit(struct xfs_inode *, const char *, inst_t *);
-extern void xfs_itrace_hold(struct xfs_inode *, char *, int, inst_t *);
-extern void _xfs_itrace_ref(struct xfs_inode *, char *, int, inst_t *);
-extern void xfs_itrace_rele(struct xfs_inode *, char *, int, inst_t *);
-#define xfs_itrace_entry(ip) \
- _xfs_itrace_entry(ip, __func__, (inst_t *)__return_address)
-#define xfs_itrace_exit(ip) \
- _xfs_itrace_exit(ip, __func__, (inst_t *)__return_address)
-#define xfs_itrace_exit_tag(ip, tag) \
- _xfs_itrace_exit(ip, tag, (inst_t *)__return_address)
-#define xfs_itrace_ref(ip) \
- _xfs_itrace_ref(ip, __FILE__, __LINE__, (inst_t *)__return_address)
-
-#else
-#define xfs_itrace_entry(a)
-#define xfs_itrace_exit(a)
-#define xfs_itrace_exit_tag(a, b)
-#define xfs_itrace_hold(a, b, c, d)
-#define xfs_itrace_ref(a)
-#define xfs_itrace_rele(a, b, c, d)
-#endif
-
#endif /* __XFS_VNODE_H__ */
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index f2705f2fd43c..591ca6602bfb 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -101,7 +101,7 @@ xfs_qm_dqinit(
if (brandnewdquot) {
dqp->dq_flnext = dqp->dq_flprev = dqp;
mutex_init(&dqp->q_qlock);
- sv_init(&dqp->q_pinwait, SV_DEFAULT, "pdq");
+ init_waitqueue_head(&dqp->q_pinwait);
/*
* Because we want to use a counting completion, complete
@@ -131,7 +131,7 @@ xfs_qm_dqinit(
dqp->q_res_bcount = 0;
dqp->q_res_icount = 0;
dqp->q_res_rtbcount = 0;
- dqp->q_pincount = 0;
+ atomic_set(&dqp->q_pincount, 0);
dqp->q_hash = NULL;
ASSERT(dqp->dq_flnext == dqp->dq_flprev);
@@ -1221,16 +1221,14 @@ xfs_qm_dqflush(
xfs_dqtrace_entry(dqp, "DQFLUSH");
/*
- * If not dirty, nada.
+ * If not dirty, or it's pinned and we are not supposed to
+ * block, nada.
*/
- if (!XFS_DQ_IS_DIRTY(dqp)) {
+ if (!XFS_DQ_IS_DIRTY(dqp) ||
+ (!(flags & XFS_QMOPT_SYNC) && atomic_read(&dqp->q_pincount) > 0)) {
xfs_dqfunlock(dqp);
- return (0);
+ return 0;
}
-
- /*
- * Cant flush a pinned dquot. Wait for it.
- */
xfs_qm_dqunpin_wait(dqp);
/*
@@ -1274,10 +1272,8 @@ xfs_qm_dqflush(
dqp->dq_flags &= ~(XFS_DQ_DIRTY);
mp = dqp->q_mount;
- /* lsn is 64 bits */
- spin_lock(&mp->m_ail_lock);
- dqp->q_logitem.qli_flush_lsn = dqp->q_logitem.qli_item.li_lsn;
- spin_unlock(&mp->m_ail_lock);
+ xfs_trans_ail_copy_lsn(mp->m_ail, &dqp->q_logitem.qli_flush_lsn,
+ &dqp->q_logitem.qli_item.li_lsn);
/*
* Attach an iodone routine so that we can remove this dquot from the
@@ -1323,8 +1319,10 @@ xfs_qm_dqflush_done(
xfs_dq_logitem_t *qip)
{
xfs_dquot_t *dqp;
+ struct xfs_ail *ailp;
dqp = qip->qli_dquot;
+ ailp = qip->qli_item.li_ailp;
/*
* We only want to pull the item from the AIL if its
@@ -1337,15 +1335,12 @@ xfs_qm_dqflush_done(
if ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
qip->qli_item.li_lsn == qip->qli_flush_lsn) {
- spin_lock(&dqp->q_mount->m_ail_lock);
- /*
- * xfs_trans_delete_ail() drops the AIL lock.
- */
+ /* xfs_trans_ail_delete() drops the AIL lock. */
+ spin_lock(&ailp->xa_lock);
if (qip->qli_item.li_lsn == qip->qli_flush_lsn)
- xfs_trans_delete_ail(dqp->q_mount,
- (xfs_log_item_t*)qip);
+ xfs_trans_ail_delete(ailp, (xfs_log_item_t*)qip);
else
- spin_unlock(&dqp->q_mount->m_ail_lock);
+ spin_unlock(&ailp->xa_lock);
}
/*
@@ -1375,7 +1370,7 @@ xfs_dqunlock(
mutex_unlock(&(dqp->q_qlock));
if (dqp->q_logitem.qli_dquot == dqp) {
/* Once was dqp->q_mount, but might just have been cleared */
- xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_mountp,
+ xfs_trans_unlocked_item(dqp->q_logitem.qli_item.li_ailp,
(xfs_log_item_t*)&(dqp->q_logitem));
}
}
@@ -1489,7 +1484,7 @@ xfs_qm_dqpurge(
"xfs_qm_dqpurge: dquot %p flush failed", dqp);
xfs_dqflock(dqp);
}
- ASSERT(dqp->q_pincount == 0);
+ ASSERT(atomic_read(&dqp->q_pincount) == 0);
ASSERT(XFS_FORCED_SHUTDOWN(mp) ||
!(dqp->q_logitem.qli_item.li_flags & XFS_LI_IN_AIL));
diff --git a/fs/xfs/quota/xfs_dquot.h b/fs/xfs/quota/xfs_dquot.h
index 8958d0faf8d3..7e455337e2ba 100644
--- a/fs/xfs/quota/xfs_dquot.h
+++ b/fs/xfs/quota/xfs_dquot.h
@@ -83,8 +83,8 @@ typedef struct xfs_dquot {
xfs_qcnt_t q_res_rtbcount;/* total realtime blks used+reserved */
mutex_t q_qlock; /* quota lock */
struct completion q_flush; /* flush completion queue */
- uint q_pincount; /* pin count for this dquot */
- sv_t q_pinwait; /* sync var for pinning */
+ atomic_t q_pincount; /* dquot pin count */
+ wait_queue_head_t q_pinwait; /* dquot pinning wait queue */
#ifdef XFS_DQUOT_TRACE
struct ktrace *q_trace; /* trace header structure */
#endif
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index f028644caa5e..1728f6a7c4f5 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -88,25 +88,22 @@ xfs_qm_dquot_logitem_format(
/*
* Increment the pin count of the given dquot.
- * This value is protected by pinlock spinlock in the xQM structure.
*/
STATIC void
xfs_qm_dquot_logitem_pin(
xfs_dq_logitem_t *logitem)
{
- xfs_dquot_t *dqp;
+ xfs_dquot_t *dqp = logitem->qli_dquot;
- dqp = logitem->qli_dquot;
ASSERT(XFS_DQ_IS_LOCKED(dqp));
- spin_lock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock));
- dqp->q_pincount++;
- spin_unlock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock));
+ atomic_inc(&dqp->q_pincount);
}
/*
* Decrement the pin count of the given dquot, and wake up
* anyone in xfs_dqwait_unpin() if the count goes to 0. The
- * dquot must have been previously pinned with a call to xfs_dqpin().
+ * dquot must have been previously pinned with a call to
+ * xfs_qm_dquot_logitem_pin().
*/
/* ARGSUSED */
STATIC void
@@ -114,16 +111,11 @@ xfs_qm_dquot_logitem_unpin(
xfs_dq_logitem_t *logitem,
int stale)
{
- xfs_dquot_t *dqp;
+ xfs_dquot_t *dqp = logitem->qli_dquot;
- dqp = logitem->qli_dquot;
- ASSERT(dqp->q_pincount > 0);
- spin_lock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock));
- dqp->q_pincount--;
- if (dqp->q_pincount == 0) {
- sv_broadcast(&dqp->q_pinwait);
- }
- spin_unlock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock));
+ ASSERT(atomic_read(&dqp->q_pincount) > 0);
+ if (atomic_dec_and_test(&dqp->q_pincount))
+ wake_up(&dqp->q_pinwait);
}
/* ARGSUSED */
@@ -193,21 +185,14 @@ xfs_qm_dqunpin_wait(
xfs_dquot_t *dqp)
{
ASSERT(XFS_DQ_IS_LOCKED(dqp));
- if (dqp->q_pincount == 0) {
+ if (atomic_read(&dqp->q_pincount) == 0)
return;
- }
/*
* Give the log a push so we don't wait here too long.
*/
xfs_log_force(dqp->q_mount, (xfs_lsn_t)0, XFS_LOG_FORCE);
- spin_lock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock));
- if (dqp->q_pincount == 0) {
- spin_unlock(&(XFS_DQ_TO_QINF(dqp)->qi_pinlock));
- return;
- }
- sv_wait(&(dqp->q_pinwait), PINOD,
- &(XFS_DQ_TO_QINF(dqp)->qi_pinlock), s);
+ wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0));
}
/*
@@ -310,7 +295,7 @@ xfs_qm_dquot_logitem_trylock(
uint retval;
dqp = qip->qli_dquot;
- if (dqp->q_pincount > 0)
+ if (atomic_read(&dqp->q_pincount) > 0)
return (XFS_ITEM_PINNED);
if (! xfs_qm_dqlock_nowait(dqp))
@@ -568,14 +553,16 @@ xfs_qm_qoffend_logitem_committed(
xfs_lsn_t lsn)
{
xfs_qoff_logitem_t *qfs;
+ struct xfs_ail *ailp;
qfs = qfe->qql_start_lip;
- spin_lock(&qfs->qql_item.li_mountp->m_ail_lock);
+ ailp = qfs->qql_item.li_ailp;
+ spin_lock(&ailp->xa_lock);
/*
* Delete the qoff-start logitem from the AIL.
- * xfs_trans_delete_ail() drops the AIL lock.
+ * xfs_trans_ail_delete() drops the AIL lock.
*/
- xfs_trans_delete_ail(qfs->qql_item.li_mountp, (xfs_log_item_t *)qfs);
+ xfs_trans_ail_delete(ailp, (xfs_log_item_t *)qfs);
kmem_free(qfs);
kmem_free(qfe);
return (xfs_lsn_t)-1;
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index df0ffef9775a..6b13960cf318 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -20,7 +20,6 @@
#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
-#include "xfs_clnt.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -396,13 +395,10 @@ xfs_qm_mount_quotas(
/*
* Called from the vfsops layer.
*/
-int
+void
xfs_qm_unmount_quotas(
xfs_mount_t *mp)
{
- xfs_inode_t *uqp, *gqp;
- int error = 0;
-
/*
* Release the dquots that root inode, et al might be holding,
* before we flush quotas and blow away the quotainfo structure.
@@ -415,43 +411,18 @@ xfs_qm_unmount_quotas(
xfs_qm_dqdetach(mp->m_rsumip);
/*
- * Flush out the quota inodes.
+ * Release the quota inodes.
*/
- uqp = gqp = NULL;
if (mp->m_quotainfo) {
- if ((uqp = mp->m_quotainfo->qi_uquotaip) != NULL) {
- xfs_ilock(uqp, XFS_ILOCK_EXCL);
- xfs_iflock(uqp);
- error = xfs_iflush(uqp, XFS_IFLUSH_SYNC);
- xfs_iunlock(uqp, XFS_ILOCK_EXCL);
- if (unlikely(error == EFSCORRUPTED)) {
- XFS_ERROR_REPORT("xfs_qm_unmount_quotas(1)",
- XFS_ERRLEVEL_LOW, mp);
- goto out;
- }
+ if (mp->m_quotainfo->qi_uquotaip) {
+ IRELE(mp->m_quotainfo->qi_uquotaip);
+ mp->m_quotainfo->qi_uquotaip = NULL;
}
- if ((gqp = mp->m_quotainfo->qi_gquotaip) != NULL) {
- xfs_ilock(gqp, XFS_ILOCK_EXCL);
- xfs_iflock(gqp);
- error = xfs_iflush(gqp, XFS_IFLUSH_SYNC);
- xfs_iunlock(gqp, XFS_ILOCK_EXCL);
- if (unlikely(error == EFSCORRUPTED)) {
- XFS_ERROR_REPORT("xfs_qm_unmount_quotas(2)",
- XFS_ERRLEVEL_LOW, mp);
- goto out;
- }
+ if (mp->m_quotainfo->qi_gquotaip) {
+ IRELE(mp->m_quotainfo->qi_gquotaip);
+ mp->m_quotainfo->qi_gquotaip = NULL;
}
}
- if (uqp) {
- IRELE(uqp);
- mp->m_quotainfo->qi_uquotaip = NULL;
- }
- if (gqp) {
- IRELE(gqp);
- mp->m_quotainfo->qi_gquotaip = NULL;
- }
-out:
- return XFS_ERROR(error);
}
/*
@@ -987,14 +958,10 @@ xfs_qm_dqdetach(
}
/*
- * This is called by VFS_SYNC and flags arg determines the caller,
- * and its motives, as done in xfs_sync.
- *
- * vfs_sync: SYNC_FSDATA|SYNC_ATTR|SYNC_BDFLUSH 0x31
- * syscall sync: SYNC_FSDATA|SYNC_ATTR|SYNC_DELWRI 0x25
- * umountroot : SYNC_WAIT | SYNC_CLOSE | SYNC_ATTR | SYNC_FSDATA
+ * This is called to sync quotas. We can be told to use non-blocking
+ * semantics by either the SYNC_BDFLUSH flag or the absence of the
+ * SYNC_WAIT flag.
*/
-
int
xfs_qm_sync(
xfs_mount_t *mp,
@@ -1137,7 +1104,6 @@ xfs_qm_init_quotainfo(
return error;
}
- spin_lock_init(&qinf->qi_pinlock);
xfs_qm_list_init(&qinf->qi_dqlist, "mpdqlist", 0);
qinf->qi_dqreclaims = 0;
@@ -1234,7 +1200,6 @@ xfs_qm_destroy_quotainfo(
*/
xfs_qm_rele_quotafs_ref(mp);
- spinlock_destroy(&qi->qi_pinlock);
xfs_qm_list_destroy(&qi->qi_dqlist);
if (qi->qi_uquotaip) {
diff --git a/fs/xfs/quota/xfs_qm.h b/fs/xfs/quota/xfs_qm.h
index 44f25349e478..ddf09166387c 100644
--- a/fs/xfs/quota/xfs_qm.h
+++ b/fs/xfs/quota/xfs_qm.h
@@ -106,7 +106,6 @@ typedef struct xfs_qm {
typedef struct xfs_quotainfo {
xfs_inode_t *qi_uquotaip; /* user quota inode */
xfs_inode_t *qi_gquotaip; /* group quota inode */
- spinlock_t qi_pinlock; /* dquot pinning lock */
xfs_dqlist_t qi_dqlist; /* all dquots in filesys */
int qi_dqreclaims; /* a change here indicates
a removal in the dqlist */
@@ -168,7 +167,7 @@ extern void xfs_qm_destroy_quotainfo(xfs_mount_t *);
extern void xfs_qm_mount_quotas(xfs_mount_t *);
extern int xfs_qm_quotacheck(xfs_mount_t *);
extern void xfs_qm_unmount_quotadestroy(xfs_mount_t *);
-extern int xfs_qm_unmount_quotas(xfs_mount_t *);
+extern void xfs_qm_unmount_quotas(xfs_mount_t *);
extern int xfs_qm_write_sb_changes(xfs_mount_t *, __int64_t);
extern int xfs_qm_sync(xfs_mount_t *, int);
diff --git a/fs/xfs/quota/xfs_qm_bhv.c b/fs/xfs/quota/xfs_qm_bhv.c
index eea2e60b456b..bc6c5cca3e12 100644
--- a/fs/xfs/quota/xfs_qm_bhv.c
+++ b/fs/xfs/quota/xfs_qm_bhv.c
@@ -20,7 +20,6 @@
#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
-#include "xfs_clnt.h"
#include "xfs_trans.h"
#include "xfs_sb.h"
#include "xfs_ag.h"
@@ -51,7 +50,7 @@
STATIC void
xfs_fill_statvfs_from_dquot(
- bhv_statvfs_t *statp,
+ struct kstatfs *statp,
xfs_disk_dquot_t *dp)
{
__uint64_t limit;
@@ -88,7 +87,7 @@ xfs_fill_statvfs_from_dquot(
STATIC void
xfs_qm_statvfs(
xfs_inode_t *ip,
- bhv_statvfs_t *statp)
+ struct kstatfs *statp)
{
xfs_mount_t *mp = ip->i_mount;
xfs_dquot_t *dqp;
diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c
index 1a3b803dfa55..68139b38aede 100644
--- a/fs/xfs/quota/xfs_qm_syscalls.c
+++ b/fs/xfs/quota/xfs_qm_syscalls.c
@@ -127,7 +127,7 @@ xfs_qm_quotactl(
break;
case Q_XQUOTASYNC:
- return (xfs_sync_inodes(mp, SYNC_DELWRI, NULL));
+ return xfs_sync_inodes(mp, SYNC_DELWRI);
default:
break;
@@ -1022,101 +1022,104 @@ xfs_qm_export_flags(
/*
- * Go thru all the inodes in the file system, releasing their dquots.
- * Note that the mount structure gets modified to indicate that quotas are off
- * AFTER this, in the case of quotaoff. This also gets called from
- * xfs_rootumount.
+ * Release all the dquots on the inodes in an AG.
*/
-void
-xfs_qm_dqrele_all_inodes(
- struct xfs_mount *mp,
- uint flags)
+STATIC void
+xfs_qm_dqrele_inodes_ag(
+ xfs_mount_t *mp,
+ int ag,
+ uint flags)
{
- xfs_inode_t *ip, *topino;
- uint ireclaims;
- struct inode *vp;
- boolean_t vnode_refd;
+ xfs_inode_t *ip = NULL;
+ xfs_perag_t *pag = &mp->m_perag[ag];
+ int first_index = 0;
+ int nr_found;
- ASSERT(mp->m_quotainfo);
-
- XFS_MOUNT_ILOCK(mp);
-again:
- ip = mp->m_inodes;
- if (ip == NULL) {
- XFS_MOUNT_IUNLOCK(mp);
- return;
- }
do {
- /* Skip markers inserted by xfs_sync */
- if (ip->i_mount == NULL) {
- ip = ip->i_mnext;
- continue;
+ /*
+ * use a gang lookup to find the next inode in the tree
+ * as the tree is sparse and a gang lookup walks to find
+ * the number of objects requested.
+ */
+ read_lock(&pag->pag_ici_lock);
+ nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
+ (void**)&ip, first_index, 1);
+
+ if (!nr_found) {
+ read_unlock(&pag->pag_ici_lock);
+ break;
}
- /* Root inode, rbmip and rsumip have associated blocks */
+
+ /*
+ * Update the index for the next lookup. Catch overflows
+ * into the next AG range which can occur if we have inodes
+ * in the last block of the AG and we are currently
+ * pointing to the last inode.
+ */
+ first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1);
+ if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) {
+ read_unlock(&pag->pag_ici_lock);
+ break;
+ }
+
+ /* skip quota inodes */
if (ip == XFS_QI_UQIP(mp) || ip == XFS_QI_GQIP(mp)) {
ASSERT(ip->i_udquot == NULL);
ASSERT(ip->i_gdquot == NULL);
- ip = ip->i_mnext;
+ read_unlock(&pag->pag_ici_lock);
continue;
}
- vp = VFS_I(ip);
- if (!vp) {
- ASSERT(ip->i_udquot == NULL);
- ASSERT(ip->i_gdquot == NULL);
- ip = ip->i_mnext;
+
+ /*
+ * If we can't get a reference on the inode, it must be
+ * in reclaim. Leave it for the reclaim code to flush.
+ */
+ if (!igrab(VFS_I(ip))) {
+ read_unlock(&pag->pag_ici_lock);
continue;
}
- vnode_refd = B_FALSE;
- if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0) {
- ireclaims = mp->m_ireclaims;
- topino = mp->m_inodes;
- vp = vn_grab(vp);
- if (!vp)
- goto again;
-
- XFS_MOUNT_IUNLOCK(mp);
- /* XXX restart limit ? */
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- vnode_refd = B_TRUE;
- } else {
- ireclaims = mp->m_ireclaims;
- topino = mp->m_inodes;
- XFS_MOUNT_IUNLOCK(mp);
+ read_unlock(&pag->pag_ici_lock);
+
+ /* avoid new inodes though we shouldn't find any here */
+ if (xfs_iflags_test(ip, XFS_INEW)) {
+ IRELE(ip);
+ continue;
}
- /*
- * We don't keep the mountlock across the dqrele() call,
- * since it can take a while..
- */
+ xfs_ilock(ip, XFS_ILOCK_EXCL);
if ((flags & XFS_UQUOTA_ACCT) && ip->i_udquot) {
xfs_qm_dqrele(ip->i_udquot);
ip->i_udquot = NULL;
}
- if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) && ip->i_gdquot) {
+ if (flags & (XFS_PQUOTA_ACCT|XFS_GQUOTA_ACCT) &&
+ ip->i_gdquot) {
xfs_qm_dqrele(ip->i_gdquot);
ip->i_gdquot = NULL;
}
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- /*
- * Wait until we've dropped the ilock and mountlock to
- * do the vn_rele. Or be condemned to an eternity in the
- * inactive code in hell.
- */
- if (vnode_refd)
- IRELE(ip);
- XFS_MOUNT_ILOCK(mp);
- /*
- * If an inode was inserted or removed, we gotta
- * start over again.
- */
- if (topino != mp->m_inodes || mp->m_ireclaims != ireclaims) {
- /* XXX use a sentinel */
- goto again;
- }
- ip = ip->i_mnext;
- } while (ip != mp->m_inodes);
+ xfs_iput(ip, XFS_ILOCK_EXCL);
+
+ } while (nr_found);
+}
+
+/*
+ * Go thru all the inodes in the file system, releasing their dquots.
+ * Note that the mount structure gets modified to indicate that quotas are off
+ * AFTER this, in the case of quotaoff. This also gets called from
+ * xfs_rootumount.
+ */
+void
+xfs_qm_dqrele_all_inodes(
+ struct xfs_mount *mp,
+ uint flags)
+{
+ int i;
- XFS_MOUNT_IUNLOCK(mp);
+ ASSERT(mp->m_quotainfo);
+ for (i = 0; i < mp->m_sb.sb_agcount; i++) {
+ if (!mp->m_perag[i].pag_ici_init)
+ continue;
+ xfs_qm_dqrele_inodes_ag(mp, i, flags);
+ }
}
/*------------------------------------------------------------------------*/
diff --git a/fs/xfs/support/debug.c b/fs/xfs/support/debug.c
index c27abef7b84f..ae5482965424 100644
--- a/fs/xfs/support/debug.c
+++ b/fs/xfs/support/debug.c
@@ -18,6 +18,13 @@
#include <xfs.h>
#include "debug.h"
+/* xfs_mount.h drags a lot of crap in, sorry.. */
+#include "xfs_sb.h"
+#include "xfs_inum.h"
+#include "xfs_ag.h"
+#include "xfs_dmapi.h"
+#include "xfs_mount.h"
+
static char message[1024]; /* keep it off the stack */
static DEFINE_SPINLOCK(xfs_err_lock);
@@ -55,22 +62,42 @@ cmn_err(register int level, char *fmt, ...)
}
void
-icmn_err(register int level, char *fmt, va_list ap)
+xfs_fs_vcmn_err(
+ int level,
+ struct xfs_mount *mp,
+ char *fmt,
+ va_list ap)
{
- ulong flags;
- int len;
+ unsigned long flags;
+ int len = 0;
level &= XFS_ERR_MASK;
- if(level > XFS_MAX_ERR_LEVEL)
+ if (level > XFS_MAX_ERR_LEVEL)
level = XFS_MAX_ERR_LEVEL;
+
spin_lock_irqsave(&xfs_err_lock,flags);
- len = vsnprintf(message, sizeof(message), fmt, ap);
+
+ if (mp) {
+ len = sprintf(message, "Filesystem \"%s\": ", mp->m_fsname);
+
+ /*
+ * Skip the printk if we can't print anything useful
+ * due to an over-long device name.
+ */
+ if (len >= sizeof(message))
+ goto out;
+ }
+
+ len = vsnprintf(message + len, sizeof(message) - len, fmt, ap);
if (len >= sizeof(message))
len = sizeof(message) - 1;
if (message[len-1] == '\n')
message[len-1] = 0;
+
printk("%s%s\n", err_level[level], message);
+ out:
spin_unlock_irqrestore(&xfs_err_lock,flags);
+
BUG_ON(level == CE_PANIC);
}
@@ -84,5 +111,5 @@ assfail(char *expr, char *file, int line)
void
xfs_hex_dump(void *p, int length)
{
- print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_OFFSET, 16, 1, p, length, 1);
+ print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1);
}
diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h
index 75845f950814..6f4fd37c67af 100644
--- a/fs/xfs/support/debug.h
+++ b/fs/xfs/support/debug.h
@@ -27,8 +27,6 @@
#define CE_ALERT 1 /* alert */
#define CE_PANIC 0 /* panic */
-extern void icmn_err(int, char *, va_list)
- __attribute__ ((format (printf, 2, 0)));
extern void cmn_err(int, char *, ...)
__attribute__ ((format (printf, 2, 3)));
extern void assfail(char *expr, char *f, int l);
diff --git a/fs/xfs/support/ktrace.c b/fs/xfs/support/ktrace.c
index a34ef05489b1..2d494c26717f 100644
--- a/fs/xfs/support/ktrace.c
+++ b/fs/xfs/support/ktrace.c
@@ -113,21 +113,16 @@ ktrace_alloc(int nentries, unsigned int __nocast sleep)
void
ktrace_free(ktrace_t *ktp)
{
- int entries_size;
-
if (ktp == (ktrace_t *)NULL)
return;
/*
* Special treatment for the Vnode trace buffer.
*/
- if (ktp->kt_nentries == ktrace_zentries) {
+ if (ktp->kt_nentries == ktrace_zentries)
kmem_zone_free(ktrace_ent_zone, ktp->kt_entries);
- } else {
- entries_size = (int)(ktp->kt_nentries * sizeof(ktrace_entry_t));
-
+ else
kmem_free(ktp->kt_entries);
- }
kmem_zone_free(ktrace_hdr_zone, ktp);
}
diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h
index 540e4c989825..17254b529c54 100644
--- a/fs/xfs/xfs.h
+++ b/fs/xfs/xfs.h
@@ -30,7 +30,7 @@
#define XFS_ATTR_TRACE 1
#define XFS_BLI_TRACE 1
#define XFS_BMAP_TRACE 1
-#define XFS_BMBT_TRACE 1
+#define XFS_BTREE_TRACE 1
#define XFS_DIR2_TRACE 1
#define XFS_DQUOT_TRACE 1
#define XFS_ILOCK_TRACE 1
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 91d69338d3b2..a8cdd73999a4 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -758,7 +758,7 @@ xfs_acl_setmode(
if (gap && nomask)
iattr.ia_mode |= gap->ae_perm << 3;
- return xfs_setattr(XFS_I(vp), &iattr, 0, sys_cred);
+ return xfs_setattr(XFS_I(vp), &iattr, 0);
}
/*
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 61b292a9fb41..f2e21817a226 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -91,6 +91,8 @@ typedef struct xfs_agf {
#define XFS_AGF_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGF_DADDR(mp))
#define XFS_BUF_TO_AGF(bp) ((xfs_agf_t *)XFS_BUF_PTR(bp))
+extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
/*
* Size of the unlinked inode hash table in the agi.
@@ -142,6 +144,9 @@ typedef struct xfs_agi {
#define XFS_AGI_BLOCK(mp) XFS_HDR_BLOCK(mp, XFS_AGI_DADDR(mp))
#define XFS_BUF_TO_AGI(bp) ((xfs_agi_t *)XFS_BUF_PTR(bp))
+extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
+ xfs_agnumber_t agno, struct xfs_buf **bpp);
+
/*
* The third a.g. block contains the a.g. freelist, an array
* of block pointers to blocks owned by the allocation btree code.
@@ -192,17 +197,23 @@ typedef struct xfs_perag
xfs_agino_t pagi_freecount; /* number of free inodes */
xfs_agino_t pagi_count; /* number of allocated inodes */
int pagb_count; /* pagb slots in use */
+ xfs_perag_busy_t *pagb_list; /* unstable blocks */
#ifdef __KERNEL__
spinlock_t pagb_lock; /* lock for pagb_list */
-#endif
- xfs_perag_busy_t *pagb_list; /* unstable blocks */
+
atomic_t pagf_fstrms; /* # of filestreams active in this AG */
int pag_ici_init; /* incore inode cache initialised */
rwlock_t pag_ici_lock; /* incore inode lock */
struct radix_tree_root pag_ici_root; /* incore inode cache root */
+#endif
} xfs_perag_t;
+/*
+ * tags for inode radix tree
+ */
+#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */
+
#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels)
#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \
(MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp)))
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 1956f83489f1..028e44e58ea9 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -90,6 +90,92 @@ STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
*/
/*
+ * Lookup the record equal to [bno, len] in the btree given by cur.
+ */
+STATIC int /* error */
+xfs_alloc_lookup_eq(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t bno, /* starting block of extent */
+ xfs_extlen_t len, /* length of extent */
+ int *stat) /* success/failure */
+{
+ cur->bc_rec.a.ar_startblock = bno;
+ cur->bc_rec.a.ar_blockcount = len;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+
+/*
+ * Lookup the first record greater than or equal to [bno, len]
+ * in the btree given by cur.
+ */
+STATIC int /* error */
+xfs_alloc_lookup_ge(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t bno, /* starting block of extent */
+ xfs_extlen_t len, /* length of extent */
+ int *stat) /* success/failure */
+{
+ cur->bc_rec.a.ar_startblock = bno;
+ cur->bc_rec.a.ar_blockcount = len;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+}
+
+/*
+ * Lookup the first record less than or equal to [bno, len]
+ * in the btree given by cur.
+ */
+STATIC int /* error */
+xfs_alloc_lookup_le(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t bno, /* starting block of extent */
+ xfs_extlen_t len, /* length of extent */
+ int *stat) /* success/failure */
+{
+ cur->bc_rec.a.ar_startblock = bno;
+ cur->bc_rec.a.ar_blockcount = len;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+}
+
+/*
+ * Update the record referred to by cur to the value given
+ * by [bno, len].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int /* error */
+xfs_alloc_update(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t bno, /* starting block of extent */
+ xfs_extlen_t len) /* length of extent */
+{
+ union xfs_btree_rec rec;
+
+ rec.alloc.ar_startblock = cpu_to_be32(bno);
+ rec.alloc.ar_blockcount = cpu_to_be32(len);
+ return xfs_btree_update(cur, &rec);
+}
+
+/*
+ * Get the data from the pointed-to record.
+ */
+STATIC int /* error */
+xfs_alloc_get_rec(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t *bno, /* output: starting block of extent */
+ xfs_extlen_t *len, /* output: length of extent */
+ int *stat) /* output: success/failure */
+{
+ union xfs_btree_rec *rec;
+ int error;
+
+ error = xfs_btree_get_rec(cur, &rec, stat);
+ if (!error && *stat == 1) {
+ *bno = be32_to_cpu(rec->alloc.ar_startblock);
+ *len = be32_to_cpu(rec->alloc.ar_blockcount);
+ }
+ return error;
+}
+
+/*
* Compute aligned version of the found extent.
* Takes alignment and min length into account.
*/
@@ -294,21 +380,20 @@ xfs_alloc_fixup_trees(
return error;
XFS_WANT_CORRUPTED_RETURN(i == 1);
}
+
#ifdef DEBUG
- {
- xfs_alloc_block_t *bnoblock;
- xfs_alloc_block_t *cntblock;
-
- if (bno_cur->bc_nlevels == 1 &&
- cnt_cur->bc_nlevels == 1) {
- bnoblock = XFS_BUF_TO_ALLOC_BLOCK(bno_cur->bc_bufs[0]);
- cntblock = XFS_BUF_TO_ALLOC_BLOCK(cnt_cur->bc_bufs[0]);
- XFS_WANT_CORRUPTED_RETURN(
- be16_to_cpu(bnoblock->bb_numrecs) ==
- be16_to_cpu(cntblock->bb_numrecs));
- }
+ if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
+ struct xfs_btree_block *bnoblock;
+ struct xfs_btree_block *cntblock;
+
+ bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_bufs[0]);
+ cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_bufs[0]);
+
+ XFS_WANT_CORRUPTED_RETURN(
+ bnoblock->bb_numrecs == cntblock->bb_numrecs);
}
#endif
+
/*
* Deal with all four cases: the allocated record is contained
* within the freespace record, so we can have new freespace
@@ -333,7 +418,7 @@ xfs_alloc_fixup_trees(
/*
* Delete the entry from the by-size btree.
*/
- if ((error = xfs_alloc_delete(cnt_cur, &i)))
+ if ((error = xfs_btree_delete(cnt_cur, &i)))
return error;
XFS_WANT_CORRUPTED_RETURN(i == 1);
/*
@@ -343,7 +428,7 @@ xfs_alloc_fixup_trees(
if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
return error;
XFS_WANT_CORRUPTED_RETURN(i == 0);
- if ((error = xfs_alloc_insert(cnt_cur, &i)))
+ if ((error = xfs_btree_insert(cnt_cur, &i)))
return error;
XFS_WANT_CORRUPTED_RETURN(i == 1);
}
@@ -351,7 +436,7 @@ xfs_alloc_fixup_trees(
if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
return error;
XFS_WANT_CORRUPTED_RETURN(i == 0);
- if ((error = xfs_alloc_insert(cnt_cur, &i)))
+ if ((error = xfs_btree_insert(cnt_cur, &i)))
return error;
XFS_WANT_CORRUPTED_RETURN(i == 1);
}
@@ -362,7 +447,7 @@ xfs_alloc_fixup_trees(
/*
* No remaining freespace, just delete the by-block tree entry.
*/
- if ((error = xfs_alloc_delete(bno_cur, &i)))
+ if ((error = xfs_btree_delete(bno_cur, &i)))
return error;
XFS_WANT_CORRUPTED_RETURN(i == 1);
} else {
@@ -379,7 +464,7 @@ xfs_alloc_fixup_trees(
if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
return error;
XFS_WANT_CORRUPTED_RETURN(i == 0);
- if ((error = xfs_alloc_insert(bno_cur, &i)))
+ if ((error = xfs_btree_insert(bno_cur, &i)))
return error;
XFS_WANT_CORRUPTED_RETURN(i == 1);
}
@@ -640,8 +725,8 @@ xfs_alloc_ag_vextent_exact(
/*
* Allocate/initialize a cursor for the by-number freespace btree.
*/
- bno_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
- args->agno, XFS_BTNUM_BNO, NULL, 0);
+ bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+ args->agno, XFS_BTNUM_BNO);
/*
* Lookup bno and minlen in the btree (minlen is irrelevant, really).
* Look for the closest free block <= bno, it must contain bno
@@ -696,8 +781,8 @@ xfs_alloc_ag_vextent_exact(
* We are allocating agbno for rlen [agbno .. end]
* Allocate/initialize a cursor for the by-size btree.
*/
- cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
- args->agno, XFS_BTNUM_CNT, NULL, 0);
+ cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+ args->agno, XFS_BTNUM_CNT);
ASSERT(args->agbno + args->len <=
be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length));
if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
@@ -759,8 +844,8 @@ xfs_alloc_ag_vextent_near(
/*
* Get a cursor for the by-size btree.
*/
- cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
- args->agno, XFS_BTNUM_CNT, NULL, 0);
+ cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+ args->agno, XFS_BTNUM_CNT);
ltlen = 0;
bno_cur_lt = bno_cur_gt = NULL;
/*
@@ -818,7 +903,7 @@ xfs_alloc_ag_vextent_near(
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
if (ltlen >= args->minlen)
break;
- if ((error = xfs_alloc_increment(cnt_cur, 0, &i)))
+ if ((error = xfs_btree_increment(cnt_cur, 0, &i)))
goto error0;
} while (i);
ASSERT(ltlen >= args->minlen);
@@ -828,7 +913,7 @@ xfs_alloc_ag_vextent_near(
i = cnt_cur->bc_ptrs[0];
for (j = 1, blen = 0, bdiff = 0;
!error && j && (blen < args->maxlen || bdiff > 0);
- error = xfs_alloc_increment(cnt_cur, 0, &j)) {
+ error = xfs_btree_increment(cnt_cur, 0, &j)) {
/*
* For each entry, decide if it's better than
* the previous best entry.
@@ -886,8 +971,8 @@ xfs_alloc_ag_vextent_near(
/*
* Set up a cursor for the by-bno tree.
*/
- bno_cur_lt = xfs_btree_init_cursor(args->mp, args->tp,
- args->agbp, args->agno, XFS_BTNUM_BNO, NULL, 0);
+ bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp,
+ args->agbp, args->agno, XFS_BTNUM_BNO);
/*
* Fix up the btree entries.
*/
@@ -914,8 +999,8 @@ xfs_alloc_ag_vextent_near(
/*
* Allocate and initialize the cursor for the leftward search.
*/
- bno_cur_lt = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
- args->agno, XFS_BTNUM_BNO, NULL, 0);
+ bno_cur_lt = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+ args->agno, XFS_BTNUM_BNO);
/*
* Lookup <= bno to find the leftward search's starting point.
*/
@@ -938,7 +1023,7 @@ xfs_alloc_ag_vextent_near(
* Increment the cursor, so we will point at the entry just right
* of the leftward entry if any, or to the leftmost entry.
*/
- if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i)))
+ if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
goto error0;
if (!i) {
/*
@@ -961,7 +1046,7 @@ xfs_alloc_ag_vextent_near(
args->minlen, &ltbnoa, &ltlena);
if (ltlena >= args->minlen)
break;
- if ((error = xfs_alloc_decrement(bno_cur_lt, 0, &i)))
+ if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i)))
goto error0;
if (!i) {
xfs_btree_del_cursor(bno_cur_lt,
@@ -977,7 +1062,7 @@ xfs_alloc_ag_vextent_near(
args->minlen, &gtbnoa, &gtlena);
if (gtlena >= args->minlen)
break;
- if ((error = xfs_alloc_increment(bno_cur_gt, 0, &i)))
+ if ((error = xfs_btree_increment(bno_cur_gt, 0, &i)))
goto error0;
if (!i) {
xfs_btree_del_cursor(bno_cur_gt,
@@ -1066,7 +1151,7 @@ xfs_alloc_ag_vextent_near(
/*
* Fell off the right end.
*/
- if ((error = xfs_alloc_increment(
+ if ((error = xfs_btree_increment(
bno_cur_gt, 0, &i)))
goto error0;
if (!i) {
@@ -1162,7 +1247,7 @@ xfs_alloc_ag_vextent_near(
/*
* Fell off the left end.
*/
- if ((error = xfs_alloc_decrement(
+ if ((error = xfs_btree_decrement(
bno_cur_lt, 0, &i)))
goto error0;
if (!i) {
@@ -1267,8 +1352,8 @@ xfs_alloc_ag_vextent_size(
/*
* Allocate and initialize a cursor for the by-size btree.
*/
- cnt_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
- args->agno, XFS_BTNUM_CNT, NULL, 0);
+ cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+ args->agno, XFS_BTNUM_CNT);
bno_cur = NULL;
/*
* Look for an entry >= maxlen+alignment-1 blocks.
@@ -1321,7 +1406,7 @@ xfs_alloc_ag_vextent_size(
bestflen = flen;
bestfbno = fbno;
for (;;) {
- if ((error = xfs_alloc_decrement(cnt_cur, 0, &i)))
+ if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
goto error0;
if (i == 0)
break;
@@ -1372,8 +1457,8 @@ xfs_alloc_ag_vextent_size(
/*
* Allocate and initialize a cursor for the by-block tree.
*/
- bno_cur = xfs_btree_init_cursor(args->mp, args->tp, args->agbp,
- args->agno, XFS_BTNUM_BNO, NULL, 0);
+ bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
+ args->agno, XFS_BTNUM_BNO);
if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
rbno, rlen, XFSA_FIXUP_CNT_OK)))
goto error0;
@@ -1416,7 +1501,7 @@ xfs_alloc_ag_vextent_small(
xfs_extlen_t flen;
int i;
- if ((error = xfs_alloc_decrement(ccur, 0, &i)))
+ if ((error = xfs_btree_decrement(ccur, 0, &i)))
goto error0;
if (i) {
if ((error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i)))
@@ -1515,8 +1600,7 @@ xfs_free_ag_extent(
/*
* Allocate and initialize a cursor for the by-block btree.
*/
- bno_cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO, NULL,
- 0);
+ bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_BNO);
cnt_cur = NULL;
/*
* Look for a neighboring block on the left (lower block numbers)
@@ -1549,7 +1633,7 @@ xfs_free_ag_extent(
* Look for a neighboring block on the right (higher block numbers)
* that is contiguous with this space.
*/
- if ((error = xfs_alloc_increment(bno_cur, 0, &haveright)))
+ if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
goto error0;
if (haveright) {
/*
@@ -1575,8 +1659,7 @@ xfs_free_ag_extent(
/*
* Now allocate and initialize a cursor for the by-size tree.
*/
- cnt_cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT, NULL,
- 0);
+ cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_CNT);
/*
* Have both left and right contiguous neighbors.
* Merge all three into a single free block.
@@ -1588,7 +1671,7 @@ xfs_free_ag_extent(
if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if ((error = xfs_alloc_delete(cnt_cur, &i)))
+ if ((error = xfs_btree_delete(cnt_cur, &i)))
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
/*
@@ -1597,19 +1680,19 @@ xfs_free_ag_extent(
if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if ((error = xfs_alloc_delete(cnt_cur, &i)))
+ if ((error = xfs_btree_delete(cnt_cur, &i)))
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
/*
* Delete the old by-block entry for the right block.
*/
- if ((error = xfs_alloc_delete(bno_cur, &i)))
+ if ((error = xfs_btree_delete(bno_cur, &i)))
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
/*
* Move the by-block cursor back to the left neighbor.
*/
- if ((error = xfs_alloc_decrement(bno_cur, 0, &i)))
+ if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
#ifdef DEBUG
@@ -1648,14 +1731,14 @@ xfs_free_ag_extent(
if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if ((error = xfs_alloc_delete(cnt_cur, &i)))
+ if ((error = xfs_btree_delete(cnt_cur, &i)))
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
/*
* Back up the by-block cursor to the left neighbor, and
* update its length.
*/
- if ((error = xfs_alloc_decrement(bno_cur, 0, &i)))
+ if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
nbno = ltbno;
@@ -1674,7 +1757,7 @@ xfs_free_ag_extent(
if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if ((error = xfs_alloc_delete(cnt_cur, &i)))
+ if ((error = xfs_btree_delete(cnt_cur, &i)))
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
/*
@@ -1693,7 +1776,7 @@ xfs_free_ag_extent(
else {
nbno = bno;
nlen = len;
- if ((error = xfs_alloc_insert(bno_cur, &i)))
+ if ((error = xfs_btree_insert(bno_cur, &i)))
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
}
@@ -1705,7 +1788,7 @@ xfs_free_ag_extent(
if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 0, error0);
- if ((error = xfs_alloc_insert(cnt_cur, &i)))
+ if ((error = xfs_btree_insert(cnt_cur, &i)))
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
@@ -2150,51 +2233,83 @@ xfs_alloc_put_freelist(
* Read in the allocation group header (free/alloc section).
*/
int /* error */
-xfs_alloc_read_agf(
- xfs_mount_t *mp, /* mount point structure */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- int flags, /* XFS_ALLOC_FLAG_... */
- xfs_buf_t **bpp) /* buffer for the ag freelist header */
+xfs_read_agf(
+ struct xfs_mount *mp, /* mount point structure */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_agnumber_t agno, /* allocation group number */
+ int flags, /* XFS_BUF_ */
+ struct xfs_buf **bpp) /* buffer for the ag freelist header */
{
- xfs_agf_t *agf; /* ag freelist header */
+ struct xfs_agf *agf; /* ag freelist header */
int agf_ok; /* set if agf is consistent */
- xfs_buf_t *bp; /* return value */
- xfs_perag_t *pag; /* per allocation group data */
int error;
ASSERT(agno != NULLAGNUMBER);
error = xfs_trans_read_buf(
mp, tp, mp->m_ddev_targp,
XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1),
- (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XFS_BUF_TRYLOCK : 0U,
- &bp);
+ XFS_FSS_TO_BB(mp, 1), flags, bpp);
if (error)
return error;
- ASSERT(!bp || !XFS_BUF_GETERROR(bp));
- if (!bp) {
- *bpp = NULL;
+ if (!*bpp)
return 0;
- }
+
+ ASSERT(!XFS_BUF_GETERROR(*bpp));
+ agf = XFS_BUF_TO_AGF(*bpp);
+
/*
* Validate the magic number of the agf block.
*/
- agf = XFS_BUF_TO_AGF(bp);
agf_ok =
be32_to_cpu(agf->agf_magicnum) == XFS_AGF_MAGIC &&
XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
- be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp);
+ be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp) &&
+ be32_to_cpu(agf->agf_seqno) == agno;
+ if (xfs_sb_version_haslazysbcount(&mp->m_sb))
+ agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <=
+ be32_to_cpu(agf->agf_length);
if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
XFS_RANDOM_ALLOC_READ_AGF))) {
XFS_CORRUPTION_ERROR("xfs_alloc_read_agf",
XFS_ERRLEVEL_LOW, mp, agf);
- xfs_trans_brelse(tp, bp);
+ xfs_trans_brelse(tp, *bpp);
return XFS_ERROR(EFSCORRUPTED);
}
+
+ XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGF, XFS_AGF_REF);
+ return 0;
+}
+
+/*
+ * Read in the allocation group header (free/alloc section).
+ */
+int /* error */
+xfs_alloc_read_agf(
+ struct xfs_mount *mp, /* mount point structure */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_agnumber_t agno, /* allocation group number */
+ int flags, /* XFS_ALLOC_FLAG_... */
+ struct xfs_buf **bpp) /* buffer for the ag freelist header */
+{
+ struct xfs_agf *agf; /* ag freelist header */
+ struct xfs_perag *pag; /* per allocation group data */
+ int error;
+
+ ASSERT(agno != NULLAGNUMBER);
+
+ error = xfs_read_agf(mp, tp, agno,
+ (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XFS_BUF_TRYLOCK : 0,
+ bpp);
+ if (error)
+ return error;
+ if (!*bpp)
+ return 0;
+ ASSERT(!XFS_BUF_GETERROR(*bpp));
+
+ agf = XFS_BUF_TO_AGF(*bpp);
pag = &mp->m_perag[agno];
if (!pag->pagf_init) {
pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
@@ -2213,6 +2328,7 @@ xfs_alloc_read_agf(
#ifdef DEBUG
else if (!XFS_FORCED_SHUTDOWN(mp)) {
ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
+ ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
@@ -2221,8 +2337,6 @@ xfs_alloc_read_agf(
be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
}
#endif
- XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGF, XFS_AGF_REF);
- *bpp = bp;
return 0;
}
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 5aec15d0651e..588172796f7b 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -121,6 +121,19 @@ extern ktrace_t *xfs_alloc_trace_buf;
#define XFS_ALLOC_KTRACE_BUSYSEARCH 6
#endif
+void
+xfs_alloc_mark_busy(xfs_trans_t *tp,
+ xfs_agnumber_t agno,
+ xfs_agblock_t bno,
+ xfs_extlen_t len);
+
+void
+xfs_alloc_clear_busy(xfs_trans_t *tp,
+ xfs_agnumber_t ag,
+ int idx);
+
+#endif /* __KERNEL__ */
+
/*
* Compute and fill in value of m_ag_maxlevels.
*/
@@ -196,18 +209,4 @@ xfs_free_extent(
xfs_fsblock_t bno, /* starting block number of extent */
xfs_extlen_t len); /* length of extent */
-void
-xfs_alloc_mark_busy(xfs_trans_t *tp,
- xfs_agnumber_t agno,
- xfs_agblock_t bno,
- xfs_extlen_t len);
-
-void
-xfs_alloc_clear_busy(xfs_trans_t *tp,
- xfs_agnumber_t ag,
- int idx);
-
-
-#endif /* __KERNEL__ */
-
#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index 3ce2645508ae..733cb75a8c5d 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -35,2177 +35,464 @@
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
+#include "xfs_btree_trace.h"
#include "xfs_ialloc.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
-/*
- * Prototypes for internal functions.
- */
-STATIC void xfs_alloc_log_block(xfs_trans_t *, xfs_buf_t *, int);
-STATIC void xfs_alloc_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int);
-STATIC void xfs_alloc_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
-STATIC void xfs_alloc_log_recs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
-STATIC int xfs_alloc_lshift(xfs_btree_cur_t *, int, int *);
-STATIC int xfs_alloc_newroot(xfs_btree_cur_t *, int *);
-STATIC int xfs_alloc_rshift(xfs_btree_cur_t *, int, int *);
-STATIC int xfs_alloc_split(xfs_btree_cur_t *, int, xfs_agblock_t *,
- xfs_alloc_key_t *, xfs_btree_cur_t **, int *);
-STATIC int xfs_alloc_updkey(xfs_btree_cur_t *, xfs_alloc_key_t *, int);
+STATIC struct xfs_btree_cur *
+xfs_allocbt_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ return xfs_allocbt_init_cursor(cur->bc_mp, cur->bc_tp,
+ cur->bc_private.a.agbp, cur->bc_private.a.agno,
+ cur->bc_btnum);
+}
-/*
- * Internal functions.
- */
+STATIC void
+xfs_allocbt_set_root(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ int inc)
+{
+ struct xfs_buf *agbp = cur->bc_private.a.agbp;
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
+ int btnum = cur->bc_btnum;
-/*
- * Single level of the xfs_alloc_delete record deletion routine.
- * Delete record pointed to by cur/level.
- * Remove the record from its block then rebalance the tree.
- * Return 0 for error, 1 for done, 2 to go on to the next level.
- */
-STATIC int /* error */
-xfs_alloc_delrec(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level, /* level removing record from */
- int *stat) /* fail/done/go-on */
+ ASSERT(ptr->s != 0);
+
+ agf->agf_roots[btnum] = ptr->s;
+ be32_add_cpu(&agf->agf_levels[btnum], inc);
+ cur->bc_mp->m_perag[seqno].pagf_levels[btnum] += inc;
+
+ xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
+}
+
+STATIC int
+xfs_allocbt_alloc_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int length,
+ int *stat)
{
- xfs_agf_t *agf; /* allocation group freelist header */
- xfs_alloc_block_t *block; /* btree block record/key lives in */
- xfs_agblock_t bno; /* btree block number */
- xfs_buf_t *bp; /* buffer for block */
- int error; /* error return value */
- int i; /* loop index */
- xfs_alloc_key_t key; /* kp points here if block is level 0 */
- xfs_agblock_t lbno; /* left block's block number */
- xfs_buf_t *lbp; /* left block's buffer pointer */
- xfs_alloc_block_t *left; /* left btree block */
- xfs_alloc_key_t *lkp=NULL; /* left block key pointer */
- xfs_alloc_ptr_t *lpp=NULL; /* left block address pointer */
- int lrecs=0; /* number of records in left block */
- xfs_alloc_rec_t *lrp; /* left block record pointer */
- xfs_mount_t *mp; /* mount structure */
- int ptr; /* index in btree block for this rec */
- xfs_agblock_t rbno; /* right block's block number */
- xfs_buf_t *rbp; /* right block's buffer pointer */
- xfs_alloc_block_t *right; /* right btree block */
- xfs_alloc_key_t *rkp; /* right block key pointer */
- xfs_alloc_ptr_t *rpp; /* right block address pointer */
- int rrecs=0; /* number of records in right block */
- int numrecs;
- xfs_alloc_rec_t *rrp; /* right block record pointer */
- xfs_btree_cur_t *tcur; /* temporary btree cursor */
+ int error;
+ xfs_agblock_t bno;
- /*
- * Get the index of the entry being deleted, check for nothing there.
- */
- ptr = cur->bc_ptrs[level];
- if (ptr == 0) {
- *stat = 0;
- return 0;
- }
- /*
- * Get the buffer & block containing the record or key/ptr.
- */
- bp = cur->bc_bufs[level];
- block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+
+ /* Allocate the new block from the freelist. If we can't, give up. */
+ error = xfs_alloc_get_freelist(cur->bc_tp, cur->bc_private.a.agbp,
+ &bno, 1);
+ if (error) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
return error;
-#endif
- /*
- * Fail if we're off the end of the block.
- */
- numrecs = be16_to_cpu(block->bb_numrecs);
- if (ptr > numrecs) {
+ }
+
+ if (bno == NULLAGBLOCK) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
*stat = 0;
return 0;
}
- XFS_STATS_INC(xs_abt_delrec);
- /*
- * It's a nonleaf. Excise the key and ptr being deleted, by
- * sliding the entries past them down one.
- * Log the changed areas of the block.
- */
- if (level > 0) {
- lkp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
- lpp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
-#ifdef DEBUG
- for (i = ptr; i < numrecs; i++) {
- if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(lpp[i]), level)))
- return error;
- }
-#endif
- if (ptr < numrecs) {
- memmove(&lkp[ptr - 1], &lkp[ptr],
- (numrecs - ptr) * sizeof(*lkp));
- memmove(&lpp[ptr - 1], &lpp[ptr],
- (numrecs - ptr) * sizeof(*lpp));
- xfs_alloc_log_ptrs(cur, bp, ptr, numrecs - 1);
- xfs_alloc_log_keys(cur, bp, ptr, numrecs - 1);
- }
- }
- /*
- * It's a leaf. Excise the record being deleted, by sliding the
- * entries past it down one. Log the changed areas of the block.
- */
- else {
- lrp = XFS_ALLOC_REC_ADDR(block, 1, cur);
- if (ptr < numrecs) {
- memmove(&lrp[ptr - 1], &lrp[ptr],
- (numrecs - ptr) * sizeof(*lrp));
- xfs_alloc_log_recs(cur, bp, ptr, numrecs - 1);
- }
- /*
- * If it's the first record in the block, we'll need a key
- * structure to pass up to the next level (updkey).
- */
- if (ptr == 1) {
- key.ar_startblock = lrp->ar_startblock;
- key.ar_blockcount = lrp->ar_blockcount;
- lkp = &key;
- }
- }
- /*
- * Decrement and log the number of entries in the block.
- */
- numrecs--;
- block->bb_numrecs = cpu_to_be16(numrecs);
- xfs_alloc_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
- /*
- * See if the longest free extent in the allocation group was
- * changed by this operation. True if it's the by-size btree, and
- * this is the leaf level, and there is no right sibling block,
- * and this was the last record.
- */
- agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
- mp = cur->bc_mp;
- if (level == 0 &&
- cur->bc_btnum == XFS_BTNUM_CNT &&
- be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK &&
- ptr > numrecs) {
- ASSERT(ptr == numrecs + 1);
- /*
- * There are still records in the block. Grab the size
- * from the last one.
- */
- if (numrecs) {
- rrp = XFS_ALLOC_REC_ADDR(block, numrecs, cur);
- agf->agf_longest = rrp->ar_blockcount;
- }
- /*
- * No free extents left.
- */
- else
- agf->agf_longest = 0;
- mp->m_perag[be32_to_cpu(agf->agf_seqno)].pagf_longest =
- be32_to_cpu(agf->agf_longest);
- xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
- XFS_AGF_LONGEST);
- }
- /*
- * Is this the root level? If so, we're almost done.
- */
- if (level == cur->bc_nlevels - 1) {
- /*
- * If this is the root level,
- * and there's only one entry left,
- * and it's NOT the leaf level,
- * then we can get rid of this level.
- */
- if (numrecs == 1 && level > 0) {
- /*
- * lpp is still set to the first pointer in the block.
- * Make it the new root of the btree.
- */
- bno = be32_to_cpu(agf->agf_roots[cur->bc_btnum]);
- agf->agf_roots[cur->bc_btnum] = *lpp;
- be32_add_cpu(&agf->agf_levels[cur->bc_btnum], -1);
- mp->m_perag[be32_to_cpu(agf->agf_seqno)].pagf_levels[cur->bc_btnum]--;
- /*
- * Put this buffer/block on the ag's freelist.
- */
- error = xfs_alloc_put_freelist(cur->bc_tp,
- cur->bc_private.a.agbp, NULL, bno, 1);
- if (error)
- return error;
- /*
- * Since blocks move to the free list without the
- * coordination used in xfs_bmap_finish, we can't allow
- * block to be available for reallocation and
- * non-transaction writing (user data) until we know
- * that the transaction that moved it to the free list
- * is permanently on disk. We track the blocks by
- * declaring these blocks as "busy"; the busy list is
- * maintained on a per-ag basis and each transaction
- * records which entries should be removed when the
- * iclog commits to disk. If a busy block is
- * allocated, the iclog is pushed up to the LSN
- * that freed the block.
- */
- xfs_alloc_mark_busy(cur->bc_tp,
- be32_to_cpu(agf->agf_seqno), bno, 1);
+ xfs_trans_agbtree_delta(cur->bc_tp, 1);
+ new->s = cpu_to_be32(bno);
- xfs_trans_agbtree_delta(cur->bc_tp, -1);
- xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
- XFS_AGF_ROOTS | XFS_AGF_LEVELS);
- /*
- * Update the cursor so there's one fewer level.
- */
- xfs_btree_setbuf(cur, level, NULL);
- cur->bc_nlevels--;
- } else if (level > 0 &&
- (error = xfs_alloc_decrement(cur, level, &i)))
- return error;
- *stat = 1;
- return 0;
- }
- /*
- * If we deleted the leftmost entry in the block, update the
- * key values above us in the tree.
- */
- if (ptr == 1 && (error = xfs_alloc_updkey(cur, lkp, level + 1)))
- return error;
- /*
- * If the number of records remaining in the block is at least
- * the minimum, we're done.
- */
- if (numrecs >= XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
- if (level > 0 && (error = xfs_alloc_decrement(cur, level, &i)))
- return error;
- *stat = 1;
- return 0;
- }
- /*
- * Otherwise, we have to move some records around to keep the
- * tree balanced. Look at the left and right sibling blocks to
- * see if we can re-balance by moving only one record.
- */
- rbno = be32_to_cpu(block->bb_rightsib);
- lbno = be32_to_cpu(block->bb_leftsib);
- bno = NULLAGBLOCK;
- ASSERT(rbno != NULLAGBLOCK || lbno != NULLAGBLOCK);
- /*
- * Duplicate the cursor so our btree manipulations here won't
- * disrupt the next level up.
- */
- if ((error = xfs_btree_dup_cursor(cur, &tcur)))
- return error;
- /*
- * If there's a right sibling, see if it's ok to shift an entry
- * out of it.
- */
- if (rbno != NULLAGBLOCK) {
- /*
- * Move the temp cursor to the last entry in the next block.
- * Actually any entry but the first would suffice.
- */
- i = xfs_btree_lastrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if ((error = xfs_alloc_increment(tcur, level, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- i = xfs_btree_lastrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- /*
- * Grab a pointer to the block.
- */
- rbp = tcur->bc_bufs[level];
- right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
- goto error0;
-#endif
- /*
- * Grab the current block number, for future use.
- */
- bno = be32_to_cpu(right->bb_leftsib);
- /*
- * If right block is full enough so that removing one entry
- * won't make it too empty, and left-shifting an entry out
- * of right to us works, we're done.
- */
- if (be16_to_cpu(right->bb_numrecs) - 1 >=
- XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
- if ((error = xfs_alloc_lshift(tcur, level, &i)))
- goto error0;
- if (i) {
- ASSERT(be16_to_cpu(block->bb_numrecs) >=
- XFS_ALLOC_BLOCK_MINRECS(level, cur));
- xfs_btree_del_cursor(tcur,
- XFS_BTREE_NOERROR);
- if (level > 0 &&
- (error = xfs_alloc_decrement(cur, level,
- &i)))
- return error;
- *stat = 1;
- return 0;
- }
- }
- /*
- * Otherwise, grab the number of records in right for
- * future reference, and fix up the temp cursor to point
- * to our block again (last record).
- */
- rrecs = be16_to_cpu(right->bb_numrecs);
- if (lbno != NULLAGBLOCK) {
- i = xfs_btree_firstrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if ((error = xfs_alloc_decrement(tcur, level, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- }
- }
- /*
- * If there's a left sibling, see if it's ok to shift an entry
- * out of it.
- */
- if (lbno != NULLAGBLOCK) {
- /*
- * Move the temp cursor to the first entry in the
- * previous block.
- */
- i = xfs_btree_firstrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if ((error = xfs_alloc_decrement(tcur, level, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- xfs_btree_firstrec(tcur, level);
- /*
- * Grab a pointer to the block.
- */
- lbp = tcur->bc_bufs[level];
- left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
- goto error0;
-#endif
- /*
- * Grab the current block number, for future use.
- */
- bno = be32_to_cpu(left->bb_rightsib);
- /*
- * If left block is full enough so that removing one entry
- * won't make it too empty, and right-shifting an entry out
- * of left to us works, we're done.
- */
- if (be16_to_cpu(left->bb_numrecs) - 1 >=
- XFS_ALLOC_BLOCK_MINRECS(level, cur)) {
- if ((error = xfs_alloc_rshift(tcur, level, &i)))
- goto error0;
- if (i) {
- ASSERT(be16_to_cpu(block->bb_numrecs) >=
- XFS_ALLOC_BLOCK_MINRECS(level, cur));
- xfs_btree_del_cursor(tcur,
- XFS_BTREE_NOERROR);
- if (level == 0)
- cur->bc_ptrs[0]++;
- *stat = 1;
- return 0;
- }
- }
- /*
- * Otherwise, grab the number of records in right for
- * future reference.
- */
- lrecs = be16_to_cpu(left->bb_numrecs);
- }
- /*
- * Delete the temp cursor, we're done with it.
- */
- xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
- /*
- * If here, we need to do a join to keep the tree balanced.
- */
- ASSERT(bno != NULLAGBLOCK);
- /*
- * See if we can join with the left neighbor block.
- */
- if (lbno != NULLAGBLOCK &&
- lrecs + numrecs <= XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
- /*
- * Set "right" to be the starting block,
- * "left" to be the left neighbor.
- */
- rbno = bno;
- right = block;
- rrecs = be16_to_cpu(right->bb_numrecs);
- rbp = bp;
- if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
- cur->bc_private.a.agno, lbno, 0, &lbp,
- XFS_ALLOC_BTREE_REF)))
- return error;
- left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
- lrecs = be16_to_cpu(left->bb_numrecs);
- if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
- return error;
- }
- /*
- * If that won't work, see if we can join with the right neighbor block.
- */
- else if (rbno != NULLAGBLOCK &&
- rrecs + numrecs <= XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
- /*
- * Set "left" to be the starting block,
- * "right" to be the right neighbor.
- */
- lbno = bno;
- left = block;
- lrecs = be16_to_cpu(left->bb_numrecs);
- lbp = bp;
- if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
- cur->bc_private.a.agno, rbno, 0, &rbp,
- XFS_ALLOC_BTREE_REF)))
- return error;
- right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
- rrecs = be16_to_cpu(right->bb_numrecs);
- if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
- return error;
- }
- /*
- * Otherwise, we can't fix the imbalance.
- * Just return. This is probably a logic error, but it's not fatal.
- */
- else {
- if (level > 0 && (error = xfs_alloc_decrement(cur, level, &i)))
- return error;
- *stat = 1;
- return 0;
- }
- /*
- * We're now going to join "left" and "right" by moving all the stuff
- * in "right" to "left" and deleting "right".
- */
- if (level > 0) {
- /*
- * It's a non-leaf. Move keys and pointers.
- */
- lkp = XFS_ALLOC_KEY_ADDR(left, lrecs + 1, cur);
- lpp = XFS_ALLOC_PTR_ADDR(left, lrecs + 1, cur);
- rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
- rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
-#ifdef DEBUG
- for (i = 0; i < rrecs; i++) {
- if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level)))
- return error;
- }
-#endif
- memcpy(lkp, rkp, rrecs * sizeof(*lkp));
- memcpy(lpp, rpp, rrecs * sizeof(*lpp));
- xfs_alloc_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
- xfs_alloc_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
- } else {
- /*
- * It's a leaf. Move records.
- */
- lrp = XFS_ALLOC_REC_ADDR(left, lrecs + 1, cur);
- rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
- memcpy(lrp, rrp, rrecs * sizeof(*lrp));
- xfs_alloc_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
- }
- /*
- * If we joined with the left neighbor, set the buffer in the
- * cursor to the left block, and fix up the index.
- */
- if (bp != lbp) {
- xfs_btree_setbuf(cur, level, lbp);
- cur->bc_ptrs[level] += lrecs;
- }
- /*
- * If we joined with the right neighbor and there's a level above
- * us, increment the cursor at that level.
- */
- else if (level + 1 < cur->bc_nlevels &&
- (error = xfs_alloc_increment(cur, level + 1, &i)))
- return error;
- /*
- * Fix up the number of records in the surviving block.
- */
- lrecs += rrecs;
- left->bb_numrecs = cpu_to_be16(lrecs);
- /*
- * Fix up the right block pointer in the surviving block, and log it.
- */
- left->bb_rightsib = right->bb_rightsib;
- xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
- /*
- * If there is a right sibling now, make it point to the
- * remaining block.
- */
- if (be32_to_cpu(left->bb_rightsib) != NULLAGBLOCK) {
- xfs_alloc_block_t *rrblock;
- xfs_buf_t *rrbp;
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 1;
+ return 0;
+}
- if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
- cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib), 0,
- &rrbp, XFS_ALLOC_BTREE_REF)))
- return error;
- rrblock = XFS_BUF_TO_ALLOC_BLOCK(rrbp);
- if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
- return error;
- rrblock->bb_leftsib = cpu_to_be32(lbno);
- xfs_alloc_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB);
- }
- /*
- * Free the deleting block by putting it on the freelist.
- */
- error = xfs_alloc_put_freelist(cur->bc_tp,
- cur->bc_private.a.agbp, NULL, rbno, 1);
+STATIC int
+xfs_allocbt_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ struct xfs_buf *agbp = cur->bc_private.a.agbp;
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ xfs_agblock_t bno;
+ int error;
+
+ bno = XFS_DADDR_TO_AGBNO(cur->bc_mp, XFS_BUF_ADDR(bp));
+ error = xfs_alloc_put_freelist(cur->bc_tp, agbp, NULL, bno, 1);
if (error)
return error;
+
/*
- * Since blocks move to the free list without the coordination
- * used in xfs_bmap_finish, we can't allow block to be available
- * for reallocation and non-transaction writing (user data)
- * until we know that the transaction that moved it to the free
- * list is permanently on disk. We track the blocks by declaring
- * these blocks as "busy"; the busy list is maintained on a
- * per-ag basis and each transaction records which entries
- * should be removed when the iclog commits to disk. If a
- * busy block is allocated, the iclog is pushed up to the
+ * Since blocks move to the free list without the coordination used in
+ * xfs_bmap_finish, we can't allow block to be available for
+ * reallocation and non-transaction writing (user data) until we know
+ * that the transaction that moved it to the free list is permanently
+ * on disk. We track the blocks by declaring these blocks as "busy";
+ * the busy list is maintained on a per-ag basis and each transaction
+ * records which entries should be removed when the iclog commits to
+ * disk. If a busy block is allocated, the iclog is pushed up to the
* LSN that freed the block.
*/
xfs_alloc_mark_busy(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
xfs_trans_agbtree_delta(cur->bc_tp, -1);
-
- /*
- * Adjust the current level's cursor so that we're left referring
- * to the right node, after we're done.
- * If this leaves the ptr value 0 our caller will fix it up.
- */
- if (level > 0)
- cur->bc_ptrs[level]--;
- /*
- * Return value means the next level up has something to do.
- */
- *stat = 2;
return 0;
-
-error0:
- xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
- return error;
}
/*
- * Insert one record/level. Return information to the caller
- * allowing the next level up to proceed if necessary.
+ * Update the longest extent in the AGF
*/
-STATIC int /* error */
-xfs_alloc_insrec(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level, /* level to insert record at */
- xfs_agblock_t *bnop, /* i/o: block number inserted */
- xfs_alloc_rec_t *recp, /* i/o: record data inserted */
- xfs_btree_cur_t **curp, /* output: new cursor replacing cur */
- int *stat) /* output: success/failure */
+STATIC void
+xfs_allocbt_update_lastrec(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ union xfs_btree_rec *rec,
+ int ptr,
+ int reason)
{
- xfs_agf_t *agf; /* allocation group freelist header */
- xfs_alloc_block_t *block; /* btree block record/key lives in */
- xfs_buf_t *bp; /* buffer for block */
- int error; /* error return value */
- int i; /* loop index */
- xfs_alloc_key_t key; /* key value being inserted */
- xfs_alloc_key_t *kp; /* pointer to btree keys */
- xfs_agblock_t nbno; /* block number of allocated block */
- xfs_btree_cur_t *ncur; /* new cursor to be used at next lvl */
- xfs_alloc_key_t nkey; /* new key value, from split */
- xfs_alloc_rec_t nrec; /* new record value, for caller */
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+ xfs_agnumber_t seqno = be32_to_cpu(agf->agf_seqno);
+ __be32 len;
int numrecs;
- int optr; /* old ptr value */
- xfs_alloc_ptr_t *pp; /* pointer to btree addresses */
- int ptr; /* index in btree block for this rec */
- xfs_alloc_rec_t *rp; /* pointer to btree records */
- ASSERT(be32_to_cpu(recp->ar_blockcount) > 0);
+ ASSERT(cur->bc_btnum == XFS_BTNUM_CNT);
+
+ switch (reason) {
+ case LASTREC_UPDATE:
+ /*
+ * If this is the last leaf block and it's the last record,
+ * then update the size of the longest extent in the AG.
+ */
+ if (ptr != xfs_btree_get_numrecs(block))
+ return;
+ len = rec->alloc.ar_blockcount;
+ break;
+ case LASTREC_INSREC:
+ if (be32_to_cpu(rec->alloc.ar_blockcount) <=
+ be32_to_cpu(agf->agf_longest))
+ return;
+ len = rec->alloc.ar_blockcount;
+ break;
+ case LASTREC_DELREC:
+ numrecs = xfs_btree_get_numrecs(block);
+ if (ptr <= numrecs)
+ return;
+ ASSERT(ptr == numrecs + 1);
- /*
- * GCC doesn't understand the (arguably complex) control flow in
- * this function and complains about uninitialized structure fields
- * without this.
- */
- memset(&nrec, 0, sizeof(nrec));
+ if (numrecs) {
+ xfs_alloc_rec_t *rrp;
- /*
- * If we made it to the root level, allocate a new root block
- * and we're done.
- */
- if (level >= cur->bc_nlevels) {
- XFS_STATS_INC(xs_abt_insrec);
- if ((error = xfs_alloc_newroot(cur, &i)))
- return error;
- *bnop = NULLAGBLOCK;
- *stat = i;
- return 0;
- }
- /*
- * Make a key out of the record data to be inserted, and save it.
- */
- key.ar_startblock = recp->ar_startblock;
- key.ar_blockcount = recp->ar_blockcount;
- optr = ptr = cur->bc_ptrs[level];
- /*
- * If we're off the left edge, return failure.
- */
- if (ptr == 0) {
- *stat = 0;
- return 0;
- }
- XFS_STATS_INC(xs_abt_insrec);
- /*
- * Get pointers to the btree buffer and block.
- */
- bp = cur->bc_bufs[level];
- block = XFS_BUF_TO_ALLOC_BLOCK(bp);
- numrecs = be16_to_cpu(block->bb_numrecs);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
- return error;
- /*
- * Check that the new entry is being inserted in the right place.
- */
- if (ptr <= numrecs) {
- if (level == 0) {
- rp = XFS_ALLOC_REC_ADDR(block, ptr, cur);
- xfs_btree_check_rec(cur->bc_btnum, recp, rp);
+ rrp = XFS_ALLOC_REC_ADDR(cur->bc_mp, block, numrecs);
+ len = rrp->ar_blockcount;
} else {
- kp = XFS_ALLOC_KEY_ADDR(block, ptr, cur);
- xfs_btree_check_key(cur->bc_btnum, &key, kp);
- }
- }
-#endif
- nbno = NULLAGBLOCK;
- ncur = NULL;
- /*
- * If the block is full, we can't insert the new entry until we
- * make the block un-full.
- */
- if (numrecs == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
- /*
- * First, try shifting an entry to the right neighbor.
- */
- if ((error = xfs_alloc_rshift(cur, level, &i)))
- return error;
- if (i) {
- /* nothing */
- }
- /*
- * Next, try shifting an entry to the left neighbor.
- */
- else {
- if ((error = xfs_alloc_lshift(cur, level, &i)))
- return error;
- if (i)
- optr = ptr = cur->bc_ptrs[level];
- else {
- /*
- * Next, try splitting the current block in
- * half. If this works we have to re-set our
- * variables because we could be in a
- * different block now.
- */
- if ((error = xfs_alloc_split(cur, level, &nbno,
- &nkey, &ncur, &i)))
- return error;
- if (i) {
- bp = cur->bc_bufs[level];
- block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-#ifdef DEBUG
- if ((error =
- xfs_btree_check_sblock(cur,
- block, level, bp)))
- return error;
-#endif
- ptr = cur->bc_ptrs[level];
- nrec.ar_startblock = nkey.ar_startblock;
- nrec.ar_blockcount = nkey.ar_blockcount;
- }
- /*
- * Otherwise the insert fails.
- */
- else {
- *stat = 0;
- return 0;
- }
- }
- }
- }
- /*
- * At this point we know there's room for our new entry in the block
- * we're pointing at.
- */
- numrecs = be16_to_cpu(block->bb_numrecs);
- if (level > 0) {
- /*
- * It's a non-leaf entry. Make a hole for the new data
- * in the key and ptr regions of the block.
- */
- kp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
- pp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
-#ifdef DEBUG
- for (i = numrecs; i >= ptr; i--) {
- if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(pp[i - 1]), level)))
- return error;
+ len = 0;
}
-#endif
- memmove(&kp[ptr], &kp[ptr - 1],
- (numrecs - ptr + 1) * sizeof(*kp));
- memmove(&pp[ptr], &pp[ptr - 1],
- (numrecs - ptr + 1) * sizeof(*pp));
-#ifdef DEBUG
- if ((error = xfs_btree_check_sptr(cur, *bnop, level)))
- return error;
-#endif
- /*
- * Now stuff the new data in, bump numrecs and log the new data.
- */
- kp[ptr - 1] = key;
- pp[ptr - 1] = cpu_to_be32(*bnop);
- numrecs++;
- block->bb_numrecs = cpu_to_be16(numrecs);
- xfs_alloc_log_keys(cur, bp, ptr, numrecs);
- xfs_alloc_log_ptrs(cur, bp, ptr, numrecs);
-#ifdef DEBUG
- if (ptr < numrecs)
- xfs_btree_check_key(cur->bc_btnum, kp + ptr - 1,
- kp + ptr);
-#endif
- } else {
- /*
- * It's a leaf entry. Make a hole for the new record.
- */
- rp = XFS_ALLOC_REC_ADDR(block, 1, cur);
- memmove(&rp[ptr], &rp[ptr - 1],
- (numrecs - ptr + 1) * sizeof(*rp));
- /*
- * Now stuff the new record in, bump numrecs
- * and log the new data.
- */
- rp[ptr - 1] = *recp;
- numrecs++;
- block->bb_numrecs = cpu_to_be16(numrecs);
- xfs_alloc_log_recs(cur, bp, ptr, numrecs);
-#ifdef DEBUG
- if (ptr < numrecs)
- xfs_btree_check_rec(cur->bc_btnum, rp + ptr - 1,
- rp + ptr);
-#endif
- }
- /*
- * Log the new number of records in the btree header.
- */
- xfs_alloc_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
- /*
- * If we inserted at the start of a block, update the parents' keys.
- */
- if (optr == 1 && (error = xfs_alloc_updkey(cur, &key, level + 1)))
- return error;
- /*
- * Look to see if the longest extent in the allocation group
- * needs to be updated.
- */
- agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
- if (level == 0 &&
- cur->bc_btnum == XFS_BTNUM_CNT &&
- be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK &&
- be32_to_cpu(recp->ar_blockcount) > be32_to_cpu(agf->agf_longest)) {
- /*
- * If this is a leaf in the by-size btree and there
- * is no right sibling block and this block is bigger
- * than the previous longest block, update it.
- */
- agf->agf_longest = recp->ar_blockcount;
- cur->bc_mp->m_perag[be32_to_cpu(agf->agf_seqno)].pagf_longest
- = be32_to_cpu(recp->ar_blockcount);
- xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
- XFS_AGF_LONGEST);
+ break;
+ default:
+ ASSERT(0);
+ return;
}
- /*
- * Return the new block number, if any.
- * If there is one, give back a record value and a cursor too.
- */
- *bnop = nbno;
- if (nbno != NULLAGBLOCK) {
- *recp = nrec;
- *curp = ncur;
- }
- *stat = 1;
- return 0;
+
+ agf->agf_longest = len;
+ cur->bc_mp->m_perag[seqno].pagf_longest = be32_to_cpu(len);
+ xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp, XFS_AGF_LONGEST);
}
-/*
- * Log header fields from a btree block.
- */
-STATIC void
-xfs_alloc_log_block(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_buf_t *bp, /* buffer containing btree block */
- int fields) /* mask of fields: XFS_BB_... */
+STATIC int
+xfs_allocbt_get_minrecs(
+ struct xfs_btree_cur *cur,
+ int level)
{
- int first; /* first byte offset logged */
- int last; /* last byte offset logged */
- static const short offsets[] = { /* table of offsets */
- offsetof(xfs_alloc_block_t, bb_magic),
- offsetof(xfs_alloc_block_t, bb_level),
- offsetof(xfs_alloc_block_t, bb_numrecs),
- offsetof(xfs_alloc_block_t, bb_leftsib),
- offsetof(xfs_alloc_block_t, bb_rightsib),
- sizeof(xfs_alloc_block_t)
- };
+ return cur->bc_mp->m_alloc_mnr[level != 0];
+}
- xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first, &last);
- xfs_trans_log_buf(tp, bp, first, last);
+STATIC int
+xfs_allocbt_get_maxrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ return cur->bc_mp->m_alloc_mxr[level != 0];
}
-/*
- * Log keys from a btree block (nonleaf).
- */
STATIC void
-xfs_alloc_log_keys(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_buf_t *bp, /* buffer containing btree block */
- int kfirst, /* index of first key to log */
- int klast) /* index of last key to log */
+xfs_allocbt_init_key_from_rec(
+ union xfs_btree_key *key,
+ union xfs_btree_rec *rec)
{
- xfs_alloc_block_t *block; /* btree block to log from */
- int first; /* first byte offset logged */
- xfs_alloc_key_t *kp; /* key pointer in btree block */
- int last; /* last byte offset logged */
+ ASSERT(rec->alloc.ar_startblock != 0);
- block = XFS_BUF_TO_ALLOC_BLOCK(bp);
- kp = XFS_ALLOC_KEY_ADDR(block, 1, cur);
- first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block);
- last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block);
- xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+ key->alloc.ar_startblock = rec->alloc.ar_startblock;
+ key->alloc.ar_blockcount = rec->alloc.ar_blockcount;
}
-/*
- * Log block pointer fields from a btree block (nonleaf).
- */
STATIC void
-xfs_alloc_log_ptrs(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_buf_t *bp, /* buffer containing btree block */
- int pfirst, /* index of first pointer to log */
- int plast) /* index of last pointer to log */
+xfs_allocbt_init_rec_from_key(
+ union xfs_btree_key *key,
+ union xfs_btree_rec *rec)
{
- xfs_alloc_block_t *block; /* btree block to log from */
- int first; /* first byte offset logged */
- int last; /* last byte offset logged */
- xfs_alloc_ptr_t *pp; /* block-pointer pointer in btree blk */
+ ASSERT(key->alloc.ar_startblock != 0);
- block = XFS_BUF_TO_ALLOC_BLOCK(bp);
- pp = XFS_ALLOC_PTR_ADDR(block, 1, cur);
- first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block);
- last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block);
- xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+ rec->alloc.ar_startblock = key->alloc.ar_startblock;
+ rec->alloc.ar_blockcount = key->alloc.ar_blockcount;
}
-/*
- * Log records from a btree block (leaf).
- */
STATIC void
-xfs_alloc_log_recs(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_buf_t *bp, /* buffer containing btree block */
- int rfirst, /* index of first record to log */
- int rlast) /* index of last record to log */
+xfs_allocbt_init_rec_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec)
{
- xfs_alloc_block_t *block; /* btree block to log from */
- int first; /* first byte offset logged */
- int last; /* last byte offset logged */
- xfs_alloc_rec_t *rp; /* record pointer for btree block */
-
+ ASSERT(cur->bc_rec.a.ar_startblock != 0);
- block = XFS_BUF_TO_ALLOC_BLOCK(bp);
- rp = XFS_ALLOC_REC_ADDR(block, 1, cur);
-#ifdef DEBUG
- {
- xfs_agf_t *agf;
- xfs_alloc_rec_t *p;
-
- agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
- for (p = &rp[rfirst - 1]; p <= &rp[rlast - 1]; p++)
- ASSERT(be32_to_cpu(p->ar_startblock) +
- be32_to_cpu(p->ar_blockcount) <=
- be32_to_cpu(agf->agf_length));
- }
-#endif
- first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block);
- last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block);
- xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+ rec->alloc.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock);
+ rec->alloc.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount);
}
-/*
- * Lookup the record. The cursor is made to point to it, based on dir.
- * Return 0 if can't find any such record, 1 for success.
- */
-STATIC int /* error */
-xfs_alloc_lookup(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_lookup_t dir, /* <=, ==, or >= */
- int *stat) /* success/failure */
+STATIC void
+xfs_allocbt_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
{
- xfs_agblock_t agbno; /* a.g. relative btree block number */
- xfs_agnumber_t agno; /* allocation group number */
- xfs_alloc_block_t *block=NULL; /* current btree block */
- int diff; /* difference for the current key */
- int error; /* error return value */
- int keyno=0; /* current key number */
- int level; /* level in the btree */
- xfs_mount_t *mp; /* file system mount point */
-
- XFS_STATS_INC(xs_abt_lookup);
- /*
- * Get the allocation group header, and the root block number.
- */
- mp = cur->bc_mp;
-
- {
- xfs_agf_t *agf; /* a.g. freespace header */
-
- agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
- agno = be32_to_cpu(agf->agf_seqno);
- agbno = be32_to_cpu(agf->agf_roots[cur->bc_btnum]);
- }
- /*
- * Iterate over each level in the btree, starting at the root.
- * For each level above the leaves, find the key we need, based
- * on the lookup record, then follow the corresponding block
- * pointer down to the next level.
- */
- for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
- xfs_buf_t *bp; /* buffer pointer for btree block */
- xfs_daddr_t d; /* disk address of btree block */
-
- /*
- * Get the disk address we're looking for.
- */
- d = XFS_AGB_TO_DADDR(mp, agno, agbno);
- /*
- * If the old buffer at this level is for a different block,
- * throw it away, otherwise just use it.
- */
- bp = cur->bc_bufs[level];
- if (bp && XFS_BUF_ADDR(bp) != d)
- bp = NULL;
- if (!bp) {
- /*
- * Need to get a new buffer. Read it, then
- * set it in the cursor, releasing the old one.
- */
- if ((error = xfs_btree_read_bufs(mp, cur->bc_tp, agno,
- agbno, 0, &bp, XFS_ALLOC_BTREE_REF)))
- return error;
- xfs_btree_setbuf(cur, level, bp);
- /*
- * Point to the btree block, now that we have the buffer
- */
- block = XFS_BUF_TO_ALLOC_BLOCK(bp);
- if ((error = xfs_btree_check_sblock(cur, block, level,
- bp)))
- return error;
- } else
- block = XFS_BUF_TO_ALLOC_BLOCK(bp);
- /*
- * If we already had a key match at a higher level, we know
- * we need to use the first entry in this block.
- */
- if (diff == 0)
- keyno = 1;
- /*
- * Otherwise we need to search this block. Do a binary search.
- */
- else {
- int high; /* high entry number */
- xfs_alloc_key_t *kkbase=NULL;/* base of keys in block */
- xfs_alloc_rec_t *krbase=NULL;/* base of records in block */
- int low; /* low entry number */
-
- /*
- * Get a pointer to keys or records.
- */
- if (level > 0)
- kkbase = XFS_ALLOC_KEY_ADDR(block, 1, cur);
- else
- krbase = XFS_ALLOC_REC_ADDR(block, 1, cur);
- /*
- * Set low and high entry numbers, 1-based.
- */
- low = 1;
- if (!(high = be16_to_cpu(block->bb_numrecs))) {
- /*
- * If the block is empty, the tree must
- * be an empty leaf.
- */
- ASSERT(level == 0 && cur->bc_nlevels == 1);
- cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
- *stat = 0;
- return 0;
- }
- /*
- * Binary search the block.
- */
- while (low <= high) {
- xfs_extlen_t blockcount; /* key value */
- xfs_agblock_t startblock; /* key value */
-
- XFS_STATS_INC(xs_abt_compare);
- /*
- * keyno is average of low and high.
- */
- keyno = (low + high) >> 1;
- /*
- * Get startblock & blockcount.
- */
- if (level > 0) {
- xfs_alloc_key_t *kkp;
-
- kkp = kkbase + keyno - 1;
- startblock = be32_to_cpu(kkp->ar_startblock);
- blockcount = be32_to_cpu(kkp->ar_blockcount);
- } else {
- xfs_alloc_rec_t *krp;
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
- krp = krbase + keyno - 1;
- startblock = be32_to_cpu(krp->ar_startblock);
- blockcount = be32_to_cpu(krp->ar_blockcount);
- }
- /*
- * Compute difference to get next direction.
- */
- if (cur->bc_btnum == XFS_BTNUM_BNO)
- diff = (int)startblock -
- (int)cur->bc_rec.a.ar_startblock;
- else if (!(diff = (int)blockcount -
- (int)cur->bc_rec.a.ar_blockcount))
- diff = (int)startblock -
- (int)cur->bc_rec.a.ar_startblock;
- /*
- * Less than, move right.
- */
- if (diff < 0)
- low = keyno + 1;
- /*
- * Greater than, move left.
- */
- else if (diff > 0)
- high = keyno - 1;
- /*
- * Equal, we're done.
- */
- else
- break;
- }
- }
- /*
- * If there are more levels, set up for the next level
- * by getting the block number and filling in the cursor.
- */
- if (level > 0) {
- /*
- * If we moved left, need the previous key number,
- * unless there isn't one.
- */
- if (diff > 0 && --keyno < 1)
- keyno = 1;
- agbno = be32_to_cpu(*XFS_ALLOC_PTR_ADDR(block, keyno, cur));
-#ifdef DEBUG
- if ((error = xfs_btree_check_sptr(cur, agbno, level)))
- return error;
-#endif
- cur->bc_ptrs[level] = keyno;
- }
- }
- /*
- * Done with the search.
- * See if we need to adjust the results.
- */
- if (dir != XFS_LOOKUP_LE && diff < 0) {
- keyno++;
- /*
- * If ge search and we went off the end of the block, but it's
- * not the last block, we're in the wrong block.
- */
- if (dir == XFS_LOOKUP_GE &&
- keyno > be16_to_cpu(block->bb_numrecs) &&
- be32_to_cpu(block->bb_rightsib) != NULLAGBLOCK) {
- int i;
+ ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));
+ ASSERT(agf->agf_roots[cur->bc_btnum] != 0);
- cur->bc_ptrs[0] = keyno;
- if ((error = xfs_alloc_increment(cur, 0, &i)))
- return error;
- XFS_WANT_CORRUPTED_RETURN(i == 1);
- *stat = 1;
- return 0;
- }
- }
- else if (dir == XFS_LOOKUP_LE && diff > 0)
- keyno--;
- cur->bc_ptrs[0] = keyno;
- /*
- * Return if we succeeded or not.
- */
- if (keyno == 0 || keyno > be16_to_cpu(block->bb_numrecs))
- *stat = 0;
- else
- *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0));
- return 0;
+ ptr->s = agf->agf_roots[cur->bc_btnum];
}
-/*
- * Move 1 record left from cur/level if possible.
- * Update cur to reflect the new path.
- */
-STATIC int /* error */
-xfs_alloc_lshift(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level, /* level to shift record on */
- int *stat) /* success/failure */
+STATIC __int64_t
+xfs_allocbt_key_diff(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *key)
{
- int error; /* error return value */
-#ifdef DEBUG
- int i; /* loop index */
-#endif
- xfs_alloc_key_t key; /* key value for leaf level upward */
- xfs_buf_t *lbp; /* buffer for left neighbor block */
- xfs_alloc_block_t *left; /* left neighbor btree block */
- int nrec; /* new number of left block entries */
- xfs_buf_t *rbp; /* buffer for right (current) block */
- xfs_alloc_block_t *right; /* right (current) btree block */
- xfs_alloc_key_t *rkp=NULL; /* key pointer for right block */
- xfs_alloc_ptr_t *rpp=NULL; /* address pointer for right block */
- xfs_alloc_rec_t *rrp=NULL; /* record pointer for right block */
+ xfs_alloc_rec_incore_t *rec = &cur->bc_rec.a;
+ xfs_alloc_key_t *kp = &key->alloc;
+ __int64_t diff;
- /*
- * Set up variables for this block as "right".
- */
- rbp = cur->bc_bufs[level];
- right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
- return error;
-#endif
- /*
- * If we've got no left sibling then we can't shift an entry left.
- */
- if (be32_to_cpu(right->bb_leftsib) == NULLAGBLOCK) {
- *stat = 0;
- return 0;
- }
- /*
- * If the cursor entry is the one that would be moved, don't
- * do it... it's too complicated.
- */
- if (cur->bc_ptrs[level] <= 1) {
- *stat = 0;
- return 0;
- }
- /*
- * Set up the left neighbor as "left".
- */
- if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
- cur->bc_private.a.agno, be32_to_cpu(right->bb_leftsib),
- 0, &lbp, XFS_ALLOC_BTREE_REF)))
- return error;
- left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
- if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
- return error;
- /*
- * If it's full, it can't take another entry.
- */
- if (be16_to_cpu(left->bb_numrecs) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
- *stat = 0;
- return 0;
+ if (cur->bc_btnum == XFS_BTNUM_BNO) {
+ return (__int64_t)be32_to_cpu(kp->ar_startblock) -
+ rec->ar_startblock;
}
- nrec = be16_to_cpu(left->bb_numrecs) + 1;
- /*
- * If non-leaf, copy a key and a ptr to the left block.
- */
- if (level > 0) {
- xfs_alloc_key_t *lkp; /* key pointer for left block */
- xfs_alloc_ptr_t *lpp; /* address pointer for left block */
- lkp = XFS_ALLOC_KEY_ADDR(left, nrec, cur);
- rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
- *lkp = *rkp;
- xfs_alloc_log_keys(cur, lbp, nrec, nrec);
- lpp = XFS_ALLOC_PTR_ADDR(left, nrec, cur);
- rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*rpp), level)))
- return error;
-#endif
- *lpp = *rpp;
- xfs_alloc_log_ptrs(cur, lbp, nrec, nrec);
- xfs_btree_check_key(cur->bc_btnum, lkp - 1, lkp);
- }
- /*
- * If leaf, copy a record to the left block.
- */
- else {
- xfs_alloc_rec_t *lrp; /* record pointer for left block */
+ diff = (__int64_t)be32_to_cpu(kp->ar_blockcount) - rec->ar_blockcount;
+ if (diff)
+ return diff;
- lrp = XFS_ALLOC_REC_ADDR(left, nrec, cur);
- rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
- *lrp = *rrp;
- xfs_alloc_log_recs(cur, lbp, nrec, nrec);
- xfs_btree_check_rec(cur->bc_btnum, lrp - 1, lrp);
- }
- /*
- * Bump and log left's numrecs, decrement and log right's numrecs.
- */
- be16_add_cpu(&left->bb_numrecs, 1);
- xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
- be16_add_cpu(&right->bb_numrecs, -1);
- xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
- /*
- * Slide the contents of right down one entry.
- */
- if (level > 0) {
-#ifdef DEBUG
- for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
- if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i + 1]),
- level)))
- return error;
- }
-#endif
- memmove(rkp, rkp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
- memmove(rpp, rpp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
- xfs_alloc_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
- xfs_alloc_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
- } else {
- memmove(rrp, rrp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
- xfs_alloc_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
- key.ar_startblock = rrp->ar_startblock;
- key.ar_blockcount = rrp->ar_blockcount;
- rkp = &key;
- }
- /*
- * Update the parent key values of right.
- */
- if ((error = xfs_alloc_updkey(cur, rkp, level + 1)))
- return error;
- /*
- * Slide the cursor value left one.
- */
- cur->bc_ptrs[level]--;
- *stat = 1;
- return 0;
+ return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
}
-/*
- * Allocate a new root block, fill it in.
- */
-STATIC int /* error */
-xfs_alloc_newroot(
- xfs_btree_cur_t *cur, /* btree cursor */
- int *stat) /* success/failure */
+STATIC int
+xfs_allocbt_kill_root(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp,
+ int level,
+ union xfs_btree_ptr *newroot)
{
- int error; /* error return value */
- xfs_agblock_t lbno; /* left block number */
- xfs_buf_t *lbp; /* left btree buffer */
- xfs_alloc_block_t *left; /* left btree block */
- xfs_mount_t *mp; /* mount structure */
- xfs_agblock_t nbno; /* new block number */
- xfs_buf_t *nbp; /* new (root) buffer */
- xfs_alloc_block_t *new; /* new (root) btree block */
- int nptr; /* new value for key index, 1 or 2 */
- xfs_agblock_t rbno; /* right block number */
- xfs_buf_t *rbp; /* right btree buffer */
- xfs_alloc_block_t *right; /* right btree block */
-
- mp = cur->bc_mp;
+ int error;
- ASSERT(cur->bc_nlevels < XFS_AG_MAXLEVELS(mp));
- /*
- * Get a buffer from the freelist blocks, for the new root.
- */
- error = xfs_alloc_get_freelist(cur->bc_tp,
- cur->bc_private.a.agbp, &nbno, 1);
- if (error)
- return error;
- /*
- * None available, we fail.
- */
- if (nbno == NULLAGBLOCK) {
- *stat = 0;
- return 0;
- }
- xfs_trans_agbtree_delta(cur->bc_tp, 1);
- nbp = xfs_btree_get_bufs(mp, cur->bc_tp, cur->bc_private.a.agno, nbno,
- 0);
- new = XFS_BUF_TO_ALLOC_BLOCK(nbp);
- /*
- * Set the root data in the a.g. freespace structure.
- */
- {
- xfs_agf_t *agf; /* a.g. freespace header */
- xfs_agnumber_t seqno;
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_STATS_INC(cur, killroot);
- agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
- agf->agf_roots[cur->bc_btnum] = cpu_to_be32(nbno);
- be32_add_cpu(&agf->agf_levels[cur->bc_btnum], 1);
- seqno = be32_to_cpu(agf->agf_seqno);
- mp->m_perag[seqno].pagf_levels[cur->bc_btnum]++;
- xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
- XFS_AGF_ROOTS | XFS_AGF_LEVELS);
- }
/*
- * At the previous root level there are now two blocks: the old
- * root, and the new block generated when it was split.
- * We don't know which one the cursor is pointing at, so we
- * set up variables "left" and "right" for each case.
+ * Update the root pointer, decreasing the level by 1 and then
+ * free the old root.
*/
- lbp = cur->bc_bufs[cur->bc_nlevels - 1];
- left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, left, cur->bc_nlevels - 1, lbp)))
+ xfs_allocbt_set_root(cur, newroot, -1);
+ error = xfs_allocbt_free_block(cur, bp);
+ if (error) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
return error;
-#endif
- if (be32_to_cpu(left->bb_rightsib) != NULLAGBLOCK) {
- /*
- * Our block is left, pick up the right block.
- */
- lbno = XFS_DADDR_TO_AGBNO(mp, XFS_BUF_ADDR(lbp));
- rbno = be32_to_cpu(left->bb_rightsib);
- if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
- cur->bc_private.a.agno, rbno, 0, &rbp,
- XFS_ALLOC_BTREE_REF)))
- return error;
- right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
- if ((error = xfs_btree_check_sblock(cur, right,
- cur->bc_nlevels - 1, rbp)))
- return error;
- nptr = 1;
- } else {
- /*
- * Our block is right, pick up the left block.
- */
- rbp = lbp;
- right = left;
- rbno = XFS_DADDR_TO_AGBNO(mp, XFS_BUF_ADDR(rbp));
- lbno = be32_to_cpu(right->bb_leftsib);
- if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
- cur->bc_private.a.agno, lbno, 0, &lbp,
- XFS_ALLOC_BTREE_REF)))
- return error;
- left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
- if ((error = xfs_btree_check_sblock(cur, left,
- cur->bc_nlevels - 1, lbp)))
- return error;
- nptr = 2;
}
- /*
- * Fill in the new block's btree header and log it.
- */
- new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
- new->bb_level = cpu_to_be16(cur->bc_nlevels);
- new->bb_numrecs = cpu_to_be16(2);
- new->bb_leftsib = cpu_to_be32(NULLAGBLOCK);
- new->bb_rightsib = cpu_to_be32(NULLAGBLOCK);
- xfs_alloc_log_block(cur->bc_tp, nbp, XFS_BB_ALL_BITS);
- ASSERT(lbno != NULLAGBLOCK && rbno != NULLAGBLOCK);
- /*
- * Fill in the key data in the new root.
- */
- {
- xfs_alloc_key_t *kp; /* btree key pointer */
- kp = XFS_ALLOC_KEY_ADDR(new, 1, cur);
- if (be16_to_cpu(left->bb_level) > 0) {
- kp[0] = *XFS_ALLOC_KEY_ADDR(left, 1, cur);
- kp[1] = *XFS_ALLOC_KEY_ADDR(right, 1, cur);
- } else {
- xfs_alloc_rec_t *rp; /* btree record pointer */
+ XFS_BTREE_STATS_INC(cur, free);
- rp = XFS_ALLOC_REC_ADDR(left, 1, cur);
- kp[0].ar_startblock = rp->ar_startblock;
- kp[0].ar_blockcount = rp->ar_blockcount;
- rp = XFS_ALLOC_REC_ADDR(right, 1, cur);
- kp[1].ar_startblock = rp->ar_startblock;
- kp[1].ar_blockcount = rp->ar_blockcount;
- }
- }
- xfs_alloc_log_keys(cur, nbp, 1, 2);
- /*
- * Fill in the pointer data in the new root.
- */
- {
- xfs_alloc_ptr_t *pp; /* btree address pointer */
+ xfs_btree_setbuf(cur, level, NULL);
+ cur->bc_nlevels--;
- pp = XFS_ALLOC_PTR_ADDR(new, 1, cur);
- pp[0] = cpu_to_be32(lbno);
- pp[1] = cpu_to_be32(rbno);
- }
- xfs_alloc_log_ptrs(cur, nbp, 1, 2);
- /*
- * Fix up the cursor.
- */
- xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
- cur->bc_ptrs[cur->bc_nlevels] = nptr;
- cur->bc_nlevels++;
- *stat = 1;
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
return 0;
}
-/*
- * Move 1 record right from cur/level if possible.
- * Update cur to reflect the new path.
- */
-STATIC int /* error */
-xfs_alloc_rshift(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level, /* level to shift record on */
- int *stat) /* success/failure */
-{
- int error; /* error return value */
- int i; /* loop index */
- xfs_alloc_key_t key; /* key value for leaf level upward */
- xfs_buf_t *lbp; /* buffer for left (current) block */
- xfs_alloc_block_t *left; /* left (current) btree block */
- xfs_buf_t *rbp; /* buffer for right neighbor block */
- xfs_alloc_block_t *right; /* right neighbor btree block */
- xfs_alloc_key_t *rkp; /* key pointer for right block */
- xfs_btree_cur_t *tcur; /* temporary cursor */
-
- /*
- * Set up variables for this block as "left".
- */
- lbp = cur->bc_bufs[level];
- left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
- return error;
-#endif
- /*
- * If we've got no right sibling then we can't shift an entry right.
- */
- if (be32_to_cpu(left->bb_rightsib) == NULLAGBLOCK) {
- *stat = 0;
- return 0;
- }
- /*
- * If the cursor entry is the one that would be moved, don't
- * do it... it's too complicated.
- */
- if (cur->bc_ptrs[level] >= be16_to_cpu(left->bb_numrecs)) {
- *stat = 0;
- return 0;
- }
- /*
- * Set up the right neighbor as "right".
- */
- if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
- cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib),
- 0, &rbp, XFS_ALLOC_BTREE_REF)))
- return error;
- right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
- if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
- return error;
- /*
- * If it's full, it can't take another entry.
- */
- if (be16_to_cpu(right->bb_numrecs) == XFS_ALLOC_BLOCK_MAXRECS(level, cur)) {
- *stat = 0;
- return 0;
- }
- /*
- * Make a hole at the start of the right neighbor block, then
- * copy the last left block entry to the hole.
- */
- if (level > 0) {
- xfs_alloc_key_t *lkp; /* key pointer for left block */
- xfs_alloc_ptr_t *lpp; /* address pointer for left block */
- xfs_alloc_ptr_t *rpp; /* address pointer for right block */
-
- lkp = XFS_ALLOC_KEY_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
- lpp = XFS_ALLOC_PTR_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
- rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
- rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
#ifdef DEBUG
- for (i = be16_to_cpu(right->bb_numrecs) - 1; i >= 0; i--) {
- if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level)))
- return error;
- }
-#endif
- memmove(rkp + 1, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
- memmove(rpp + 1, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
-#ifdef DEBUG
- if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*lpp), level)))
- return error;
-#endif
- *rkp = *lkp;
- *rpp = *lpp;
- xfs_alloc_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
- xfs_alloc_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
- xfs_btree_check_key(cur->bc_btnum, rkp, rkp + 1);
+STATIC int
+xfs_allocbt_keys_inorder(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *k1,
+ union xfs_btree_key *k2)
+{
+ if (cur->bc_btnum == XFS_BTNUM_BNO) {
+ return be32_to_cpu(k1->alloc.ar_startblock) <
+ be32_to_cpu(k2->alloc.ar_startblock);
} else {
- xfs_alloc_rec_t *lrp; /* record pointer for left block */
- xfs_alloc_rec_t *rrp; /* record pointer for right block */
-
- lrp = XFS_ALLOC_REC_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
- rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
- memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
- *rrp = *lrp;
- xfs_alloc_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
- key.ar_startblock = rrp->ar_startblock;
- key.ar_blockcount = rrp->ar_blockcount;
- rkp = &key;
- xfs_btree_check_rec(cur->bc_btnum, rrp, rrp + 1);
+ return be32_to_cpu(k1->alloc.ar_blockcount) <
+ be32_to_cpu(k2->alloc.ar_blockcount) ||
+ (k1->alloc.ar_blockcount == k2->alloc.ar_blockcount &&
+ be32_to_cpu(k1->alloc.ar_startblock) <
+ be32_to_cpu(k2->alloc.ar_startblock));
}
- /*
- * Decrement and log left's numrecs, bump and log right's numrecs.
- */
- be16_add_cpu(&left->bb_numrecs, -1);
- xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
- be16_add_cpu(&right->bb_numrecs, 1);
- xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
- /*
- * Using a temporary cursor, update the parent key values of the
- * block on the right.
- */
- if ((error = xfs_btree_dup_cursor(cur, &tcur)))
- return error;
- i = xfs_btree_lastrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if ((error = xfs_alloc_increment(tcur, level, &i)) ||
- (error = xfs_alloc_updkey(tcur, rkp, level + 1)))
- goto error0;
- xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
- *stat = 1;
- return 0;
-error0:
- xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
- return error;
}
-/*
- * Split cur/level block in half.
- * Return new block number and its first record (to be inserted into parent).
- */
-STATIC int /* error */
-xfs_alloc_split(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level, /* level to split */
- xfs_agblock_t *bnop, /* output: block number allocated */
- xfs_alloc_key_t *keyp, /* output: first key of new block */
- xfs_btree_cur_t **curp, /* output: new cursor */
- int *stat) /* success/failure */
+STATIC int
+xfs_allocbt_recs_inorder(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *r1,
+ union xfs_btree_rec *r2)
{
- int error; /* error return value */
- int i; /* loop index/record number */
- xfs_agblock_t lbno; /* left (current) block number */
- xfs_buf_t *lbp; /* buffer for left block */
- xfs_alloc_block_t *left; /* left (current) btree block */
- xfs_agblock_t rbno; /* right (new) block number */
- xfs_buf_t *rbp; /* buffer for right block */
- xfs_alloc_block_t *right; /* right (new) btree block */
-
- /*
- * Allocate the new block from the freelist.
- * If we can't do it, we're toast. Give up.
- */
- error = xfs_alloc_get_freelist(cur->bc_tp,
- cur->bc_private.a.agbp, &rbno, 1);
- if (error)
- return error;
- if (rbno == NULLAGBLOCK) {
- *stat = 0;
- return 0;
- }
- xfs_trans_agbtree_delta(cur->bc_tp, 1);
- rbp = xfs_btree_get_bufs(cur->bc_mp, cur->bc_tp, cur->bc_private.a.agno,
- rbno, 0);
- /*
- * Set up the new block as "right".
- */
- right = XFS_BUF_TO_ALLOC_BLOCK(rbp);
- /*
- * "Left" is the current (according to the cursor) block.
- */
- lbp = cur->bc_bufs[level];
- left = XFS_BUF_TO_ALLOC_BLOCK(lbp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
- return error;
-#endif
- /*
- * Fill in the btree header for the new block.
- */
- right->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
- right->bb_level = left->bb_level;
- right->bb_numrecs = cpu_to_be16(be16_to_cpu(left->bb_numrecs) / 2);
- /*
- * Make sure that if there's an odd number of entries now, that
- * each new block will have the same number of entries.
- */
- if ((be16_to_cpu(left->bb_numrecs) & 1) &&
- cur->bc_ptrs[level] <= be16_to_cpu(right->bb_numrecs) + 1)
- be16_add_cpu(&right->bb_numrecs, 1);
- i = be16_to_cpu(left->bb_numrecs) - be16_to_cpu(right->bb_numrecs) + 1;
- /*
- * For non-leaf blocks, copy keys and addresses over to the new block.
- */
- if (level > 0) {
- xfs_alloc_key_t *lkp; /* left btree key pointer */
- xfs_alloc_ptr_t *lpp; /* left btree address pointer */
- xfs_alloc_key_t *rkp; /* right btree key pointer */
- xfs_alloc_ptr_t *rpp; /* right btree address pointer */
-
- lkp = XFS_ALLOC_KEY_ADDR(left, i, cur);
- lpp = XFS_ALLOC_PTR_ADDR(left, i, cur);
- rkp = XFS_ALLOC_KEY_ADDR(right, 1, cur);
- rpp = XFS_ALLOC_PTR_ADDR(right, 1, cur);
-#ifdef DEBUG
- for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
- if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(lpp[i]), level)))
- return error;
- }
-#endif
- memcpy(rkp, lkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
- memcpy(rpp, lpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
- xfs_alloc_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
- xfs_alloc_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
- *keyp = *rkp;
+ if (cur->bc_btnum == XFS_BTNUM_BNO) {
+ return be32_to_cpu(r1->alloc.ar_startblock) +
+ be32_to_cpu(r1->alloc.ar_blockcount) <=
+ be32_to_cpu(r2->alloc.ar_startblock);
+ } else {
+ return be32_to_cpu(r1->alloc.ar_blockcount) <
+ be32_to_cpu(r2->alloc.ar_blockcount) ||
+ (r1->alloc.ar_blockcount == r2->alloc.ar_blockcount &&
+ be32_to_cpu(r1->alloc.ar_startblock) <
+ be32_to_cpu(r2->alloc.ar_startblock));
}
- /*
- * For leaf blocks, copy records over to the new block.
- */
- else {
- xfs_alloc_rec_t *lrp; /* left btree record pointer */
- xfs_alloc_rec_t *rrp; /* right btree record pointer */
+}
+#endif /* DEBUG */
- lrp = XFS_ALLOC_REC_ADDR(left, i, cur);
- rrp = XFS_ALLOC_REC_ADDR(right, 1, cur);
- memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
- xfs_alloc_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
- keyp->ar_startblock = rrp->ar_startblock;
- keyp->ar_blockcount = rrp->ar_blockcount;
- }
- /*
- * Find the left block number by looking in the buffer.
- * Adjust numrecs, sibling pointers.
- */
- lbno = XFS_DADDR_TO_AGBNO(cur->bc_mp, XFS_BUF_ADDR(lbp));
- be16_add_cpu(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs)));
- right->bb_rightsib = left->bb_rightsib;
- left->bb_rightsib = cpu_to_be32(rbno);
- right->bb_leftsib = cpu_to_be32(lbno);
- xfs_alloc_log_block(cur->bc_tp, rbp, XFS_BB_ALL_BITS);
- xfs_alloc_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
- /*
- * If there's a block to the new block's right, make that block
- * point back to right instead of to left.
- */
- if (be32_to_cpu(right->bb_rightsib) != NULLAGBLOCK) {
- xfs_alloc_block_t *rrblock; /* rr btree block */
- xfs_buf_t *rrbp; /* buffer for rrblock */
+#ifdef XFS_BTREE_TRACE
+ktrace_t *xfs_allocbt_trace_buf;
- if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
- cur->bc_private.a.agno, be32_to_cpu(right->bb_rightsib), 0,
- &rrbp, XFS_ALLOC_BTREE_REF)))
- return error;
- rrblock = XFS_BUF_TO_ALLOC_BLOCK(rrbp);
- if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
- return error;
- rrblock->bb_leftsib = cpu_to_be32(rbno);
- xfs_alloc_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB);
- }
- /*
- * If the cursor is really in the right block, move it there.
- * If it's just pointing past the last entry in left, then we'll
- * insert there, so don't change anything in that case.
- */
- if (cur->bc_ptrs[level] > be16_to_cpu(left->bb_numrecs) + 1) {
- xfs_btree_setbuf(cur, level, rbp);
- cur->bc_ptrs[level] -= be16_to_cpu(left->bb_numrecs);
- }
- /*
- * If there are more levels, we'll need another cursor which refers to
- * the right block, no matter where this cursor was.
- */
- if (level + 1 < cur->bc_nlevels) {
- if ((error = xfs_btree_dup_cursor(cur, curp)))
- return error;
- (*curp)->bc_ptrs[level + 1]++;
- }
- *bnop = rbno;
- *stat = 1;
- return 0;
+STATIC void
+xfs_allocbt_trace_enter(
+ struct xfs_btree_cur *cur,
+ const char *func,
+ char *s,
+ int type,
+ int line,
+ __psunsigned_t a0,
+ __psunsigned_t a1,
+ __psunsigned_t a2,
+ __psunsigned_t a3,
+ __psunsigned_t a4,
+ __psunsigned_t a5,
+ __psunsigned_t a6,
+ __psunsigned_t a7,
+ __psunsigned_t a8,
+ __psunsigned_t a9,
+ __psunsigned_t a10)
+{
+ ktrace_enter(xfs_allocbt_trace_buf, (void *)(__psint_t)type,
+ (void *)func, (void *)s, NULL, (void *)cur,
+ (void *)a0, (void *)a1, (void *)a2, (void *)a3,
+ (void *)a4, (void *)a5, (void *)a6, (void *)a7,
+ (void *)a8, (void *)a9, (void *)a10);
}
-/*
- * Update keys at all levels from here to the root along the cursor's path.
- */
-STATIC int /* error */
-xfs_alloc_updkey(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_alloc_key_t *keyp, /* new key value to update to */
- int level) /* starting level for update */
+STATIC void
+xfs_allocbt_trace_cursor(
+ struct xfs_btree_cur *cur,
+ __uint32_t *s0,
+ __uint64_t *l0,
+ __uint64_t *l1)
{
- int ptr; /* index of key in block */
-
- /*
- * Go up the tree from this level toward the root.
- * At each level, update the key value to the value input.
- * Stop when we reach a level where the cursor isn't pointing
- * at the first entry in the block.
- */
- for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
- xfs_alloc_block_t *block; /* btree block */
- xfs_buf_t *bp; /* buffer for block */
-#ifdef DEBUG
- int error; /* error return value */
-#endif
- xfs_alloc_key_t *kp; /* ptr to btree block keys */
-
- bp = cur->bc_bufs[level];
- block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
- return error;
-#endif
- ptr = cur->bc_ptrs[level];
- kp = XFS_ALLOC_KEY_ADDR(block, ptr, cur);
- *kp = *keyp;
- xfs_alloc_log_keys(cur, bp, ptr, ptr);
- }
- return 0;
+ *s0 = cur->bc_private.a.agno;
+ *l0 = cur->bc_rec.a.ar_startblock;
+ *l1 = cur->bc_rec.a.ar_blockcount;
}
-/*
- * Externally visible routines.
- */
-
-/*
- * Decrement cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-int /* error */
-xfs_alloc_decrement(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level, /* level in btree, 0 is leaf */
- int *stat) /* success/failure */
+STATIC void
+xfs_allocbt_trace_key(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *key,
+ __uint64_t *l0,
+ __uint64_t *l1)
{
- xfs_alloc_block_t *block; /* btree block */
- int error; /* error return value */
- int lev; /* btree level */
-
- ASSERT(level < cur->bc_nlevels);
- /*
- * Read-ahead to the left at this level.
- */
- xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
- /*
- * Decrement the ptr at this level. If we're still in the block
- * then we're done.
- */
- if (--cur->bc_ptrs[level] > 0) {
- *stat = 1;
- return 0;
- }
- /*
- * Get a pointer to the btree block.
- */
- block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[level]);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, block, level,
- cur->bc_bufs[level])))
- return error;
-#endif
- /*
- * If we just went off the left edge of the tree, return failure.
- */
- if (be32_to_cpu(block->bb_leftsib) == NULLAGBLOCK) {
- *stat = 0;
- return 0;
- }
- /*
- * March up the tree decrementing pointers.
- * Stop when we don't go off the left edge of a block.
- */
- for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
- if (--cur->bc_ptrs[lev] > 0)
- break;
- /*
- * Read-ahead the left block, we're going to read it
- * in the next loop.
- */
- xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
- }
- /*
- * If we went off the root then we are seriously confused.
- */
- ASSERT(lev < cur->bc_nlevels);
- /*
- * Now walk back down the tree, fixing up the cursor's buffer
- * pointers and key numbers.
- */
- for (block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[lev]); lev > level; ) {
- xfs_agblock_t agbno; /* block number of btree block */
- xfs_buf_t *bp; /* buffer pointer for block */
-
- agbno = be32_to_cpu(*XFS_ALLOC_PTR_ADDR(block, cur->bc_ptrs[lev], cur));
- if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
- cur->bc_private.a.agno, agbno, 0, &bp,
- XFS_ALLOC_BTREE_REF)))
- return error;
- lev--;
- xfs_btree_setbuf(cur, lev, bp);
- block = XFS_BUF_TO_ALLOC_BLOCK(bp);
- if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
- return error;
- cur->bc_ptrs[lev] = be16_to_cpu(block->bb_numrecs);
- }
- *stat = 1;
- return 0;
+ *l0 = be32_to_cpu(key->alloc.ar_startblock);
+ *l1 = be32_to_cpu(key->alloc.ar_blockcount);
}
-/*
- * Delete the record pointed to by cur.
- * The cursor refers to the place where the record was (could be inserted)
- * when the operation returns.
- */
-int /* error */
-xfs_alloc_delete(
- xfs_btree_cur_t *cur, /* btree cursor */
- int *stat) /* success/failure */
+STATIC void
+xfs_allocbt_trace_record(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec,
+ __uint64_t *l0,
+ __uint64_t *l1,
+ __uint64_t *l2)
{
- int error; /* error return value */
- int i; /* result code */
- int level; /* btree level */
-
- /*
- * Go up the tree, starting at leaf level.
- * If 2 is returned then a join was done; go to the next level.
- * Otherwise we are done.
- */
- for (level = 0, i = 2; i == 2; level++) {
- if ((error = xfs_alloc_delrec(cur, level, &i)))
- return error;
- }
- if (i == 0) {
- for (level = 1; level < cur->bc_nlevels; level++) {
- if (cur->bc_ptrs[level] == 0) {
- if ((error = xfs_alloc_decrement(cur, level, &i)))
- return error;
- break;
- }
- }
- }
- *stat = i;
- return 0;
+ *l0 = be32_to_cpu(rec->alloc.ar_startblock);
+ *l1 = be32_to_cpu(rec->alloc.ar_blockcount);
+ *l2 = 0;
}
+#endif /* XFS_BTREE_TRACE */
+
+static const struct xfs_btree_ops xfs_allocbt_ops = {
+ .rec_len = sizeof(xfs_alloc_rec_t),
+ .key_len = sizeof(xfs_alloc_key_t),
+
+ .dup_cursor = xfs_allocbt_dup_cursor,
+ .set_root = xfs_allocbt_set_root,
+ .kill_root = xfs_allocbt_kill_root,
+ .alloc_block = xfs_allocbt_alloc_block,
+ .free_block = xfs_allocbt_free_block,
+ .update_lastrec = xfs_allocbt_update_lastrec,
+ .get_minrecs = xfs_allocbt_get_minrecs,
+ .get_maxrecs = xfs_allocbt_get_maxrecs,
+ .init_key_from_rec = xfs_allocbt_init_key_from_rec,
+ .init_rec_from_key = xfs_allocbt_init_rec_from_key,
+ .init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
+ .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
+ .key_diff = xfs_allocbt_key_diff,
-/*
- * Get the data from the pointed-to record.
- */
-int /* error */
-xfs_alloc_get_rec(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_agblock_t *bno, /* output: starting block of extent */
- xfs_extlen_t *len, /* output: length of extent */
- int *stat) /* output: success/failure */
-{
- xfs_alloc_block_t *block; /* btree block */
#ifdef DEBUG
- int error; /* error return value */
+ .keys_inorder = xfs_allocbt_keys_inorder,
+ .recs_inorder = xfs_allocbt_recs_inorder,
#endif
- int ptr; /* record number */
- ptr = cur->bc_ptrs[0];
- block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[0]);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, block, 0, cur->bc_bufs[0])))
- return error;
+#ifdef XFS_BTREE_TRACE
+ .trace_enter = xfs_allocbt_trace_enter,
+ .trace_cursor = xfs_allocbt_trace_cursor,
+ .trace_key = xfs_allocbt_trace_key,
+ .trace_record = xfs_allocbt_trace_record,
#endif
- /*
- * Off the right end or left end, return failure.
- */
- if (ptr > be16_to_cpu(block->bb_numrecs) || ptr <= 0) {
- *stat = 0;
- return 0;
- }
- /*
- * Point to the record and extract its data.
- */
- {
- xfs_alloc_rec_t *rec; /* record data */
-
- rec = XFS_ALLOC_REC_ADDR(block, ptr, cur);
- *bno = be32_to_cpu(rec->ar_startblock);
- *len = be32_to_cpu(rec->ar_blockcount);
- }
- *stat = 1;
- return 0;
-}
+};
/*
- * Increment cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
+ * Allocate a new allocation btree cursor.
*/
-int /* error */
-xfs_alloc_increment(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level, /* level in btree, 0 is leaf */
- int *stat) /* success/failure */
+struct xfs_btree_cur * /* new alloc btree cursor */
+xfs_allocbt_init_cursor(
+ struct xfs_mount *mp, /* file system mount point */
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_buf *agbp, /* buffer for agf structure */
+ xfs_agnumber_t agno, /* allocation group number */
+ xfs_btnum_t btnum) /* btree identifier */
{
- xfs_alloc_block_t *block; /* btree block */
- xfs_buf_t *bp; /* tree block buffer */
- int error; /* error return value */
- int lev; /* btree level */
-
- ASSERT(level < cur->bc_nlevels);
- /*
- * Read-ahead to the right at this level.
- */
- xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
- /*
- * Get a pointer to the btree block.
- */
- bp = cur->bc_bufs[level];
- block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
- return error;
-#endif
- /*
- * Increment the ptr at this level. If we're still in the block
- * then we're done.
- */
- if (++cur->bc_ptrs[level] <= be16_to_cpu(block->bb_numrecs)) {
- *stat = 1;
- return 0;
- }
- /*
- * If we just went off the right edge of the tree, return failure.
- */
- if (be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK) {
- *stat = 0;
- return 0;
- }
- /*
- * March up the tree incrementing pointers.
- * Stop when we don't go off the right edge of a block.
- */
- for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
- bp = cur->bc_bufs[lev];
- block = XFS_BUF_TO_ALLOC_BLOCK(bp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
- return error;
-#endif
- if (++cur->bc_ptrs[lev] <= be16_to_cpu(block->bb_numrecs))
- break;
- /*
- * Read-ahead the right block, we're going to read it
- * in the next loop.
- */
- xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
- }
- /*
- * If we went off the root then we are seriously confused.
- */
- ASSERT(lev < cur->bc_nlevels);
- /*
- * Now walk back down the tree, fixing up the cursor's buffer
- * pointers and key numbers.
- */
- for (bp = cur->bc_bufs[lev], block = XFS_BUF_TO_ALLOC_BLOCK(bp);
- lev > level; ) {
- xfs_agblock_t agbno; /* block number of btree block */
+ struct xfs_agf *agf = XFS_BUF_TO_AGF(agbp);
+ struct xfs_btree_cur *cur;
- agbno = be32_to_cpu(*XFS_ALLOC_PTR_ADDR(block, cur->bc_ptrs[lev], cur));
- if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
- cur->bc_private.a.agno, agbno, 0, &bp,
- XFS_ALLOC_BTREE_REF)))
- return error;
- lev--;
- xfs_btree_setbuf(cur, lev, bp);
- block = XFS_BUF_TO_ALLOC_BLOCK(bp);
- if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
- return error;
- cur->bc_ptrs[lev] = 1;
- }
- *stat = 1;
- return 0;
-}
+ ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT);
-/*
- * Insert the current record at the point referenced by cur.
- * The cursor may be inconsistent on return if splits have been done.
- */
-int /* error */
-xfs_alloc_insert(
- xfs_btree_cur_t *cur, /* btree cursor */
- int *stat) /* success/failure */
-{
- int error; /* error return value */
- int i; /* result value, 0 for failure */
- int level; /* current level number in btree */
- xfs_agblock_t nbno; /* new block number (split result) */
- xfs_btree_cur_t *ncur; /* new cursor (split result) */
- xfs_alloc_rec_t nrec; /* record being inserted this level */
- xfs_btree_cur_t *pcur; /* previous level's cursor */
+ cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
- level = 0;
- nbno = NULLAGBLOCK;
- nrec.ar_startblock = cpu_to_be32(cur->bc_rec.a.ar_startblock);
- nrec.ar_blockcount = cpu_to_be32(cur->bc_rec.a.ar_blockcount);
- ncur = NULL;
- pcur = cur;
- /*
- * Loop going up the tree, starting at the leaf level.
- * Stop when we don't get a split block, that must mean that
- * the insert is finished with this level.
- */
- do {
- /*
- * Insert nrec/nbno into this level of the tree.
- * Note if we fail, nbno will be null.
- */
- if ((error = xfs_alloc_insrec(pcur, level++, &nbno, &nrec, &ncur,
- &i))) {
- if (pcur != cur)
- xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
- return error;
- }
- /*
- * See if the cursor we just used is trash.
- * Can't trash the caller's cursor, but otherwise we should
- * if ncur is a new cursor or we're about to be done.
- */
- if (pcur != cur && (ncur || nbno == NULLAGBLOCK)) {
- cur->bc_nlevels = pcur->bc_nlevels;
- xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
- }
- /*
- * If we got a new cursor, switch to it.
- */
- if (ncur) {
- pcur = ncur;
- ncur = NULL;
- }
- } while (nbno != NULLAGBLOCK);
- *stat = i;
- return 0;
-}
+ cur->bc_tp = tp;
+ cur->bc_mp = mp;
+ cur->bc_nlevels = be32_to_cpu(agf->agf_levels[btnum]);
+ cur->bc_btnum = btnum;
+ cur->bc_blocklog = mp->m_sb.sb_blocklog;
-/*
- * Lookup the record equal to [bno, len] in the btree given by cur.
- */
-int /* error */
-xfs_alloc_lookup_eq(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_agblock_t bno, /* starting block of extent */
- xfs_extlen_t len, /* length of extent */
- int *stat) /* success/failure */
-{
- cur->bc_rec.a.ar_startblock = bno;
- cur->bc_rec.a.ar_blockcount = len;
- return xfs_alloc_lookup(cur, XFS_LOOKUP_EQ, stat);
-}
+ cur->bc_ops = &xfs_allocbt_ops;
+ if (btnum == XFS_BTNUM_CNT)
+ cur->bc_flags = XFS_BTREE_LASTREC_UPDATE;
-/*
- * Lookup the first record greater than or equal to [bno, len]
- * in the btree given by cur.
- */
-int /* error */
-xfs_alloc_lookup_ge(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_agblock_t bno, /* starting block of extent */
- xfs_extlen_t len, /* length of extent */
- int *stat) /* success/failure */
-{
- cur->bc_rec.a.ar_startblock = bno;
- cur->bc_rec.a.ar_blockcount = len;
- return xfs_alloc_lookup(cur, XFS_LOOKUP_GE, stat);
-}
+ cur->bc_private.a.agbp = agbp;
+ cur->bc_private.a.agno = agno;
-/*
- * Lookup the first record less than or equal to [bno, len]
- * in the btree given by cur.
- */
-int /* error */
-xfs_alloc_lookup_le(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_agblock_t bno, /* starting block of extent */
- xfs_extlen_t len, /* length of extent */
- int *stat) /* success/failure */
-{
- cur->bc_rec.a.ar_startblock = bno;
- cur->bc_rec.a.ar_blockcount = len;
- return xfs_alloc_lookup(cur, XFS_LOOKUP_LE, stat);
+ return cur;
}
/*
- * Update the record referred to by cur, to the value given by [bno, len].
- * This either works (return 0) or gets an EFSCORRUPTED error.
+ * Calculate number of records in an alloc btree block.
*/
-int /* error */
-xfs_alloc_update(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_agblock_t bno, /* starting block of extent */
- xfs_extlen_t len) /* length of extent */
+int
+xfs_allocbt_maxrecs(
+ struct xfs_mount *mp,
+ int blocklen,
+ int leaf)
{
- xfs_alloc_block_t *block; /* btree block to update */
- int error; /* error return value */
- int ptr; /* current record number (updating) */
+ blocklen -= XFS_ALLOC_BLOCK_LEN(mp);
- ASSERT(len > 0);
- /*
- * Pick up the a.g. freelist struct and the current block.
- */
- block = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[0]);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, block, 0, cur->bc_bufs[0])))
- return error;
-#endif
- /*
- * Get the address of the rec to be updated.
- */
- ptr = cur->bc_ptrs[0];
- {
- xfs_alloc_rec_t *rp; /* pointer to updated record */
-
- rp = XFS_ALLOC_REC_ADDR(block, ptr, cur);
- /*
- * Fill in the new contents and log them.
- */
- rp->ar_startblock = cpu_to_be32(bno);
- rp->ar_blockcount = cpu_to_be32(len);
- xfs_alloc_log_recs(cur, cur->bc_bufs[0], ptr, ptr);
- }
- /*
- * If it's the by-size btree and it's the last leaf block and
- * it's the last record... then update the size of the longest
- * extent in the a.g., which we cache in the a.g. freelist header.
- */
- if (cur->bc_btnum == XFS_BTNUM_CNT &&
- be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK &&
- ptr == be16_to_cpu(block->bb_numrecs)) {
- xfs_agf_t *agf; /* a.g. freespace header */
- xfs_agnumber_t seqno;
-
- agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
- seqno = be32_to_cpu(agf->agf_seqno);
- cur->bc_mp->m_perag[seqno].pagf_longest = len;
- agf->agf_longest = cpu_to_be32(len);
- xfs_alloc_log_agf(cur->bc_tp, cur->bc_private.a.agbp,
- XFS_AGF_LONGEST);
- }
- /*
- * Updating first record in leaf. Pass new key value up to our parent.
- */
- if (ptr == 1) {
- xfs_alloc_key_t key; /* key containing [bno, len] */
-
- key.ar_startblock = cpu_to_be32(bno);
- key.ar_blockcount = cpu_to_be32(len);
- if ((error = xfs_alloc_updkey(cur, &key, 1)))
- return error;
- }
- return 0;
+ if (leaf)
+ return blocklen / sizeof(xfs_alloc_rec_t);
+ return blocklen / (sizeof(xfs_alloc_key_t) + sizeof(xfs_alloc_ptr_t));
}
diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h
index 5bd1a2c8bd07..a6caa0022c9b 100644
--- a/fs/xfs/xfs_alloc_btree.h
+++ b/fs/xfs/xfs_alloc_btree.h
@@ -24,7 +24,6 @@
struct xfs_buf;
struct xfs_btree_cur;
-struct xfs_btree_sblock;
struct xfs_mount;
/*
@@ -50,16 +49,6 @@ typedef struct xfs_alloc_rec_incore {
/* btree pointer type */
typedef __be32 xfs_alloc_ptr_t;
-/* btree block header type */
-typedef struct xfs_btree_sblock xfs_alloc_block_t;
-
-#define XFS_BUF_TO_ALLOC_BLOCK(bp) ((xfs_alloc_block_t *)XFS_BUF_PTR(bp))
-
-/*
- * Real block structures have a size equal to the disk block size.
- */
-#define XFS_ALLOC_BLOCK_MAXRECS(lev,cur) ((cur)->bc_mp->m_alloc_mxr[lev != 0])
-#define XFS_ALLOC_BLOCK_MINRECS(lev,cur) ((cur)->bc_mp->m_alloc_mnr[lev != 0])
/*
* Minimum and maximum blocksize and sectorsize.
@@ -83,73 +72,39 @@ typedef struct xfs_btree_sblock xfs_alloc_block_t;
#define XFS_CNT_BLOCK(mp) ((xfs_agblock_t)(XFS_BNO_BLOCK(mp) + 1))
/*
- * Record, key, and pointer address macros for btree blocks.
- */
-#define XFS_ALLOC_REC_ADDR(bb,i,cur) \
- XFS_BTREE_REC_ADDR(xfs_alloc, bb, i)
-
-#define XFS_ALLOC_KEY_ADDR(bb,i,cur) \
- XFS_BTREE_KEY_ADDR(xfs_alloc, bb, i)
-
-#define XFS_ALLOC_PTR_ADDR(bb,i,cur) \
- XFS_BTREE_PTR_ADDR(xfs_alloc, bb, i, XFS_ALLOC_BLOCK_MAXRECS(1, cur))
-
-/*
- * Decrement cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-extern int xfs_alloc_decrement(struct xfs_btree_cur *cur, int level, int *stat);
-
-/*
- * Delete the record pointed to by cur.
- * The cursor refers to the place where the record was (could be inserted)
- * when the operation returns.
- */
-extern int xfs_alloc_delete(struct xfs_btree_cur *cur, int *stat);
-
-/*
- * Get the data from the pointed-to record.
- */
-extern int xfs_alloc_get_rec(struct xfs_btree_cur *cur, xfs_agblock_t *bno,
- xfs_extlen_t *len, int *stat);
-
-/*
- * Increment cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-extern int xfs_alloc_increment(struct xfs_btree_cur *cur, int level, int *stat);
-
-/*
- * Insert the current record at the point referenced by cur.
- * The cursor may be inconsistent on return if splits have been done.
- */
-extern int xfs_alloc_insert(struct xfs_btree_cur *cur, int *stat);
-
-/*
- * Lookup the record equal to [bno, len] in the btree given by cur.
- */
-extern int xfs_alloc_lookup_eq(struct xfs_btree_cur *cur, xfs_agblock_t bno,
- xfs_extlen_t len, int *stat);
-
-/*
- * Lookup the first record greater than or equal to [bno, len]
- * in the btree given by cur.
- */
-extern int xfs_alloc_lookup_ge(struct xfs_btree_cur *cur, xfs_agblock_t bno,
- xfs_extlen_t len, int *stat);
-
-/*
- * Lookup the first record less than or equal to [bno, len]
- * in the btree given by cur.
+ * Btree block header size depends on a superblock flag.
+ *
+ * (not quite yet, but soon)
*/
-extern int xfs_alloc_lookup_le(struct xfs_btree_cur *cur, xfs_agblock_t bno,
- xfs_extlen_t len, int *stat);
+#define XFS_ALLOC_BLOCK_LEN(mp) XFS_BTREE_SBLOCK_LEN
/*
- * Update the record referred to by cur, to the value given by [bno, len].
- * This either works (return 0) or gets an EFSCORRUPTED error.
- */
-extern int xfs_alloc_update(struct xfs_btree_cur *cur, xfs_agblock_t bno,
- xfs_extlen_t len);
+ * Record, key, and pointer address macros for btree blocks.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+#define XFS_ALLOC_REC_ADDR(mp, block, index) \
+ ((xfs_alloc_rec_t *) \
+ ((char *)(block) + \
+ XFS_ALLOC_BLOCK_LEN(mp) + \
+ (((index) - 1) * sizeof(xfs_alloc_rec_t))))
+
+#define XFS_ALLOC_KEY_ADDR(mp, block, index) \
+ ((xfs_alloc_key_t *) \
+ ((char *)(block) + \
+ XFS_ALLOC_BLOCK_LEN(mp) + \
+ ((index) - 1) * sizeof(xfs_alloc_key_t)))
+
+#define XFS_ALLOC_PTR_ADDR(mp, block, index, maxrecs) \
+ ((xfs_alloc_ptr_t *) \
+ ((char *)(block) + \
+ XFS_ALLOC_BLOCK_LEN(mp) + \
+ (maxrecs) * sizeof(xfs_alloc_key_t) + \
+ ((index) - 1) * sizeof(xfs_alloc_ptr_t)))
+
+extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *,
+ struct xfs_trans *, struct xfs_buf *,
+ xfs_agnumber_t, xfs_btnum_t);
+extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
#endif /* __XFS_ALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_arch.h b/fs/xfs/xfs_arch.h
index 0b3b5efe848c..53d5e70d1360 100644
--- a/fs/xfs/xfs_arch.h
+++ b/fs/xfs/xfs_arch.h
@@ -41,21 +41,36 @@
#endif
#ifdef XFS_NATIVE_HOST
-#define cpu_to_be16(val) ((__be16)(val))
-#define cpu_to_be32(val) ((__be32)(val))
-#define cpu_to_be64(val) ((__be64)(val))
-#define be16_to_cpu(val) ((__uint16_t)(val))
-#define be32_to_cpu(val) ((__uint32_t)(val))
-#define be64_to_cpu(val) ((__uint64_t)(val))
+#define cpu_to_be16(val) ((__force __be16)(__u16)(val))
+#define cpu_to_be32(val) ((__force __be32)(__u32)(val))
+#define cpu_to_be64(val) ((__force __be64)(__u64)(val))
+#define be16_to_cpu(val) ((__force __u16)(__be16)(val))
+#define be32_to_cpu(val) ((__force __u32)(__be32)(val))
+#define be64_to_cpu(val) ((__force __u64)(__be64)(val))
#else
-#define cpu_to_be16(val) (__swab16((__uint16_t)(val)))
-#define cpu_to_be32(val) (__swab32((__uint32_t)(val)))
-#define cpu_to_be64(val) (__swab64((__uint64_t)(val)))
-#define be16_to_cpu(val) (__swab16((__be16)(val)))
-#define be32_to_cpu(val) (__swab32((__be32)(val)))
-#define be64_to_cpu(val) (__swab64((__be64)(val)))
+#define cpu_to_be16(val) ((__force __be16)__swab16((__u16)(val)))
+#define cpu_to_be32(val) ((__force __be32)__swab32((__u32)(val)))
+#define cpu_to_be64(val) ((__force __be64)__swab64((__u64)(val)))
+#define be16_to_cpu(val) (__swab16((__force __u16)(__be16)(val)))
+#define be32_to_cpu(val) (__swab32((__force __u32)(__be32)(val)))
+#define be64_to_cpu(val) (__swab64((__force __u64)(__be64)(val)))
#endif
+static inline void be16_add_cpu(__be16 *a, __s16 b)
+{
+ *a = cpu_to_be16(be16_to_cpu(*a) + b);
+}
+
+static inline void be32_add_cpu(__be32 *a, __s32 b)
+{
+ *a = cpu_to_be32(be32_to_cpu(*a) + b);
+}
+
+static inline void be64_add_cpu(__be64 *a, __s64 b)
+{
+ *a = cpu_to_be64(be64_to_cpu(*a) + b);
+}
+
#endif /* __KERNEL__ */
/* do we need conversion? */
diff --git a/fs/xfs/xfs_bit.h b/fs/xfs/xfs_bit.h
index 8e0e463dae2d..bca7b243c319 100644
--- a/fs/xfs/xfs_bit.h
+++ b/fs/xfs/xfs_bit.h
@@ -61,8 +61,7 @@ static inline int xfs_highbit64(__uint64_t v)
/* Get low bit set out of 32-bit argument, -1 if none set */
static inline int xfs_lowbit32(__uint32_t v)
{
- unsigned long t = v;
- return (v) ? find_first_bit(&t, 32) : -1;
+ return ffs(v) - 1;
}
/* Get low bit set out of 64-bit argument, -1 if none set */
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index a1aab9275d5a..138308e70d14 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -393,8 +393,8 @@ xfs_bmap_count_leaves(
STATIC void
xfs_bmap_disk_count_leaves(
- xfs_extnum_t idx,
- xfs_bmbt_block_t *block,
+ struct xfs_mount *mp,
+ struct xfs_btree_block *block,
int numrecs,
int *count);
@@ -402,6 +402,53 @@ xfs_bmap_disk_count_leaves(
* Bmap internal routines.
*/
+STATIC int /* error */
+xfs_bmbt_lookup_eq(
+ struct xfs_btree_cur *cur,
+ xfs_fileoff_t off,
+ xfs_fsblock_t bno,
+ xfs_filblks_t len,
+ int *stat) /* success/failure */
+{
+ cur->bc_rec.b.br_startoff = off;
+ cur->bc_rec.b.br_startblock = bno;
+ cur->bc_rec.b.br_blockcount = len;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+
+STATIC int /* error */
+xfs_bmbt_lookup_ge(
+ struct xfs_btree_cur *cur,
+ xfs_fileoff_t off,
+ xfs_fsblock_t bno,
+ xfs_filblks_t len,
+ int *stat) /* success/failure */
+{
+ cur->bc_rec.b.br_startoff = off;
+ cur->bc_rec.b.br_startblock = bno;
+ cur->bc_rec.b.br_blockcount = len;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+}
+
+/*
+ * Update the record referred to by cur to the value given
+ * by [off, bno, len, state].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int
+xfs_bmbt_update(
+ struct xfs_btree_cur *cur,
+ xfs_fileoff_t off,
+ xfs_fsblock_t bno,
+ xfs_filblks_t len,
+ xfs_exntst_t state)
+{
+ union xfs_btree_rec rec;
+
+ xfs_bmbt_disk_set_allf(&rec.bmbt, off, bno, len, state);
+ return xfs_btree_update(cur, &rec);
+}
+
/*
* Called from xfs_bmap_add_attrfork to handle btree format files.
*/
@@ -422,15 +469,14 @@ xfs_bmap_add_attrfork_btree(
if (ip->i_df.if_broot_bytes <= XFS_IFORK_DSIZE(ip))
*flags |= XFS_ILOG_DBROOT;
else {
- cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip,
- XFS_DATA_FORK);
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, XFS_DATA_FORK);
cur->bc_private.b.flist = flist;
cur->bc_private.b.firstblock = *firstblock;
if ((error = xfs_bmbt_lookup_ge(cur, 0, 0, 0, &stat)))
goto error0;
/* must be at least one entry */
XFS_WANT_CORRUPTED_GOTO(stat == 1, error0);
- if ((error = xfs_bmbt_newroot(cur, flags, &stat)))
+ if ((error = xfs_btree_new_iroot(cur, flags, &stat)))
goto error0;
if (stat == 0) {
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
@@ -818,10 +864,10 @@ xfs_bmap_add_extent_delay_real(
RIGHT.br_blockcount, &i)))
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
- if ((error = xfs_bmbt_delete(cur, &i)))
+ if ((error = xfs_btree_delete(cur, &i)))
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
- if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+ if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
@@ -931,7 +977,7 @@ xfs_bmap_add_extent_delay_real(
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 0, done);
cur->bc_rec.b.br_state = XFS_EXT_NORM;
- if ((error = xfs_bmbt_insert(cur, &i)))
+ if ((error = xfs_btree_insert(cur, &i)))
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
@@ -1007,7 +1053,7 @@ xfs_bmap_add_extent_delay_real(
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 0, done);
cur->bc_rec.b.br_state = XFS_EXT_NORM;
- if ((error = xfs_bmbt_insert(cur, &i)))
+ if ((error = xfs_btree_insert(cur, &i)))
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
@@ -1097,7 +1143,7 @@ xfs_bmap_add_extent_delay_real(
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 0, done);
cur->bc_rec.b.br_state = XFS_EXT_NORM;
- if ((error = xfs_bmbt_insert(cur, &i)))
+ if ((error = xfs_btree_insert(cur, &i)))
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
@@ -1152,7 +1198,7 @@ xfs_bmap_add_extent_delay_real(
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 0, done);
cur->bc_rec.b.br_state = XFS_EXT_NORM;
- if ((error = xfs_bmbt_insert(cur, &i)))
+ if ((error = xfs_btree_insert(cur, &i)))
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
@@ -1379,16 +1425,16 @@ xfs_bmap_add_extent_unwritten_real(
RIGHT.br_blockcount, &i)))
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
- if ((error = xfs_bmbt_delete(cur, &i)))
+ if ((error = xfs_btree_delete(cur, &i)))
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
- if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+ if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
- if ((error = xfs_bmbt_delete(cur, &i)))
+ if ((error = xfs_btree_delete(cur, &i)))
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
- if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+ if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
@@ -1428,10 +1474,10 @@ xfs_bmap_add_extent_unwritten_real(
&i)))
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
- if ((error = xfs_bmbt_delete(cur, &i)))
+ if ((error = xfs_btree_delete(cur, &i)))
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
- if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+ if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_update(cur, LEFT.br_startoff,
@@ -1471,10 +1517,10 @@ xfs_bmap_add_extent_unwritten_real(
RIGHT.br_blockcount, &i)))
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
- if ((error = xfs_bmbt_delete(cur, &i)))
+ if ((error = xfs_btree_delete(cur, &i)))
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
- if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+ if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_update(cur, new->br_startoff,
@@ -1557,7 +1603,7 @@ xfs_bmap_add_extent_unwritten_real(
PREV.br_blockcount - new->br_blockcount,
oldext)))
goto done;
- if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+ if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
if (xfs_bmbt_update(cur, LEFT.br_startoff,
LEFT.br_startblock,
@@ -1605,7 +1651,7 @@ xfs_bmap_add_extent_unwritten_real(
oldext)))
goto done;
cur->bc_rec.b = *new;
- if ((error = xfs_bmbt_insert(cur, &i)))
+ if ((error = xfs_btree_insert(cur, &i)))
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
@@ -1647,7 +1693,7 @@ xfs_bmap_add_extent_unwritten_real(
PREV.br_blockcount - new->br_blockcount,
oldext)))
goto done;
- if ((error = xfs_bmbt_increment(cur, 0, &i)))
+ if ((error = xfs_btree_increment(cur, 0, &i)))
goto done;
if ((error = xfs_bmbt_update(cur, new->br_startoff,
new->br_startblock,
@@ -1695,7 +1741,7 @@ xfs_bmap_add_extent_unwritten_real(
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 0, done);
cur->bc_rec.b.br_state = XFS_EXT_NORM;
- if ((error = xfs_bmbt_insert(cur, &i)))
+ if ((error = xfs_btree_insert(cur, &i)))
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
@@ -1743,7 +1789,7 @@ xfs_bmap_add_extent_unwritten_real(
cur->bc_rec.b = PREV;
cur->bc_rec.b.br_blockcount =
new->br_startoff - PREV.br_startoff;
- if ((error = xfs_bmbt_insert(cur, &i)))
+ if ((error = xfs_btree_insert(cur, &i)))
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
/*
@@ -1758,7 +1804,7 @@ xfs_bmap_add_extent_unwritten_real(
XFS_WANT_CORRUPTED_GOTO(i == 0, done);
/* new middle extent - newext */
cur->bc_rec.b.br_state = new->br_state;
- if ((error = xfs_bmbt_insert(cur, &i)))
+ if ((error = xfs_btree_insert(cur, &i)))
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
@@ -2106,10 +2152,10 @@ xfs_bmap_add_extent_hole_real(
right.br_blockcount, &i)))
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
- if ((error = xfs_bmbt_delete(cur, &i)))
+ if ((error = xfs_btree_delete(cur, &i)))
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
- if ((error = xfs_bmbt_decrement(cur, 0, &i)))
+ if ((error = xfs_btree_decrement(cur, 0, &i)))
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
if ((error = xfs_bmbt_update(cur, left.br_startoff,
@@ -2218,7 +2264,7 @@ xfs_bmap_add_extent_hole_real(
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 0, done);
cur->bc_rec.b.br_state = new->br_state;
- if ((error = xfs_bmbt_insert(cur, &i)))
+ if ((error = xfs_btree_insert(cur, &i)))
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
}
@@ -2996,24 +3042,24 @@ xfs_bmap_btree_to_extents(
int whichfork) /* data or attr fork */
{
/* REFERENCED */
- xfs_bmbt_block_t *cblock;/* child btree block */
+ struct xfs_btree_block *cblock;/* child btree block */
xfs_fsblock_t cbno; /* child block number */
xfs_buf_t *cbp; /* child block's buffer */
int error; /* error return value */
xfs_ifork_t *ifp; /* inode fork data */
xfs_mount_t *mp; /* mount point structure */
__be64 *pp; /* ptr to block address */
- xfs_bmbt_block_t *rblock;/* root btree block */
+ struct xfs_btree_block *rblock;/* root btree block */
+ mp = ip->i_mount;
ifp = XFS_IFORK_PTR(ip, whichfork);
ASSERT(ifp->if_flags & XFS_IFEXTENTS);
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
rblock = ifp->if_broot;
ASSERT(be16_to_cpu(rblock->bb_level) == 1);
ASSERT(be16_to_cpu(rblock->bb_numrecs) == 1);
- ASSERT(XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes) == 1);
- mp = ip->i_mount;
- pp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, ifp->if_broot_bytes);
+ ASSERT(xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0) == 1);
+ pp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, ifp->if_broot_bytes);
cbno = be64_to_cpu(*pp);
*logflagsp = 0;
#ifdef DEBUG
@@ -3023,8 +3069,8 @@ xfs_bmap_btree_to_extents(
if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp,
XFS_BMAP_BTREE_REF)))
return error;
- cblock = XFS_BUF_TO_BMBT_BLOCK(cbp);
- if ((error = xfs_btree_check_lblock(cur, cblock, 0, cbp)))
+ cblock = XFS_BUF_TO_BLOCK(cbp);
+ if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
return error;
xfs_bmap_add_free(cbno, 1, cur->bc_private.b.flist, mp);
ip->i_d.di_nblocks--;
@@ -3170,7 +3216,7 @@ xfs_bmap_del_extent(
flags |= XFS_ILOG_FEXT(whichfork);
break;
}
- if ((error = xfs_bmbt_delete(cur, &i)))
+ if ((error = xfs_btree_delete(cur, &i)))
goto done;
XFS_WANT_CORRUPTED_GOTO(i == 1, done);
break;
@@ -3254,10 +3300,10 @@ xfs_bmap_del_extent(
got.br_startblock, temp,
got.br_state)))
goto done;
- if ((error = xfs_bmbt_increment(cur, 0, &i)))
+ if ((error = xfs_btree_increment(cur, 0, &i)))
goto done;
cur->bc_rec.b = new;
- error = xfs_bmbt_insert(cur, &i);
+ error = xfs_btree_insert(cur, &i);
if (error && error != ENOSPC)
goto done;
/*
@@ -3404,11 +3450,11 @@ xfs_bmap_extents_to_btree(
int *logflagsp, /* inode logging flags */
int whichfork) /* data or attr fork */
{
- xfs_bmbt_block_t *ablock; /* allocated (child) bt block */
+ struct xfs_btree_block *ablock; /* allocated (child) bt block */
xfs_buf_t *abp; /* buffer for ablock */
xfs_alloc_arg_t args; /* allocation arguments */
xfs_bmbt_rec_t *arp; /* child record pointer */
- xfs_bmbt_block_t *block; /* btree root block */
+ struct xfs_btree_block *block; /* btree root block */
xfs_btree_cur_t *cur; /* bmap btree cursor */
xfs_bmbt_rec_host_t *ep; /* extent record pointer */
int error; /* error return value */
@@ -3428,6 +3474,7 @@ xfs_bmap_extents_to_btree(
*/
xfs_iroot_realloc(ip, 1, whichfork);
ifp->if_flags |= XFS_IFBROOT;
+
/*
* Fill in the root.
*/
@@ -3435,14 +3482,14 @@ xfs_bmap_extents_to_btree(
block->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
block->bb_level = cpu_to_be16(1);
block->bb_numrecs = cpu_to_be16(1);
- block->bb_leftsib = cpu_to_be64(NULLDFSBNO);
- block->bb_rightsib = cpu_to_be64(NULLDFSBNO);
+ block->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
+ block->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
+
/*
* Need a cursor. Can't allocate until bb_level is filled in.
*/
mp = ip->i_mount;
- cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip,
- whichfork);
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
cur->bc_private.b.firstblock = *firstblock;
cur->bc_private.b.flist = flist;
cur->bc_private.b.flags = wasdel ? XFS_BTCUR_BPRV_WASDEL : 0;
@@ -3489,12 +3536,12 @@ xfs_bmap_extents_to_btree(
/*
* Fill in the child block.
*/
- ablock = XFS_BUF_TO_BMBT_BLOCK(abp);
+ ablock = XFS_BUF_TO_BLOCK(abp);
ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
ablock->bb_level = 0;
- ablock->bb_leftsib = cpu_to_be64(NULLDFSBNO);
- ablock->bb_rightsib = cpu_to_be64(NULLDFSBNO);
- arp = XFS_BMAP_REC_IADDR(ablock, 1, cur);
+ ablock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
+ ablock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
+ arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
for (cnt = i = 0; i < nextents; i++) {
ep = xfs_iext_get_ext(ifp, i);
@@ -3505,21 +3552,24 @@ xfs_bmap_extents_to_btree(
}
}
ASSERT(cnt == XFS_IFORK_NEXTENTS(ip, whichfork));
- ablock->bb_numrecs = cpu_to_be16(cnt);
+ xfs_btree_set_numrecs(ablock, cnt);
+
/*
* Fill in the root key and pointer.
*/
- kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
- arp = XFS_BMAP_REC_IADDR(ablock, 1, cur);
+ kp = XFS_BMBT_KEY_ADDR(mp, block, 1);
+ arp = XFS_BMBT_REC_ADDR(mp, ablock, 1);
kp->br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(arp));
- pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
+ pp = XFS_BMBT_PTR_ADDR(mp, block, 1, xfs_bmbt_get_maxrecs(cur,
+ be16_to_cpu(block->bb_level)));
*pp = cpu_to_be64(args.fsbno);
+
/*
* Do all this logging at the end so that
* the root is at the right level.
*/
- xfs_bmbt_log_block(cur, abp, XFS_BB_ALL_BITS);
- xfs_bmbt_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs));
+ xfs_btree_log_block(cur, abp, XFS_BB_ALL_BITS);
+ xfs_btree_log_recs(cur, abp, 1, be16_to_cpu(ablock->bb_numrecs));
ASSERT(*curp == NULL);
*curp = cur;
*logflagsp = XFS_ILOG_CORE | XFS_ILOG_FBROOT(whichfork);
@@ -4176,7 +4226,7 @@ xfs_bmap_compute_maxlevels(
maxleafents = MAXAEXTNUM;
sz = XFS_BMDR_SPACE_CALC(MINABTPTRS);
}
- maxrootrecs = (int)XFS_BTREE_BLOCK_MAXRECS(sz, xfs_bmdr, 0);
+ maxrootrecs = xfs_bmdr_maxrecs(mp, sz, 0);
minleafrecs = mp->m_bmap_dmnr[0];
minnoderecs = mp->m_bmap_dmnr[1];
maxblocks = (maxleafents + minleafrecs - 1) / minleafrecs;
@@ -4242,9 +4292,15 @@ xfs_bmap_finish(
* We have a new transaction, so we should return committed=1,
* even though we're returning an error.
*/
- if (error) {
+ if (error)
return error;
- }
+
+ /*
+ * transaction commit worked ok so we can drop the extra ticket
+ * reference that we gained in xfs_trans_dup()
+ */
+ xfs_log_ticket_put(ntp->t_ticket);
+
if ((error = xfs_trans_reserve(ntp, 0, logres, 0, XFS_TRANS_PERM_LOG_RES,
logcount)))
return error;
@@ -4474,6 +4530,22 @@ xfs_bmap_one_block(
return rval;
}
+STATIC int
+xfs_bmap_sanity_check(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ int level)
+{
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+
+ if (be32_to_cpu(block->bb_magic) != XFS_BMAP_MAGIC ||
+ be16_to_cpu(block->bb_level) != level ||
+ be16_to_cpu(block->bb_numrecs) == 0 ||
+ be16_to_cpu(block->bb_numrecs) > mp->m_bmap_dmxr[level != 0])
+ return 0;
+ return 1;
+}
+
/*
* Read in the extents to if_extents.
* All inode fields are set up by caller, we just traverse the btree
@@ -4486,7 +4558,7 @@ xfs_bmap_read_extents(
xfs_inode_t *ip, /* incore inode */
int whichfork) /* data or attr fork */
{
- xfs_bmbt_block_t *block; /* current btree block */
+ struct xfs_btree_block *block; /* current btree block */
xfs_fsblock_t bno; /* block # of "block" */
xfs_buf_t *bp; /* buffer for "block" */
int error; /* error return value */
@@ -4510,7 +4582,7 @@ xfs_bmap_read_extents(
*/
level = be16_to_cpu(block->bb_level);
ASSERT(level > 0);
- pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes);
+ pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
bno = be64_to_cpu(*pp);
ASSERT(bno != NULLDFSBNO);
ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
@@ -4523,13 +4595,13 @@ xfs_bmap_read_extents(
if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
XFS_BMAP_BTREE_REF)))
return error;
- block = XFS_BUF_TO_BMBT_BLOCK(bp);
+ block = XFS_BUF_TO_BLOCK(bp);
XFS_WANT_CORRUPTED_GOTO(
- XFS_BMAP_SANITY_CHECK(mp, block, level),
+ xfs_bmap_sanity_check(mp, bp, level),
error0);
if (level == 0)
break;
- pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]);
+ pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
bno = be64_to_cpu(*pp);
XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
xfs_trans_brelse(tp, bp);
@@ -4549,7 +4621,7 @@ xfs_bmap_read_extents(
xfs_extnum_t start;
- num_recs = be16_to_cpu(block->bb_numrecs);
+ num_recs = xfs_btree_get_numrecs(block);
if (unlikely(i + num_recs > room)) {
ASSERT(i + num_recs <= room);
xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
@@ -4561,18 +4633,18 @@ xfs_bmap_read_extents(
goto error0;
}
XFS_WANT_CORRUPTED_GOTO(
- XFS_BMAP_SANITY_CHECK(mp, block, 0),
+ xfs_bmap_sanity_check(mp, bp, 0),
error0);
/*
* Read-ahead the next leaf block, if any.
*/
- nextbno = be64_to_cpu(block->bb_rightsib);
+ nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
if (nextbno != NULLFSBLOCK)
xfs_btree_reada_bufl(mp, nextbno, 1);
/*
* Copy records into the extent records.
*/
- frp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, 1);
+ frp = XFS_BMBT_REC_ADDR(mp, block, 1);
start = i;
for (j = 0; j < num_recs; j++, i++, frp++) {
xfs_bmbt_rec_host_t *trp = xfs_iext_get_ext(ifp, i);
@@ -4603,7 +4675,7 @@ xfs_bmap_read_extents(
if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
XFS_BMAP_BTREE_REF)))
return error;
- block = XFS_BUF_TO_BMBT_BLOCK(bp);
+ block = XFS_BUF_TO_BLOCK(bp);
}
ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)));
ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork));
@@ -5029,8 +5101,7 @@ xfs_bmapi(
if (abno == NULLFSBLOCK)
break;
if ((ifp->if_flags & XFS_IFBROOT) && !cur) {
- cur = xfs_btree_init_cursor(mp,
- tp, NULL, 0, XFS_BTNUM_BMAP,
+ cur = xfs_bmbt_init_cursor(mp, tp,
ip, whichfork);
cur->bc_private.b.firstblock =
*firstblock;
@@ -5147,9 +5218,8 @@ xfs_bmapi(
*/
ASSERT(mval->br_blockcount <= len);
if ((ifp->if_flags & XFS_IFBROOT) && !cur) {
- cur = xfs_btree_init_cursor(mp,
- tp, NULL, 0, XFS_BTNUM_BMAP,
- ip, whichfork);
+ cur = xfs_bmbt_init_cursor(mp,
+ tp, ip, whichfork);
cur->bc_private.b.firstblock =
*firstblock;
cur->bc_private.b.flist = flist;
@@ -5440,8 +5510,7 @@ xfs_bunmapi(
logflags = 0;
if (ifp->if_flags & XFS_IFBROOT) {
ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
- cur = xfs_btree_init_cursor(mp, tp, NULL, 0, XFS_BTNUM_BMAP, ip,
- whichfork);
+ cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork);
cur->bc_private.b.firstblock = *firstblock;
cur->bc_private.b.flist = flist;
cur->bc_private.b.flags = 0;
@@ -5742,14 +5811,17 @@ error0:
STATIC int
xfs_getbmapx_fix_eof_hole(
xfs_inode_t *ip, /* xfs incore inode pointer */
- struct getbmap *out, /* output structure */
+ struct getbmapx *out, /* output structure */
int prealloced, /* this is a file with
- * preallocated data space */
+ * preallocated data space */
__int64_t end, /* last block requested */
xfs_fsblock_t startblock)
{
__int64_t fixlen;
xfs_mount_t *mp; /* file system mount point */
+ xfs_ifork_t *ifp; /* inode fork pointer */
+ xfs_extnum_t lastx; /* last extent pointer */
+ xfs_fileoff_t fileblock;
if (startblock == HOLESTARTBLOCK) {
mp = ip->i_mount;
@@ -5763,21 +5835,33 @@ xfs_getbmapx_fix_eof_hole(
out->bmv_length = fixlen;
}
} else {
- out->bmv_block = XFS_FSB_TO_DB(ip, startblock);
+ if (startblock == DELAYSTARTBLOCK)
+ out->bmv_block = -2;
+ else
+ out->bmv_block = XFS_FSB_TO_DB(ip, startblock);
+ fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset);
+ ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+ if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
+ (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1))
+ out->bmv_oflags |= BMV_OF_LAST;
}
return 1;
}
/*
- * Fcntl interface to xfs_bmapi.
+ * Get inode's extents as described in bmv, and format for output.
+ * Calls formatter to fill the user's buffer until all extents
+ * are mapped, until the passed-in bmv->bmv_count slots have
+ * been filled, or until the formatter short-circuits the loop,
+ * if it is tracking filled-in extents on its own.
*/
int /* error code */
xfs_getbmap(
xfs_inode_t *ip,
- struct getbmap *bmv, /* user bmap structure */
- void __user *ap, /* pointer to user's array */
- int interface) /* interface flags */
+ struct getbmapx *bmv, /* user bmap structure */
+ xfs_bmap_format_t formatter, /* format to user */
+ void *arg) /* formatter arg */
{
__int64_t bmvend; /* last block requested */
int error; /* return value */
@@ -5790,19 +5874,17 @@ xfs_getbmap(
int nexleft; /* # of user extents left */
int subnex; /* # of bmapi's can do */
int nmap; /* number of map entries */
- struct getbmap out; /* output structure */
+ struct getbmapx out; /* output structure */
int whichfork; /* data or attr fork */
int prealloced; /* this is a file with
* preallocated data space */
- int sh_unwritten; /* true, if unwritten */
- /* extents listed separately */
+ int iflags; /* interface flags */
int bmapi_flags; /* flags for xfs_bmapi */
- __int32_t oflags; /* getbmapx bmv_oflags field */
mp = ip->i_mount;
+ iflags = bmv->bmv_iflags;
- whichfork = interface & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
- sh_unwritten = (interface & BMV_IF_PREALLOC) != 0;
+ whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
/* If the BMV_IF_NO_DMAPI_READ interface bit specified, do not
* generate a DMAPI read event. Otherwise, if the DM_EVENT_READ
@@ -5817,7 +5899,7 @@ xfs_getbmap(
* could misinterpret holes in a DMAPI file as true holes,
* when in fact they may represent offline user data.
*/
- if ((interface & BMV_IF_NO_DMAPI_READ) == 0 &&
+ if ((iflags & BMV_IF_NO_DMAPI_READ) == 0 &&
DM_EVENT_ENABLED(ip, DM_EVENT_READ) &&
whichfork == XFS_DATA_FORK) {
error = XFS_SEND_DATA(mp, DM_EVENT_READ, ip, 0, 0, 0, NULL);
@@ -5873,8 +5955,9 @@ xfs_getbmap(
xfs_ilock(ip, XFS_IOLOCK_SHARED);
- if (whichfork == XFS_DATA_FORK &&
- (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size)) {
+ if (((iflags & BMV_IF_DELALLOC) == 0) &&
+ (whichfork == XFS_DATA_FORK) &&
+ (ip->i_delayed_blks || ip->i_size > ip->i_d.di_size)) {
/* xfs_fsize_t last_byte = xfs_file_last_byte(ip); */
error = xfs_flush_pages(ip, (xfs_off_t)0,
-1, 0, FI_REMAPF);
@@ -5884,7 +5967,8 @@ xfs_getbmap(
}
}
- ASSERT(whichfork == XFS_ATTR_FORK || ip->i_delayed_blks == 0);
+ ASSERT(whichfork == XFS_ATTR_FORK || (iflags & BMV_IF_DELALLOC) ||
+ ip->i_delayed_blks == 0);
lock = xfs_ilock_map_shared(ip);
@@ -5896,7 +5980,7 @@ xfs_getbmap(
nex = XFS_IFORK_NEXTENTS(ip, whichfork) * 2 + 1;
bmapi_flags = XFS_BMAPI_AFLAG(whichfork) |
- ((sh_unwritten) ? 0 : XFS_BMAPI_IGSTATE);
+ ((iflags & BMV_IF_PREALLOC) ? 0 : XFS_BMAPI_IGSTATE);
/*
* Allocate enough space to handle "subnex" maps at a time.
@@ -5906,9 +5990,12 @@ xfs_getbmap(
bmv->bmv_entries = 0;
- if (XFS_IFORK_NEXTENTS(ip, whichfork) == 0) {
- error = 0;
- goto unlock_and_return;
+ if ((XFS_IFORK_NEXTENTS(ip, whichfork) == 0)) {
+ if (((iflags & BMV_IF_DELALLOC) == 0) ||
+ whichfork == XFS_ATTR_FORK) {
+ error = 0;
+ goto unlock_and_return;
+ }
}
nexleft = nex;
@@ -5924,52 +6011,40 @@ xfs_getbmap(
ASSERT(nmap <= subnex);
for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) {
- nexleft--;
- oflags = (map[i].br_state == XFS_EXT_UNWRITTEN) ?
- BMV_OF_PREALLOC : 0;
+ out.bmv_oflags = 0;
+ if (map[i].br_state == XFS_EXT_UNWRITTEN)
+ out.bmv_oflags |= BMV_OF_PREALLOC;
+ else if (map[i].br_startblock == DELAYSTARTBLOCK)
+ out.bmv_oflags |= BMV_OF_DELALLOC;
out.bmv_offset = XFS_FSB_TO_BB(mp, map[i].br_startoff);
out.bmv_length = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
- ASSERT(map[i].br_startblock != DELAYSTARTBLOCK);
+ out.bmv_unused1 = out.bmv_unused2 = 0;
+ ASSERT(((iflags & BMV_IF_DELALLOC) != 0) ||
+ (map[i].br_startblock != DELAYSTARTBLOCK));
if (map[i].br_startblock == HOLESTARTBLOCK &&
whichfork == XFS_ATTR_FORK) {
/* came to the end of attribute fork */
+ out.bmv_oflags |= BMV_OF_LAST;
goto unlock_and_return;
} else {
+ int full = 0; /* user array is full */
+
if (!xfs_getbmapx_fix_eof_hole(ip, &out,
prealloced, bmvend,
map[i].br_startblock)) {
goto unlock_and_return;
}
- /* return either getbmap/getbmapx structure. */
- if (interface & BMV_IF_EXTENDED) {
- struct getbmapx outx;
-
- GETBMAP_CONVERT(out,outx);
- outx.bmv_oflags = oflags;
- outx.bmv_unused1 = outx.bmv_unused2 = 0;
- if (copy_to_user(ap, &outx,
- sizeof(outx))) {
- error = XFS_ERROR(EFAULT);
- goto unlock_and_return;
- }
- } else {
- if (copy_to_user(ap, &out,
- sizeof(out))) {
- error = XFS_ERROR(EFAULT);
- goto unlock_and_return;
- }
- }
+ /* format results & advance arg */
+ error = formatter(&arg, &out, &full);
+ if (error || full)
+ goto unlock_and_return;
+ nexleft--;
bmv->bmv_offset =
out.bmv_offset + out.bmv_length;
bmv->bmv_length = MAX((__int64_t)0,
(__int64_t)(bmvend - bmv->bmv_offset));
bmv->bmv_entries++;
- ap = (interface & BMV_IF_EXTENDED) ?
- (void __user *)
- ((struct getbmapx __user *)ap + 1) :
- (void __user *)
- ((struct getbmap __user *)ap + 1);
}
}
} while (nmap && nexleft && bmv->bmv_length);
@@ -6131,7 +6206,7 @@ xfs_bmap_get_bp(
void
xfs_check_block(
- xfs_bmbt_block_t *block,
+ struct xfs_btree_block *block,
xfs_mount_t *mp,
int root,
short sz)
@@ -6143,36 +6218,29 @@ xfs_check_block(
ASSERT(be16_to_cpu(block->bb_level) > 0);
prevp = NULL;
- for( i = 1; i <= be16_to_cpu(block->bb_numrecs); i++) {
+ for( i = 1; i <= xfs_btree_get_numrecs(block); i++) {
dmxr = mp->m_bmap_dmxr[0];
-
- if (root) {
- keyp = XFS_BMAP_BROOT_KEY_ADDR(block, i, sz);
- } else {
- keyp = XFS_BTREE_KEY_ADDR(xfs_bmbt, block, i);
- }
+ keyp = XFS_BMBT_KEY_ADDR(mp, block, i);
if (prevp) {
- xfs_btree_check_key(XFS_BTNUM_BMAP, prevp, keyp);
+ ASSERT(be64_to_cpu(prevp->br_startoff) <
+ be64_to_cpu(keyp->br_startoff));
}
prevp = keyp;
/*
* Compare the block numbers to see if there are dups.
*/
+ if (root)
+ pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, i, sz);
+ else
+ pp = XFS_BMBT_PTR_ADDR(mp, block, i, dmxr);
- if (root) {
- pp = XFS_BMAP_BROOT_PTR_ADDR(block, i, sz);
- } else {
- pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, i, dmxr);
- }
for (j = i+1; j <= be16_to_cpu(block->bb_numrecs); j++) {
- if (root) {
- thispa = XFS_BMAP_BROOT_PTR_ADDR(block, j, sz);
- } else {
- thispa = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, j,
- dmxr);
- }
+ if (root)
+ thispa = XFS_BMAP_BROOT_PTR_ADDR(mp, block, j, sz);
+ else
+ thispa = XFS_BMBT_PTR_ADDR(mp, block, j, dmxr);
if (*thispa == *pp) {
cmn_err(CE_WARN, "%s: thispa(%d) == pp(%d) %Ld",
__func__, j, i,
@@ -6195,7 +6263,7 @@ xfs_bmap_check_leaf_extents(
xfs_inode_t *ip, /* incore inode pointer */
int whichfork) /* data or attr fork */
{
- xfs_bmbt_block_t *block; /* current btree block */
+ struct xfs_btree_block *block; /* current btree block */
xfs_fsblock_t bno; /* block # of "block" */
xfs_buf_t *bp; /* buffer for "block" */
int error; /* error return value */
@@ -6223,7 +6291,7 @@ xfs_bmap_check_leaf_extents(
level = be16_to_cpu(block->bb_level);
ASSERT(level > 0);
xfs_check_block(block, mp, 1, ifp->if_broot_bytes);
- pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes);
+ pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
bno = be64_to_cpu(*pp);
ASSERT(bno != NULLDFSBNO);
@@ -6245,9 +6313,9 @@ xfs_bmap_check_leaf_extents(
if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
XFS_BMAP_BTREE_REF)))
goto error_norelse;
- block = XFS_BUF_TO_BMBT_BLOCK(bp);
+ block = XFS_BUF_TO_BLOCK(bp);
XFS_WANT_CORRUPTED_GOTO(
- XFS_BMAP_SANITY_CHECK(mp, block, level),
+ xfs_bmap_sanity_check(mp, bp, level),
error0);
if (level == 0)
break;
@@ -6258,7 +6326,7 @@ xfs_bmap_check_leaf_extents(
*/
xfs_check_block(block, mp, 0, 0);
- pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]);
+ pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
bno = be64_to_cpu(*pp);
XFS_WANT_CORRUPTED_GOTO(XFS_FSB_SANITY_CHECK(mp, bno), error0);
if (bp_release) {
@@ -6280,13 +6348,13 @@ xfs_bmap_check_leaf_extents(
xfs_extnum_t num_recs;
- num_recs = be16_to_cpu(block->bb_numrecs);
+ num_recs = xfs_btree_get_numrecs(block);
/*
* Read-ahead the next leaf block, if any.
*/
- nextbno = be64_to_cpu(block->bb_rightsib);
+ nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
/*
* Check all the extents to make sure they are OK.
@@ -6294,13 +6362,17 @@ xfs_bmap_check_leaf_extents(
* conform with the first entry in this one.
*/
- ep = XFS_BTREE_REC_ADDR(xfs_bmbt, block, 1);
+ ep = XFS_BMBT_REC_ADDR(mp, block, 1);
if (i) {
- xfs_btree_check_rec(XFS_BTNUM_BMAP, &last, ep);
+ ASSERT(xfs_bmbt_disk_get_startoff(&last) +
+ xfs_bmbt_disk_get_blockcount(&last) <=
+ xfs_bmbt_disk_get_startoff(ep));
}
for (j = 1; j < num_recs; j++) {
- nextp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, j + 1);
- xfs_btree_check_rec(XFS_BTNUM_BMAP, ep, nextp);
+ nextp = XFS_BMBT_REC_ADDR(mp, block, j + 1);
+ ASSERT(xfs_bmbt_disk_get_startoff(ep) +
+ xfs_bmbt_disk_get_blockcount(ep) <=
+ xfs_bmbt_disk_get_startoff(nextp));
ep = nextp;
}
@@ -6326,7 +6398,7 @@ xfs_bmap_check_leaf_extents(
if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
XFS_BMAP_BTREE_REF)))
goto error_norelse;
- block = XFS_BUF_TO_BMBT_BLOCK(bp);
+ block = XFS_BUF_TO_BLOCK(bp);
}
if (bp_release) {
bp_release = 0;
@@ -6356,7 +6428,7 @@ xfs_bmap_count_blocks(
int whichfork, /* data or attr fork */
int *count) /* out: count of blocks */
{
- xfs_bmbt_block_t *block; /* current btree block */
+ struct xfs_btree_block *block; /* current btree block */
xfs_fsblock_t bno; /* block # of "block" */
xfs_ifork_t *ifp; /* fork structure */
int level; /* btree level, for checking */
@@ -6379,7 +6451,7 @@ xfs_bmap_count_blocks(
block = ifp->if_broot;
level = be16_to_cpu(block->bb_level);
ASSERT(level > 0);
- pp = XFS_BMAP_BROOT_PTR_ADDR(block, 1, ifp->if_broot_bytes);
+ pp = XFS_BMAP_BROOT_PTR_ADDR(mp, block, 1, ifp->if_broot_bytes);
bno = be64_to_cpu(*pp);
ASSERT(bno != NULLDFSBNO);
ASSERT(XFS_FSB_TO_AGNO(mp, bno) < mp->m_sb.sb_agcount);
@@ -6413,29 +6485,29 @@ xfs_bmap_count_tree(
__be64 *pp;
xfs_fsblock_t bno = blockno;
xfs_fsblock_t nextbno;
- xfs_bmbt_block_t *block, *nextblock;
+ struct xfs_btree_block *block, *nextblock;
int numrecs;
if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF)))
return error;
*count += 1;
- block = XFS_BUF_TO_BMBT_BLOCK(bp);
+ block = XFS_BUF_TO_BLOCK(bp);
if (--level) {
/* Not at node above leafs, count this level of nodes */
- nextbno = be64_to_cpu(block->bb_rightsib);
+ nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
while (nextbno != NULLFSBLOCK) {
if ((error = xfs_btree_read_bufl(mp, tp, nextbno,
0, &nbp, XFS_BMAP_BTREE_REF)))
return error;
*count += 1;
- nextblock = XFS_BUF_TO_BMBT_BLOCK(nbp);
- nextbno = be64_to_cpu(nextblock->bb_rightsib);
+ nextblock = XFS_BUF_TO_BLOCK(nbp);
+ nextbno = be64_to_cpu(nextblock->bb_u.l.bb_rightsib);
xfs_trans_brelse(tp, nbp);
}
/* Dive to the next level */
- pp = XFS_BTREE_PTR_ADDR(xfs_bmbt, block, 1, mp->m_bmap_dmxr[1]);
+ pp = XFS_BMBT_PTR_ADDR(mp, block, 1, mp->m_bmap_dmxr[1]);
bno = be64_to_cpu(*pp);
if (unlikely((error =
xfs_bmap_count_tree(mp, tp, ifp, bno, level, count)) < 0)) {
@@ -6448,9 +6520,9 @@ xfs_bmap_count_tree(
} else {
/* count all level 1 nodes and their leaves */
for (;;) {
- nextbno = be64_to_cpu(block->bb_rightsib);
+ nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
numrecs = be16_to_cpu(block->bb_numrecs);
- xfs_bmap_disk_count_leaves(0, block, numrecs, count);
+ xfs_bmap_disk_count_leaves(mp, block, numrecs, count);
xfs_trans_brelse(tp, bp);
if (nextbno == NULLFSBLOCK)
break;
@@ -6459,7 +6531,7 @@ xfs_bmap_count_tree(
XFS_BMAP_BTREE_REF)))
return error;
*count += 1;
- block = XFS_BUF_TO_BMBT_BLOCK(bp);
+ block = XFS_BUF_TO_BLOCK(bp);
}
}
return 0;
@@ -6489,8 +6561,8 @@ xfs_bmap_count_leaves(
*/
STATIC void
xfs_bmap_disk_count_leaves(
- xfs_extnum_t idx,
- xfs_bmbt_block_t *block,
+ struct xfs_mount *mp,
+ struct xfs_btree_block *block,
int numrecs,
int *count)
{
@@ -6498,7 +6570,7 @@ xfs_bmap_disk_count_leaves(
xfs_bmbt_rec_t *frp;
for (b = 1; b <= numrecs; b++) {
- frp = XFS_BTREE_REC_ADDR(xfs_bmbt, block, idx + b);
+ frp = XFS_BMBT_REC_ADDR(mp, block, b);
*count += xfs_bmbt_disk_get_blockcount(frp);
}
}
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 9f3e3a836d15..284571c05ed0 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -137,9 +137,7 @@ typedef struct xfs_bmalloca {
char conv; /* overwriting unwritten extents */
} xfs_bmalloca_t;
-#ifdef __KERNEL__
-
-#if defined(XFS_BMAP_TRACE)
+#if defined(__KERNEL__) && defined(XFS_BMAP_TRACE)
/*
* Trace operations for bmap extent tracing
*/
@@ -163,9 +161,12 @@ xfs_bmap_trace_exlist(
int whichfork); /* data or attr fork */
#define XFS_BMAP_TRACE_EXLIST(ip,c,w) \
xfs_bmap_trace_exlist(__func__,ip,c,w)
-#else
+
+#else /* __KERNEL__ && XFS_BMAP_TRACE */
+
#define XFS_BMAP_TRACE_EXLIST(ip,c,w)
-#endif
+
+#endif /* __KERNEL__ && XFS_BMAP_TRACE */
/*
* Convert inode from non-attributed to attributed.
@@ -206,20 +207,6 @@ xfs_bmap_compute_maxlevels(
int whichfork); /* data or attr fork */
/*
- * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
- * caller. Frees all the extents that need freeing, which must be done
- * last due to locking considerations.
- *
- * Return 1 if the given transaction was committed and a new one allocated,
- * and 0 otherwise.
- */
-int /* error */
-xfs_bmap_finish(
- struct xfs_trans **tp, /* transaction pointer addr */
- xfs_bmap_free_t *flist, /* i/o: list extents to free */
- int *committed); /* xact committed or not */
-
-/*
* Returns the file-relative block number of the first unused block in the file.
* This is the lowest-address hole if the file has holes, else the first block
* past the end of file.
@@ -344,14 +331,43 @@ xfs_bunmapi(
int *done); /* set if not done yet */
/*
- * Fcntl interface to xfs_bmapi.
+ * Check an extent list, which has just been read, for
+ * any bit in the extent flag field.
+ */
+int
+xfs_check_nostate_extents(
+ struct xfs_ifork *ifp,
+ xfs_extnum_t idx,
+ xfs_extnum_t num);
+
+#ifdef __KERNEL__
+
+/*
+ * Routine to be called at transaction's end by xfs_bmapi, xfs_bunmapi
+ * caller. Frees all the extents that need freeing, which must be done
+ * last due to locking considerations.
+ *
+ * Return 1 if the given transaction was committed and a new one allocated,
+ * and 0 otherwise.
+ */
+int /* error */
+xfs_bmap_finish(
+ struct xfs_trans **tp, /* transaction pointer addr */
+ xfs_bmap_free_t *flist, /* i/o: list extents to free */
+ int *committed); /* xact committed or not */
+
+/* bmap to userspace formatter - copy to user & advance pointer */
+typedef int (*xfs_bmap_format_t)(void **, struct getbmapx *, int *);
+
+/*
+ * Get inode's extents as described in bmv, and format for output.
*/
int /* error code */
xfs_getbmap(
xfs_inode_t *ip,
- struct getbmap *bmv, /* user bmap structure */
- void __user *ap, /* pointer to user's array */
- int iflags); /* interface flags */
+ struct getbmapx *bmv, /* user bmap structure */
+ xfs_bmap_format_t formatter, /* format to user */
+ void *arg); /* formatter arg */
/*
* Check if the endoff is outside the last extent. If so the caller will grow
@@ -375,16 +391,6 @@ xfs_bmap_count_blocks(
int *count);
/*
- * Check an extent list, which has just been read, for
- * any bit in the extent flag field.
- */
-int
-xfs_check_nostate_extents(
- struct xfs_ifork *ifp,
- xfs_extnum_t idx,
- xfs_extnum_t num);
-
-/*
* Search the extent records for the entry containing block bno.
* If bno lies in a hole, point to the next entry. If bno lies
* past eof, *eofp will be set, and *prevp will contain the last
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 23efad29a5cd..8f1ec73725d3 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -37,1406 +37,13 @@
#include "xfs_inode_item.h"
#include "xfs_alloc.h"
#include "xfs_btree.h"
+#include "xfs_btree_trace.h"
#include "xfs_ialloc.h"
#include "xfs_itable.h"
#include "xfs_bmap.h"
#include "xfs_error.h"
#include "xfs_quota.h"
-#if defined(XFS_BMBT_TRACE)
-ktrace_t *xfs_bmbt_trace_buf;
-#endif
-
-/*
- * Prototypes for internal btree functions.
- */
-
-
-STATIC int xfs_bmbt_killroot(xfs_btree_cur_t *);
-STATIC void xfs_bmbt_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int);
-STATIC void xfs_bmbt_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
-STATIC int xfs_bmbt_lshift(xfs_btree_cur_t *, int, int *);
-STATIC int xfs_bmbt_rshift(xfs_btree_cur_t *, int, int *);
-STATIC int xfs_bmbt_split(xfs_btree_cur_t *, int, xfs_fsblock_t *,
- __uint64_t *, xfs_btree_cur_t **, int *);
-STATIC int xfs_bmbt_updkey(xfs_btree_cur_t *, xfs_bmbt_key_t *, int);
-
-
-#if defined(XFS_BMBT_TRACE)
-
-static char ARGS[] = "args";
-static char ENTRY[] = "entry";
-static char ERROR[] = "error";
-#undef EXIT
-static char EXIT[] = "exit";
-
-/*
- * Add a trace buffer entry for the arguments given to the routine,
- * generic form.
- */
-STATIC void
-xfs_bmbt_trace_enter(
- const char *func,
- xfs_btree_cur_t *cur,
- char *s,
- int type,
- int line,
- __psunsigned_t a0,
- __psunsigned_t a1,
- __psunsigned_t a2,
- __psunsigned_t a3,
- __psunsigned_t a4,
- __psunsigned_t a5,
- __psunsigned_t a6,
- __psunsigned_t a7,
- __psunsigned_t a8,
- __psunsigned_t a9,
- __psunsigned_t a10)
-{
- xfs_inode_t *ip;
- int whichfork;
-
- ip = cur->bc_private.b.ip;
- whichfork = cur->bc_private.b.whichfork;
- ktrace_enter(xfs_bmbt_trace_buf,
- (void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
- (void *)func, (void *)s, (void *)ip, (void *)cur,
- (void *)a0, (void *)a1, (void *)a2, (void *)a3,
- (void *)a4, (void *)a5, (void *)a6, (void *)a7,
- (void *)a8, (void *)a9, (void *)a10);
- ASSERT(ip->i_btrace);
- ktrace_enter(ip->i_btrace,
- (void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
- (void *)func, (void *)s, (void *)ip, (void *)cur,
- (void *)a0, (void *)a1, (void *)a2, (void *)a3,
- (void *)a4, (void *)a5, (void *)a6, (void *)a7,
- (void *)a8, (void *)a9, (void *)a10);
-}
-/*
- * Add a trace buffer entry for arguments, for a buffer & 1 integer arg.
- */
-STATIC void
-xfs_bmbt_trace_argbi(
- const char *func,
- xfs_btree_cur_t *cur,
- xfs_buf_t *b,
- int i,
- int line)
-{
- xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGBI, line,
- (__psunsigned_t)b, i, 0, 0,
- 0, 0, 0, 0,
- 0, 0, 0);
-}
-
-/*
- * Add a trace buffer entry for arguments, for a buffer & 2 integer args.
- */
-STATIC void
-xfs_bmbt_trace_argbii(
- const char *func,
- xfs_btree_cur_t *cur,
- xfs_buf_t *b,
- int i0,
- int i1,
- int line)
-{
- xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGBII, line,
- (__psunsigned_t)b, i0, i1, 0,
- 0, 0, 0, 0,
- 0, 0, 0);
-}
-
-/*
- * Add a trace buffer entry for arguments, for 3 block-length args
- * and an integer arg.
- */
-STATIC void
-xfs_bmbt_trace_argfffi(
- const char *func,
- xfs_btree_cur_t *cur,
- xfs_dfiloff_t o,
- xfs_dfsbno_t b,
- xfs_dfilblks_t i,
- int j,
- int line)
-{
- xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGFFFI, line,
- o >> 32, (int)o, b >> 32, (int)b,
- i >> 32, (int)i, (int)j, 0,
- 0, 0, 0);
-}
-
-/*
- * Add a trace buffer entry for arguments, for one integer arg.
- */
-STATIC void
-xfs_bmbt_trace_argi(
- const char *func,
- xfs_btree_cur_t *cur,
- int i,
- int line)
-{
- xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGI, line,
- i, 0, 0, 0,
- 0, 0, 0, 0,
- 0, 0, 0);
-}
-
-/*
- * Add a trace buffer entry for arguments, for int, fsblock, key.
- */
-STATIC void
-xfs_bmbt_trace_argifk(
- const char *func,
- xfs_btree_cur_t *cur,
- int i,
- xfs_fsblock_t f,
- xfs_dfiloff_t o,
- int line)
-{
- xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line,
- i, (xfs_dfsbno_t)f >> 32, (int)f, o >> 32,
- (int)o, 0, 0, 0,
- 0, 0, 0);
-}
-
-/*
- * Add a trace buffer entry for arguments, for int, fsblock, rec.
- */
-STATIC void
-xfs_bmbt_trace_argifr(
- const char *func,
- xfs_btree_cur_t *cur,
- int i,
- xfs_fsblock_t f,
- xfs_bmbt_rec_t *r,
- int line)
-{
- xfs_dfsbno_t b;
- xfs_dfilblks_t c;
- xfs_dfsbno_t d;
- xfs_dfiloff_t o;
- xfs_bmbt_irec_t s;
-
- d = (xfs_dfsbno_t)f;
- xfs_bmbt_disk_get_all(r, &s);
- o = (xfs_dfiloff_t)s.br_startoff;
- b = (xfs_dfsbno_t)s.br_startblock;
- c = s.br_blockcount;
- xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFR, line,
- i, d >> 32, (int)d, o >> 32,
- (int)o, b >> 32, (int)b, c >> 32,
- (int)c, 0, 0);
-}
-
-/*
- * Add a trace buffer entry for arguments, for int, key.
- */
-STATIC void
-xfs_bmbt_trace_argik(
- const char *func,
- xfs_btree_cur_t *cur,
- int i,
- xfs_bmbt_key_t *k,
- int line)
-{
- xfs_dfiloff_t o;
-
- o = be64_to_cpu(k->br_startoff);
- xfs_bmbt_trace_enter(func, cur, ARGS, XFS_BMBT_KTRACE_ARGIFK, line,
- i, o >> 32, (int)o, 0,
- 0, 0, 0, 0,
- 0, 0, 0);
-}
-
-/*
- * Add a trace buffer entry for the cursor/operation.
- */
-STATIC void
-xfs_bmbt_trace_cursor(
- const char *func,
- xfs_btree_cur_t *cur,
- char *s,
- int line)
-{
- xfs_bmbt_rec_host_t r;
-
- xfs_bmbt_set_all(&r, &cur->bc_rec.b);
- xfs_bmbt_trace_enter(func, cur, s, XFS_BMBT_KTRACE_CUR, line,
- (cur->bc_nlevels << 24) | (cur->bc_private.b.flags << 16) |
- cur->bc_private.b.allocated,
- r.l0 >> 32, (int)r.l0,
- r.l1 >> 32, (int)r.l1,
- (unsigned long)cur->bc_bufs[0], (unsigned long)cur->bc_bufs[1],
- (unsigned long)cur->bc_bufs[2], (unsigned long)cur->bc_bufs[3],
- (cur->bc_ptrs[0] << 16) | cur->bc_ptrs[1],
- (cur->bc_ptrs[2] << 16) | cur->bc_ptrs[3]);
-}
-
-#define XFS_BMBT_TRACE_ARGBI(c,b,i) \
- xfs_bmbt_trace_argbi(__func__, c, b, i, __LINE__)
-#define XFS_BMBT_TRACE_ARGBII(c,b,i,j) \
- xfs_bmbt_trace_argbii(__func__, c, b, i, j, __LINE__)
-#define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j) \
- xfs_bmbt_trace_argfffi(__func__, c, o, b, i, j, __LINE__)
-#define XFS_BMBT_TRACE_ARGI(c,i) \
- xfs_bmbt_trace_argi(__func__, c, i, __LINE__)
-#define XFS_BMBT_TRACE_ARGIFK(c,i,f,s) \
- xfs_bmbt_trace_argifk(__func__, c, i, f, s, __LINE__)
-#define XFS_BMBT_TRACE_ARGIFR(c,i,f,r) \
- xfs_bmbt_trace_argifr(__func__, c, i, f, r, __LINE__)
-#define XFS_BMBT_TRACE_ARGIK(c,i,k) \
- xfs_bmbt_trace_argik(__func__, c, i, k, __LINE__)
-#define XFS_BMBT_TRACE_CURSOR(c,s) \
- xfs_bmbt_trace_cursor(__func__, c, s, __LINE__)
-#else
-#define XFS_BMBT_TRACE_ARGBI(c,b,i)
-#define XFS_BMBT_TRACE_ARGBII(c,b,i,j)
-#define XFS_BMBT_TRACE_ARGFFFI(c,o,b,i,j)
-#define XFS_BMBT_TRACE_ARGI(c,i)
-#define XFS_BMBT_TRACE_ARGIFK(c,i,f,s)
-#define XFS_BMBT_TRACE_ARGIFR(c,i,f,r)
-#define XFS_BMBT_TRACE_ARGIK(c,i,k)
-#define XFS_BMBT_TRACE_CURSOR(c,s)
-#endif /* XFS_BMBT_TRACE */
-
-
-/*
- * Internal functions.
- */
-
-/*
- * Delete record pointed to by cur/level.
- */
-STATIC int /* error */
-xfs_bmbt_delrec(
- xfs_btree_cur_t *cur,
- int level,
- int *stat) /* success/failure */
-{
- xfs_bmbt_block_t *block; /* bmap btree block */
- xfs_fsblock_t bno; /* fs-relative block number */
- xfs_buf_t *bp; /* buffer for block */
- int error; /* error return value */
- int i; /* loop counter */
- int j; /* temp state */
- xfs_bmbt_key_t key; /* bmap btree key */
- xfs_bmbt_key_t *kp=NULL; /* pointer to bmap btree key */
- xfs_fsblock_t lbno; /* left sibling block number */
- xfs_buf_t *lbp; /* left buffer pointer */
- xfs_bmbt_block_t *left; /* left btree block */
- xfs_bmbt_key_t *lkp; /* left btree key */
- xfs_bmbt_ptr_t *lpp; /* left address pointer */
- int lrecs=0; /* left record count */
- xfs_bmbt_rec_t *lrp; /* left record pointer */
- xfs_mount_t *mp; /* file system mount point */
- xfs_bmbt_ptr_t *pp; /* pointer to bmap block addr */
- int ptr; /* key/record index */
- xfs_fsblock_t rbno; /* right sibling block number */
- xfs_buf_t *rbp; /* right buffer pointer */
- xfs_bmbt_block_t *right; /* right btree block */
- xfs_bmbt_key_t *rkp; /* right btree key */
- xfs_bmbt_rec_t *rp; /* pointer to bmap btree rec */
- xfs_bmbt_ptr_t *rpp; /* right address pointer */
- xfs_bmbt_block_t *rrblock; /* right-right btree block */
- xfs_buf_t *rrbp; /* right-right buffer pointer */
- int rrecs=0; /* right record count */
- xfs_bmbt_rec_t *rrp; /* right record pointer */
- xfs_btree_cur_t *tcur; /* temporary btree cursor */
- int numrecs; /* temporary numrec count */
- int numlrecs, numrrecs;
-
- XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
- XFS_BMBT_TRACE_ARGI(cur, level);
- ptr = cur->bc_ptrs[level];
- tcur = NULL;
- if (ptr == 0) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 0;
- return 0;
- }
- block = xfs_bmbt_get_block(cur, level, &bp);
- numrecs = be16_to_cpu(block->bb_numrecs);
-#ifdef DEBUG
- if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
-#endif
- if (ptr > numrecs) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 0;
- return 0;
- }
- XFS_STATS_INC(xs_bmbt_delrec);
- if (level > 0) {
- kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
- pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
-#ifdef DEBUG
- for (i = ptr; i < numrecs; i++) {
- if ((error = xfs_btree_check_lptr_disk(cur, pp[i], level))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- }
-#endif
- if (ptr < numrecs) {
- memmove(&kp[ptr - 1], &kp[ptr],
- (numrecs - ptr) * sizeof(*kp));
- memmove(&pp[ptr - 1], &pp[ptr],
- (numrecs - ptr) * sizeof(*pp));
- xfs_bmbt_log_ptrs(cur, bp, ptr, numrecs - 1);
- xfs_bmbt_log_keys(cur, bp, ptr, numrecs - 1);
- }
- } else {
- rp = XFS_BMAP_REC_IADDR(block, 1, cur);
- if (ptr < numrecs) {
- memmove(&rp[ptr - 1], &rp[ptr],
- (numrecs - ptr) * sizeof(*rp));
- xfs_bmbt_log_recs(cur, bp, ptr, numrecs - 1);
- }
- if (ptr == 1) {
- key.br_startoff =
- cpu_to_be64(xfs_bmbt_disk_get_startoff(rp));
- kp = &key;
- }
- }
- numrecs--;
- block->bb_numrecs = cpu_to_be16(numrecs);
- xfs_bmbt_log_block(cur, bp, XFS_BB_NUMRECS);
- /*
- * We're at the root level.
- * First, shrink the root block in-memory.
- * Try to get rid of the next level down.
- * If we can't then there's nothing left to do.
- */
- if (level == cur->bc_nlevels - 1) {
- xfs_iroot_realloc(cur->bc_private.b.ip, -1,
- cur->bc_private.b.whichfork);
- if ((error = xfs_bmbt_killroot(cur))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &j))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 1;
- return 0;
- }
- if (ptr == 1 && (error = xfs_bmbt_updkey(cur, kp, level + 1))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- if (numrecs >= XFS_BMAP_BLOCK_IMINRECS(level, cur)) {
- if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &j))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 1;
- return 0;
- }
- rbno = be64_to_cpu(block->bb_rightsib);
- lbno = be64_to_cpu(block->bb_leftsib);
- /*
- * One child of root, need to get a chance to copy its contents
- * into the root and delete it. Can't go up to next level,
- * there's nothing to delete there.
- */
- if (lbno == NULLFSBLOCK && rbno == NULLFSBLOCK &&
- level == cur->bc_nlevels - 2) {
- if ((error = xfs_bmbt_killroot(cur))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &i))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 1;
- return 0;
- }
- ASSERT(rbno != NULLFSBLOCK || lbno != NULLFSBLOCK);
- if ((error = xfs_btree_dup_cursor(cur, &tcur))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- bno = NULLFSBLOCK;
- if (rbno != NULLFSBLOCK) {
- i = xfs_btree_lastrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if ((error = xfs_bmbt_increment(tcur, level, &i))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- i = xfs_btree_lastrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- rbp = tcur->bc_bufs[level];
- right = XFS_BUF_TO_BMBT_BLOCK(rbp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
-#endif
- bno = be64_to_cpu(right->bb_leftsib);
- if (be16_to_cpu(right->bb_numrecs) - 1 >=
- XFS_BMAP_BLOCK_IMINRECS(level, cur)) {
- if ((error = xfs_bmbt_lshift(tcur, level, &i))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- if (i) {
- ASSERT(be16_to_cpu(block->bb_numrecs) >=
- XFS_BMAP_BLOCK_IMINRECS(level, tcur));
- xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
- tcur = NULL;
- if (level > 0) {
- if ((error = xfs_bmbt_decrement(cur,
- level, &i))) {
- XFS_BMBT_TRACE_CURSOR(cur,
- ERROR);
- goto error0;
- }
- }
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 1;
- return 0;
- }
- }
- rrecs = be16_to_cpu(right->bb_numrecs);
- if (lbno != NULLFSBLOCK) {
- i = xfs_btree_firstrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if ((error = xfs_bmbt_decrement(tcur, level, &i))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- }
- }
- if (lbno != NULLFSBLOCK) {
- i = xfs_btree_firstrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- /*
- * decrement to last in block
- */
- if ((error = xfs_bmbt_decrement(tcur, level, &i))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- i = xfs_btree_firstrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- lbp = tcur->bc_bufs[level];
- left = XFS_BUF_TO_BMBT_BLOCK(lbp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
-#endif
- bno = be64_to_cpu(left->bb_rightsib);
- if (be16_to_cpu(left->bb_numrecs) - 1 >=
- XFS_BMAP_BLOCK_IMINRECS(level, cur)) {
- if ((error = xfs_bmbt_rshift(tcur, level, &i))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- if (i) {
- ASSERT(be16_to_cpu(block->bb_numrecs) >=
- XFS_BMAP_BLOCK_IMINRECS(level, tcur));
- xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
- tcur = NULL;
- if (level == 0)
- cur->bc_ptrs[0]++;
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 1;
- return 0;
- }
- }
- lrecs = be16_to_cpu(left->bb_numrecs);
- }
- xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
- tcur = NULL;
- mp = cur->bc_mp;
- ASSERT(bno != NULLFSBLOCK);
- if (lbno != NULLFSBLOCK &&
- lrecs + be16_to_cpu(block->bb_numrecs) <= XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
- rbno = bno;
- right = block;
- rbp = bp;
- if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, lbno, 0, &lbp,
- XFS_BMAP_BTREE_REF))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- left = XFS_BUF_TO_BMBT_BLOCK(lbp);
- if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- } else if (rbno != NULLFSBLOCK &&
- rrecs + be16_to_cpu(block->bb_numrecs) <=
- XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
- lbno = bno;
- left = block;
- lbp = bp;
- if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, rbno, 0, &rbp,
- XFS_BMAP_BTREE_REF))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- right = XFS_BUF_TO_BMBT_BLOCK(rbp);
- if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- lrecs = be16_to_cpu(left->bb_numrecs);
- } else {
- if (level > 0 && (error = xfs_bmbt_decrement(cur, level, &i))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 1;
- return 0;
- }
- numlrecs = be16_to_cpu(left->bb_numrecs);
- numrrecs = be16_to_cpu(right->bb_numrecs);
- if (level > 0) {
- lkp = XFS_BMAP_KEY_IADDR(left, numlrecs + 1, cur);
- lpp = XFS_BMAP_PTR_IADDR(left, numlrecs + 1, cur);
- rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
- rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
-#ifdef DEBUG
- for (i = 0; i < numrrecs; i++) {
- if ((error = xfs_btree_check_lptr_disk(cur, rpp[i], level))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- }
-#endif
- memcpy(lkp, rkp, numrrecs * sizeof(*lkp));
- memcpy(lpp, rpp, numrrecs * sizeof(*lpp));
- xfs_bmbt_log_keys(cur, lbp, numlrecs + 1, numlrecs + numrrecs);
- xfs_bmbt_log_ptrs(cur, lbp, numlrecs + 1, numlrecs + numrrecs);
- } else {
- lrp = XFS_BMAP_REC_IADDR(left, numlrecs + 1, cur);
- rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
- memcpy(lrp, rrp, numrrecs * sizeof(*lrp));
- xfs_bmbt_log_recs(cur, lbp, numlrecs + 1, numlrecs + numrrecs);
- }
- be16_add_cpu(&left->bb_numrecs, numrrecs);
- left->bb_rightsib = right->bb_rightsib;
- xfs_bmbt_log_block(cur, lbp, XFS_BB_RIGHTSIB | XFS_BB_NUMRECS);
- if (be64_to_cpu(left->bb_rightsib) != NULLDFSBNO) {
- if ((error = xfs_btree_read_bufl(mp, cur->bc_tp,
- be64_to_cpu(left->bb_rightsib),
- 0, &rrbp, XFS_BMAP_BTREE_REF))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- rrblock = XFS_BUF_TO_BMBT_BLOCK(rrbp);
- if ((error = xfs_btree_check_lblock(cur, rrblock, level, rrbp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- rrblock->bb_leftsib = cpu_to_be64(lbno);
- xfs_bmbt_log_block(cur, rrbp, XFS_BB_LEFTSIB);
- }
- xfs_bmap_add_free(XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(rbp)), 1,
- cur->bc_private.b.flist, mp);
- cur->bc_private.b.ip->i_d.di_nblocks--;
- xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
- XFS_TRANS_MOD_DQUOT_BYINO(mp, cur->bc_tp, cur->bc_private.b.ip,
- XFS_TRANS_DQ_BCOUNT, -1L);
- xfs_trans_binval(cur->bc_tp, rbp);
- if (bp != lbp) {
- cur->bc_bufs[level] = lbp;
- cur->bc_ptrs[level] += lrecs;
- cur->bc_ra[level] = 0;
- } else if ((error = xfs_bmbt_increment(cur, level + 1, &i))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- goto error0;
- }
- if (level > 0)
- cur->bc_ptrs[level]--;
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 2;
- return 0;
-
-error0:
- if (tcur)
- xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
- return error;
-}
-
-/*
- * Insert one record/level. Return information to the caller
- * allowing the next level up to proceed if necessary.
- */
-STATIC int /* error */
-xfs_bmbt_insrec(
- xfs_btree_cur_t *cur,
- int level,
- xfs_fsblock_t *bnop,
- xfs_bmbt_rec_t *recp,
- xfs_btree_cur_t **curp,
- int *stat) /* no-go/done/continue */
-{
- xfs_bmbt_block_t *block; /* bmap btree block */
- xfs_buf_t *bp; /* buffer for block */
- int error; /* error return value */
- int i; /* loop index */
- xfs_bmbt_key_t key; /* bmap btree key */
- xfs_bmbt_key_t *kp=NULL; /* pointer to bmap btree key */
- int logflags; /* inode logging flags */
- xfs_fsblock_t nbno; /* new block number */
- struct xfs_btree_cur *ncur; /* new btree cursor */
- __uint64_t startoff; /* new btree key value */
- xfs_bmbt_rec_t nrec; /* new record count */
- int optr; /* old key/record index */
- xfs_bmbt_ptr_t *pp; /* pointer to bmap block addr */
- int ptr; /* key/record index */
- xfs_bmbt_rec_t *rp=NULL; /* pointer to bmap btree rec */
- int numrecs;
-
- ASSERT(level < cur->bc_nlevels);
- XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
- XFS_BMBT_TRACE_ARGIFR(cur, level, *bnop, recp);
- ncur = NULL;
- key.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(recp));
- optr = ptr = cur->bc_ptrs[level];
- if (ptr == 0) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 0;
- return 0;
- }
- XFS_STATS_INC(xs_bmbt_insrec);
- block = xfs_bmbt_get_block(cur, level, &bp);
- numrecs = be16_to_cpu(block->bb_numrecs);
-#ifdef DEBUG
- if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- if (ptr <= numrecs) {
- if (level == 0) {
- rp = XFS_BMAP_REC_IADDR(block, ptr, cur);
- xfs_btree_check_rec(XFS_BTNUM_BMAP, recp, rp);
- } else {
- kp = XFS_BMAP_KEY_IADDR(block, ptr, cur);
- xfs_btree_check_key(XFS_BTNUM_BMAP, &key, kp);
- }
- }
-#endif
- nbno = NULLFSBLOCK;
- if (numrecs == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
- if (numrecs < XFS_BMAP_BLOCK_DMAXRECS(level, cur)) {
- /*
- * A root block, that can be made bigger.
- */
- xfs_iroot_realloc(cur->bc_private.b.ip, 1,
- cur->bc_private.b.whichfork);
- block = xfs_bmbt_get_block(cur, level, &bp);
- } else if (level == cur->bc_nlevels - 1) {
- if ((error = xfs_bmbt_newroot(cur, &logflags, stat)) ||
- *stat == 0) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
- logflags);
- block = xfs_bmbt_get_block(cur, level, &bp);
- } else {
- if ((error = xfs_bmbt_rshift(cur, level, &i))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- if (i) {
- /* nothing */
- } else {
- if ((error = xfs_bmbt_lshift(cur, level, &i))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- if (i) {
- optr = ptr = cur->bc_ptrs[level];
- } else {
- if ((error = xfs_bmbt_split(cur, level,
- &nbno, &startoff, &ncur,
- &i))) {
- XFS_BMBT_TRACE_CURSOR(cur,
- ERROR);
- return error;
- }
- if (i) {
- block = xfs_bmbt_get_block(
- cur, level, &bp);
-#ifdef DEBUG
- if ((error =
- xfs_btree_check_lblock(cur,
- block, level, bp))) {
- XFS_BMBT_TRACE_CURSOR(
- cur, ERROR);
- return error;
- }
-#endif
- ptr = cur->bc_ptrs[level];
- xfs_bmbt_disk_set_allf(&nrec,
- startoff, 0, 0,
- XFS_EXT_NORM);
- } else {
- XFS_BMBT_TRACE_CURSOR(cur,
- EXIT);
- *stat = 0;
- return 0;
- }
- }
- }
- }
- }
- numrecs = be16_to_cpu(block->bb_numrecs);
- if (level > 0) {
- kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
- pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
-#ifdef DEBUG
- for (i = numrecs; i >= ptr; i--) {
- if ((error = xfs_btree_check_lptr_disk(cur, pp[i - 1],
- level))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- }
-#endif
- memmove(&kp[ptr], &kp[ptr - 1],
- (numrecs - ptr + 1) * sizeof(*kp));
- memmove(&pp[ptr], &pp[ptr - 1],
- (numrecs - ptr + 1) * sizeof(*pp));
-#ifdef DEBUG
- if ((error = xfs_btree_check_lptr(cur, *bnop, level))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
-#endif
- kp[ptr - 1] = key;
- pp[ptr - 1] = cpu_to_be64(*bnop);
- numrecs++;
- block->bb_numrecs = cpu_to_be16(numrecs);
- xfs_bmbt_log_keys(cur, bp, ptr, numrecs);
- xfs_bmbt_log_ptrs(cur, bp, ptr, numrecs);
- } else {
- rp = XFS_BMAP_REC_IADDR(block, 1, cur);
- memmove(&rp[ptr], &rp[ptr - 1],
- (numrecs - ptr + 1) * sizeof(*rp));
- rp[ptr - 1] = *recp;
- numrecs++;
- block->bb_numrecs = cpu_to_be16(numrecs);
- xfs_bmbt_log_recs(cur, bp, ptr, numrecs);
- }
- xfs_bmbt_log_block(cur, bp, XFS_BB_NUMRECS);
-#ifdef DEBUG
- if (ptr < numrecs) {
- if (level == 0)
- xfs_btree_check_rec(XFS_BTNUM_BMAP, rp + ptr - 1,
- rp + ptr);
- else
- xfs_btree_check_key(XFS_BTNUM_BMAP, kp + ptr - 1,
- kp + ptr);
- }
-#endif
- if (optr == 1 && (error = xfs_bmbt_updkey(cur, &key, level + 1))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- *bnop = nbno;
- if (nbno != NULLFSBLOCK) {
- *recp = nrec;
- *curp = ncur;
- }
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 1;
- return 0;
-}
-
-STATIC int
-xfs_bmbt_killroot(
- xfs_btree_cur_t *cur)
-{
- xfs_bmbt_block_t *block;
- xfs_bmbt_block_t *cblock;
- xfs_buf_t *cbp;
- xfs_bmbt_key_t *ckp;
- xfs_bmbt_ptr_t *cpp;
-#ifdef DEBUG
- int error;
-#endif
- int i;
- xfs_bmbt_key_t *kp;
- xfs_inode_t *ip;
- xfs_ifork_t *ifp;
- int level;
- xfs_bmbt_ptr_t *pp;
-
- XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
- level = cur->bc_nlevels - 1;
- ASSERT(level >= 1);
- /*
- * Don't deal with the root block needs to be a leaf case.
- * We're just going to turn the thing back into extents anyway.
- */
- if (level == 1) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- return 0;
- }
- block = xfs_bmbt_get_block(cur, level, &cbp);
- /*
- * Give up if the root has multiple children.
- */
- if (be16_to_cpu(block->bb_numrecs) != 1) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- return 0;
- }
- /*
- * Only do this if the next level will fit.
- * Then the data must be copied up to the inode,
- * instead of freeing the root you free the next level.
- */
- cbp = cur->bc_bufs[level - 1];
- cblock = XFS_BUF_TO_BMBT_BLOCK(cbp);
- if (be16_to_cpu(cblock->bb_numrecs) > XFS_BMAP_BLOCK_DMAXRECS(level, cur)) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- return 0;
- }
- ASSERT(be64_to_cpu(cblock->bb_leftsib) == NULLDFSBNO);
- ASSERT(be64_to_cpu(cblock->bb_rightsib) == NULLDFSBNO);
- ip = cur->bc_private.b.ip;
- ifp = XFS_IFORK_PTR(ip, cur->bc_private.b.whichfork);
- ASSERT(XFS_BMAP_BLOCK_IMAXRECS(level, cur) ==
- XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes));
- i = (int)(be16_to_cpu(cblock->bb_numrecs) - XFS_BMAP_BLOCK_IMAXRECS(level, cur));
- if (i) {
- xfs_iroot_realloc(ip, i, cur->bc_private.b.whichfork);
- block = ifp->if_broot;
- }
- be16_add_cpu(&block->bb_numrecs, i);
- ASSERT(block->bb_numrecs == cblock->bb_numrecs);
- kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
- ckp = XFS_BMAP_KEY_IADDR(cblock, 1, cur);
- memcpy(kp, ckp, be16_to_cpu(block->bb_numrecs) * sizeof(*kp));
- pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
- cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur);
-#ifdef DEBUG
- for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
- if ((error = xfs_btree_check_lptr_disk(cur, cpp[i], level - 1))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- }
-#endif
- memcpy(pp, cpp, be16_to_cpu(block->bb_numrecs) * sizeof(*pp));
- xfs_bmap_add_free(XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(cbp)), 1,
- cur->bc_private.b.flist, cur->bc_mp);
- ip->i_d.di_nblocks--;
- XFS_TRANS_MOD_DQUOT_BYINO(cur->bc_mp, cur->bc_tp, ip,
- XFS_TRANS_DQ_BCOUNT, -1L);
- xfs_trans_binval(cur->bc_tp, cbp);
- cur->bc_bufs[level - 1] = NULL;
- be16_add_cpu(&block->bb_level, -1);
- xfs_trans_log_inode(cur->bc_tp, ip,
- XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
- cur->bc_nlevels--;
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- return 0;
-}
-
-/*
- * Log key values from the btree block.
- */
-STATIC void
-xfs_bmbt_log_keys(
- xfs_btree_cur_t *cur,
- xfs_buf_t *bp,
- int kfirst,
- int klast)
-{
- xfs_trans_t *tp;
-
- XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
- XFS_BMBT_TRACE_ARGBII(cur, bp, kfirst, klast);
- tp = cur->bc_tp;
- if (bp) {
- xfs_bmbt_block_t *block;
- int first;
- xfs_bmbt_key_t *kp;
- int last;
-
- block = XFS_BUF_TO_BMBT_BLOCK(bp);
- kp = XFS_BMAP_KEY_DADDR(block, 1, cur);
- first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block);
- last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block);
- xfs_trans_log_buf(tp, bp, first, last);
- } else {
- xfs_inode_t *ip;
-
- ip = cur->bc_private.b.ip;
- xfs_trans_log_inode(tp, ip,
- XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
- }
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-}
-
-/*
- * Log pointer values from the btree block.
- */
-STATIC void
-xfs_bmbt_log_ptrs(
- xfs_btree_cur_t *cur,
- xfs_buf_t *bp,
- int pfirst,
- int plast)
-{
- xfs_trans_t *tp;
-
- XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
- XFS_BMBT_TRACE_ARGBII(cur, bp, pfirst, plast);
- tp = cur->bc_tp;
- if (bp) {
- xfs_bmbt_block_t *block;
- int first;
- int last;
- xfs_bmbt_ptr_t *pp;
-
- block = XFS_BUF_TO_BMBT_BLOCK(bp);
- pp = XFS_BMAP_PTR_DADDR(block, 1, cur);
- first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block);
- last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block);
- xfs_trans_log_buf(tp, bp, first, last);
- } else {
- xfs_inode_t *ip;
-
- ip = cur->bc_private.b.ip;
- xfs_trans_log_inode(tp, ip,
- XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
- }
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-}
-
-/*
- * Lookup the record. The cursor is made to point to it, based on dir.
- */
-STATIC int /* error */
-xfs_bmbt_lookup(
- xfs_btree_cur_t *cur,
- xfs_lookup_t dir,
- int *stat) /* success/failure */
-{
- xfs_bmbt_block_t *block=NULL;
- xfs_buf_t *bp;
- xfs_daddr_t d;
- xfs_sfiloff_t diff;
- int error; /* error return value */
- xfs_fsblock_t fsbno=0;
- int high;
- int i;
- int keyno=0;
- xfs_bmbt_key_t *kkbase=NULL;
- xfs_bmbt_key_t *kkp;
- xfs_bmbt_rec_t *krbase=NULL;
- xfs_bmbt_rec_t *krp;
- int level;
- int low;
- xfs_mount_t *mp;
- xfs_bmbt_ptr_t *pp;
- xfs_bmbt_irec_t *rp;
- xfs_fileoff_t startoff;
- xfs_trans_t *tp;
-
- XFS_STATS_INC(xs_bmbt_lookup);
- XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
- XFS_BMBT_TRACE_ARGI(cur, (int)dir);
- tp = cur->bc_tp;
- mp = cur->bc_mp;
- rp = &cur->bc_rec.b;
- for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
- if (level < cur->bc_nlevels - 1) {
- d = XFS_FSB_TO_DADDR(mp, fsbno);
- bp = cur->bc_bufs[level];
- if (bp && XFS_BUF_ADDR(bp) != d)
- bp = NULL;
- if (!bp) {
- if ((error = xfs_btree_read_bufl(mp, tp, fsbno,
- 0, &bp, XFS_BMAP_BTREE_REF))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- xfs_btree_setbuf(cur, level, bp);
- block = XFS_BUF_TO_BMBT_BLOCK(bp);
- if ((error = xfs_btree_check_lblock(cur, block,
- level, bp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- } else
- block = XFS_BUF_TO_BMBT_BLOCK(bp);
- } else
- block = xfs_bmbt_get_block(cur, level, &bp);
- if (diff == 0)
- keyno = 1;
- else {
- if (level > 0)
- kkbase = XFS_BMAP_KEY_IADDR(block, 1, cur);
- else
- krbase = XFS_BMAP_REC_IADDR(block, 1, cur);
- low = 1;
- if (!(high = be16_to_cpu(block->bb_numrecs))) {
- ASSERT(level == 0);
- cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 0;
- return 0;
- }
- while (low <= high) {
- XFS_STATS_INC(xs_bmbt_compare);
- keyno = (low + high) >> 1;
- if (level > 0) {
- kkp = kkbase + keyno - 1;
- startoff = be64_to_cpu(kkp->br_startoff);
- } else {
- krp = krbase + keyno - 1;
- startoff = xfs_bmbt_disk_get_startoff(krp);
- }
- diff = (xfs_sfiloff_t)
- (startoff - rp->br_startoff);
- if (diff < 0)
- low = keyno + 1;
- else if (diff > 0)
- high = keyno - 1;
- else
- break;
- }
- }
- if (level > 0) {
- if (diff > 0 && --keyno < 1)
- keyno = 1;
- pp = XFS_BMAP_PTR_IADDR(block, keyno, cur);
- fsbno = be64_to_cpu(*pp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_lptr(cur, fsbno, level))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
-#endif
- cur->bc_ptrs[level] = keyno;
- }
- }
- if (dir != XFS_LOOKUP_LE && diff < 0) {
- keyno++;
- /*
- * If ge search and we went off the end of the block, but it's
- * not the last block, we're in the wrong block.
- */
- if (dir == XFS_LOOKUP_GE && keyno > be16_to_cpu(block->bb_numrecs) &&
- be64_to_cpu(block->bb_rightsib) != NULLDFSBNO) {
- cur->bc_ptrs[0] = keyno;
- if ((error = xfs_bmbt_increment(cur, 0, &i))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- XFS_WANT_CORRUPTED_RETURN(i == 1);
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 1;
- return 0;
- }
- }
- else if (dir == XFS_LOOKUP_LE && diff > 0)
- keyno--;
- cur->bc_ptrs[0] = keyno;
- if (keyno == 0 || keyno > be16_to_cpu(block->bb_numrecs)) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 0;
- } else {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0));
- }
- return 0;
-}
-
-/*
- * Move 1 record left from cur/level if possible.
- * Update cur to reflect the new path.
- */
-STATIC int /* error */
-xfs_bmbt_lshift(
- xfs_btree_cur_t *cur,
- int level,
- int *stat) /* success/failure */
-{
- int error; /* error return value */
-#ifdef DEBUG
- int i; /* loop counter */
-#endif
- xfs_bmbt_key_t key; /* bmap btree key */
- xfs_buf_t *lbp; /* left buffer pointer */
- xfs_bmbt_block_t *left; /* left btree block */
- xfs_bmbt_key_t *lkp=NULL; /* left btree key */
- xfs_bmbt_ptr_t *lpp; /* left address pointer */
- int lrecs; /* left record count */
- xfs_bmbt_rec_t *lrp=NULL; /* left record pointer */
- xfs_mount_t *mp; /* file system mount point */
- xfs_buf_t *rbp; /* right buffer pointer */
- xfs_bmbt_block_t *right; /* right btree block */
- xfs_bmbt_key_t *rkp=NULL; /* right btree key */
- xfs_bmbt_ptr_t *rpp=NULL; /* right address pointer */
- xfs_bmbt_rec_t *rrp=NULL; /* right record pointer */
- int rrecs; /* right record count */
-
- XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
- XFS_BMBT_TRACE_ARGI(cur, level);
- if (level == cur->bc_nlevels - 1) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 0;
- return 0;
- }
- rbp = cur->bc_bufs[level];
- right = XFS_BUF_TO_BMBT_BLOCK(rbp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
-#endif
- if (be64_to_cpu(right->bb_leftsib) == NULLDFSBNO) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 0;
- return 0;
- }
- if (cur->bc_ptrs[level] <= 1) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 0;
- return 0;
- }
- mp = cur->bc_mp;
- if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, be64_to_cpu(right->bb_leftsib), 0,
- &lbp, XFS_BMAP_BTREE_REF))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- left = XFS_BUF_TO_BMBT_BLOCK(lbp);
- if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- if (be16_to_cpu(left->bb_numrecs) == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 0;
- return 0;
- }
- lrecs = be16_to_cpu(left->bb_numrecs) + 1;
- if (level > 0) {
- lkp = XFS_BMAP_KEY_IADDR(left, lrecs, cur);
- rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
- *lkp = *rkp;
- xfs_bmbt_log_keys(cur, lbp, lrecs, lrecs);
- lpp = XFS_BMAP_PTR_IADDR(left, lrecs, cur);
- rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
-#ifdef DEBUG
- if ((error = xfs_btree_check_lptr_disk(cur, *rpp, level))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
-#endif
- *lpp = *rpp;
- xfs_bmbt_log_ptrs(cur, lbp, lrecs, lrecs);
- } else {
- lrp = XFS_BMAP_REC_IADDR(left, lrecs, cur);
- rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
- *lrp = *rrp;
- xfs_bmbt_log_recs(cur, lbp, lrecs, lrecs);
- }
- left->bb_numrecs = cpu_to_be16(lrecs);
- xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS);
-#ifdef DEBUG
- if (level > 0)
- xfs_btree_check_key(XFS_BTNUM_BMAP, lkp - 1, lkp);
- else
- xfs_btree_check_rec(XFS_BTNUM_BMAP, lrp - 1, lrp);
-#endif
- rrecs = be16_to_cpu(right->bb_numrecs) - 1;
- right->bb_numrecs = cpu_to_be16(rrecs);
- xfs_bmbt_log_block(cur, rbp, XFS_BB_NUMRECS);
- if (level > 0) {
-#ifdef DEBUG
- for (i = 0; i < rrecs; i++) {
- if ((error = xfs_btree_check_lptr_disk(cur, rpp[i + 1],
- level))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- }
-#endif
- memmove(rkp, rkp + 1, rrecs * sizeof(*rkp));
- memmove(rpp, rpp + 1, rrecs * sizeof(*rpp));
- xfs_bmbt_log_keys(cur, rbp, 1, rrecs);
- xfs_bmbt_log_ptrs(cur, rbp, 1, rrecs);
- } else {
- memmove(rrp, rrp + 1, rrecs * sizeof(*rrp));
- xfs_bmbt_log_recs(cur, rbp, 1, rrecs);
- key.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(rrp));
- rkp = &key;
- }
- if ((error = xfs_bmbt_updkey(cur, rkp, level + 1))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- cur->bc_ptrs[level]--;
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 1;
- return 0;
-}
-
-/*
- * Move 1 record right from cur/level if possible.
- * Update cur to reflect the new path.
- */
-STATIC int /* error */
-xfs_bmbt_rshift(
- xfs_btree_cur_t *cur,
- int level,
- int *stat) /* success/failure */
-{
- int error; /* error return value */
- int i; /* loop counter */
- xfs_bmbt_key_t key; /* bmap btree key */
- xfs_buf_t *lbp; /* left buffer pointer */
- xfs_bmbt_block_t *left; /* left btree block */
- xfs_bmbt_key_t *lkp; /* left btree key */
- xfs_bmbt_ptr_t *lpp; /* left address pointer */
- xfs_bmbt_rec_t *lrp; /* left record pointer */
- xfs_mount_t *mp; /* file system mount point */
- xfs_buf_t *rbp; /* right buffer pointer */
- xfs_bmbt_block_t *right; /* right btree block */
- xfs_bmbt_key_t *rkp; /* right btree key */
- xfs_bmbt_ptr_t *rpp; /* right address pointer */
- xfs_bmbt_rec_t *rrp=NULL; /* right record pointer */
- struct xfs_btree_cur *tcur; /* temporary btree cursor */
-
- XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
- XFS_BMBT_TRACE_ARGI(cur, level);
- if (level == cur->bc_nlevels - 1) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 0;
- return 0;
- }
- lbp = cur->bc_bufs[level];
- left = XFS_BUF_TO_BMBT_BLOCK(lbp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_lblock(cur, left, level, lbp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
-#endif
- if (be64_to_cpu(left->bb_rightsib) == NULLDFSBNO) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 0;
- return 0;
- }
- if (cur->bc_ptrs[level] >= be16_to_cpu(left->bb_numrecs)) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 0;
- return 0;
- }
- mp = cur->bc_mp;
- if ((error = xfs_btree_read_bufl(mp, cur->bc_tp, be64_to_cpu(left->bb_rightsib), 0,
- &rbp, XFS_BMAP_BTREE_REF))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- right = XFS_BUF_TO_BMBT_BLOCK(rbp);
- if ((error = xfs_btree_check_lblock(cur, right, level, rbp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- if (be16_to_cpu(right->bb_numrecs) == XFS_BMAP_BLOCK_IMAXRECS(level, cur)) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 0;
- return 0;
- }
- if (level > 0) {
- lkp = XFS_BMAP_KEY_IADDR(left, be16_to_cpu(left->bb_numrecs), cur);
- lpp = XFS_BMAP_PTR_IADDR(left, be16_to_cpu(left->bb_numrecs), cur);
- rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
- rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
-#ifdef DEBUG
- for (i = be16_to_cpu(right->bb_numrecs) - 1; i >= 0; i--) {
- if ((error = xfs_btree_check_lptr_disk(cur, rpp[i], level))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- }
-#endif
- memmove(rkp + 1, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
- memmove(rpp + 1, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
-#ifdef DEBUG
- if ((error = xfs_btree_check_lptr_disk(cur, *lpp, level))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
-#endif
- *rkp = *lkp;
- *rpp = *lpp;
- xfs_bmbt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
- xfs_bmbt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
- } else {
- lrp = XFS_BMAP_REC_IADDR(left, be16_to_cpu(left->bb_numrecs), cur);
- rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
- memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
- *rrp = *lrp;
- xfs_bmbt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
- key.br_startoff = cpu_to_be64(xfs_bmbt_disk_get_startoff(rrp));
- rkp = &key;
- }
- be16_add_cpu(&left->bb_numrecs, -1);
- xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS);
- be16_add_cpu(&right->bb_numrecs, 1);
-#ifdef DEBUG
- if (level > 0)
- xfs_btree_check_key(XFS_BTNUM_BMAP, rkp, rkp + 1);
- else
- xfs_btree_check_rec(XFS_BTNUM_BMAP, rrp, rrp + 1);
-#endif
- xfs_bmbt_log_block(cur, rbp, XFS_BB_NUMRECS);
- if ((error = xfs_btree_dup_cursor(cur, &tcur))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- i = xfs_btree_lastrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if ((error = xfs_bmbt_increment(tcur, level, &i))) {
- XFS_BMBT_TRACE_CURSOR(tcur, ERROR);
- goto error1;
- }
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if ((error = xfs_bmbt_updkey(tcur, rkp, level + 1))) {
- XFS_BMBT_TRACE_CURSOR(tcur, ERROR);
- goto error1;
- }
- xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 1;
- return 0;
-error0:
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
-error1:
- xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
- return error;
-}
-
/*
* Determine the extent state.
*/
@@ -1453,229 +60,15 @@ xfs_extent_state(
return XFS_EXT_NORM;
}
-
-/*
- * Split cur/level block in half.
- * Return new block number and its first record (to be inserted into parent).
- */
-STATIC int /* error */
-xfs_bmbt_split(
- xfs_btree_cur_t *cur,
- int level,
- xfs_fsblock_t *bnop,
- __uint64_t *startoff,
- xfs_btree_cur_t **curp,
- int *stat) /* success/failure */
-{
- xfs_alloc_arg_t args; /* block allocation args */
- int error; /* error return value */
- int i; /* loop counter */
- xfs_fsblock_t lbno; /* left sibling block number */
- xfs_buf_t *lbp; /* left buffer pointer */
- xfs_bmbt_block_t *left; /* left btree block */
- xfs_bmbt_key_t *lkp; /* left btree key */
- xfs_bmbt_ptr_t *lpp; /* left address pointer */
- xfs_bmbt_rec_t *lrp; /* left record pointer */
- xfs_buf_t *rbp; /* right buffer pointer */
- xfs_bmbt_block_t *right; /* right btree block */
- xfs_bmbt_key_t *rkp; /* right btree key */
- xfs_bmbt_ptr_t *rpp; /* right address pointer */
- xfs_bmbt_block_t *rrblock; /* right-right btree block */
- xfs_buf_t *rrbp; /* right-right buffer pointer */
- xfs_bmbt_rec_t *rrp; /* right record pointer */
-
- XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
- XFS_BMBT_TRACE_ARGIFK(cur, level, *bnop, *startoff);
- args.tp = cur->bc_tp;
- args.mp = cur->bc_mp;
- lbp = cur->bc_bufs[level];
- lbno = XFS_DADDR_TO_FSB(args.mp, XFS_BUF_ADDR(lbp));
- left = XFS_BUF_TO_BMBT_BLOCK(lbp);
- args.fsbno = cur->bc_private.b.firstblock;
- args.firstblock = args.fsbno;
- args.minleft = 0;
- if (args.fsbno == NULLFSBLOCK) {
- args.fsbno = lbno;
- args.type = XFS_ALLOCTYPE_START_BNO;
- /*
- * Make sure there is sufficient room left in the AG to
- * complete a full tree split for an extent insert. If
- * we are converting the middle part of an extent then
- * we may need space for two tree splits.
- *
- * We are relying on the caller to make the correct block
- * reservation for this operation to succeed. If the
- * reservation amount is insufficient then we may fail a
- * block allocation here and corrupt the filesystem.
- */
- args.minleft = xfs_trans_get_block_res(args.tp);
- } else if (cur->bc_private.b.flist->xbf_low)
- args.type = XFS_ALLOCTYPE_START_BNO;
- else
- args.type = XFS_ALLOCTYPE_NEAR_BNO;
- args.mod = args.alignment = args.total = args.isfl =
- args.userdata = args.minalignslop = 0;
- args.minlen = args.maxlen = args.prod = 1;
- args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
- if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return XFS_ERROR(ENOSPC);
- }
- if ((error = xfs_alloc_vextent(&args))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- if (args.fsbno == NULLFSBLOCK && args.minleft) {
- /*
- * Could not find an AG with enough free space to satisfy
- * a full btree split. Try again without minleft and if
- * successful activate the lowspace algorithm.
- */
- args.fsbno = 0;
- args.type = XFS_ALLOCTYPE_FIRST_AG;
- args.minleft = 0;
- if ((error = xfs_alloc_vextent(&args))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- cur->bc_private.b.flist->xbf_low = 1;
- }
- if (args.fsbno == NULLFSBLOCK) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 0;
- return 0;
- }
- ASSERT(args.len == 1);
- cur->bc_private.b.firstblock = args.fsbno;
- cur->bc_private.b.allocated++;
- cur->bc_private.b.ip->i_d.di_nblocks++;
- xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
- XFS_TRANS_MOD_DQUOT_BYINO(args.mp, args.tp, cur->bc_private.b.ip,
- XFS_TRANS_DQ_BCOUNT, 1L);
- rbp = xfs_btree_get_bufl(args.mp, args.tp, args.fsbno, 0);
- right = XFS_BUF_TO_BMBT_BLOCK(rbp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_lblock(cur, left, level, rbp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
-#endif
- right->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
- right->bb_level = left->bb_level;
- right->bb_numrecs = cpu_to_be16(be16_to_cpu(left->bb_numrecs) / 2);
- if ((be16_to_cpu(left->bb_numrecs) & 1) &&
- cur->bc_ptrs[level] <= be16_to_cpu(right->bb_numrecs) + 1)
- be16_add_cpu(&right->bb_numrecs, 1);
- i = be16_to_cpu(left->bb_numrecs) - be16_to_cpu(right->bb_numrecs) + 1;
- if (level > 0) {
- lkp = XFS_BMAP_KEY_IADDR(left, i, cur);
- lpp = XFS_BMAP_PTR_IADDR(left, i, cur);
- rkp = XFS_BMAP_KEY_IADDR(right, 1, cur);
- rpp = XFS_BMAP_PTR_IADDR(right, 1, cur);
-#ifdef DEBUG
- for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
- if ((error = xfs_btree_check_lptr_disk(cur, lpp[i], level))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- }
-#endif
- memcpy(rkp, lkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
- memcpy(rpp, lpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
- xfs_bmbt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
- xfs_bmbt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
- *startoff = be64_to_cpu(rkp->br_startoff);
- } else {
- lrp = XFS_BMAP_REC_IADDR(left, i, cur);
- rrp = XFS_BMAP_REC_IADDR(right, 1, cur);
- memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
- xfs_bmbt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
- *startoff = xfs_bmbt_disk_get_startoff(rrp);
- }
- be16_add_cpu(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs)));
- right->bb_rightsib = left->bb_rightsib;
- left->bb_rightsib = cpu_to_be64(args.fsbno);
- right->bb_leftsib = cpu_to_be64(lbno);
- xfs_bmbt_log_block(cur, rbp, XFS_BB_ALL_BITS);
- xfs_bmbt_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
- if (be64_to_cpu(right->bb_rightsib) != NULLDFSBNO) {
- if ((error = xfs_btree_read_bufl(args.mp, args.tp,
- be64_to_cpu(right->bb_rightsib), 0, &rrbp,
- XFS_BMAP_BTREE_REF))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- rrblock = XFS_BUF_TO_BMBT_BLOCK(rrbp);
- if ((error = xfs_btree_check_lblock(cur, rrblock, level, rrbp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- rrblock->bb_leftsib = cpu_to_be64(args.fsbno);
- xfs_bmbt_log_block(cur, rrbp, XFS_BB_LEFTSIB);
- }
- if (cur->bc_ptrs[level] > be16_to_cpu(left->bb_numrecs) + 1) {
- xfs_btree_setbuf(cur, level, rbp);
- cur->bc_ptrs[level] -= be16_to_cpu(left->bb_numrecs);
- }
- if (level + 1 < cur->bc_nlevels) {
- if ((error = xfs_btree_dup_cursor(cur, curp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- (*curp)->bc_ptrs[level + 1]++;
- }
- *bnop = args.fsbno;
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 1;
- return 0;
-}
-
-
-/*
- * Update keys for the record.
- */
-STATIC int
-xfs_bmbt_updkey(
- xfs_btree_cur_t *cur,
- xfs_bmbt_key_t *keyp, /* on-disk format */
- int level)
-{
- xfs_bmbt_block_t *block;
- xfs_buf_t *bp;
-#ifdef DEBUG
- int error;
-#endif
- xfs_bmbt_key_t *kp;
- int ptr;
-
- ASSERT(level >= 1);
- XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
- XFS_BMBT_TRACE_ARGIK(cur, level, keyp);
- for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
- block = xfs_bmbt_get_block(cur, level, &bp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
-#endif
- ptr = cur->bc_ptrs[level];
- kp = XFS_BMAP_KEY_IADDR(block, ptr, cur);
- *kp = *keyp;
- xfs_bmbt_log_keys(cur, bp, ptr, ptr);
- }
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- return 0;
-}
-
/*
* Convert on-disk form of btree root to in-memory form.
*/
void
xfs_bmdr_to_bmbt(
+ struct xfs_mount *mp,
xfs_bmdr_block_t *dblock,
int dblocklen,
- xfs_bmbt_block_t *rblock,
+ struct xfs_btree_block *rblock,
int rblocklen)
{
int dmxr;
@@ -1688,129 +81,19 @@ xfs_bmdr_to_bmbt(
rblock->bb_level = dblock->bb_level;
ASSERT(be16_to_cpu(rblock->bb_level) > 0);
rblock->bb_numrecs = dblock->bb_numrecs;
- rblock->bb_leftsib = cpu_to_be64(NULLDFSBNO);
- rblock->bb_rightsib = cpu_to_be64(NULLDFSBNO);
- dmxr = (int)XFS_BTREE_BLOCK_MAXRECS(dblocklen, xfs_bmdr, 0);
- fkp = XFS_BTREE_KEY_ADDR(xfs_bmdr, dblock, 1);
- tkp = XFS_BMAP_BROOT_KEY_ADDR(rblock, 1, rblocklen);
- fpp = XFS_BTREE_PTR_ADDR(xfs_bmdr, dblock, 1, dmxr);
- tpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen);
+ rblock->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
+ rblock->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
+ dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0);
+ fkp = XFS_BMDR_KEY_ADDR(dblock, 1);
+ tkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
+ fpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
+ tpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
dmxr = be16_to_cpu(dblock->bb_numrecs);
memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
}
/*
- * Decrement cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-int /* error */
-xfs_bmbt_decrement(
- xfs_btree_cur_t *cur,
- int level,
- int *stat) /* success/failure */
-{
- xfs_bmbt_block_t *block;
- xfs_buf_t *bp;
- int error; /* error return value */
- xfs_fsblock_t fsbno;
- int lev;
- xfs_mount_t *mp;
- xfs_trans_t *tp;
-
- XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
- XFS_BMBT_TRACE_ARGI(cur, level);
- ASSERT(level < cur->bc_nlevels);
- if (level < cur->bc_nlevels - 1)
- xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
- if (--cur->bc_ptrs[level] > 0) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 1;
- return 0;
- }
- block = xfs_bmbt_get_block(cur, level, &bp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
-#endif
- if (be64_to_cpu(block->bb_leftsib) == NULLDFSBNO) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 0;
- return 0;
- }
- for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
- if (--cur->bc_ptrs[lev] > 0)
- break;
- if (lev < cur->bc_nlevels - 1)
- xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
- }
- if (lev == cur->bc_nlevels) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 0;
- return 0;
- }
- tp = cur->bc_tp;
- mp = cur->bc_mp;
- for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) {
- fsbno = be64_to_cpu(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur));
- if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp,
- XFS_BMAP_BTREE_REF))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- lev--;
- xfs_btree_setbuf(cur, lev, bp);
- block = XFS_BUF_TO_BMBT_BLOCK(bp);
- if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- cur->bc_ptrs[lev] = be16_to_cpu(block->bb_numrecs);
- }
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 1;
- return 0;
-}
-
-/*
- * Delete the record pointed to by cur.
- */
-int /* error */
-xfs_bmbt_delete(
- xfs_btree_cur_t *cur,
- int *stat) /* success/failure */
-{
- int error; /* error return value */
- int i;
- int level;
-
- XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
- for (level = 0, i = 2; i == 2; level++) {
- if ((error = xfs_bmbt_delrec(cur, level, &i))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- }
- if (i == 0) {
- for (level = 1; level < cur->bc_nlevels; level++) {
- if (cur->bc_ptrs[level] == 0) {
- if ((error = xfs_bmbt_decrement(cur, level,
- &i))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- break;
- }
- }
- }
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = i;
- return 0;
-}
-
-/*
* Convert a compressed bmap extent record to an uncompressed form.
* This code must be in sync with the routines xfs_bmbt_get_startoff,
* xfs_bmbt_get_startblock, xfs_bmbt_get_blockcount and xfs_bmbt_get_state.
@@ -1864,31 +147,6 @@ xfs_bmbt_get_all(
}
/*
- * Get the block pointer for the given level of the cursor.
- * Fill in the buffer pointer, if applicable.
- */
-xfs_bmbt_block_t *
-xfs_bmbt_get_block(
- xfs_btree_cur_t *cur,
- int level,
- xfs_buf_t **bpp)
-{
- xfs_ifork_t *ifp;
- xfs_bmbt_block_t *rval;
-
- if (level < cur->bc_nlevels - 1) {
- *bpp = cur->bc_bufs[level];
- rval = XFS_BUF_TO_BMBT_BLOCK(*bpp);
- } else {
- *bpp = NULL;
- ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
- cur->bc_private.b.whichfork);
- rval = ifp->if_broot;
- }
- return rval;
-}
-
-/*
* Extract the blockcount field from an in memory bmap extent record.
*/
xfs_filblks_t
@@ -1950,7 +208,8 @@ xfs_bmbt_disk_get_all(
xfs_bmbt_rec_t *r,
xfs_bmbt_irec_t *s)
{
- __xfs_bmbt_get_all(be64_to_cpu(r->l0), be64_to_cpu(r->l1), s);
+ __xfs_bmbt_get_all(get_unaligned_be64(&r->l0),
+ get_unaligned_be64(&r->l1), s);
}
/*
@@ -1974,348 +233,6 @@ xfs_bmbt_disk_get_startoff(
XFS_MASK64LO(64 - BMBT_EXNTFLAG_BITLEN)) >> 9;
}
-/*
- * Increment cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-int /* error */
-xfs_bmbt_increment(
- xfs_btree_cur_t *cur,
- int level,
- int *stat) /* success/failure */
-{
- xfs_bmbt_block_t *block;
- xfs_buf_t *bp;
- int error; /* error return value */
- xfs_fsblock_t fsbno;
- int lev;
- xfs_mount_t *mp;
- xfs_trans_t *tp;
-
- XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
- XFS_BMBT_TRACE_ARGI(cur, level);
- ASSERT(level < cur->bc_nlevels);
- if (level < cur->bc_nlevels - 1)
- xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
- block = xfs_bmbt_get_block(cur, level, &bp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_lblock(cur, block, level, bp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
-#endif
- if (++cur->bc_ptrs[level] <= be16_to_cpu(block->bb_numrecs)) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 1;
- return 0;
- }
- if (be64_to_cpu(block->bb_rightsib) == NULLDFSBNO) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 0;
- return 0;
- }
- for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
- block = xfs_bmbt_get_block(cur, lev, &bp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
-#endif
- if (++cur->bc_ptrs[lev] <= be16_to_cpu(block->bb_numrecs))
- break;
- if (lev < cur->bc_nlevels - 1)
- xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
- }
- if (lev == cur->bc_nlevels) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 0;
- return 0;
- }
- tp = cur->bc_tp;
- mp = cur->bc_mp;
- for (block = xfs_bmbt_get_block(cur, lev, &bp); lev > level; ) {
- fsbno = be64_to_cpu(*XFS_BMAP_PTR_IADDR(block, cur->bc_ptrs[lev], cur));
- if ((error = xfs_btree_read_bufl(mp, tp, fsbno, 0, &bp,
- XFS_BMAP_BTREE_REF))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- lev--;
- xfs_btree_setbuf(cur, lev, bp);
- block = XFS_BUF_TO_BMBT_BLOCK(bp);
- if ((error = xfs_btree_check_lblock(cur, block, lev, bp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- cur->bc_ptrs[lev] = 1;
- }
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 1;
- return 0;
-}
-
-/*
- * Insert the current record at the point referenced by cur.
- *
- * A multi-level split of the tree on insert will invalidate the original
- * cursor. All callers of this function should assume that the cursor is
- * no longer valid and revalidate it.
- */
-int /* error */
-xfs_bmbt_insert(
- xfs_btree_cur_t *cur,
- int *stat) /* success/failure */
-{
- int error; /* error return value */
- int i;
- int level;
- xfs_fsblock_t nbno;
- xfs_btree_cur_t *ncur;
- xfs_bmbt_rec_t nrec;
- xfs_btree_cur_t *pcur;
-
- XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
- level = 0;
- nbno = NULLFSBLOCK;
- xfs_bmbt_disk_set_all(&nrec, &cur->bc_rec.b);
- ncur = NULL;
- pcur = cur;
- do {
- if ((error = xfs_bmbt_insrec(pcur, level++, &nbno, &nrec, &ncur,
- &i))) {
- if (pcur != cur)
- xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if (pcur != cur && (ncur || nbno == NULLFSBLOCK)) {
- cur->bc_nlevels = pcur->bc_nlevels;
- cur->bc_private.b.allocated +=
- pcur->bc_private.b.allocated;
- pcur->bc_private.b.allocated = 0;
- ASSERT((cur->bc_private.b.firstblock != NULLFSBLOCK) ||
- XFS_IS_REALTIME_INODE(cur->bc_private.b.ip));
- cur->bc_private.b.firstblock =
- pcur->bc_private.b.firstblock;
- ASSERT(cur->bc_private.b.flist ==
- pcur->bc_private.b.flist);
- xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
- }
- if (ncur) {
- pcur = ncur;
- ncur = NULL;
- }
- } while (nbno != NULLFSBLOCK);
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = i;
- return 0;
-error0:
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
-}
-
-/*
- * Log fields from the btree block header.
- */
-void
-xfs_bmbt_log_block(
- xfs_btree_cur_t *cur,
- xfs_buf_t *bp,
- int fields)
-{
- int first;
- int last;
- xfs_trans_t *tp;
- static const short offsets[] = {
- offsetof(xfs_bmbt_block_t, bb_magic),
- offsetof(xfs_bmbt_block_t, bb_level),
- offsetof(xfs_bmbt_block_t, bb_numrecs),
- offsetof(xfs_bmbt_block_t, bb_leftsib),
- offsetof(xfs_bmbt_block_t, bb_rightsib),
- sizeof(xfs_bmbt_block_t)
- };
-
- XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
- XFS_BMBT_TRACE_ARGBI(cur, bp, fields);
- tp = cur->bc_tp;
- if (bp) {
- xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first,
- &last);
- xfs_trans_log_buf(tp, bp, first, last);
- } else
- xfs_trans_log_inode(tp, cur->bc_private.b.ip,
- XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-}
-
-/*
- * Log record values from the btree block.
- */
-void
-xfs_bmbt_log_recs(
- xfs_btree_cur_t *cur,
- xfs_buf_t *bp,
- int rfirst,
- int rlast)
-{
- xfs_bmbt_block_t *block;
- int first;
- int last;
- xfs_bmbt_rec_t *rp;
- xfs_trans_t *tp;
-
- XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
- XFS_BMBT_TRACE_ARGBII(cur, bp, rfirst, rlast);
- ASSERT(bp);
- tp = cur->bc_tp;
- block = XFS_BUF_TO_BMBT_BLOCK(bp);
- rp = XFS_BMAP_REC_DADDR(block, 1, cur);
- first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block);
- last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block);
- xfs_trans_log_buf(tp, bp, first, last);
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
-}
-
-int /* error */
-xfs_bmbt_lookup_eq(
- xfs_btree_cur_t *cur,
- xfs_fileoff_t off,
- xfs_fsblock_t bno,
- xfs_filblks_t len,
- int *stat) /* success/failure */
-{
- cur->bc_rec.b.br_startoff = off;
- cur->bc_rec.b.br_startblock = bno;
- cur->bc_rec.b.br_blockcount = len;
- return xfs_bmbt_lookup(cur, XFS_LOOKUP_EQ, stat);
-}
-
-int /* error */
-xfs_bmbt_lookup_ge(
- xfs_btree_cur_t *cur,
- xfs_fileoff_t off,
- xfs_fsblock_t bno,
- xfs_filblks_t len,
- int *stat) /* success/failure */
-{
- cur->bc_rec.b.br_startoff = off;
- cur->bc_rec.b.br_startblock = bno;
- cur->bc_rec.b.br_blockcount = len;
- return xfs_bmbt_lookup(cur, XFS_LOOKUP_GE, stat);
-}
-
-/*
- * Give the bmap btree a new root block. Copy the old broot contents
- * down into a real block and make the broot point to it.
- */
-int /* error */
-xfs_bmbt_newroot(
- xfs_btree_cur_t *cur, /* btree cursor */
- int *logflags, /* logging flags for inode */
- int *stat) /* return status - 0 fail */
-{
- xfs_alloc_arg_t args; /* allocation arguments */
- xfs_bmbt_block_t *block; /* bmap btree block */
- xfs_buf_t *bp; /* buffer for block */
- xfs_bmbt_block_t *cblock; /* child btree block */
- xfs_bmbt_key_t *ckp; /* child key pointer */
- xfs_bmbt_ptr_t *cpp; /* child ptr pointer */
- int error; /* error return code */
-#ifdef DEBUG
- int i; /* loop counter */
-#endif
- xfs_bmbt_key_t *kp; /* pointer to bmap btree key */
- int level; /* btree level */
- xfs_bmbt_ptr_t *pp; /* pointer to bmap block addr */
-
- XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
- level = cur->bc_nlevels - 1;
- block = xfs_bmbt_get_block(cur, level, &bp);
- /*
- * Copy the root into a real block.
- */
- args.mp = cur->bc_mp;
- pp = XFS_BMAP_PTR_IADDR(block, 1, cur);
- args.tp = cur->bc_tp;
- args.fsbno = cur->bc_private.b.firstblock;
- args.mod = args.minleft = args.alignment = args.total = args.isfl =
- args.userdata = args.minalignslop = 0;
- args.minlen = args.maxlen = args.prod = 1;
- args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
- args.firstblock = args.fsbno;
- if (args.fsbno == NULLFSBLOCK) {
-#ifdef DEBUG
- if ((error = xfs_btree_check_lptr_disk(cur, *pp, level))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
-#endif
- args.fsbno = be64_to_cpu(*pp);
- args.type = XFS_ALLOCTYPE_START_BNO;
- } else if (cur->bc_private.b.flist->xbf_low)
- args.type = XFS_ALLOCTYPE_START_BNO;
- else
- args.type = XFS_ALLOCTYPE_NEAR_BNO;
- if ((error = xfs_alloc_vextent(&args))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- if (args.fsbno == NULLFSBLOCK) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *stat = 0;
- return 0;
- }
- ASSERT(args.len == 1);
- cur->bc_private.b.firstblock = args.fsbno;
- cur->bc_private.b.allocated++;
- cur->bc_private.b.ip->i_d.di_nblocks++;
- XFS_TRANS_MOD_DQUOT_BYINO(args.mp, args.tp, cur->bc_private.b.ip,
- XFS_TRANS_DQ_BCOUNT, 1L);
- bp = xfs_btree_get_bufl(args.mp, cur->bc_tp, args.fsbno, 0);
- cblock = XFS_BUF_TO_BMBT_BLOCK(bp);
- *cblock = *block;
- be16_add_cpu(&block->bb_level, 1);
- block->bb_numrecs = cpu_to_be16(1);
- cur->bc_nlevels++;
- cur->bc_ptrs[level + 1] = 1;
- kp = XFS_BMAP_KEY_IADDR(block, 1, cur);
- ckp = XFS_BMAP_KEY_IADDR(cblock, 1, cur);
- memcpy(ckp, kp, be16_to_cpu(cblock->bb_numrecs) * sizeof(*kp));
- cpp = XFS_BMAP_PTR_IADDR(cblock, 1, cur);
-#ifdef DEBUG
- for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
- if ((error = xfs_btree_check_lptr_disk(cur, pp[i], level))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- }
-#endif
- memcpy(cpp, pp, be16_to_cpu(cblock->bb_numrecs) * sizeof(*pp));
-#ifdef DEBUG
- if ((error = xfs_btree_check_lptr(cur, args.fsbno, level))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
-#endif
- *pp = cpu_to_be64(args.fsbno);
- xfs_iroot_realloc(cur->bc_private.b.ip, 1 - be16_to_cpu(cblock->bb_numrecs),
- cur->bc_private.b.whichfork);
- xfs_btree_setbuf(cur, level, bp);
- /*
- * Do all this logging at the end so that
- * the root is at the right level.
- */
- xfs_bmbt_log_block(cur, bp, XFS_BB_ALL_BITS);
- xfs_bmbt_log_keys(cur, bp, 1, be16_to_cpu(cblock->bb_numrecs));
- xfs_bmbt_log_ptrs(cur, bp, 1, be16_to_cpu(cblock->bb_numrecs));
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- *logflags |=
- XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork);
- *stat = 1;
- return 0;
-}
/*
* Set all the fields in a bmap extent record from the arguments.
@@ -2512,7 +429,8 @@ xfs_bmbt_set_state(
*/
void
xfs_bmbt_to_bmdr(
- xfs_bmbt_block_t *rblock,
+ struct xfs_mount *mp,
+ struct xfs_btree_block *rblock,
int rblocklen,
xfs_bmdr_block_t *dblock,
int dblocklen)
@@ -2524,67 +442,22 @@ xfs_bmbt_to_bmdr(
__be64 *tpp;
ASSERT(be32_to_cpu(rblock->bb_magic) == XFS_BMAP_MAGIC);
- ASSERT(be64_to_cpu(rblock->bb_leftsib) == NULLDFSBNO);
- ASSERT(be64_to_cpu(rblock->bb_rightsib) == NULLDFSBNO);
+ ASSERT(be64_to_cpu(rblock->bb_u.l.bb_leftsib) == NULLDFSBNO);
+ ASSERT(be64_to_cpu(rblock->bb_u.l.bb_rightsib) == NULLDFSBNO);
ASSERT(be16_to_cpu(rblock->bb_level) > 0);
dblock->bb_level = rblock->bb_level;
dblock->bb_numrecs = rblock->bb_numrecs;
- dmxr = (int)XFS_BTREE_BLOCK_MAXRECS(dblocklen, xfs_bmdr, 0);
- fkp = XFS_BMAP_BROOT_KEY_ADDR(rblock, 1, rblocklen);
- tkp = XFS_BTREE_KEY_ADDR(xfs_bmdr, dblock, 1);
- fpp = XFS_BMAP_BROOT_PTR_ADDR(rblock, 1, rblocklen);
- tpp = XFS_BTREE_PTR_ADDR(xfs_bmdr, dblock, 1, dmxr);
+ dmxr = xfs_bmdr_maxrecs(mp, dblocklen, 0);
+ fkp = XFS_BMBT_KEY_ADDR(mp, rblock, 1);
+ tkp = XFS_BMDR_KEY_ADDR(dblock, 1);
+ fpp = XFS_BMAP_BROOT_PTR_ADDR(mp, rblock, 1, rblocklen);
+ tpp = XFS_BMDR_PTR_ADDR(dblock, 1, dmxr);
dmxr = be16_to_cpu(dblock->bb_numrecs);
memcpy(tkp, fkp, sizeof(*fkp) * dmxr);
memcpy(tpp, fpp, sizeof(*fpp) * dmxr);
}
/*
- * Update the record to the passed values.
- */
-int
-xfs_bmbt_update(
- xfs_btree_cur_t *cur,
- xfs_fileoff_t off,
- xfs_fsblock_t bno,
- xfs_filblks_t len,
- xfs_exntst_t state)
-{
- xfs_bmbt_block_t *block;
- xfs_buf_t *bp;
- int error;
- xfs_bmbt_key_t key;
- int ptr;
- xfs_bmbt_rec_t *rp;
-
- XFS_BMBT_TRACE_CURSOR(cur, ENTRY);
- XFS_BMBT_TRACE_ARGFFFI(cur, (xfs_dfiloff_t)off, (xfs_dfsbno_t)bno,
- (xfs_dfilblks_t)len, (int)state);
- block = xfs_bmbt_get_block(cur, 0, &bp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_lblock(cur, block, 0, bp))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
-#endif
- ptr = cur->bc_ptrs[0];
- rp = XFS_BMAP_REC_IADDR(block, ptr, cur);
- xfs_bmbt_disk_set_allf(rp, off, bno, len, state);
- xfs_bmbt_log_recs(cur, bp, ptr, ptr);
- if (ptr > 1) {
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- return 0;
- }
- key.br_startoff = cpu_to_be64(off);
- if ((error = xfs_bmbt_updkey(cur, &key, 1))) {
- XFS_BMBT_TRACE_CURSOR(cur, ERROR);
- return error;
- }
- XFS_BMBT_TRACE_CURSOR(cur, EXIT);
- return 0;
-}
-
-/*
* Check extent records, which have just been read, for
* any bit in the extent flag field. ASSERT on debug
* kernels, as this condition should not occur.
@@ -2608,3 +481,451 @@ xfs_check_nostate_extents(
}
return 0;
}
+
+
+STATIC struct xfs_btree_cur *
+xfs_bmbt_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ struct xfs_btree_cur *new;
+
+ new = xfs_bmbt_init_cursor(cur->bc_mp, cur->bc_tp,
+ cur->bc_private.b.ip, cur->bc_private.b.whichfork);
+
+ /*
+ * Copy the firstblock, flist, and flags values,
+ * since init cursor doesn't get them.
+ */
+ new->bc_private.b.firstblock = cur->bc_private.b.firstblock;
+ new->bc_private.b.flist = cur->bc_private.b.flist;
+ new->bc_private.b.flags = cur->bc_private.b.flags;
+
+ return new;
+}
+
+STATIC void
+xfs_bmbt_update_cursor(
+ struct xfs_btree_cur *src,
+ struct xfs_btree_cur *dst)
+{
+ ASSERT((dst->bc_private.b.firstblock != NULLFSBLOCK) ||
+ (dst->bc_private.b.ip->i_d.di_flags & XFS_DIFLAG_REALTIME));
+ ASSERT(dst->bc_private.b.flist == src->bc_private.b.flist);
+
+ dst->bc_private.b.allocated += src->bc_private.b.allocated;
+ dst->bc_private.b.firstblock = src->bc_private.b.firstblock;
+
+ src->bc_private.b.allocated = 0;
+}
+
+STATIC int
+xfs_bmbt_alloc_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int length,
+ int *stat)
+{
+ xfs_alloc_arg_t args; /* block allocation args */
+ int error; /* error return value */
+
+ memset(&args, 0, sizeof(args));
+ args.tp = cur->bc_tp;
+ args.mp = cur->bc_mp;
+ args.fsbno = cur->bc_private.b.firstblock;
+ args.firstblock = args.fsbno;
+
+ if (args.fsbno == NULLFSBLOCK) {
+ args.fsbno = be64_to_cpu(start->l);
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ /*
+ * Make sure there is sufficient room left in the AG to
+ * complete a full tree split for an extent insert. If
+ * we are converting the middle part of an extent then
+ * we may need space for two tree splits.
+ *
+ * We are relying on the caller to make the correct block
+ * reservation for this operation to succeed. If the
+ * reservation amount is insufficient then we may fail a
+ * block allocation here and corrupt the filesystem.
+ */
+ args.minleft = xfs_trans_get_block_res(args.tp);
+ } else if (cur->bc_private.b.flist->xbf_low) {
+ args.type = XFS_ALLOCTYPE_START_BNO;
+ } else {
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+ }
+
+ args.minlen = args.maxlen = args.prod = 1;
+ args.wasdel = cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL;
+ if (!args.wasdel && xfs_trans_get_block_res(args.tp) == 0) {
+ error = XFS_ERROR(ENOSPC);
+ goto error0;
+ }
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ goto error0;
+
+ if (args.fsbno == NULLFSBLOCK && args.minleft) {
+ /*
+ * Could not find an AG with enough free space to satisfy
+ * a full btree split. Try again without minleft and if
+ * successful activate the lowspace algorithm.
+ */
+ args.fsbno = 0;
+ args.type = XFS_ALLOCTYPE_FIRST_AG;
+ args.minleft = 0;
+ error = xfs_alloc_vextent(&args);
+ if (error)
+ goto error0;
+ cur->bc_private.b.flist->xbf_low = 1;
+ }
+ if (args.fsbno == NULLFSBLOCK) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 0;
+ return 0;
+ }
+ ASSERT(args.len == 1);
+ cur->bc_private.b.firstblock = args.fsbno;
+ cur->bc_private.b.allocated++;
+ cur->bc_private.b.ip->i_d.di_nblocks++;
+ xfs_trans_log_inode(args.tp, cur->bc_private.b.ip, XFS_ILOG_CORE);
+ XFS_TRANS_MOD_DQUOT_BYINO(args.mp, args.tp, cur->bc_private.b.ip,
+ XFS_TRANS_DQ_BCOUNT, 1L);
+
+ new->l = cpu_to_be64(args.fsbno);
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 1;
+ return 0;
+
+ error0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+}
+
+STATIC int
+xfs_bmbt_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ struct xfs_inode *ip = cur->bc_private.b.ip;
+ struct xfs_trans *tp = cur->bc_tp;
+ xfs_fsblock_t fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
+
+ xfs_bmap_add_free(fsbno, 1, cur->bc_private.b.flist, mp);
+ ip->i_d.di_nblocks--;
+
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ XFS_TRANS_MOD_DQUOT_BYINO(mp, tp, ip, XFS_TRANS_DQ_BCOUNT, -1L);
+ xfs_trans_binval(tp, bp);
+ return 0;
+}
+
+STATIC int
+xfs_bmbt_get_minrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ if (level == cur->bc_nlevels - 1) {
+ struct xfs_ifork *ifp;
+
+ ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
+ cur->bc_private.b.whichfork);
+
+ return xfs_bmbt_maxrecs(cur->bc_mp,
+ ifp->if_broot_bytes, level == 0) / 2;
+ }
+
+ return cur->bc_mp->m_bmap_dmnr[level != 0];
+}
+
+int
+xfs_bmbt_get_maxrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ if (level == cur->bc_nlevels - 1) {
+ struct xfs_ifork *ifp;
+
+ ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
+ cur->bc_private.b.whichfork);
+
+ return xfs_bmbt_maxrecs(cur->bc_mp,
+ ifp->if_broot_bytes, level == 0);
+ }
+
+ return cur->bc_mp->m_bmap_dmxr[level != 0];
+
+}
+
+/*
+ * Get the maximum records we could store in the on-disk format.
+ *
+ * For non-root nodes this is equivalent to xfs_bmbt_get_maxrecs, but
+ * for the root node this checks the available space in the dinode fork
+ * so that we can resize the in-memory buffer to match it. After a
+ * resize to the maximum size this function returns the same value
+ * as xfs_bmbt_get_maxrecs for the root node, too.
+ */
+STATIC int
+xfs_bmbt_get_dmaxrecs(
+ struct xfs_btree_cur *cur,
+ int level)
+{
+ if (level != cur->bc_nlevels - 1)
+ return cur->bc_mp->m_bmap_dmxr[level != 0];
+ return xfs_bmdr_maxrecs(cur->bc_mp, cur->bc_private.b.forksize,
+ level == 0);
+}
+
+STATIC void
+xfs_bmbt_init_key_from_rec(
+ union xfs_btree_key *key,
+ union xfs_btree_rec *rec)
+{
+ key->bmbt.br_startoff =
+ cpu_to_be64(xfs_bmbt_disk_get_startoff(&rec->bmbt));
+}
+
+STATIC void
+xfs_bmbt_init_rec_from_key(
+ union xfs_btree_key *key,
+ union xfs_btree_rec *rec)
+{
+ ASSERT(key->bmbt.br_startoff != 0);
+
+ xfs_bmbt_disk_set_allf(&rec->bmbt, be64_to_cpu(key->bmbt.br_startoff),
+ 0, 0, XFS_EXT_NORM);
+}
+
+STATIC void
+xfs_bmbt_init_rec_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec)
+{
+ xfs_bmbt_disk_set_all(&rec->bmbt, &cur->bc_rec.b);
+}
+
+STATIC void
+xfs_bmbt_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ ptr->l = 0;
+}
+
+STATIC __int64_t
+xfs_bmbt_key_diff(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *key)
+{
+ return (__int64_t)be64_to_cpu(key->bmbt.br_startoff) -
+ cur->bc_rec.b.br_startoff;
+}
+
+#ifdef DEBUG
+STATIC int
+xfs_bmbt_keys_inorder(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *k1,
+ union xfs_btree_key *k2)
+{
+ return be64_to_cpu(k1->bmbt.br_startoff) <
+ be64_to_cpu(k2->bmbt.br_startoff);
+}
+
+STATIC int
+xfs_bmbt_recs_inorder(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *r1,
+ union xfs_btree_rec *r2)
+{
+ return xfs_bmbt_disk_get_startoff(&r1->bmbt) +
+ xfs_bmbt_disk_get_blockcount(&r1->bmbt) <=
+ xfs_bmbt_disk_get_startoff(&r2->bmbt);
+}
+#endif /* DEBUG */
+
+#ifdef XFS_BTREE_TRACE
+ktrace_t *xfs_bmbt_trace_buf;
+
+STATIC void
+xfs_bmbt_trace_enter(
+ struct xfs_btree_cur *cur,
+ const char *func,
+ char *s,
+ int type,
+ int line,
+ __psunsigned_t a0,
+ __psunsigned_t a1,
+ __psunsigned_t a2,
+ __psunsigned_t a3,
+ __psunsigned_t a4,
+ __psunsigned_t a5,
+ __psunsigned_t a6,
+ __psunsigned_t a7,
+ __psunsigned_t a8,
+ __psunsigned_t a9,
+ __psunsigned_t a10)
+{
+ struct xfs_inode *ip = cur->bc_private.b.ip;
+ int whichfork = cur->bc_private.b.whichfork;
+
+ ktrace_enter(xfs_bmbt_trace_buf,
+ (void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
+ (void *)func, (void *)s, (void *)ip, (void *)cur,
+ (void *)a0, (void *)a1, (void *)a2, (void *)a3,
+ (void *)a4, (void *)a5, (void *)a6, (void *)a7,
+ (void *)a8, (void *)a9, (void *)a10);
+ ktrace_enter(ip->i_btrace,
+ (void *)((__psint_t)type | (whichfork << 8) | (line << 16)),
+ (void *)func, (void *)s, (void *)ip, (void *)cur,
+ (void *)a0, (void *)a1, (void *)a2, (void *)a3,
+ (void *)a4, (void *)a5, (void *)a6, (void *)a7,
+ (void *)a8, (void *)a9, (void *)a10);
+}
+
+STATIC void
+xfs_bmbt_trace_cursor(
+ struct xfs_btree_cur *cur,
+ __uint32_t *s0,
+ __uint64_t *l0,
+ __uint64_t *l1)
+{
+ struct xfs_bmbt_rec_host r;
+
+ xfs_bmbt_set_all(&r, &cur->bc_rec.b);
+
+ *s0 = (cur->bc_nlevels << 24) |
+ (cur->bc_private.b.flags << 16) |
+ cur->bc_private.b.allocated;
+ *l0 = r.l0;
+ *l1 = r.l1;
+}
+
+STATIC void
+xfs_bmbt_trace_key(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *key,
+ __uint64_t *l0,
+ __uint64_t *l1)
+{
+ *l0 = be64_to_cpu(key->bmbt.br_startoff);
+ *l1 = 0;
+}
+
+STATIC void
+xfs_bmbt_trace_record(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec,
+ __uint64_t *l0,
+ __uint64_t *l1,
+ __uint64_t *l2)
+{
+ struct xfs_bmbt_irec irec;
+
+ xfs_bmbt_disk_get_all(&rec->bmbt, &irec);
+ *l0 = irec.br_startoff;
+ *l1 = irec.br_startblock;
+ *l2 = irec.br_blockcount;
+}
+#endif /* XFS_BTREE_TRACE */
+
+static const struct xfs_btree_ops xfs_bmbt_ops = {
+ .rec_len = sizeof(xfs_bmbt_rec_t),
+ .key_len = sizeof(xfs_bmbt_key_t),
+
+ .dup_cursor = xfs_bmbt_dup_cursor,
+ .update_cursor = xfs_bmbt_update_cursor,
+ .alloc_block = xfs_bmbt_alloc_block,
+ .free_block = xfs_bmbt_free_block,
+ .get_maxrecs = xfs_bmbt_get_maxrecs,
+ .get_minrecs = xfs_bmbt_get_minrecs,
+ .get_dmaxrecs = xfs_bmbt_get_dmaxrecs,
+ .init_key_from_rec = xfs_bmbt_init_key_from_rec,
+ .init_rec_from_key = xfs_bmbt_init_rec_from_key,
+ .init_rec_from_cur = xfs_bmbt_init_rec_from_cur,
+ .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur,
+ .key_diff = xfs_bmbt_key_diff,
+
+#ifdef DEBUG
+ .keys_inorder = xfs_bmbt_keys_inorder,
+ .recs_inorder = xfs_bmbt_recs_inorder,
+#endif
+
+#ifdef XFS_BTREE_TRACE
+ .trace_enter = xfs_bmbt_trace_enter,
+ .trace_cursor = xfs_bmbt_trace_cursor,
+ .trace_key = xfs_bmbt_trace_key,
+ .trace_record = xfs_bmbt_trace_record,
+#endif
+};
+
+/*
+ * Allocate a new bmap btree cursor.
+ */
+struct xfs_btree_cur * /* new bmap btree cursor */
+xfs_bmbt_init_cursor(
+ struct xfs_mount *mp, /* file system mount point */
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_inode *ip, /* inode owning the btree */
+ int whichfork) /* data or attr fork */
+{
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_btree_cur *cur;
+
+ cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
+
+ cur->bc_tp = tp;
+ cur->bc_mp = mp;
+ cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
+ cur->bc_btnum = XFS_BTNUM_BMAP;
+ cur->bc_blocklog = mp->m_sb.sb_blocklog;
+
+ cur->bc_ops = &xfs_bmbt_ops;
+ cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE;
+
+ cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork);
+ cur->bc_private.b.ip = ip;
+ cur->bc_private.b.firstblock = NULLFSBLOCK;
+ cur->bc_private.b.flist = NULL;
+ cur->bc_private.b.allocated = 0;
+ cur->bc_private.b.flags = 0;
+ cur->bc_private.b.whichfork = whichfork;
+
+ return cur;
+}
+
+/*
+ * Calculate number of records in a bmap btree block.
+ */
+int
+xfs_bmbt_maxrecs(
+ struct xfs_mount *mp,
+ int blocklen,
+ int leaf)
+{
+ blocklen -= XFS_BMBT_BLOCK_LEN(mp);
+
+ if (leaf)
+ return blocklen / sizeof(xfs_bmbt_rec_t);
+ return blocklen / (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t));
+}
+
+/*
+ * Calculate number of records in a bmap btree inode root.
+ */
+int
+xfs_bmdr_maxrecs(
+ struct xfs_mount *mp,
+ int blocklen,
+ int leaf)
+{
+ blocklen -= sizeof(xfs_bmdr_block_t);
+
+ if (leaf)
+ return blocklen / sizeof(xfs_bmdr_rec_t);
+ return blocklen / (sizeof(xfs_bmdr_key_t) + sizeof(xfs_bmdr_ptr_t));
+}
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index cd0d4b4bb816..a4555abb6622 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -21,9 +21,10 @@
#define XFS_BMAP_MAGIC 0x424d4150 /* 'BMAP' */
struct xfs_btree_cur;
-struct xfs_btree_lblock;
+struct xfs_btree_block;
struct xfs_mount;
struct xfs_inode;
+struct xfs_trans;
/*
* Bmap root header, on-disk form only.
@@ -145,71 +146,60 @@ typedef struct xfs_bmbt_key {
/* btree pointer type */
typedef __be64 xfs_bmbt_ptr_t, xfs_bmdr_ptr_t;
-/* btree block header type */
-typedef struct xfs_btree_lblock xfs_bmbt_block_t;
-
-#define XFS_BUF_TO_BMBT_BLOCK(bp) ((xfs_bmbt_block_t *)XFS_BUF_PTR(bp))
-
-#define XFS_BMAP_RBLOCK_DSIZE(lev,cur) ((cur)->bc_private.b.forksize)
-#define XFS_BMAP_RBLOCK_ISIZE(lev,cur) \
- ((int)XFS_IFORK_PTR((cur)->bc_private.b.ip, \
- (cur)->bc_private.b.whichfork)->if_broot_bytes)
-
-#define XFS_BMAP_BLOCK_DMAXRECS(lev,cur) \
- (((lev) == (cur)->bc_nlevels - 1 ? \
- XFS_BTREE_BLOCK_MAXRECS(XFS_BMAP_RBLOCK_DSIZE(lev,cur), \
- xfs_bmdr, (lev) == 0) : \
- ((cur)->bc_mp->m_bmap_dmxr[(lev) != 0])))
-#define XFS_BMAP_BLOCK_IMAXRECS(lev,cur) \
- (((lev) == (cur)->bc_nlevels - 1 ? \
- XFS_BTREE_BLOCK_MAXRECS(XFS_BMAP_RBLOCK_ISIZE(lev,cur),\
- xfs_bmbt, (lev) == 0) : \
- ((cur)->bc_mp->m_bmap_dmxr[(lev) != 0])))
-
-#define XFS_BMAP_BLOCK_DMINRECS(lev,cur) \
- (((lev) == (cur)->bc_nlevels - 1 ? \
- XFS_BTREE_BLOCK_MINRECS(XFS_BMAP_RBLOCK_DSIZE(lev,cur),\
- xfs_bmdr, (lev) == 0) : \
- ((cur)->bc_mp->m_bmap_dmnr[(lev) != 0])))
-#define XFS_BMAP_BLOCK_IMINRECS(lev,cur) \
- (((lev) == (cur)->bc_nlevels - 1 ? \
- XFS_BTREE_BLOCK_MINRECS(XFS_BMAP_RBLOCK_ISIZE(lev,cur),\
- xfs_bmbt, (lev) == 0) : \
- ((cur)->bc_mp->m_bmap_dmnr[(lev) != 0])))
-
-#define XFS_BMAP_REC_DADDR(bb,i,cur) (XFS_BTREE_REC_ADDR(xfs_bmbt, bb, i))
-
-#define XFS_BMAP_REC_IADDR(bb,i,cur) (XFS_BTREE_REC_ADDR(xfs_bmbt, bb, i))
-
-#define XFS_BMAP_KEY_DADDR(bb,i,cur) \
- (XFS_BTREE_KEY_ADDR(xfs_bmbt, bb, i))
-
-#define XFS_BMAP_KEY_IADDR(bb,i,cur) \
- (XFS_BTREE_KEY_ADDR(xfs_bmbt, bb, i))
-
-#define XFS_BMAP_PTR_DADDR(bb,i,cur) \
- (XFS_BTREE_PTR_ADDR(xfs_bmbt, bb, i, XFS_BMAP_BLOCK_DMAXRECS( \
- be16_to_cpu((bb)->bb_level), cur)))
-#define XFS_BMAP_PTR_IADDR(bb,i,cur) \
- (XFS_BTREE_PTR_ADDR(xfs_bmbt, bb, i, XFS_BMAP_BLOCK_IMAXRECS( \
- be16_to_cpu((bb)->bb_level), cur)))
+/*
+ * Btree block header size depends on a superblock flag.
+ *
+ * (not quite yet, but soon)
+ */
+#define XFS_BMBT_BLOCK_LEN(mp) XFS_BTREE_LBLOCK_LEN
+
+#define XFS_BMBT_REC_ADDR(mp, block, index) \
+ ((xfs_bmbt_rec_t *) \
+ ((char *)(block) + \
+ XFS_BMBT_BLOCK_LEN(mp) + \
+ ((index) - 1) * sizeof(xfs_bmbt_rec_t)))
+
+#define XFS_BMBT_KEY_ADDR(mp, block, index) \
+ ((xfs_bmbt_key_t *) \
+ ((char *)(block) + \
+ XFS_BMBT_BLOCK_LEN(mp) + \
+ ((index) - 1) * sizeof(xfs_bmbt_key_t)))
+
+#define XFS_BMBT_PTR_ADDR(mp, block, index, maxrecs) \
+ ((xfs_bmbt_ptr_t *) \
+ ((char *)(block) + \
+ XFS_BMBT_BLOCK_LEN(mp) + \
+ (maxrecs) * sizeof(xfs_bmbt_key_t) + \
+ ((index) - 1) * sizeof(xfs_bmbt_ptr_t)))
+
+#define XFS_BMDR_REC_ADDR(block, index) \
+ ((xfs_bmdr_rec_t *) \
+ ((char *)(block) + \
+ sizeof(struct xfs_bmdr_block) + \
+ ((index) - 1) * sizeof(xfs_bmdr_rec_t)))
+
+#define XFS_BMDR_KEY_ADDR(block, index) \
+ ((xfs_bmdr_key_t *) \
+ ((char *)(block) + \
+ sizeof(struct xfs_bmdr_block) + \
+ ((index) - 1) * sizeof(xfs_bmdr_key_t)))
+
+#define XFS_BMDR_PTR_ADDR(block, index, maxrecs) \
+ ((xfs_bmdr_ptr_t *) \
+ ((char *)(block) + \
+ sizeof(struct xfs_bmdr_block) + \
+ (maxrecs) * sizeof(xfs_bmdr_key_t) + \
+ ((index) - 1) * sizeof(xfs_bmdr_ptr_t)))
/*
* These are to be used when we know the size of the block and
* we don't have a cursor.
*/
-#define XFS_BMAP_BROOT_REC_ADDR(bb,i,sz) \
- (XFS_BTREE_REC_ADDR(xfs_bmbt,bb,i))
-#define XFS_BMAP_BROOT_KEY_ADDR(bb,i,sz) \
- (XFS_BTREE_KEY_ADDR(xfs_bmbt,bb,i))
-#define XFS_BMAP_BROOT_PTR_ADDR(bb,i,sz) \
- (XFS_BTREE_PTR_ADDR(xfs_bmbt,bb,i,XFS_BMAP_BROOT_MAXRECS(sz)))
-
-#define XFS_BMAP_BROOT_NUMRECS(bb) be16_to_cpu((bb)->bb_numrecs)
-#define XFS_BMAP_BROOT_MAXRECS(sz) XFS_BTREE_BLOCK_MAXRECS(sz,xfs_bmbt,0)
+#define XFS_BMAP_BROOT_PTR_ADDR(mp, bb, i, sz) \
+ XFS_BMBT_PTR_ADDR(mp, bb, i, xfs_bmbt_maxrecs(mp, sz, 0))
#define XFS_BMAP_BROOT_SPACE_CALC(nrecs) \
- (int)(sizeof(xfs_bmbt_block_t) + \
+ (int)(XFS_BTREE_LBLOCK_LEN + \
((nrecs) * (sizeof(xfs_bmbt_key_t) + sizeof(xfs_bmbt_ptr_t))))
#define XFS_BMAP_BROOT_SPACE(bb) \
@@ -223,42 +213,12 @@ typedef struct xfs_btree_lblock xfs_bmbt_block_t;
*/
#define XFS_BM_MAXLEVELS(mp,w) ((mp)->m_bm_maxlevels[(w)])
-#define XFS_BMAP_SANITY_CHECK(mp,bb,level) \
- (be32_to_cpu((bb)->bb_magic) == XFS_BMAP_MAGIC && \
- be16_to_cpu((bb)->bb_level) == level && \
- be16_to_cpu((bb)->bb_numrecs) > 0 && \
- be16_to_cpu((bb)->bb_numrecs) <= (mp)->m_bmap_dmxr[(level) != 0])
-
-
-#ifdef __KERNEL__
-
-#if defined(XFS_BMBT_TRACE)
-/*
- * Trace buffer entry types.
- */
-#define XFS_BMBT_KTRACE_ARGBI 1
-#define XFS_BMBT_KTRACE_ARGBII 2
-#define XFS_BMBT_KTRACE_ARGFFFI 3
-#define XFS_BMBT_KTRACE_ARGI 4
-#define XFS_BMBT_KTRACE_ARGIFK 5
-#define XFS_BMBT_KTRACE_ARGIFR 6
-#define XFS_BMBT_KTRACE_ARGIK 7
-#define XFS_BMBT_KTRACE_CUR 8
-
-#define XFS_BMBT_TRACE_SIZE 4096 /* size of global trace buffer */
-#define XFS_BMBT_KTRACE_SIZE 32 /* size of per-inode trace buffer */
-extern ktrace_t *xfs_bmbt_trace_buf;
-#endif
-
/*
* Prototypes for xfs_bmap.c to call.
*/
-extern void xfs_bmdr_to_bmbt(xfs_bmdr_block_t *, int, xfs_bmbt_block_t *, int);
-extern int xfs_bmbt_decrement(struct xfs_btree_cur *, int, int *);
-extern int xfs_bmbt_delete(struct xfs_btree_cur *, int *);
+extern void xfs_bmdr_to_bmbt(struct xfs_mount *, xfs_bmdr_block_t *, int,
+ struct xfs_btree_block *, int);
extern void xfs_bmbt_get_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s);
-extern xfs_bmbt_block_t *xfs_bmbt_get_block(struct xfs_btree_cur *cur,
- int, struct xfs_buf **bpp);
extern xfs_filblks_t xfs_bmbt_get_blockcount(xfs_bmbt_rec_host_t *r);
extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r);
extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r);
@@ -268,22 +228,6 @@ extern void xfs_bmbt_disk_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r);
extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r);
-extern int xfs_bmbt_increment(struct xfs_btree_cur *, int, int *);
-extern int xfs_bmbt_insert(struct xfs_btree_cur *, int *);
-extern void xfs_bmbt_log_block(struct xfs_btree_cur *, struct xfs_buf *, int);
-extern void xfs_bmbt_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int,
- int);
-extern int xfs_bmbt_lookup_eq(struct xfs_btree_cur *, xfs_fileoff_t,
- xfs_fsblock_t, xfs_filblks_t, int *);
-extern int xfs_bmbt_lookup_ge(struct xfs_btree_cur *, xfs_fileoff_t,
- xfs_fsblock_t, xfs_filblks_t, int *);
-
-/*
- * Give the bmap btree a new root block. Copy the old broot contents
- * down into a real block and make the broot point to it.
- */
-extern int xfs_bmbt_newroot(struct xfs_btree_cur *cur, int *lflags, int *stat);
-
extern void xfs_bmbt_set_all(xfs_bmbt_rec_host_t *r, xfs_bmbt_irec_t *s);
extern void xfs_bmbt_set_allf(xfs_bmbt_rec_host_t *r, xfs_fileoff_t o,
xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
@@ -296,10 +240,15 @@ extern void xfs_bmbt_disk_set_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s);
extern void xfs_bmbt_disk_set_allf(xfs_bmbt_rec_t *r, xfs_fileoff_t o,
xfs_fsblock_t b, xfs_filblks_t c, xfs_exntst_t v);
-extern void xfs_bmbt_to_bmdr(xfs_bmbt_block_t *, int, xfs_bmdr_block_t *, int);
-extern int xfs_bmbt_update(struct xfs_btree_cur *, xfs_fileoff_t,
- xfs_fsblock_t, xfs_filblks_t, xfs_exntst_t);
+extern void xfs_bmbt_to_bmdr(struct xfs_mount *, struct xfs_btree_block *, int,
+ xfs_bmdr_block_t *, int);
+
+extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level);
+extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf);
+extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
+
+extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
+ struct xfs_trans *, struct xfs_inode *, int);
-#endif /* __KERNEL__ */
#endif /* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index cc593a84c345..7ed59267420d 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -34,7 +34,9 @@
#include "xfs_attr_sf.h"
#include "xfs_dinode.h"
#include "xfs_inode.h"
+#include "xfs_inode_item.h"
#include "xfs_btree.h"
+#include "xfs_btree_trace.h"
#include "xfs_ialloc.h"
#include "xfs_error.h"
@@ -50,135 +52,33 @@ const __uint32_t xfs_magics[XFS_BTNUM_MAX] = {
XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, XFS_BMAP_MAGIC, XFS_IBT_MAGIC
};
-/*
- * Checking routine: return maxrecs for the block.
- */
-STATIC int /* number of records fitting in block */
-xfs_btree_maxrecs(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_btree_block_t *block) /* generic btree block pointer */
-{
- switch (cur->bc_btnum) {
- case XFS_BTNUM_BNO:
- case XFS_BTNUM_CNT:
- return (int)XFS_ALLOC_BLOCK_MAXRECS(
- be16_to_cpu(block->bb_h.bb_level), cur);
- case XFS_BTNUM_BMAP:
- return (int)XFS_BMAP_BLOCK_IMAXRECS(
- be16_to_cpu(block->bb_h.bb_level), cur);
- case XFS_BTNUM_INO:
- return (int)XFS_INOBT_BLOCK_MAXRECS(
- be16_to_cpu(block->bb_h.bb_level), cur);
- default:
- ASSERT(0);
- return 0;
- }
-}
-
-/*
- * External routines.
- */
-
-#ifdef DEBUG
-/*
- * Debug routine: check that block header is ok.
- */
-void
-xfs_btree_check_block(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_btree_block_t *block, /* generic btree block pointer */
- int level, /* level of the btree block */
- xfs_buf_t *bp) /* buffer containing block, if any */
-{
- if (XFS_BTREE_LONG_PTRS(cur->bc_btnum))
- xfs_btree_check_lblock(cur, (xfs_btree_lblock_t *)block, level,
- bp);
- else
- xfs_btree_check_sblock(cur, (xfs_btree_sblock_t *)block, level,
- bp);
-}
-
-/*
- * Debug routine: check that keys are in the right order.
- */
-void
-xfs_btree_check_key(
- xfs_btnum_t btnum, /* btree identifier */
- void *ak1, /* pointer to left (lower) key */
- void *ak2) /* pointer to right (higher) key */
-{
- switch (btnum) {
- case XFS_BTNUM_BNO: {
- xfs_alloc_key_t *k1;
- xfs_alloc_key_t *k2;
-
- k1 = ak1;
- k2 = ak2;
- ASSERT(be32_to_cpu(k1->ar_startblock) < be32_to_cpu(k2->ar_startblock));
- break;
- }
- case XFS_BTNUM_CNT: {
- xfs_alloc_key_t *k1;
- xfs_alloc_key_t *k2;
-
- k1 = ak1;
- k2 = ak2;
- ASSERT(be32_to_cpu(k1->ar_blockcount) < be32_to_cpu(k2->ar_blockcount) ||
- (k1->ar_blockcount == k2->ar_blockcount &&
- be32_to_cpu(k1->ar_startblock) < be32_to_cpu(k2->ar_startblock)));
- break;
- }
- case XFS_BTNUM_BMAP: {
- xfs_bmbt_key_t *k1;
- xfs_bmbt_key_t *k2;
-
- k1 = ak1;
- k2 = ak2;
- ASSERT(be64_to_cpu(k1->br_startoff) < be64_to_cpu(k2->br_startoff));
- break;
- }
- case XFS_BTNUM_INO: {
- xfs_inobt_key_t *k1;
- xfs_inobt_key_t *k2;
-
- k1 = ak1;
- k2 = ak2;
- ASSERT(be32_to_cpu(k1->ir_startino) < be32_to_cpu(k2->ir_startino));
- break;
- }
- default:
- ASSERT(0);
- }
-}
-#endif /* DEBUG */
-/*
- * Checking routine: check that long form block header is ok.
- */
-/* ARGSUSED */
-int /* error (0 or EFSCORRUPTED) */
+STATIC int /* error (0 or EFSCORRUPTED) */
xfs_btree_check_lblock(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_btree_lblock_t *block, /* btree long form block pointer */
+ struct xfs_btree_cur *cur, /* btree cursor */
+ struct xfs_btree_block *block, /* btree long form block pointer */
int level, /* level of the btree block */
- xfs_buf_t *bp) /* buffer for block, if any */
+ struct xfs_buf *bp) /* buffer for block, if any */
{
int lblock_ok; /* block passes checks */
- xfs_mount_t *mp; /* file system mount point */
+ struct xfs_mount *mp; /* file system mount point */
mp = cur->bc_mp;
lblock_ok =
be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] &&
be16_to_cpu(block->bb_level) == level &&
be16_to_cpu(block->bb_numrecs) <=
- xfs_btree_maxrecs(cur, (xfs_btree_block_t *)block) &&
- block->bb_leftsib &&
- (be64_to_cpu(block->bb_leftsib) == NULLDFSBNO ||
- XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_leftsib))) &&
- block->bb_rightsib &&
- (be64_to_cpu(block->bb_rightsib) == NULLDFSBNO ||
- XFS_FSB_SANITY_CHECK(mp, be64_to_cpu(block->bb_rightsib)));
- if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp, XFS_ERRTAG_BTREE_CHECK_LBLOCK,
+ cur->bc_ops->get_maxrecs(cur, level) &&
+ block->bb_u.l.bb_leftsib &&
+ (be64_to_cpu(block->bb_u.l.bb_leftsib) == NULLDFSBNO ||
+ XFS_FSB_SANITY_CHECK(mp,
+ be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
+ block->bb_u.l.bb_rightsib &&
+ (be64_to_cpu(block->bb_u.l.bb_rightsib) == NULLDFSBNO ||
+ XFS_FSB_SANITY_CHECK(mp,
+ be64_to_cpu(block->bb_u.l.bb_rightsib)));
+ if (unlikely(XFS_TEST_ERROR(!lblock_ok, mp,
+ XFS_ERRTAG_BTREE_CHECK_LBLOCK,
XFS_RANDOM_BTREE_CHECK_LBLOCK))) {
if (bp)
xfs_buftrace("LBTREE ERROR", bp);
@@ -189,98 +89,15 @@ xfs_btree_check_lblock(
return 0;
}
-/*
- * Checking routine: check that (long) pointer is ok.
- */
-int /* error (0 or EFSCORRUPTED) */
-xfs_btree_check_lptr(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_dfsbno_t ptr, /* btree block disk address */
- int level) /* btree block level */
-{
- xfs_mount_t *mp; /* file system mount point */
-
- mp = cur->bc_mp;
- XFS_WANT_CORRUPTED_RETURN(
- level > 0 &&
- ptr != NULLDFSBNO &&
- XFS_FSB_SANITY_CHECK(mp, ptr));
- return 0;
-}
-
-#ifdef DEBUG
-/*
- * Debug routine: check that records are in the right order.
- */
-void
-xfs_btree_check_rec(
- xfs_btnum_t btnum, /* btree identifier */
- void *ar1, /* pointer to left (lower) record */
- void *ar2) /* pointer to right (higher) record */
-{
- switch (btnum) {
- case XFS_BTNUM_BNO: {
- xfs_alloc_rec_t *r1;
- xfs_alloc_rec_t *r2;
-
- r1 = ar1;
- r2 = ar2;
- ASSERT(be32_to_cpu(r1->ar_startblock) +
- be32_to_cpu(r1->ar_blockcount) <=
- be32_to_cpu(r2->ar_startblock));
- break;
- }
- case XFS_BTNUM_CNT: {
- xfs_alloc_rec_t *r1;
- xfs_alloc_rec_t *r2;
-
- r1 = ar1;
- r2 = ar2;
- ASSERT(be32_to_cpu(r1->ar_blockcount) < be32_to_cpu(r2->ar_blockcount) ||
- (r1->ar_blockcount == r2->ar_blockcount &&
- be32_to_cpu(r1->ar_startblock) < be32_to_cpu(r2->ar_startblock)));
- break;
- }
- case XFS_BTNUM_BMAP: {
- xfs_bmbt_rec_t *r1;
- xfs_bmbt_rec_t *r2;
-
- r1 = ar1;
- r2 = ar2;
- ASSERT(xfs_bmbt_disk_get_startoff(r1) +
- xfs_bmbt_disk_get_blockcount(r1) <=
- xfs_bmbt_disk_get_startoff(r2));
- break;
- }
- case XFS_BTNUM_INO: {
- xfs_inobt_rec_t *r1;
- xfs_inobt_rec_t *r2;
-
- r1 = ar1;
- r2 = ar2;
- ASSERT(be32_to_cpu(r1->ir_startino) + XFS_INODES_PER_CHUNK <=
- be32_to_cpu(r2->ir_startino));
- break;
- }
- default:
- ASSERT(0);
- }
-}
-#endif /* DEBUG */
-
-/*
- * Checking routine: check that block header is ok.
- */
-/* ARGSUSED */
-int /* error (0 or EFSCORRUPTED) */
+STATIC int /* error (0 or EFSCORRUPTED) */
xfs_btree_check_sblock(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_btree_sblock_t *block, /* btree short form block pointer */
+ struct xfs_btree_cur *cur, /* btree cursor */
+ struct xfs_btree_block *block, /* btree short form block pointer */
int level, /* level of the btree block */
- xfs_buf_t *bp) /* buffer containing block */
+ struct xfs_buf *bp) /* buffer containing block */
{
- xfs_buf_t *agbp; /* buffer for ag. freespace struct */
- xfs_agf_t *agf; /* ag. freespace structure */
+ struct xfs_buf *agbp; /* buffer for ag. freespace struct */
+ struct xfs_agf *agf; /* ag. freespace structure */
xfs_agblock_t agflen; /* native ag. freespace length */
int sblock_ok; /* block passes checks */
@@ -291,13 +108,13 @@ xfs_btree_check_sblock(
be32_to_cpu(block->bb_magic) == xfs_magics[cur->bc_btnum] &&
be16_to_cpu(block->bb_level) == level &&
be16_to_cpu(block->bb_numrecs) <=
- xfs_btree_maxrecs(cur, (xfs_btree_block_t *)block) &&
- (be32_to_cpu(block->bb_leftsib) == NULLAGBLOCK ||
- be32_to_cpu(block->bb_leftsib) < agflen) &&
- block->bb_leftsib &&
- (be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK ||
- be32_to_cpu(block->bb_rightsib) < agflen) &&
- block->bb_rightsib;
+ cur->bc_ops->get_maxrecs(cur, level) &&
+ (be32_to_cpu(block->bb_u.s.bb_leftsib) == NULLAGBLOCK ||
+ be32_to_cpu(block->bb_u.s.bb_leftsib) < agflen) &&
+ block->bb_u.s.bb_leftsib &&
+ (be32_to_cpu(block->bb_u.s.bb_rightsib) == NULLAGBLOCK ||
+ be32_to_cpu(block->bb_u.s.bb_rightsib) < agflen) &&
+ block->bb_u.s.bb_rightsib;
if (unlikely(XFS_TEST_ERROR(!sblock_ok, cur->bc_mp,
XFS_ERRTAG_BTREE_CHECK_SBLOCK,
XFS_RANDOM_BTREE_CHECK_SBLOCK))) {
@@ -311,27 +128,78 @@ xfs_btree_check_sblock(
}
/*
- * Checking routine: check that (short) pointer is ok.
+ * Debug routine: check that block header is ok.
+ */
+int
+xfs_btree_check_block(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	struct xfs_btree_block	*block,	/* generic btree block pointer */
+	int			level,	/* level of the btree block */
+	struct xfs_buf		*bp)	/* buffer containing block, if any */
+{
+	/*
+	 * Dispatch on the cursor's pointer size: long-pointer btrees go to
+	 * the long form checker, everything else to the short form one.
+	 * The checker's result is passed straight back to the caller.
+	 */
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+		return xfs_btree_check_lblock(cur, block, level, bp);
+
+	return xfs_btree_check_sblock(cur, block, level, bp);
+}
+
+/*
+ * Check that (long) pointer is ok.
*/
int /* error (0 or EFSCORRUPTED) */
+xfs_btree_check_lptr(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_dfsbno_t bno, /* btree block disk address */
+ int level) /* btree block level */
+{
+ XFS_WANT_CORRUPTED_RETURN(
+ level > 0 &&
+ bno != NULLDFSBNO &&
+ XFS_FSB_SANITY_CHECK(cur->bc_mp, bno));
+ return 0;
+}
+
+#ifdef DEBUG
+/*
+ * Check that (short) pointer is ok.
+ */
+STATIC int /* error (0 or EFSCORRUPTED) */
xfs_btree_check_sptr(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_agblock_t ptr, /* btree block disk address */
- int level) /* btree block level */
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agblock_t bno, /* btree block disk address */
+ int level) /* btree block level */
{
- xfs_buf_t *agbp; /* buffer for ag. freespace struct */
- xfs_agf_t *agf; /* ag. freespace structure */
+ xfs_agblock_t agblocks = cur->bc_mp->m_sb.sb_agblocks;
- agbp = cur->bc_private.a.agbp;
- agf = XFS_BUF_TO_AGF(agbp);
XFS_WANT_CORRUPTED_RETURN(
level > 0 &&
- ptr != NULLAGBLOCK && ptr != 0 &&
- ptr < be32_to_cpu(agf->agf_length));
+ bno != NULLAGBLOCK &&
+ bno != 0 &&
+ bno < agblocks);
return 0;
}
/*
+ * Check that block ptr is ok.
+ */
+STATIC int				/* error (0 or EFSCORRUPTED) */
+xfs_btree_check_ptr(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	union xfs_btree_ptr	*ptr,	/* btree block disk address */
+	int			index,	/* offset from ptr to check */
+	int			level)	/* btree block level */
+{
+	/*
+	 * ptr is used as the start of an array of same-sized on-disk
+	 * pointers. Pick element 'index', convert it to CPU byte order and
+	 * hand it to the checker matching the cursor's pointer size.
+	 */
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+		return xfs_btree_check_lptr(cur,
+				be64_to_cpu(*(&ptr->l + index)), level);
+
+	return xfs_btree_check_sptr(cur,
+			be32_to_cpu(*(&ptr->s + index)), level);
+}
+#endif
+
+/*
* Delete the btree cursor.
*/
void
@@ -387,16 +255,17 @@ xfs_btree_dup_cursor(
tp = cur->bc_tp;
mp = cur->bc_mp;
+
/*
* Allocate a new cursor like the old one.
*/
- new = xfs_btree_init_cursor(mp, tp, cur->bc_private.a.agbp,
- cur->bc_private.a.agno, cur->bc_btnum, cur->bc_private.b.ip,
- cur->bc_private.b.whichfork);
+ new = cur->bc_ops->dup_cursor(cur);
+
/*
* Copy the record currently in the cursor.
*/
new->bc_rec = cur->bc_rec;
+
/*
* For each level current, re-get the buffer and copy the ptr value.
*/
@@ -416,46 +285,174 @@ xfs_btree_dup_cursor(
} else
new->bc_bufs[i] = NULL;
}
- /*
- * For bmap btrees, copy the firstblock, flist, and flags values,
- * since init cursor doesn't get them.
- */
- if (new->bc_btnum == XFS_BTNUM_BMAP) {
- new->bc_private.b.firstblock = cur->bc_private.b.firstblock;
- new->bc_private.b.flist = cur->bc_private.b.flist;
- new->bc_private.b.flags = cur->bc_private.b.flags;
- }
*ncur = new;
return 0;
}
/*
+ * XFS btree block layout and addressing:
+ *
+ * There are two types of blocks in the btree: leaf and non-leaf blocks.
+ *
+ * A leaf block starts with a header, followed by records containing
+ * the values. A non-leaf block also starts with the same header, which
+ * is followed first by lookup keys and then by an equal number of pointers
+ * to the btree blocks at the previous level.
+ *
+ * +--------+-------+-------+-------+-------+-------+-------+
+ * Leaf: | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
+ * +--------+-------+-------+-------+-------+-------+-------+
+ *
+ * +--------+-------+-------+-------+-------+-------+-------+
+ * Non-Leaf: | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
+ * +--------+-------+-------+-------+-------+-------+-------+
+ *
+ * The header is called struct xfs_btree_block for reasons better left unknown
+ * and comes in different versions for short (32bit) and long (64bit) block
+ * pointers. The record and key structures are defined by the btree instances
+ * and opaque to the btree core. The block pointers are simple disk endian
+ * integers, available in a short (32bit) and long (64bit) variant.
+ *
+ * The helpers below calculate the offset of a given record, key or pointer
+ * into a btree block (xfs_btree_*_offset) or return a pointer to the given
+ * record, key or pointer (xfs_btree_*_addr). Note that all addressing
+ * inside the btree block is done using indices starting at one, not zero!
+ */
+
+/*
+ * Size of the btree block header for this cursor's btree: the long form
+ * length when the btree uses long pointers, the short form length otherwise.
+ */
+static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
+{
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+		return XFS_BTREE_LBLOCK_LEN;
+
+	return XFS_BTREE_SBLOCK_LEN;
+}
+
+/*
+ * Size of one on-disk block pointer for this cursor's btree: a __be64 for
+ * long-pointer btrees, a __be32 otherwise.
+ */
+static inline size_t xfs_btree_ptr_len(struct xfs_btree_cur *cur)
+{
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+		return sizeof(__be64);
+
+	return sizeof(__be32);
+}
+
+/*
+ * Byte offset of record n (counted from one) from the start of a btree
+ * block: the block header plus the n - 1 records in front of it.
+ */
+STATIC size_t
+xfs_btree_rec_offset(
+	struct xfs_btree_cur	*cur,
+	int			n)
+{
+	size_t			hdr_len = xfs_btree_block_len(cur);
+
+	return hdr_len + (n - 1) * cur->bc_ops->rec_len;
+}
+
+/*
+ * Byte offset of key n (counted from one) from the start of a btree
+ * block: the block header plus the n - 1 keys in front of it.
+ */
+STATIC size_t
+xfs_btree_key_offset(
+	struct xfs_btree_cur	*cur,
+	int			n)
+{
+	size_t			hdr_len = xfs_btree_block_len(cur);
+
+	return hdr_len + (n - 1) * cur->bc_ops->key_len;
+}
+
+/*
+ * Byte offset of block pointer n (counted from one) from the start of a
+ * btree block at the given level.
+ */
+STATIC size_t
+xfs_btree_ptr_offset(
+	struct xfs_btree_cur	*cur,
+	int			n,
+	int			level)
+{
+	size_t			hdr_len = xfs_btree_block_len(cur);
+	size_t			keys_len;
+
+	/* The pointer array sits behind room for the maximum number of keys. */
+	keys_len = cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len;
+
+	return hdr_len + keys_len + (n - 1) * xfs_btree_ptr_len(cur);
+}
+
+/*
+ * Address of record n (counted from one) inside the given btree block.
+ */
+STATIC union xfs_btree_rec *
+xfs_btree_rec_addr(
+	struct xfs_btree_cur	*cur,
+	int			n,
+	struct xfs_btree_block	*block)
+{
+	char			*base = (char *)block;
+
+	return (union xfs_btree_rec *)(base + xfs_btree_rec_offset(cur, n));
+}
+
+/*
+ * Address of key n (counted from one) inside the given btree block.
+ */
+STATIC union xfs_btree_key *
+xfs_btree_key_addr(
+	struct xfs_btree_cur	*cur,
+	int			n,
+	struct xfs_btree_block	*block)
+{
+	char			*base = (char *)block;
+
+	return (union xfs_btree_key *)(base + xfs_btree_key_offset(cur, n));
+}
+
+/*
+ * Address of block pointer n (counted from one) inside the given btree
+ * block. The block's own level is used to locate the pointer array, and
+ * the block is asserted not to be at level zero.
+ */
+STATIC union xfs_btree_ptr *
+xfs_btree_ptr_addr(
+	struct xfs_btree_cur	*cur,
+	int			n,
+	struct xfs_btree_block	*block)
+{
+	char			*base = (char *)block;
+	int			level;
+
+	ASSERT(block->bb_level != 0);
+
+	level = xfs_btree_get_level(block);
+	return (union xfs_btree_ptr *)
+		(base + xfs_btree_ptr_offset(cur, n, level));
+}
+
+/*
+ * Return the root block that is stored in the inode.
+ *
+ * For now this btree implementation assumes the btree root is always
+ * stored in the if_broot field of an inode fork. The fork is looked up
+ * from the inode and fork selector kept in the cursor's bc_private.b.
+ */
+STATIC struct xfs_btree_block *
+xfs_btree_get_iroot(
+	struct xfs_btree_cur	*cur)
+{
+	int			whichfork = cur->bc_private.b.whichfork;
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(cur->bc_private.b.ip,
+						     whichfork);
+
+	return (struct xfs_btree_block *)ifp->if_broot;
+}
+
+/*
* Retrieve the block pointer from the cursor at the given level.
- * This may be a bmap btree root or from a buffer.
+ * This may be an inode btree root or from a buffer.
*/
-STATIC xfs_btree_block_t * /* generic btree block pointer */
+STATIC struct xfs_btree_block * /* generic btree block pointer */
xfs_btree_get_block(
- xfs_btree_cur_t *cur, /* btree cursor */
+ struct xfs_btree_cur *cur, /* btree cursor */
int level, /* level in btree */
- xfs_buf_t **bpp) /* buffer containing the block */
-{
- xfs_btree_block_t *block; /* return value */
- xfs_buf_t *bp; /* return buffer */
- xfs_ifork_t *ifp; /* inode fork pointer */
- int whichfork; /* data or attr fork */
-
- if (cur->bc_btnum == XFS_BTNUM_BMAP && level == cur->bc_nlevels - 1) {
- whichfork = cur->bc_private.b.whichfork;
- ifp = XFS_IFORK_PTR(cur->bc_private.b.ip, whichfork);
- block = (xfs_btree_block_t *)ifp->if_broot;
- bp = NULL;
- } else {
- bp = cur->bc_bufs[level];
- block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_buf **bpp) /* buffer containing the block */
+{
+ if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ (level == cur->bc_nlevels - 1)) {
+ *bpp = NULL;
+ return xfs_btree_get_iroot(cur);
}
- ASSERT(block != NULL);
- *bpp = bp;
- return block;
+
+ *bpp = cur->bc_bufs[level];
+ return XFS_BUF_TO_BLOCK(*bpp);
}
/*
@@ -505,97 +502,6 @@ xfs_btree_get_bufs(
}
/*
- * Allocate a new btree cursor.
- * The cursor is either for allocation (A) or bmap (B) or inodes (I).
- */
-xfs_btree_cur_t * /* new btree cursor */
-xfs_btree_init_cursor(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_buf_t *agbp, /* (A only) buffer for agf structure */
- /* (I only) buffer for agi structure */
- xfs_agnumber_t agno, /* (AI only) allocation group number */
- xfs_btnum_t btnum, /* btree identifier */
- xfs_inode_t *ip, /* (B only) inode owning the btree */
- int whichfork) /* (B only) data or attr fork */
-{
- xfs_agf_t *agf; /* (A) allocation group freespace */
- xfs_agi_t *agi; /* (I) allocation group inodespace */
- xfs_btree_cur_t *cur; /* return value */
- xfs_ifork_t *ifp; /* (I) inode fork pointer */
- int nlevels=0; /* number of levels in the btree */
-
- ASSERT(xfs_btree_cur_zone != NULL);
- /*
- * Allocate a new cursor.
- */
- cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
- /*
- * Deduce the number of btree levels from the arguments.
- */
- switch (btnum) {
- case XFS_BTNUM_BNO:
- case XFS_BTNUM_CNT:
- agf = XFS_BUF_TO_AGF(agbp);
- nlevels = be32_to_cpu(agf->agf_levels[btnum]);
- break;
- case XFS_BTNUM_BMAP:
- ifp = XFS_IFORK_PTR(ip, whichfork);
- nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1;
- break;
- case XFS_BTNUM_INO:
- agi = XFS_BUF_TO_AGI(agbp);
- nlevels = be32_to_cpu(agi->agi_level);
- break;
- default:
- ASSERT(0);
- }
- /*
- * Fill in the common fields.
- */
- cur->bc_tp = tp;
- cur->bc_mp = mp;
- cur->bc_nlevels = nlevels;
- cur->bc_btnum = btnum;
- cur->bc_blocklog = mp->m_sb.sb_blocklog;
- /*
- * Fill in private fields.
- */
- switch (btnum) {
- case XFS_BTNUM_BNO:
- case XFS_BTNUM_CNT:
- /*
- * Allocation btree fields.
- */
- cur->bc_private.a.agbp = agbp;
- cur->bc_private.a.agno = agno;
- break;
- case XFS_BTNUM_INO:
- /*
- * Inode allocation btree fields.
- */
- cur->bc_private.a.agbp = agbp;
- cur->bc_private.a.agno = agno;
- break;
- case XFS_BTNUM_BMAP:
- /*
- * Bmap btree fields.
- */
- cur->bc_private.b.forksize = XFS_IFORK_SIZE(ip, whichfork);
- cur->bc_private.b.ip = ip;
- cur->bc_private.b.firstblock = NULLFSBLOCK;
- cur->bc_private.b.flist = NULL;
- cur->bc_private.b.allocated = 0;
- cur->bc_private.b.flags = 0;
- cur->bc_private.b.whichfork = whichfork;
- break;
- default:
- ASSERT(0);
- }
- return cur;
-}
-
-/*
* Check for the cursor referring to the last block at the given level.
*/
int /* 1=is last block, 0=not last block */
@@ -603,12 +509,12 @@ xfs_btree_islastblock(
xfs_btree_cur_t *cur, /* btree cursor */
int level) /* level to check */
{
- xfs_btree_block_t *block; /* generic btree block pointer */
+ struct xfs_btree_block *block; /* generic btree block pointer */
xfs_buf_t *bp; /* buffer containing block */
block = xfs_btree_get_block(cur, level, &bp);
xfs_btree_check_block(cur, block, level, bp);
- if (XFS_BTREE_LONG_PTRS(cur->bc_btnum))
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
return be64_to_cpu(block->bb_u.l.bb_rightsib) == NULLDFSBNO;
else
return be32_to_cpu(block->bb_u.s.bb_rightsib) == NULLAGBLOCK;
@@ -618,12 +524,12 @@ xfs_btree_islastblock(
* Change the cursor to point to the first record at the given level.
* Other levels are unaffected.
*/
-int /* success=1, failure=0 */
+STATIC int /* success=1, failure=0 */
xfs_btree_firstrec(
xfs_btree_cur_t *cur, /* btree cursor */
int level) /* level to change */
{
- xfs_btree_block_t *block; /* generic btree block pointer */
+ struct xfs_btree_block *block; /* generic btree block pointer */
xfs_buf_t *bp; /* buffer containing block */
/*
@@ -634,7 +540,7 @@ xfs_btree_firstrec(
/*
* It's empty, there is no such record.
*/
- if (!block->bb_h.bb_numrecs)
+ if (!block->bb_numrecs)
return 0;
/*
* Set the ptr value to 1, that's the first record/key.
@@ -647,12 +553,12 @@ xfs_btree_firstrec(
* Change the cursor to point to the last record in the current block
* at the given level. Other levels are unaffected.
*/
-int /* success=1, failure=0 */
+STATIC int /* success=1, failure=0 */
xfs_btree_lastrec(
xfs_btree_cur_t *cur, /* btree cursor */
int level) /* level to change */
{
- xfs_btree_block_t *block; /* generic btree block pointer */
+ struct xfs_btree_block *block; /* generic btree block pointer */
xfs_buf_t *bp; /* buffer containing block */
/*
@@ -663,12 +569,12 @@ xfs_btree_lastrec(
/*
* It's empty, there is no such record.
*/
- if (!block->bb_h.bb_numrecs)
+ if (!block->bb_numrecs)
return 0;
/*
* Set the ptr value to numrecs, that's the last record/key.
*/
- cur->bc_ptrs[level] = be16_to_cpu(block->bb_h.bb_numrecs);
+ cur->bc_ptrs[level] = be16_to_cpu(block->bb_numrecs);
return 1;
}
@@ -817,66 +723,84 @@ xfs_btree_reada_bufs(
xfs_baread(mp->m_ddev_targp, d, mp->m_bsize * count);
}
+/*
+ * Issue readahead for the siblings of a long-pointer btree block.
+ *
+ * lr selects the siblings to read (XFS_BTCUR_LEFTRA and/or
+ * XFS_BTCUR_RIGHTRA); a sibling whose pointer is NULLDFSBNO is skipped.
+ * Returns the number of readahead requests issued (0, 1 or 2).
+ */
+STATIC int
+xfs_btree_readahead_lblock(
+	struct xfs_btree_cur	*cur,
+	int			lr,
+	struct xfs_btree_block	*block)
+{
+	int			rval = 0;
+	/*
+	 * Hold the siblings in xfs_dfsbno_t, the type xfs_btree_check_lptr()
+	 * uses for a 64 bit disk address that is compared with NULLDFSBNO.
+	 * NOTE(review): xfs_fsblock_t is presumably narrower than 64 bits on
+	 * some configurations, which would truncate the null value before
+	 * the comparison below - confirm against the type definitions.
+	 */
+	xfs_dfsbno_t		left = be64_to_cpu(block->bb_u.l.bb_leftsib);
+	xfs_dfsbno_t		right = be64_to_cpu(block->bb_u.l.bb_rightsib);
+
+	if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) {
+		xfs_btree_reada_bufl(cur->bc_mp, left, 1);
+		rval++;
+	}
+
+	if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) {
+		xfs_btree_reada_bufl(cur->bc_mp, right, 1);
+		rval++;
+	}
+
+	return rval;
+}
+
+STATIC int
+xfs_btree_readahead_sblock(
+	struct xfs_btree_cur	*cur,
+	int			lr,
+	struct xfs_btree_block	*block)
+{
+	xfs_agblock_t		lsib = be32_to_cpu(block->bb_u.s.bb_leftsib);
+	xfs_agblock_t		rsib = be32_to_cpu(block->bb_u.s.bb_rightsib);
+	int			nissued = 0;	/* readaheads started */
+
+	/* Left sibling: only if requested in lr and actually present. */
+	if (lsib != NULLAGBLOCK && (lr & XFS_BTCUR_LEFTRA)) {
+		xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
+				     lsib, 1);
+		nissued++;
+	}
+
+	/* Right sibling: same rule. */
+	if (rsib != NULLAGBLOCK && (lr & XFS_BTCUR_RIGHTRA)) {
+		xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
+				     rsib, 1);
+		nissued++;
+	}
+
+	return nissued;
+}
+
/*
* Read-ahead btree blocks, at the given level.
* Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
*/
-int
-xfs_btree_readahead_core(
- xfs_btree_cur_t *cur, /* btree cursor */
+STATIC int
+xfs_btree_readahead(
+ struct xfs_btree_cur *cur, /* btree cursor */
int lev, /* level in btree */
int lr) /* left/right bits */
{
- xfs_alloc_block_t *a;
- xfs_bmbt_block_t *b;
- xfs_inobt_block_t *i;
- int rval = 0;
+ struct xfs_btree_block *block;
+
+ /*
+ * No readahead needed if we are at the root level and the
+ * btree root is stored in the inode.
+ */
+ if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ (lev == cur->bc_nlevels - 1))
+ return 0;
+
+ if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
+ return 0;
- ASSERT(cur->bc_bufs[lev] != NULL);
cur->bc_ra[lev] |= lr;
- switch (cur->bc_btnum) {
- case XFS_BTNUM_BNO:
- case XFS_BTNUM_CNT:
- a = XFS_BUF_TO_ALLOC_BLOCK(cur->bc_bufs[lev]);
- if ((lr & XFS_BTCUR_LEFTRA) && be32_to_cpu(a->bb_leftsib) != NULLAGBLOCK) {
- xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
- be32_to_cpu(a->bb_leftsib), 1);
- rval++;
- }
- if ((lr & XFS_BTCUR_RIGHTRA) && be32_to_cpu(a->bb_rightsib) != NULLAGBLOCK) {
- xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
- be32_to_cpu(a->bb_rightsib), 1);
- rval++;
- }
- break;
- case XFS_BTNUM_BMAP:
- b = XFS_BUF_TO_BMBT_BLOCK(cur->bc_bufs[lev]);
- if ((lr & XFS_BTCUR_LEFTRA) && be64_to_cpu(b->bb_leftsib) != NULLDFSBNO) {
- xfs_btree_reada_bufl(cur->bc_mp, be64_to_cpu(b->bb_leftsib), 1);
- rval++;
- }
- if ((lr & XFS_BTCUR_RIGHTRA) && be64_to_cpu(b->bb_rightsib) != NULLDFSBNO) {
- xfs_btree_reada_bufl(cur->bc_mp, be64_to_cpu(b->bb_rightsib), 1);
- rval++;
- }
- break;
- case XFS_BTNUM_INO:
- i = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]);
- if ((lr & XFS_BTCUR_LEFTRA) && be32_to_cpu(i->bb_leftsib) != NULLAGBLOCK) {
- xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
- be32_to_cpu(i->bb_leftsib), 1);
- rval++;
- }
- if ((lr & XFS_BTCUR_RIGHTRA) && be32_to_cpu(i->bb_rightsib) != NULLAGBLOCK) {
- xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
- be32_to_cpu(i->bb_rightsib), 1);
- rval++;
- }
- break;
- default:
- ASSERT(0);
- }
- return rval;
+ block = XFS_BUF_TO_BLOCK(cur->bc_bufs[lev]);
+
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ return xfs_btree_readahead_lblock(cur, lr, block);
+ return xfs_btree_readahead_sblock(cur, lr, block);
}
/*
@@ -889,7 +813,7 @@ xfs_btree_setbuf(
int lev, /* level in btree */
xfs_buf_t *bp) /* new buffer to set */
{
- xfs_btree_block_t *b; /* btree block */
+ struct xfs_btree_block *b; /* btree block */
xfs_buf_t *obp; /* old buffer pointer */
obp = cur->bc_bufs[lev];
@@ -900,7 +824,7 @@ xfs_btree_setbuf(
if (!bp)
return;
b = XFS_BUF_TO_BLOCK(bp);
- if (XFS_BTREE_LONG_PTRS(cur->bc_btnum)) {
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
if (be64_to_cpu(b->bb_u.l.bb_leftsib) == NULLDFSBNO)
cur->bc_ra[lev] |= XFS_BTCUR_LEFTRA;
if (be64_to_cpu(b->bb_u.l.bb_rightsib) == NULLDFSBNO)
@@ -912,3 +836,2855 @@ xfs_btree_setbuf(
cur->bc_ra[lev] |= XFS_BTCUR_RIGHTRA;
}
}
+
+STATIC int
+xfs_btree_ptr_is_null(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ return be64_to_cpu(ptr->l) == NULLFSBLOCK;
+ else
+ return be32_to_cpu(ptr->s) == NULLAGBLOCK;
+}
+
+STATIC void
+xfs_btree_set_ptr_null(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ ptr->l = cpu_to_be64(NULLFSBLOCK);
+ else
+ ptr->s = cpu_to_be32(NULLAGBLOCK);
+}
+
+/*
+ * Get/set/init sibling pointers
+ */
+STATIC void
+xfs_btree_get_sibling(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ union xfs_btree_ptr *ptr,
+ int lr)
+{
+ ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
+
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (lr == XFS_BB_RIGHTSIB)
+ ptr->l = block->bb_u.l.bb_rightsib;
+ else
+ ptr->l = block->bb_u.l.bb_leftsib;
+ } else {
+ if (lr == XFS_BB_RIGHTSIB)
+ ptr->s = block->bb_u.s.bb_rightsib;
+ else
+ ptr->s = block->bb_u.s.bb_leftsib;
+ }
+}
+
+STATIC void
+xfs_btree_set_sibling(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ union xfs_btree_ptr *ptr,
+ int lr)
+{
+ ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
+
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (lr == XFS_BB_RIGHTSIB)
+ block->bb_u.l.bb_rightsib = ptr->l;
+ else
+ block->bb_u.l.bb_leftsib = ptr->l;
+ } else {
+ if (lr == XFS_BB_RIGHTSIB)
+ block->bb_u.s.bb_rightsib = ptr->s;
+ else
+ block->bb_u.s.bb_leftsib = ptr->s;
+ }
+}
+
+STATIC void
+xfs_btree_init_block(
+ struct xfs_btree_cur *cur,
+ int level,
+ int numrecs,
+ struct xfs_btree_block *new) /* new block */
+{
+ new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
+ new->bb_level = cpu_to_be16(level);
+ new->bb_numrecs = cpu_to_be16(numrecs);
+
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ new->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
+ new->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
+ } else {
+ new->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
+ new->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
+ }
+}
+
+/*
+ * Return true if ptr is the last record in the btree and
+ * we need to track updates to this record. The decision
+ * will be further refined in the update_lastrec method.
+ */
+STATIC int
+xfs_btree_is_lastrec(
+ struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ int level)
+{
+ union xfs_btree_ptr ptr;
+
+ if (level > 0)
+ return 0;
+ if (!(cur->bc_flags & XFS_BTREE_LASTREC_UPDATE))
+ return 0;
+
+ xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+ if (!xfs_btree_ptr_is_null(cur, &ptr))
+ return 0;
+ return 1;
+}
+
+STATIC void
+xfs_btree_buf_to_ptr(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp,
+ union xfs_btree_ptr *ptr)
+{
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+ ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
+ XFS_BUF_ADDR(bp)));
+ else {
+ ptr->s = cpu_to_be32(XFS_DADDR_TO_AGBNO(cur->bc_mp,
+ XFS_BUF_ADDR(bp)));
+ }
+}
+
+STATIC xfs_daddr_t
+xfs_btree_ptr_to_daddr(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
+{
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ ASSERT(be64_to_cpu(ptr->l) != NULLFSBLOCK);
+
+ return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
+ } else {
+ ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
+ ASSERT(be32_to_cpu(ptr->s) != NULLAGBLOCK);
+
+ return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
+ be32_to_cpu(ptr->s));
+ }
+}
+
+STATIC void
+xfs_btree_set_refs(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
+{
+ switch (cur->bc_btnum) {
+ case XFS_BTNUM_BNO:
+ case XFS_BTNUM_CNT:
+ XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_ALLOC_BTREE_REF);
+ break;
+ case XFS_BTNUM_INO:
+ XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_INOMAP, XFS_INO_BTREE_REF);
+ break;
+ case XFS_BTNUM_BMAP:
+ XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_MAP, XFS_BMAP_BTREE_REF);
+ break;
+ default:
+ ASSERT(0);
+ }
+}
+
+STATIC int
+xfs_btree_get_buf_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ int flags,
+ struct xfs_btree_block **block,
+ struct xfs_buf **bpp)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_daddr_t d;
+
+ /* need to sort out how callers deal with failures first */
+ ASSERT(!(flags & XFS_BUF_TRYLOCK));
+
+ d = xfs_btree_ptr_to_daddr(cur, ptr);
+ *bpp = xfs_trans_get_buf(cur->bc_tp, mp->m_ddev_targp, d,
+ mp->m_bsize, flags);
+
+ ASSERT(*bpp);
+ ASSERT(!XFS_BUF_GETERROR(*bpp));
+
+ *block = XFS_BUF_TO_BLOCK(*bpp);
+ return 0;
+}
+
+/*
+ * Read in the buffer at the given ptr and return the buffer and
+ * the block pointer within the buffer.
+ */
+STATIC int
+xfs_btree_read_buf_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ int level,
+ int flags,
+ struct xfs_btree_block **block,
+ struct xfs_buf **bpp)
+{
+ struct xfs_mount *mp = cur->bc_mp;
+ xfs_daddr_t d;
+ int error;
+
+ /* need to sort out how callers deal with failures first */
+ ASSERT(!(flags & XFS_BUF_TRYLOCK));
+
+ d = xfs_btree_ptr_to_daddr(cur, ptr);
+ error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
+ mp->m_bsize, flags, bpp);
+ if (error)
+ return error;
+
+ ASSERT(*bpp != NULL);
+ ASSERT(!XFS_BUF_GETERROR(*bpp));
+
+ xfs_btree_set_refs(cur, *bpp);
+ *block = XFS_BUF_TO_BLOCK(*bpp);
+
+ error = xfs_btree_check_block(cur, *block, level, *bpp);
+ if (error)
+ xfs_trans_brelse(cur->bc_tp, *bpp);
+ return error;
+}
+
+/*
+ * Copy keys from one btree block to another.
+ */
+STATIC void
+xfs_btree_copy_keys(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *dst_key,
+ union xfs_btree_key *src_key,
+ int numkeys)
+{
+ ASSERT(numkeys >= 0);
+ memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
+}
+
+/*
+ * Copy records from one btree block to another.
+ */
+STATIC void
+xfs_btree_copy_recs(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *dst_rec,
+ union xfs_btree_rec *src_rec,
+ int numrecs)
+{
+ ASSERT(numrecs >= 0);
+ memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
+}
+
+/*
+ * Copy block pointers from one btree block to another.
+ */
+STATIC void
+xfs_btree_copy_ptrs(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *dst_ptr,
+ union xfs_btree_ptr *src_ptr,
+ int numptrs)
+{
+ ASSERT(numptrs >= 0);
+ memcpy(dst_ptr, src_ptr, numptrs * xfs_btree_ptr_len(cur));
+}
+
+/*
+ * Shift keys one index left/right inside a single btree block.
+ */
+STATIC void
+xfs_btree_shift_keys(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *key,
+ int dir,
+ int numkeys)
+{
+ char *dst_key;
+
+ ASSERT(numkeys >= 0);
+ ASSERT(dir == 1 || dir == -1);
+
+ dst_key = (char *)key + (dir * cur->bc_ops->key_len);
+ memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
+}
+
+/*
+ * Shift records one index left/right inside a single btree block.
+ */
+STATIC void
+xfs_btree_shift_recs(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec,
+ int dir,
+ int numrecs)
+{
+ char *dst_rec;
+
+ ASSERT(numrecs >= 0);
+ ASSERT(dir == 1 || dir == -1);
+
+ dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
+ memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
+}
+
+/*
+ * Shift block pointers one index left/right inside a single btree block.
+ */
+STATIC void
+xfs_btree_shift_ptrs(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr,
+ int dir,
+ int numptrs)
+{
+ char *dst_ptr;
+
+ ASSERT(numptrs >= 0);
+ ASSERT(dir == 1 || dir == -1);
+
+ dst_ptr = (char *)ptr + (dir * xfs_btree_ptr_len(cur));
+ memmove(dst_ptr, ptr, numptrs * xfs_btree_ptr_len(cur));
+}
+
+/*
+ * Log key values from the btree block.
+ */
+STATIC void
+xfs_btree_log_keys(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp,
+ int first,
+ int last)
+{
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
+
+ if (bp) {
+ xfs_trans_log_buf(cur->bc_tp, bp,
+ xfs_btree_key_offset(cur, first),
+ xfs_btree_key_offset(cur, last + 1) - 1);
+ } else {
+ xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
+ xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+ }
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+}
+
+/*
+ * Log record values from the btree block.
+ */
+void
+xfs_btree_log_recs(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp,
+ int first,
+ int last)
+{
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
+
+ xfs_trans_log_buf(cur->bc_tp, bp,
+ xfs_btree_rec_offset(cur, first),
+ xfs_btree_rec_offset(cur, last + 1) - 1);
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+}
+
+/*
+ * Log block pointer fields from a btree block (nonleaf).
+ */
+STATIC void
+xfs_btree_log_ptrs(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ struct xfs_buf *bp, /* buffer containing btree block */
+ int first, /* index of first pointer to log */
+ int last) /* index of last pointer to log */
+{
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGBII(cur, bp, first, last);
+
+ if (bp) {
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ int level = xfs_btree_get_level(block);
+
+ xfs_trans_log_buf(cur->bc_tp, bp,
+ xfs_btree_ptr_offset(cur, first, level),
+ xfs_btree_ptr_offset(cur, last + 1, level) - 1);
+ } else {
+ xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
+ xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+ }
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+}
+
+/*
+ * Log fields from a btree block header.
+ */
+void
+xfs_btree_log_block(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ struct xfs_buf *bp, /* buffer containing btree block */
+ int fields) /* mask of fields: XFS_BB_... */
+{
+ int first; /* first byte offset logged */
+ int last; /* last byte offset logged */
+ static const short soffsets[] = { /* table of offsets (short) */
+ offsetof(struct xfs_btree_block, bb_magic),
+ offsetof(struct xfs_btree_block, bb_level),
+ offsetof(struct xfs_btree_block, bb_numrecs),
+ offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
+ offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
+ XFS_BTREE_SBLOCK_LEN
+ };
+ static const short loffsets[] = { /* table of offsets (long) */
+ offsetof(struct xfs_btree_block, bb_magic),
+ offsetof(struct xfs_btree_block, bb_level),
+ offsetof(struct xfs_btree_block, bb_numrecs),
+ offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
+ offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
+ XFS_BTREE_LBLOCK_LEN
+ };
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGBI(cur, bp, fields);
+
+ if (bp) {
+ xfs_btree_offsets(fields,
+ (cur->bc_flags & XFS_BTREE_LONG_PTRS) ?
+ loffsets : soffsets,
+ XFS_BB_NUM_BITS, &first, &last);
+ xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+ } else {
+ xfs_trans_log_inode(cur->bc_tp, cur->bc_private.b.ip,
+ xfs_ilog_fbroot(cur->bc_private.b.whichfork));
+ }
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+}
+
+/*
+ * Increment cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int /* error */
+xfs_btree_increment(
+ struct xfs_btree_cur *cur,
+ int level,
+ int *stat) /* success/failure */
+{
+ struct xfs_btree_block *block;
+ union xfs_btree_ptr ptr;
+ struct xfs_buf *bp;
+ int error; /* error return value */
+ int lev;
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGI(cur, level);
+
+ ASSERT(level < cur->bc_nlevels);
+
+ /* Read-ahead to the right at this level. */
+ xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
+
+ /* Get a pointer to the btree block. */
+ block = xfs_btree_get_block(cur, level, &bp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error)
+ goto error0;
+#endif
+
+ /* We're done if we remain in the block after the increment. */
+ if (++cur->bc_ptrs[level] <= xfs_btree_get_numrecs(block))
+ goto out1;
+
+ /* Fail if we just went off the right edge of the tree. */
+ xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+ if (xfs_btree_ptr_is_null(cur, &ptr))
+ goto out0;
+
+ XFS_BTREE_STATS_INC(cur, increment);
+
+ /*
+ * March up the tree incrementing pointers.
+ * Stop when we don't go off the right edge of a block.
+ */
+ for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
+ block = xfs_btree_get_block(cur, lev, &bp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, lev, bp);
+ if (error)
+ goto error0;
+#endif
+
+ if (++cur->bc_ptrs[lev] <= xfs_btree_get_numrecs(block))
+ break;
+
+ /* Read-ahead the right block for the next loop. */
+ xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
+ }
+
+ /*
+ * If we went off the root then we are either seriously
+ * confused or have the tree root in an inode.
+ */
+ if (lev == cur->bc_nlevels) {
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ goto out0;
+ ASSERT(0);
+ error = EFSCORRUPTED;
+ goto error0;
+ }
+ ASSERT(lev < cur->bc_nlevels);
+
+ /*
+ * Now walk back down the tree, fixing up the cursor's buffer
+ * pointers and key numbers.
+ */
+ for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
+ union xfs_btree_ptr *ptrp;
+
+ ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
+ error = xfs_btree_read_buf_block(cur, ptrp, --lev,
+ 0, &block, &bp);
+ if (error)
+ goto error0;
+
+ xfs_btree_setbuf(cur, lev, bp);
+ cur->bc_ptrs[lev] = 1;
+ }
+out1:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 1;
+ return 0;
+
+out0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 0;
+ return 0;
+
+error0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+}
+
+/*
+ * Decrement cursor by one record at the level.
+ * For nonzero levels the leaf-ward information is untouched.
+ */
+int /* error */
+xfs_btree_decrement(
+ struct xfs_btree_cur *cur,
+ int level,
+ int *stat) /* success/failure */
+{
+ struct xfs_btree_block *block;
+ xfs_buf_t *bp;
+ int error; /* error return value */
+ int lev;
+ union xfs_btree_ptr ptr;
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGI(cur, level);
+
+ ASSERT(level < cur->bc_nlevels);
+
+ /* Read-ahead to the left at this level. */
+ xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
+
+ /* We're done if we remain in the block after the decrement. */
+ if (--cur->bc_ptrs[level] > 0)
+ goto out1;
+
+ /* Get a pointer to the btree block. */
+ block = xfs_btree_get_block(cur, level, &bp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error)
+ goto error0;
+#endif
+
+ /* Fail if we just went off the left edge of the tree. */
+ xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
+ if (xfs_btree_ptr_is_null(cur, &ptr))
+ goto out0;
+
+ XFS_BTREE_STATS_INC(cur, decrement);
+
+ /*
+ * March up the tree decrementing pointers.
+ * Stop when we don't go off the left edge of a block.
+ */
+ for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
+ if (--cur->bc_ptrs[lev] > 0)
+ break;
+ /* Read-ahead the left block for the next loop. */
+ xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
+ }
+
+ /*
+ * If we went off the root then we are either seriously
+ * confused or the root of the tree is in an inode.
+ */
+ if (lev == cur->bc_nlevels) {
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
+ goto out0;
+ ASSERT(0);
+ error = EFSCORRUPTED;
+ goto error0;
+ }
+ ASSERT(lev < cur->bc_nlevels);
+
+ /*
+ * Now walk back down the tree, fixing up the cursor's buffer
+ * pointers and key numbers.
+ */
+ for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
+ union xfs_btree_ptr *ptrp;
+
+ ptrp = xfs_btree_ptr_addr(cur, cur->bc_ptrs[lev], block);
+ error = xfs_btree_read_buf_block(cur, ptrp, --lev,
+ 0, &block, &bp);
+ if (error)
+ goto error0;
+ xfs_btree_setbuf(cur, lev, bp);
+ cur->bc_ptrs[lev] = xfs_btree_get_numrecs(block);
+ }
+out1:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 1;
+ return 0;
+
+out0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 0;
+ return 0;
+
+error0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+}
+
+STATIC int
+xfs_btree_lookup_get_block(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int level, /* level in the btree */
+ union xfs_btree_ptr *pp, /* ptr to btree block */
+ struct xfs_btree_block **blkp) /* return btree block */
+{
+ struct xfs_buf *bp; /* buffer pointer for btree block */
+ int error = 0;
+
+ /* special case the root block if in an inode */
+ if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ (level == cur->bc_nlevels - 1)) {
+ *blkp = xfs_btree_get_iroot(cur);
+ return 0;
+ }
+
+ /*
+ * If the old buffer at this level is for the disk address we are
+ * looking for, re-use it.
+ *
+ * Otherwise throw it away and get a new one.
+ */
+ bp = cur->bc_bufs[level];
+ if (bp && XFS_BUF_ADDR(bp) == xfs_btree_ptr_to_daddr(cur, pp)) {
+ *blkp = XFS_BUF_TO_BLOCK(bp);
+ return 0;
+ }
+
+ error = xfs_btree_read_buf_block(cur, pp, level, 0, blkp, &bp);
+ if (error)
+ return error;
+
+ xfs_btree_setbuf(cur, level, bp);
+ return 0;
+}
+
+/*
+ * Get current search key. For level 0 we don't actually have a key
+ * structure so we make one up from the record. For all other levels
+ * we just return the right key.
+ */
+STATIC union xfs_btree_key *
+xfs_lookup_get_search_key(
+ struct xfs_btree_cur *cur,
+ int level,
+ int keyno,
+ struct xfs_btree_block *block,
+ union xfs_btree_key *kp)
+{
+ if (level == 0) {
+ cur->bc_ops->init_key_from_rec(kp,
+ xfs_btree_rec_addr(cur, keyno, block));
+ return kp;
+ }
+
+ return xfs_btree_key_addr(cur, keyno, block);
+}
+
+/*
+ * Look up the record. The cursor is made to point to it, based on dir.
+ * Sets *stat to 0 if no such record can be found, 1 for success.
+ */
+int /* error */
+xfs_btree_lookup(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_lookup_t dir, /* <=, ==, or >= */
+ int *stat) /* success/failure */
+{
+ struct xfs_btree_block *block; /* current btree block */
+ __int64_t diff; /* difference for the current key */
+ int error; /* error return value */
+ int keyno; /* current key number */
+ int level; /* level in the btree */
+ union xfs_btree_ptr *pp; /* ptr to btree block */
+ union xfs_btree_ptr ptr; /* ptr to btree block */
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGI(cur, dir);
+
+ XFS_BTREE_STATS_INC(cur, lookup);
+
+ block = NULL;
+ keyno = 0;
+
+ /* initialise start pointer from cursor */
+ cur->bc_ops->init_ptr_from_cur(cur, &ptr);
+ pp = &ptr;
+
+ /*
+ * Iterate over each level in the btree, starting at the root.
+ * For each level above the leaves, find the key we need, based
+ * on the lookup record, then follow the corresponding block
+ * pointer down to the next level.
+ */
+ for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
+ /* Get the block we need to do the lookup on. */
+ error = xfs_btree_lookup_get_block(cur, level, pp, &block);
+ if (error)
+ goto error0;
+
+ if (diff == 0) {
+ /*
+ * If we already had a key match at a higher level, we
+ * know we need to use the first entry in this block.
+ */
+ keyno = 1;
+ } else {
+ /* Otherwise search this block. Do a binary search. */
+
+ int high; /* high entry number */
+ int low; /* low entry number */
+
+ /* Set low and high entry numbers, 1-based. */
+ low = 1;
+ high = xfs_btree_get_numrecs(block);
+ if (!high) {
+ /* Block is empty, must be an empty leaf. */
+ ASSERT(level == 0 && cur->bc_nlevels == 1);
+
+ cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 0;
+ return 0;
+ }
+
+ /* Binary search the block. */
+ while (low <= high) {
+ union xfs_btree_key key;
+ union xfs_btree_key *kp;
+
+ XFS_BTREE_STATS_INC(cur, compare);
+
+ /* keyno is average of low and high. */
+ keyno = (low + high) >> 1;
+
+ /* Get current search key */
+ kp = xfs_lookup_get_search_key(cur, level,
+ keyno, block, &key);
+
+ /*
+ * Compute difference to get next direction:
+ * - less than, move right
+ * - greater than, move left
+ * - equal, we're done
+ */
+ diff = cur->bc_ops->key_diff(cur, kp);
+ if (diff < 0)
+ low = keyno + 1;
+ else if (diff > 0)
+ high = keyno - 1;
+ else
+ break;
+ }
+ }
+
+ /*
+ * If there are more levels, set up for the next level
+ * by getting the block number and filling in the cursor.
+ */
+ if (level > 0) {
+ /*
+ * If we moved left, need the previous key number,
+ * unless there isn't one.
+ */
+ if (diff > 0 && --keyno < 1)
+ keyno = 1;
+ pp = xfs_btree_ptr_addr(cur, keyno, block);
+
+#ifdef DEBUG
+ error = xfs_btree_check_ptr(cur, pp, 0, level);
+ if (error)
+ goto error0;
+#endif
+ cur->bc_ptrs[level] = keyno;
+ }
+ }
+
+ /* Done with the search. See if we need to adjust the results. */
+ if (dir != XFS_LOOKUP_LE && diff < 0) {
+ keyno++;
+ /*
+ * If ge search and we went off the end of the block, but it's
+ * not the last block, we're in the wrong block.
+ */
+ xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+ if (dir == XFS_LOOKUP_GE &&
+ keyno > xfs_btree_get_numrecs(block) &&
+ !xfs_btree_ptr_is_null(cur, &ptr)) {
+ int i;
+
+ cur->bc_ptrs[0] = keyno;
+ error = xfs_btree_increment(cur, 0, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_RETURN(i == 1);
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 1;
+ return 0;
+ }
+ } else if (dir == XFS_LOOKUP_LE && diff > 0)
+ keyno--;
+ cur->bc_ptrs[0] = keyno;
+
+ /* Return if we succeeded or not. */
+ if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
+ *stat = 0;
+ else if (dir != XFS_LOOKUP_EQ || diff == 0)
+ *stat = 1;
+ else
+ *stat = 0;
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ return 0;
+
+error0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+}
+
+/*
+ * Update keys at all levels from here to the root along the cursor's path.
+ */
+STATIC int
+xfs_btree_updkey(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *keyp,
+ int level)
+{
+ struct xfs_btree_block *block;
+ struct xfs_buf *bp;
+ union xfs_btree_key *kp;
+ int ptr;
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGIK(cur, level, keyp);
+
+ ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) || level >= 1);
+
+ /*
+ * Go up the tree from this level toward the root.
+ * At each level, update the key value to the value input.
+ * Stop when we reach a level where the cursor isn't pointing
+ * at the first entry in the block.
+ */
+ for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
+#ifdef DEBUG
+ int error;
+#endif
+ block = xfs_btree_get_block(cur, level, &bp);
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+ }
+#endif
+ ptr = cur->bc_ptrs[level];
+ kp = xfs_btree_key_addr(cur, ptr, block);
+ xfs_btree_copy_keys(cur, kp, keyp, 1);
+ xfs_btree_log_keys(cur, bp, ptr, ptr);
+ }
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ return 0;
+}
+
+/*
+ * Update the record referred to by cur to the value in the
+ * given record. This either works (return 0) or gets an
+ * EFSCORRUPTED error.
+ */
+int
+xfs_btree_update(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec)
+{
+ struct xfs_btree_block *block;
+ struct xfs_buf *bp;
+ int error;
+ int ptr;
+ union xfs_btree_rec *rp;
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGR(cur, rec);
+
+ /* Pick up the current block. */
+ block = xfs_btree_get_block(cur, 0, &bp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, 0, bp);
+ if (error)
+ goto error0;
+#endif
+ /* Get the address of the rec to be updated. */
+ ptr = cur->bc_ptrs[0];
+ rp = xfs_btree_rec_addr(cur, ptr, block);
+
+ /* Fill in the new contents and log them. */
+ xfs_btree_copy_recs(cur, rp, rec, 1);
+ xfs_btree_log_recs(cur, bp, ptr, ptr);
+
+ /*
+ * If we are tracking the last record in the tree and
+ * we are at the far right edge of the tree, update it.
+ */
+ if (xfs_btree_is_lastrec(cur, block, 0)) {
+ cur->bc_ops->update_lastrec(cur, block, rec,
+ ptr, LASTREC_UPDATE);
+ }
+
+ /* Updating first rec in leaf. Pass new key value up to our parent. */
+ if (ptr == 1) {
+ union xfs_btree_key key;
+
+ cur->bc_ops->init_key_from_rec(&key, rec);
+ error = xfs_btree_updkey(cur, &key, 1);
+ if (error)
+ goto error0;
+ }
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ return 0;
+
+error0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+}
+
+/*
+ * Move 1 record left from cur/level if possible.
+ * Update cur to reflect the new path.
+ */
+STATIC int /* error */
+xfs_btree_lshift(
+ struct xfs_btree_cur *cur,
+ int level,
+ int *stat) /* success/failure */
+{
+ union xfs_btree_key key; /* btree key */
+ struct xfs_buf *lbp; /* left buffer pointer */
+ struct xfs_btree_block *left; /* left btree block */
+ int lrecs; /* left record count */
+ struct xfs_buf *rbp; /* right buffer pointer */
+ struct xfs_btree_block *right; /* right btree block */
+ int rrecs; /* right record count */
+ union xfs_btree_ptr lptr; /* left btree pointer */
+ union xfs_btree_key *rkp = NULL; /* right btree key */
+ union xfs_btree_ptr *rpp = NULL; /* right address pointer */
+ union xfs_btree_rec *rrp = NULL; /* right record pointer */
+ int error; /* error return value */
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGI(cur, level);
+
+ if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ level == cur->bc_nlevels - 1)
+ goto out0;
+
+ /* Set up variables for this block as "right". */
+ right = xfs_btree_get_block(cur, level, &rbp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, right, level, rbp);
+ if (error)
+ goto error0;
+#endif
+
+ /* If we've got no left sibling then we can't shift an entry left. */
+ xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
+ if (xfs_btree_ptr_is_null(cur, &lptr))
+ goto out0;
+
+ /*
+ * If the cursor entry is the one that would be moved, don't
+ * do it... it's too complicated.
+ */
+ if (cur->bc_ptrs[level] <= 1)
+ goto out0;
+
+ /* Set up the left neighbor as "left". */
+ error = xfs_btree_read_buf_block(cur, &lptr, level, 0, &left, &lbp);
+ if (error)
+ goto error0;
+
+ /* If it's full, it can't take another entry. */
+ lrecs = xfs_btree_get_numrecs(left);
+ if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
+ goto out0;
+
+ rrecs = xfs_btree_get_numrecs(right);
+
+ /*
+ * We add one entry to the left side and remove one for the right side.
+ * Account for it here, the changes will be updated on disk and logged
+ * later.
+ */
+ lrecs++;
+ rrecs--;
+
+ XFS_BTREE_STATS_INC(cur, lshift);
+ XFS_BTREE_STATS_ADD(cur, moves, 1);
+
+ /*
+ * If non-leaf, copy a key and a ptr to the left block.
+ * Log the changes to the left block.
+ */
+ if (level > 0) {
+ /* It's a non-leaf. Move keys and pointers. */
+ union xfs_btree_key *lkp; /* left btree key */
+ union xfs_btree_ptr *lpp; /* left address pointer */
+
+ lkp = xfs_btree_key_addr(cur, lrecs, left);
+ rkp = xfs_btree_key_addr(cur, 1, right);
+
+ lpp = xfs_btree_ptr_addr(cur, lrecs, left);
+ rpp = xfs_btree_ptr_addr(cur, 1, right);
+#ifdef DEBUG
+ error = xfs_btree_check_ptr(cur, rpp, 0, level);
+ if (error)
+ goto error0;
+#endif
+ xfs_btree_copy_keys(cur, lkp, rkp, 1);
+ xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
+
+ xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
+ xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
+
+ ASSERT(cur->bc_ops->keys_inorder(cur,
+ xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
+ } else {
+ /* It's a leaf. Move records. */
+ union xfs_btree_rec *lrp; /* left record pointer */
+
+ lrp = xfs_btree_rec_addr(cur, lrecs, left);
+ rrp = xfs_btree_rec_addr(cur, 1, right);
+
+ xfs_btree_copy_recs(cur, lrp, rrp, 1);
+ xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
+
+ ASSERT(cur->bc_ops->recs_inorder(cur,
+ xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
+ }
+
+ xfs_btree_set_numrecs(left, lrecs);
+ xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
+
+ xfs_btree_set_numrecs(right, rrecs);
+ xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
+
+ /*
+ * Slide the contents of right down one entry.
+ */
+ XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
+ if (level > 0) {
+ /* It's a nonleaf. operate on keys and ptrs */
+#ifdef DEBUG
+ int i; /* loop index */
+
+ for (i = 0; i < rrecs; i++) {
+ error = xfs_btree_check_ptr(cur, rpp, i + 1, level);
+ if (error)
+ goto error0;
+ }
+#endif
+ xfs_btree_shift_keys(cur,
+ xfs_btree_key_addr(cur, 2, right),
+ -1, rrecs);
+ xfs_btree_shift_ptrs(cur,
+ xfs_btree_ptr_addr(cur, 2, right),
+ -1, rrecs);
+
+ xfs_btree_log_keys(cur, rbp, 1, rrecs);
+ xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
+ } else {
+ /* It's a leaf. operate on records */
+ xfs_btree_shift_recs(cur,
+ xfs_btree_rec_addr(cur, 2, right),
+ -1, rrecs);
+ xfs_btree_log_recs(cur, rbp, 1, rrecs);
+
+ /*
+ * If it's the first record in the block, we'll need a key
+ * structure to pass up to the next level (updkey).
+ */
+ cur->bc_ops->init_key_from_rec(&key,
+ xfs_btree_rec_addr(cur, 1, right));
+ rkp = &key;
+ }
+
+ /* Update the parent key values of right. */
+ error = xfs_btree_updkey(cur, rkp, level + 1);
+ if (error)
+ goto error0;
+
+ /* Slide the cursor value left one. */
+ cur->bc_ptrs[level]--;
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 1;
+ return 0;
+
+out0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 0;
+ return 0;
+
+error0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+}
+
+/*
+ * Move 1 record right from cur/level if possible.
+ * Update cur to reflect the new path.
+ */
+STATIC int /* error */
+xfs_btree_rshift(
+ struct xfs_btree_cur *cur,
+ int level,
+ int *stat) /* success/failure */
+{
+ union xfs_btree_key key; /* btree key */
+ struct xfs_buf *lbp; /* left buffer pointer */
+ struct xfs_btree_block *left; /* left btree block */
+ struct xfs_buf *rbp; /* right buffer pointer */
+ struct xfs_btree_block *right; /* right btree block */
+ struct xfs_btree_cur *tcur; /* temporary btree cursor */
+ union xfs_btree_ptr rptr; /* right block pointer */
+ union xfs_btree_key *rkp; /* right btree key */
+ int rrecs; /* right record count */
+ int lrecs; /* left record count */
+ int error; /* error return value */
+ int i; /* loop counter */
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGI(cur, level);
+
+ if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+ (level == cur->bc_nlevels - 1))
+ goto out0;
+
+ /* Set up variables for this block as "left". */
+ left = xfs_btree_get_block(cur, level, &lbp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, left, level, lbp);
+ if (error)
+ goto error0;
+#endif
+
+ /* If we've got no right sibling then we can't shift an entry right. */
+ xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
+ if (xfs_btree_ptr_is_null(cur, &rptr))
+ goto out0;
+
+ /*
+ * If the cursor entry is the one that would be moved, don't
+ * do it... it's too complicated.
+ */
+ lrecs = xfs_btree_get_numrecs(left);
+ if (cur->bc_ptrs[level] >= lrecs)
+ goto out0;
+
+ /* Set up the right neighbor as "right". */
+ error = xfs_btree_read_buf_block(cur, &rptr, level, 0, &right, &rbp);
+ if (error)
+ goto error0;
+
+ /* If it's full, it can't take another entry. */
+ rrecs = xfs_btree_get_numrecs(right);
+ if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
+ goto out0;
+
+ XFS_BTREE_STATS_INC(cur, rshift);
+ XFS_BTREE_STATS_ADD(cur, moves, rrecs);
+
+ /*
+ * Make a hole at the start of the right neighbor block, then
+ * copy the last left block entry to the hole.
+ */
+ if (level > 0) {
+ /* It's a nonleaf. make a hole in the keys and ptrs */
+ union xfs_btree_key *lkp;
+ union xfs_btree_ptr *lpp;
+ union xfs_btree_ptr *rpp;
+
+ lkp = xfs_btree_key_addr(cur, lrecs, left);
+ lpp = xfs_btree_ptr_addr(cur, lrecs, left);
+ rkp = xfs_btree_key_addr(cur, 1, right);
+ rpp = xfs_btree_ptr_addr(cur, 1, right);
+
+#ifdef DEBUG
+ for (i = rrecs - 1; i >= 0; i--) {
+ error = xfs_btree_check_ptr(cur, rpp, i, level);
+ if (error)
+ goto error0;
+ }
+#endif
+
+ xfs_btree_shift_keys(cur, rkp, 1, rrecs);
+ xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
+
+#ifdef DEBUG
+ error = xfs_btree_check_ptr(cur, lpp, 0, level);
+ if (error)
+ goto error0;
+#endif
+
+ /* Now put the new data in, and log it. */
+ xfs_btree_copy_keys(cur, rkp, lkp, 1);
+ xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
+
+ xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
+ xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
+
+ ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
+ xfs_btree_key_addr(cur, 2, right)));
+ } else {
+ /* It's a leaf. make a hole in the records */
+ union xfs_btree_rec *lrp;
+ union xfs_btree_rec *rrp;
+
+ lrp = xfs_btree_rec_addr(cur, lrecs, left);
+ rrp = xfs_btree_rec_addr(cur, 1, right);
+
+ xfs_btree_shift_recs(cur, rrp, 1, rrecs);
+
+ /* Now put the new data in, and log it. */
+ xfs_btree_copy_recs(cur, rrp, lrp, 1);
+ xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
+
+ cur->bc_ops->init_key_from_rec(&key, rrp);
+ rkp = &key;
+
+ ASSERT(cur->bc_ops->recs_inorder(cur, rrp,
+ xfs_btree_rec_addr(cur, 2, right)));
+ }
+
+ /*
+ * Decrement and log left's numrecs, bump and log right's numrecs.
+ */
+ xfs_btree_set_numrecs(left, --lrecs);
+ xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
+
+ xfs_btree_set_numrecs(right, ++rrecs);
+ xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
+
+ /*
+ * Using a temporary cursor, update the parent key values of the
+ * block on the right.
+ */
+ error = xfs_btree_dup_cursor(cur, &tcur);
+ if (error)
+ goto error0;
+ i = xfs_btree_lastrec(tcur, level);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ error = xfs_btree_increment(tcur, level, &i);
+ if (error)
+ goto error1;
+
+ error = xfs_btree_updkey(tcur, rkp, level + 1);
+ if (error)
+ goto error1;
+
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 1;
+ return 0;
+
+out0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 0;
+ return 0;
+
+error0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+
+error1:
+ XFS_BTREE_TRACE_CURSOR(tcur, XBT_ERROR);
+ xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Split the block at cur/level in half.
+ *
+ * A new block is allocated and linked in as the right sibling of the
+ * current ("left") block, and the upper part of left's entries is moved
+ * into it.
+ *
+ * cur   - btree cursor; bc_ptrs[level] is the index the caller is working at
+ * level - level of the block to split (0 is the leaf level)
+ * ptrp  - out: block pointer of the new right block, written on success
+ * key   - out: key of the first entry now in the right block, to be
+ *         inserted into the parent
+ * curp  - out: when a level exists above this one, a duplicate of cur with
+ *         bc_ptrs[level + 1] advanced by one
+ * stat  - out: 1 if the block was split, 0 if alloc_block() handed back
+ *         no block
+ *
+ * Returns 0 or an error code.
+ */
+STATIC int					/* error */
+xfs_btree_split(
+	struct xfs_btree_cur	*cur,
+	int			level,
+	union xfs_btree_ptr	*ptrp,
+	union xfs_btree_key	*key,
+	struct xfs_btree_cur	**curp,
+	int			*stat)		/* success/failure */
+{
+	union xfs_btree_ptr	lptr;		/* left sibling block ptr */
+	struct xfs_buf		*lbp;		/* left buffer pointer */
+	struct xfs_btree_block	*left;		/* left btree block */
+	union xfs_btree_ptr	rptr;		/* right sibling block ptr */
+	struct xfs_buf		*rbp;		/* right buffer pointer */
+	struct xfs_btree_block	*right;		/* right btree block */
+	union xfs_btree_ptr	rrptr;		/* right-right sibling ptr */
+	struct xfs_buf		*rrbp;		/* right-right buffer pointer */
+	struct xfs_btree_block	*rrblock;	/* right-right btree block */
+	int			lrecs;		/* entries staying in left */
+	int			rrecs;		/* entries moving to right */
+	int			src_index;	/* first left entry to move */
+	int			error;		/* error return value */
+#ifdef DEBUG
+	int			i;
+#endif
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_TRACE_ARGIPK(cur, level, *ptrp, key);
+
+	XFS_BTREE_STATS_INC(cur, split);
+
+	/* Set up left block (current one). */
+	left = xfs_btree_get_block(cur, level, &lbp);
+
+#ifdef DEBUG
+	error = xfs_btree_check_block(cur, left, level, lbp);
+	if (error)
+		goto error0;
+#endif
+
+	xfs_btree_buf_to_ptr(cur, lbp, &lptr);
+
+	/* Allocate the new block. If we can't do it, we're toast. Give up. */
+	error = cur->bc_ops->alloc_block(cur, &lptr, &rptr, 1, stat);
+	if (error)
+		goto error0;
+	if (*stat == 0)
+		goto out0;
+	XFS_BTREE_STATS_INC(cur, alloc);
+
+	/* Set up the new block as "right". */
+	error = xfs_btree_get_buf_block(cur, &rptr, 0, &right, &rbp);
+	if (error)
+		goto error0;
+
+	/* Fill in the btree header for the new right block. */
+	xfs_btree_init_block(cur, xfs_btree_get_level(left), 0, right);
+
+	/*
+	 * Split the entries between the old and the new block evenly.
+	 * With an odd count, the extra entry goes to the right block when
+	 * the cursor index is at most rrecs + 1, and stays in the left
+	 * block otherwise.
+	 */
+	lrecs = xfs_btree_get_numrecs(left);
+	rrecs = lrecs / 2;
+	if ((lrecs & 1) && cur->bc_ptrs[level] <= rrecs + 1)
+		rrecs++;
+	src_index = (lrecs - rrecs + 1);
+
+	XFS_BTREE_STATS_ADD(cur, moves, rrecs);
+
+	/*
+	 * Copy btree block entries from the left block over to the
+	 * new block, the right.  Update the right block and log the
+	 * changes.
+	 */
+	if (level > 0) {
+		/* It's a non-leaf.  Move keys and pointers. */
+		union xfs_btree_key	*lkp;	/* left btree key */
+		union xfs_btree_ptr	*lpp;	/* left address pointer */
+		union xfs_btree_key	*rkp;	/* right btree key */
+		union xfs_btree_ptr	*rpp;	/* right address pointer */
+
+		lkp = xfs_btree_key_addr(cur, src_index, left);
+		lpp = xfs_btree_ptr_addr(cur, src_index, left);
+		rkp = xfs_btree_key_addr(cur, 1, right);
+		rpp = xfs_btree_ptr_addr(cur, 1, right);
+
+#ifdef DEBUG
+		/*
+		 * lpp already points at entry src_index and the index given
+		 * to xfs_btree_check_ptr() is relative to it, so the rrecs
+		 * pointers about to be moved are offsets 0 .. rrecs - 1.
+		 */
+		for (i = 0; i < rrecs; i++) {
+			error = xfs_btree_check_ptr(cur, lpp, i, level);
+			if (error)
+				goto error0;
+		}
+#endif
+
+		xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
+		xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);
+
+		xfs_btree_log_keys(cur, rbp, 1, rrecs);
+		xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
+
+		/* Grab the keys to the entries moved to the right block */
+		xfs_btree_copy_keys(cur, key, rkp, 1);
+	} else {
+		/* It's a leaf.  Move records.  */
+		union xfs_btree_rec	*lrp;	/* left record pointer */
+		union xfs_btree_rec	*rrp;	/* right record pointer */
+
+		lrp = xfs_btree_rec_addr(cur, src_index, left);
+		rrp = xfs_btree_rec_addr(cur, 1, right);
+
+		xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
+		xfs_btree_log_recs(cur, rbp, 1, rrecs);
+
+		/* The key handed to the parent comes from right's first record. */
+		cur->bc_ops->init_key_from_rec(key,
+			xfs_btree_rec_addr(cur, 1, right));
+	}
+
+
+	/*
+	 * Link the new block into the sibling chain: right inherits left's
+	 * old right sibling and points back at left, and left now points at
+	 * right.  Then adjust numrecs in both blocks.
+	 */
+	xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
+	xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
+	xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
+	xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
+
+	lrecs -= rrecs;
+	xfs_btree_set_numrecs(left, lrecs);
+	xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);
+
+	xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
+	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
+
+	/*
+	 * If there's a block to the new block's right, make that block
+	 * point back to right instead of to left.
+	 */
+	if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
+		error = xfs_btree_read_buf_block(cur, &rrptr, level,
+							0, &rrblock, &rrbp);
+		if (error)
+			goto error0;
+		xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
+		xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
+	}
+	/*
+	 * If the cursor is really in the right block, move it there.
+	 * If it's just pointing past the last entry in left, then we'll
+	 * insert there, so don't change anything in that case.
+	 */
+	if (cur->bc_ptrs[level] > lrecs + 1) {
+		xfs_btree_setbuf(cur, level, rbp);
+		cur->bc_ptrs[level] -= lrecs;
+	}
+	/*
+	 * If there are more levels, we'll need another cursor which refers
+	 * the right block, no matter where this cursor was.
+	 */
+	if (level + 1 < cur->bc_nlevels) {
+		error = xfs_btree_dup_cursor(cur, curp);
+		if (error)
+			goto error0;
+		(*curp)->bc_ptrs[level + 1]++;
+	}
+	*ptrp = rptr;
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 1;
+	return 0;
+out0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 0;
+	return 0;
+
+error0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+	return error;
+}
+
+/*
+ * Copy the old inode root contents into a real block and make the
+ * broot point to it.
+ *
+ * The cursor must belong to a btree with XFS_BTREE_ROOT_IN_INODE set.
+ * A new block is allocated, the keys and pointers held in the inode root
+ * are copied into it, and the inode root is left with a single pointer to
+ * the new block, one level higher than before. cur->bc_nlevels is
+ * incremented and the cursor is pointed at the new block.
+ *
+ * *logflags has XFS_ILOG_CORE and the fork's broot flag or'ed in on
+ * success. *stat is 1 on success; if alloc_block() reports *stat == 0 the
+ * function returns 0 without changing anything else.
+ *
+ * Returns 0 or an error code.
+ */
+int /* error */
+xfs_btree_new_iroot(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int *logflags, /* logging flags for inode */
+ int *stat) /* return status - 0 fail */
+{
+ struct xfs_buf *cbp; /* buffer for cblock */
+ struct xfs_btree_block *block; /* btree block */
+ struct xfs_btree_block *cblock; /* child btree block */
+ union xfs_btree_key *ckp; /* child key pointer */
+ union xfs_btree_ptr *cpp; /* child ptr pointer */
+ union xfs_btree_key *kp; /* pointer to btree key */
+ union xfs_btree_ptr *pp; /* pointer to block addr */
+ union xfs_btree_ptr nptr; /* new block addr */
+ int level; /* btree level */
+ int error; /* error return code */
+#ifdef DEBUG
+ int i; /* loop counter */
+#endif
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_STATS_INC(cur, newroot);
+
+ ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+
+ level = cur->bc_nlevels - 1; /* level of the root before it grows */
+
+ block = xfs_btree_get_iroot(cur);
+ pp = xfs_btree_ptr_addr(cur, 1, block); /* first pointer in the inode root */
+
+ /*
+ * Allocate the new block. If we can't do it, we're toast. Give up.
+ * The root's first pointer is passed in as the starting pointer;
+ * presumably an allocation hint - confirm against the alloc_block
+ * implementations.
+ */
+ error = cur->bc_ops->alloc_block(cur, pp, &nptr, 1, stat);
+ if (error)
+ goto error0;
+ if (*stat == 0) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ return 0;
+ }
+ XFS_BTREE_STATS_INC(cur, alloc);
+
+ /* Copy the root into a real block. */
+ error = xfs_btree_get_buf_block(cur, &nptr, 0, &cblock, &cbp);
+ if (error)
+ goto error0;
+
+ memcpy(cblock, block, xfs_btree_block_len(cur)); /* xfs_btree_block_len() bytes; presumably just the block header */
+
+ be16_add_cpu(&block->bb_level, 1); /* the inode root moves up one level ... */
+ xfs_btree_set_numrecs(block, 1); /* ... and keeps a single entry */
+ cur->bc_nlevels++;
+ cur->bc_ptrs[level + 1] = 1; /* cursor sits on that single entry */
+
+ kp = xfs_btree_key_addr(cur, 1, block);
+ ckp = xfs_btree_key_addr(cur, 1, cblock);
+ xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock)); /* cblock still carries the old root's numrecs from the memcpy */
+
+ cpp = xfs_btree_ptr_addr(cur, 1, cblock);
+#ifdef DEBUG
+ for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
+ error = xfs_btree_check_ptr(cur, pp, i, level);
+ if (error)
+ goto error0;
+ }
+#endif
+ xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));
+
+#ifdef DEBUG
+ error = xfs_btree_check_ptr(cur, &nptr, 0, level);
+ if (error)
+ goto error0;
+#endif
+ xfs_btree_copy_ptrs(cur, pp, &nptr, 1); /* root's only pointer now refers to the new block */
+
+ xfs_iroot_realloc(cur->bc_private.b.ip,
+ 1 - xfs_btree_get_numrecs(cblock),
+ cur->bc_private.b.whichfork); /* delta is 1 - old count, i.e. never positive */
+
+ xfs_btree_setbuf(cur, level, cbp);
+
+ /*
+ * Do all this logging at the end so that
+ * the root is at the right level.
+ */
+ xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
+ xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
+ xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
+
+ *logflags |=
+ XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork);
+ *stat = 1;
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ return 0;
+error0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+}
+
+/*
+ * Allocate a new root block, fill it in.
+ *
+ * Called from xfs_btree_insrec() for btrees that do not have
+ * XFS_BTREE_ROOT_IN_INODE set, once an insert has climbed to
+ * level >= cur->bc_nlevels. The block at the old top level must have
+ * exactly one sibling; the new root gets two entries, one for each of
+ * those blocks, and the cursor gains a level pointing into the new root.
+ *
+ * *stat is 1 on success and 0 if alloc_block() produced no block.
+ * Returns 0 or an error code.
+ */
+STATIC int /* error */
+xfs_btree_new_root(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int *stat) /* success/failure */
+{
+ struct xfs_btree_block *block; /* one half of the old root block */
+ struct xfs_buf *bp; /* buffer containing block */
+ int error; /* error return value */
+ struct xfs_buf *lbp; /* left buffer pointer */
+ struct xfs_btree_block *left; /* left btree block */
+ struct xfs_buf *nbp; /* new (root) buffer */
+ struct xfs_btree_block *new; /* new (root) btree block */
+ int nptr; /* new value for key index, 1 or 2 */
+ struct xfs_buf *rbp; /* right buffer pointer */
+ struct xfs_btree_block *right; /* right btree block */
+ union xfs_btree_ptr rptr; /* right block ptr; first used as the alloc_block start ptr */
+ union xfs_btree_ptr lptr; /* left block ptr; first receives the new root's ptr */
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_STATS_INC(cur, newroot);
+
+ /* initialise our start point from the cursor */
+ cur->bc_ops->init_ptr_from_cur(cur, &rptr);
+
+ /*
+ * Allocate the new block. If we can't do it, we're toast. Give up.
+ * The new block's pointer comes back in lptr, which is reused further
+ * down for the left child once the root has been set.
+ */
+ error = cur->bc_ops->alloc_block(cur, &rptr, &lptr, 1, stat);
+ if (error)
+ goto error0;
+ if (*stat == 0)
+ goto out0;
+ XFS_BTREE_STATS_INC(cur, alloc);
+
+ /* Set up the new block. */
+ error = xfs_btree_get_buf_block(cur, &lptr, 0, &new, &nbp);
+ if (error)
+ goto error0;
+
+ /* Set the root in the holding structure increasing the level by 1. */
+ cur->bc_ops->set_root(cur, &lptr, 1);
+
+ /*
+ * At the previous root level there are now two blocks: the old root,
+ * and the new block generated when it was split. We don't know which
+ * one the cursor is pointing at, so we set up variables "left" and
+ * "right" for each case.
+ */
+ block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
+ if (error)
+ goto error0;
+#endif
+
+ xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
+ if (!xfs_btree_ptr_is_null(cur, &rptr)) {
+ /* Our block is left, pick up the right block. */
+ lbp = bp;
+ xfs_btree_buf_to_ptr(cur, lbp, &lptr);
+ left = block;
+ error = xfs_btree_read_buf_block(cur, &rptr,
+ cur->bc_nlevels - 1, 0, &right, &rbp);
+ if (error)
+ goto error0;
+ bp = rbp; /* bp is not read again in this function */
+ nptr = 1; /* cursor block is the first entry of the new root */
+ } else {
+ /* Our block is right, pick up the left block. */
+ rbp = bp;
+ xfs_btree_buf_to_ptr(cur, rbp, &rptr);
+ right = block;
+ xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
+ error = xfs_btree_read_buf_block(cur, &lptr,
+ cur->bc_nlevels - 1, 0, &left, &lbp);
+ if (error)
+ goto error0;
+ bp = lbp; /* bp is not read again in this function */
+ nptr = 2; /* cursor block is the second entry of the new root */
+ }
+ /*
+ * Fill in the new block's btree header and log it.
+ * It sits at level bc_nlevels (one above the old top) with two entries.
+ */
+ xfs_btree_init_block(cur, cur->bc_nlevels, 2, new);
+ xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
+ ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
+ !xfs_btree_ptr_is_null(cur, &rptr));
+
+ /*
+ * Fill in the key data in the new root.
+ * Children that are node blocks supply their first key directly;
+ * leaf children have a key built from their first record.
+ */
+ if (xfs_btree_get_level(left) > 0) {
+ xfs_btree_copy_keys(cur,
+ xfs_btree_key_addr(cur, 1, new),
+ xfs_btree_key_addr(cur, 1, left), 1);
+ xfs_btree_copy_keys(cur,
+ xfs_btree_key_addr(cur, 2, new),
+ xfs_btree_key_addr(cur, 1, right), 1);
+ } else {
+ cur->bc_ops->init_key_from_rec(
+ xfs_btree_key_addr(cur, 1, new),
+ xfs_btree_rec_addr(cur, 1, left));
+ cur->bc_ops->init_key_from_rec(
+ xfs_btree_key_addr(cur, 2, new),
+ xfs_btree_rec_addr(cur, 1, right));
+ }
+ xfs_btree_log_keys(cur, nbp, 1, 2);
+
+ /* Fill in the pointer data in the new root. */
+ xfs_btree_copy_ptrs(cur,
+ xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
+ xfs_btree_copy_ptrs(cur,
+ xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
+ xfs_btree_log_ptrs(cur, nbp, 1, 2);
+
+ /*
+ * Fix up the cursor.
+ * The new root becomes the buffer for level bc_nlevels, with the cursor
+ * index set to whichever entry (1 or 2) refers to the block it was in.
+ */
+ xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
+ cur->bc_ptrs[cur->bc_nlevels] = nptr;
+ cur->bc_nlevels++;
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 1;
+ return 0;
+error0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+out0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 0;
+ return 0;
+}
+
+/*
+ * Make room for one more entry in the full block at cur/level.
+ *
+ * If the block is the root of an inode-rooted btree, either grow the
+ * in-inode root by one entry or, when it is already at its maximum
+ * size, move its contents into a newly allocated block.
+ *
+ * Otherwise try, in order: shifting one entry to the right sibling,
+ * shifting one entry to the left sibling, and splitting the block.
+ *
+ * oindex/index are updated to the cursor's index after a left shift;
+ * index alone is updated after a split.  After a split, nptr and ncur
+ * carry what xfs_btree_split() returned and nrec is built from the key
+ * of the new block.
+ *
+ * *stat is set to 1 when room was made and 0 when it could not be.
+ * Returns 0 or an error code.
+ */
+STATIC int
+xfs_btree_make_block_unfull(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	int			level,	/* btree level */
+	int			numrecs,/* # of recs in block */
+	int			*oindex,/* old tree index */
+	int			*index,	/* new tree index */
+	union xfs_btree_ptr	*nptr,	/* new btree ptr */
+	struct xfs_btree_cur	**ncur,	/* new btree cursor */
+	union xfs_btree_rec	*nrec,	/* new record */
+	int			*stat)
+{
+	union xfs_btree_key	key;	/* new btree key value */
+	int			error = 0;
+
+	if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+	    level == cur->bc_nlevels - 1) {
+		struct xfs_inode *ip = cur->bc_private.b.ip;
+
+		if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
+			/* A root block that can be made bigger. */
+
+			xfs_iroot_realloc(ip, 1, cur->bc_private.b.whichfork);
+			/*
+			 * The caller tests *stat after we return, so this
+			 * path must report success explicitly instead of
+			 * leaving whatever value the caller passed in.
+			 */
+			*stat = 1;
+		} else {
+			/* A root block that needs replacing */
+			int	logflags = 0;
+
+			error = xfs_btree_new_iroot(cur, &logflags, stat);
+			if (error || *stat == 0)
+				return error;
+
+			xfs_trans_log_inode(cur->bc_tp, ip, logflags);
+		}
+
+		return 0;
+	}
+
+	/* First, try shifting an entry to the right neighbor. */
+	error = xfs_btree_rshift(cur, level, stat);
+	if (error || *stat)
+		return error;
+
+	/* Next, try shifting an entry to the left neighbor. */
+	error = xfs_btree_lshift(cur, level, stat);
+	if (error)
+		return error;
+
+	if (*stat) {
+		/* The left shift moved our position; pick up the new index. */
+		*oindex = *index = cur->bc_ptrs[level];
+		return 0;
+	}
+
+	/*
+	 * Next, try splitting the current block in half.
+	 *
+	 * If this works we have to re-set our variables because we
+	 * could be in a different block now.
+	 */
+	error = xfs_btree_split(cur, level, nptr, &key, ncur, stat);
+	if (error || *stat == 0)
+		return error;
+
+
+	*index = cur->bc_ptrs[level];
+	cur->bc_ops->init_rec_from_key(&key, nrec);
+	return 0;
+}
+
+/*
+ * Insert one record/level.  Return information to the caller
+ * allowing the next level up to proceed if necessary.
+ *
+ * At level 0 the record *recp is inserted; at higher levels a key built
+ * from *recp is inserted together with the block pointer *ptrp.  If the
+ * target block is full, room is made first via
+ * xfs_btree_make_block_unfull().
+ *
+ * On success *ptrp is the pointer of a block created by a split, or null
+ * if there was none.  When it is not null, *recp and *curp hold the
+ * record and cursor to use for the insert at the next level up.
+ *
+ * *stat is 1 on success.  It is 0 when the cursor is off the left edge
+ * (index 0), or when room could not be made; for the new-root case it is
+ * whatever xfs_btree_new_root() reported.
+ *
+ * Returns 0 or an error code.  A cursor created by a split is deleted
+ * here if the insert fails afterwards, because the caller only learns
+ * about it through *curp on success.
+ */
+STATIC int
+xfs_btree_insrec(
+	struct xfs_btree_cur	*cur,	/* btree cursor */
+	int			level,	/* level to insert record at */
+	union xfs_btree_ptr	*ptrp,	/* i/o: block number inserted */
+	union xfs_btree_rec	*recp,	/* i/o: record data inserted */
+	struct xfs_btree_cur	**curp,	/* output: new cursor replacing cur */
+	int			*stat)	/* success/failure */
+{
+	struct xfs_btree_block	*block;	/* btree block */
+	struct xfs_buf		*bp;	/* buffer for block */
+	union xfs_btree_key	key;	/* btree key */
+	union xfs_btree_ptr	nptr;	/* new block ptr */
+	struct xfs_btree_cur	*ncur;	/* new btree cursor */
+	union xfs_btree_rec	nrec;	/* new record for the next level up */
+	int			optr;	/* old key/record index */
+	int			ptr;	/* key/record index */
+	int			numrecs;/* number of records */
+	int			error;	/* error return value */
+#ifdef DEBUG
+	int			i;
+#endif
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+	XFS_BTREE_TRACE_ARGIPR(cur, level, *ptrp, recp);
+
+	ncur = NULL;
+
+	/*
+	 * If we have an external root pointer, and we've made it to the
+	 * root level, allocate a new root block and we're done.
+	 */
+	if (!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
+	    (level >= cur->bc_nlevels)) {
+		error = xfs_btree_new_root(cur, stat);
+		xfs_btree_set_ptr_null(cur, ptrp);
+
+		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+		return error;
+	}
+
+	/* If we're off the left edge, return failure. */
+	ptr = cur->bc_ptrs[level];
+	if (ptr == 0) {
+		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+		*stat = 0;
+		return 0;
+	}
+
+	/* Make a key out of the record data to be inserted, and save it. */
+	cur->bc_ops->init_key_from_rec(&key, recp);
+
+	optr = ptr;
+
+	XFS_BTREE_STATS_INC(cur, insrec);
+
+	/* Get pointers to the btree buffer and block. */
+	block = xfs_btree_get_block(cur, level, &bp);
+	numrecs = xfs_btree_get_numrecs(block);
+
+#ifdef DEBUG
+	error = xfs_btree_check_block(cur, block, level, bp);
+	if (error)
+		goto error0;
+
+	/* Check that the new entry is being inserted in the right place. */
+	if (ptr <= numrecs) {
+		if (level == 0) {
+			ASSERT(cur->bc_ops->recs_inorder(cur, recp,
+				xfs_btree_rec_addr(cur, ptr, block)));
+		} else {
+			ASSERT(cur->bc_ops->keys_inorder(cur, &key,
+				xfs_btree_key_addr(cur, ptr, block)));
+		}
+	}
+#endif
+
+	/*
+	 * If the block is full, we can't insert the new entry until we
+	 * make the block un-full.
+	 */
+	xfs_btree_set_ptr_null(cur, &nptr);
+	if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
+		error = xfs_btree_make_block_unfull(cur, level, numrecs,
+					&optr, &ptr, &nptr, &ncur, &nrec, stat);
+		if (error || *stat == 0)
+			goto error0;
+	}
+
+	/*
+	 * The current block may have changed if the block was
+	 * previously full and we have just made space in it.
+	 */
+	block = xfs_btree_get_block(cur, level, &bp);
+	numrecs = xfs_btree_get_numrecs(block);
+
+#ifdef DEBUG
+	error = xfs_btree_check_block(cur, block, level, bp);
+	if (error)
+		goto error0;
+#endif
+
+	/*
+	 * At this point we know there's room for our new entry in the block
+	 * we're pointing at.
+	 */
+	XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
+
+	if (level > 0) {
+		/* It's a nonleaf. make a hole in the keys and ptrs */
+		union xfs_btree_key	*kp;
+		union xfs_btree_ptr	*pp;
+
+		kp = xfs_btree_key_addr(cur, ptr, block);
+		pp = xfs_btree_ptr_addr(cur, ptr, block);
+
+#ifdef DEBUG
+		for (i = numrecs - ptr; i >= 0; i--) {
+			error = xfs_btree_check_ptr(cur, pp, i, level);
+			if (error)
+				goto error0;
+		}
+#endif
+
+		xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
+		xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
+
+#ifdef DEBUG
+		error = xfs_btree_check_ptr(cur, ptrp, 0, level);
+		if (error)
+			goto error0;
+#endif
+
+		/* Now put the new data in, bump numrecs and log it. */
+		xfs_btree_copy_keys(cur, kp, &key, 1);
+		xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
+		numrecs++;
+		xfs_btree_set_numrecs(block, numrecs);
+		xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
+		xfs_btree_log_keys(cur, bp, ptr, numrecs);
+#ifdef DEBUG
+		if (ptr < numrecs) {
+			ASSERT(cur->bc_ops->keys_inorder(cur, kp,
+				xfs_btree_key_addr(cur, ptr + 1, block)));
+		}
+#endif
+	} else {
+		/* It's a leaf. make a hole in the records */
+		union xfs_btree_rec	*rp;
+
+		rp = xfs_btree_rec_addr(cur, ptr, block);
+
+		xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
+
+		/* Now put the new data in, bump numrecs and log it. */
+		xfs_btree_copy_recs(cur, rp, recp, 1);
+		xfs_btree_set_numrecs(block, ++numrecs);
+		xfs_btree_log_recs(cur, bp, ptr, numrecs);
+#ifdef DEBUG
+		if (ptr < numrecs) {
+			ASSERT(cur->bc_ops->recs_inorder(cur, rp,
+				xfs_btree_rec_addr(cur, ptr + 1, block)));
+		}
+#endif
+	}
+
+	/* Log the new number of records in the btree header. */
+	xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
+
+	/* If we inserted at the start of a block, update the parents' keys. */
+	if (optr == 1) {
+		error = xfs_btree_updkey(cur, &key, level + 1);
+		if (error)
+			goto error0;
+	}
+
+	/*
+	 * If we are tracking the last record in the tree and
+	 * we are at the far right edge of the tree, update it.
+	 */
+	if (xfs_btree_is_lastrec(cur, block, level)) {
+		cur->bc_ops->update_lastrec(cur, block, recp,
+					    ptr, LASTREC_INSREC);
+	}
+
+	/*
+	 * Return the new block number, if any.
+	 * If there is one, give back a record value and a cursor too.
+	 */
+	*ptrp = nptr;
+	if (!xfs_btree_ptr_is_null(cur, &nptr)) {
+		*recp = nrec;
+		*curp = ncur;
+	}
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 1;
+	return 0;
+
+error0:
+	/*
+	 * A split may already have handed us a duplicate cursor.  The
+	 * caller never sees it on this path, so release it here.
+	 */
+	if (ncur)
+		xfs_btree_del_cursor(ncur, XFS_BTREE_ERROR);
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+	return error;
+}
+
+/*
+ * Insert the record at the point referenced by cur.
+ *
+ * The record is built from the cursor with init_rec_from_cur() and
+ * inserted at the leaf level; each split then causes one more insert at
+ * the level above, until a level absorbs its entry without splitting.
+ *
+ * A multi-level split of the tree on insert will invalidate the original
+ * cursor.  All callers of this function should assume that the cursor is
+ * no longer valid and revalidate it.
+ *
+ * *stat receives the status of the last per-level insert (1 on success).
+ * Returns 0 or an error code; *stat is not written on error.
+ */
+int
+xfs_btree_insert(
+	struct xfs_btree_cur	*cur,
+	int			*stat)
+{
+	int			error;	/* error return value */
+	int			i;	/* result value, 0 for failure */
+	int			level;	/* current level number in btree */
+	union xfs_btree_ptr	nptr;	/* new block number (split result) */
+	struct xfs_btree_cur	*ncur;	/* new cursor (split result) */
+	struct xfs_btree_cur	*pcur;	/* previous level's cursor */
+	union xfs_btree_rec	rec;	/* record to insert */
+
+	level = 0;
+	ncur = NULL;
+	pcur = cur;
+
+	xfs_btree_set_ptr_null(cur, &nptr);
+	cur->bc_ops->init_rec_from_cur(cur, &rec);
+
+	/*
+	 * Loop going up the tree, starting at the leaf level.
+	 * Stop when we don't get a split block, that must mean that
+	 * the insert is finished with this level.
+	 */
+	do {
+		/*
+		 * Insert rec/nptr into this level of the tree.
+		 * Note if we fail, nptr will be null.
+		 */
+		error = xfs_btree_insrec(pcur, level, &nptr, &rec, &ncur, &i);
+		if (error) {
+			if (pcur != cur)
+				xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
+			goto error0;
+		}
+
+		/*
+		 * A failed insert without an error code is treated as
+		 * corruption.  Drop a duplicated cursor first, exactly as
+		 * the error path above does, since the jump to error0
+		 * would otherwise leak it.
+		 */
+		if (i != 1 && pcur != cur)
+			xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
+		XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+		level++;
+
+		/*
+		 * See if the cursor we just used is trash.
+		 * Can't trash the caller's cursor, but otherwise we should
+		 * if ncur is a new cursor or we're about to be done.
+		 */
+		if (pcur != cur &&
+		    (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
+			/* Save the state from the cursor before we trash it */
+			if (cur->bc_ops->update_cursor)
+				cur->bc_ops->update_cursor(pcur, cur);
+			cur->bc_nlevels = pcur->bc_nlevels;
+			xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
+		}
+		/* If we got a new cursor, switch to it. */
+		if (ncur) {
+			pcur = ncur;
+			ncur = NULL;
+		}
+	} while (!xfs_btree_ptr_is_null(cur, &nptr));
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = i;
+	return 0;
+error0:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+	return error;
+}
+
+/*
+ * Try to merge a non-leaf block back into the inode root.
+ *
+ * Note: the killroot names comes from the fact that we're effectively
+ * killing the old root block. But because we can't just delete the
+ * inode we have to copy the single block it was pointing to into the
+ * inode.
+ *
+ * The merge only happens when the root is at level 2 or above, has
+ * exactly one child, and that child's entry count does not exceed
+ * get_dmaxrecs() for the root level. In every other case the function
+ * returns 0 without changing anything.
+ *
+ * On a merge the child's keys and pointers are copied into the inode
+ * root, the child block is freed, the root's level is decremented, the
+ * inode is logged and cur->bc_nlevels shrinks by one.
+ *
+ * Returns 0; the only error return visible here is a failed pointer
+ * check in DEBUG builds.
+ */
+int
+xfs_btree_kill_iroot(
+ struct xfs_btree_cur *cur)
+{
+ int whichfork = cur->bc_private.b.whichfork;
+ struct xfs_inode *ip = cur->bc_private.b.ip;
+ struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork);
+ struct xfs_btree_block *block; /* inode root block */
+ struct xfs_btree_block *cblock; /* the root's only child */
+ union xfs_btree_key *kp; /* first key in the root */
+ union xfs_btree_key *ckp; /* first key in the child */
+ union xfs_btree_ptr *pp; /* first pointer in the root */
+ union xfs_btree_ptr *cpp; /* first pointer in the child */
+ struct xfs_buf *cbp; /* buffer for cblock */
+ int level; /* level of the root */
+ int index; /* change in the root's entry count */
+ int numrecs; /* entries in the child */
+#ifdef DEBUG
+ union xfs_btree_ptr ptr;
+ int i;
+#endif
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+
+ ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+ ASSERT(cur->bc_nlevels > 1);
+
+ /*
+ * Don't handle the case where the root would have to become a leaf
+ * (the root is at level 1, so its child is a leaf block).
+ * We're just going to turn the thing back into extents anyway.
+ */
+ level = cur->bc_nlevels - 1;
+ if (level == 1)
+ goto out0;
+
+ /*
+ * Give up if the root has multiple children.
+ */
+ block = xfs_btree_get_iroot(cur);
+ if (xfs_btree_get_numrecs(block) != 1)
+ goto out0;
+
+ cblock = xfs_btree_get_block(cur, level - 1, &cbp);
+ numrecs = xfs_btree_get_numrecs(cblock);
+
+ /*
+ * Only do this if the next level will fit.
+ * Then the data must be copied up to the inode,
+ * instead of freeing the root you free the next level.
+ */
+ if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
+ goto out0;
+
+ XFS_BTREE_STATS_INC(cur, killroot);
+
+#ifdef DEBUG
+ xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
+ ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
+ xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
+ ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
+#endif
+
+ index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
+ if (index) {
+ xfs_iroot_realloc(cur->bc_private.b.ip, index,
+ cur->bc_private.b.whichfork);
+ block = ifp->if_broot; /* re-read the root pointer from the fork after the realloc */
+ }
+
+ be16_add_cpu(&block->bb_numrecs, index);
+ ASSERT(block->bb_numrecs == cblock->bb_numrecs);
+
+ kp = xfs_btree_key_addr(cur, 1, block);
+ ckp = xfs_btree_key_addr(cur, 1, cblock);
+ xfs_btree_copy_keys(cur, kp, ckp, numrecs);
+
+ pp = xfs_btree_ptr_addr(cur, 1, block);
+ cpp = xfs_btree_ptr_addr(cur, 1, cblock);
+#ifdef DEBUG
+ for (i = 0; i < numrecs; i++) {
+ int error;
+
+ error = xfs_btree_check_ptr(cur, cpp, i, level - 1);
+ if (error) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+ }
+ }
+#endif
+ xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);
+
+ cur->bc_ops->free_block(cur, cbp); /* NOTE(review): any return value is ignored here - confirm free_block cannot fail */
+ XFS_BTREE_STATS_INC(cur, free);
+
+ cur->bc_bufs[level - 1] = NULL; /* the child's buffer no longer belongs to the cursor */
+ be16_add_cpu(&block->bb_level, -1); /* the root takes over the child's level */
+ xfs_trans_log_inode(cur->bc_tp, ip,
+ XFS_ILOG_CORE | XFS_ILOG_FBROOT(cur->bc_private.b.whichfork));
+ cur->bc_nlevels--;
+out0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ return 0;
+}
+
+/*
+ * Helper for xfs_btree_delrec(): step the cursor back by one entry at
+ * the given level and report success.
+ *
+ * Nothing is moved at level 0.  At higher levels the cursor is
+ * decremented with xfs_btree_decrement(); the status that call hands
+ * back is not examined.
+ *
+ * *stat is set to 1 unless the decrement returns an error, in which case
+ * that error is returned and *stat is left alone.
+ */
+STATIC int
+xfs_btree_dec_cursor(
+	struct xfs_btree_cur	*cur,
+	int			level,
+	int			*stat)
+{
+	int			ignored;	/* decrement status, unused */
+	int			error = 0;
+
+	if (level > 0)
+		error = xfs_btree_decrement(cur, level, &ignored);
+	if (error)
+		return error;
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 1;
+	return 0;
+}
+
+/*
+ * Single level of the btree record deletion routine.
+ * Delete record pointed to by cur/level.
+ * Remove the record from its block then rebalance the tree.
+ * Return 0 for error, 1 for done, 2 to go on to the next level.
+ */
+STATIC int /* error */
+xfs_btree_delrec(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int level, /* level removing record from */
+ int *stat) /* fail/done/go-on */
+{
+ struct xfs_btree_block *block; /* btree block */
+ union xfs_btree_ptr cptr; /* current block ptr */
+ struct xfs_buf *bp; /* buffer for block */
+ int error; /* error return value */
+ int i; /* loop counter */
+ union xfs_btree_key key; /* storage for keyp */
+ union xfs_btree_key *keyp = &key; /* passed to the next level */
+ union xfs_btree_ptr lptr; /* left sibling block ptr */
+ struct xfs_buf *lbp; /* left buffer pointer */
+ struct xfs_btree_block *left; /* left btree block */
+ int lrecs = 0; /* left record count */
+ int ptr; /* key/record index */
+ union xfs_btree_ptr rptr; /* right sibling block ptr */
+ struct xfs_buf *rbp; /* right buffer pointer */
+ struct xfs_btree_block *right; /* right btree block */
+ struct xfs_btree_block *rrblock; /* right-right btree block */
+ struct xfs_buf *rrbp; /* right-right buffer pointer */
+ int rrecs = 0; /* right record count */
+ struct xfs_btree_cur *tcur; /* temporary btree cursor */
+ int numrecs; /* temporary numrec count */
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_TRACE_ARGI(cur, level);
+
+ tcur = NULL;
+
+ /* Get the index of the entry being deleted, check for nothing there. */
+ ptr = cur->bc_ptrs[level];
+ if (ptr == 0) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 0;
+ return 0;
+ }
+
+ /* Get the buffer & block containing the record or key/ptr. */
+ block = xfs_btree_get_block(cur, level, &bp);
+ numrecs = xfs_btree_get_numrecs(block);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, level, bp);
+ if (error)
+ goto error0;
+#endif
+
+ /* Fail if we're off the end of the block. */
+ if (ptr > numrecs) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 0;
+ return 0;
+ }
+
+ XFS_BTREE_STATS_INC(cur, delrec);
+ XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
+
+ /* Excise the entries being deleted. */
+ if (level > 0) {
+ /* It's a nonleaf. operate on keys and ptrs */
+ union xfs_btree_key *lkp;
+ union xfs_btree_ptr *lpp;
+
+ lkp = xfs_btree_key_addr(cur, ptr + 1, block);
+ lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
+
+#ifdef DEBUG
+ for (i = 0; i < numrecs - ptr; i++) {
+ error = xfs_btree_check_ptr(cur, lpp, i, level);
+ if (error)
+ goto error0;
+ }
+#endif
+
+ if (ptr < numrecs) {
+ xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
+ xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
+ xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
+ xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
+ }
+
+ /*
+ * If it's the first record in the block, we'll need to pass a
+ * key up to the next level (updkey).
+ */
+ if (ptr == 1)
+ keyp = xfs_btree_key_addr(cur, 1, block);
+ } else {
+ /* It's a leaf. operate on records */
+ if (ptr < numrecs) {
+ xfs_btree_shift_recs(cur,
+ xfs_btree_rec_addr(cur, ptr + 1, block),
+ -1, numrecs - ptr);
+ xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
+ }
+
+ /*
+ * If it's the first record in the block, we'll need a key
+ * structure to pass up to the next level (updkey).
+ */
+ if (ptr == 1) {
+ cur->bc_ops->init_key_from_rec(&key,
+ xfs_btree_rec_addr(cur, 1, block));
+ keyp = &key;
+ }
+ }
+
+ /*
+ * Decrement and log the number of entries in the block.
+ */
+ xfs_btree_set_numrecs(block, --numrecs);
+ xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
+
+ /*
+ * If we are tracking the last record in the tree and
+ * we are at the far right edge of the tree, update it.
+ */
+ if (xfs_btree_is_lastrec(cur, block, level)) {
+ cur->bc_ops->update_lastrec(cur, block, NULL,
+ ptr, LASTREC_DELREC);
+ }
+
+ /*
+ * We're at the root level. First, shrink the root block in-memory.
+ * Try to get rid of the next level down. If we can't then there's
+ * nothing left to do.
+ */
+ if (level == cur->bc_nlevels - 1) {
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+ xfs_iroot_realloc(cur->bc_private.b.ip, -1,
+ cur->bc_private.b.whichfork);
+
+ error = xfs_btree_kill_iroot(cur);
+ if (error)
+ goto error0;
+
+ error = xfs_btree_dec_cursor(cur, level, stat);
+ if (error)
+ goto error0;
+ *stat = 1;
+ return 0;
+ }
+
+ /*
+ * If this is the root level, and there's only one entry left,
+ * and it's NOT the leaf level, then we can get rid of this
+ * level.
+ */
+ if (numrecs == 1 && level > 0) {
+ union xfs_btree_ptr *pp;
+ /*
+ * pp is still set to the first pointer in the block.
+ * Make it the new root of the btree.
+ */
+ pp = xfs_btree_ptr_addr(cur, 1, block);
+ error = cur->bc_ops->kill_root(cur, bp, level, pp);
+ if (error)
+ goto error0;
+ } else if (level > 0) {
+ error = xfs_btree_dec_cursor(cur, level, stat);
+ if (error)
+ goto error0;
+ }
+ *stat = 1;
+ return 0;
+ }
+
+ /*
+ * If we deleted the leftmost entry in the block, update the
+ * key values above us in the tree.
+ */
+ if (ptr == 1) {
+ error = xfs_btree_updkey(cur, keyp, level + 1);
+ if (error)
+ goto error0;
+ }
+
+ /*
+ * If the number of records remaining in the block is at least
+ * the minimum, we're done.
+ */
+ if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
+ error = xfs_btree_dec_cursor(cur, level, stat);
+ if (error)
+ goto error0;
+ return 0;
+ }
+
+ /*
+ * Otherwise, we have to move some records around to keep the
+ * tree balanced. Look at the left and right sibling blocks to
+ * see if we can re-balance by moving only one record.
+ */
+ xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
+ xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
+
+ if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
+ /*
+ * One child of root, need to get a chance to copy its contents
+ * into the root and delete it. Can't go up to next level,
+ * there's nothing to delete there.
+ */
+ if (xfs_btree_ptr_is_null(cur, &rptr) &&
+ xfs_btree_ptr_is_null(cur, &lptr) &&
+ level == cur->bc_nlevels - 2) {
+ error = xfs_btree_kill_iroot(cur);
+ if (!error)
+ error = xfs_btree_dec_cursor(cur, level, stat);
+ if (error)
+ goto error0;
+ return 0;
+ }
+ }
+
+ ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
+ !xfs_btree_ptr_is_null(cur, &lptr));
+
+ /*
+ * Duplicate the cursor so our btree manipulations here won't
+ * disrupt the next level up.
+ */
+ error = xfs_btree_dup_cursor(cur, &tcur);
+ if (error)
+ goto error0;
+
+ /*
+ * If there's a right sibling, see if it's ok to shift an entry
+ * out of it.
+ */
+ if (!xfs_btree_ptr_is_null(cur, &rptr)) {
+ /*
+ * Move the temp cursor to the last entry in the next block.
+ * Actually any entry but the first would suffice.
+ */
+ i = xfs_btree_lastrec(tcur, level);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ error = xfs_btree_increment(tcur, level, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ i = xfs_btree_lastrec(tcur, level);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ /* Grab a pointer to the block. */
+ right = xfs_btree_get_block(tcur, level, &rbp);
+#ifdef DEBUG
+ error = xfs_btree_check_block(tcur, right, level, rbp);
+ if (error)
+ goto error0;
+#endif
+ /* Grab the current block number, for future use. */
+ xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
+
+ /*
+ * If right block is full enough so that removing one entry
+ * won't make it too empty, and left-shifting an entry out
+ * of right to us works, we're done.
+ */
+ if (xfs_btree_get_numrecs(right) - 1 >=
+ cur->bc_ops->get_minrecs(tcur, level)) {
+ error = xfs_btree_lshift(tcur, level, &i);
+ if (error)
+ goto error0;
+ if (i) {
+ ASSERT(xfs_btree_get_numrecs(block) >=
+ cur->bc_ops->get_minrecs(tcur, level));
+
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+ tcur = NULL;
+
+ error = xfs_btree_dec_cursor(cur, level, stat);
+ if (error)
+ goto error0;
+ return 0;
+ }
+ }
+
+ /*
+ * Otherwise, grab the number of records in right for
+ * future reference, and fix up the temp cursor to point
+ * to our block again (last record).
+ */
+ rrecs = xfs_btree_get_numrecs(right);
+ if (!xfs_btree_ptr_is_null(cur, &lptr)) {
+ i = xfs_btree_firstrec(tcur, level);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ error = xfs_btree_decrement(tcur, level, &i);
+ if (error)
+ goto error0;
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+ }
+ }
+
+ /*
+ * If there's a left sibling, see if it's ok to shift an entry
+ * out of it.
+ */
+ if (!xfs_btree_ptr_is_null(cur, &lptr)) {
+ /*
+ * Move the temp cursor to the first entry in the
+ * previous block.
+ */
+ i = xfs_btree_firstrec(tcur, level);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ error = xfs_btree_decrement(tcur, level, &i);
+ if (error)
+ goto error0;
+ i = xfs_btree_firstrec(tcur, level);
+ XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
+
+ /* Grab a pointer to the block. */
+ left = xfs_btree_get_block(tcur, level, &lbp);
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, left, level, lbp);
+ if (error)
+ goto error0;
+#endif
+ /* Grab the current block number, for future use. */
+ xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
+
+ /*
+ * If left block is full enough so that removing one entry
+ * won't make it too empty, and right-shifting an entry out
+ * of left to us works, we're done.
+ */
+ if (xfs_btree_get_numrecs(left) - 1 >=
+ cur->bc_ops->get_minrecs(tcur, level)) {
+ error = xfs_btree_rshift(tcur, level, &i);
+ if (error)
+ goto error0;
+ if (i) {
+ ASSERT(xfs_btree_get_numrecs(block) >=
+ cur->bc_ops->get_minrecs(tcur, level));
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+ tcur = NULL;
+ if (level == 0)
+ cur->bc_ptrs[0]++;
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = 1;
+ return 0;
+ }
+ }
+
+ /*
+ * Otherwise, grab the number of records in left for
+ * future reference.
+ */
+ lrecs = xfs_btree_get_numrecs(left);
+ }
+
+ /* Delete the temp cursor, we're done with it. */
+ xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
+ tcur = NULL;
+
+ /* If here, we need to do a join to keep the tree balanced. */
+ ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
+
+ if (!xfs_btree_ptr_is_null(cur, &lptr) &&
+ lrecs + xfs_btree_get_numrecs(block) <=
+ cur->bc_ops->get_maxrecs(cur, level)) {
+ /*
+ * Set "right" to be the starting block,
+ * "left" to be the left neighbor.
+ */
+ rptr = cptr;
+ right = block;
+ rbp = bp;
+ error = xfs_btree_read_buf_block(cur, &lptr, level,
+ 0, &left, &lbp);
+ if (error)
+ goto error0;
+
+ /*
+ * If that won't work, see if we can join with the right neighbor block.
+ */
+ } else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
+ rrecs + xfs_btree_get_numrecs(block) <=
+ cur->bc_ops->get_maxrecs(cur, level)) {
+ /*
+ * Set "left" to be the starting block,
+ * "right" to be the right neighbor.
+ */
+ lptr = cptr;
+ left = block;
+ lbp = bp;
+ error = xfs_btree_read_buf_block(cur, &rptr, level,
+ 0, &right, &rbp);
+ if (error)
+ goto error0;
+
+ /*
+ * Otherwise, we can't fix the imbalance.
+ * Just return. This is probably a logic error, but it's not fatal.
+ */
+ } else {
+ error = xfs_btree_dec_cursor(cur, level, stat);
+ if (error)
+ goto error0;
+ return 0;
+ }
+
+ rrecs = xfs_btree_get_numrecs(right);
+ lrecs = xfs_btree_get_numrecs(left);
+
+ /*
+ * We're now going to join "left" and "right" by moving all the stuff
+ * in "right" to "left" and deleting "right".
+ */
+ XFS_BTREE_STATS_ADD(cur, moves, rrecs);
+ if (level > 0) {
+ /* It's a non-leaf. Move keys and pointers. */
+ union xfs_btree_key *lkp; /* left btree key */
+ union xfs_btree_ptr *lpp; /* left address pointer */
+ union xfs_btree_key *rkp; /* right btree key */
+ union xfs_btree_ptr *rpp; /* right address pointer */
+
+ lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
+ lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
+ rkp = xfs_btree_key_addr(cur, 1, right);
+ rpp = xfs_btree_ptr_addr(cur, 1, right);
+#ifdef DEBUG
+ for (i = 1; i < rrecs; i++) {
+ error = xfs_btree_check_ptr(cur, rpp, i, level);
+ if (error)
+ goto error0;
+ }
+#endif
+ xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
+ xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
+
+ xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
+ xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
+ } else {
+ /* It's a leaf. Move records. */
+ union xfs_btree_rec *lrp; /* left record pointer */
+ union xfs_btree_rec *rrp; /* right record pointer */
+
+ lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
+ rrp = xfs_btree_rec_addr(cur, 1, right);
+
+ xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
+ xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
+ }
+
+ XFS_BTREE_STATS_INC(cur, join);
+
+ /*
+ * Fix up the number of records and right block pointer in the
+ * surviving block, and log it.
+ */
+ xfs_btree_set_numrecs(left, lrecs + rrecs);
+ xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB),
+ xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
+ xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
+
+ /* If there is a right sibling, point it to the remaining block. */
+ xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
+ if (!xfs_btree_ptr_is_null(cur, &cptr)) {
+ error = xfs_btree_read_buf_block(cur, &cptr, level,
+ 0, &rrblock, &rrbp);
+ if (error)
+ goto error0;
+ xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
+ xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
+ }
+
+ /* Free the deleted block. */
+ error = cur->bc_ops->free_block(cur, rbp);
+ if (error)
+ goto error0;
+ XFS_BTREE_STATS_INC(cur, free);
+
+ /*
+ * If we joined with the left neighbor, set the buffer in the
+ * cursor to the left block, and fix up the index.
+ */
+ if (bp != lbp) {
+ cur->bc_bufs[level] = lbp;
+ cur->bc_ptrs[level] += lrecs;
+ cur->bc_ra[level] = 0;
+ }
+ /*
+ * If we joined with the right neighbor and there's a level above
+ * us, increment the cursor at that level.
+ */
+ else if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) ||
+ (level + 1 < cur->bc_nlevels)) {
+ error = xfs_btree_increment(cur, level + 1, &i);
+ if (error)
+ goto error0;
+ }
+
+ /*
+ * Readjust the ptr at this level if it's not a leaf, since it's
+ * still pointing at the deletion point, which makes the cursor
+ * inconsistent. If this makes the ptr 0, the caller fixes it up.
+ * We can't use decrement because it would change the next level up.
+ */
+ if (level > 0)
+ cur->bc_ptrs[level]--;
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ /* Return value means the next level up has something to do. */
+ *stat = 2;
+ return 0;
+
+error0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ if (tcur)
+ xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
+ return error;
+}
+
+/*
+ * Delete the record pointed to by cur.
+ * The cursor refers to the place where the record was (could be inserted)
+ * when the operation returns.
+ *
+ * xfs_btree_delrec() is called once per level, starting at level 0, and
+ * the walk continues upwards for as long as it reports a status of 2.
+ *
+ * Returns 0 with *stat set to the last status value obtained (from
+ * xfs_btree_delrec(), or from xfs_btree_decrement() when the fix-up
+ * below runs), or the error code of the helper that failed, in which
+ * case *stat is not written.
+ */
+int /* error */
+xfs_btree_delete(
+ struct xfs_btree_cur *cur,
+ int *stat) /* success/failure */
+{
+ int error; /* error return value */
+ int level; /* btree level currently being processed */
+ int i; /* status reported by delrec/decrement */
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+
+ /*
+ * Go up the tree, starting at leaf level.
+ *
+ * If 2 is returned then a join was done; go to the next level.
+ * Otherwise we are done.
+ *
+ * i is primed with 2 so that the leaf level is always processed.
+ */
+ for (level = 0, i = 2; i == 2; level++) {
+ error = xfs_btree_delrec(cur, level, &i);
+ if (error)
+ goto error0;
+ }
+
+ if (i == 0) { /* NOTE(review): presumably a cursor index dropped to 0 in delrec - confirm */
+ for (level = 1; level < cur->bc_nlevels; level++) {
+ if (cur->bc_ptrs[level] == 0) { /* only the first such level is stepped back */
+ error = xfs_btree_decrement(cur, level, &i);
+ if (error)
+ goto error0;
+ break;
+ }
+ }
+ }
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+ *stat = i;
+ return 0;
+error0:
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+ return error;
+}
+
+/*
+ * Get the data from the pointed-to record.
+ *
+ * The record index is taken from cur->bc_ptrs[0] and looked up in the
+ * level 0 block of the cursor.  If the index is in range, *recp is set
+ * to the address of that record inside the block and *stat is set to 1.
+ * If the index is <= 0 or larger than the block's record count, *stat
+ * is set to 0 and *recp is left untouched.
+ *
+ * Returns 0 in both cases.  Only DEBUG builds can return an error,
+ * namely the one reported by xfs_btree_check_block().
+ */
+int /* error */
+xfs_btree_get_rec(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ union xfs_btree_rec **recp, /* output: btree record */
+ int *stat) /* output: success/failure */
+{
+ struct xfs_btree_block *block; /* btree block */
+ struct xfs_buf *bp; /* buffer pointer */
+ int ptr; /* record number */
+#ifdef DEBUG
+ int error; /* error return value */
+#endif
+
+ ptr = cur->bc_ptrs[0]; /* level 0 cursor index */
+ block = xfs_btree_get_block(cur, 0, &bp);
+
+#ifdef DEBUG
+ error = xfs_btree_check_block(cur, block, 0, bp);
+ if (error)
+ return error;
+#endif
+
+ /*
+ * Off the right end or left end, return failure.
+ * This is reported through *stat, not through the return value.
+ */
+ if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
+ *stat = 0;
+ return 0;
+ }
+
+ /*
+ * Point to the record and extract its data.
+ * The caller gets a pointer into the block; nothing is copied here.
+ */
+ *recp = xfs_btree_rec_addr(cur, ptr, block);
+ *stat = 1;
+ return 0;
+}
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 1f528a2a3754..789fffdf8b2f 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -39,39 +39,19 @@ extern kmem_zone_t *xfs_btree_cur_zone;
#define XFS_BTNUM_INO ((xfs_btnum_t)XFS_BTNUM_INOi)
/*
- * Short form header: space allocation btrees.
- */
-typedef struct xfs_btree_sblock {
- __be32 bb_magic; /* magic number for block type */
- __be16 bb_level; /* 0 is a leaf */
- __be16 bb_numrecs; /* current # of data records */
- __be32 bb_leftsib; /* left sibling block or NULLAGBLOCK */
- __be32 bb_rightsib; /* right sibling block or NULLAGBLOCK */
-} xfs_btree_sblock_t;
-
-/*
- * Long form header: bmap btrees.
- */
-typedef struct xfs_btree_lblock {
- __be32 bb_magic; /* magic number for block type */
- __be16 bb_level; /* 0 is a leaf */
- __be16 bb_numrecs; /* current # of data records */
- __be64 bb_leftsib; /* left sibling block or NULLDFSBNO */
- __be64 bb_rightsib; /* right sibling block or NULLDFSBNO */
-} xfs_btree_lblock_t;
-
-/*
- * Combined header and structure, used by common code.
+ * Generic btree header.
+ *
+ * This is a combination of the actual format used on disk for short and long
+ * format btrees. The first three fields are shared by both formats, but
+ * the pointers are different and should be used with care.
+ *
+ * To get the size of the actual short or long form headers please use
+ * the size macros below. Never use sizeof(xfs_btree_block).
*/
-typedef struct xfs_btree_hdr
-{
+struct xfs_btree_block {
__be32 bb_magic; /* magic number for block type */
__be16 bb_level; /* 0 is a leaf */
__be16 bb_numrecs; /* current # of data records */
-} xfs_btree_hdr_t;
-
-typedef struct xfs_btree_block {
- xfs_btree_hdr_t bb_h; /* header */
union {
struct {
__be32 bb_leftsib;
@@ -82,7 +62,36 @@ typedef struct xfs_btree_block {
__be64 bb_rightsib;
} l; /* long form pointers */
} bb_u; /* rest */
-} xfs_btree_block_t;
+};
+
+#define XFS_BTREE_SBLOCK_LEN 16 /* size of a short form block */
+#define XFS_BTREE_LBLOCK_LEN 24 /* size of a long form block */
+
+
+/*
+ * Generic key, ptr and record wrapper structures.
+ *
+ * These are disk format structures, and are converted where necessary
+ * by the btree specific code that needs to interpret them.
+ *
+ * For xfs_btree_ptr, code that reads the pointer picks the member from
+ * the cursor's XFS_BTREE_LONG_PTRS flag: "l" when it is set, "s" otherwise.
+ */
+union xfs_btree_ptr {
+ __be32 s; /* short form ptr */
+ __be64 l; /* long form ptr */
+};
+
+union xfs_btree_key { /* one member per btree key format */
+ xfs_bmbt_key_t bmbt; /* presumably the bmap btree key - confirm */
+ xfs_bmdr_key_t bmbr; /* bmbt root block */
+ xfs_alloc_key_t alloc; /* presumably the space allocation btree key - confirm */
+ xfs_inobt_key_t inobt; /* presumably the inode btree key - confirm */
+};
+
+union xfs_btree_rec { /* one member per btree record format */
+ xfs_bmbt_rec_t bmbt; /* presumably the bmap btree record - confirm */
+ xfs_bmdr_rec_t bmbr; /* bmbt root block */
+ xfs_alloc_rec_t alloc; /* presumably the space allocation btree record - confirm */
+ xfs_inobt_rec_t inobt; /* presumably the inode btree record - confirm */
+};
/*
* For logging record fields.
@@ -96,46 +105,131 @@ typedef struct xfs_btree_block {
#define XFS_BB_ALL_BITS ((1 << XFS_BB_NUM_BITS) - 1)
/*
- * Boolean to select which form of xfs_btree_block_t.bb_u to use.
- */
-#define XFS_BTREE_LONG_PTRS(btnum) ((btnum) == XFS_BTNUM_BMAP)
-
-/*
* Magic numbers for btree blocks.
*/
extern const __uint32_t xfs_magics[];
/*
- * Maximum and minimum records in a btree block.
- * Given block size, type prefix, and leaf flag (0 or 1).
- * The divisor below is equivalent to lf ? (e1) : (e2) but that produces
- * compiler warnings.
- */
-#define XFS_BTREE_BLOCK_MAXRECS(bsz,t,lf) \
- ((int)(((bsz) - (uint)sizeof(t ## _block_t)) / \
- (((lf) * (uint)sizeof(t ## _rec_t)) + \
- ((1 - (lf)) * \
- ((uint)sizeof(t ## _key_t) + (uint)sizeof(t ## _ptr_t))))))
-#define XFS_BTREE_BLOCK_MINRECS(bsz,t,lf) \
- (XFS_BTREE_BLOCK_MAXRECS(bsz,t,lf) / 2)
-
-/*
- * Record, key, and pointer address calculation macros.
- * Given block size, type prefix, block pointer, and index of requested entry
- * (first entry numbered 1).
- */
-#define XFS_BTREE_REC_ADDR(t,bb,i) \
- ((t ## _rec_t *)((char *)(bb) + sizeof(t ## _block_t) + \
- ((i) - 1) * sizeof(t ## _rec_t)))
-#define XFS_BTREE_KEY_ADDR(t,bb,i) \
- ((t ## _key_t *)((char *)(bb) + sizeof(t ## _block_t) + \
- ((i) - 1) * sizeof(t ## _key_t)))
-#define XFS_BTREE_PTR_ADDR(t,bb,i,mxr) \
- ((t ## _ptr_t *)((char *)(bb) + sizeof(t ## _block_t) + \
- (mxr) * sizeof(t ## _key_t) + ((i) - 1) * sizeof(t ## _ptr_t)))
+ * Generic stats interface
+ */
+#define __XFS_BTREE_STATS_INC(type, stat) \
+ XFS_STATS_INC(xs_ ## type ## _2_ ## stat) /* pastes the counter name xs_<type>_2_<stat> */
+#define XFS_BTREE_STATS_INC(cur, stat) \
+do { /* select the counter prefix from the cursor's btree type */ \
+ switch (cur->bc_btnum) { \
+ case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(abtb, stat); break; \
+ case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(abtc, stat); break; \
+ case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(bmbt, stat); break; \
+ case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(ibt, stat); break; \
+ case XFS_BTNUM_MAX: ASSERT(0); /* not expected here; presumably listed only to keep gcc's switch-on-enum warning quiet */ ; break; \
+ } \
+} while (0)
+
+#define __XFS_BTREE_STATS_ADD(type, stat, val) \
+ XFS_STATS_ADD(xs_ ## type ## _2_ ## stat, val) /* pastes the counter name xs_<type>_2_<stat> */
+#define XFS_BTREE_STATS_ADD(cur, stat, val) \
+do { /* select the counter prefix from the cursor's btree type */ \
+ switch (cur->bc_btnum) { \
+ case XFS_BTNUM_BNO: __XFS_BTREE_STATS_ADD(abtb, stat, val); break; \
+ case XFS_BTNUM_CNT: __XFS_BTREE_STATS_ADD(abtc, stat, val); break; \
+ case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_ADD(bmbt, stat, val); break; \
+ case XFS_BTNUM_INO: __XFS_BTREE_STATS_ADD(ibt, stat, val); break; \
+ case XFS_BTNUM_MAX: ASSERT(0); /* not expected here; presumably listed only to keep gcc's switch-on-enum warning quiet */ ; break; \
+ } \
+} while (0)
#define XFS_BTREE_MAXLEVELS 8 /* max of all btrees */
+struct xfs_btree_ops { /* per-btree-type callbacks, reached through cur->bc_ops */
+ /* size of the key and record structures */
+ size_t key_len;
+ size_t rec_len;
+
+ /* cursor operations */
+ struct xfs_btree_cur *(*dup_cursor)(struct xfs_btree_cur *);
+ void (*update_cursor)(struct xfs_btree_cur *src,
+ struct xfs_btree_cur *dst);
+
+ /* update btree root pointer */
+ void (*set_root)(struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *nptr, int level_change);
+ int (*kill_root)(struct xfs_btree_cur *cur, struct xfs_buf *bp,
+ int level, union xfs_btree_ptr *newroot);
+
+ /* block allocation / freeing; both return an error code */
+ int (*alloc_block)(struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *start_bno,
+ union xfs_btree_ptr *new_bno,
+ int length, int *stat);
+ int (*free_block)(struct xfs_btree_cur *cur, struct xfs_buf *bp);
+
+ /* update last record information; reason is a LASTREC_* value */
+ void (*update_lastrec)(struct xfs_btree_cur *cur,
+ struct xfs_btree_block *block,
+ union xfs_btree_rec *rec,
+ int ptr, int reason);
+
+ /* records in block/level */
+ int (*get_minrecs)(struct xfs_btree_cur *cur, int level);
+ int (*get_maxrecs)(struct xfs_btree_cur *cur, int level);
+
+ /* records on disk. Matters for the root in inode case. */
+ int (*get_dmaxrecs)(struct xfs_btree_cur *cur, int level);
+
+ /* init values of btree structures */
+ void (*init_key_from_rec)(union xfs_btree_key *key,
+ union xfs_btree_rec *rec);
+ void (*init_rec_from_key)(union xfs_btree_key *key,
+ union xfs_btree_rec *rec);
+ void (*init_rec_from_cur)(struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec);
+ void (*init_ptr_from_cur)(struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr);
+
+ /* difference between key value and cursor value */
+ __int64_t (*key_diff)(struct xfs_btree_cur *cur,
+ union xfs_btree_key *key);
+
+#ifdef DEBUG
+ /* check that k1 is lower than k2 (DEBUG builds only) */
+ int (*keys_inorder)(struct xfs_btree_cur *cur,
+ union xfs_btree_key *k1,
+ union xfs_btree_key *k2);
+
+ /* check that r1 is lower than r2 (DEBUG builds only) */
+ int (*recs_inorder)(struct xfs_btree_cur *cur,
+ union xfs_btree_rec *r1,
+ union xfs_btree_rec *r2);
+#endif
+
+ /* btree tracing; these hooks exist only when XFS_BTREE_TRACE is defined */
+#ifdef XFS_BTREE_TRACE
+ void (*trace_enter)(struct xfs_btree_cur *, const char *,
+ char *, int, int, __psunsigned_t,
+ __psunsigned_t, __psunsigned_t,
+ __psunsigned_t, __psunsigned_t,
+ __psunsigned_t, __psunsigned_t,
+ __psunsigned_t, __psunsigned_t,
+ __psunsigned_t, __psunsigned_t);
+ void (*trace_cursor)(struct xfs_btree_cur *, __uint32_t *,
+ __uint64_t *, __uint64_t *);
+ void (*trace_key)(struct xfs_btree_cur *,
+ union xfs_btree_key *, __uint64_t *,
+ __uint64_t *);
+ void (*trace_record)(struct xfs_btree_cur *,
+ union xfs_btree_rec *, __uint64_t *,
+ __uint64_t *, __uint64_t *);
+#endif
+};
+
+/*
+ * Reasons for the update_lastrec method to be called.
+ */
+#define LASTREC_UPDATE 0
+#define LASTREC_INSREC 1
+#define LASTREC_DELREC 2
+
+
/*
* Btree cursor structure.
* This collects all information needed by the btree code in one place.
@@ -144,6 +238,8 @@ typedef struct xfs_btree_cur
{
struct xfs_trans *bc_tp; /* transaction we're in, if any */
struct xfs_mount *bc_mp; /* file system mount struct */
+ const struct xfs_btree_ops *bc_ops;
+ uint bc_flags; /* btree features - below */
union {
xfs_alloc_rec_incore_t a;
xfs_bmbt_irec_t b;
@@ -175,94 +271,40 @@ typedef struct xfs_btree_cur
} bc_private; /* per-btree type data */
} xfs_btree_cur_t;
+/* cursor flags */
+#define XFS_BTREE_LONG_PTRS (1<<0) /* pointers are 64bits long */
+#define XFS_BTREE_ROOT_IN_INODE (1<<1) /* root may be variable size */
+#define XFS_BTREE_LASTREC_UPDATE (1<<2) /* track last rec externally */
+
+
#define XFS_BTREE_NOERROR 0
#define XFS_BTREE_ERROR 1
/*
* Convert from buffer to btree block header.
*/
-#define XFS_BUF_TO_BLOCK(bp) ((xfs_btree_block_t *)XFS_BUF_PTR(bp))
-#define XFS_BUF_TO_LBLOCK(bp) ((xfs_btree_lblock_t *)XFS_BUF_PTR(bp))
-#define XFS_BUF_TO_SBLOCK(bp) ((xfs_btree_sblock_t *)XFS_BUF_PTR(bp))
+#define XFS_BUF_TO_BLOCK(bp) ((struct xfs_btree_block *)XFS_BUF_PTR(bp))
-#ifdef __KERNEL__
-
-#ifdef DEBUG
/*
- * Debug routine: check that block header is ok.
+ * Check that block header is ok.
*/
-void
+int
xfs_btree_check_block(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_btree_block_t *block, /* generic btree block pointer */
- int level, /* level of the btree block */
- struct xfs_buf *bp); /* buffer containing block, if any */
-
-/*
- * Debug routine: check that keys are in the right order.
- */
-void
-xfs_btree_check_key(
- xfs_btnum_t btnum, /* btree identifier */
- void *ak1, /* pointer to left (lower) key */
- void *ak2); /* pointer to right (higher) key */
-
-/*
- * Debug routine: check that records are in the right order.
- */
-void
-xfs_btree_check_rec(
- xfs_btnum_t btnum, /* btree identifier */
- void *ar1, /* pointer to left (lower) record */
- void *ar2); /* pointer to right (higher) record */
-#else
-#define xfs_btree_check_block(a,b,c,d)
-#define xfs_btree_check_key(a,b,c)
-#define xfs_btree_check_rec(a,b,c)
-#endif /* DEBUG */
-
-/*
- * Checking routine: check that long form block header is ok.
- */
-int /* error (0 or EFSCORRUPTED) */
-xfs_btree_check_lblock(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_btree_lblock_t *block, /* btree long form block pointer */
+ struct xfs_btree_cur *cur, /* btree cursor */
+ struct xfs_btree_block *block, /* generic btree block pointer */
int level, /* level of the btree block */
struct xfs_buf *bp); /* buffer containing block, if any */
/*
- * Checking routine: check that (long) pointer is ok.
+ * Check that (long) pointer is ok.
*/
int /* error (0 or EFSCORRUPTED) */
xfs_btree_check_lptr(
- xfs_btree_cur_t *cur, /* btree cursor */
+ struct xfs_btree_cur *cur, /* btree cursor */
xfs_dfsbno_t ptr, /* btree block disk address */
int level); /* btree block level */
-#define xfs_btree_check_lptr_disk(cur, ptr, level) \
- xfs_btree_check_lptr(cur, be64_to_cpu(ptr), level)
-
-/*
- * Checking routine: check that short form block header is ok.
- */
-int /* error (0 or EFSCORRUPTED) */
-xfs_btree_check_sblock(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_btree_sblock_t *block, /* btree short form block pointer */
- int level, /* level of the btree block */
- struct xfs_buf *bp); /* buffer containing block */
-
-/*
- * Checking routine: check that (short) pointer is ok.
- */
-int /* error (0 or EFSCORRUPTED) */
-xfs_btree_check_sptr(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_agblock_t ptr, /* btree block disk address */
- int level); /* btree block level */
-
/*
* Delete the btree cursor.
*/
@@ -281,15 +323,6 @@ xfs_btree_dup_cursor(
xfs_btree_cur_t **ncur);/* output cursor */
/*
- * Change the cursor to point to the first record in the current block
- * at the given level. Other levels are unaffected.
- */
-int /* success=1, failure=0 */
-xfs_btree_firstrec(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level); /* level to change */
-
-/*
* Get a buffer for the block, return it with no data read.
* Long-form addressing.
*/
@@ -313,20 +346,6 @@ xfs_btree_get_bufs(
uint lock); /* lock flags for get_buf */
/*
- * Allocate a new btree cursor.
- * The cursor is either for allocation (A) or bmap (B).
- */
-xfs_btree_cur_t * /* new btree cursor */
-xfs_btree_init_cursor(
- struct xfs_mount *mp, /* file system mount point */
- struct xfs_trans *tp, /* transaction pointer */
- struct xfs_buf *agbp, /* (A only) buffer for agf structure */
- xfs_agnumber_t agno, /* (A only) allocation group number */
- xfs_btnum_t btnum, /* btree identifier */
- struct xfs_inode *ip, /* (B only) inode owning the btree */
- int whichfork); /* (B only) data/attr fork */
-
-/*
* Check for the cursor referring to the last block at the given level.
*/
int /* 1=is last block, 0=not last block */
@@ -335,15 +354,6 @@ xfs_btree_islastblock(
int level); /* level to check */
/*
- * Change the cursor to point to the last record in the current block
- * at the given level. Other levels are unaffected.
- */
-int /* success=1, failure=0 */
-xfs_btree_lastrec(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level); /* level to change */
-
-/*
* Compute first and last byte offsets for the fields given.
* Interprets the offsets table, which contains struct field offsets.
*/
@@ -404,39 +414,53 @@ xfs_btree_reada_bufs(
xfs_extlen_t count); /* count of filesystem blocks */
/*
- * Read-ahead btree blocks, at the given level.
- * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
+ * Set the buffer for level "lev" in the cursor to bp, releasing
+ * any previous buffer.
*/
-int /* readahead block count */
-xfs_btree_readahead_core(
+void
+xfs_btree_setbuf(
xfs_btree_cur_t *cur, /* btree cursor */
int lev, /* level in btree */
- int lr); /* left/right bits */
+ struct xfs_buf *bp); /* new buffer to set */
-static inline int /* readahead block count */
-xfs_btree_readahead(
- xfs_btree_cur_t *cur, /* btree cursor */
- int lev, /* level in btree */
- int lr) /* left/right bits */
-{
- if ((cur->bc_ra[lev] | lr) == cur->bc_ra[lev])
- return 0;
- return xfs_btree_readahead_core(cur, lev, lr);
-}
+/*
+ * Common btree core entry points.
+ */
+int xfs_btree_increment(struct xfs_btree_cur *, int, int *);
+int xfs_btree_decrement(struct xfs_btree_cur *, int, int *);
+int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *);
+int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *);
+int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
+int xfs_btree_kill_iroot(struct xfs_btree_cur *);
+int xfs_btree_insert(struct xfs_btree_cur *, int *);
+int xfs_btree_delete(struct xfs_btree_cur *, int *);
+int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
+/*
+ * Internal btree helpers also used by xfs_bmap.c.
+ */
+void xfs_btree_log_block(struct xfs_btree_cur *, struct xfs_buf *, int);
+void xfs_btree_log_recs(struct xfs_btree_cur *, struct xfs_buf *, int, int);
/*
- * Set the buffer for level "lev" in the cursor to bp, releasing
- * any previous buffer.
+ * Helpers.
*/
-void
-xfs_btree_setbuf(
- xfs_btree_cur_t *cur, /* btree cursor */
- int lev, /* level in btree */
- struct xfs_buf *bp); /* new buffer to set */
+static inline int xfs_btree_get_numrecs(struct xfs_btree_block *block)
+{
+ return be16_to_cpu(block->bb_numrecs); /* bb_numrecs is a big-endian (__be16) header field */
+}
+
+static inline void xfs_btree_set_numrecs(struct xfs_btree_block *block,
+ __uint16_t numrecs)
+{
+ block->bb_numrecs = cpu_to_be16(numrecs); /* stored big-endian; only the in-memory header is changed here */
+}
-#endif /* __KERNEL__ */
+static inline int xfs_btree_get_level(struct xfs_btree_block *block)
+{
+ return be16_to_cpu(block->bb_level); /* bb_level is big-endian (__be16); 0 is a leaf */
+}
/*
diff --git a/fs/xfs/xfs_btree_trace.c b/fs/xfs/xfs_btree_trace.c
new file mode 100644
index 000000000000..44ff942a0fda
--- /dev/null
+++ b/fs/xfs/xfs_btree_trace.c
@@ -0,0 +1,249 @@
+/*
+ * Copyright (c) 2008 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#include "xfs.h"
+#include "xfs_types.h"
+#include "xfs_inum.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_alloc_btree.h"
+#include "xfs_ialloc_btree.h"
+#include "xfs_inode.h"
+#include "xfs_btree.h"
+#include "xfs_btree_trace.h"
+
+STATIC void /* split a btree pointer into two words for a trace entry */
+xfs_btree_trace_ptr(
+ struct xfs_btree_cur *cur, /* supplies bc_flags to pick the pointer form */
+ union xfs_btree_ptr ptr, /* disk-format (big-endian) pointer, passed by value */
+ __psunsigned_t *high, /* output: upper word */
+ __psunsigned_t *low) /* output: lower word */
+{
+ if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ __u64 val = be64_to_cpu(ptr.l);
+ *high = val >> 32; /* upper 32 bits */
+ *low = (int)val; /* truncated to int before being widened into *low */
+ } else {
+ *high = 0; /* short pointers are 32 bits, so there is no upper word */
+ *low = be32_to_cpu(ptr.s);
+ }
+}
+
+/*
+ * Add a trace buffer entry for arguments, for a buffer & 1 integer arg.
+ *
+ * Forwards to the btree's trace_enter hook with entry type
+ * XFS_BTREE_KTRACE_ARGBI.  The buffer pointer and i fill the first two
+ * data slots; the remaining nine are passed as zero.
+ */
+void
+xfs_btree_trace_argbi(
+ const char *func, /* passed through to trace_enter */
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *b,
+ int i,
+ int line) /* passed through to trace_enter */
+{
+ cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGBI,
+ line, (__psunsigned_t)b, i, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for a buffer & 2 integer args.
+ *
+ * Forwards to the btree's trace_enter hook with entry type
+ * XFS_BTREE_KTRACE_ARGBII.  The buffer pointer, i0 and i1 fill the first
+ * three data slots; the remaining eight are passed as zero.
+ */
+void
+xfs_btree_trace_argbii(
+ const char *func, /* passed through to trace_enter */
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *b,
+ int i0,
+ int i1,
+ int line) /* passed through to trace_enter */
+{
+ cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGBII,
+ line, (__psunsigned_t)b, i0, i1, 0, 0, 0, 0,
+ 0, 0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for 3 block-length args
+ * and an integer arg.
+ *
+ * Each of o, b and i is passed to the trace_enter hook as two words:
+ * the value shifted right by 32, followed by the value cast to int.
+ * j follows in the seventh data slot and the last four slots are zero.
+ */
+void
+xfs_btree_trace_argfffi(
+ const char *func, /* passed through to trace_enter */
+ struct xfs_btree_cur *cur,
+ xfs_dfiloff_t o,
+ xfs_dfsbno_t b,
+ xfs_dfilblks_t i,
+ int j,
+ int line) /* passed through to trace_enter */
+{
+ cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGFFFI,
+ line,
+ o >> 32, (int)o,
+ b >> 32, (int)b,
+ i >> 32, (int)i,
+ (int)j, 0, 0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for one integer arg.
+ *
+ * Forwards to the btree's trace_enter hook with entry type
+ * XFS_BTREE_KTRACE_ARGI.  i fills the first data slot; the remaining
+ * ten are passed as zero.
+ */
+void
+xfs_btree_trace_argi(
+ const char *func, /* passed through to trace_enter */
+ struct xfs_btree_cur *cur,
+ int i,
+ int line) /* passed through to trace_enter */
+{
+ cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGI,
+ line, i, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for int, btree ptr, key.
+ *
+ * The pointer is split into high/low words by xfs_btree_trace_ptr().
+ * The key is turned into two 64-bit values by the btree's trace_key
+ * hook, and each of those is passed as two words (value >> 32, then the
+ * value cast to int).  The last four data slots are zero.
+ */
+void
+xfs_btree_trace_argipk(
+ const char *func, /* passed through to trace_enter */
+ struct xfs_btree_cur *cur,
+ int i,
+ union xfs_btree_ptr ptr,
+ union xfs_btree_key *key,
+ int line) /* passed through to trace_enter */
+{
+ __psunsigned_t high, low; /* the two halves of ptr */
+ __uint64_t l0, l1; /* key values filled in by trace_key */
+
+ xfs_btree_trace_ptr(cur, ptr, &high, &low);
+ cur->bc_ops->trace_key(cur, key, &l0, &l1);
+ cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGIPK,
+ line, i, high, low,
+ l0 >> 32, (int)l0,
+ l1 >> 32, (int)l1,
+ 0, 0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for int, btree ptr, rec.
+ *
+ * The pointer is split into high/low words by xfs_btree_trace_ptr().
+ * The record is turned into three 64-bit values by the btree's
+ * trace_record hook, and each of those is passed as two words
+ * (value >> 32, then the value cast to int).  The last two data slots
+ * are zero.
+ */
+void
+xfs_btree_trace_argipr(
+ const char *func, /* passed through to trace_enter */
+ struct xfs_btree_cur *cur,
+ int i,
+ union xfs_btree_ptr ptr,
+ union xfs_btree_rec *rec,
+ int line) /* passed through to trace_enter */
+{
+ __psunsigned_t high, low; /* the two halves of ptr */
+ __uint64_t l0, l1, l2; /* record values filled in by trace_record */
+
+ xfs_btree_trace_ptr(cur, ptr, &high, &low);
+ cur->bc_ops->trace_record(cur, rec, &l0, &l1, &l2);
+ cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGIPR,
+ line, i,
+ high, low,
+ l0 >> 32, (int)l0,
+ l1 >> 32, (int)l1,
+ l2 >> 32, (int)l2,
+ 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for int, key.
+ *
+ * The key is turned into two 64-bit values by the btree's trace_key
+ * hook, and each of those is passed as two words (value >> 32, then the
+ * value cast to int).  The last six data slots are zero.
+ */
+void
+xfs_btree_trace_argik(
+ const char *func, /* passed through to trace_enter */
+ struct xfs_btree_cur *cur,
+ int i,
+ union xfs_btree_key *key,
+ int line) /* passed through to trace_enter */
+{
+ __uint64_t l0, l1; /* key values filled in by trace_key */
+
+ cur->bc_ops->trace_key(cur, key, &l0, &l1);
+ cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGIK,
+ line, i,
+ l0 >> 32, (int)l0,
+ l1 >> 32, (int)l1,
+ 0, 0, 0, 0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for arguments, for record.
+ *
+ * The record is turned into three 64-bit values by the btree's
+ * trace_record hook, and each of those is passed as two words
+ * (value >> 32, then the value cast to int).  The last five data slots
+ * are zero.
+ */
+void
+xfs_btree_trace_argr(
+ const char *func, /* passed through to trace_enter */
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec,
+ int line) /* passed through to trace_enter */
+{
+ __uint64_t l0, l1, l2; /* record values filled in by trace_record */
+
+ cur->bc_ops->trace_record(cur, rec, &l0, &l1, &l2);
+ cur->bc_ops->trace_enter(cur, func, XBT_ARGS, XFS_BTREE_KTRACE_ARGR,
+ line,
+ l0 >> 32, (int)l0,
+ l1 >> 32, (int)l1,
+ l2 >> 32, (int)l2,
+ 0, 0, 0, 0, 0);
+}
+
+/*
+ * Add a trace buffer entry for the cursor/operation.
+ *
+ * type (XBT_ARGS, XBT_ENTRY, XBT_ERROR or XBT_EXIT) is mapped to a label
+ * string, with "unknown" for any other value.  The btree's trace_cursor
+ * hook supplies one 32-bit and two 64-bit values describing the cursor;
+ * the 64-bit values are each passed as two words.  The entry also
+ * records the buffer pointers of levels 0-3 and the cursor indices of
+ * levels 0-3, packed two per word with the lower level shifted left
+ * by 16.
+ */
+void
+xfs_btree_trace_cursor(
+ const char *func, /* passed through to trace_enter */
+ struct xfs_btree_cur *cur,
+ int type, /* one of the XBT_* sub-types */
+ int line) /* passed through to trace_enter */
+{
+ __uint32_t s0; /* filled in by trace_cursor */
+ __uint64_t l0, l1; /* filled in by trace_cursor */
+ char *s; /* label for the trace entry */
+
+ switch (type) {
+ case XBT_ARGS:
+ s = "args";
+ break;
+ case XBT_ENTRY:
+ s = "entry";
+ break;
+ case XBT_ERROR:
+ s = "error";
+ break;
+ case XBT_EXIT:
+ s = "exit";
+ break;
+ default:
+ s = "unknown";
+ break;
+ }
+
+ cur->bc_ops->trace_cursor(cur, &s0, &l0, &l1);
+ cur->bc_ops->trace_enter(cur, func, s, XFS_BTREE_KTRACE_CUR, line,
+ s0,
+ l0 >> 32, (int)l0,
+ l1 >> 32, (int)l1,
+ (__psunsigned_t)cur->bc_bufs[0],
+ (__psunsigned_t)cur->bc_bufs[1],
+ (__psunsigned_t)cur->bc_bufs[2],
+ (__psunsigned_t)cur->bc_bufs[3],
+ (cur->bc_ptrs[0] << 16) | cur->bc_ptrs[1],
+ (cur->bc_ptrs[2] << 16) | cur->bc_ptrs[3]);
+}
diff --git a/fs/xfs/xfs_btree_trace.h b/fs/xfs/xfs_btree_trace.h
new file mode 100644
index 000000000000..b3f5eb3c3c6c
--- /dev/null
+++ b/fs/xfs/xfs_btree_trace.h
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2008 Silicon Graphics, Inc.
+ * All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+#ifndef __XFS_BTREE_TRACE_H__
+#define __XFS_BTREE_TRACE_H__
+
+struct xfs_btree_cur;
+struct xfs_buf;
+
+
+/*
+ * Trace hooks.
+ * i,j = integer (32 bit)
+ * b = btree block buffer (xfs_buf_t)
+ * p = btree ptr
+ * r = btree record
+ * k = btree key
+ */
+
+#ifdef XFS_BTREE_TRACE
+
+/*
+ * Trace buffer entry types.
+ */
+#define XFS_BTREE_KTRACE_ARGBI 1
+#define XFS_BTREE_KTRACE_ARGBII 2
+#define XFS_BTREE_KTRACE_ARGFFFI 3
+#define XFS_BTREE_KTRACE_ARGI 4
+#define XFS_BTREE_KTRACE_ARGIPK 5
+#define XFS_BTREE_KTRACE_ARGIPR 6
+#define XFS_BTREE_KTRACE_ARGIK 7
+#define XFS_BTREE_KTRACE_ARGR 8
+#define XFS_BTREE_KTRACE_CUR 9
+
+/*
+ * Sub-types for cursor traces.
+ */
+#define XBT_ARGS 0
+#define XBT_ENTRY 1
+#define XBT_ERROR 2
+#define XBT_EXIT 3
+
+void xfs_btree_trace_argbi(const char *, struct xfs_btree_cur *,
+ struct xfs_buf *, int, int);
+void xfs_btree_trace_argbii(const char *, struct xfs_btree_cur *,
+ struct xfs_buf *, int, int, int);
+void xfs_btree_trace_argfffi(const char *, struct xfs_btree_cur *,
+ xfs_dfiloff_t, xfs_dfsbno_t, xfs_dfilblks_t, int, int);
+void xfs_btree_trace_argi(const char *, struct xfs_btree_cur *, int, int);
+void xfs_btree_trace_argipk(const char *, struct xfs_btree_cur *, int,
+ union xfs_btree_ptr, union xfs_btree_key *, int);
+void xfs_btree_trace_argipr(const char *, struct xfs_btree_cur *, int,
+ union xfs_btree_ptr, union xfs_btree_rec *, int);
+void xfs_btree_trace_argik(const char *, struct xfs_btree_cur *, int,
+ union xfs_btree_key *, int);
+void xfs_btree_trace_argr(const char *, struct xfs_btree_cur *,
+ union xfs_btree_rec *, int);
+void xfs_btree_trace_cursor(const char *, struct xfs_btree_cur *, int, int);
+
+
+#define XFS_ALLOCBT_TRACE_SIZE 4096 /* size of global trace buffer */
+extern ktrace_t *xfs_allocbt_trace_buf;
+
+#define XFS_INOBT_TRACE_SIZE 4096 /* size of global trace buffer */
+extern ktrace_t *xfs_inobt_trace_buf;
+
+#define XFS_BMBT_TRACE_SIZE 4096 /* size of global trace buffer */
+#define XFS_BMBT_KTRACE_SIZE 32 /* size of per-inode trace buffer */
+extern ktrace_t *xfs_bmbt_trace_buf;
+
+
+#define XFS_BTREE_TRACE_ARGBI(c, b, i) \
+ xfs_btree_trace_argbi(__func__, c, b, i, __LINE__)
+#define XFS_BTREE_TRACE_ARGBII(c, b, i, j) \
+ xfs_btree_trace_argbii(__func__, c, b, i, j, __LINE__)
+#define XFS_BTREE_TRACE_ARGFFFI(c, o, b, i, j) \
+ xfs_btree_trace_argfffi(__func__, c, o, b, i, j, __LINE__)
+#define XFS_BTREE_TRACE_ARGI(c, i) \
+ xfs_btree_trace_argi(__func__, c, i, __LINE__)
+#define XFS_BTREE_TRACE_ARGIPK(c, i, p, k) \
+ xfs_btree_trace_argipk(__func__, c, i, p, k, __LINE__)
+#define XFS_BTREE_TRACE_ARGIPR(c, i, p, r) \
+ xfs_btree_trace_argipr(__func__, c, i, p, r, __LINE__)
+#define XFS_BTREE_TRACE_ARGIK(c, i, k) \
+ xfs_btree_trace_argik(__func__, c, i, k, __LINE__)
+#define XFS_BTREE_TRACE_ARGR(c, r) \
+ xfs_btree_trace_argr(__func__, c, r, __LINE__)
+#define XFS_BTREE_TRACE_CURSOR(c, t) \
+ xfs_btree_trace_cursor(__func__, c, t, __LINE__)
+#else
+#define XFS_BTREE_TRACE_ARGBI(c, b, i)
+#define XFS_BTREE_TRACE_ARGBII(c, b, i, j)
+#define XFS_BTREE_TRACE_ARGFFFI(c, o, b, i, j)
+#define XFS_BTREE_TRACE_ARGI(c, i)
+#define XFS_BTREE_TRACE_ARGIPK(c, i, p, s)
+#define XFS_BTREE_TRACE_ARGIPR(c, i, p, r)
+#define XFS_BTREE_TRACE_ARGIK(c, i, k)
+#define XFS_BTREE_TRACE_ARGR(c, r)
+#define XFS_BTREE_TRACE_CURSOR(c, t)
+#endif /* XFS_BTREE_TRACE */
+
+#endif /* __XFS_BTREE_TRACE_H__ */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 002fc2617c8e..92af4098c7e8 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -375,7 +375,7 @@ xfs_buf_item_unpin(
xfs_buf_log_item_t *bip,
int stale)
{
- xfs_mount_t *mp;
+ struct xfs_ail *ailp;
xfs_buf_t *bp;
int freed;
@@ -387,7 +387,7 @@ xfs_buf_item_unpin(
xfs_buftrace("XFS_UNPIN", bp);
freed = atomic_dec_and_test(&bip->bli_refcount);
- mp = bip->bli_item.li_mountp;
+ ailp = bip->bli_item.li_ailp;
xfs_bunpin(bp);
if (freed && stale) {
ASSERT(bip->bli_flags & XFS_BLI_STALE);
@@ -399,17 +399,17 @@ xfs_buf_item_unpin(
xfs_buftrace("XFS_UNPIN STALE", bp);
/*
* If we get called here because of an IO error, we may
- * or may not have the item on the AIL. xfs_trans_delete_ail()
+ * or may not have the item on the AIL. xfs_trans_ail_delete()
* will take care of that situation.
- * xfs_trans_delete_ail() drops the AIL lock.
+ * xfs_trans_ail_delete() drops the AIL lock.
*/
if (bip->bli_flags & XFS_BLI_STALE_INODE) {
xfs_buf_do_callbacks(bp, (xfs_log_item_t *)bip);
XFS_BUF_SET_FSPRIVATE(bp, NULL);
XFS_BUF_CLR_IODONE_FUNC(bp);
} else {
- spin_lock(&mp->m_ail_lock);
- xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip);
+ spin_lock(&ailp->xa_lock);
+ xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip);
xfs_buf_item_relse(bp);
ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL);
}
@@ -707,8 +707,8 @@ xfs_buf_item_init(
* the first. If we do already have one, there is
* nothing to do here so return.
*/
- if (XFS_BUF_FSPRIVATE3(bp, xfs_mount_t *) != mp)
- XFS_BUF_SET_FSPRIVATE3(bp, mp);
+ if (bp->b_mount != mp)
+ bp->b_mount = mp;
XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
if (XFS_BUF_FSPRIVATE(bp, void *) != NULL) {
lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
@@ -731,6 +731,7 @@ xfs_buf_item_init(
bip->bli_item.li_type = XFS_LI_BUF;
bip->bli_item.li_ops = &xfs_buf_item_ops;
bip->bli_item.li_mountp = mp;
+ bip->bli_item.li_ailp = mp->m_ail;
bip->bli_buf = bp;
xfs_buf_hold(bp);
bip->bli_format.blf_type = XFS_LI_BUF;
@@ -997,21 +998,7 @@ xfs_buf_iodone_callbacks(
xfs_buf_do_callbacks(bp, lip);
XFS_BUF_SET_FSPRIVATE(bp, NULL);
XFS_BUF_CLR_IODONE_FUNC(bp);
-
- /*
- * XFS_SHUT flag gets set when we go thru the
- * entire buffer cache and deliberately start
- * throwing away delayed write buffers.
- * Since there's no biowait done on those,
- * we should just brelse them.
- */
- if (XFS_BUF_ISSHUT(bp)) {
- XFS_BUF_UNSHUT(bp);
- xfs_buf_relse(bp);
- } else {
- xfs_biodone(bp);
- }
-
+ xfs_biodone(bp);
return;
}
@@ -1122,27 +1109,23 @@ xfs_buf_iodone(
xfs_buf_t *bp,
xfs_buf_log_item_t *bip)
{
- struct xfs_mount *mp;
+ struct xfs_ail *ailp = bip->bli_item.li_ailp;
ASSERT(bip->bli_buf == bp);
xfs_buf_rele(bp);
- mp = bip->bli_item.li_mountp;
/*
* If we are forcibly shutting down, this may well be
* off the AIL already. That's because we simulate the
* log-committed callbacks to unpin these buffers. Or we may never
* have put this item on AIL because of the transaction was
- * aborted forcibly. xfs_trans_delete_ail() takes care of these.
+ * aborted forcibly. xfs_trans_ail_delete() takes care of these.
*
* Either way, AIL is useless if we're forcing a shutdown.
*/
- spin_lock(&mp->m_ail_lock);
- /*
- * xfs_trans_delete_ail() drops the AIL lock.
- */
- xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip);
+ spin_lock(&ailp->xa_lock);
+ xfs_trans_ail_delete(ailp, (xfs_log_item_t *)bip);
xfs_buf_item_free(bip);
}
diff --git a/fs/xfs/xfs_clnt.h b/fs/xfs/xfs_clnt.h
deleted file mode 100644
index d2ce5dd70d87..000000000000
--- a/fs/xfs/xfs_clnt.h
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_CLNT_H__
-#define __XFS_CLNT_H__
-
-/*
- * XFS arguments structure, constructed from the arguments we
- * are passed via the mount system call.
- *
- * NOTE: The mount system call is handled differently between
- * Linux and IRIX. In IRIX we worked work with a binary data
- * structure coming in across the syscall interface from user
- * space (the mount userspace knows about each filesystem type
- * and the set of valid options for it, and converts the users
- * argument string into a binary structure _before_ making the
- * system call), and the ABI issues that this implies.
- *
- * In Linux, we are passed a comma separated set of options;
- * ie. a NULL terminated string of characters. Userspace mount
- * code does not have any knowledge of mount options expected by
- * each filesystem type and so each filesystem parses its mount
- * options in kernel space.
- *
- * For the Linux port, we kept this structure pretty much intact
- * and use it internally (because the existing code groks it).
- */
-struct xfs_mount_args {
- int flags; /* flags -> see XFSMNT_... macros below */
- int flags2; /* flags -> see XFSMNT2_... macros below */
- int logbufs; /* Number of log buffers, -1 to default */
- int logbufsize; /* Size of log buffers, -1 to default */
- char fsname[MAXNAMELEN+1]; /* data device name */
- char rtname[MAXNAMELEN+1]; /* realtime device filename */
- char logname[MAXNAMELEN+1]; /* journal device filename */
- char mtpt[MAXNAMELEN+1]; /* filesystem mount point */
- int sunit; /* stripe unit (BBs) */
- int swidth; /* stripe width (BBs), multiple of sunit */
- uchar_t iosizelog; /* log2 of the preferred I/O size */
- int ihashsize; /* inode hash table size (buckets) */
-};
-
-/*
- * XFS mount option flags -- args->flags1
- */
-#define XFSMNT_ATTR2 0x00000001 /* allow ATTR2 EA format */
-#define XFSMNT_WSYNC 0x00000002 /* safe mode nfs mount
- * compatible */
-#define XFSMNT_INO64 0x00000004 /* move inode numbers up
- * past 2^32 */
-#define XFSMNT_UQUOTA 0x00000008 /* user quota accounting */
-#define XFSMNT_PQUOTA 0x00000010 /* IRIX prj quota accounting */
-#define XFSMNT_UQUOTAENF 0x00000020 /* user quota limit
- * enforcement */
-#define XFSMNT_PQUOTAENF 0x00000040 /* IRIX project quota limit
- * enforcement */
-#define XFSMNT_QUIET 0x00000080 /* don't report mount errors */
-#define XFSMNT_NOALIGN 0x00000200 /* don't allocate at
- * stripe boundaries*/
-#define XFSMNT_RETERR 0x00000400 /* return error to user */
-#define XFSMNT_NORECOVERY 0x00000800 /* no recovery, implies
- * read-only mount */
-#define XFSMNT_SHARED 0x00001000 /* shared XFS mount */
-#define XFSMNT_IOSIZE 0x00002000 /* optimize for I/O size */
-#define XFSMNT_OSYNCISOSYNC 0x00004000 /* o_sync is REALLY o_sync */
- /* (osyncisdsync is default) */
-#define XFSMNT_NOATTR2 0x00008000 /* turn off ATTR2 EA format */
-#define XFSMNT_32BITINODES 0x00200000 /* restrict inodes to 32
- * bits of address space */
-#define XFSMNT_GQUOTA 0x00400000 /* group quota accounting */
-#define XFSMNT_GQUOTAENF 0x00800000 /* group quota limit
- * enforcement */
-#define XFSMNT_NOUUID 0x01000000 /* Ignore fs uuid */
-#define XFSMNT_DMAPI 0x02000000 /* enable dmapi/xdsm */
-#define XFSMNT_BARRIER 0x04000000 /* use write barriers */
-#define XFSMNT_IKEEP 0x08000000 /* inode cluster delete */
-#define XFSMNT_SWALLOC 0x10000000 /* turn on stripe width
- * allocation */
-#define XFSMNT_DIRSYNC 0x40000000 /* sync creat,link,unlink,rename
- * symlink,mkdir,rmdir,mknod */
-#define XFSMNT_FLAGS2 0x80000000 /* more flags set in flags2 */
-
-/*
- * XFS mount option flags -- args->flags2
- */
-#define XFSMNT2_COMPAT_IOSIZE 0x00000001 /* don't report large preferred
- * I/O size in stat(2) */
-#define XFSMNT2_FILESTREAMS 0x00000002 /* enable the filestreams
- * allocator */
-
-#endif /* __XFS_CLNT_H__ */
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 8be0b00ede9a..70b710c1792d 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -72,27 +72,7 @@ typedef struct xfs_da_intnode {
typedef struct xfs_da_node_hdr xfs_da_node_hdr_t;
typedef struct xfs_da_node_entry xfs_da_node_entry_t;
-#define XFS_DA_MAXHASH ((xfs_dahash_t)-1) /* largest valid hash value */
-
#define XFS_LBSIZE(mp) (mp)->m_sb.sb_blocksize
-#define XFS_LBLOG(mp) (mp)->m_sb.sb_blocklog
-
-#define XFS_DA_MAKE_BNOENTRY(mp,bno,entry) \
- (((bno) << (mp)->m_dircook_elog) | (entry))
-#define XFS_DA_MAKE_COOKIE(mp,bno,entry,hash) \
- (((xfs_off_t)XFS_DA_MAKE_BNOENTRY(mp, bno, entry) << 32) | (hash))
-#define XFS_DA_COOKIE_HASH(mp,cookie) ((xfs_dahash_t)cookie)
-#define XFS_DA_COOKIE_BNO(mp,cookie) \
- ((((xfs_off_t)(cookie) >> 31) == -1LL ? \
- (xfs_dablk_t)0 : \
- (xfs_dablk_t)((xfs_off_t)(cookie) >> \
- ((mp)->m_dircook_elog + 32))))
-#define XFS_DA_COOKIE_ENTRY(mp,cookie) \
- ((((xfs_off_t)(cookie) >> 31) == -1LL ? \
- (xfs_dablk_t)0 : \
- (xfs_dablk_t)(((xfs_off_t)(cookie) >> 32) & \
- ((1 << (mp)->m_dircook_elog) - 1))))
-
/*========================================================================
* Btree searching and modification structure definitions.
@@ -226,9 +206,8 @@ struct xfs_nameops {
};
-#ifdef __KERNEL__
/*========================================================================
- * Function prototypes for the kernel.
+ * Function prototypes.
*========================================================================*/
/*
@@ -289,6 +268,5 @@ xfs_daddr_t xfs_da_blkno(xfs_dabuf_t *dabuf);
extern struct kmem_zone *xfs_da_state_zone;
extern struct kmem_zone *xfs_dabuf_zone;
-#endif /* __KERNEL__ */
#endif /* __XFS_DA_BTREE_H__ */
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index 75b0cd4da0ea..b4c1ee713492 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -49,9 +49,8 @@
*/
int
xfs_swapext(
- xfs_swapext_t __user *sxu)
+ xfs_swapext_t *sxp)
{
- xfs_swapext_t *sxp;
xfs_inode_t *ip, *tip;
struct file *file, *target_file;
int error = 0;
@@ -62,11 +61,6 @@ xfs_swapext(
goto out;
}
- if (copy_from_user(sxp, sxu, sizeof(xfs_swapext_t))) {
- error = XFS_ERROR(EFAULT);
- goto out_free_sxp;
- }
-
/* Pull information for the target fd */
file = fget((int)sxp->sx_fdtarget);
if (!file) {
diff --git a/fs/xfs/xfs_dfrag.h b/fs/xfs/xfs_dfrag.h
index da178205be68..4f55a6306558 100644
--- a/fs/xfs/xfs_dfrag.h
+++ b/fs/xfs/xfs_dfrag.h
@@ -46,7 +46,7 @@ typedef struct xfs_swapext
/*
* Syscall interface for xfs_swapext
*/
-int xfs_swapext(struct xfs_swapext __user *sx);
+int xfs_swapext(struct xfs_swapext *sx);
int xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,
struct xfs_swapext *sxp);
diff --git a/fs/xfs/xfs_dinode.h b/fs/xfs/xfs_dinode.h
index c9065eaf2a4d..162e8726df5e 100644
--- a/fs/xfs/xfs_dinode.h
+++ b/fs/xfs/xfs_dinode.h
@@ -18,32 +18,29 @@
#ifndef __XFS_DINODE_H__
#define __XFS_DINODE_H__
-struct xfs_buf;
-struct xfs_mount;
+#define XFS_DINODE_MAGIC 0x494e /* 'IN' */
+#define XFS_DINODE_GOOD_VERSION(v) (((v) == 1 || (v) == 2))
-#define XFS_DINODE_VERSION_1 1
-#define XFS_DINODE_VERSION_2 2
-#define XFS_DINODE_GOOD_VERSION(v) \
- (((v) == XFS_DINODE_VERSION_1 || (v) == XFS_DINODE_VERSION_2))
-#define XFS_DINODE_MAGIC 0x494e /* 'IN' */
-
-/*
- * Disk inode structure.
- * This is just the header; the inode is expanded to fill a variable size
- * with the last field expanding. It is split into the core and "other"
- * because we only need the core part in the in-core inode.
- */
typedef struct xfs_timestamp {
__be32 t_sec; /* timestamp seconds */
__be32 t_nsec; /* timestamp nanoseconds */
} xfs_timestamp_t;
/*
- * Note: Coordinate changes to this structure with the XFS_DI_* #defines
- * below, the offsets table in xfs_ialloc_log_di() and struct xfs_icdinode
- * in xfs_inode.h.
+ * On-disk inode structure.
+ *
+ * This is just the header or "dinode core"; the inode is expanded to fill a
+ * variable size, with the leftover area split into a data and an attribute
+ * fork. The format of the data and attribute fork depends on the format of
+ * the inode as indicated by di_format and di_aformat. To access the data and
+ * attribute forks use the XFS_DFORK_DPTR, XFS_DFORK_APTR, and XFS_DFORK_PTR
+ * macros below.
+ *
+ * There is a very similar struct xfs_icdinode in xfs_inode.h which matches
+ * the layout of the first 96 bytes of this structure, but is kept in native
+ * format instead of big endian.
*/
-typedef struct xfs_dinode_core {
+typedef struct xfs_dinode {
__be16 di_magic; /* inode magic # = XFS_DINODE_MAGIC */
__be16 di_mode; /* mode and type of file */
__u8 di_version; /* inode version */
@@ -69,34 +66,12 @@ typedef struct xfs_dinode_core {
__be16 di_dmstate; /* DMIG state info */
__be16 di_flags; /* random flags, XFS_DIFLAG_... */
__be32 di_gen; /* generation number */
-} xfs_dinode_core_t;
-#define DI_MAX_FLUSH 0xffff
+ /* di_next_unlinked is the only non-core field in the old dinode */
+ __be32 di_next_unlinked;/* agi unlinked list ptr */
+} __attribute__((packed)) xfs_dinode_t;
-typedef struct xfs_dinode
-{
- xfs_dinode_core_t di_core;
- /*
- * In adding anything between the core and the union, be
- * sure to update the macros like XFS_LITINO below and
- * XFS_BMAP_RBLOCK_DSIZE in xfs_bmap_btree.h.
- */
- __be32 di_next_unlinked;/* agi unlinked list ptr */
- union {
- xfs_bmdr_block_t di_bmbt; /* btree root block */
- xfs_bmbt_rec_32_t di_bmx[1]; /* extent list */
- xfs_dir2_sf_t di_dir2sf; /* shortform directory v2 */
- char di_c[1]; /* local contents */
- __be32 di_dev; /* device for S_IFCHR/S_IFBLK */
- uuid_t di_muuid; /* mount point value */
- char di_symlink[1]; /* local symbolic link */
- } di_u;
- union {
- xfs_bmdr_block_t di_abmbt; /* btree root block */
- xfs_bmbt_rec_32_t di_abmx[1]; /* extent list */
- xfs_attr_shortform_t di_attrsf; /* shortform attribute list */
- } di_a;
-} xfs_dinode_t;
+#define DI_MAX_FLUSH 0xffff
/*
* The 32 bit link count in the inode theoretically maxes out at UINT_MAX.
@@ -107,50 +82,14 @@ typedef struct xfs_dinode
#define XFS_MAXLINK_1 65535U
/*
- * Bit names for logging disk inodes only
- */
-#define XFS_DI_MAGIC 0x0000001
-#define XFS_DI_MODE 0x0000002
-#define XFS_DI_VERSION 0x0000004
-#define XFS_DI_FORMAT 0x0000008
-#define XFS_DI_ONLINK 0x0000010
-#define XFS_DI_UID 0x0000020
-#define XFS_DI_GID 0x0000040
-#define XFS_DI_NLINK 0x0000080
-#define XFS_DI_PROJID 0x0000100
-#define XFS_DI_PAD 0x0000200
-#define XFS_DI_ATIME 0x0000400
-#define XFS_DI_MTIME 0x0000800
-#define XFS_DI_CTIME 0x0001000
-#define XFS_DI_SIZE 0x0002000
-#define XFS_DI_NBLOCKS 0x0004000
-#define XFS_DI_EXTSIZE 0x0008000
-#define XFS_DI_NEXTENTS 0x0010000
-#define XFS_DI_NAEXTENTS 0x0020000
-#define XFS_DI_FORKOFF 0x0040000
-#define XFS_DI_AFORMAT 0x0080000
-#define XFS_DI_DMEVMASK 0x0100000
-#define XFS_DI_DMSTATE 0x0200000
-#define XFS_DI_FLAGS 0x0400000
-#define XFS_DI_GEN 0x0800000
-#define XFS_DI_NEXT_UNLINKED 0x1000000
-#define XFS_DI_U 0x2000000
-#define XFS_DI_A 0x4000000
-#define XFS_DI_NUM_BITS 27
-#define XFS_DI_ALL_BITS ((1 << XFS_DI_NUM_BITS) - 1)
-#define XFS_DI_CORE_BITS (XFS_DI_ALL_BITS & ~(XFS_DI_U|XFS_DI_A))
-
-/*
* Values for di_format
*/
-typedef enum xfs_dinode_fmt
-{
- XFS_DINODE_FMT_DEV, /* CHR, BLK: di_dev */
- XFS_DINODE_FMT_LOCAL, /* DIR, REG: di_c */
- /* LNK: di_symlink */
- XFS_DINODE_FMT_EXTENTS, /* DIR, REG, LNK: di_bmx */
- XFS_DINODE_FMT_BTREE, /* DIR, REG, LNK: di_bmbt */
- XFS_DINODE_FMT_UUID /* MNT: di_uuid */
+typedef enum xfs_dinode_fmt {
+ XFS_DINODE_FMT_DEV, /* xfs_dev_t */
+ XFS_DINODE_FMT_LOCAL, /* bulk data */
+ XFS_DINODE_FMT_EXTENTS, /* struct xfs_bmbt_rec */
+ XFS_DINODE_FMT_BTREE, /* struct xfs_bmdr_block */
+ XFS_DINODE_FMT_UUID /* uuid_t */
} xfs_dinode_fmt_t;
/*
@@ -166,13 +105,13 @@ typedef enum xfs_dinode_fmt
*/
#define XFS_LITINO(mp) ((mp)->m_litino)
#define XFS_BROOT_SIZE_ADJ \
- (sizeof(xfs_bmbt_block_t) - sizeof(xfs_bmdr_block_t))
+ (XFS_BTREE_LBLOCK_LEN - sizeof(xfs_bmdr_block_t))
/*
* Inode data & attribute fork sizes, per inode.
*/
-#define XFS_DFORK_Q(dip) ((dip)->di_core.di_forkoff != 0)
-#define XFS_DFORK_BOFF(dip) ((int)((dip)->di_core.di_forkoff << 3))
+#define XFS_DFORK_Q(dip) ((dip)->di_forkoff != 0)
+#define XFS_DFORK_BOFF(dip) ((int)((dip)->di_forkoff << 3))
#define XFS_DFORK_DSIZE(dip,mp) \
(XFS_DFORK_Q(dip) ? \
@@ -187,23 +126,42 @@ typedef enum xfs_dinode_fmt
XFS_DFORK_DSIZE(dip, mp) : \
XFS_DFORK_ASIZE(dip, mp))
-#define XFS_DFORK_DPTR(dip) ((dip)->di_u.di_c)
+/*
+ * Return pointers to the data or attribute forks.
+ */
+#define XFS_DFORK_DPTR(dip) \
+ ((char *)(dip) + sizeof(struct xfs_dinode))
#define XFS_DFORK_APTR(dip) \
- ((dip)->di_u.di_c + XFS_DFORK_BOFF(dip))
+ (XFS_DFORK_DPTR(dip) + XFS_DFORK_BOFF(dip))
#define XFS_DFORK_PTR(dip,w) \
((w) == XFS_DATA_FORK ? XFS_DFORK_DPTR(dip) : XFS_DFORK_APTR(dip))
+
#define XFS_DFORK_FORMAT(dip,w) \
((w) == XFS_DATA_FORK ? \
- (dip)->di_core.di_format : \
- (dip)->di_core.di_aformat)
+ (dip)->di_format : \
+ (dip)->di_aformat)
#define XFS_DFORK_NEXTENTS(dip,w) \
((w) == XFS_DATA_FORK ? \
- be32_to_cpu((dip)->di_core.di_nextents) : \
- be16_to_cpu((dip)->di_core.di_anextents))
+ be32_to_cpu((dip)->di_nextents) : \
+ be16_to_cpu((dip)->di_anextents))
#define XFS_BUF_TO_DINODE(bp) ((xfs_dinode_t *)XFS_BUF_PTR(bp))
/*
+ * For block and character special files the 32bit dev_t is stored at the
+ * beginning of the data fork.
+ */
+static inline xfs_dev_t xfs_dinode_get_rdev(struct xfs_dinode *dip)
+{
+ return be32_to_cpu(*(__be32 *)XFS_DFORK_DPTR(dip));
+}
+
+static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
+{
+ *(__be32 *)XFS_DFORK_DPTR(dip) = cpu_to_be32(rdev);
+}
+
+/*
* Values for di_flags
* There should be a one-to-one correspondence between these flags and the
* XFS_XFLAG_s.
diff --git a/fs/xfs/xfs_dir2_sf.h b/fs/xfs/xfs_dir2_sf.h
index deecc9d238f8..6ac44b550d39 100644
--- a/fs/xfs/xfs_dir2_sf.h
+++ b/fs/xfs/xfs_dir2_sf.h
@@ -34,13 +34,6 @@ struct xfs_mount;
struct xfs_trans;
/*
- * Maximum size of a shortform directory.
- */
-#define XFS_DIR2_SF_MAX_SIZE \
- (XFS_DINODE_MAX_SIZE - (uint)sizeof(xfs_dinode_core_t) - \
- (uint)sizeof(xfs_agino_t))
-
-/*
* Inode number stored as 8 8-bit values.
*/
typedef struct { __uint8_t i[8]; } xfs_dir2_ino8_t;
diff --git a/fs/xfs/xfs_dmops.c b/fs/xfs/xfs_dmops.c
index a1e55fb9d5dd..e71e2581c0c3 100644
--- a/fs/xfs/xfs_dmops.c
+++ b/fs/xfs/xfs_dmops.c
@@ -25,7 +25,6 @@
#include "xfs_inum.h"
#include "xfs_ag.h"
#include "xfs_mount.h"
-#include "xfs_clnt.h"
static struct xfs_dmops xfs_dmcore_stub = {
@@ -38,9 +37,9 @@ static struct xfs_dmops xfs_dmcore_stub = {
};
int
-xfs_dmops_get(struct xfs_mount *mp, struct xfs_mount_args *args)
+xfs_dmops_get(struct xfs_mount *mp)
{
- if (args->flags & XFSMNT_DMAPI) {
+ if (mp->m_flags & XFS_MOUNT_DMAPI) {
cmn_err(CE_WARN,
"XFS: dmapi support not available in this kernel.");
return EINVAL;
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index f227ecd1a294..92d5cd5bf4f2 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -153,21 +153,6 @@ xfs_errortag_clearall(xfs_mount_t *mp, int loud)
}
#endif /* DEBUG */
-static void
-xfs_fs_vcmn_err(int level, xfs_mount_t *mp, char *fmt, va_list ap)
-{
- if (mp != NULL) {
- char *newfmt;
- int len = 16 + mp->m_fsname_len + strlen(fmt);
-
- newfmt = kmem_alloc(len, KM_SLEEP);
- sprintf(newfmt, "Filesystem \"%s\": %s", mp->m_fsname, fmt);
- icmn_err(level, newfmt, ap);
- kmem_free(newfmt);
- } else {
- icmn_err(level, fmt, ap);
- }
-}
void
xfs_fs_cmn_err(int level, xfs_mount_t *mp, char *fmt, ...)
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 11543f10b0c6..0c93051c4651 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -159,11 +159,15 @@ extern int xfs_errortag_clearall(xfs_mount_t *mp, int loud);
#define XFS_PTAG_FSBLOCK_ZERO 0x00000080
struct xfs_mount;
-/* PRINTFLIKE4 */
+
+extern void xfs_fs_vcmn_err(int level, struct xfs_mount *mp,
+ char *fmt, va_list ap)
+ __attribute__ ((format (printf, 3, 0)));
extern void xfs_cmn_err(int panic_tag, int level, struct xfs_mount *mp,
- char *fmt, ...);
-/* PRINTFLIKE3 */
-extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...);
+ char *fmt, ...)
+ __attribute__ ((format (printf, 4, 5)));
+extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...)
+ __attribute__ ((format (printf, 3, 4)));
extern void xfs_hex_dump(void *p, int length);
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 8aa28f751b2a..05a4bdd4be39 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -108,19 +108,16 @@ xfs_efi_item_pin(xfs_efi_log_item_t *efip)
STATIC void
xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale)
{
- xfs_mount_t *mp;
+ struct xfs_ail *ailp = efip->efi_item.li_ailp;
- mp = efip->efi_item.li_mountp;
- spin_lock(&mp->m_ail_lock);
+ spin_lock(&ailp->xa_lock);
if (efip->efi_flags & XFS_EFI_CANCELED) {
- /*
- * xfs_trans_delete_ail() drops the AIL lock.
- */
- xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip);
+ /* xfs_trans_ail_delete() drops the AIL lock. */
+ xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
xfs_efi_item_free(efip);
} else {
efip->efi_flags |= XFS_EFI_COMMITTED;
- spin_unlock(&mp->m_ail_lock);
+ spin_unlock(&ailp->xa_lock);
}
}
@@ -134,26 +131,23 @@ xfs_efi_item_unpin(xfs_efi_log_item_t *efip, int stale)
STATIC void
xfs_efi_item_unpin_remove(xfs_efi_log_item_t *efip, xfs_trans_t *tp)
{
- xfs_mount_t *mp;
+ struct xfs_ail *ailp = efip->efi_item.li_ailp;
xfs_log_item_desc_t *lidp;
- mp = efip->efi_item.li_mountp;
- spin_lock(&mp->m_ail_lock);
+ spin_lock(&ailp->xa_lock);
if (efip->efi_flags & XFS_EFI_CANCELED) {
/*
* free the xaction descriptor pointing to this item
*/
lidp = xfs_trans_find_item(tp, (xfs_log_item_t *) efip);
xfs_trans_free_item(tp, lidp);
- /*
- * pull the item off the AIL.
- * xfs_trans_delete_ail() drops the AIL lock.
- */
- xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip);
+
+ /* xfs_trans_ail_delete() drops the AIL lock. */
+ xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
xfs_efi_item_free(efip);
} else {
efip->efi_flags |= XFS_EFI_COMMITTED;
- spin_unlock(&mp->m_ail_lock);
+ spin_unlock(&ailp->xa_lock);
}
}
@@ -268,6 +262,7 @@ xfs_efi_init(xfs_mount_t *mp,
efip->efi_item.li_type = XFS_LI_EFI;
efip->efi_item.li_ops = &xfs_efi_item_ops;
efip->efi_item.li_mountp = mp;
+ efip->efi_item.li_ailp = mp->m_ail;
efip->efi_format.efi_nextents = nextents;
efip->efi_format.efi_id = (__psint_t)(void*)efip;
@@ -345,25 +340,22 @@ void
xfs_efi_release(xfs_efi_log_item_t *efip,
uint nextents)
{
- xfs_mount_t *mp;
- int extents_left;
+ struct xfs_ail *ailp = efip->efi_item.li_ailp;
+ int extents_left;
- mp = efip->efi_item.li_mountp;
ASSERT(efip->efi_next_extent > 0);
ASSERT(efip->efi_flags & XFS_EFI_COMMITTED);
- spin_lock(&mp->m_ail_lock);
+ spin_lock(&ailp->xa_lock);
ASSERT(efip->efi_next_extent >= nextents);
efip->efi_next_extent -= nextents;
extents_left = efip->efi_next_extent;
if (extents_left == 0) {
- /*
- * xfs_trans_delete_ail() drops the AIL lock.
- */
- xfs_trans_delete_ail(mp, (xfs_log_item_t *)efip);
+ /* xfs_trans_ail_delete() drops the AIL lock. */
+ xfs_trans_ail_delete(ailp, (xfs_log_item_t *)efip);
xfs_efi_item_free(efip);
} else {
- spin_unlock(&mp->m_ail_lock);
+ spin_unlock(&ailp->xa_lock);
}
}
@@ -565,6 +557,7 @@ xfs_efd_init(xfs_mount_t *mp,
efdp->efd_item.li_type = XFS_LI_EFD;
efdp->efd_item.li_ops = &xfs_efd_item_ops;
efdp->efd_item.li_mountp = mp;
+ efdp->efd_item.li_ailp = mp->m_ail;
efdp->efd_efip = efip;
efdp->efd_format.efd_nextents = nextents;
efdp->efd_format.efd_efi_id = efip->efi_format.efi_id;
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index 01c0cc88d3f3..589c41c38446 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -113,22 +113,14 @@ struct getbmapx {
#define BMV_IF_ATTRFORK 0x1 /* return attr fork rather than data */
#define BMV_IF_NO_DMAPI_READ 0x2 /* Do not generate DMAPI read event */
#define BMV_IF_PREALLOC 0x4 /* rtn status BMV_OF_PREALLOC if req */
-#define BMV_IF_VALID (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC)
-#ifdef __KERNEL__
-#define BMV_IF_EXTENDED 0x40000000 /* getpmapx if set */
-#endif
+#define BMV_IF_DELALLOC 0x8 /* rtn status BMV_OF_DELALLOC if req */
+#define BMV_IF_VALID \
+ (BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|BMV_IF_DELALLOC)
/* bmv_oflags values - returned for for each non-header segment */
#define BMV_OF_PREALLOC 0x1 /* segment = unwritten pre-allocation */
-
-/* Convert getbmap <-> getbmapx - move fields from p1 to p2. */
-#define GETBMAP_CONVERT(p1,p2) { \
- p2.bmv_offset = p1.bmv_offset; \
- p2.bmv_block = p1.bmv_block; \
- p2.bmv_length = p1.bmv_length; \
- p2.bmv_count = p1.bmv_count; \
- p2.bmv_entries = p1.bmv_entries; }
-
+#define BMV_OF_DELALLOC 0x2 /* segment = delayed allocation */
+#define BMV_OF_LAST 0x4 /* segment is the last in the file */
/*
* Structure for XFS_IOC_FSSETDM.
@@ -426,10 +418,6 @@ typedef struct xfs_handle {
#define XFS_IOC_GETXFLAGS FS_IOC_GETFLAGS
#define XFS_IOC_SETXFLAGS FS_IOC_SETFLAGS
#define XFS_IOC_GETVERSION FS_IOC_GETVERSION
-/* 32-bit compat counterparts */
-#define XFS_IOC32_GETXFLAGS FS_IOC32_GETFLAGS
-#define XFS_IOC32_SETXFLAGS FS_IOC32_SETFLAGS
-#define XFS_IOC32_GETVERSION FS_IOC32_GETVERSION
/*
* ioctl commands that replace IRIX fcntl()'s
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 84583cf73db3..852b6d32e8d0 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -126,7 +126,7 @@ xfs_growfs_data_private(
xfs_extlen_t agsize;
xfs_extlen_t tmpsize;
xfs_alloc_rec_t *arec;
- xfs_btree_sblock_t *block;
+ struct xfs_btree_block *block;
xfs_buf_t *bp;
int bucket;
int dpct;
@@ -251,14 +251,14 @@ xfs_growfs_data_private(
bp = xfs_buf_get(mp->m_ddev_targp,
XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
BTOBB(mp->m_sb.sb_blocksize), 0);
- block = XFS_BUF_TO_SBLOCK(bp);
+ block = XFS_BUF_TO_BLOCK(bp);
memset(block, 0, mp->m_sb.sb_blocksize);
block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC);
block->bb_level = 0;
block->bb_numrecs = cpu_to_be16(1);
- block->bb_leftsib = cpu_to_be32(NULLAGBLOCK);
- block->bb_rightsib = cpu_to_be32(NULLAGBLOCK);
- arec = XFS_BTREE_REC_ADDR(xfs_alloc, block, 1);
+ block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
+ block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
+ arec = XFS_ALLOC_REC_ADDR(mp, block, 1);
arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
arec->ar_blockcount = cpu_to_be32(
agsize - be32_to_cpu(arec->ar_startblock));
@@ -272,14 +272,14 @@ xfs_growfs_data_private(
bp = xfs_buf_get(mp->m_ddev_targp,
XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
BTOBB(mp->m_sb.sb_blocksize), 0);
- block = XFS_BUF_TO_SBLOCK(bp);
+ block = XFS_BUF_TO_BLOCK(bp);
memset(block, 0, mp->m_sb.sb_blocksize);
block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC);
block->bb_level = 0;
block->bb_numrecs = cpu_to_be16(1);
- block->bb_leftsib = cpu_to_be32(NULLAGBLOCK);
- block->bb_rightsib = cpu_to_be32(NULLAGBLOCK);
- arec = XFS_BTREE_REC_ADDR(xfs_alloc, block, 1);
+ block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
+ block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
+ arec = XFS_ALLOC_REC_ADDR(mp, block, 1);
arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
arec->ar_blockcount = cpu_to_be32(
agsize - be32_to_cpu(arec->ar_startblock));
@@ -294,13 +294,13 @@ xfs_growfs_data_private(
bp = xfs_buf_get(mp->m_ddev_targp,
XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
BTOBB(mp->m_sb.sb_blocksize), 0);
- block = XFS_BUF_TO_SBLOCK(bp);
+ block = XFS_BUF_TO_BLOCK(bp);
memset(block, 0, mp->m_sb.sb_blocksize);
block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC);
block->bb_level = 0;
block->bb_numrecs = 0;
- block->bb_leftsib = cpu_to_be32(NULLAGBLOCK);
- block->bb_rightsib = cpu_to_be32(NULLAGBLOCK);
+ block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
+ block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
error = xfs_bwrite(mp, bp);
if (error) {
goto error0;
@@ -435,6 +435,9 @@ xfs_growfs_data(
xfs_growfs_data_t *in)
{
int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return XFS_ERROR(EPERM);
if (!mutex_trylock(&mp->m_growlock))
return XFS_ERROR(EWOULDBLOCK);
error = xfs_growfs_data_private(mp, in);
@@ -448,6 +451,9 @@ xfs_growfs_log(
xfs_growfs_log_t *in)
{
int error;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return XFS_ERROR(EPERM);
if (!mutex_trylock(&mp->m_growlock))
return XFS_ERROR(EWOULDBLOCK);
error = xfs_growfs_log_private(mp, in);
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index aad8c5da38af..e6ebbaeb4dc6 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -41,68 +41,6 @@
#include "xfs_error.h"
#include "xfs_bmap.h"
-/*
- * Log specified fields for the inode given by bp and off.
- */
-STATIC void
-xfs_ialloc_log_di(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_buf_t *bp, /* inode buffer */
- int off, /* index of inode in buffer */
- int fields) /* bitmask of fields to log */
-{
- int first; /* first byte number */
- int ioffset; /* off in bytes */
- int last; /* last byte number */
- xfs_mount_t *mp; /* mount point structure */
- static const short offsets[] = { /* field offsets */
- /* keep in sync with bits */
- offsetof(xfs_dinode_core_t, di_magic),
- offsetof(xfs_dinode_core_t, di_mode),
- offsetof(xfs_dinode_core_t, di_version),
- offsetof(xfs_dinode_core_t, di_format),
- offsetof(xfs_dinode_core_t, di_onlink),
- offsetof(xfs_dinode_core_t, di_uid),
- offsetof(xfs_dinode_core_t, di_gid),
- offsetof(xfs_dinode_core_t, di_nlink),
- offsetof(xfs_dinode_core_t, di_projid),
- offsetof(xfs_dinode_core_t, di_pad),
- offsetof(xfs_dinode_core_t, di_atime),
- offsetof(xfs_dinode_core_t, di_mtime),
- offsetof(xfs_dinode_core_t, di_ctime),
- offsetof(xfs_dinode_core_t, di_size),
- offsetof(xfs_dinode_core_t, di_nblocks),
- offsetof(xfs_dinode_core_t, di_extsize),
- offsetof(xfs_dinode_core_t, di_nextents),
- offsetof(xfs_dinode_core_t, di_anextents),
- offsetof(xfs_dinode_core_t, di_forkoff),
- offsetof(xfs_dinode_core_t, di_aformat),
- offsetof(xfs_dinode_core_t, di_dmevmask),
- offsetof(xfs_dinode_core_t, di_dmstate),
- offsetof(xfs_dinode_core_t, di_flags),
- offsetof(xfs_dinode_core_t, di_gen),
- offsetof(xfs_dinode_t, di_next_unlinked),
- offsetof(xfs_dinode_t, di_u),
- offsetof(xfs_dinode_t, di_a),
- sizeof(xfs_dinode_t)
- };
-
-
- ASSERT(offsetof(xfs_dinode_t, di_core) == 0);
- ASSERT((fields & (XFS_DI_U|XFS_DI_A)) == 0);
- mp = tp->t_mountp;
- /*
- * Get the inode-relative first and last bytes for these fields
- */
- xfs_btree_offsets(fields, offsets, XFS_DI_NUM_BITS, &first, &last);
- /*
- * Convert to buffer offsets and log it.
- */
- ioffset = off << mp->m_sb.sb_inodelog;
- first += ioffset;
- last += ioffset;
- xfs_trans_log_buf(tp, bp, first, last);
-}
/*
* Allocation group level functions.
@@ -119,6 +57,102 @@ xfs_ialloc_cluster_alignment(
}
/*
+ * Lookup the record equal to ino in the btree given by cur.
+ */
+STATIC int /* error */
+xfs_inobt_lookup_eq(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agino_t ino, /* starting inode of chunk */
+ __int32_t fcnt, /* free inode count */
+ xfs_inofree_t free, /* free inode mask */
+ int *stat) /* success/failure */
+{
+ cur->bc_rec.i.ir_startino = ino;
+ cur->bc_rec.i.ir_freecount = fcnt;
+ cur->bc_rec.i.ir_free = free;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
+}
+
+/*
+ * Lookup the first record greater than or equal to ino
+ * in the btree given by cur.
+ */
+int /* error */
+xfs_inobt_lookup_ge(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agino_t ino, /* starting inode of chunk */
+ __int32_t fcnt, /* free inode count */
+ xfs_inofree_t free, /* free inode mask */
+ int *stat) /* success/failure */
+{
+ cur->bc_rec.i.ir_startino = ino;
+ cur->bc_rec.i.ir_freecount = fcnt;
+ cur->bc_rec.i.ir_free = free;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+}
+
+/*
+ * Lookup the first record less than or equal to ino
+ * in the btree given by cur.
+ */
+int /* error */
+xfs_inobt_lookup_le(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agino_t ino, /* starting inode of chunk */
+ __int32_t fcnt, /* free inode count */
+ xfs_inofree_t free, /* free inode mask */
+ int *stat) /* success/failure */
+{
+ cur->bc_rec.i.ir_startino = ino;
+ cur->bc_rec.i.ir_freecount = fcnt;
+ cur->bc_rec.i.ir_free = free;
+ return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+}
+
+/*
+ * Update the record referred to by cur to the value given
+ * by [ino, fcnt, free].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int /* error */
+xfs_inobt_update(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agino_t ino, /* starting inode of chunk */
+ __int32_t fcnt, /* free inode count */
+ xfs_inofree_t free) /* free inode mask */
+{
+ union xfs_btree_rec rec;
+
+ rec.inobt.ir_startino = cpu_to_be32(ino);
+ rec.inobt.ir_freecount = cpu_to_be32(fcnt);
+ rec.inobt.ir_free = cpu_to_be64(free);
+ return xfs_btree_update(cur, &rec);
+}
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int /* error */
+xfs_inobt_get_rec(
+ struct xfs_btree_cur *cur, /* btree cursor */
+ xfs_agino_t *ino, /* output: starting inode of chunk */
+ __int32_t *fcnt, /* output: number of free inodes */
+ xfs_inofree_t *free, /* output: free inode mask */
+ int *stat) /* output: success/failure */
+{
+ union xfs_btree_rec *rec;
+ int error;
+
+ error = xfs_btree_get_rec(cur, &rec, stat);
+ if (!error && *stat == 1) {
+ *ino = be32_to_cpu(rec->inobt.ir_startino);
+ *fcnt = be32_to_cpu(rec->inobt.ir_freecount);
+ *free = be64_to_cpu(rec->inobt.ir_free);
+ }
+ return error;
+}
+
+/*
* Allocate new inodes in the allocation group specified by agbp.
* Return 0 for success, else error code.
*/
@@ -287,9 +321,9 @@ xfs_ialloc_ag_alloc(
* able to use the file system.
*/
if (xfs_sb_version_hasnlink(&args.mp->m_sb))
- version = XFS_DINODE_VERSION_2;
+ version = 2;
else
- version = XFS_DINODE_VERSION_1;
+ version = 1;
/*
* Seed the new inode cluster with a random generation number. This
@@ -310,18 +344,25 @@ xfs_ialloc_ag_alloc(
XFS_BUF_LOCK);
ASSERT(fbuf);
ASSERT(!XFS_BUF_GETERROR(fbuf));
+
/*
- * Set initial values for the inodes in this buffer.
+ * Initialize all inodes in this buffer and then log them.
+ *
+ * XXX: It would be much better if we had just one transaction to
+ * log a whole cluster of inodes instead of all the individual
+ * transactions causing a lot of log traffic.
*/
xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog);
for (i = 0; i < ninodes; i++) {
+ int ioffset = i << args.mp->m_sb.sb_inodelog;
+ uint isize = sizeof(struct xfs_dinode);
+
free = XFS_MAKE_IPTR(args.mp, fbuf, i);
- free->di_core.di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
- free->di_core.di_version = version;
- free->di_core.di_gen = cpu_to_be32(gen);
+ free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
+ free->di_version = version;
+ free->di_gen = cpu_to_be32(gen);
free->di_next_unlinked = cpu_to_be32(NULLAGINO);
- xfs_ialloc_log_di(tp, fbuf, i,
- XFS_DI_CORE_BITS | XFS_DI_NEXT_UNLINKED);
+ xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1);
}
xfs_trans_inode_alloc_buf(tp, fbuf);
}
@@ -335,8 +376,7 @@ xfs_ialloc_ag_alloc(
/*
* Insert records describing the new inode chunk into the btree.
*/
- cur = xfs_btree_init_cursor(args.mp, tp, agbp, agno,
- XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
+ cur = xfs_inobt_init_cursor(args.mp, tp, agbp, agno);
for (thisino = newino;
thisino < newino + newlen;
thisino += XFS_INODES_PER_CHUNK) {
@@ -346,7 +386,7 @@ xfs_ialloc_ag_alloc(
return error;
}
ASSERT(i == 0);
- if ((error = xfs_inobt_insert(cur, &i))) {
+ if ((error = xfs_btree_insert(cur, &i))) {
xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
return error;
}
@@ -676,8 +716,7 @@ nextag:
*/
agno = tagno;
*IO_agbp = NULL;
- cur = xfs_btree_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno),
- XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno));
/*
* If pagino is 0 (this is the root inode allocation) use newino.
* This must work because we've just allocated some.
@@ -697,7 +736,7 @@ nextag:
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
freecount += rec.ir_freecount;
- if ((error = xfs_inobt_increment(cur, 0, &i)))
+ if ((error = xfs_btree_increment(cur, 0, &i)))
goto error0;
} while (i == 1);
@@ -741,7 +780,7 @@ nextag:
/*
* Search left with tcur, back up 1 record.
*/
- if ((error = xfs_inobt_decrement(tcur, 0, &i)))
+ if ((error = xfs_btree_decrement(tcur, 0, &i)))
goto error1;
doneleft = !i;
if (!doneleft) {
@@ -755,7 +794,7 @@ nextag:
/*
* Search right with cur, go forward 1 record.
*/
- if ((error = xfs_inobt_increment(cur, 0, &i)))
+ if ((error = xfs_btree_increment(cur, 0, &i)))
goto error1;
doneright = !i;
if (!doneright) {
@@ -817,7 +856,7 @@ nextag:
* further left.
*/
if (useleft) {
- if ((error = xfs_inobt_decrement(tcur, 0,
+ if ((error = xfs_btree_decrement(tcur, 0,
&i)))
goto error1;
doneleft = !i;
@@ -837,7 +876,7 @@ nextag:
* further right.
*/
else {
- if ((error = xfs_inobt_increment(cur, 0,
+ if ((error = xfs_btree_increment(cur, 0,
&i)))
goto error1;
doneright = !i;
@@ -892,7 +931,7 @@ nextag:
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
if (rec.ir_freecount > 0)
break;
- if ((error = xfs_inobt_increment(cur, 0, &i)))
+ if ((error = xfs_btree_increment(cur, 0, &i)))
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
}
@@ -926,7 +965,7 @@ nextag:
goto error0;
XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
freecount += rec.ir_freecount;
- if ((error = xfs_inobt_increment(cur, 0, &i)))
+ if ((error = xfs_btree_increment(cur, 0, &i)))
goto error0;
} while (i == 1);
ASSERT(freecount == be32_to_cpu(agi->agi_freecount) ||
@@ -1022,8 +1061,7 @@ xfs_difree(
/*
* Initialize the cursor.
*/
- cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO,
- (xfs_inode_t *)0, 0);
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
#ifdef DEBUG
if (cur->bc_nlevels == 1) {
int freecount = 0;
@@ -1036,7 +1074,7 @@ xfs_difree(
goto error0;
if (i) {
freecount += rec.ir_freecount;
- if ((error = xfs_inobt_increment(cur, 0, &i)))
+ if ((error = xfs_btree_increment(cur, 0, &i)))
goto error0;
}
} while (i == 1);
@@ -1098,8 +1136,8 @@ xfs_difree(
xfs_trans_mod_sb(tp, XFS_TRANS_SB_ICOUNT, -ilen);
xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -(ilen - 1));
- if ((error = xfs_inobt_delete(cur, &i))) {
- cmn_err(CE_WARN, "xfs_difree: xfs_inobt_delete returned an error %d on %s.\n",
+ if ((error = xfs_btree_delete(cur, &i))) {
+ cmn_err(CE_WARN, "xfs_difree: xfs_btree_delete returned an error %d on %s.\n",
error, mp->m_fsname);
goto error0;
}
@@ -1141,7 +1179,7 @@ xfs_difree(
goto error0;
if (i) {
freecount += rec.ir_freecount;
- if ((error = xfs_inobt_increment(cur, 0, &i)))
+ if ((error = xfs_btree_increment(cur, 0, &i)))
goto error0;
}
} while (i == 1);
@@ -1158,36 +1196,28 @@ error0:
}
/*
- * Return the location of the inode in bno/off, for mapping it into a buffer.
+ * Return the location of the inode in imap, for mapping it into a buffer.
*/
-/*ARGSUSED*/
int
-xfs_dilocate(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
+xfs_imap(
+ xfs_mount_t *mp, /* file system mount structure */
+ xfs_trans_t *tp, /* transaction pointer */
xfs_ino_t ino, /* inode to locate */
- xfs_fsblock_t *bno, /* output: block containing inode */
- int *len, /* output: num blocks in inode cluster */
- int *off, /* output: index in block of inode */
- uint flags) /* flags concerning inode lookup */
+ struct xfs_imap *imap, /* location map structure */
+ uint flags) /* flags for inode btree lookup */
{
xfs_agblock_t agbno; /* block number of inode in the alloc group */
- xfs_buf_t *agbp; /* agi buffer */
xfs_agino_t agino; /* inode number within alloc group */
xfs_agnumber_t agno; /* allocation group number */
int blks_per_cluster; /* num blocks per inode cluster */
xfs_agblock_t chunk_agbno; /* first block in inode chunk */
- xfs_agino_t chunk_agino; /* first agino in inode chunk */
- __int32_t chunk_cnt; /* count of free inodes in chunk */
- xfs_inofree_t chunk_free; /* mask of free inodes in chunk */
xfs_agblock_t cluster_agbno; /* first block in inode cluster */
- xfs_btree_cur_t *cur; /* inode btree cursor */
int error; /* error code */
- int i; /* temp state */
int offset; /* index of inode in its buffer */
int offset_agbno; /* blks from chunk start to inode */
ASSERT(ino != NULLFSINO);
+
/*
* Split up the inode number into its parts.
*/
@@ -1198,24 +1228,24 @@ xfs_dilocate(
ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
#ifdef DEBUG
/* no diagnostics for bulkstat, ino comes from userspace */
- if (flags & XFS_IMAP_BULKSTAT)
+ if (flags & XFS_IGET_BULKSTAT)
return XFS_ERROR(EINVAL);
if (agno >= mp->m_sb.sb_agcount) {
xfs_fs_cmn_err(CE_ALERT, mp,
- "xfs_dilocate: agno (%d) >= "
+ "xfs_imap: agno (%d) >= "
"mp->m_sb.sb_agcount (%d)",
agno, mp->m_sb.sb_agcount);
}
if (agbno >= mp->m_sb.sb_agblocks) {
xfs_fs_cmn_err(CE_ALERT, mp,
- "xfs_dilocate: agbno (0x%llx) >= "
+ "xfs_imap: agbno (0x%llx) >= "
"mp->m_sb.sb_agblocks (0x%lx)",
(unsigned long long) agbno,
(unsigned long) mp->m_sb.sb_agblocks);
}
if (ino != XFS_AGINO_TO_INO(mp, agno, agino)) {
xfs_fs_cmn_err(CE_ALERT, mp,
- "xfs_dilocate: ino (0x%llx) != "
+ "xfs_imap: ino (0x%llx) != "
"XFS_AGINO_TO_INO(mp, agno, agino) "
"(0x%llx)",
ino, XFS_AGINO_TO_INO(mp, agno, agino));
@@ -1224,65 +1254,89 @@ xfs_dilocate(
#endif /* DEBUG */
return XFS_ERROR(EINVAL);
}
- if ((mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) ||
- !(flags & XFS_IMAP_LOOKUP)) {
+
+ /*
+ * If the inode cluster size is the same as the blocksize or
+ * smaller we get to the buffer by simple arithmetic.
+ */
+ if (XFS_INODE_CLUSTER_SIZE(mp) <= mp->m_sb.sb_blocksize) {
offset = XFS_INO_TO_OFFSET(mp, ino);
ASSERT(offset < mp->m_sb.sb_inopblock);
- *bno = XFS_AGB_TO_FSB(mp, agno, agbno);
- *off = offset;
- *len = 1;
+
+ imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno);
+ imap->im_len = XFS_FSB_TO_BB(mp, 1);
+ imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
return 0;
}
+
blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_blocklog;
- if (*bno != NULLFSBLOCK) {
+
+ /*
+ * If we get a block number passed from bulkstat we can use it to
+ * find the buffer easily.
+ */
+ if (imap->im_blkno) {
offset = XFS_INO_TO_OFFSET(mp, ino);
ASSERT(offset < mp->m_sb.sb_inopblock);
- cluster_agbno = XFS_FSB_TO_AGBNO(mp, *bno);
- *off = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
- offset;
- *len = blks_per_cluster;
+
+ cluster_agbno = XFS_DADDR_TO_AGBNO(mp, imap->im_blkno);
+ offset += (agbno - cluster_agbno) * mp->m_sb.sb_inopblock;
+
+ imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
+ imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
return 0;
}
+
+ /*
+ * If the inode chunks are aligned then use simple maths to
+ * find the location. Otherwise we have to do a btree
+ * lookup to find the location.
+ */
if (mp->m_inoalign_mask) {
offset_agbno = agbno & mp->m_inoalign_mask;
chunk_agbno = agbno - offset_agbno;
} else {
+ xfs_btree_cur_t *cur; /* inode btree cursor */
+ xfs_agino_t chunk_agino; /* first agino in inode chunk */
+ __int32_t chunk_cnt; /* count of free inodes in chunk */
+ xfs_inofree_t chunk_free; /* mask of free inodes in chunk */
+ xfs_buf_t *agbp; /* agi buffer */
+ int i; /* temp state */
+
down_read(&mp->m_peraglock);
error = xfs_ialloc_read_agi(mp, tp, agno, &agbp);
up_read(&mp->m_peraglock);
if (error) {
-#ifdef DEBUG
- xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: "
+ xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
"xfs_ialloc_read_agi() returned "
"error %d, agno %d",
error, agno);
-#endif /* DEBUG */
return error;
}
- cur = xfs_btree_init_cursor(mp, tp, agbp, agno, XFS_BTNUM_INO,
- (xfs_inode_t *)0, 0);
- if ((error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i))) {
-#ifdef DEBUG
- xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: "
+
+ cur = xfs_inobt_init_cursor(mp, tp, agbp, agno);
+ error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i);
+ if (error) {
+ xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
"xfs_inobt_lookup_le() failed");
-#endif /* DEBUG */
goto error0;
}
- if ((error = xfs_inobt_get_rec(cur, &chunk_agino, &chunk_cnt,
- &chunk_free, &i))) {
-#ifdef DEBUG
- xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: "
+
+ error = xfs_inobt_get_rec(cur, &chunk_agino, &chunk_cnt,
+ &chunk_free, &i);
+ if (error) {
+ xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
"xfs_inobt_get_rec() failed");
-#endif /* DEBUG */
goto error0;
}
if (i == 0) {
#ifdef DEBUG
- xfs_fs_cmn_err(CE_ALERT, mp, "xfs_dilocate: "
+ xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
"xfs_inobt_get_rec() failed");
#endif /* DEBUG */
error = XFS_ERROR(EINVAL);
}
+ error0:
xfs_trans_brelse(tp, agbp);
xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
if (error)
@@ -1290,19 +1344,35 @@ xfs_dilocate(
chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_agino);
offset_agbno = agbno - chunk_agbno;
}
+
ASSERT(agbno >= chunk_agbno);
cluster_agbno = chunk_agbno +
((offset_agbno / blks_per_cluster) * blks_per_cluster);
offset = ((agbno - cluster_agbno) * mp->m_sb.sb_inopblock) +
XFS_INO_TO_OFFSET(mp, ino);
- *bno = XFS_AGB_TO_FSB(mp, agno, cluster_agbno);
- *off = offset;
- *len = blks_per_cluster;
+
+ imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno);
+ imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster);
+ imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog);
+
+ /*
+ * If the inode number maps to a block outside the bounds
+ * of the file system then return EINVAL rather than calling
+ * read_buf and panicking when we get an error from the
+ * driver.
+ */
+ if ((imap->im_blkno + imap->im_len) >
+ XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
+ xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
+ "(imap->im_blkno (0x%llx) + imap->im_len (0x%llx)) > "
+ " XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) (0x%llx)",
+ (unsigned long long) imap->im_blkno,
+ (unsigned long long) imap->im_len,
+ XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
+ return XFS_ERROR(EINVAL);
+ }
+
return 0;
-error0:
- xfs_trans_brelse(tp, agbp);
- xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
- return error;
}
/*
@@ -1370,70 +1440,95 @@ xfs_ialloc_log_agi(
xfs_trans_log_buf(tp, bp, first, last);
}
+#ifdef DEBUG
+STATIC void
+xfs_check_agi_unlinked(
+ struct xfs_agi *agi)
+{
+ int i;
+
+ for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++)
+ ASSERT(agi->agi_unlinked[i]);
+}
+#else
+#define xfs_check_agi_unlinked(agi)
+#endif
+
/*
* Read in the allocation group header (inode allocation section)
*/
int
-xfs_ialloc_read_agi(
- xfs_mount_t *mp, /* file system mount structure */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_agnumber_t agno, /* allocation group number */
- xfs_buf_t **bpp) /* allocation group hdr buf */
+xfs_read_agi(
+ struct xfs_mount *mp, /* file system mount structure */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_agnumber_t agno, /* allocation group number */
+ struct xfs_buf **bpp) /* allocation group hdr buf */
{
- xfs_agi_t *agi; /* allocation group header */
- int agi_ok; /* agi is consistent */
- xfs_buf_t *bp; /* allocation group hdr buf */
- xfs_perag_t *pag; /* per allocation group data */
- int error;
+ struct xfs_agi *agi; /* allocation group header */
+ int agi_ok; /* agi is consistent */
+ int error;
ASSERT(agno != NULLAGNUMBER);
- error = xfs_trans_read_buf(
- mp, tp, mp->m_ddev_targp,
+
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), 0, &bp);
+ XFS_FSS_TO_BB(mp, 1), 0, bpp);
if (error)
return error;
- ASSERT(bp && !XFS_BUF_GETERROR(bp));
+
+ ASSERT(*bpp && !XFS_BUF_GETERROR(*bpp));
+ agi = XFS_BUF_TO_AGI(*bpp);
/*
* Validate the magic number of the agi block.
*/
- agi = XFS_BUF_TO_AGI(bp);
- agi_ok =
- be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC &&
- XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
+ agi_ok = be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC &&
+ XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)) &&
+ be32_to_cpu(agi->agi_seqno) == agno;
if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
XFS_RANDOM_IALLOC_READ_AGI))) {
- XFS_CORRUPTION_ERROR("xfs_ialloc_read_agi", XFS_ERRLEVEL_LOW,
+ XFS_CORRUPTION_ERROR("xfs_read_agi", XFS_ERRLEVEL_LOW,
mp, agi);
- xfs_trans_brelse(tp, bp);
+ xfs_trans_brelse(tp, *bpp);
return XFS_ERROR(EFSCORRUPTED);
}
+
+ XFS_BUF_SET_VTYPE_REF(*bpp, B_FS_AGI, XFS_AGI_REF);
+
+ xfs_check_agi_unlinked(agi);
+ return 0;
+}
+
+int
+xfs_ialloc_read_agi(
+ struct xfs_mount *mp, /* file system mount structure */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_agnumber_t agno, /* allocation group number */
+ struct xfs_buf **bpp) /* allocation group hdr buf */
+{
+ struct xfs_agi *agi; /* allocation group header */
+ struct xfs_perag *pag; /* per allocation group data */
+ int error;
+
+ error = xfs_read_agi(mp, tp, agno, bpp);
+ if (error)
+ return error;
+
+ agi = XFS_BUF_TO_AGI(*bpp);
pag = &mp->m_perag[agno];
+
if (!pag->pagi_init) {
pag->pagi_freecount = be32_to_cpu(agi->agi_freecount);
pag->pagi_count = be32_to_cpu(agi->agi_count);
pag->pagi_init = 1;
- } else {
- /*
- * It's possible for these to be out of sync if
- * we are in the middle of a forced shutdown.
- */
- ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
- XFS_FORCED_SHUTDOWN(mp));
}
-#ifdef DEBUG
- {
- int i;
-
- for (i = 0; i < XFS_AGI_UNLINKED_BUCKETS; i++)
- ASSERT(agi->agi_unlinked[i]);
- }
-#endif
-
- XFS_BUF_SET_VTYPE_REF(bp, B_FS_AGI, XFS_AGI_REF);
- *bpp = bp;
+ /*
+ * It's possible for these to be out of sync if
+ * we are in the middle of a forced shutdown.
+ */
+ ASSERT(pag->pagi_freecount == be32_to_cpu(agi->agi_freecount) ||
+ XFS_FORCED_SHUTDOWN(mp));
return 0;
}
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index 4e30ec1d13bc..50f558a4e0a8 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -20,6 +20,7 @@
struct xfs_buf;
struct xfs_dinode;
+struct xfs_imap;
struct xfs_mount;
struct xfs_trans;
@@ -56,7 +57,6 @@ static inline int xfs_ialloc_find_free(xfs_inofree_t *fp)
}
-#ifdef __KERNEL__
/*
* Allocate an inode on disk.
* Mode is used to tell whether the new inode will need space, and whether
@@ -105,17 +105,14 @@ xfs_difree(
xfs_ino_t *first_ino); /* first inode in deleted cluster */
/*
- * Return the location of the inode in bno/len/off,
- * for mapping it into a buffer.
+ * Return the location of the inode in imap, for mapping it into a buffer.
*/
int
-xfs_dilocate(
+xfs_imap(
struct xfs_mount *mp, /* file system mount structure */
struct xfs_trans *tp, /* transaction pointer */
xfs_ino_t ino, /* inode to locate */
- xfs_fsblock_t *bno, /* output: block containing inode */
- int *len, /* output: num blocks in cluster*/
- int *off, /* output: index in block of inode */
+ struct xfs_imap *imap, /* location map structure */
uint flags); /* flags for inode btree lookup */
/*
@@ -154,6 +151,24 @@ xfs_ialloc_pagi_init(
struct xfs_trans *tp, /* transaction pointer */
xfs_agnumber_t agno); /* allocation group number */
-#endif /* __KERNEL__ */
+/*
+ * Lookup the first record greater than or equal to ino
+ * in the btree given by cur.
+ */
+int xfs_inobt_lookup_ge(struct xfs_btree_cur *cur, xfs_agino_t ino,
+ __int32_t fcnt, xfs_inofree_t free, int *stat);
+
+/*
+ * Lookup the first record less than or equal to ino
+ * in the btree given by cur.
+ */
+int xfs_inobt_lookup_le(struct xfs_btree_cur *cur, xfs_agino_t ino,
+ __int32_t fcnt, xfs_inofree_t free, int *stat);
+
+/*
+ * Get the data from the pointed-to record.
+ */
+extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_agino_t *ino,
+ __int32_t *fcnt, xfs_inofree_t *free, int *stat);
#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index 83502f3edef0..99f2408e8d8e 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -35,2044 +35,349 @@
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
+#include "xfs_btree_trace.h"
#include "xfs_ialloc.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
-STATIC void xfs_inobt_log_block(xfs_trans_t *, xfs_buf_t *, int);
-STATIC void xfs_inobt_log_keys(xfs_btree_cur_t *, xfs_buf_t *, int, int);
-STATIC void xfs_inobt_log_ptrs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
-STATIC void xfs_inobt_log_recs(xfs_btree_cur_t *, xfs_buf_t *, int, int);
-STATIC int xfs_inobt_lshift(xfs_btree_cur_t *, int, int *);
-STATIC int xfs_inobt_newroot(xfs_btree_cur_t *, int *);
-STATIC int xfs_inobt_rshift(xfs_btree_cur_t *, int, int *);
-STATIC int xfs_inobt_split(xfs_btree_cur_t *, int, xfs_agblock_t *,
- xfs_inobt_key_t *, xfs_btree_cur_t **, int *);
-STATIC int xfs_inobt_updkey(xfs_btree_cur_t *, xfs_inobt_key_t *, int);
-/*
- * Single level of the xfs_inobt_delete record deletion routine.
- * Delete record pointed to by cur/level.
- * Remove the record from its block then rebalance the tree.
- * Return 0 for error, 1 for done, 2 to go on to the next level.
- */
-STATIC int /* error */
-xfs_inobt_delrec(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level, /* level removing record from */
- int *stat) /* fail/done/go-on */
+STATIC int
+xfs_inobt_get_minrecs(
+ struct xfs_btree_cur *cur,
+ int level)
{
- xfs_buf_t *agbp; /* buffer for a.g. inode header */
- xfs_mount_t *mp; /* mount structure */
- xfs_agi_t *agi; /* allocation group inode header */
- xfs_inobt_block_t *block; /* btree block record/key lives in */
- xfs_agblock_t bno; /* btree block number */
- xfs_buf_t *bp; /* buffer for block */
- int error; /* error return value */
- int i; /* loop index */
- xfs_inobt_key_t key; /* kp points here if block is level 0 */
- xfs_inobt_key_t *kp = NULL; /* pointer to btree keys */
- xfs_agblock_t lbno; /* left block's block number */
- xfs_buf_t *lbp; /* left block's buffer pointer */
- xfs_inobt_block_t *left; /* left btree block */
- xfs_inobt_key_t *lkp; /* left block key pointer */
- xfs_inobt_ptr_t *lpp; /* left block address pointer */
- int lrecs = 0; /* number of records in left block */
- xfs_inobt_rec_t *lrp; /* left block record pointer */
- xfs_inobt_ptr_t *pp = NULL; /* pointer to btree addresses */
- int ptr; /* index in btree block for this rec */
- xfs_agblock_t rbno; /* right block's block number */
- xfs_buf_t *rbp; /* right block's buffer pointer */
- xfs_inobt_block_t *right; /* right btree block */
- xfs_inobt_key_t *rkp; /* right block key pointer */
- xfs_inobt_rec_t *rp; /* pointer to btree records */
- xfs_inobt_ptr_t *rpp; /* right block address pointer */
- int rrecs = 0; /* number of records in right block */
- int numrecs;
- xfs_inobt_rec_t *rrp; /* right block record pointer */
- xfs_btree_cur_t *tcur; /* temporary btree cursor */
-
- mp = cur->bc_mp;
-
- /*
- * Get the index of the entry being deleted, check for nothing there.
- */
- ptr = cur->bc_ptrs[level];
- if (ptr == 0) {
- *stat = 0;
- return 0;
- }
-
- /*
- * Get the buffer & block containing the record or key/ptr.
- */
- bp = cur->bc_bufs[level];
- block = XFS_BUF_TO_INOBT_BLOCK(bp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
- return error;
-#endif
- /*
- * Fail if we're off the end of the block.
- */
+ return cur->bc_mp->m_inobt_mnr[level != 0];
+}
- numrecs = be16_to_cpu(block->bb_numrecs);
- if (ptr > numrecs) {
- *stat = 0;
- return 0;
- }
- /*
- * It's a nonleaf. Excise the key and ptr being deleted, by
- * sliding the entries past them down one.
- * Log the changed areas of the block.
- */
- if (level > 0) {
- kp = XFS_INOBT_KEY_ADDR(block, 1, cur);
- pp = XFS_INOBT_PTR_ADDR(block, 1, cur);
-#ifdef DEBUG
- for (i = ptr; i < numrecs; i++) {
- if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(pp[i]), level)))
- return error;
- }
-#endif
- if (ptr < numrecs) {
- memmove(&kp[ptr - 1], &kp[ptr],
- (numrecs - ptr) * sizeof(*kp));
- memmove(&pp[ptr - 1], &pp[ptr],
- (numrecs - ptr) * sizeof(*kp));
- xfs_inobt_log_keys(cur, bp, ptr, numrecs - 1);
- xfs_inobt_log_ptrs(cur, bp, ptr, numrecs - 1);
- }
- }
- /*
- * It's a leaf. Excise the record being deleted, by sliding the
- * entries past it down one. Log the changed areas of the block.
- */
- else {
- rp = XFS_INOBT_REC_ADDR(block, 1, cur);
- if (ptr < numrecs) {
- memmove(&rp[ptr - 1], &rp[ptr],
- (numrecs - ptr) * sizeof(*rp));
- xfs_inobt_log_recs(cur, bp, ptr, numrecs - 1);
- }
- /*
- * If it's the first record in the block, we'll need a key
- * structure to pass up to the next level (updkey).
- */
- if (ptr == 1) {
- key.ir_startino = rp->ir_startino;
- kp = &key;
- }
- }
- /*
- * Decrement and log the number of entries in the block.
- */
- numrecs--;
- block->bb_numrecs = cpu_to_be16(numrecs);
- xfs_inobt_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
- /*
- * Is this the root level? If so, we're almost done.
- */
- if (level == cur->bc_nlevels - 1) {
- /*
- * If this is the root level,
- * and there's only one entry left,
- * and it's NOT the leaf level,
- * then we can get rid of this level.
- */
- if (numrecs == 1 && level > 0) {
- agbp = cur->bc_private.a.agbp;
- agi = XFS_BUF_TO_AGI(agbp);
- /*
- * pp is still set to the first pointer in the block.
- * Make it the new root of the btree.
- */
- bno = be32_to_cpu(agi->agi_root);
- agi->agi_root = *pp;
- be32_add_cpu(&agi->agi_level, -1);
- /*
- * Free the block.
- */
- if ((error = xfs_free_extent(cur->bc_tp,
- XFS_AGB_TO_FSB(mp, cur->bc_private.a.agno, bno), 1)))
- return error;
- xfs_trans_binval(cur->bc_tp, bp);
- xfs_ialloc_log_agi(cur->bc_tp, agbp,
- XFS_AGI_ROOT | XFS_AGI_LEVEL);
- /*
- * Update the cursor so there's one fewer level.
- */
- cur->bc_bufs[level] = NULL;
- cur->bc_nlevels--;
- } else if (level > 0 &&
- (error = xfs_inobt_decrement(cur, level, &i)))
- return error;
- *stat = 1;
- return 0;
- }
- /*
- * If we deleted the leftmost entry in the block, update the
- * key values above us in the tree.
- */
- if (ptr == 1 && (error = xfs_inobt_updkey(cur, kp, level + 1)))
- return error;
- /*
- * If the number of records remaining in the block is at least
- * the minimum, we're done.
- */
- if (numrecs >= XFS_INOBT_BLOCK_MINRECS(level, cur)) {
- if (level > 0 &&
- (error = xfs_inobt_decrement(cur, level, &i)))
- return error;
- *stat = 1;
- return 0;
- }
- /*
- * Otherwise, we have to move some records around to keep the
- * tree balanced. Look at the left and right sibling blocks to
- * see if we can re-balance by moving only one record.
- */
- rbno = be32_to_cpu(block->bb_rightsib);
- lbno = be32_to_cpu(block->bb_leftsib);
- bno = NULLAGBLOCK;
- ASSERT(rbno != NULLAGBLOCK || lbno != NULLAGBLOCK);
- /*
- * Duplicate the cursor so our btree manipulations here won't
- * disrupt the next level up.
- */
- if ((error = xfs_btree_dup_cursor(cur, &tcur)))
- return error;
- /*
- * If there's a right sibling, see if it's ok to shift an entry
- * out of it.
- */
- if (rbno != NULLAGBLOCK) {
- /*
- * Move the temp cursor to the last entry in the next block.
- * Actually any entry but the first would suffice.
- */
- i = xfs_btree_lastrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- if ((error = xfs_inobt_increment(tcur, level, &i)))
- goto error0;
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- i = xfs_btree_lastrec(tcur, level);
- XFS_WANT_CORRUPTED_GOTO(i == 1, error0);
- /*
- * Grab a pointer to the block.
- */
- rbp = tcur->bc_bufs[level];
- right = XFS_BUF_TO_INOBT_BLOCK(rbp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
- goto error0;
-#endif
- /*
- * Grab the current block number, for future use.
- */
- bno = be32_to_cpu(right->bb_leftsib);
- /*
- * If right block is full enough so that removing one entry
- * won't make it too empty, and left-shifting an entry out
- * of right to us works, we're done.
- */
- if (be16_to_cpu(right->bb_numrecs) - 1 >=
- XFS_INOBT_BLOCK_MINRECS(level, cur)) {
- if ((error = xfs_inobt_lshift(tcur, level, &i)))
- goto error0;
- if (i) {
- ASSERT(be16_to_cpu(block->bb_numrecs) >=
- XFS_INOBT_BLOCK_MINRECS(level, cur));
- xfs_btree_del_cursor(tcur,
- XFS_BTREE_NOERROR);
- if (level > 0 &&
- (error = xfs_inobt_decrement(cur, level,
- &i)))
- return error;
- *stat = 1;
- return 0;
- }
- }
- /*
- * Otherwise, grab the number of records in right for
- * future reference, and fix up the temp cursor to point
- * to our block again (last record).
- */
- rrecs = be16_to_cpu(right->bb_numrecs);
- if (lbno != NULLAGBLOCK) {
- xfs_btree_firstrec(tcur, level);
- if ((error = xfs_inobt_decrement(tcur, level, &i)))
- goto error0;
- }
- }
- /*
- * If there's a left sibling, see if it's ok to shift an entry
- * out of it.
- */
- if (lbno != NULLAGBLOCK) {
- /*
- * Move the temp cursor to the first entry in the
- * previous block.
- */
- xfs_btree_firstrec(tcur, level);
- if ((error = xfs_inobt_decrement(tcur, level, &i)))
- goto error0;
- xfs_btree_firstrec(tcur, level);
- /*
- * Grab a pointer to the block.
- */
- lbp = tcur->bc_bufs[level];
- left = XFS_BUF_TO_INOBT_BLOCK(lbp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
- goto error0;
-#endif
- /*
- * Grab the current block number, for future use.
- */
- bno = be32_to_cpu(left->bb_rightsib);
- /*
- * If left block is full enough so that removing one entry
- * won't make it too empty, and right-shifting an entry out
- * of left to us works, we're done.
- */
- if (be16_to_cpu(left->bb_numrecs) - 1 >=
- XFS_INOBT_BLOCK_MINRECS(level, cur)) {
- if ((error = xfs_inobt_rshift(tcur, level, &i)))
- goto error0;
- if (i) {
- ASSERT(be16_to_cpu(block->bb_numrecs) >=
- XFS_INOBT_BLOCK_MINRECS(level, cur));
- xfs_btree_del_cursor(tcur,
- XFS_BTREE_NOERROR);
- if (level == 0)
- cur->bc_ptrs[0]++;
- *stat = 1;
- return 0;
- }
- }
- /*
- * Otherwise, grab the number of records in right for
- * future reference.
- */
- lrecs = be16_to_cpu(left->bb_numrecs);
- }
- /*
- * Delete the temp cursor, we're done with it.
- */
- xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
- /*
- * If here, we need to do a join to keep the tree balanced.
- */
- ASSERT(bno != NULLAGBLOCK);
- /*
- * See if we can join with the left neighbor block.
- */
- if (lbno != NULLAGBLOCK &&
- lrecs + numrecs <= XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
- /*
- * Set "right" to be the starting block,
- * "left" to be the left neighbor.
- */
- rbno = bno;
- right = block;
- rrecs = be16_to_cpu(right->bb_numrecs);
- rbp = bp;
- if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
- cur->bc_private.a.agno, lbno, 0, &lbp,
- XFS_INO_BTREE_REF)))
- return error;
- left = XFS_BUF_TO_INOBT_BLOCK(lbp);
- lrecs = be16_to_cpu(left->bb_numrecs);
- if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
- return error;
- }
- /*
- * If that won't work, see if we can join with the right neighbor block.
- */
- else if (rbno != NULLAGBLOCK &&
- rrecs + numrecs <= XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
- /*
- * Set "left" to be the starting block,
- * "right" to be the right neighbor.
- */
- lbno = bno;
- left = block;
- lrecs = be16_to_cpu(left->bb_numrecs);
- lbp = bp;
- if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
- cur->bc_private.a.agno, rbno, 0, &rbp,
- XFS_INO_BTREE_REF)))
- return error;
- right = XFS_BUF_TO_INOBT_BLOCK(rbp);
- rrecs = be16_to_cpu(right->bb_numrecs);
- if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
- return error;
- }
- /*
- * Otherwise, we can't fix the imbalance.
- * Just return. This is probably a logic error, but it's not fatal.
- */
- else {
- if (level > 0 && (error = xfs_inobt_decrement(cur, level, &i)))
- return error;
- *stat = 1;
- return 0;
- }
- /*
- * We're now going to join "left" and "right" by moving all the stuff
- * in "right" to "left" and deleting "right".
- */
- if (level > 0) {
- /*
- * It's a non-leaf. Move keys and pointers.
- */
- lkp = XFS_INOBT_KEY_ADDR(left, lrecs + 1, cur);
- lpp = XFS_INOBT_PTR_ADDR(left, lrecs + 1, cur);
- rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
- rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
-#ifdef DEBUG
- for (i = 0; i < rrecs; i++) {
- if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level)))
- return error;
- }
-#endif
- memcpy(lkp, rkp, rrecs * sizeof(*lkp));
- memcpy(lpp, rpp, rrecs * sizeof(*lpp));
- xfs_inobt_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
- xfs_inobt_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
- } else {
- /*
- * It's a leaf. Move records.
- */
- lrp = XFS_INOBT_REC_ADDR(left, lrecs + 1, cur);
- rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
- memcpy(lrp, rrp, rrecs * sizeof(*lrp));
- xfs_inobt_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
- }
- /*
- * If we joined with the left neighbor, set the buffer in the
- * cursor to the left block, and fix up the index.
- */
- if (bp != lbp) {
- xfs_btree_setbuf(cur, level, lbp);
- cur->bc_ptrs[level] += lrecs;
- }
- /*
- * If we joined with the right neighbor and there's a level above
- * us, increment the cursor at that level.
- */
- else if (level + 1 < cur->bc_nlevels &&
- (error = xfs_alloc_increment(cur, level + 1, &i)))
- return error;
- /*
- * Fix up the number of records in the surviving block.
- */
- lrecs += rrecs;
- left->bb_numrecs = cpu_to_be16(lrecs);
- /*
- * Fix up the right block pointer in the surviving block, and log it.
- */
- left->bb_rightsib = right->bb_rightsib;
- xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
- /*
- * If there is a right sibling now, make it point to the
- * remaining block.
- */
- if (be32_to_cpu(left->bb_rightsib) != NULLAGBLOCK) {
- xfs_inobt_block_t *rrblock;
- xfs_buf_t *rrbp;
+STATIC struct xfs_btree_cur *
+xfs_inobt_dup_cursor(
+ struct xfs_btree_cur *cur)
+{
+ return xfs_inobt_init_cursor(cur->bc_mp, cur->bc_tp,
+ cur->bc_private.a.agbp, cur->bc_private.a.agno);
+}
- if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
- cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib), 0,
- &rrbp, XFS_INO_BTREE_REF)))
- return error;
- rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp);
- if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
- return error;
- rrblock->bb_leftsib = cpu_to_be32(lbno);
- xfs_inobt_log_block(cur->bc_tp, rrbp, XFS_BB_LEFTSIB);
- }
- /*
- * Free the deleting block.
- */
- if ((error = xfs_free_extent(cur->bc_tp, XFS_AGB_TO_FSB(mp,
- cur->bc_private.a.agno, rbno), 1)))
- return error;
- xfs_trans_binval(cur->bc_tp, rbp);
- /*
- * Readjust the ptr at this level if it's not a leaf, since it's
- * still pointing at the deletion point, which makes the cursor
- * inconsistent. If this makes the ptr 0, the caller fixes it up.
- * We can't use decrement because it would change the next level up.
- */
- if (level > 0)
- cur->bc_ptrs[level]--;
- /*
- * Return value means the next level up has something to do.
- */
- *stat = 2;
- return 0;
+STATIC void
+xfs_inobt_set_root(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *nptr,
+ int inc) /* level change */
+{
+ struct xfs_buf *agbp = cur->bc_private.a.agbp;
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
-error0:
- xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
- return error;
+ agi->agi_root = nptr->s;
+ be32_add_cpu(&agi->agi_level, inc);
+ xfs_ialloc_log_agi(cur->bc_tp, agbp, XFS_AGI_ROOT | XFS_AGI_LEVEL);
}
-/*
- * Insert one record/level. Return information to the caller
- * allowing the next level up to proceed if necessary.
- */
-STATIC int /* error */
-xfs_inobt_insrec(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level, /* level to insert record at */
- xfs_agblock_t *bnop, /* i/o: block number inserted */
- xfs_inobt_rec_t *recp, /* i/o: record data inserted */
- xfs_btree_cur_t **curp, /* output: new cursor replacing cur */
- int *stat) /* success/failure */
+STATIC int
+xfs_inobt_alloc_block(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *start,
+ union xfs_btree_ptr *new,
+ int length,
+ int *stat)
{
- xfs_inobt_block_t *block; /* btree block record/key lives in */
- xfs_buf_t *bp; /* buffer for block */
- int error; /* error return value */
- int i; /* loop index */
- xfs_inobt_key_t key; /* key value being inserted */
- xfs_inobt_key_t *kp=NULL; /* pointer to btree keys */
- xfs_agblock_t nbno; /* block number of allocated block */
- xfs_btree_cur_t *ncur; /* new cursor to be used at next lvl */
- xfs_inobt_key_t nkey; /* new key value, from split */
- xfs_inobt_rec_t nrec; /* new record value, for caller */
- int numrecs;
- int optr; /* old ptr value */
- xfs_inobt_ptr_t *pp; /* pointer to btree addresses */
- int ptr; /* index in btree block for this rec */
- xfs_inobt_rec_t *rp=NULL; /* pointer to btree records */
+ xfs_alloc_arg_t args; /* block allocation args */
+ int error; /* error return value */
+ xfs_agblock_t sbno = be32_to_cpu(start->s);
- /*
- * GCC doesn't understand the (arguably complex) control flow in
- * this function and complains about uninitialized structure fields
- * without this.
- */
- memset(&nrec, 0, sizeof(nrec));
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
- /*
- * If we made it to the root level, allocate a new root block
- * and we're done.
- */
- if (level >= cur->bc_nlevels) {
- error = xfs_inobt_newroot(cur, &i);
- *bnop = NULLAGBLOCK;
- *stat = i;
+ memset(&args, 0, sizeof(args));
+ args.tp = cur->bc_tp;
+ args.mp = cur->bc_mp;
+ args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, sbno);
+ args.minlen = 1;
+ args.maxlen = 1;
+ args.prod = 1;
+ args.type = XFS_ALLOCTYPE_NEAR_BNO;
+
+ error = xfs_alloc_vextent(&args);
+ if (error) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
return error;
}
- /*
- * Make a key out of the record data to be inserted, and save it.
- */
- key.ir_startino = recp->ir_startino;
- optr = ptr = cur->bc_ptrs[level];
- /*
- * If we're off the left edge, return failure.
- */
- if (ptr == 0) {
+ if (args.fsbno == NULLFSBLOCK) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
*stat = 0;
return 0;
}
- /*
- * Get pointers to the btree buffer and block.
- */
- bp = cur->bc_bufs[level];
- block = XFS_BUF_TO_INOBT_BLOCK(bp);
- numrecs = be16_to_cpu(block->bb_numrecs);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
- return error;
- /*
- * Check that the new entry is being inserted in the right place.
- */
- if (ptr <= numrecs) {
- if (level == 0) {
- rp = XFS_INOBT_REC_ADDR(block, ptr, cur);
- xfs_btree_check_rec(cur->bc_btnum, recp, rp);
- } else {
- kp = XFS_INOBT_KEY_ADDR(block, ptr, cur);
- xfs_btree_check_key(cur->bc_btnum, &key, kp);
- }
- }
-#endif
- nbno = NULLAGBLOCK;
- ncur = NULL;
- /*
- * If the block is full, we can't insert the new entry until we
- * make the block un-full.
- */
- if (numrecs == XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
- /*
- * First, try shifting an entry to the right neighbor.
- */
- if ((error = xfs_inobt_rshift(cur, level, &i)))
- return error;
- if (i) {
- /* nothing */
- }
- /*
- * Next, try shifting an entry to the left neighbor.
- */
- else {
- if ((error = xfs_inobt_lshift(cur, level, &i)))
- return error;
- if (i) {
- optr = ptr = cur->bc_ptrs[level];
- } else {
- /*
- * Next, try splitting the current block
- * in half. If this works we have to
- * re-set our variables because
- * we could be in a different block now.
- */
- if ((error = xfs_inobt_split(cur, level, &nbno,
- &nkey, &ncur, &i)))
- return error;
- if (i) {
- bp = cur->bc_bufs[level];
- block = XFS_BUF_TO_INOBT_BLOCK(bp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur,
- block, level, bp)))
- return error;
-#endif
- ptr = cur->bc_ptrs[level];
- nrec.ir_startino = nkey.ir_startino;
- } else {
- /*
- * Otherwise the insert fails.
- */
- *stat = 0;
- return 0;
- }
- }
- }
- }
- /*
- * At this point we know there's room for our new entry in the block
- * we're pointing at.
- */
- numrecs = be16_to_cpu(block->bb_numrecs);
- if (level > 0) {
- /*
- * It's a non-leaf entry. Make a hole for the new data
- * in the key and ptr regions of the block.
- */
- kp = XFS_INOBT_KEY_ADDR(block, 1, cur);
- pp = XFS_INOBT_PTR_ADDR(block, 1, cur);
-#ifdef DEBUG
- for (i = numrecs; i >= ptr; i--) {
- if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(pp[i - 1]), level)))
- return error;
- }
-#endif
- memmove(&kp[ptr], &kp[ptr - 1],
- (numrecs - ptr + 1) * sizeof(*kp));
- memmove(&pp[ptr], &pp[ptr - 1],
- (numrecs - ptr + 1) * sizeof(*pp));
- /*
- * Now stuff the new data in, bump numrecs and log the new data.
- */
-#ifdef DEBUG
- if ((error = xfs_btree_check_sptr(cur, *bnop, level)))
- return error;
-#endif
- kp[ptr - 1] = key;
- pp[ptr - 1] = cpu_to_be32(*bnop);
- numrecs++;
- block->bb_numrecs = cpu_to_be16(numrecs);
- xfs_inobt_log_keys(cur, bp, ptr, numrecs);
- xfs_inobt_log_ptrs(cur, bp, ptr, numrecs);
- } else {
- /*
- * It's a leaf entry. Make a hole for the new record.
- */
- rp = XFS_INOBT_REC_ADDR(block, 1, cur);
- memmove(&rp[ptr], &rp[ptr - 1],
- (numrecs - ptr + 1) * sizeof(*rp));
- /*
- * Now stuff the new record in, bump numrecs
- * and log the new data.
- */
- rp[ptr - 1] = *recp;
- numrecs++;
- block->bb_numrecs = cpu_to_be16(numrecs);
- xfs_inobt_log_recs(cur, bp, ptr, numrecs);
- }
- /*
- * Log the new number of records in the btree header.
- */
- xfs_inobt_log_block(cur->bc_tp, bp, XFS_BB_NUMRECS);
-#ifdef DEBUG
- /*
- * Check that the key/record is in the right place, now.
- */
- if (ptr < numrecs) {
- if (level == 0)
- xfs_btree_check_rec(cur->bc_btnum, rp + ptr - 1,
- rp + ptr);
- else
- xfs_btree_check_key(cur->bc_btnum, kp + ptr - 1,
- kp + ptr);
- }
-#endif
- /*
- * If we inserted at the start of a block, update the parents' keys.
- */
- if (optr == 1 && (error = xfs_inobt_updkey(cur, &key, level + 1)))
- return error;
- /*
- * Return the new block number, if any.
- * If there is one, give back a record value and a cursor too.
- */
- *bnop = nbno;
- if (nbno != NULLAGBLOCK) {
- *recp = nrec;
- *curp = ncur;
- }
+ ASSERT(args.len == 1);
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+
+ new->s = cpu_to_be32(XFS_FSB_TO_AGBNO(args.mp, args.fsbno));
*stat = 1;
return 0;
}
-/*
- * Log header fields from a btree block.
- */
-STATIC void
-xfs_inobt_log_block(
- xfs_trans_t *tp, /* transaction pointer */
- xfs_buf_t *bp, /* buffer containing btree block */
- int fields) /* mask of fields: XFS_BB_... */
+STATIC int
+xfs_inobt_free_block(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp)
{
- int first; /* first byte offset logged */
- int last; /* last byte offset logged */
- static const short offsets[] = { /* table of offsets */
- offsetof(xfs_inobt_block_t, bb_magic),
- offsetof(xfs_inobt_block_t, bb_level),
- offsetof(xfs_inobt_block_t, bb_numrecs),
- offsetof(xfs_inobt_block_t, bb_leftsib),
- offsetof(xfs_inobt_block_t, bb_rightsib),
- sizeof(xfs_inobt_block_t)
- };
+ xfs_fsblock_t fsbno;
+ int error;
- xfs_btree_offsets(fields, offsets, XFS_BB_NUM_BITS, &first, &last);
- xfs_trans_log_buf(tp, bp, first, last);
+ fsbno = XFS_DADDR_TO_FSB(cur->bc_mp, XFS_BUF_ADDR(bp));
+ error = xfs_free_extent(cur->bc_tp, fsbno, 1);
+ if (error)
+ return error;
+
+ xfs_trans_binval(cur->bc_tp, bp);
+ return error;
}
-/*
- * Log keys from a btree block (nonleaf).
- */
-STATIC void
-xfs_inobt_log_keys(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_buf_t *bp, /* buffer containing btree block */
- int kfirst, /* index of first key to log */
- int klast) /* index of last key to log */
+STATIC int
+xfs_inobt_get_maxrecs(
+ struct xfs_btree_cur *cur,
+ int level)
{
- xfs_inobt_block_t *block; /* btree block to log from */
- int first; /* first byte offset logged */
- xfs_inobt_key_t *kp; /* key pointer in btree block */
- int last; /* last byte offset logged */
-
- block = XFS_BUF_TO_INOBT_BLOCK(bp);
- kp = XFS_INOBT_KEY_ADDR(block, 1, cur);
- first = (int)((xfs_caddr_t)&kp[kfirst - 1] - (xfs_caddr_t)block);
- last = (int)(((xfs_caddr_t)&kp[klast] - 1) - (xfs_caddr_t)block);
- xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+ return cur->bc_mp->m_inobt_mxr[level != 0];
}
-/*
- * Log block pointer fields from a btree block (nonleaf).
- */
STATIC void
-xfs_inobt_log_ptrs(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_buf_t *bp, /* buffer containing btree block */
- int pfirst, /* index of first pointer to log */
- int plast) /* index of last pointer to log */
+xfs_inobt_init_key_from_rec(
+ union xfs_btree_key *key,
+ union xfs_btree_rec *rec)
{
- xfs_inobt_block_t *block; /* btree block to log from */
- int first; /* first byte offset logged */
- int last; /* last byte offset logged */
- xfs_inobt_ptr_t *pp; /* block-pointer pointer in btree blk */
-
- block = XFS_BUF_TO_INOBT_BLOCK(bp);
- pp = XFS_INOBT_PTR_ADDR(block, 1, cur);
- first = (int)((xfs_caddr_t)&pp[pfirst - 1] - (xfs_caddr_t)block);
- last = (int)(((xfs_caddr_t)&pp[plast] - 1) - (xfs_caddr_t)block);
- xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+ key->inobt.ir_startino = rec->inobt.ir_startino;
}
-/*
- * Log records from a btree block (leaf).
- */
STATIC void
-xfs_inobt_log_recs(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_buf_t *bp, /* buffer containing btree block */
- int rfirst, /* index of first record to log */
- int rlast) /* index of last record to log */
+xfs_inobt_init_rec_from_key(
+ union xfs_btree_key *key,
+ union xfs_btree_rec *rec)
{
- xfs_inobt_block_t *block; /* btree block to log from */
- int first; /* first byte offset logged */
- int last; /* last byte offset logged */
- xfs_inobt_rec_t *rp; /* record pointer for btree block */
+ rec->inobt.ir_startino = key->inobt.ir_startino;
+}
- block = XFS_BUF_TO_INOBT_BLOCK(bp);
- rp = XFS_INOBT_REC_ADDR(block, 1, cur);
- first = (int)((xfs_caddr_t)&rp[rfirst - 1] - (xfs_caddr_t)block);
- last = (int)(((xfs_caddr_t)&rp[rlast] - 1) - (xfs_caddr_t)block);
- xfs_trans_log_buf(cur->bc_tp, bp, first, last);
+STATIC void
+xfs_inobt_init_rec_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec)
+{
+ rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino);
+ rec->inobt.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount);
+ rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free);
}
/*
- * Lookup the record. The cursor is made to point to it, based on dir.
- * Return 0 if can't find any such record, 1 for success.
+ * initial value of ptr for lookup
*/
-STATIC int /* error */
-xfs_inobt_lookup(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_lookup_t dir, /* <=, ==, or >= */
- int *stat) /* success/failure */
+STATIC void
+xfs_inobt_init_ptr_from_cur(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_ptr *ptr)
{
- xfs_agblock_t agbno; /* a.g. relative btree block number */
- xfs_agnumber_t agno; /* allocation group number */
- xfs_inobt_block_t *block=NULL; /* current btree block */
- __int64_t diff; /* difference for the current key */
- int error; /* error return value */
- int keyno=0; /* current key number */
- int level; /* level in the btree */
- xfs_mount_t *mp; /* file system mount point */
-
- /*
- * Get the allocation group header, and the root block number.
- */
- mp = cur->bc_mp;
- {
- xfs_agi_t *agi; /* a.g. inode header */
-
- agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
- agno = be32_to_cpu(agi->agi_seqno);
- agbno = be32_to_cpu(agi->agi_root);
- }
- /*
- * Iterate over each level in the btree, starting at the root.
- * For each level above the leaves, find the key we need, based
- * on the lookup record, then follow the corresponding block
- * pointer down to the next level.
- */
- for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
- xfs_buf_t *bp; /* buffer pointer for btree block */
- xfs_daddr_t d; /* disk address of btree block */
-
- /*
- * Get the disk address we're looking for.
- */
- d = XFS_AGB_TO_DADDR(mp, agno, agbno);
- /*
- * If the old buffer at this level is for a different block,
- * throw it away, otherwise just use it.
- */
- bp = cur->bc_bufs[level];
- if (bp && XFS_BUF_ADDR(bp) != d)
- bp = NULL;
- if (!bp) {
- /*
- * Need to get a new buffer. Read it, then
- * set it in the cursor, releasing the old one.
- */
- if ((error = xfs_btree_read_bufs(mp, cur->bc_tp,
- agno, agbno, 0, &bp, XFS_INO_BTREE_REF)))
- return error;
- xfs_btree_setbuf(cur, level, bp);
- /*
- * Point to the btree block, now that we have the buffer
- */
- block = XFS_BUF_TO_INOBT_BLOCK(bp);
- if ((error = xfs_btree_check_sblock(cur, block, level,
- bp)))
- return error;
- } else
- block = XFS_BUF_TO_INOBT_BLOCK(bp);
- /*
- * If we already had a key match at a higher level, we know
- * we need to use the first entry in this block.
- */
- if (diff == 0)
- keyno = 1;
- /*
- * Otherwise we need to search this block. Do a binary search.
- */
- else {
- int high; /* high entry number */
- xfs_inobt_key_t *kkbase=NULL;/* base of keys in block */
- xfs_inobt_rec_t *krbase=NULL;/* base of records in block */
- int low; /* low entry number */
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
- /*
- * Get a pointer to keys or records.
- */
- if (level > 0)
- kkbase = XFS_INOBT_KEY_ADDR(block, 1, cur);
- else
- krbase = XFS_INOBT_REC_ADDR(block, 1, cur);
- /*
- * Set low and high entry numbers, 1-based.
- */
- low = 1;
- if (!(high = be16_to_cpu(block->bb_numrecs))) {
- /*
- * If the block is empty, the tree must
- * be an empty leaf.
- */
- ASSERT(level == 0 && cur->bc_nlevels == 1);
- cur->bc_ptrs[0] = dir != XFS_LOOKUP_LE;
- *stat = 0;
- return 0;
- }
- /*
- * Binary search the block.
- */
- while (low <= high) {
- xfs_agino_t startino; /* key value */
-
- /*
- * keyno is average of low and high.
- */
- keyno = (low + high) >> 1;
- /*
- * Get startino.
- */
- if (level > 0) {
- xfs_inobt_key_t *kkp;
-
- kkp = kkbase + keyno - 1;
- startino = be32_to_cpu(kkp->ir_startino);
- } else {
- xfs_inobt_rec_t *krp;
-
- krp = krbase + keyno - 1;
- startino = be32_to_cpu(krp->ir_startino);
- }
- /*
- * Compute difference to get next direction.
- */
- diff = (__int64_t)
- startino - cur->bc_rec.i.ir_startino;
- /*
- * Less than, move right.
- */
- if (diff < 0)
- low = keyno + 1;
- /*
- * Greater than, move left.
- */
- else if (diff > 0)
- high = keyno - 1;
- /*
- * Equal, we're done.
- */
- else
- break;
- }
- }
- /*
- * If there are more levels, set up for the next level
- * by getting the block number and filling in the cursor.
- */
- if (level > 0) {
- /*
- * If we moved left, need the previous key number,
- * unless there isn't one.
- */
- if (diff > 0 && --keyno < 1)
- keyno = 1;
- agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, keyno, cur));
-#ifdef DEBUG
- if ((error = xfs_btree_check_sptr(cur, agbno, level)))
- return error;
-#endif
- cur->bc_ptrs[level] = keyno;
- }
- }
- /*
- * Done with the search.
- * See if we need to adjust the results.
- */
- if (dir != XFS_LOOKUP_LE && diff < 0) {
- keyno++;
- /*
- * If ge search and we went off the end of the block, but it's
- * not the last block, we're in the wrong block.
- */
- if (dir == XFS_LOOKUP_GE &&
- keyno > be16_to_cpu(block->bb_numrecs) &&
- be32_to_cpu(block->bb_rightsib) != NULLAGBLOCK) {
- int i;
+ ASSERT(cur->bc_private.a.agno == be32_to_cpu(agi->agi_seqno));
- cur->bc_ptrs[0] = keyno;
- if ((error = xfs_inobt_increment(cur, 0, &i)))
- return error;
- ASSERT(i == 1);
- *stat = 1;
- return 0;
- }
- }
- else if (dir == XFS_LOOKUP_LE && diff > 0)
- keyno--;
- cur->bc_ptrs[0] = keyno;
- /*
- * Return if we succeeded or not.
- */
- if (keyno == 0 || keyno > be16_to_cpu(block->bb_numrecs))
- *stat = 0;
- else
- *stat = ((dir != XFS_LOOKUP_EQ) || (diff == 0));
- return 0;
+ ptr->s = agi->agi_root;
}
-/*
- * Move 1 record left from cur/level if possible.
- * Update cur to reflect the new path.
- */
-STATIC int /* error */
-xfs_inobt_lshift(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level, /* level to shift record on */
- int *stat) /* success/failure */
+STATIC __int64_t
+xfs_inobt_key_diff(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *key)
{
- int error; /* error return value */
-#ifdef DEBUG
- int i; /* loop index */
-#endif
- xfs_inobt_key_t key; /* key value for leaf level upward */
- xfs_buf_t *lbp; /* buffer for left neighbor block */
- xfs_inobt_block_t *left; /* left neighbor btree block */
- xfs_inobt_key_t *lkp=NULL; /* key pointer for left block */
- xfs_inobt_ptr_t *lpp; /* address pointer for left block */
- xfs_inobt_rec_t *lrp=NULL; /* record pointer for left block */
- int nrec; /* new number of left block entries */
- xfs_buf_t *rbp; /* buffer for right (current) block */
- xfs_inobt_block_t *right; /* right (current) btree block */
- xfs_inobt_key_t *rkp=NULL; /* key pointer for right block */
- xfs_inobt_ptr_t *rpp=NULL; /* address pointer for right block */
- xfs_inobt_rec_t *rrp=NULL; /* record pointer for right block */
-
- /*
- * Set up variables for this block as "right".
- */
- rbp = cur->bc_bufs[level];
- right = XFS_BUF_TO_INOBT_BLOCK(rbp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
- return error;
-#endif
- /*
- * If we've got no left sibling then we can't shift an entry left.
- */
- if (be32_to_cpu(right->bb_leftsib) == NULLAGBLOCK) {
- *stat = 0;
- return 0;
- }
- /*
- * If the cursor entry is the one that would be moved, don't
- * do it... it's too complicated.
- */
- if (cur->bc_ptrs[level] <= 1) {
- *stat = 0;
- return 0;
- }
- /*
- * Set up the left neighbor as "left".
- */
- if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
- cur->bc_private.a.agno, be32_to_cpu(right->bb_leftsib),
- 0, &lbp, XFS_INO_BTREE_REF)))
- return error;
- left = XFS_BUF_TO_INOBT_BLOCK(lbp);
- if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
- return error;
- /*
- * If it's full, it can't take another entry.
- */
- if (be16_to_cpu(left->bb_numrecs) == XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
- *stat = 0;
- return 0;
- }
- nrec = be16_to_cpu(left->bb_numrecs) + 1;
- /*
- * If non-leaf, copy a key and a ptr to the left block.
- */
- if (level > 0) {
- lkp = XFS_INOBT_KEY_ADDR(left, nrec, cur);
- rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
- *lkp = *rkp;
- xfs_inobt_log_keys(cur, lbp, nrec, nrec);
- lpp = XFS_INOBT_PTR_ADDR(left, nrec, cur);
- rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*rpp), level)))
- return error;
-#endif
- *lpp = *rpp;
- xfs_inobt_log_ptrs(cur, lbp, nrec, nrec);
- }
- /*
- * If leaf, copy a record to the left block.
- */
- else {
- lrp = XFS_INOBT_REC_ADDR(left, nrec, cur);
- rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
- *lrp = *rrp;
- xfs_inobt_log_recs(cur, lbp, nrec, nrec);
- }
- /*
- * Bump and log left's numrecs, decrement and log right's numrecs.
- */
- be16_add_cpu(&left->bb_numrecs, 1);
- xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
-#ifdef DEBUG
- if (level > 0)
- xfs_btree_check_key(cur->bc_btnum, lkp - 1, lkp);
- else
- xfs_btree_check_rec(cur->bc_btnum, lrp - 1, lrp);
-#endif
- be16_add_cpu(&right->bb_numrecs, -1);
- xfs_inobt_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
- /*
- * Slide the contents of right down one entry.
- */
- if (level > 0) {
-#ifdef DEBUG
- for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
- if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i + 1]),
- level)))
- return error;
- }
-#endif
- memmove(rkp, rkp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
- memmove(rpp, rpp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
- xfs_inobt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
- xfs_inobt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
- } else {
- memmove(rrp, rrp + 1, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
- xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
- key.ir_startino = rrp->ir_startino;
- rkp = &key;
- }
- /*
- * Update the parent key values of right.
- */
- if ((error = xfs_inobt_updkey(cur, rkp, level + 1)))
- return error;
- /*
- * Slide the cursor value left one.
- */
- cur->bc_ptrs[level]--;
- *stat = 1;
- return 0;
+ return (__int64_t)be32_to_cpu(key->inobt.ir_startino) -
+ cur->bc_rec.i.ir_startino;
}
-/*
- * Allocate a new root block, fill it in.
- */
-STATIC int /* error */
-xfs_inobt_newroot(
- xfs_btree_cur_t *cur, /* btree cursor */
- int *stat) /* success/failure */
+STATIC int
+xfs_inobt_kill_root(
+ struct xfs_btree_cur *cur,
+ struct xfs_buf *bp,
+ int level,
+ union xfs_btree_ptr *newroot)
{
- xfs_agi_t *agi; /* a.g. inode header */
- xfs_alloc_arg_t args; /* allocation argument structure */
- xfs_inobt_block_t *block; /* one half of the old root block */
- xfs_buf_t *bp; /* buffer containing block */
- int error; /* error return value */
- xfs_inobt_key_t *kp; /* btree key pointer */
- xfs_agblock_t lbno; /* left block number */
- xfs_buf_t *lbp; /* left buffer pointer */
- xfs_inobt_block_t *left; /* left btree block */
- xfs_buf_t *nbp; /* new (root) buffer */
- xfs_inobt_block_t *new; /* new (root) btree block */
- int nptr; /* new value for key index, 1 or 2 */
- xfs_inobt_ptr_t *pp; /* btree address pointer */
- xfs_agblock_t rbno; /* right block number */
- xfs_buf_t *rbp; /* right buffer pointer */
- xfs_inobt_block_t *right; /* right btree block */
- xfs_inobt_rec_t *rp; /* btree record pointer */
+ int error;
- ASSERT(cur->bc_nlevels < XFS_IN_MAXLEVELS(cur->bc_mp));
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+ XFS_BTREE_STATS_INC(cur, killroot);
/*
- * Get a block & a buffer.
+ * Update the root pointer, decreasing the level by 1 and then
+ * free the old root.
*/
- agi = XFS_BUF_TO_AGI(cur->bc_private.a.agbp);
- args.tp = cur->bc_tp;
- args.mp = cur->bc_mp;
- args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno,
- be32_to_cpu(agi->agi_root));
- args.mod = args.minleft = args.alignment = args.total = args.wasdel =
- args.isfl = args.userdata = args.minalignslop = 0;
- args.minlen = args.maxlen = args.prod = 1;
- args.type = XFS_ALLOCTYPE_NEAR_BNO;
- if ((error = xfs_alloc_vextent(&args)))
+ xfs_inobt_set_root(cur, newroot, -1);
+ error = xfs_inobt_free_block(cur, bp);
+ if (error) {
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
return error;
- /*
- * None available, we fail.
- */
- if (args.fsbno == NULLFSBLOCK) {
- *stat = 0;
- return 0;
- }
- ASSERT(args.len == 1);
- nbp = xfs_btree_get_bufs(args.mp, args.tp, args.agno, args.agbno, 0);
- new = XFS_BUF_TO_INOBT_BLOCK(nbp);
- /*
- * Set the root data in the a.g. inode structure.
- */
- agi->agi_root = cpu_to_be32(args.agbno);
- be32_add_cpu(&agi->agi_level, 1);
- xfs_ialloc_log_agi(args.tp, cur->bc_private.a.agbp,
- XFS_AGI_ROOT | XFS_AGI_LEVEL);
- /*
- * At the previous root level there are now two blocks: the old
- * root, and the new block generated when it was split.
- * We don't know which one the cursor is pointing at, so we
- * set up variables "left" and "right" for each case.
- */
- bp = cur->bc_bufs[cur->bc_nlevels - 1];
- block = XFS_BUF_TO_INOBT_BLOCK(bp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, block, cur->bc_nlevels - 1, bp)))
- return error;
-#endif
- if (be32_to_cpu(block->bb_rightsib) != NULLAGBLOCK) {
- /*
- * Our block is left, pick up the right block.
- */
- lbp = bp;
- lbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(lbp));
- left = block;
- rbno = be32_to_cpu(left->bb_rightsib);
- if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno,
- rbno, 0, &rbp, XFS_INO_BTREE_REF)))
- return error;
- bp = rbp;
- right = XFS_BUF_TO_INOBT_BLOCK(rbp);
- if ((error = xfs_btree_check_sblock(cur, right,
- cur->bc_nlevels - 1, rbp)))
- return error;
- nptr = 1;
- } else {
- /*
- * Our block is right, pick up the left block.
- */
- rbp = bp;
- rbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(rbp));
- right = block;
- lbno = be32_to_cpu(right->bb_leftsib);
- if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno,
- lbno, 0, &lbp, XFS_INO_BTREE_REF)))
- return error;
- bp = lbp;
- left = XFS_BUF_TO_INOBT_BLOCK(lbp);
- if ((error = xfs_btree_check_sblock(cur, left,
- cur->bc_nlevels - 1, lbp)))
- return error;
- nptr = 2;
- }
- /*
- * Fill in the new block's btree header and log it.
- */
- new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
- new->bb_level = cpu_to_be16(cur->bc_nlevels);
- new->bb_numrecs = cpu_to_be16(2);
- new->bb_leftsib = cpu_to_be32(NULLAGBLOCK);
- new->bb_rightsib = cpu_to_be32(NULLAGBLOCK);
- xfs_inobt_log_block(args.tp, nbp, XFS_BB_ALL_BITS);
- ASSERT(lbno != NULLAGBLOCK && rbno != NULLAGBLOCK);
- /*
- * Fill in the key data in the new root.
- */
- kp = XFS_INOBT_KEY_ADDR(new, 1, cur);
- if (be16_to_cpu(left->bb_level) > 0) {
- kp[0] = *XFS_INOBT_KEY_ADDR(left, 1, cur);
- kp[1] = *XFS_INOBT_KEY_ADDR(right, 1, cur);
- } else {
- rp = XFS_INOBT_REC_ADDR(left, 1, cur);
- kp[0].ir_startino = rp->ir_startino;
- rp = XFS_INOBT_REC_ADDR(right, 1, cur);
- kp[1].ir_startino = rp->ir_startino;
}
- xfs_inobt_log_keys(cur, nbp, 1, 2);
- /*
- * Fill in the pointer data in the new root.
- */
- pp = XFS_INOBT_PTR_ADDR(new, 1, cur);
- pp[0] = cpu_to_be32(lbno);
- pp[1] = cpu_to_be32(rbno);
- xfs_inobt_log_ptrs(cur, nbp, 1, 2);
- /*
- * Fix up the cursor.
- */
- xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
- cur->bc_ptrs[cur->bc_nlevels] = nptr;
- cur->bc_nlevels++;
- *stat = 1;
- return 0;
-}
-/*
- * Move 1 record right from cur/level if possible.
- * Update cur to reflect the new path.
- */
-STATIC int /* error */
-xfs_inobt_rshift(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level, /* level to shift record on */
- int *stat) /* success/failure */
-{
- int error; /* error return value */
- int i; /* loop index */
- xfs_inobt_key_t key; /* key value for leaf level upward */
- xfs_buf_t *lbp; /* buffer for left (current) block */
- xfs_inobt_block_t *left; /* left (current) btree block */
- xfs_inobt_key_t *lkp; /* key pointer for left block */
- xfs_inobt_ptr_t *lpp; /* address pointer for left block */
- xfs_inobt_rec_t *lrp; /* record pointer for left block */
- xfs_buf_t *rbp; /* buffer for right neighbor block */
- xfs_inobt_block_t *right; /* right neighbor btree block */
- xfs_inobt_key_t *rkp; /* key pointer for right block */
- xfs_inobt_ptr_t *rpp; /* address pointer for right block */
- xfs_inobt_rec_t *rrp=NULL; /* record pointer for right block */
- xfs_btree_cur_t *tcur; /* temporary cursor */
+ XFS_BTREE_STATS_INC(cur, free);
- /*
- * Set up variables for this block as "left".
- */
- lbp = cur->bc_bufs[level];
- left = XFS_BUF_TO_INOBT_BLOCK(lbp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
- return error;
-#endif
- /*
- * If we've got no right sibling then we can't shift an entry right.
- */
- if (be32_to_cpu(left->bb_rightsib) == NULLAGBLOCK) {
- *stat = 0;
- return 0;
- }
- /*
- * If the cursor entry is the one that would be moved, don't
- * do it... it's too complicated.
- */
- if (cur->bc_ptrs[level] >= be16_to_cpu(left->bb_numrecs)) {
- *stat = 0;
- return 0;
- }
- /*
- * Set up the right neighbor as "right".
- */
- if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
- cur->bc_private.a.agno, be32_to_cpu(left->bb_rightsib),
- 0, &rbp, XFS_INO_BTREE_REF)))
- return error;
- right = XFS_BUF_TO_INOBT_BLOCK(rbp);
- if ((error = xfs_btree_check_sblock(cur, right, level, rbp)))
- return error;
- /*
- * If it's full, it can't take another entry.
- */
- if (be16_to_cpu(right->bb_numrecs) == XFS_INOBT_BLOCK_MAXRECS(level, cur)) {
- *stat = 0;
- return 0;
- }
- /*
- * Make a hole at the start of the right neighbor block, then
- * copy the last left block entry to the hole.
- */
- if (level > 0) {
- lkp = XFS_INOBT_KEY_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
- lpp = XFS_INOBT_PTR_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
- rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
- rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
-#ifdef DEBUG
- for (i = be16_to_cpu(right->bb_numrecs) - 1; i >= 0; i--) {
- if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(rpp[i]), level)))
- return error;
- }
-#endif
- memmove(rkp + 1, rkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
- memmove(rpp + 1, rpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
-#ifdef DEBUG
- if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(*lpp), level)))
- return error;
-#endif
- *rkp = *lkp;
- *rpp = *lpp;
- xfs_inobt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
- xfs_inobt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
- } else {
- lrp = XFS_INOBT_REC_ADDR(left, be16_to_cpu(left->bb_numrecs), cur);
- rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
- memmove(rrp + 1, rrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
- *rrp = *lrp;
- xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs) + 1);
- key.ir_startino = rrp->ir_startino;
- rkp = &key;
- }
- /*
- * Decrement and log left's numrecs, bump and log right's numrecs.
- */
- be16_add_cpu(&left->bb_numrecs, -1);
- xfs_inobt_log_block(cur->bc_tp, lbp, XFS_BB_NUMRECS);
- be16_add_cpu(&right->bb_numrecs, 1);
-#ifdef DEBUG
- if (level > 0)
- xfs_btree_check_key(cur->bc_btnum, rkp, rkp + 1);
- else
- xfs_btree_check_rec(cur->bc_btnum, rrp, rrp + 1);
-#endif
- xfs_inobt_log_block(cur->bc_tp, rbp, XFS_BB_NUMRECS);
- /*
- * Using a temporary cursor, update the parent key values of the
- * block on the right.
- */
- if ((error = xfs_btree_dup_cursor(cur, &tcur)))
- return error;
- xfs_btree_lastrec(tcur, level);
- if ((error = xfs_inobt_increment(tcur, level, &i)) ||
- (error = xfs_inobt_updkey(tcur, rkp, level + 1))) {
- xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
- return error;
- }
- xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
- *stat = 1;
+ cur->bc_bufs[level] = NULL;
+ cur->bc_nlevels--;
+
+ XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
return 0;
}
-/*
- * Split cur/level block in half.
- * Return new block number and its first record (to be inserted into parent).
- */
-STATIC int /* error */
-xfs_inobt_split(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level, /* level to split */
- xfs_agblock_t *bnop, /* output: block number allocated */
- xfs_inobt_key_t *keyp, /* output: first key of new block */
- xfs_btree_cur_t **curp, /* output: new cursor */
- int *stat) /* success/failure */
-{
- xfs_alloc_arg_t args; /* allocation argument structure */
- int error; /* error return value */
- int i; /* loop index/record number */
- xfs_agblock_t lbno; /* left (current) block number */
- xfs_buf_t *lbp; /* buffer for left block */
- xfs_inobt_block_t *left; /* left (current) btree block */
- xfs_inobt_key_t *lkp; /* left btree key pointer */
- xfs_inobt_ptr_t *lpp; /* left btree address pointer */
- xfs_inobt_rec_t *lrp; /* left btree record pointer */
- xfs_buf_t *rbp; /* buffer for right block */
- xfs_inobt_block_t *right; /* right (new) btree block */
- xfs_inobt_key_t *rkp; /* right btree key pointer */
- xfs_inobt_ptr_t *rpp; /* right btree address pointer */
- xfs_inobt_rec_t *rrp; /* right btree record pointer */
-
- /*
- * Set up left block (current one).
- */
- lbp = cur->bc_bufs[level];
- args.tp = cur->bc_tp;
- args.mp = cur->bc_mp;
- lbno = XFS_DADDR_TO_AGBNO(args.mp, XFS_BUF_ADDR(lbp));
- /*
- * Allocate the new block.
- * If we can't do it, we're toast. Give up.
- */
- args.fsbno = XFS_AGB_TO_FSB(args.mp, cur->bc_private.a.agno, lbno);
- args.mod = args.minleft = args.alignment = args.total = args.wasdel =
- args.isfl = args.userdata = args.minalignslop = 0;
- args.minlen = args.maxlen = args.prod = 1;
- args.type = XFS_ALLOCTYPE_NEAR_BNO;
- if ((error = xfs_alloc_vextent(&args)))
- return error;
- if (args.fsbno == NULLFSBLOCK) {
- *stat = 0;
- return 0;
- }
- ASSERT(args.len == 1);
- rbp = xfs_btree_get_bufs(args.mp, args.tp, args.agno, args.agbno, 0);
- /*
- * Set up the new block as "right".
- */
- right = XFS_BUF_TO_INOBT_BLOCK(rbp);
- /*
- * "Left" is the current (according to the cursor) block.
- */
- left = XFS_BUF_TO_INOBT_BLOCK(lbp);
#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, left, level, lbp)))
- return error;
-#endif
- /*
- * Fill in the btree header for the new block.
- */
- right->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
- right->bb_level = left->bb_level;
- right->bb_numrecs = cpu_to_be16(be16_to_cpu(left->bb_numrecs) / 2);
- /*
- * Make sure that if there's an odd number of entries now, that
- * each new block will have the same number of entries.
- */
- if ((be16_to_cpu(left->bb_numrecs) & 1) &&
- cur->bc_ptrs[level] <= be16_to_cpu(right->bb_numrecs) + 1)
- be16_add_cpu(&right->bb_numrecs, 1);
- i = be16_to_cpu(left->bb_numrecs) - be16_to_cpu(right->bb_numrecs) + 1;
- /*
- * For non-leaf blocks, copy keys and addresses over to the new block.
- */
- if (level > 0) {
- lkp = XFS_INOBT_KEY_ADDR(left, i, cur);
- lpp = XFS_INOBT_PTR_ADDR(left, i, cur);
- rkp = XFS_INOBT_KEY_ADDR(right, 1, cur);
- rpp = XFS_INOBT_PTR_ADDR(right, 1, cur);
-#ifdef DEBUG
- for (i = 0; i < be16_to_cpu(right->bb_numrecs); i++) {
- if ((error = xfs_btree_check_sptr(cur, be32_to_cpu(lpp[i]), level)))
- return error;
- }
-#endif
- memcpy(rkp, lkp, be16_to_cpu(right->bb_numrecs) * sizeof(*rkp));
- memcpy(rpp, lpp, be16_to_cpu(right->bb_numrecs) * sizeof(*rpp));
- xfs_inobt_log_keys(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
- xfs_inobt_log_ptrs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
- *keyp = *rkp;
- }
- /*
- * For leaf blocks, copy records over to the new block.
- */
- else {
- lrp = XFS_INOBT_REC_ADDR(left, i, cur);
- rrp = XFS_INOBT_REC_ADDR(right, 1, cur);
- memcpy(rrp, lrp, be16_to_cpu(right->bb_numrecs) * sizeof(*rrp));
- xfs_inobt_log_recs(cur, rbp, 1, be16_to_cpu(right->bb_numrecs));
- keyp->ir_startino = rrp->ir_startino;
- }
- /*
- * Find the left block number by looking in the buffer.
- * Adjust numrecs, sibling pointers.
- */
- be16_add_cpu(&left->bb_numrecs, -(be16_to_cpu(right->bb_numrecs)));
- right->bb_rightsib = left->bb_rightsib;
- left->bb_rightsib = cpu_to_be32(args.agbno);
- right->bb_leftsib = cpu_to_be32(lbno);
- xfs_inobt_log_block(args.tp, rbp, XFS_BB_ALL_BITS);
- xfs_inobt_log_block(args.tp, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
- /*
- * If there's a block to the new block's right, make that block
- * point back to right instead of to left.
- */
- if (be32_to_cpu(right->bb_rightsib) != NULLAGBLOCK) {
- xfs_inobt_block_t *rrblock; /* rr btree block */
- xfs_buf_t *rrbp; /* buffer for rrblock */
-
- if ((error = xfs_btree_read_bufs(args.mp, args.tp, args.agno,
- be32_to_cpu(right->bb_rightsib), 0, &rrbp,
- XFS_INO_BTREE_REF)))
- return error;
- rrblock = XFS_BUF_TO_INOBT_BLOCK(rrbp);
- if ((error = xfs_btree_check_sblock(cur, rrblock, level, rrbp)))
- return error;
- rrblock->bb_leftsib = cpu_to_be32(args.agbno);
- xfs_inobt_log_block(args.tp, rrbp, XFS_BB_LEFTSIB);
- }
- /*
- * If the cursor is really in the right block, move it there.
- * If it's just pointing past the last entry in left, then we'll
- * insert there, so don't change anything in that case.
- */
- if (cur->bc_ptrs[level] > be16_to_cpu(left->bb_numrecs) + 1) {
- xfs_btree_setbuf(cur, level, rbp);
- cur->bc_ptrs[level] -= be16_to_cpu(left->bb_numrecs);
- }
- /*
- * If there are more levels, we'll need another cursor which refers
- * the right block, no matter where this cursor was.
- */
- if (level + 1 < cur->bc_nlevels) {
- if ((error = xfs_btree_dup_cursor(cur, curp)))
- return error;
- (*curp)->bc_ptrs[level + 1]++;
- }
- *bnop = args.agbno;
- *stat = 1;
- return 0;
+STATIC int
+xfs_inobt_keys_inorder(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *k1,
+ union xfs_btree_key *k2)
+{
+ return be32_to_cpu(k1->inobt.ir_startino) <
+ be32_to_cpu(k2->inobt.ir_startino);
}
-/*
- * Update keys at all levels from here to the root along the cursor's path.
- */
-STATIC int /* error */
-xfs_inobt_updkey(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_inobt_key_t *keyp, /* new key value to update to */
- int level) /* starting level for update */
+STATIC int
+xfs_inobt_recs_inorder(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *r1,
+ union xfs_btree_rec *r2)
{
- int ptr; /* index of key in block */
-
- /*
- * Go up the tree from this level toward the root.
- * At each level, update the key value to the value input.
- * Stop when we reach a level where the cursor isn't pointing
- * at the first entry in the block.
- */
- for (ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
- xfs_buf_t *bp; /* buffer for block */
- xfs_inobt_block_t *block; /* btree block */
-#ifdef DEBUG
- int error; /* error return value */
-#endif
- xfs_inobt_key_t *kp; /* ptr to btree block keys */
-
- bp = cur->bc_bufs[level];
- block = XFS_BUF_TO_INOBT_BLOCK(bp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
- return error;
-#endif
- ptr = cur->bc_ptrs[level];
- kp = XFS_INOBT_KEY_ADDR(block, ptr, cur);
- *kp = *keyp;
- xfs_inobt_log_keys(cur, bp, ptr, ptr);
- }
- return 0;
+ return be32_to_cpu(r1->inobt.ir_startino) + XFS_INODES_PER_CHUNK <=
+ be32_to_cpu(r2->inobt.ir_startino);
}
+#endif /* DEBUG */
-/*
- * Externally visible routines.
- */
+#ifdef XFS_BTREE_TRACE
+ktrace_t *xfs_inobt_trace_buf;
-/*
- * Decrement cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-int /* error */
-xfs_inobt_decrement(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level, /* level in btree, 0 is leaf */
- int *stat) /* success/failure */
+STATIC void
+xfs_inobt_trace_enter(
+ struct xfs_btree_cur *cur,
+ const char *func,
+ char *s,
+ int type,
+ int line,
+ __psunsigned_t a0,
+ __psunsigned_t a1,
+ __psunsigned_t a2,
+ __psunsigned_t a3,
+ __psunsigned_t a4,
+ __psunsigned_t a5,
+ __psunsigned_t a6,
+ __psunsigned_t a7,
+ __psunsigned_t a8,
+ __psunsigned_t a9,
+ __psunsigned_t a10)
{
- xfs_inobt_block_t *block; /* btree block */
- int error;
- int lev; /* btree level */
-
- ASSERT(level < cur->bc_nlevels);
- /*
- * Read-ahead to the left at this level.
- */
- xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);
- /*
- * Decrement the ptr at this level. If we're still in the block
- * then we're done.
- */
- if (--cur->bc_ptrs[level] > 0) {
- *stat = 1;
- return 0;
- }
- /*
- * Get a pointer to the btree block.
- */
- block = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[level]);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, block, level,
- cur->bc_bufs[level])))
- return error;
-#endif
- /*
- * If we just went off the left edge of the tree, return failure.
- */
- if (be32_to_cpu(block->bb_leftsib) == NULLAGBLOCK) {
- *stat = 0;
- return 0;
- }
- /*
- * March up the tree decrementing pointers.
- * Stop when we don't go off the left edge of a block.
- */
- for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
- if (--cur->bc_ptrs[lev] > 0)
- break;
- /*
- * Read-ahead the left block, we're going to read it
- * in the next loop.
- */
- xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
- }
- /*
- * If we went off the root then we are seriously confused.
- */
- ASSERT(lev < cur->bc_nlevels);
- /*
- * Now walk back down the tree, fixing up the cursor's buffer
- * pointers and key numbers.
- */
- for (block = XFS_BUF_TO_INOBT_BLOCK(cur->bc_bufs[lev]); lev > level; ) {
- xfs_agblock_t agbno; /* block number of btree block */
- xfs_buf_t *bp; /* buffer containing btree block */
-
- agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur));
- if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
- cur->bc_private.a.agno, agbno, 0, &bp,
- XFS_INO_BTREE_REF)))
- return error;
- lev--;
- xfs_btree_setbuf(cur, lev, bp);
- block = XFS_BUF_TO_INOBT_BLOCK(bp);
- if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
- return error;
- cur->bc_ptrs[lev] = be16_to_cpu(block->bb_numrecs);
- }
- *stat = 1;
- return 0;
+ ktrace_enter(xfs_inobt_trace_buf, (void *)(__psint_t)type,
+ (void *)func, (void *)s, NULL, (void *)cur,
+ (void *)a0, (void *)a1, (void *)a2, (void *)a3,
+ (void *)a4, (void *)a5, (void *)a6, (void *)a7,
+ (void *)a8, (void *)a9, (void *)a10);
}
-/*
- * Delete the record pointed to by cur.
- * The cursor refers to the place where the record was (could be inserted)
- * when the operation returns.
- */
-int /* error */
-xfs_inobt_delete(
- xfs_btree_cur_t *cur, /* btree cursor */
- int *stat) /* success/failure */
+STATIC void
+xfs_inobt_trace_cursor(
+ struct xfs_btree_cur *cur,
+ __uint32_t *s0,
+ __uint64_t *l0,
+ __uint64_t *l1)
{
- int error;
- int i; /* result code */
- int level; /* btree level */
-
- /*
- * Go up the tree, starting at leaf level.
- * If 2 is returned then a join was done; go to the next level.
- * Otherwise we are done.
- */
- for (level = 0, i = 2; i == 2; level++) {
- if ((error = xfs_inobt_delrec(cur, level, &i)))
- return error;
- }
- if (i == 0) {
- for (level = 1; level < cur->bc_nlevels; level++) {
- if (cur->bc_ptrs[level] == 0) {
- if ((error = xfs_inobt_decrement(cur, level, &i)))
- return error;
- break;
- }
- }
- }
- *stat = i;
- return 0;
+ *s0 = cur->bc_private.a.agno;
+ *l0 = cur->bc_rec.i.ir_startino;
+ *l1 = cur->bc_rec.i.ir_free;
}
-
-/*
- * Get the data from the pointed-to record.
- */
-int /* error */
-xfs_inobt_get_rec(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_agino_t *ino, /* output: starting inode of chunk */
- __int32_t *fcnt, /* output: number of free inodes */
- xfs_inofree_t *free, /* output: free inode mask */
- int *stat) /* output: success/failure */
+STATIC void
+xfs_inobt_trace_key(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_key *key,
+ __uint64_t *l0,
+ __uint64_t *l1)
{
- xfs_inobt_block_t *block; /* btree block */
- xfs_buf_t *bp; /* buffer containing btree block */
-#ifdef DEBUG
- int error; /* error return value */
-#endif
- int ptr; /* record number */
- xfs_inobt_rec_t *rec; /* record data */
-
- bp = cur->bc_bufs[0];
- ptr = cur->bc_ptrs[0];
- block = XFS_BUF_TO_INOBT_BLOCK(bp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, block, 0, bp)))
- return error;
-#endif
- /*
- * Off the right end or left end, return failure.
- */
- if (ptr > be16_to_cpu(block->bb_numrecs) || ptr <= 0) {
- *stat = 0;
- return 0;
- }
- /*
- * Point to the record and extract its data.
- */
- rec = XFS_INOBT_REC_ADDR(block, ptr, cur);
- *ino = be32_to_cpu(rec->ir_startino);
- *fcnt = be32_to_cpu(rec->ir_freecount);
- *free = be64_to_cpu(rec->ir_free);
- *stat = 1;
- return 0;
+ *l0 = be32_to_cpu(key->inobt.ir_startino);
+ *l1 = 0;
}
-/*
- * Increment cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-int /* error */
-xfs_inobt_increment(
- xfs_btree_cur_t *cur, /* btree cursor */
- int level, /* level in btree, 0 is leaf */
- int *stat) /* success/failure */
+STATIC void
+xfs_inobt_trace_record(
+ struct xfs_btree_cur *cur,
+ union xfs_btree_rec *rec,
+ __uint64_t *l0,
+ __uint64_t *l1,
+ __uint64_t *l2)
{
- xfs_inobt_block_t *block; /* btree block */
- xfs_buf_t *bp; /* buffer containing btree block */
- int error; /* error return value */
- int lev; /* btree level */
+ *l0 = be32_to_cpu(rec->inobt.ir_startino);
+ *l1 = be32_to_cpu(rec->inobt.ir_freecount);
+ *l2 = be64_to_cpu(rec->inobt.ir_free);
+}
+#endif /* XFS_BTREE_TRACE */
+
+static const struct xfs_btree_ops xfs_inobt_ops = {
+ .rec_len = sizeof(xfs_inobt_rec_t),
+ .key_len = sizeof(xfs_inobt_key_t),
+
+ .dup_cursor = xfs_inobt_dup_cursor,
+ .set_root = xfs_inobt_set_root,
+ .kill_root = xfs_inobt_kill_root,
+ .alloc_block = xfs_inobt_alloc_block,
+ .free_block = xfs_inobt_free_block,
+ .get_minrecs = xfs_inobt_get_minrecs,
+ .get_maxrecs = xfs_inobt_get_maxrecs,
+ .init_key_from_rec = xfs_inobt_init_key_from_rec,
+ .init_rec_from_key = xfs_inobt_init_rec_from_key,
+ .init_rec_from_cur = xfs_inobt_init_rec_from_cur,
+ .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur,
+ .key_diff = xfs_inobt_key_diff,
- ASSERT(level < cur->bc_nlevels);
- /*
- * Read-ahead to the right at this level.
- */
- xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
- /*
- * Get a pointer to the btree block.
- */
- bp = cur->bc_bufs[level];
- block = XFS_BUF_TO_INOBT_BLOCK(bp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, block, level, bp)))
- return error;
-#endif
- /*
- * Increment the ptr at this level. If we're still in the block
- * then we're done.
- */
- if (++cur->bc_ptrs[level] <= be16_to_cpu(block->bb_numrecs)) {
- *stat = 1;
- return 0;
- }
- /*
- * If we just went off the right edge of the tree, return failure.
- */
- if (be32_to_cpu(block->bb_rightsib) == NULLAGBLOCK) {
- *stat = 0;
- return 0;
- }
- /*
- * March up the tree incrementing pointers.
- * Stop when we don't go off the right edge of a block.
- */
- for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
- bp = cur->bc_bufs[lev];
- block = XFS_BUF_TO_INOBT_BLOCK(bp);
#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
- return error;
+ .keys_inorder = xfs_inobt_keys_inorder,
+ .recs_inorder = xfs_inobt_recs_inorder,
#endif
- if (++cur->bc_ptrs[lev] <= be16_to_cpu(block->bb_numrecs))
- break;
- /*
- * Read-ahead the right block, we're going to read it
- * in the next loop.
- */
- xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
- }
- /*
- * If we went off the root then we are seriously confused.
- */
- ASSERT(lev < cur->bc_nlevels);
- /*
- * Now walk back down the tree, fixing up the cursor's buffer
- * pointers and key numbers.
- */
- for (bp = cur->bc_bufs[lev], block = XFS_BUF_TO_INOBT_BLOCK(bp);
- lev > level; ) {
- xfs_agblock_t agbno; /* block number of btree block */
- agbno = be32_to_cpu(*XFS_INOBT_PTR_ADDR(block, cur->bc_ptrs[lev], cur));
- if ((error = xfs_btree_read_bufs(cur->bc_mp, cur->bc_tp,
- cur->bc_private.a.agno, agbno, 0, &bp,
- XFS_INO_BTREE_REF)))
- return error;
- lev--;
- xfs_btree_setbuf(cur, lev, bp);
- block = XFS_BUF_TO_INOBT_BLOCK(bp);
- if ((error = xfs_btree_check_sblock(cur, block, lev, bp)))
- return error;
- cur->bc_ptrs[lev] = 1;
- }
- *stat = 1;
- return 0;
-}
+#ifdef XFS_BTREE_TRACE
+ .trace_enter = xfs_inobt_trace_enter,
+ .trace_cursor = xfs_inobt_trace_cursor,
+ .trace_key = xfs_inobt_trace_key,
+ .trace_record = xfs_inobt_trace_record,
+#endif
+};
/*
- * Insert the current record at the point referenced by cur.
- * The cursor may be inconsistent on return if splits have been done.
+ * Allocate a new inode btree cursor.
*/
-int /* error */
-xfs_inobt_insert(
- xfs_btree_cur_t *cur, /* btree cursor */
- int *stat) /* success/failure */
+struct xfs_btree_cur * /* new inode btree cursor */
+xfs_inobt_init_cursor(
+ struct xfs_mount *mp, /* file system mount point */
+ struct xfs_trans *tp, /* transaction pointer */
+ struct xfs_buf *agbp, /* buffer for agi structure */
+ xfs_agnumber_t agno) /* allocation group number */
{
- int error; /* error return value */
- int i; /* result value, 0 for failure */
- int level; /* current level number in btree */
- xfs_agblock_t nbno; /* new block number (split result) */
- xfs_btree_cur_t *ncur; /* new cursor (split result) */
- xfs_inobt_rec_t nrec; /* record being inserted this level */
- xfs_btree_cur_t *pcur; /* previous level's cursor */
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp);
+ struct xfs_btree_cur *cur;
- level = 0;
- nbno = NULLAGBLOCK;
- nrec.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino);
- nrec.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount);
- nrec.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free);
- ncur = NULL;
- pcur = cur;
- /*
- * Loop going up the tree, starting at the leaf level.
- * Stop when we don't get a split block, that must mean that
- * the insert is finished with this level.
- */
- do {
- /*
- * Insert nrec/nbno into this level of the tree.
- * Note if we fail, nbno will be null.
- */
- if ((error = xfs_inobt_insrec(pcur, level++, &nbno, &nrec, &ncur,
- &i))) {
- if (pcur != cur)
- xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
- return error;
- }
- /*
- * See if the cursor we just used is trash.
- * Can't trash the caller's cursor, but otherwise we should
- * if ncur is a new cursor or we're about to be done.
- */
- if (pcur != cur && (ncur || nbno == NULLAGBLOCK)) {
- cur->bc_nlevels = pcur->bc_nlevels;
- xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
- }
- /*
- * If we got a new cursor, switch to it.
- */
- if (ncur) {
- pcur = ncur;
- ncur = NULL;
- }
- } while (nbno != NULLAGBLOCK);
- *stat = i;
- return 0;
-}
+ cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
-/*
- * Lookup the record equal to ino in the btree given by cur.
- */
-int /* error */
-xfs_inobt_lookup_eq(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_agino_t ino, /* starting inode of chunk */
- __int32_t fcnt, /* free inode count */
- xfs_inofree_t free, /* free inode mask */
- int *stat) /* success/failure */
-{
- cur->bc_rec.i.ir_startino = ino;
- cur->bc_rec.i.ir_freecount = fcnt;
- cur->bc_rec.i.ir_free = free;
- return xfs_inobt_lookup(cur, XFS_LOOKUP_EQ, stat);
-}
+ cur->bc_tp = tp;
+ cur->bc_mp = mp;
+ cur->bc_nlevels = be32_to_cpu(agi->agi_level);
+ cur->bc_btnum = XFS_BTNUM_INO;
+ cur->bc_blocklog = mp->m_sb.sb_blocklog;
-/*
- * Lookup the first record greater than or equal to ino
- * in the btree given by cur.
- */
-int /* error */
-xfs_inobt_lookup_ge(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_agino_t ino, /* starting inode of chunk */
- __int32_t fcnt, /* free inode count */
- xfs_inofree_t free, /* free inode mask */
- int *stat) /* success/failure */
-{
- cur->bc_rec.i.ir_startino = ino;
- cur->bc_rec.i.ir_freecount = fcnt;
- cur->bc_rec.i.ir_free = free;
- return xfs_inobt_lookup(cur, XFS_LOOKUP_GE, stat);
-}
+ cur->bc_ops = &xfs_inobt_ops;
-/*
- * Lookup the first record less than or equal to ino
- * in the btree given by cur.
- */
-int /* error */
-xfs_inobt_lookup_le(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_agino_t ino, /* starting inode of chunk */
- __int32_t fcnt, /* free inode count */
- xfs_inofree_t free, /* free inode mask */
- int *stat) /* success/failure */
-{
- cur->bc_rec.i.ir_startino = ino;
- cur->bc_rec.i.ir_freecount = fcnt;
- cur->bc_rec.i.ir_free = free;
- return xfs_inobt_lookup(cur, XFS_LOOKUP_LE, stat);
+ cur->bc_private.a.agbp = agbp;
+ cur->bc_private.a.agno = agno;
+
+ return cur;
}
/*
- * Update the record referred to by cur, to the value given
- * by [ino, fcnt, free].
- * This either works (return 0) or gets an EFSCORRUPTED error.
+ * Calculate number of records in an inobt btree block.
*/
-int /* error */
-xfs_inobt_update(
- xfs_btree_cur_t *cur, /* btree cursor */
- xfs_agino_t ino, /* starting inode of chunk */
- __int32_t fcnt, /* free inode count */
- xfs_inofree_t free) /* free inode mask */
+int
+xfs_inobt_maxrecs(
+ struct xfs_mount *mp,
+ int blocklen,
+ int leaf)
{
- xfs_inobt_block_t *block; /* btree block to update */
- xfs_buf_t *bp; /* buffer containing btree block */
- int error; /* error return value */
- int ptr; /* current record number (updating) */
- xfs_inobt_rec_t *rp; /* pointer to updated record */
+ blocklen -= XFS_INOBT_BLOCK_LEN(mp);
- /*
- * Pick up the current block.
- */
- bp = cur->bc_bufs[0];
- block = XFS_BUF_TO_INOBT_BLOCK(bp);
-#ifdef DEBUG
- if ((error = xfs_btree_check_sblock(cur, block, 0, bp)))
- return error;
-#endif
- /*
- * Get the address of the rec to be updated.
- */
- ptr = cur->bc_ptrs[0];
- rp = XFS_INOBT_REC_ADDR(block, ptr, cur);
- /*
- * Fill in the new contents and log them.
- */
- rp->ir_startino = cpu_to_be32(ino);
- rp->ir_freecount = cpu_to_be32(fcnt);
- rp->ir_free = cpu_to_be64(free);
- xfs_inobt_log_recs(cur, bp, ptr, ptr);
- /*
- * Updating first record in leaf. Pass new key value up to our parent.
- */
- if (ptr == 1) {
- xfs_inobt_key_t key; /* key containing [ino] */
-
- key.ir_startino = cpu_to_be32(ino);
- if ((error = xfs_inobt_updkey(cur, &key, 1)))
- return error;
- }
- return 0;
+ if (leaf)
+ return blocklen / sizeof(xfs_inobt_rec_t);
+ return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t));
}
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h
index 8efc4a5b8b92..37e5dd01a577 100644
--- a/fs/xfs/xfs_ialloc_btree.h
+++ b/fs/xfs/xfs_ialloc_btree.h
@@ -24,7 +24,6 @@
struct xfs_buf;
struct xfs_btree_cur;
-struct xfs_btree_sblock;
struct xfs_mount;
/*
@@ -70,11 +69,6 @@ typedef struct xfs_inobt_key {
/* btree pointer type */
typedef __be32 xfs_inobt_ptr_t;
-/* btree block header type */
-typedef struct xfs_btree_sblock xfs_inobt_block_t;
-
-#define XFS_BUF_TO_INOBT_BLOCK(bp) ((xfs_inobt_block_t *)XFS_BUF_PTR(bp))
-
/*
* Bit manipulations for ir_free.
*/
@@ -85,14 +79,6 @@ typedef struct xfs_btree_sblock xfs_inobt_block_t;
#define XFS_INOBT_CLR_FREE(rp,i) ((rp)->ir_free &= ~XFS_INOBT_MASK(i))
/*
- * Real block structures have a size equal to the disk block size.
- */
-#define XFS_INOBT_BLOCK_MAXRECS(lev,cur) ((cur)->bc_mp->m_inobt_mxr[lev != 0])
-#define XFS_INOBT_BLOCK_MINRECS(lev,cur) ((cur)->bc_mp->m_inobt_mnr[lev != 0])
-#define XFS_INOBT_IS_LAST_REC(cur) \
- ((cur)->bc_ptrs[0] == be16_to_cpu(XFS_BUF_TO_INOBT_BLOCK((cur)->bc_bufs[0])->bb_numrecs))
-
-/*
* Maximum number of inode btree levels.
*/
#define XFS_IN_MAXLEVELS(mp) ((mp)->m_in_maxlevels)
@@ -104,75 +90,38 @@ typedef struct xfs_btree_sblock xfs_inobt_block_t;
#define XFS_PREALLOC_BLOCKS(mp) ((xfs_agblock_t)(XFS_IBT_BLOCK(mp) + 1))
/*
- * Record, key, and pointer address macros for btree blocks.
- */
-#define XFS_INOBT_REC_ADDR(bb,i,cur) \
- (XFS_BTREE_REC_ADDR(xfs_inobt, bb, i))
-
-#define XFS_INOBT_KEY_ADDR(bb,i,cur) \
- (XFS_BTREE_KEY_ADDR(xfs_inobt, bb, i))
-
-#define XFS_INOBT_PTR_ADDR(bb,i,cur) \
- (XFS_BTREE_PTR_ADDR(xfs_inobt, bb, \
- i, XFS_INOBT_BLOCK_MAXRECS(1, cur)))
-
-/*
- * Decrement cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-extern int xfs_inobt_decrement(struct xfs_btree_cur *cur, int level, int *stat);
-
-/*
- * Delete the record pointed to by cur.
- * The cursor refers to the place where the record was (could be inserted)
- * when the operation returns.
- */
-extern int xfs_inobt_delete(struct xfs_btree_cur *cur, int *stat);
-
-/*
- * Get the data from the pointed-to record.
- */
-extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_agino_t *ino,
- __int32_t *fcnt, xfs_inofree_t *free, int *stat);
-
-/*
- * Increment cursor by one record at the level.
- * For nonzero levels the leaf-ward information is untouched.
- */
-extern int xfs_inobt_increment(struct xfs_btree_cur *cur, int level, int *stat);
-
-/*
- * Insert the current record at the point referenced by cur.
- * The cursor may be inconsistent on return if splits have been done.
- */
-extern int xfs_inobt_insert(struct xfs_btree_cur *cur, int *stat);
-
-/*
- * Lookup the record equal to ino in the btree given by cur.
- */
-extern int xfs_inobt_lookup_eq(struct xfs_btree_cur *cur, xfs_agino_t ino,
- __int32_t fcnt, xfs_inofree_t free, int *stat);
-
-/*
- * Lookup the first record greater than or equal to ino
- * in the btree given by cur.
- */
-extern int xfs_inobt_lookup_ge(struct xfs_btree_cur *cur, xfs_agino_t ino,
- __int32_t fcnt, xfs_inofree_t free, int *stat);
-
-/*
- * Lookup the first record less than or equal to ino
- * in the btree given by cur.
+ * Btree block header size depends on a superblock flag.
+ *
+ * (not quite yet, but soon)
*/
-extern int xfs_inobt_lookup_le(struct xfs_btree_cur *cur, xfs_agino_t ino,
- __int32_t fcnt, xfs_inofree_t free, int *stat);
+#define XFS_INOBT_BLOCK_LEN(mp) XFS_BTREE_SBLOCK_LEN
/*
- * Update the record referred to by cur, to the value given
- * by [ino, fcnt, free].
- * This either works (return 0) or gets an EFSCORRUPTED error.
- */
-extern int xfs_inobt_update(struct xfs_btree_cur *cur, xfs_agino_t ino,
- __int32_t fcnt, xfs_inofree_t free);
+ * Record, key, and pointer address macros for btree blocks.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+#define XFS_INOBT_REC_ADDR(mp, block, index) \
+ ((xfs_inobt_rec_t *) \
+ ((char *)(block) + \
+ XFS_INOBT_BLOCK_LEN(mp) + \
+ (((index) - 1) * sizeof(xfs_inobt_rec_t))))
+
+#define XFS_INOBT_KEY_ADDR(mp, block, index) \
+ ((xfs_inobt_key_t *) \
+ ((char *)(block) + \
+ XFS_INOBT_BLOCK_LEN(mp) + \
+ ((index) - 1) * sizeof(xfs_inobt_key_t)))
+
+#define XFS_INOBT_PTR_ADDR(mp, block, index, maxrecs) \
+ ((xfs_inobt_ptr_t *) \
+ ((char *)(block) + \
+ XFS_INOBT_BLOCK_LEN(mp) + \
+ (maxrecs) * sizeof(xfs_inobt_key_t) + \
+ ((index) - 1) * sizeof(xfs_inobt_ptr_t)))
+
+extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
+ struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t);
+extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
#endif /* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index e229e9e001c2..e2fb6210d4c5 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -38,281 +38,283 @@
#include "xfs_ialloc.h"
#include "xfs_quota.h"
#include "xfs_utils.h"
+#include "xfs_trans_priv.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_btree_trace.h"
+#include "xfs_dir2_trace.h"
+
/*
- * Look up an inode by number in the given file system.
- * The inode is looked up in the cache held in each AG.
- * If the inode is found in the cache, attach it to the provided
- * vnode.
- *
- * If it is not in core, read it in from the file system's device,
- * add it to the cache and attach the provided vnode.
- *
- * The inode is locked according to the value of the lock_flags parameter.
- * This flag parameter indicates how and if the inode's IO lock and inode lock
- * should be taken.
- *
- * mp -- the mount point structure for the current file system. It points
- * to the inode hash table.
- * tp -- a pointer to the current transaction if there is one. This is
- * simply passed through to the xfs_iread() call.
- * ino -- the number of the inode desired. This is the unique identifier
- * within the file system for the inode being requested.
- * lock_flags -- flags indicating how to lock the inode. See the comment
- * for xfs_ilock() for a list of valid values.
- * bno -- the block number starting the buffer containing the inode,
- * if known (as by bulkstat), else 0.
+ * Allocate and initialise an xfs_inode.
*/
-STATIC int
-xfs_iget_core(
- struct inode *inode,
- xfs_mount_t *mp,
- xfs_trans_t *tp,
- xfs_ino_t ino,
- uint flags,
- uint lock_flags,
- xfs_inode_t **ipp,
- xfs_daddr_t bno)
+STATIC struct xfs_inode *
+xfs_inode_alloc(
+ struct xfs_mount *mp,
+ xfs_ino_t ino)
{
- struct inode *old_inode;
- xfs_inode_t *ip;
- xfs_inode_t *iq;
- int error;
- unsigned long first_index, mask;
- xfs_perag_t *pag;
- xfs_agino_t agino;
+ struct xfs_inode *ip;
- /* the radix tree exists only in inode capable AGs */
- if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_maxagi)
- return EINVAL;
+ /*
+ * if this didn't occur in transactions, we could use
+ * KM_MAYFAIL and return NULL here on ENOMEM. Set the
+ * code up to do this anyway.
+ */
+ ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
+ if (!ip)
+ return NULL;
- /* get the perag structure and ensure that it's inode capable */
- pag = xfs_get_perag(mp, ino);
- if (!pag->pagi_inodeok)
- return EINVAL;
- ASSERT(pag->pag_ici_init);
- agino = XFS_INO_TO_AGINO(mp, ino);
+ ASSERT(atomic_read(&ip->i_iocount) == 0);
+ ASSERT(atomic_read(&ip->i_pincount) == 0);
+ ASSERT(!spin_is_locked(&ip->i_flags_lock));
+ ASSERT(completion_done(&ip->i_flush));
-again:
- read_lock(&pag->pag_ici_lock);
- ip = radix_tree_lookup(&pag->pag_ici_root, agino);
+ /*
+ * initialise the VFS inode here to get failures
+ * out of the way early.
+ */
+ if (!inode_init_always(mp->m_super, VFS_I(ip))) {
+ kmem_zone_free(xfs_inode_zone, ip);
+ return NULL;
+ }
+
+ /* initialise the xfs inode */
+ ip->i_ino = ino;
+ ip->i_mount = mp;
+ memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
+ ip->i_afp = NULL;
+ memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
+ ip->i_flags = 0;
+ ip->i_update_core = 0;
+ ip->i_update_size = 0;
+ ip->i_delayed_blks = 0;
+ memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
+ ip->i_size = 0;
+ ip->i_new_size = 0;
+
+ /*
+ * Initialize inode's trace buffers.
+ */
+#ifdef XFS_INODE_TRACE
+ ip->i_trace = ktrace_alloc(INODE_TRACE_SIZE, KM_NOFS);
+#endif
+#ifdef XFS_BMAP_TRACE
+ ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_NOFS);
+#endif
+#ifdef XFS_BTREE_TRACE
+ ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_NOFS);
+#endif
+#ifdef XFS_RW_TRACE
+ ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_NOFS);
+#endif
+#ifdef XFS_ILOCK_TRACE
+ ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_NOFS);
+#endif
+#ifdef XFS_DIR2_TRACE
+ ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS);
+#endif
+
+ return ip;
+}
+
+/*
+ * Check the validity of the inode we just found in the cache
+ */
+static int
+xfs_iget_cache_hit(
+ struct xfs_perag *pag,
+ struct xfs_inode *ip,
+ int flags,
+ int lock_flags) __releases(pag->pag_ici_lock)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ int error = EAGAIN;
+
+ /*
+ * If INEW is set this inode is being set up
+ * If IRECLAIM is set this inode is being torn down
+ * Pause and try again.
+ */
+ if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) {
+ XFS_STATS_INC(xs_ig_frecycle);
+ goto out_error;
+ }
+
+ /* If IRECLAIMABLE is set, we've torn down the vfs inode part */
+ if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) {
- if (ip != NULL) {
/*
- * If INEW is set this inode is being set up
- * we need to pause and try again.
+ * If lookup is racing with unlink, then we should return an
+ * error immediately so we don't remove it from the reclaim
+ * list and potentially leak the inode.
*/
- if (xfs_iflags_test(ip, XFS_INEW)) {
- read_unlock(&pag->pag_ici_lock);
- delay(1);
- XFS_STATS_INC(xs_ig_frecycle);
-
- goto again;
+ if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
+ error = ENOENT;
+ goto out_error;
}
- old_inode = ip->i_vnode;
- if (old_inode == NULL) {
- /*
- * If IRECLAIM is set this inode is
- * on its way out of the system,
- * we need to pause and try again.
- */
- if (xfs_iflags_test(ip, XFS_IRECLAIM)) {
- read_unlock(&pag->pag_ici_lock);
- delay(1);
- XFS_STATS_INC(xs_ig_frecycle);
-
- goto again;
- }
- ASSERT(xfs_iflags_test(ip, XFS_IRECLAIMABLE));
-
- /*
- * If lookup is racing with unlink, then we
- * should return an error immediately so we
- * don't remove it from the reclaim list and
- * potentially leak the inode.
- */
- if ((ip->i_d.di_mode == 0) &&
- !(flags & XFS_IGET_CREATE)) {
- read_unlock(&pag->pag_ici_lock);
- xfs_put_perag(mp, pag);
- return ENOENT;
- }
-
- xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
-
- XFS_STATS_INC(xs_ig_found);
- xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
- read_unlock(&pag->pag_ici_lock);
-
- XFS_MOUNT_ILOCK(mp);
- list_del_init(&ip->i_reclaim);
- XFS_MOUNT_IUNLOCK(mp);
-
- goto finish_inode;
-
- } else if (inode != old_inode) {
- /* The inode is being torn down, pause and
- * try again.
- */
- if (old_inode->i_state & (I_FREEING | I_CLEAR)) {
- read_unlock(&pag->pag_ici_lock);
- delay(1);
- XFS_STATS_INC(xs_ig_frecycle);
-
- goto again;
- }
-/* Chances are the other vnode (the one in the inode) is being torn
-* down right now, and we landed on top of it. Question is, what do
-* we do? Unhook the old inode and hook up the new one?
-*/
- cmn_err(CE_PANIC,
- "xfs_iget_core: ambiguous vns: vp/0x%p, invp/0x%p",
- old_inode, inode);
- }
+ xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
/*
- * Inode cache hit
+ * We need to re-initialise the VFS inode as it has been
+ * 'freed' by the VFS. Do this here so we can deal with
+ * errors cleanly, then tag it so it can be set up correctly
+ * later.
*/
- read_unlock(&pag->pag_ici_lock);
- XFS_STATS_INC(xs_ig_found);
-
-finish_inode:
- if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
- xfs_put_perag(mp, pag);
- return ENOENT;
+ if (!inode_init_always(mp->m_super, VFS_I(ip))) {
+ error = ENOMEM;
+ goto out_error;
}
- if (lock_flags != 0)
- xfs_ilock(ip, lock_flags);
+ /*
+ * We must set the XFS_INEW flag before clearing the
+ * XFS_IRECLAIMABLE flag so that if a racing lookup does
+ * not find the XFS_IRECLAIMABLE above but has the igrab()
+ * below succeed we can safely check XFS_INEW to detect
+ * that this inode is still being initialised.
+ */
+ xfs_iflags_set(ip, XFS_INEW);
+ xfs_iflags_clear(ip, XFS_IRECLAIMABLE);
+
+ /* clear the radix tree reclaim flag as well. */
+ __xfs_inode_clear_reclaim_tag(mp, pag, ip);
+ } else if (!igrab(VFS_I(ip))) {
+ /* If the VFS inode is being torn down, pause and try again. */
+ XFS_STATS_INC(xs_ig_frecycle);
+ goto out_error;
+ } else if (xfs_iflags_test(ip, XFS_INEW)) {
+ /*
+ * We are racing with another cache hit that is
+ * currently recycling this inode out of the XFS_IRECLAIMABLE
+ * state. Wait for the initialisation to complete before
+ * continuing.
+ */
+ wait_on_inode(VFS_I(ip));
+ }
- xfs_iflags_clear(ip, XFS_ISTALE);
- xfs_itrace_exit_tag(ip, "xfs_iget.found");
- goto return_ip;
+ if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
+ error = ENOENT;
+ iput(VFS_I(ip));
+ goto out_error;
}
- /*
- * Inode cache miss
- */
+ /* We've got a live one. */
read_unlock(&pag->pag_ici_lock);
- XFS_STATS_INC(xs_ig_missed);
- /*
- * Read the disk inode attributes into a new inode structure and get
- * a new vnode for it. This should also initialize i_ino and i_mount.
- */
- error = xfs_iread(mp, tp, ino, &ip, bno,
- (flags & XFS_IGET_BULKSTAT) ? XFS_IMAP_BULKSTAT : 0);
- if (error) {
- xfs_put_perag(mp, pag);
- return error;
- }
+ if (lock_flags != 0)
+ xfs_ilock(ip, lock_flags);
- xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
+ xfs_iflags_clear(ip, XFS_ISTALE);
+ xfs_itrace_exit_tag(ip, "xfs_iget.found");
+ XFS_STATS_INC(xs_ig_found);
+ return 0;
+
+out_error:
+ read_unlock(&pag->pag_ici_lock);
+ return error;
+}
- mrlock_init(&ip->i_lock, MRLOCK_ALLOW_EQUAL_PRI|MRLOCK_BARRIER,
- "xfsino", ip->i_ino);
- mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
- init_waitqueue_head(&ip->i_ipin_wait);
- atomic_set(&ip->i_pincount, 0);
+static int
+xfs_iget_cache_miss(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag,
+ xfs_trans_t *tp,
+ xfs_ino_t ino,
+ struct xfs_inode **ipp,
+ xfs_daddr_t bno,
+ int flags,
+ int lock_flags) __releases(pag->pag_ici_lock)
+{
+ struct xfs_inode *ip;
+ int error;
+ unsigned long first_index, mask;
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino);
- /*
- * Because we want to use a counting completion, complete
- * the flush completion once to allow a single access to
- * the flush completion without blocking.
- */
- init_completion(&ip->i_flush);
- complete(&ip->i_flush);
+ ip = xfs_inode_alloc(mp, ino);
+ if (!ip)
+ return ENOMEM;
- if (lock_flags)
- xfs_ilock(ip, lock_flags);
+ error = xfs_iread(mp, tp, ip, bno, flags);
+ if (error)
+ goto out_destroy;
+
+ xfs_itrace_exit_tag(ip, "xfs_iget.alloc");
if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
- xfs_idestroy(ip);
- xfs_put_perag(mp, pag);
- return ENOENT;
+ error = ENOENT;
+ goto out_destroy;
}
+ if (lock_flags)
+ xfs_ilock(ip, lock_flags);
+
/*
* Preload the radix tree so we can insert safely under the
- * write spinlock.
+ * write spinlock. Note that we cannot sleep inside the preload
+ * region.
*/
if (radix_tree_preload(GFP_KERNEL)) {
- xfs_idestroy(ip);
- delay(1);
- goto again;
+ error = EAGAIN;
+ goto out_unlock;
}
+
mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1);
first_index = agino & mask;
write_lock(&pag->pag_ici_lock);
- /*
- * insert the new inode
- */
+
+ /* insert the new inode */
error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
if (unlikely(error)) {
- BUG_ON(error != -EEXIST);
- write_unlock(&pag->pag_ici_lock);
- radix_tree_preload_end();
- xfs_idestroy(ip);
+ WARN_ON(error != -EEXIST);
XFS_STATS_INC(xs_ig_dup);
- goto again;
+ error = EAGAIN;
+ goto out_preload_end;
}
- /*
- * These values _must_ be set before releasing the radix tree lock!
- */
+ /* These values _must_ be set before releasing the radix tree lock! */
ip->i_udquot = ip->i_gdquot = NULL;
xfs_iflags_set(ip, XFS_INEW);
write_unlock(&pag->pag_ici_lock);
radix_tree_preload_end();
-
- /*
- * Link ip to its mount and thread it on the mount's inode list.
- */
- XFS_MOUNT_ILOCK(mp);
- if ((iq = mp->m_inodes)) {
- ASSERT(iq->i_mprev->i_mnext == iq);
- ip->i_mprev = iq->i_mprev;
- iq->i_mprev->i_mnext = ip;
- iq->i_mprev = ip;
- ip->i_mnext = iq;
- } else {
- ip->i_mnext = ip;
- ip->i_mprev = ip;
- }
- mp->m_inodes = ip;
-
- XFS_MOUNT_IUNLOCK(mp);
- xfs_put_perag(mp, pag);
-
- return_ip:
- ASSERT(ip->i_df.if_ext_max ==
- XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t));
-
- xfs_iflags_set(ip, XFS_IMODIFIED);
*ipp = ip;
-
- /*
- * Set up the Linux with the Linux inode.
- */
- ip->i_vnode = inode;
- inode->i_private = ip;
-
- /*
- * If we have a real type for an on-disk inode, we can set ops(&unlock)
- * now. If it's a new inode being created, xfs_ialloc will handle it.
- */
- if (ip->i_d.di_mode != 0)
- xfs_setup_inode(ip);
return 0;
-}
+out_preload_end:
+ write_unlock(&pag->pag_ici_lock);
+ radix_tree_preload_end();
+out_unlock:
+ if (lock_flags)
+ xfs_iunlock(ip, lock_flags);
+out_destroy:
+ xfs_destroy_inode(ip);
+ return error;
+}
/*
- * The 'normal' internal xfs_iget, if needed it will
- * 'allocate', or 'get', the vnode.
+ * Look up an inode by number in the given file system.
+ * The inode is looked up in the cache held in each AG.
+ * If the inode is found in the cache, initialise the vfs inode
+ * if necessary.
+ *
+ * If it is not in core, read it in from the file system's device,
+ * add it to the cache and initialise the vfs inode.
+ *
+ * The inode is locked according to the value of the lock_flags parameter.
+ * This flag parameter indicates how and if the inode's IO lock and inode lock
+ * should be taken.
+ *
+ * mp -- the mount point structure for the current file system. It points
+ * to the inode hash table.
+ * tp -- a pointer to the current transaction if there is one. This is
+ * simply passed through to the xfs_iread() call.
+ * ino -- the number of the inode desired. This is the unique identifier
+ * within the file system for the inode being requested.
+ * lock_flags -- flags indicating how to lock the inode. See the comment
+ * for xfs_ilock() for a list of valid values.
+ * bno -- the block number starting the buffer containing the inode,
+ * if known (as by bulkstat), else 0.
*/
int
xfs_iget(
@@ -324,61 +326,64 @@ xfs_iget(
xfs_inode_t **ipp,
xfs_daddr_t bno)
{
- struct inode *inode;
xfs_inode_t *ip;
int error;
+ xfs_perag_t *pag;
+ xfs_agino_t agino;
- XFS_STATS_INC(xs_ig_attempts);
+ /* the radix tree exists only in inode capable AGs */
+ if (XFS_INO_TO_AGNO(mp, ino) >= mp->m_maxagi)
+ return EINVAL;
-retry:
- inode = iget_locked(mp->m_super, ino);
- if (!inode)
- /* If we got no inode we are out of memory */
- return ENOMEM;
+ /* get the perag structure and ensure that it's inode capable */
+ pag = xfs_get_perag(mp, ino);
+ if (!pag->pagi_inodeok)
+ return EINVAL;
+ ASSERT(pag->pag_ici_init);
+ agino = XFS_INO_TO_AGINO(mp, ino);
- if (inode->i_state & I_NEW) {
- XFS_STATS_INC(vn_active);
- XFS_STATS_INC(vn_alloc);
-
- error = xfs_iget_core(inode, mp, tp, ino, flags,
- lock_flags, ipp, bno);
- if (error) {
- make_bad_inode(inode);
- if (inode->i_state & I_NEW)
- unlock_new_inode(inode);
- iput(inode);
- }
- return error;
+again:
+ error = 0;
+ read_lock(&pag->pag_ici_lock);
+ ip = radix_tree_lookup(&pag->pag_ici_root, agino);
+
+ if (ip) {
+ error = xfs_iget_cache_hit(pag, ip, flags, lock_flags);
+ if (error)
+ goto out_error_or_again;
+ } else {
+ read_unlock(&pag->pag_ici_lock);
+ XFS_STATS_INC(xs_ig_missed);
+
+ error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, bno,
+ flags, lock_flags);
+ if (error)
+ goto out_error_or_again;
}
+ xfs_put_perag(mp, pag);
+ *ipp = ip;
+
+ ASSERT(ip->i_df.if_ext_max ==
+ XFS_IFORK_DSIZE(ip) / sizeof(xfs_bmbt_rec_t));
/*
- * If the inode is not fully constructed due to
- * filehandle mismatches wait for the inode to go
- * away and try again.
- *
- * iget_locked will call __wait_on_freeing_inode
- * to wait for the inode to go away.
+ * If we have a real type for an on-disk inode, we can set ops(&unlock)
+ * now. If it's a new inode being created, xfs_ialloc will handle it.
*/
- if (is_bad_inode(inode)) {
- iput(inode);
- delay(1);
- goto retry;
- }
+ if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
+ xfs_setup_inode(ip);
+ return 0;
- ip = XFS_I(inode);
- if (!ip) {
- iput(inode);
+out_error_or_again:
+ if (error == EAGAIN) {
delay(1);
- goto retry;
+ goto again;
}
-
- if (lock_flags != 0)
- xfs_ilock(ip, lock_flags);
- XFS_STATS_INC(xs_ig_found);
- *ipp = ip;
- return 0;
+ xfs_put_perag(mp, pag);
+ return error;
}
+
/*
* Look for the inode corresponding to the given ino in the hash table.
* If it is there and its i_transp pointer matches tp, return it.
@@ -444,99 +449,109 @@ xfs_iput_new(
IRELE(ip);
}
-
/*
- * This routine embodies the part of the reclaim code that pulls
- * the inode from the inode hash table and the mount structure's
- * inode list.
- * This should only be called from xfs_reclaim().
+ * This is called to free all the memory associated with an inode.
+ * It must free the inode itself and any buffers allocated for
+ * if_extents/if_data and if_broot. It must also free the lock
+ * associated with the inode.
+ *
+ * Note: because we don't initialise everything on reallocation out
+ * of the zone, we must ensure we nullify everything correctly before
+ * freeing the structure.
*/
void
-xfs_ireclaim(xfs_inode_t *ip)
+xfs_ireclaim(
+ struct xfs_inode *ip)
{
- /*
- * Remove from old hash list and mount list.
- */
- XFS_STATS_INC(xs_ig_reclaims);
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_perag *pag;
- xfs_iextract(ip);
-
- /*
- * Here we do a spurious inode lock in order to coordinate with
- * xfs_sync(). This is because xfs_sync() references the inodes
- * in the mount list without taking references on the corresponding
- * vnodes. We make that OK here by ensuring that we wait until
- * the inode is unlocked in xfs_sync() before we go ahead and
- * free it. We get both the regular lock and the io lock because
- * the xfs_sync() code may need to drop the regular one but will
- * still hold the io lock.
- */
- xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-
- /*
- * Release dquots (and their references) if any. An inode may escape
- * xfs_inactive and get here via vn_alloc->vn_reclaim path.
- */
- XFS_QM_DQDETACH(ip->i_mount, ip);
-
- /*
- * Pull our behavior descriptor from the vnode chain.
- */
- if (ip->i_vnode) {
- ip->i_vnode->i_private = NULL;
- ip->i_vnode = NULL;
- }
+ XFS_STATS_INC(xs_ig_reclaims);
/*
- * Free all memory associated with the inode.
+ * Remove the inode from the per-AG radix tree. It doesn't matter
+ * if it was never added to it because radix_tree_delete can deal
+ * with that case just fine.
*/
- xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
- xfs_idestroy(ip);
-}
-
-/*
- * This routine removes an about-to-be-destroyed inode from
- * all of the lists in which it is located with the exception
- * of the behavior chain.
- */
-void
-xfs_iextract(
- xfs_inode_t *ip)
-{
- xfs_mount_t *mp = ip->i_mount;
- xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino);
- xfs_inode_t *iq;
-
+ pag = xfs_get_perag(mp, ip->i_ino);
write_lock(&pag->pag_ici_lock);
radix_tree_delete(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino));
write_unlock(&pag->pag_ici_lock);
xfs_put_perag(mp, pag);
/*
- * Remove from mount's inode list.
+ * Here we do an (almost) spurious inode lock in order to coordinate
+ * with inode cache radix tree lookups. This is because the lookup
+ * can reference the inodes in the cache without taking references.
+ *
+ * We make that OK here by ensuring that we wait until the inode is
+ * unlocked after the lookup before we go ahead and free it. We get
+ * both the ilock and the iolock because the code may need to drop the
+ * ilock but will still hold the iolock.
*/
- XFS_MOUNT_ILOCK(mp);
- ASSERT((ip->i_mnext != NULL) && (ip->i_mprev != NULL));
- iq = ip->i_mnext;
- iq->i_mprev = ip->i_mprev;
- ip->i_mprev->i_mnext = iq;
-
+ xfs_ilock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
/*
- * Fix up the head pointer if it points to the inode being deleted.
+ * Release dquots (and their references) if any.
*/
- if (mp->m_inodes == ip) {
- if (ip == iq) {
- mp->m_inodes = NULL;
- } else {
- mp->m_inodes = iq;
- }
+ XFS_QM_DQDETACH(ip->i_mount, ip);
+ xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+
+ switch (ip->i_d.di_mode & S_IFMT) {
+ case S_IFREG:
+ case S_IFDIR:
+ case S_IFLNK:
+ xfs_idestroy_fork(ip, XFS_DATA_FORK);
+ break;
}
- /* Deal with the deleted inodes list */
- list_del_init(&ip->i_reclaim);
+ if (ip->i_afp)
+ xfs_idestroy_fork(ip, XFS_ATTR_FORK);
- mp->m_ireclaims++;
- XFS_MOUNT_IUNLOCK(mp);
+#ifdef XFS_INODE_TRACE
+ ktrace_free(ip->i_trace);
+#endif
+#ifdef XFS_BMAP_TRACE
+ ktrace_free(ip->i_xtrace);
+#endif
+#ifdef XFS_BTREE_TRACE
+ ktrace_free(ip->i_btrace);
+#endif
+#ifdef XFS_RW_TRACE
+ ktrace_free(ip->i_rwtrace);
+#endif
+#ifdef XFS_ILOCK_TRACE
+ ktrace_free(ip->i_lock_trace);
+#endif
+#ifdef XFS_DIR2_TRACE
+ ktrace_free(ip->i_dir_trace);
+#endif
+ if (ip->i_itemp) {
+ /*
+ * Only if we are shutting down the fs will we see an
+ * inode still in the AIL. If it is there, we should remove
+ * it to prevent a use-after-free from occurring.
+ */
+ xfs_log_item_t *lip = &ip->i_itemp->ili_item;
+ struct xfs_ail *ailp = lip->li_ailp;
+
+ ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) ||
+ XFS_FORCED_SHUTDOWN(ip->i_mount));
+ if (lip->li_flags & XFS_LI_IN_AIL) {
+ spin_lock(&ailp->xa_lock);
+ if (lip->li_flags & XFS_LI_IN_AIL)
+ xfs_trans_ail_delete(ailp, lip);
+ else
+ spin_unlock(&ailp->xa_lock);
+ }
+ xfs_inode_item_destroy(ip);
+ ip->i_itemp = NULL;
+ }
+ /* asserts to verify all state is correct here */
+ ASSERT(atomic_read(&ip->i_iocount) == 0);
+ ASSERT(atomic_read(&ip->i_pincount) == 0);
+ ASSERT(!spin_is_locked(&ip->i_flags_lock));
+ ASSERT(completion_done(&ip->i_flush));
+ kmem_zone_free(xfs_inode_zone, ip);
}
/*
@@ -737,7 +752,7 @@ xfs_iunlock(
* it is in the AIL and anyone is waiting on it. Don't do
* this if the caller has asked us not to.
*/
- xfs_trans_unlocked_item(ip->i_mount,
+ xfs_trans_unlocked_item(ip->i_itemp->ili_item.li_ailp,
(xfs_log_item_t*)(ip->i_itemp));
}
xfs_ilock_trace(ip, 3, lock_flags, (inst_t *)__return_address);
@@ -790,3 +805,51 @@ xfs_isilocked(
}
#endif
+#ifdef XFS_INODE_TRACE
+
+#define KTRACE_ENTER(ip, vk, s, line, ra) \
+ ktrace_enter((ip)->i_trace, \
+/* 0 */ (void *)(__psint_t)(vk), \
+/* 1 */ (void *)(s), \
+/* 2 */ (void *)(__psint_t) line, \
+/* 3 */ (void *)(__psint_t)atomic_read(&VFS_I(ip)->i_count), \
+/* 4 */ (void *)(ra), \
+/* 5 */ NULL, \
+/* 6 */ (void *)(__psint_t)current_cpu(), \
+/* 7 */ (void *)(__psint_t)current_pid(), \
+/* 8 */ (void *)__return_address, \
+/* 9 */ NULL, NULL, NULL, NULL, NULL, NULL, NULL)
+
+/*
+ * Vnode tracing code.
+ */
+void
+_xfs_itrace_entry(xfs_inode_t *ip, const char *func, inst_t *ra)
+{
+ KTRACE_ENTER(ip, INODE_KTRACE_ENTRY, func, 0, ra);
+}
+
+void
+_xfs_itrace_exit(xfs_inode_t *ip, const char *func, inst_t *ra)
+{
+ KTRACE_ENTER(ip, INODE_KTRACE_EXIT, func, 0, ra);
+}
+
+void
+xfs_itrace_hold(xfs_inode_t *ip, char *file, int line, inst_t *ra)
+{
+ KTRACE_ENTER(ip, INODE_KTRACE_HOLD, file, line, ra);
+}
+
+void
+_xfs_itrace_ref(xfs_inode_t *ip, char *file, int line, inst_t *ra)
+{
+ KTRACE_ENTER(ip, INODE_KTRACE_REF, file, line, ra);
+}
+
+void
+xfs_itrace_rele(xfs_inode_t *ip, char *file, int line, inst_t *ra)
+{
+ KTRACE_ENTER(ip, INODE_KTRACE_RELE, file, line, ra);
+}
+#endif /* XFS_INODE_TRACE */
diff --git a/fs/xfs/xfs_imap.h b/fs/xfs/xfs_imap.h
deleted file mode 100644
index d36450003983..000000000000
--- a/fs/xfs/xfs_imap.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Copyright (c) 2000,2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#ifndef __XFS_IMAP_H__
-#define __XFS_IMAP_H__
-
-/*
- * This is the structure passed to xfs_imap() to map
- * an inode number to its on disk location.
- */
-typedef struct xfs_imap {
- xfs_daddr_t im_blkno; /* starting BB of inode chunk */
- uint im_len; /* length in BBs of inode chunk */
- xfs_agblock_t im_agblkno; /* logical block of inode chunk in ag */
- ushort im_ioffset; /* inode offset in block in "inodes" */
- ushort im_boffset; /* inode offset in block in bytes */
-} xfs_imap_t;
-
-#ifdef __KERNEL__
-struct xfs_mount;
-struct xfs_trans;
-int xfs_imap(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
- xfs_imap_t *, uint);
-#endif
-
-#endif /* __XFS_IMAP_H__ */
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index a391b955df01..5a5e035e5d38 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -23,7 +23,6 @@
#include "xfs_bit.h"
#include "xfs_log.h"
#include "xfs_inum.h"
-#include "xfs_imap.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
#include "xfs_sb.h"
@@ -41,6 +40,7 @@
#include "xfs_buf_item.h"
#include "xfs_inode_item.h"
#include "xfs_btree.h"
+#include "xfs_btree_trace.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_bmap.h"
@@ -133,10 +133,10 @@ STATIC int
xfs_imap_to_bp(
xfs_mount_t *mp,
xfs_trans_t *tp,
- xfs_imap_t *imap,
+ struct xfs_imap *imap,
xfs_buf_t **bpp,
uint buf_flags,
- uint imap_flags)
+ uint iget_flags)
{
int error;
int i;
@@ -173,12 +173,12 @@ xfs_imap_to_bp(
dip = (xfs_dinode_t *)xfs_buf_offset(bp,
(i << mp->m_sb.sb_inodelog));
- di_ok = be16_to_cpu(dip->di_core.di_magic) == XFS_DINODE_MAGIC &&
- XFS_DINODE_GOOD_VERSION(dip->di_core.di_version);
+ di_ok = be16_to_cpu(dip->di_magic) == XFS_DINODE_MAGIC &&
+ XFS_DINODE_GOOD_VERSION(dip->di_version);
if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
XFS_ERRTAG_ITOBP_INOTOBP,
XFS_RANDOM_ITOBP_INOTOBP))) {
- if (imap_flags & XFS_IMAP_BULKSTAT) {
+ if (iget_flags & XFS_IGET_BULKSTAT) {
xfs_trans_brelse(tp, bp);
return XFS_ERROR(EINVAL);
}
@@ -190,7 +190,7 @@ xfs_imap_to_bp(
"daddr %lld #%d (magic=%x)",
XFS_BUFTARG_NAME(mp->m_ddev_targp),
(unsigned long long)imap->im_blkno, i,
- be16_to_cpu(dip->di_core.di_magic));
+ be16_to_cpu(dip->di_magic));
#endif
xfs_trans_brelse(tp, bp);
return XFS_ERROR(EFSCORRUPTED);
@@ -221,25 +221,26 @@ xfs_imap_to_bp(
* Use xfs_imap() to determine the size and location of the
* buffer to read from disk.
*/
-STATIC int
+int
xfs_inotobp(
xfs_mount_t *mp,
xfs_trans_t *tp,
xfs_ino_t ino,
xfs_dinode_t **dipp,
xfs_buf_t **bpp,
- int *offset)
+ int *offset,
+ uint imap_flags)
{
- xfs_imap_t imap;
+ struct xfs_imap imap;
xfs_buf_t *bp;
int error;
imap.im_blkno = 0;
- error = xfs_imap(mp, tp, ino, &imap, XFS_IMAP_LOOKUP);
+ error = xfs_imap(mp, tp, ino, &imap, imap_flags);
if (error)
return error;
- error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, 0);
+ error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, imap_flags);
if (error)
return error;
@@ -260,15 +261,11 @@ xfs_inotobp(
* If a non-zero error is returned, then the contents of bpp and
* dipp are undefined.
*
- * If the inode is new and has not yet been initialized, use xfs_imap()
- * to determine the size and location of the buffer to read from disk.
- * If the inode has already been mapped to its buffer and read in once,
- * then use the mapping information stored in the inode rather than
- * calling xfs_imap(). This allows us to avoid the overhead of looking
- * at the inode btree for small block file systems (see xfs_dilocate()).
- * We can tell whether the inode has been mapped in before by comparing
- * its disk block address to 0. Only uninitialized inodes will have
- * 0 for the disk block address.
+ * The inode is expected to have already been mapped to its buffer and read
+ * in once, thus we can use the mapping information stored in the inode
+ * rather than calling xfs_imap(). This allows us to avoid the overhead
+ * of looking at the inode btree for small block file systems
+ * (see xfs_imap()).
*/
int
xfs_itobp(
@@ -277,40 +274,14 @@ xfs_itobp(
xfs_inode_t *ip,
xfs_dinode_t **dipp,
xfs_buf_t **bpp,
- xfs_daddr_t bno,
- uint imap_flags,
uint buf_flags)
{
- xfs_imap_t imap;
xfs_buf_t *bp;
int error;
- if (ip->i_blkno == (xfs_daddr_t)0) {
- imap.im_blkno = bno;
- error = xfs_imap(mp, tp, ip->i_ino, &imap,
- XFS_IMAP_LOOKUP | imap_flags);
- if (error)
- return error;
+ ASSERT(ip->i_imap.im_blkno != 0);
- /*
- * Fill in the fields in the inode that will be used to
- * map the inode to its buffer from now on.
- */
- ip->i_blkno = imap.im_blkno;
- ip->i_len = imap.im_len;
- ip->i_boffset = imap.im_boffset;
- } else {
- /*
- * We've already mapped the inode once, so just use the
- * mapping that we saved the first time.
- */
- imap.im_blkno = ip->i_blkno;
- imap.im_len = ip->i_len;
- imap.im_boffset = ip->i_boffset;
- }
- ASSERT(bno == 0 || bno == imap.im_blkno);
-
- error = xfs_imap_to_bp(mp, tp, &imap, &bp, buf_flags, imap_flags);
+ error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp, buf_flags, 0);
if (error)
return error;
@@ -321,7 +292,7 @@ xfs_itobp(
return EAGAIN;
}
- *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
+ *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
*bpp = bp;
return 0;
}
@@ -348,26 +319,26 @@ xfs_iformat(
XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
error = 0;
- if (unlikely(be32_to_cpu(dip->di_core.di_nextents) +
- be16_to_cpu(dip->di_core.di_anextents) >
- be64_to_cpu(dip->di_core.di_nblocks))) {
+ if (unlikely(be32_to_cpu(dip->di_nextents) +
+ be16_to_cpu(dip->di_anextents) >
+ be64_to_cpu(dip->di_nblocks))) {
xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
"corrupt dinode %Lu, extent total = %d, nblocks = %Lu.",
(unsigned long long)ip->i_ino,
- (int)(be32_to_cpu(dip->di_core.di_nextents) +
- be16_to_cpu(dip->di_core.di_anextents)),
+ (int)(be32_to_cpu(dip->di_nextents) +
+ be16_to_cpu(dip->di_anextents)),
(unsigned long long)
- be64_to_cpu(dip->di_core.di_nblocks));
+ be64_to_cpu(dip->di_nblocks));
XFS_CORRUPTION_ERROR("xfs_iformat(1)", XFS_ERRLEVEL_LOW,
ip->i_mount, dip);
return XFS_ERROR(EFSCORRUPTED);
}
- if (unlikely(dip->di_core.di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
+ if (unlikely(dip->di_forkoff > ip->i_mount->m_sb.sb_inodesize)) {
xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
"corrupt dinode %Lu, forkoff = 0x%x.",
(unsigned long long)ip->i_ino,
- dip->di_core.di_forkoff);
+ dip->di_forkoff);
XFS_CORRUPTION_ERROR("xfs_iformat(2)", XFS_ERRLEVEL_LOW,
ip->i_mount, dip);
return XFS_ERROR(EFSCORRUPTED);
@@ -378,25 +349,25 @@ xfs_iformat(
case S_IFCHR:
case S_IFBLK:
case S_IFSOCK:
- if (unlikely(dip->di_core.di_format != XFS_DINODE_FMT_DEV)) {
+ if (unlikely(dip->di_format != XFS_DINODE_FMT_DEV)) {
XFS_CORRUPTION_ERROR("xfs_iformat(3)", XFS_ERRLEVEL_LOW,
ip->i_mount, dip);
return XFS_ERROR(EFSCORRUPTED);
}
ip->i_d.di_size = 0;
ip->i_size = 0;
- ip->i_df.if_u2.if_rdev = be32_to_cpu(dip->di_u.di_dev);
+ ip->i_df.if_u2.if_rdev = xfs_dinode_get_rdev(dip);
break;
case S_IFREG:
case S_IFLNK:
case S_IFDIR:
- switch (dip->di_core.di_format) {
+ switch (dip->di_format) {
case XFS_DINODE_FMT_LOCAL:
/*
* no local regular files yet
*/
- if (unlikely((be16_to_cpu(dip->di_core.di_mode) & S_IFMT) == S_IFREG)) {
+ if (unlikely((be16_to_cpu(dip->di_mode) & S_IFMT) == S_IFREG)) {
xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
"corrupt inode %Lu "
"(local format for regular file).",
@@ -407,7 +378,7 @@ xfs_iformat(
return XFS_ERROR(EFSCORRUPTED);
}
- di_size = be64_to_cpu(dip->di_core.di_size);
+ di_size = be64_to_cpu(dip->di_size);
if (unlikely(di_size > XFS_DFORK_DSIZE(dip, ip->i_mount))) {
xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount,
"corrupt inode %Lu "
@@ -449,7 +420,7 @@ xfs_iformat(
ip->i_afp = kmem_zone_zalloc(xfs_ifork_zone, KM_SLEEP);
ip->i_afp->if_ext_max =
XFS_IFORK_ASIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
- switch (dip->di_core.di_aformat) {
+ switch (dip->di_aformat) {
case XFS_DINODE_FMT_LOCAL:
atp = (xfs_attr_shortform_t *)XFS_DFORK_APTR(dip);
size = be16_to_cpu(atp->hdr.totsize);
@@ -621,7 +592,7 @@ xfs_iformat_btree(
ifp = XFS_IFORK_PTR(ip, whichfork);
dfp = (xfs_bmdr_block_t *)XFS_DFORK_PTR(dip, whichfork);
size = XFS_BMAP_BROOT_SPACE(dfp);
- nrecs = XFS_BMAP_BROOT_NUMRECS(dfp);
+ nrecs = be16_to_cpu(dfp->bb_numrecs);
/*
* blow out if -- fork has less extents than can fit in
@@ -649,8 +620,9 @@ xfs_iformat_btree(
* Copy and convert from the on-disk structure
* to the in-memory structure.
*/
- xfs_bmdr_to_bmbt(dfp, XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
- ifp->if_broot, size);
+ xfs_bmdr_to_bmbt(ip->i_mount, dfp,
+ XFS_DFORK_SIZE(dip, ip->i_mount, whichfork),
+ ifp->if_broot, size);
ifp->if_flags &= ~XFS_IFEXTENTS;
ifp->if_flags |= XFS_IFBROOT;
@@ -660,7 +632,7 @@ xfs_iformat_btree(
void
xfs_dinode_from_disk(
xfs_icdinode_t *to,
- xfs_dinode_core_t *from)
+ xfs_dinode_t *from)
{
to->di_magic = be16_to_cpu(from->di_magic);
to->di_mode = be16_to_cpu(from->di_mode);
@@ -694,7 +666,7 @@ xfs_dinode_from_disk(
void
xfs_dinode_to_disk(
- xfs_dinode_core_t *to,
+ xfs_dinode_t *to,
xfs_icdinode_t *from)
{
to->di_magic = cpu_to_be16(from->di_magic);
@@ -781,93 +753,57 @@ uint
xfs_dic2xflags(
xfs_dinode_t *dip)
{
- xfs_dinode_core_t *dic = &dip->di_core;
-
- return _xfs_dic2xflags(be16_to_cpu(dic->di_flags)) |
+ return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) |
(XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0);
}
/*
- * Given a mount structure and an inode number, return a pointer
- * to a newly allocated in-core inode corresponding to the given
- * inode number.
- *
- * Initialize the inode's attributes and extent pointers if it
- * already has them (it will not if the inode has no links).
+ * Read the disk inode attributes into the in-core inode structure.
*/
int
xfs_iread(
xfs_mount_t *mp,
xfs_trans_t *tp,
- xfs_ino_t ino,
- xfs_inode_t **ipp,
+ xfs_inode_t *ip,
xfs_daddr_t bno,
- uint imap_flags)
+ uint iget_flags)
{
xfs_buf_t *bp;
xfs_dinode_t *dip;
- xfs_inode_t *ip;
int error;
- ASSERT(xfs_inode_zone != NULL);
-
- ip = kmem_zone_zalloc(xfs_inode_zone, KM_SLEEP);
- ip->i_ino = ino;
- ip->i_mount = mp;
- atomic_set(&ip->i_iocount, 0);
- spin_lock_init(&ip->i_flags_lock);
-
/*
- * Get pointer's to the on-disk inode and the buffer containing it.
- * If the inode number refers to a block outside the file system
- * then xfs_itobp() will return NULL. In this case we should
- * return NULL as well. Set i_blkno to 0 so that xfs_itobp() will
- * know that this is a new incore inode.
+ * Fill in the location information in the in-core inode.
*/
- error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, imap_flags, XFS_BUF_LOCK);
- if (error) {
- kmem_zone_free(xfs_inode_zone, ip);
+ ip->i_imap.im_blkno = bno;
+ error = xfs_imap(mp, tp, ip->i_ino, &ip->i_imap, iget_flags);
+ if (error)
return error;
- }
+ ASSERT(bno == 0 || bno == ip->i_imap.im_blkno);
/*
- * Initialize inode's trace buffers.
- * Do this before xfs_iformat in case it adds entries.
+ * Get pointers to the on-disk inode and the buffer containing it.
*/
-#ifdef XFS_INODE_TRACE
- ip->i_trace = ktrace_alloc(INODE_TRACE_SIZE, KM_NOFS);
-#endif
-#ifdef XFS_BMAP_TRACE
- ip->i_xtrace = ktrace_alloc(XFS_BMAP_KTRACE_SIZE, KM_NOFS);
-#endif
-#ifdef XFS_BMBT_TRACE
- ip->i_btrace = ktrace_alloc(XFS_BMBT_KTRACE_SIZE, KM_NOFS);
-#endif
-#ifdef XFS_RW_TRACE
- ip->i_rwtrace = ktrace_alloc(XFS_RW_KTRACE_SIZE, KM_NOFS);
-#endif
-#ifdef XFS_ILOCK_TRACE
- ip->i_lock_trace = ktrace_alloc(XFS_ILOCK_KTRACE_SIZE, KM_NOFS);
-#endif
-#ifdef XFS_DIR2_TRACE
- ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS);
-#endif
+ error = xfs_imap_to_bp(mp, tp, &ip->i_imap, &bp,
+ XFS_BUF_LOCK, iget_flags);
+ if (error)
+ return error;
+ dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
/*
* If we got something that isn't an inode it means someone
* (nfs or dmi) has a stale handle.
*/
- if (be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC) {
- kmem_zone_free(xfs_inode_zone, ip);
- xfs_trans_brelse(tp, bp);
+ if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC) {
#ifdef DEBUG
xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
- "dip->di_core.di_magic (0x%x) != "
+ "dip->di_magic (0x%x) != "
"XFS_DINODE_MAGIC (0x%x)",
- be16_to_cpu(dip->di_core.di_magic),
+ be16_to_cpu(dip->di_magic),
XFS_DINODE_MAGIC);
#endif /* DEBUG */
- return XFS_ERROR(EINVAL);
+ error = XFS_ERROR(EINVAL);
+ goto out_brelse;
}
/*
@@ -877,24 +813,22 @@ xfs_iread(
* specific information.
* Otherwise, just get the truly permanent information.
*/
- if (dip->di_core.di_mode) {
- xfs_dinode_from_disk(&ip->i_d, &dip->di_core);
+ if (dip->di_mode) {
+ xfs_dinode_from_disk(&ip->i_d, dip);
error = xfs_iformat(ip, dip);
if (error) {
- kmem_zone_free(xfs_inode_zone, ip);
- xfs_trans_brelse(tp, bp);
#ifdef DEBUG
xfs_fs_cmn_err(CE_ALERT, mp, "xfs_iread: "
"xfs_iformat() returned error %d",
error);
#endif /* DEBUG */
- return error;
+ goto out_brelse;
}
} else {
- ip->i_d.di_magic = be16_to_cpu(dip->di_core.di_magic);
- ip->i_d.di_version = dip->di_core.di_version;
- ip->i_d.di_gen = be32_to_cpu(dip->di_core.di_gen);
- ip->i_d.di_flushiter = be16_to_cpu(dip->di_core.di_flushiter);
+ ip->i_d.di_magic = be16_to_cpu(dip->di_magic);
+ ip->i_d.di_version = dip->di_version;
+ ip->i_d.di_gen = be32_to_cpu(dip->di_gen);
+ ip->i_d.di_flushiter = be16_to_cpu(dip->di_flushiter);
/*
* Make sure to pull in the mode here as well in
* case the inode is released without being used.
@@ -911,8 +845,6 @@ xfs_iread(
XFS_IFORK_DSIZE(ip) / (uint)sizeof(xfs_bmbt_rec_t);
}
- INIT_LIST_HEAD(&ip->i_reclaim);
-
/*
* The inode format changed when we moved the link count and
* made it 32 bits long. If this is an old format inode,
@@ -924,7 +856,7 @@ xfs_iread(
* the new format. We don't change the version number so that we
* can distinguish this from a real new format inode.
*/
- if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
+ if (ip->i_d.di_version == 1) {
ip->i_d.di_nlink = ip->i_d.di_onlink;
ip->i_d.di_onlink = 0;
ip->i_d.di_projid = 0;
@@ -938,7 +870,7 @@ xfs_iread(
* around for a while. This helps to keep recently accessed
* meta-data in-core longer.
*/
- XFS_BUF_SET_REF(bp, XFS_INO_REF);
+ XFS_BUF_SET_REF(bp, XFS_INO_REF);
/*
* Use xfs_trans_brelse() to release the buffer containing the
@@ -953,9 +885,9 @@ xfs_iread(
* to worry about the inode being changed just because we released
* the buffer.
*/
+ out_brelse:
xfs_trans_brelse(tp, bp);
- *ipp = ip;
- return 0;
+ return error;
}
/*
@@ -1049,6 +981,7 @@ xfs_ialloc(
uint flags;
int error;
timespec_t tv;
+ int filestreams = 0;
/*
* Call the space management code to pick
@@ -1056,9 +989,8 @@ xfs_ialloc(
*/
error = xfs_dialloc(tp, pip ? pip->i_ino : 0, mode, okalloc,
ialloc_context, call_again, &ino);
- if (error != 0) {
+ if (error)
return error;
- }
if (*call_again || ino == NULLFSINO) {
*ipp = NULL;
return 0;
@@ -1072,9 +1004,8 @@ xfs_ialloc(
*/
error = xfs_trans_iget(tp->t_mountp, tp, ino,
XFS_IGET_CREATE, XFS_ILOCK_EXCL, &ip);
- if (error != 0) {
+ if (error)
return error;
- }
ASSERT(ip != NULL);
ip->i_d.di_mode = (__uint16_t)mode;
@@ -1093,8 +1024,8 @@ xfs_ialloc(
* here rather than here and in the flush/logging code.
*/
if (xfs_sb_version_hasnlink(&tp->t_mountp->m_sb) &&
- ip->i_d.di_version == XFS_DINODE_VERSION_1) {
- ip->i_d.di_version = XFS_DINODE_VERSION_2;
+ ip->i_d.di_version == 1) {
+ ip->i_d.di_version = 2;
/*
* We've already zeroed the old link count, the projid field,
* and the pad field.
@@ -1104,7 +1035,7 @@ xfs_ialloc(
/*
* Project ids won't be stored on disk if we are using a version 1 inode.
*/
- if ((prid != 0) && (ip->i_d.di_version == XFS_DINODE_VERSION_1))
+ if ((prid != 0) && (ip->i_d.di_version == 1))
xfs_bump_ino_vers2(tp, ip);
if (pip && XFS_INHERIT_GID(pip)) {
@@ -1155,13 +1086,12 @@ xfs_ialloc(
flags |= XFS_ILOG_DEV;
break;
case S_IFREG:
- if (pip && xfs_inode_is_filestream(pip)) {
- error = xfs_filestream_associate(pip, ip);
- if (error < 0)
- return -error;
- if (!error)
- xfs_iflags_set(ip, XFS_IFILESTREAM);
- }
+ /*
+ * we can't set up filestreams until after the VFS inode
+ * is set up properly.
+ */
+ if (pip && xfs_inode_is_filestream(pip))
+ filestreams = 1;
/* fall through */
case S_IFDIR:
if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
@@ -1227,6 +1157,15 @@ xfs_ialloc(
/* now that we have an i_mode we can setup inode ops and unlock */
xfs_setup_inode(ip);
+ /* now we have set up the vfs inode we can associate the filestream */
+ if (filestreams) {
+ error = xfs_filestream_associate(pip, ip);
+ if (error < 0)
+ return -error;
+ if (!error)
+ xfs_iflags_set(ip, XFS_IFILESTREAM);
+ }
+
*ipp = ip;
return 0;
}
@@ -1383,8 +1322,8 @@ xfs_itrunc_trace(
* direct I/O with the truncate operation. Also, because we hold
* the IOLOCK in exclusive mode, we prevent new direct I/Os from being
* started until the truncate completes and drops the lock. Essentially,
- * the vn_iowait() call forms an I/O barrier that provides strict ordering
- * between direct I/Os and the truncate operation.
+ * the xfs_ioend_wait() call forms an I/O barrier that provides strict
+ * ordering between direct I/Os and the truncate operation.
*
* The flags parameter can have either the value XFS_ITRUNC_DEFINITE
* or XFS_ITRUNC_MAYBE. The XFS_ITRUNC_MAYBE value should be used
@@ -1415,7 +1354,7 @@ xfs_itruncate_start(
/* wait for the completion of any pending DIOs */
if (new_size == 0 || new_size < ip->i_size)
- vn_iowait(ip);
+ xfs_ioend_wait(ip);
/*
* Call toss_pages or flushinval_pages to get rid of pages
@@ -1726,8 +1665,14 @@ xfs_itruncate_finish(
xfs_trans_ijoin(ntp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
xfs_trans_ihold(ntp, ip);
- if (!error)
- error = xfs_trans_reserve(ntp, 0,
+ if (error)
+ return error;
+ /*
+ * transaction commit worked ok so we can drop the extra ticket
+ * reference that we gained in xfs_trans_dup()
+ */
+ xfs_log_ticket_put(ntp->t_ticket);
+ error = xfs_trans_reserve(ntp, 0,
XFS_ITRUNCATE_LOG_RES(mp), 0,
XFS_TRANS_PERM_LOG_RES,
XFS_ITRUNCATE_LOG_COUNT);
@@ -1781,13 +1726,10 @@ xfs_iunlink(
xfs_dinode_t *dip;
xfs_buf_t *agibp;
xfs_buf_t *ibp;
- xfs_agnumber_t agno;
- xfs_daddr_t agdaddr;
xfs_agino_t agino;
short bucket_index;
int offset;
int error;
- int agi_ok;
ASSERT(ip->i_d.di_nlink == 0);
ASSERT(ip->i_d.di_mode != 0);
@@ -1795,31 +1737,15 @@ xfs_iunlink(
mp = tp->t_mountp;
- agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
- agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp));
-
/*
* Get the agi buffer first. It ensures lock ordering
* on the list.
*/
- error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr,
- XFS_FSS_TO_BB(mp, 1), 0, &agibp);
+ error = xfs_read_agi(mp, tp, XFS_INO_TO_AGNO(mp, ip->i_ino), &agibp);
if (error)
return error;
-
- /*
- * Validate the magic number of the agi block.
- */
agi = XFS_BUF_TO_AGI(agibp);
- agi_ok =
- be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC &&
- XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
- if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK,
- XFS_RANDOM_IUNLINK))) {
- XFS_CORRUPTION_ERROR("xfs_iunlink", XFS_ERRLEVEL_LOW, mp, agi);
- xfs_trans_brelse(tp, agibp);
- return XFS_ERROR(EFSCORRUPTED);
- }
+
/*
* Get the index into the agi hash table for the
* list this inode will go on.
@@ -1837,14 +1763,14 @@ xfs_iunlink(
* Here we put the head pointer into our next pointer,
* and then we fall through to point the head at us.
*/
- error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
+ error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK);
if (error)
return error;
ASSERT(be32_to_cpu(dip->di_next_unlinked) == NULLAGINO);
/* both on-disk, don't endian flip twice */
dip->di_next_unlinked = agi->agi_unlinked[bucket_index];
- offset = ip->i_boffset +
+ offset = ip->i_imap.im_boffset +
offsetof(xfs_dinode_t, di_next_unlinked);
xfs_trans_inode_buf(tp, ibp);
xfs_trans_log_buf(tp, ibp, offset,
@@ -1879,7 +1805,6 @@ xfs_iunlink_remove(
xfs_buf_t *agibp;
xfs_buf_t *ibp;
xfs_agnumber_t agno;
- xfs_daddr_t agdaddr;
xfs_agino_t agino;
xfs_agino_t next_agino;
xfs_buf_t *last_ibp;
@@ -1887,45 +1812,20 @@ xfs_iunlink_remove(
short bucket_index;
int offset, last_offset = 0;
int error;
- int agi_ok;
- /*
- * First pull the on-disk inode from the AGI unlinked list.
- */
mp = tp->t_mountp;
-
agno = XFS_INO_TO_AGNO(mp, ip->i_ino);
- agdaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp));
/*
* Get the agi buffer first. It ensures lock ordering
* on the list.
*/
- error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, agdaddr,
- XFS_FSS_TO_BB(mp, 1), 0, &agibp);
- if (error) {
- cmn_err(CE_WARN,
- "xfs_iunlink_remove: xfs_trans_read_buf() returned an error %d on %s. Returning error.",
- error, mp->m_fsname);
+ error = xfs_read_agi(mp, tp, agno, &agibp);
+ if (error)
return error;
- }
- /*
- * Validate the magic number of the agi block.
- */
+
agi = XFS_BUF_TO_AGI(agibp);
- agi_ok =
- be32_to_cpu(agi->agi_magicnum) == XFS_AGI_MAGIC &&
- XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
- if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IUNLINK_REMOVE,
- XFS_RANDOM_IUNLINK_REMOVE))) {
- XFS_CORRUPTION_ERROR("xfs_iunlink_remove", XFS_ERRLEVEL_LOW,
- mp, agi);
- xfs_trans_brelse(tp, agibp);
- cmn_err(CE_WARN,
- "xfs_iunlink_remove: XFS_TEST_ERROR() returned an error on %s. Returning EFSCORRUPTED.",
- mp->m_fsname);
- return XFS_ERROR(EFSCORRUPTED);
- }
+
/*
* Get the index into the agi hash table for the
* list this inode will go on.
@@ -1945,7 +1845,7 @@ xfs_iunlink_remove(
* of dealing with the buffer when there is no need to
* change it.
*/
- error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
+ error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK);
if (error) {
cmn_err(CE_WARN,
"xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.",
@@ -1956,7 +1856,7 @@ xfs_iunlink_remove(
ASSERT(next_agino != 0);
if (next_agino != NULLAGINO) {
dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
- offset = ip->i_boffset +
+ offset = ip->i_imap.im_boffset +
offsetof(xfs_dinode_t, di_next_unlinked);
xfs_trans_inode_buf(tp, ibp);
xfs_trans_log_buf(tp, ibp, offset,
@@ -1992,7 +1892,7 @@ xfs_iunlink_remove(
}
next_ino = XFS_AGINO_TO_INO(mp, agno, next_agino);
error = xfs_inotobp(mp, tp, next_ino, &last_dip,
- &last_ibp, &last_offset);
+ &last_ibp, &last_offset, 0);
if (error) {
cmn_err(CE_WARN,
"xfs_iunlink_remove: xfs_inotobp() returned an error %d on %s. Returning error.",
@@ -2007,7 +1907,7 @@ xfs_iunlink_remove(
* Now last_ibp points to the buffer previous to us on
* the unlinked list. Pull us from the list.
*/
- error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
+ error = xfs_itobp(mp, tp, ip, &dip, &ibp, XFS_BUF_LOCK);
if (error) {
cmn_err(CE_WARN,
"xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.",
@@ -2019,7 +1919,7 @@ xfs_iunlink_remove(
ASSERT(next_agino != agino);
if (next_agino != NULLAGINO) {
dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
- offset = ip->i_boffset +
+ offset = ip->i_imap.im_boffset +
offsetof(xfs_dinode_t, di_next_unlinked);
xfs_trans_inode_buf(tp, ibp);
xfs_trans_log_buf(tp, ibp, offset,
@@ -2160,9 +2060,9 @@ xfs_ifree_cluster(
iip = (xfs_inode_log_item_t *)lip;
ASSERT(iip->ili_logged == 1);
lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
- spin_lock(&mp->m_ail_lock);
- iip->ili_flush_lsn = iip->ili_item.li_lsn;
- spin_unlock(&mp->m_ail_lock);
+ xfs_trans_ail_copy_lsn(mp->m_ail,
+ &iip->ili_flush_lsn,
+ &iip->ili_item.li_lsn);
xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
pre_flushed++;
}
@@ -2183,9 +2083,8 @@ xfs_ifree_cluster(
iip->ili_last_fields = iip->ili_format.ilf_fields;
iip->ili_format.ilf_fields = 0;
iip->ili_logged = 1;
- spin_lock(&mp->m_ail_lock);
- iip->ili_flush_lsn = iip->ili_item.li_lsn;
- spin_unlock(&mp->m_ail_lock);
+ xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
+ &iip->ili_item.li_lsn);
xfs_buf_attach_iodone(bp,
(void(*)(xfs_buf_t*,xfs_log_item_t*))
@@ -2263,7 +2162,7 @@ xfs_ifree(
xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
+ error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, XFS_BUF_LOCK);
if (error)
return error;
@@ -2279,7 +2178,7 @@ xfs_ifree(
* This is a temporary hack that would require a proper fix
* in the future.
*/
- dip->di_core.di_mode = 0;
+ dip->di_mode = 0;
if (delete) {
xfs_ifree_cluster(ip, tp, first_ino);
@@ -2312,9 +2211,10 @@ xfs_iroot_realloc(
int rec_diff,
int whichfork)
{
+ struct xfs_mount *mp = ip->i_mount;
int cur_max;
xfs_ifork_t *ifp;
- xfs_bmbt_block_t *new_broot;
+ struct xfs_btree_block *new_broot;
int new_max;
size_t new_size;
char *np;
@@ -2335,8 +2235,7 @@ xfs_iroot_realloc(
*/
if (ifp->if_broot_bytes == 0) {
new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(rec_diff);
- ifp->if_broot = (xfs_bmbt_block_t*)kmem_alloc(new_size,
- KM_SLEEP);
+ ifp->if_broot = kmem_alloc(new_size, KM_SLEEP);
ifp->if_broot_bytes = (int)new_size;
return;
}
@@ -2347,18 +2246,16 @@ xfs_iroot_realloc(
* location. The records don't change location because
* they are kept butted up against the btree block header.
*/
- cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes);
+ cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
new_max = cur_max + rec_diff;
new_size = (size_t)XFS_BMAP_BROOT_SPACE_CALC(new_max);
- ifp->if_broot = (xfs_bmbt_block_t *)
- kmem_realloc(ifp->if_broot,
- new_size,
+ ifp->if_broot = kmem_realloc(ifp->if_broot, new_size,
(size_t)XFS_BMAP_BROOT_SPACE_CALC(cur_max), /* old size */
KM_SLEEP);
- op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1,
- ifp->if_broot_bytes);
- np = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1,
- (int)new_size);
+ op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+ ifp->if_broot_bytes);
+ np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
+ (int)new_size);
ifp->if_broot_bytes = (int)new_size;
ASSERT(ifp->if_broot_bytes <=
XFS_IFORK_SIZE(ip, whichfork) + XFS_BROOT_SIZE_ADJ);
@@ -2372,7 +2269,7 @@ xfs_iroot_realloc(
* records, just get rid of the root and clear the status bit.
*/
ASSERT((ifp->if_broot != NULL) && (ifp->if_broot_bytes > 0));
- cur_max = XFS_BMAP_BROOT_MAXRECS(ifp->if_broot_bytes);
+ cur_max = xfs_bmbt_maxrecs(mp, ifp->if_broot_bytes, 0);
new_max = cur_max + rec_diff;
ASSERT(new_max >= 0);
if (new_max > 0)
@@ -2380,11 +2277,11 @@ xfs_iroot_realloc(
else
new_size = 0;
if (new_size > 0) {
- new_broot = (xfs_bmbt_block_t *)kmem_alloc(new_size, KM_SLEEP);
+ new_broot = kmem_alloc(new_size, KM_SLEEP);
/*
* First copy over the btree block header.
*/
- memcpy(new_broot, ifp->if_broot, sizeof(xfs_bmbt_block_t));
+ memcpy(new_broot, ifp->if_broot, XFS_BTREE_LBLOCK_LEN);
} else {
new_broot = NULL;
ifp->if_flags &= ~XFS_IFBROOT;
@@ -2397,18 +2294,16 @@ xfs_iroot_realloc(
/*
* First copy the records.
*/
- op = (char *)XFS_BMAP_BROOT_REC_ADDR(ifp->if_broot, 1,
- ifp->if_broot_bytes);
- np = (char *)XFS_BMAP_BROOT_REC_ADDR(new_broot, 1,
- (int)new_size);
+ op = (char *)XFS_BMBT_REC_ADDR(mp, ifp->if_broot, 1);
+ np = (char *)XFS_BMBT_REC_ADDR(mp, new_broot, 1);
memcpy(np, op, new_max * (uint)sizeof(xfs_bmbt_rec_t));
/*
* Then copy the pointers.
*/
- op = (char *)XFS_BMAP_BROOT_PTR_ADDR(ifp->if_broot, 1,
+ op = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, ifp->if_broot, 1,
ifp->if_broot_bytes);
- np = (char *)XFS_BMAP_BROOT_PTR_ADDR(new_broot, 1,
+ np = (char *)XFS_BMAP_BROOT_PTR_ADDR(mp, new_broot, 1,
(int)new_size);
memcpy(np, op, new_max * (uint)sizeof(xfs_dfsbno_t));
}
@@ -2511,64 +2406,6 @@ xfs_idata_realloc(
ASSERT(ifp->if_bytes <= XFS_IFORK_SIZE(ip, whichfork));
}
-
-
-
-/*
- * Map inode to disk block and offset.
- *
- * mp -- the mount point structure for the current file system
- * tp -- the current transaction
- * ino -- the inode number of the inode to be located
- * imap -- this structure is filled in with the information necessary
- * to retrieve the given inode from disk
- * flags -- flags to pass to xfs_dilocate indicating whether or not
- * lookups in the inode btree were OK or not
- */
-int
-xfs_imap(
- xfs_mount_t *mp,
- xfs_trans_t *tp,
- xfs_ino_t ino,
- xfs_imap_t *imap,
- uint flags)
-{
- xfs_fsblock_t fsbno;
- int len;
- int off;
- int error;
-
- fsbno = imap->im_blkno ?
- XFS_DADDR_TO_FSB(mp, imap->im_blkno) : NULLFSBLOCK;
- error = xfs_dilocate(mp, tp, ino, &fsbno, &len, &off, flags);
- if (error)
- return error;
-
- imap->im_blkno = XFS_FSB_TO_DADDR(mp, fsbno);
- imap->im_len = XFS_FSB_TO_BB(mp, len);
- imap->im_agblkno = XFS_FSB_TO_AGBNO(mp, fsbno);
- imap->im_ioffset = (ushort)off;
- imap->im_boffset = (ushort)(off << mp->m_sb.sb_inodelog);
-
- /*
- * If the inode number maps to a block outside the bounds
- * of the file system then return NULL rather than calling
- * read_buf and panicing when we get an error from the
- * driver.
- */
- if ((imap->im_blkno + imap->im_len) >
- XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks)) {
- xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: "
- "(imap->im_blkno (0x%llx) + imap->im_len (0x%llx)) > "
- " XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) (0x%llx)",
- (unsigned long long) imap->im_blkno,
- (unsigned long long) imap->im_len,
- XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks));
- return EINVAL;
- }
- return 0;
-}
-
void
xfs_idestroy_fork(
xfs_inode_t *ip,
@@ -2613,70 +2450,6 @@ xfs_idestroy_fork(
}
/*
- * This is called free all the memory associated with an inode.
- * It must free the inode itself and any buffers allocated for
- * if_extents/if_data and if_broot. It must also free the lock
- * associated with the inode.
- */
-void
-xfs_idestroy(
- xfs_inode_t *ip)
-{
- switch (ip->i_d.di_mode & S_IFMT) {
- case S_IFREG:
- case S_IFDIR:
- case S_IFLNK:
- xfs_idestroy_fork(ip, XFS_DATA_FORK);
- break;
- }
- if (ip->i_afp)
- xfs_idestroy_fork(ip, XFS_ATTR_FORK);
- mrfree(&ip->i_lock);
- mrfree(&ip->i_iolock);
-
-#ifdef XFS_INODE_TRACE
- ktrace_free(ip->i_trace);
-#endif
-#ifdef XFS_BMAP_TRACE
- ktrace_free(ip->i_xtrace);
-#endif
-#ifdef XFS_BMBT_TRACE
- ktrace_free(ip->i_btrace);
-#endif
-#ifdef XFS_RW_TRACE
- ktrace_free(ip->i_rwtrace);
-#endif
-#ifdef XFS_ILOCK_TRACE
- ktrace_free(ip->i_lock_trace);
-#endif
-#ifdef XFS_DIR2_TRACE
- ktrace_free(ip->i_dir_trace);
-#endif
- if (ip->i_itemp) {
- /*
- * Only if we are shutting down the fs will we see an
- * inode still in the AIL. If it is there, we should remove
- * it to prevent a use-after-free from occurring.
- */
- xfs_mount_t *mp = ip->i_mount;
- xfs_log_item_t *lip = &ip->i_itemp->ili_item;
-
- ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) ||
- XFS_FORCED_SHUTDOWN(ip->i_mount));
- if (lip->li_flags & XFS_LI_IN_AIL) {
- spin_lock(&mp->m_ail_lock);
- if (lip->li_flags & XFS_LI_IN_AIL)
- xfs_trans_delete_ail(mp, lip);
- else
- spin_unlock(&mp->m_ail_lock);
- }
- xfs_inode_item_destroy(ip);
- }
- kmem_zone_free(xfs_inode_zone, ip);
-}
-
-
-/*
* Increment the pin count of the given buffer.
* This value is protected by ipinlock spinlock in the mount structure.
*/
@@ -2880,7 +2653,7 @@ xfs_iflush_fork(
ASSERT(ifp->if_broot_bytes <=
(XFS_IFORK_SIZE(ip, whichfork) +
XFS_BROOT_SIZE_ADJ));
- xfs_bmbt_to_bmdr(ifp->if_broot, ifp->if_broot_bytes,
+ xfs_bmbt_to_bmdr(mp, ifp->if_broot, ifp->if_broot_bytes,
(xfs_bmdr_block_t *)cp,
XFS_DFORK_SIZE(dip, mp, whichfork));
}
@@ -2889,15 +2662,16 @@ xfs_iflush_fork(
case XFS_DINODE_FMT_DEV:
if (iip->ili_format.ilf_fields & XFS_ILOG_DEV) {
ASSERT(whichfork == XFS_DATA_FORK);
- dip->di_u.di_dev = cpu_to_be32(ip->i_df.if_u2.if_rdev);
+ xfs_dinode_put_rdev(dip, ip->i_df.if_u2.if_rdev);
}
break;
case XFS_DINODE_FMT_UUID:
if (iip->ili_format.ilf_fields & XFS_ILOG_UUID) {
ASSERT(whichfork == XFS_DATA_FORK);
- memcpy(&dip->di_u.di_muuid, &ip->i_df.if_u2.if_uuid,
- sizeof(uuid_t));
+ memcpy(XFS_DFORK_DPTR(dip),
+ &ip->i_df.if_u2.if_uuid,
+ sizeof(uuid_t));
}
break;
@@ -3030,7 +2804,6 @@ cluster_corrupt_out:
XFS_BUF_CLR_BDSTRAT_FUNC(bp);
XFS_BUF_UNDONE(bp);
XFS_BUF_STALE(bp);
- XFS_BUF_SHUT(bp);
XFS_BUF_ERROR(bp,EIO);
xfs_biodone(bp);
} else {
@@ -3172,7 +2945,7 @@ xfs_iflush(
/*
* Get the buffer containing the on-disk inode.
*/
- error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0, 0,
+ error = xfs_itobp(mp, NULL, ip, &dip, &bp,
noblock ? XFS_BUF_TRYLOCK : XFS_BUF_LOCK);
if (error || !bp) {
xfs_ifunlock(ip);
@@ -3253,7 +3026,7 @@ xfs_iflush_int(
}
/* set *dip = inode's place in the buffer */
- dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_boffset);
+ dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset);
/*
* Clear i_update_core before copying out the data.
@@ -3275,11 +3048,11 @@ xfs_iflush_int(
*/
xfs_synchronize_atime(ip);
- if (XFS_TEST_ERROR(be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC,
+ if (XFS_TEST_ERROR(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC,
mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) {
xfs_cmn_err(XFS_PTAG_IFLUSH, CE_ALERT, mp,
"xfs_iflush: Bad inode %Lu magic number 0x%x, ptr 0x%p",
- ip->i_ino, be16_to_cpu(dip->di_core.di_magic), dip);
+ ip->i_ino, be16_to_cpu(dip->di_magic), dip);
goto corrupt_out;
}
if (XFS_TEST_ERROR(ip->i_d.di_magic != XFS_DINODE_MAGIC,
@@ -3342,7 +3115,7 @@ xfs_iflush_int(
* because if the inode is dirty at all the core must
* be.
*/
- xfs_dinode_to_disk(&dip->di_core, &ip->i_d);
+ xfs_dinode_to_disk(dip, &ip->i_d);
/* Wrap, we never let the log put out DI_MAX_FLUSH */
if (ip->i_d.di_flushiter == DI_MAX_FLUSH)
@@ -3354,28 +3127,27 @@ xfs_iflush_int(
* convert back to the old inode format. If the superblock version
* has been updated, then make the conversion permanent.
*/
- ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 ||
- xfs_sb_version_hasnlink(&mp->m_sb));
- if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
+ ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb));
+ if (ip->i_d.di_version == 1) {
if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
/*
* Convert it back.
*/
ASSERT(ip->i_d.di_nlink <= XFS_MAXLINK_1);
- dip->di_core.di_onlink = cpu_to_be16(ip->i_d.di_nlink);
+ dip->di_onlink = cpu_to_be16(ip->i_d.di_nlink);
} else {
/*
* The superblock version has already been bumped,
* so just make the conversion to the new inode
* format permanent.
*/
- ip->i_d.di_version = XFS_DINODE_VERSION_2;
- dip->di_core.di_version = XFS_DINODE_VERSION_2;
+ ip->i_d.di_version = 2;
+ dip->di_version = 2;
ip->i_d.di_onlink = 0;
- dip->di_core.di_onlink = 0;
+ dip->di_onlink = 0;
memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
- memset(&(dip->di_core.di_pad[0]), 0,
- sizeof(dip->di_core.di_pad));
+ memset(&(dip->di_pad[0]), 0,
+ sizeof(dip->di_pad));
ASSERT(ip->i_d.di_projid == 0);
}
}
@@ -3418,10 +3190,8 @@ xfs_iflush_int(
iip->ili_format.ilf_fields = 0;
iip->ili_logged = 1;
- ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */
- spin_lock(&mp->m_ail_lock);
- iip->ili_flush_lsn = iip->ili_item.li_lsn;
- spin_unlock(&mp->m_ail_lock);
+ xfs_trans_ail_copy_lsn(mp->m_ail, &iip->ili_flush_lsn,
+ &iip->ili_item.li_lsn);
/*
* Attach the function xfs_iflush_done to the inode's
@@ -3459,45 +3229,8 @@ corrupt_out:
}
-/*
- * Flush all inactive inodes in mp.
- */
-void
-xfs_iflush_all(
- xfs_mount_t *mp)
-{
- xfs_inode_t *ip;
-
- again:
- XFS_MOUNT_ILOCK(mp);
- ip = mp->m_inodes;
- if (ip == NULL)
- goto out;
-
- do {
- /* Make sure we skip markers inserted by sync */
- if (ip->i_mount == NULL) {
- ip = ip->i_mnext;
- continue;
- }
-
- if (!VFS_I(ip)) {
- XFS_MOUNT_IUNLOCK(mp);
- xfs_finish_reclaim(ip, 0, XFS_IFLUSH_ASYNC);
- goto again;
- }
-
- ASSERT(vn_count(VFS_I(ip)) == 0);
-
- ip = ip->i_mnext;
- } while (ip != mp->m_inodes);
- out:
- XFS_MOUNT_IUNLOCK(mp);
-}
#ifdef XFS_ILOCK_TRACE
-ktrace_t *xfs_ilock_trace_buf;
-
void
xfs_ilock_trace(xfs_inode_t *ip, int lock, unsigned int lockflags, inst_t *ra)
{
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 6be310d41daf..1f175fa34b22 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -19,8 +19,7 @@
#define __XFS_INODE_H__
struct xfs_dinode;
-struct xfs_dinode_core;
-
+struct xfs_inode;
/*
* Fork identifiers.
@@ -63,7 +62,7 @@ typedef struct xfs_ext_irec {
typedef struct xfs_ifork {
int if_bytes; /* bytes in if_u1 */
int if_real_bytes; /* bytes allocated in if_u1 */
- xfs_bmbt_block_t *if_broot; /* file's incore btree root */
+ struct xfs_btree_block *if_broot; /* file's incore btree root */
short if_broot_bytes; /* bytes allocated for root */
unsigned char if_flags; /* per-fork flags */
unsigned char if_ext_max; /* max # of extent records */
@@ -84,52 +83,14 @@ typedef struct xfs_ifork {
} xfs_ifork_t;
/*
- * Flags for xfs_ichgtime().
+ * Inode location information. Stored in the inode and passed to
+ * xfs_imap_to_bp() to get a buffer and dinode for a given inode.
*/
-#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */
-#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */
-
-/*
- * Per-fork incore inode flags.
- */
-#define XFS_IFINLINE 0x01 /* Inline data is read in */
-#define XFS_IFEXTENTS 0x02 /* All extent pointers are read in */
-#define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */
-#define XFS_IFEXTIREC 0x08 /* Indirection array of extent blocks */
-
-/*
- * Flags for xfs_itobp(), xfs_imap() and xfs_dilocate().
- */
-#define XFS_IMAP_LOOKUP 0x1
-#define XFS_IMAP_BULKSTAT 0x2
-
-#ifdef __KERNEL__
-struct bhv_desc;
-struct cred;
-struct ktrace;
-struct xfs_buf;
-struct xfs_bmap_free;
-struct xfs_bmbt_irec;
-struct xfs_bmbt_block;
-struct xfs_inode;
-struct xfs_inode_log_item;
-struct xfs_mount;
-struct xfs_trans;
-struct xfs_dquot;
-
-#if defined(XFS_ILOCK_TRACE)
-#define XFS_ILOCK_KTRACE_SIZE 32
-extern ktrace_t *xfs_ilock_trace_buf;
-extern void xfs_ilock_trace(struct xfs_inode *, int, unsigned int, inst_t *);
-#else
-#define xfs_ilock_trace(i,n,f,ra)
-#endif
-
-typedef struct dm_attrs_s {
- __uint32_t da_dmevmask; /* DMIG event mask */
- __uint16_t da_dmstate; /* DMIG state info */
- __uint16_t da_pad; /* DMIG extra padding */
-} dm_attrs_t;
+struct xfs_imap {
+ xfs_daddr_t im_blkno; /* starting BB of inode chunk */
+ ushort im_len; /* length in BBs of inode chunk */
+ ushort im_boffset; /* inode offset in block in bytes */
+};
/*
* This is the xfs in-core inode structure.
@@ -160,7 +121,7 @@ typedef struct xfs_ictimestamp {
} xfs_ictimestamp_t;
/*
- * NOTE: This structure must be kept identical to struct xfs_dinode_core
+ * NOTE: This structure must be kept identical to struct xfs_dinode
* in xfs_dinode.h except for the endianess annotations.
*/
typedef struct xfs_icdinode {
@@ -191,27 +152,97 @@ typedef struct xfs_icdinode {
__uint32_t di_gen; /* generation number */
} xfs_icdinode_t;
-typedef struct {
- struct xfs_inode *ip_mnext; /* next inode in mount list */
- struct xfs_inode *ip_mprev; /* ptr to prev inode */
- struct xfs_mount *ip_mount; /* fs mount struct ptr */
-} xfs_iptr_t;
+/*
+ * Flags for xfs_ichgtime().
+ */
+#define XFS_ICHGTIME_MOD 0x1 /* data fork modification timestamp */
+#define XFS_ICHGTIME_CHG 0x2 /* inode field change timestamp */
+
+/*
+ * Per-fork incore inode flags.
+ */
+#define XFS_IFINLINE 0x01 /* Inline data is read in */
+#define XFS_IFEXTENTS 0x02 /* All extent pointers are read in */
+#define XFS_IFBROOT 0x04 /* i_broot points to the bmap b-tree root */
+#define XFS_IFEXTIREC 0x08 /* Indirection array of extent blocks */
+
+/*
+ * Fork handling.
+ */
+
+#define XFS_IFORK_Q(ip) ((ip)->i_d.di_forkoff != 0)
+#define XFS_IFORK_BOFF(ip) ((int)((ip)->i_d.di_forkoff << 3))
+
+#define XFS_IFORK_PTR(ip,w) \
+ ((w) == XFS_DATA_FORK ? \
+ &(ip)->i_df : \
+ (ip)->i_afp)
+#define XFS_IFORK_DSIZE(ip) \
+ (XFS_IFORK_Q(ip) ? \
+ XFS_IFORK_BOFF(ip) : \
+ XFS_LITINO((ip)->i_mount))
+#define XFS_IFORK_ASIZE(ip) \
+ (XFS_IFORK_Q(ip) ? \
+ XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : \
+ 0)
+#define XFS_IFORK_SIZE(ip,w) \
+ ((w) == XFS_DATA_FORK ? \
+ XFS_IFORK_DSIZE(ip) : \
+ XFS_IFORK_ASIZE(ip))
+#define XFS_IFORK_FORMAT(ip,w) \
+ ((w) == XFS_DATA_FORK ? \
+ (ip)->i_d.di_format : \
+ (ip)->i_d.di_aformat)
+#define XFS_IFORK_FMT_SET(ip,w,n) \
+ ((w) == XFS_DATA_FORK ? \
+ ((ip)->i_d.di_format = (n)) : \
+ ((ip)->i_d.di_aformat = (n)))
+#define XFS_IFORK_NEXTENTS(ip,w) \
+ ((w) == XFS_DATA_FORK ? \
+ (ip)->i_d.di_nextents : \
+ (ip)->i_d.di_anextents)
+#define XFS_IFORK_NEXT_SET(ip,w,n) \
+ ((w) == XFS_DATA_FORK ? \
+ ((ip)->i_d.di_nextents = (n)) : \
+ ((ip)->i_d.di_anextents = (n)))
+
+
+
+#ifdef __KERNEL__
+
+struct bhv_desc;
+struct cred;
+struct ktrace;
+struct xfs_buf;
+struct xfs_bmap_free;
+struct xfs_bmbt_irec;
+struct xfs_inode_log_item;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_dquot;
+
+#if defined(XFS_ILOCK_TRACE)
+#define XFS_ILOCK_KTRACE_SIZE 32
+extern void xfs_ilock_trace(struct xfs_inode *, int, unsigned int, inst_t *);
+#else
+#define xfs_ilock_trace(i,n,f,ra)
+#endif
+
+typedef struct dm_attrs_s {
+ __uint32_t da_dmevmask; /* DMIG event mask */
+ __uint16_t da_dmstate; /* DMIG state info */
+ __uint16_t da_pad; /* DMIG extra padding */
+} dm_attrs_t;
typedef struct xfs_inode {
/* Inode linking and identification information. */
- struct xfs_inode *i_mnext; /* next inode in mount list */
- struct xfs_inode *i_mprev; /* ptr to prev inode */
struct xfs_mount *i_mount; /* fs mount struct ptr */
- struct list_head i_reclaim; /* reclaim list */
- struct inode *i_vnode; /* vnode backpointer */
struct xfs_dquot *i_udquot; /* user dquot */
struct xfs_dquot *i_gdquot; /* group dquot */
/* Inode location stuff */
xfs_ino_t i_ino; /* inode number (agno/agino)*/
- xfs_daddr_t i_blkno; /* blkno of inode buffer */
- ushort i_len; /* len of inode buffer */
- ushort i_boffset; /* off of inode in buffer */
+ struct xfs_imap i_imap; /* location for xfs_imap() */
/* Extent information. */
xfs_ifork_t *i_afp; /* attribute fork pointer */
@@ -230,7 +261,6 @@ typedef struct xfs_inode {
unsigned short i_flags; /* see defined flags below */
unsigned char i_update_core; /* timestamps/size is dirty */
unsigned char i_update_size; /* di_size field is dirty */
- unsigned int i_gen; /* generation count */
unsigned int i_delayed_blks; /* count of delay alloc blks */
xfs_icdinode_t i_d; /* most of ondisk inode */
@@ -238,6 +268,10 @@ typedef struct xfs_inode {
xfs_fsize_t i_size; /* in-memory size */
xfs_fsize_t i_new_size; /* size when write completes */
atomic_t i_iocount; /* outstanding I/O count */
+
+ /* VFS inode */
+ struct inode i_vnode; /* embedded VFS inode */
+
/* Trace buffers per inode. */
#ifdef XFS_INODE_TRACE
struct ktrace *i_trace; /* general inode trace */
@@ -245,7 +279,7 @@ typedef struct xfs_inode {
#ifdef XFS_BMAP_TRACE
struct ktrace *i_xtrace; /* inode extent list trace */
#endif
-#ifdef XFS_BMBT_TRACE
+#ifdef XFS_BTREE_TRACE
struct ktrace *i_btrace; /* inode bmap btree trace */
#endif
#ifdef XFS_RW_TRACE
@@ -265,13 +299,30 @@ typedef struct xfs_inode {
/* Convert from vfs inode to xfs inode */
static inline struct xfs_inode *XFS_I(struct inode *inode)
{
- return (struct xfs_inode *)inode->i_private;
+ return container_of(inode, struct xfs_inode, i_vnode);
}
/* convert from xfs inode to vfs inode */
static inline struct inode *VFS_I(struct xfs_inode *ip)
{
- return (struct inode *)ip->i_vnode;
+ return &ip->i_vnode;
+}
+
+/*
+ * Get rid of a partially initialized inode.
+ *
+ * We have to go through destroy_inode to make sure allocations
+ * from init_inode_always like the security data are undone.
+ *
+ * We mark the inode bad so that it takes the short cut in
+ * the reclaim path instead of going through the flush path
+ * which doesn't make sense for an inode that has never seen the
+ * light of day.
+ */
+static inline void xfs_destroy_inode(struct xfs_inode *ip)
+{
+ make_bad_inode(VFS_I(ip));
+ destroy_inode(VFS_I(ip));
}
/*
@@ -327,65 +378,36 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
spin_unlock(&ip->i_flags_lock);
return ret;
}
-#endif /* __KERNEL__ */
-
/*
- * Fork handling.
+ * Manage the i_flush queue embedded in the inode. This completion
+ * queue synchronizes processes attempting to flush the in-core
+ * inode back to disk.
*/
+static inline void xfs_iflock(xfs_inode_t *ip)
+{
+ wait_for_completion(&ip->i_flush);
+}
-#define XFS_IFORK_Q(ip) ((ip)->i_d.di_forkoff != 0)
-#define XFS_IFORK_BOFF(ip) ((int)((ip)->i_d.di_forkoff << 3))
-
-#define XFS_IFORK_PTR(ip,w) \
- ((w) == XFS_DATA_FORK ? \
- &(ip)->i_df : \
- (ip)->i_afp)
-#define XFS_IFORK_DSIZE(ip) \
- (XFS_IFORK_Q(ip) ? \
- XFS_IFORK_BOFF(ip) : \
- XFS_LITINO((ip)->i_mount))
-#define XFS_IFORK_ASIZE(ip) \
- (XFS_IFORK_Q(ip) ? \
- XFS_LITINO((ip)->i_mount) - XFS_IFORK_BOFF(ip) : \
- 0)
-#define XFS_IFORK_SIZE(ip,w) \
- ((w) == XFS_DATA_FORK ? \
- XFS_IFORK_DSIZE(ip) : \
- XFS_IFORK_ASIZE(ip))
-#define XFS_IFORK_FORMAT(ip,w) \
- ((w) == XFS_DATA_FORK ? \
- (ip)->i_d.di_format : \
- (ip)->i_d.di_aformat)
-#define XFS_IFORK_FMT_SET(ip,w,n) \
- ((w) == XFS_DATA_FORK ? \
- ((ip)->i_d.di_format = (n)) : \
- ((ip)->i_d.di_aformat = (n)))
-#define XFS_IFORK_NEXTENTS(ip,w) \
- ((w) == XFS_DATA_FORK ? \
- (ip)->i_d.di_nextents : \
- (ip)->i_d.di_anextents)
-#define XFS_IFORK_NEXT_SET(ip,w,n) \
- ((w) == XFS_DATA_FORK ? \
- ((ip)->i_d.di_nextents = (n)) : \
- ((ip)->i_d.di_anextents = (n)))
+static inline int xfs_iflock_nowait(xfs_inode_t *ip)
+{
+ return try_wait_for_completion(&ip->i_flush);
+}
-#ifdef __KERNEL__
+static inline void xfs_ifunlock(xfs_inode_t *ip)
+{
+ complete(&ip->i_flush);
+}
/*
* In-core inode flags.
*/
-#define XFS_IGRIO 0x0001 /* inode used for guaranteed rate i/o */
-#define XFS_IUIOSZ 0x0002 /* inode i/o sizes have been explicitly set */
-#define XFS_IQUIESCE 0x0004 /* we have started quiescing for this inode */
-#define XFS_IRECLAIM 0x0008 /* we have started reclaiming this inode */
-#define XFS_ISTALE 0x0010 /* inode has been staled */
-#define XFS_IRECLAIMABLE 0x0020 /* inode can be reclaimed */
-#define XFS_INEW 0x0040
-#define XFS_IFILESTREAM 0x0080 /* inode is in a filestream directory */
-#define XFS_IMODIFIED 0x0100 /* XFS inode state possibly differs */
- /* to the Linux inode state. */
-#define XFS_ITRUNCATED 0x0200 /* truncated down so flush-on-close */
+#define XFS_IRECLAIM 0x0001 /* we have started reclaiming this inode */
+#define XFS_ISTALE 0x0002 /* inode has been staled */
+#define XFS_IRECLAIMABLE 0x0004 /* inode can be reclaimed */
+#define XFS_INEW 0x0008 /* inode has just been allocated */
+#define XFS_IFILESTREAM 0x0010 /* inode is in a filestream directory */
+#define XFS_ITRUNCATED 0x0020 /* truncated down so flush-on-close */
/*
* Flags for inode locking.
@@ -460,16 +482,8 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags)
((pip)->i_d.di_mode & S_ISGID))
/*
- * Flags for xfs_iget()
- */
-#define XFS_IGET_CREATE 0x1
-#define XFS_IGET_BULKSTAT 0x2
-
-/*
* xfs_iget.c prototypes.
*/
-void xfs_ihash_init(struct xfs_mount *);
-void xfs_ihash_free(struct xfs_mount *);
xfs_inode_t *xfs_inode_incore(struct xfs_mount *, xfs_ino_t,
struct xfs_trans *);
int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
@@ -484,25 +498,13 @@ int xfs_isilocked(xfs_inode_t *, uint);
uint xfs_ilock_map_shared(xfs_inode_t *);
void xfs_iunlock_map_shared(xfs_inode_t *, uint);
void xfs_ireclaim(xfs_inode_t *);
-int xfs_finish_reclaim(xfs_inode_t *, int, int);
-int xfs_finish_reclaim_all(struct xfs_mount *, int);
/*
* xfs_inode.c prototypes.
*/
-int xfs_itobp(struct xfs_mount *, struct xfs_trans *,
- xfs_inode_t *, struct xfs_dinode **, struct xfs_buf **,
- xfs_daddr_t, uint, uint);
-int xfs_iread(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
- xfs_inode_t **, xfs_daddr_t, uint);
-int xfs_iread_extents(struct xfs_trans *, xfs_inode_t *, int);
int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, mode_t,
xfs_nlink_t, xfs_dev_t, cred_t *, xfs_prid_t,
int, struct xfs_buf **, boolean_t *, xfs_inode_t **);
-void xfs_dinode_from_disk(struct xfs_icdinode *,
- struct xfs_dinode_core *);
-void xfs_dinode_to_disk(struct xfs_dinode_core *,
- struct xfs_icdinode *);
uint xfs_ip2xflags(struct xfs_inode *);
uint xfs_dic2xflags(struct xfs_dinode *);
@@ -513,17 +515,10 @@ int xfs_itruncate_finish(struct xfs_trans **, xfs_inode_t *,
xfs_fsize_t, int, int);
int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
-void xfs_idestroy_fork(xfs_inode_t *, int);
-void xfs_idestroy(xfs_inode_t *);
-void xfs_idata_realloc(xfs_inode_t *, int, int);
-void xfs_iextract(xfs_inode_t *);
void xfs_iext_realloc(xfs_inode_t *, int, int);
-void xfs_iroot_realloc(xfs_inode_t *, int, int);
void xfs_ipin(xfs_inode_t *);
void xfs_iunpin(xfs_inode_t *);
-int xfs_iextents_copy(xfs_inode_t *, xfs_bmbt_rec_t *, int);
int xfs_iflush(xfs_inode_t *, uint);
-void xfs_iflush_all(struct xfs_mount *);
void xfs_ichgtime(xfs_inode_t *, int);
xfs_fsize_t xfs_file_last_byte(xfs_inode_t *);
void xfs_lock_inodes(xfs_inode_t **, int, uint);
@@ -532,6 +527,77 @@ void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
void xfs_synchronize_atime(xfs_inode_t *);
void xfs_mark_inode_dirty_sync(xfs_inode_t *);
+#if defined(XFS_INODE_TRACE)
+
+#define INODE_TRACE_SIZE 16 /* number of trace entries */
+#define INODE_KTRACE_ENTRY 1
+#define INODE_KTRACE_EXIT 2
+#define INODE_KTRACE_HOLD 3
+#define INODE_KTRACE_REF 4
+#define INODE_KTRACE_RELE 5
+
+extern void _xfs_itrace_entry(struct xfs_inode *, const char *, inst_t *);
+extern void _xfs_itrace_exit(struct xfs_inode *, const char *, inst_t *);
+extern void xfs_itrace_hold(struct xfs_inode *, char *, int, inst_t *);
+extern void _xfs_itrace_ref(struct xfs_inode *, char *, int, inst_t *);
+extern void xfs_itrace_rele(struct xfs_inode *, char *, int, inst_t *);
+#define xfs_itrace_entry(ip) \
+ _xfs_itrace_entry(ip, __func__, (inst_t *)__return_address)
+#define xfs_itrace_exit(ip) \
+ _xfs_itrace_exit(ip, __func__, (inst_t *)__return_address)
+#define xfs_itrace_exit_tag(ip, tag) \
+ _xfs_itrace_exit(ip, tag, (inst_t *)__return_address)
+#define xfs_itrace_ref(ip) \
+ _xfs_itrace_ref(ip, __FILE__, __LINE__, (inst_t *)__return_address)
+
+#else
+#define xfs_itrace_entry(a)
+#define xfs_itrace_exit(a)
+#define xfs_itrace_exit_tag(a, b)
+#define xfs_itrace_hold(a, b, c, d)
+#define xfs_itrace_ref(a)
+#define xfs_itrace_rele(a, b, c, d)
+#endif
+
+#define IHOLD(ip) \
+do { \
+ ASSERT(atomic_read(&VFS_I(ip)->i_count) > 0) ; \
+ atomic_inc(&(VFS_I(ip)->i_count)); \
+ xfs_itrace_hold((ip), __FILE__, __LINE__, (inst_t *)__return_address); \
+} while (0)
+
+#define IRELE(ip) \
+do { \
+ xfs_itrace_rele((ip), __FILE__, __LINE__, (inst_t *)__return_address); \
+ iput(VFS_I(ip)); \
+} while (0)
+
+#endif /* __KERNEL__ */
+
+/*
+ * Flags for xfs_iget()
+ */
+#define XFS_IGET_CREATE 0x1
+#define XFS_IGET_BULKSTAT 0x2
+
+int xfs_inotobp(struct xfs_mount *, struct xfs_trans *,
+ xfs_ino_t, struct xfs_dinode **,
+ struct xfs_buf **, int *, uint);
+int xfs_itobp(struct xfs_mount *, struct xfs_trans *,
+ struct xfs_inode *, struct xfs_dinode **,
+ struct xfs_buf **, uint);
+int xfs_iread(struct xfs_mount *, struct xfs_trans *,
+ struct xfs_inode *, xfs_daddr_t, uint);
+void xfs_dinode_from_disk(struct xfs_icdinode *,
+ struct xfs_dinode *);
+void xfs_dinode_to_disk(struct xfs_dinode *,
+ struct xfs_icdinode *);
+void xfs_idestroy_fork(struct xfs_inode *, int);
+void xfs_idata_realloc(struct xfs_inode *, int, int);
+void xfs_iroot_realloc(struct xfs_inode *, int, int);
+int xfs_iread_extents(struct xfs_trans *, struct xfs_inode *, int);
+int xfs_iextents_copy(struct xfs_inode *, xfs_bmbt_rec_t *, int);
+
xfs_bmbt_rec_host_t *xfs_iext_get_ext(xfs_ifork_t *, xfs_extnum_t);
void xfs_iext_insert(xfs_ifork_t *, xfs_extnum_t, xfs_extnum_t,
xfs_bmbt_irec_t *);
@@ -561,7 +627,8 @@ void xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int);
#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount))
#ifdef DEBUG
-void xfs_isize_check(struct xfs_mount *, xfs_inode_t *, xfs_fsize_t);
+void xfs_isize_check(struct xfs_mount *, struct xfs_inode *,
+ xfs_fsize_t);
#else /* DEBUG */
#define xfs_isize_check(mp, ip, isize)
#endif /* DEBUG */
@@ -576,26 +643,4 @@ extern struct kmem_zone *xfs_ifork_zone;
extern struct kmem_zone *xfs_inode_zone;
extern struct kmem_zone *xfs_ili_zone;
-/*
- * Manage the i_flush queue embedded in the inode. This completion
- * queue synchronizes processes attempting to flush the in-core
- * inode back to disk.
- */
-static inline void xfs_iflock(xfs_inode_t *ip)
-{
- wait_for_completion(&ip->i_flush);
-}
-
-static inline int xfs_iflock_nowait(xfs_inode_t *ip)
-{
- return try_wait_for_completion(&ip->i_flush);
-}
-
-static inline void xfs_ifunlock(xfs_inode_t *ip)
-{
- complete(&ip->i_flush);
-}
-
-#endif /* __KERNEL__ */
-
#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 97c7452e2620..977c4aec587e 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -281,7 +281,7 @@ xfs_inode_item_format(
xfs_mark_inode_dirty_sync(ip);
vecp->i_addr = (xfs_caddr_t)&ip->i_d;
- vecp->i_len = sizeof(xfs_dinode_core_t);
+ vecp->i_len = sizeof(struct xfs_icdinode);
XLOG_VEC_SET_TYPE(vecp, XLOG_REG_TYPE_ICORE);
vecp++;
nvecs++;
@@ -296,9 +296,8 @@ xfs_inode_item_format(
* has a new version number, then we don't bother converting back.
*/
mp = ip->i_mount;
- ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1 ||
- xfs_sb_version_hasnlink(&mp->m_sb));
- if (ip->i_d.di_version == XFS_DINODE_VERSION_1) {
+ ASSERT(ip->i_d.di_version == 1 || xfs_sb_version_hasnlink(&mp->m_sb));
+ if (ip->i_d.di_version == 1) {
if (!xfs_sb_version_hasnlink(&mp->m_sb)) {
/*
* Convert it back.
@@ -311,7 +310,7 @@ xfs_inode_item_format(
* so just make the conversion to the new inode
* format permanent.
*/
- ip->i_d.di_version = XFS_DINODE_VERSION_2;
+ ip->i_d.di_version = 2;
ip->i_d.di_onlink = 0;
memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
}
@@ -932,6 +931,7 @@ xfs_inode_item_init(
iip->ili_item.li_type = XFS_LI_INODE;
iip->ili_item.li_ops = &xfs_inode_item_ops;
iip->ili_item.li_mountp = mp;
+ iip->ili_item.li_ailp = mp->m_ail;
iip->ili_inode = ip;
/*
@@ -942,9 +942,9 @@ xfs_inode_item_init(
iip->ili_format.ilf_type = XFS_LI_INODE;
iip->ili_format.ilf_ino = ip->i_ino;
- iip->ili_format.ilf_blkno = ip->i_blkno;
- iip->ili_format.ilf_len = ip->i_len;
- iip->ili_format.ilf_boffset = ip->i_boffset;
+ iip->ili_format.ilf_blkno = ip->i_imap.im_blkno;
+ iip->ili_format.ilf_len = ip->i_imap.im_len;
+ iip->ili_format.ilf_boffset = ip->i_imap.im_boffset;
}
/*
@@ -976,9 +976,8 @@ xfs_iflush_done(
xfs_buf_t *bp,
xfs_inode_log_item_t *iip)
{
- xfs_inode_t *ip;
-
- ip = iip->ili_inode;
+ xfs_inode_t *ip = iip->ili_inode;
+ struct xfs_ail *ailp = iip->ili_item.li_ailp;
/*
* We only want to pull the item from the AIL if it is
@@ -991,15 +990,12 @@ xfs_iflush_done(
*/
if (iip->ili_logged &&
(iip->ili_item.li_lsn == iip->ili_flush_lsn)) {
- spin_lock(&ip->i_mount->m_ail_lock);
+ spin_lock(&ailp->xa_lock);
if (iip->ili_item.li_lsn == iip->ili_flush_lsn) {
- /*
- * xfs_trans_delete_ail() drops the AIL lock.
- */
- xfs_trans_delete_ail(ip->i_mount,
- (xfs_log_item_t*)iip);
+ /* xfs_trans_ail_delete() drops the AIL lock. */
+ xfs_trans_ail_delete(ailp, (xfs_log_item_t*)iip);
} else {
- spin_unlock(&ip->i_mount->m_ail_lock);
+ spin_unlock(&ailp->xa_lock);
}
}
@@ -1031,21 +1027,20 @@ void
xfs_iflush_abort(
xfs_inode_t *ip)
{
- xfs_inode_log_item_t *iip;
+ xfs_inode_log_item_t *iip = ip->i_itemp;
xfs_mount_t *mp;
iip = ip->i_itemp;
mp = ip->i_mount;
if (iip) {
+ struct xfs_ail *ailp = iip->ili_item.li_ailp;
if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
- spin_lock(&mp->m_ail_lock);
+ spin_lock(&ailp->xa_lock);
if (iip->ili_item.li_flags & XFS_LI_IN_AIL) {
- /*
- * xfs_trans_delete_ail() drops the AIL lock.
- */
- xfs_trans_delete_ail(mp, (xfs_log_item_t *)iip);
+ /* xfs_trans_ail_delete() drops the AIL lock. */
+ xfs_trans_ail_delete(ailp, (xfs_log_item_t *)iip);
} else
- spin_unlock(&mp->m_ail_lock);
+ spin_unlock(&ailp->xa_lock);
}
iip->ili_logged = 0;
/*
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 40513077ab36..1ff04cc323ad 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -112,6 +112,24 @@ typedef struct xfs_inode_log_format_64 {
#define XFS_ILI_IOLOCKED_ANY (XFS_ILI_IOLOCKED_EXCL | XFS_ILI_IOLOCKED_SHARED)
+#define XFS_ILOG_FBROOT(w) xfs_ilog_fbroot(w)
+static inline int xfs_ilog_fbroot(int w)
+{
+ return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
+}
+
+#define XFS_ILOG_FEXT(w) xfs_ilog_fext(w)
+static inline int xfs_ilog_fext(int w)
+{
+ return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
+}
+
+#define XFS_ILOG_FDATA(w) xfs_ilog_fdata(w)
+static inline int xfs_ilog_fdata(int w)
+{
+ return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA);
+}
+
#ifdef __KERNEL__
struct xfs_buf;
@@ -148,26 +166,6 @@ typedef struct xfs_inode_log_item {
} xfs_inode_log_item_t;
-#define XFS_ILOG_FDATA(w) xfs_ilog_fdata(w)
-static inline int xfs_ilog_fdata(int w)
-{
- return (w == XFS_DATA_FORK ? XFS_ILOG_DDATA : XFS_ILOG_ADATA);
-}
-
-#endif /* __KERNEL__ */
-
-#define XFS_ILOG_FBROOT(w) xfs_ilog_fbroot(w)
-static inline int xfs_ilog_fbroot(int w)
-{
- return (w == XFS_DATA_FORK ? XFS_ILOG_DBROOT : XFS_ILOG_ABROOT);
-}
-
-#define XFS_ILOG_FEXT(w) xfs_ilog_fext(w)
-static inline int xfs_ilog_fext(int w)
-{
- return (w == XFS_DATA_FORK ? XFS_ILOG_DEXT : XFS_ILOG_AEXT);
-}
-
static inline int xfs_inode_clean(xfs_inode_t *ip)
{
return (!ip->i_itemp ||
@@ -175,9 +173,6 @@ static inline int xfs_inode_clean(xfs_inode_t *ip)
!ip->i_update_core;
}
-
-#ifdef __KERNEL__
-
extern void xfs_inode_item_init(struct xfs_inode *, struct xfs_mount *);
extern void xfs_inode_item_destroy(struct xfs_inode *);
extern void xfs_iflush_done(struct xfs_buf *, xfs_inode_log_item_t *);
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 67f22b2b44b3..911062cf73a6 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -290,7 +290,6 @@ STATIC int
xfs_iomap_eof_align_last_fsb(
xfs_mount_t *mp,
xfs_inode_t *ip,
- xfs_fsize_t isize,
xfs_extlen_t extsize,
xfs_fileoff_t *last_fsb)
{
@@ -306,14 +305,14 @@ xfs_iomap_eof_align_last_fsb(
* stripe width and we are allocating past the allocation eof.
*/
else if (mp->m_swidth && (mp->m_flags & XFS_MOUNT_SWALLOC) &&
- (isize >= XFS_FSB_TO_B(mp, mp->m_swidth)))
+ (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_swidth)))
new_last_fsb = roundup_64(*last_fsb, mp->m_swidth);
/*
* Roundup the allocation request to a stripe unit (m_dalign) boundary
* if the file size is >= stripe unit size, and we are allocating past
* the allocation eof.
*/
- else if (mp->m_dalign && (isize >= XFS_FSB_TO_B(mp, mp->m_dalign)))
+ else if (mp->m_dalign && (ip->i_size >= XFS_FSB_TO_B(mp, mp->m_dalign)))
new_last_fsb = roundup_64(*last_fsb, mp->m_dalign);
/*
@@ -403,7 +402,6 @@ xfs_iomap_write_direct(
xfs_filblks_t count_fsb, resaligned;
xfs_fsblock_t firstfsb;
xfs_extlen_t extsz, temp;
- xfs_fsize_t isize;
int nimaps;
int bmapi_flag;
int quota_flag;
@@ -426,15 +424,10 @@ xfs_iomap_write_direct(
rt = XFS_IS_REALTIME_INODE(ip);
extsz = xfs_get_extsz_hint(ip);
- isize = ip->i_size;
- if (ip->i_new_size > isize)
- isize = ip->i_new_size;
-
offset_fsb = XFS_B_TO_FSBT(mp, offset);
last_fsb = XFS_B_TO_FSB(mp, ((xfs_ufsize_t)(offset + count)));
- if ((offset + count) > isize) {
- error = xfs_iomap_eof_align_last_fsb(mp, ip, isize, extsz,
- &last_fsb);
+ if ((offset + count) > ip->i_size) {
+ error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
if (error)
goto error_out;
} else {
@@ -559,7 +552,6 @@ STATIC int
xfs_iomap_eof_want_preallocate(
xfs_mount_t *mp,
xfs_inode_t *ip,
- xfs_fsize_t isize,
xfs_off_t offset,
size_t count,
int ioflag,
@@ -573,7 +565,7 @@ xfs_iomap_eof_want_preallocate(
int n, error, imaps;
*prealloc = 0;
- if ((ioflag & BMAPI_SYNC) || (offset + count) <= isize)
+ if ((ioflag & BMAPI_SYNC) || (offset + count) <= ip->i_size)
return 0;
/*
@@ -617,7 +609,6 @@ xfs_iomap_write_delay(
xfs_fileoff_t ioalign;
xfs_fsblock_t firstblock;
xfs_extlen_t extsz;
- xfs_fsize_t isize;
int nimaps;
xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
int prealloc, fsynced = 0;
@@ -637,11 +628,7 @@ xfs_iomap_write_delay(
offset_fsb = XFS_B_TO_FSBT(mp, offset);
retry:
- isize = ip->i_size;
- if (ip->i_new_size > isize)
- isize = ip->i_new_size;
-
- error = xfs_iomap_eof_want_preallocate(mp, ip, isize, offset, count,
+ error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count,
ioflag, imap, XFS_WRITE_IMAPS, &prealloc);
if (error)
return error;
@@ -655,8 +642,7 @@ retry:
}
if (prealloc || extsz) {
- error = xfs_iomap_eof_align_last_fsb(mp, ip, isize, extsz,
- &last_fsb);
+ error = xfs_iomap_eof_align_last_fsb(mp, ip, extsz, &last_fsb);
if (error)
return error;
}
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index cf6754a3c5b3..e19d0a8d5618 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -69,7 +69,7 @@ xfs_bulkstat_one_iget(
}
ASSERT(ip != NULL);
- ASSERT(ip->i_blkno != (xfs_daddr_t)0);
+ ASSERT(ip->i_imap.im_blkno != 0);
dic = &ip->i_d;
@@ -125,13 +125,9 @@ STATIC void
xfs_bulkstat_one_dinode(
xfs_mount_t *mp, /* mount point for filesystem */
xfs_ino_t ino, /* inode number to get data for */
- xfs_dinode_t *dip, /* dinode inode pointer */
+ xfs_dinode_t *dic, /* dinode inode pointer */
xfs_bstat_t *buf) /* return buffer */
{
- xfs_dinode_core_t *dic; /* dinode core info pointer */
-
- dic = &dip->di_core;
-
/*
* The inode format changed when we moved the link count and
* made it 32 bits long. If this is an old format inode,
@@ -143,7 +139,7 @@ xfs_bulkstat_one_dinode(
* the new format. We don't change the version number so that we
* can distinguish this from a real new format inode.
*/
- if (dic->di_version == XFS_DINODE_VERSION_1) {
+ if (dic->di_version == 1) {
buf->bs_nlink = be16_to_cpu(dic->di_onlink);
buf->bs_projid = 0;
} else {
@@ -162,7 +158,7 @@ xfs_bulkstat_one_dinode(
buf->bs_mtime.tv_nsec = be32_to_cpu(dic->di_mtime.t_nsec);
buf->bs_ctime.tv_sec = be32_to_cpu(dic->di_ctime.t_sec);
buf->bs_ctime.tv_nsec = be32_to_cpu(dic->di_ctime.t_nsec);
- buf->bs_xflags = xfs_dic2xflags(dip);
+ buf->bs_xflags = xfs_dic2xflags(dic);
buf->bs_extsize = be32_to_cpu(dic->di_extsize) << mp->m_sb.sb_blocklog;
buf->bs_extents = be32_to_cpu(dic->di_nextents);
buf->bs_gen = be32_to_cpu(dic->di_gen);
@@ -173,7 +169,7 @@ xfs_bulkstat_one_dinode(
switch (dic->di_format) {
case XFS_DINODE_FMT_DEV:
- buf->bs_rdev = be32_to_cpu(dip->di_u.di_dev);
+ buf->bs_rdev = xfs_dinode_get_rdev(dic);
buf->bs_blksize = BLKDEV_IOSIZE;
buf->bs_blocks = 0;
break;
@@ -192,27 +188,34 @@ xfs_bulkstat_one_dinode(
}
}
+/* Return 0 on success or positive error */
STATIC int
xfs_bulkstat_one_fmt(
void __user *ubuffer,
+ int ubsize,
+ int *ubused,
const xfs_bstat_t *buffer)
{
+ if (ubsize < sizeof(*buffer))
+ return XFS_ERROR(ENOMEM);
if (copy_to_user(ubuffer, buffer, sizeof(*buffer)))
- return -EFAULT;
- return sizeof(*buffer);
+ return XFS_ERROR(EFAULT);
+ if (ubused)
+ *ubused = sizeof(*buffer);
+ return 0;
}
/*
* Return stat information for one inode.
* Return 0 if ok, else errno.
*/
-int /* error status */
-xfs_bulkstat_one(
+int /* error status */
+xfs_bulkstat_one_int(
xfs_mount_t *mp, /* mount point for filesystem */
xfs_ino_t ino, /* inode number to get data for */
void __user *buffer, /* buffer to place output in */
int ubsize, /* size of buffer */
- void *private_data, /* my private data */
+ bulkstat_one_fmt_pf formatter, /* formatter, copy to user */
xfs_daddr_t bno, /* starting bno of inode cluster */
int *ubused, /* bytes used by me */
void *dibuff, /* on-disk inode buffer */
@@ -221,15 +224,12 @@ xfs_bulkstat_one(
xfs_bstat_t *buf; /* return buffer */
int error = 0; /* error value */
xfs_dinode_t *dip; /* dinode inode pointer */
- bulkstat_one_fmt_pf formatter = private_data ? : xfs_bulkstat_one_fmt;
dip = (xfs_dinode_t *)dibuff;
*stat = BULKSTAT_RV_NOTHING;
if (!buffer || xfs_internal_inum(mp, ino))
return XFS_ERROR(EINVAL);
- if (ubsize < sizeof(*buf))
- return XFS_ERROR(ENOMEM);
buf = kmem_alloc(sizeof(*buf), KM_SLEEP);
@@ -244,21 +244,34 @@ xfs_bulkstat_one(
xfs_bulkstat_one_dinode(mp, ino, dip, buf);
}
- error = formatter(buffer, buf);
- if (error < 0) {
- error = EFAULT;
+ error = formatter(buffer, ubsize, ubused, buf);
+ if (error)
goto out_free;
- }
*stat = BULKSTAT_RV_DIDONE;
- if (ubused)
- *ubused = error;
out_free:
kmem_free(buf);
return error;
}
+int
+xfs_bulkstat_one(
+ xfs_mount_t *mp, /* mount point for filesystem */
+ xfs_ino_t ino, /* inode number to get data for */
+ void __user *buffer, /* buffer to place output in */
+ int ubsize, /* size of buffer */
+ void *private_data, /* my private data */
+ xfs_daddr_t bno, /* starting bno of inode cluster */
+ int *ubused, /* bytes used by me */
+ void *dibuff, /* on-disk inode buffer */
+ int *stat) /* BULKSTAT_RV_... */
+{
+ return xfs_bulkstat_one_int(mp, ino, buffer, ubsize,
+ xfs_bulkstat_one_fmt, bno,
+ ubused, dibuff, stat);
+}
+
/*
* Test to see whether we can use the ondisk inode directly, based
* on the given bulkstat flags, filling in dipp accordingly.
@@ -287,19 +300,19 @@ xfs_bulkstat_use_dinode(
* to disk yet. This is a temporary hack that would require a proper
* fix in the future.
*/
- if (be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC ||
- !XFS_DINODE_GOOD_VERSION(dip->di_core.di_version) ||
- !dip->di_core.di_mode)
+ if (be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC ||
+ !XFS_DINODE_GOOD_VERSION(dip->di_version) ||
+ !dip->di_mode)
return 0;
if (flags & BULKSTAT_FG_QUICK) {
*dipp = dip;
return 1;
}
/* BULKSTAT_FG_INLINE: if attr fork is local, or not there, use it */
- aformat = dip->di_core.di_aformat;
+ aformat = dip->di_aformat;
if ((XFS_DFORK_Q(dip) == 0) ||
(aformat == XFS_DINODE_FMT_LOCAL) ||
- (aformat == XFS_DINODE_FMT_EXTENTS && !dip->di_core.di_anextents)) {
+ (aformat == XFS_DINODE_FMT_EXTENTS && !dip->di_anextents)) {
*dipp = dip;
return 1;
}
@@ -359,7 +372,6 @@ xfs_bulkstat(
int ubused; /* bytes used by formatter */
xfs_buf_t *bp; /* ptr to on-disk inode cluster buf */
xfs_dinode_t *dip; /* ptr into bp for specific inode */
- xfs_inode_t *ip; /* ptr to in-core inode struct */
/*
* Get the last inode value, see if there's nothing to do.
@@ -416,8 +428,7 @@ xfs_bulkstat(
/*
* Allocate and initialize a btree cursor for ialloc btree.
*/
- cur = xfs_btree_init_cursor(mp, NULL, agbp, agno, XFS_BTNUM_INO,
- (xfs_inode_t *)0, 0);
+ cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno);
irbp = irbuf;
irbufend = irbuf + nirbuf;
end_of_ag = 0;
@@ -472,7 +483,7 @@ xfs_bulkstat(
* In any case, increment to the next record.
*/
if (!error)
- error = xfs_inobt_increment(cur, 0, &tmp);
+ error = xfs_btree_increment(cur, 0, &tmp);
} else {
/*
* Start of ag. Lookup the first inode chunk.
@@ -539,7 +550,7 @@ xfs_bulkstat(
* Set agino to after this chunk and bump the cursor.
*/
agino = gino + XFS_INODES_PER_CHUNK;
- error = xfs_inobt_increment(cur, 0, &tmp);
+ error = xfs_btree_increment(cur, 0, &tmp);
cond_resched();
}
/*
@@ -586,6 +597,8 @@ xfs_bulkstat(
if (flags & (BULKSTAT_FG_QUICK |
BULKSTAT_FG_INLINE)) {
+ int offset;
+
ino = XFS_AGINO_TO_INO(mp, agno,
agino);
bno = XFS_AGB_TO_DADDR(mp, agno,
@@ -594,21 +607,15 @@ xfs_bulkstat(
/*
* Get the inode cluster buffer
*/
- ASSERT(xfs_inode_zone != NULL);
- ip = kmem_zone_zalloc(xfs_inode_zone,
- KM_SLEEP);
- ip->i_ino = ino;
- ip->i_mount = mp;
- spin_lock_init(&ip->i_flags_lock);
if (bp)
xfs_buf_relse(bp);
- error = xfs_itobp(mp, NULL, ip,
- &dip, &bp, bno,
- XFS_IMAP_BULKSTAT,
- XFS_BUF_LOCK);
+
+ error = xfs_inotobp(mp, NULL, ino, &dip,
+ &bp, &offset,
+ XFS_IGET_BULKSTAT);
+
if (!error)
- clustidx = ip->i_boffset / mp->m_sb.sb_inodesize;
- kmem_zone_free(xfs_inode_zone, ip);
+ clustidx = offset / mp->m_sb.sb_inodesize;
if (XFS_TEST_ERROR(error != 0,
mp, XFS_ERRTAG_BULKSTAT_READ_CHUNK,
XFS_RANDOM_BULKSTAT_READ_CHUNK)) {
@@ -842,8 +849,7 @@ xfs_inumbers(
agino = 0;
continue;
}
- cur = xfs_btree_init_cursor(mp, NULL, agbp, agno,
- XFS_BTNUM_INO, (xfs_inode_t *)0, 0);
+ cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno);
error = xfs_inobt_lookup_ge(cur, agino, 0, 0, &tmp);
if (error) {
xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
@@ -887,7 +893,7 @@ xfs_inumbers(
bufidx = 0;
}
if (left) {
- error = xfs_inobt_increment(cur, 0, &tmp);
+ error = xfs_btree_increment(cur, 0, &tmp);
if (error) {
xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
cur = NULL;
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h
index a1f18fce9b70..1fb04e7deb61 100644
--- a/fs/xfs/xfs_itable.h
+++ b/fs/xfs/xfs_itable.h
@@ -71,9 +71,23 @@ xfs_bulkstat_single(
typedef int (*bulkstat_one_fmt_pf)( /* used size in bytes or negative error */
void __user *ubuffer, /* buffer to write to */
+ int ubsize, /* remaining user buffer sz */
+ int *ubused, /* bytes used by formatter */
const xfs_bstat_t *buffer); /* buffer to read from */
int
+xfs_bulkstat_one_int(
+ xfs_mount_t *mp,
+ xfs_ino_t ino,
+ void __user *buffer,
+ int ubsize,
+ bulkstat_one_fmt_pf formatter,
+ xfs_daddr_t bno,
+ int *ubused,
+ void *dibuff,
+ int *stat);
+
+int
xfs_bulkstat_one(
xfs_mount_t *mp,
xfs_ino_t ino,
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 3608a0f0a5f6..f4726f702a9e 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -100,12 +100,11 @@ STATIC void xlog_ungrant_log_space(xlog_t *log,
/* local ticket functions */
-STATIC xlog_ticket_t *xlog_ticket_get(xlog_t *log,
+STATIC xlog_ticket_t *xlog_ticket_alloc(xlog_t *log,
int unit_bytes,
int count,
char clientid,
uint flags);
-STATIC void xlog_ticket_put(xlog_t *log, xlog_ticket_t *ticket);
#if defined(DEBUG)
STATIC void xlog_verify_dest_ptr(xlog_t *log, __psint_t ptr);
@@ -360,7 +359,7 @@ xfs_log_done(xfs_mount_t *mp,
*/
xlog_trace_loggrant(log, ticket, "xfs_log_done: (non-permanent)");
xlog_ungrant_log_space(log, ticket);
- xlog_ticket_put(log, ticket);
+ xfs_log_ticket_put(ticket);
} else {
xlog_trace_loggrant(log, ticket, "xfs_log_done: (permanent)");
xlog_regrant_reserve_log_space(log, ticket);
@@ -514,7 +513,7 @@ xfs_log_reserve(xfs_mount_t *mp,
retval = xlog_regrant_write_log_space(log, internal_ticket);
} else {
/* may sleep if need to allocate more tickets */
- internal_ticket = xlog_ticket_get(log, unit_bytes, cnt,
+ internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt,
client, flags);
if (!internal_ticket)
return XFS_ERROR(ENOMEM);
@@ -572,12 +571,12 @@ xfs_log_mount(
/*
* Initialize the AIL now we have a log.
*/
- spin_lock_init(&mp->m_ail_lock);
error = xfs_trans_ail_init(mp);
if (error) {
cmn_err(CE_WARN, "XFS: AIL initialisation failed: error %d", error);
goto error;
}
+ mp->m_log->l_ailp = mp->m_ail;
/*
* skip log recovery on a norecovery mount. pretend it all
@@ -730,8 +729,8 @@ xfs_log_unmount_write(xfs_mount_t *mp)
spin_lock(&log->l_icloglock);
iclog = log->l_iclog;
atomic_inc(&iclog->ic_refcnt);
- spin_unlock(&log->l_icloglock);
xlog_state_want_sync(log, iclog);
+ spin_unlock(&log->l_icloglock);
error = xlog_state_release_iclog(log, iclog);
spin_lock(&log->l_icloglock);
@@ -749,7 +748,7 @@ xfs_log_unmount_write(xfs_mount_t *mp)
if (tic) {
xlog_trace_loggrant(log, tic, "unmount rec");
xlog_ungrant_log_space(log, tic);
- xlog_ticket_put(log, tic);
+ xfs_log_ticket_put(tic);
}
} else {
/*
@@ -768,9 +767,9 @@ xfs_log_unmount_write(xfs_mount_t *mp)
spin_lock(&log->l_icloglock);
iclog = log->l_iclog;
atomic_inc(&iclog->ic_refcnt);
- spin_unlock(&log->l_icloglock);
xlog_state_want_sync(log, iclog);
+ spin_unlock(&log->l_icloglock);
error = xlog_state_release_iclog(log, iclog);
spin_lock(&log->l_icloglock);
@@ -906,7 +905,7 @@ xfs_log_move_tail(xfs_mount_t *mp,
int
xfs_log_need_covered(xfs_mount_t *mp)
{
- int needed = 0, gen;
+ int needed = 0;
xlog_t *log = mp->m_log;
if (!xfs_fs_writable(mp))
@@ -915,7 +914,7 @@ xfs_log_need_covered(xfs_mount_t *mp)
spin_lock(&log->l_icloglock);
if (((log->l_covered_state == XLOG_STATE_COVER_NEED) ||
(log->l_covered_state == XLOG_STATE_COVER_NEED2))
- && !xfs_trans_first_ail(mp, &gen)
+ && !xfs_trans_ail_tail(log->l_ailp)
&& xlog_iclogs_empty(log)) {
if (log->l_covered_state == XLOG_STATE_COVER_NEED)
log->l_covered_state = XLOG_STATE_COVER_DONE;
@@ -952,7 +951,7 @@ xlog_assign_tail_lsn(xfs_mount_t *mp)
xfs_lsn_t tail_lsn;
xlog_t *log = mp->m_log;
- tail_lsn = xfs_trans_tail_ail(mp);
+ tail_lsn = xfs_trans_ail_tail(mp->m_ail);
spin_lock(&log->l_grant_lock);
if (tail_lsn != 0) {
log->l_tail_lsn = tail_lsn;
@@ -1030,12 +1029,6 @@ xlog_iodone(xfs_buf_t *bp)
ASSERT(XFS_BUF_FSPRIVATE2(bp, unsigned long) == (unsigned long) 2);
XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
aborted = 0;
-
- /*
- * Some versions of cpp barf on the recursive definition of
- * ic_log -> hic_fields.ic_log and expand ic_log twice when
- * it is passed through two macros. Workaround broken cpp.
- */
l = iclog->ic_log;
/*
@@ -1302,7 +1295,7 @@ xlog_alloc_log(xfs_mount_t *mp,
XFS_BUF_SET_BDSTRAT_FUNC(bp, xlog_bdstrat_cb);
XFS_BUF_SET_FSPRIVATE2(bp, (unsigned long)1);
iclog->ic_bp = bp;
- iclog->hic_data = bp->b_addr;
+ iclog->ic_data = bp->b_addr;
#ifdef DEBUG
log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header);
#endif
@@ -1322,7 +1315,7 @@ xlog_alloc_log(xfs_mount_t *mp,
atomic_set(&iclog->ic_refcnt, 0);
spin_lock_init(&iclog->ic_callback_lock);
iclog->ic_callback_tail = &(iclog->ic_callback);
- iclog->ic_datap = (char *)iclog->hic_data + log->l_iclog_hsize;
+ iclog->ic_datap = (char *)iclog->ic_data + log->l_iclog_hsize;
ASSERT(XFS_BUF_ISBUSY(iclog->ic_bp));
ASSERT(XFS_BUF_VALUSEMA(iclog->ic_bp) <= 0);
@@ -1446,7 +1439,7 @@ xlog_grant_push_ail(xfs_mount_t *mp,
*/
if (threshold_lsn &&
!XLOG_FORCED_SHUTDOWN(log))
- xfs_trans_push_ail(mp, threshold_lsn);
+ xfs_trans_ail_push(log->l_ailp, threshold_lsn);
} /* xlog_grant_push_ail */
@@ -1991,7 +1984,9 @@ xlog_write(xfs_mount_t * mp,
if (iclog->ic_size - log_offset <= sizeof(xlog_op_header_t)) {
xlog_state_finish_copy(log, iclog, record_cnt, data_cnt);
record_cnt = data_cnt = 0;
+ spin_lock(&log->l_icloglock);
xlog_state_want_sync(log, iclog);
+ spin_unlock(&log->l_icloglock);
if (commit_iclog) {
ASSERT(flags & XLOG_COMMIT_TRANS);
*commit_iclog = iclog;
@@ -3200,7 +3195,7 @@ try_again:
STATIC void
xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
{
- spin_lock(&log->l_icloglock);
+ ASSERT(spin_is_locked(&log->l_icloglock));
if (iclog->ic_state == XLOG_STATE_ACTIVE) {
xlog_state_switch_iclogs(log, iclog, 0);
@@ -3208,10 +3203,7 @@ xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
ASSERT(iclog->ic_state &
(XLOG_STATE_WANT_SYNC|XLOG_STATE_IOERROR));
}
-
- spin_unlock(&log->l_icloglock);
-} /* xlog_state_want_sync */
-
+}
/*****************************************************************************
@@ -3222,22 +3214,33 @@ xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog)
*/
/*
- * Free a used ticket.
+ * Free a used ticket when its refcount falls to zero.
*/
-STATIC void
-xlog_ticket_put(xlog_t *log,
- xlog_ticket_t *ticket)
+void
+xfs_log_ticket_put(
+ xlog_ticket_t *ticket)
{
- sv_destroy(&ticket->t_wait);
- kmem_zone_free(xfs_log_ticket_zone, ticket);
-} /* xlog_ticket_put */
+ ASSERT(atomic_read(&ticket->t_ref) > 0);
+ if (atomic_dec_and_test(&ticket->t_ref)) {
+ sv_destroy(&ticket->t_wait);
+ kmem_zone_free(xfs_log_ticket_zone, ticket);
+ }
+}
+xlog_ticket_t *
+xfs_log_ticket_get(
+ xlog_ticket_t *ticket)
+{
+ ASSERT(atomic_read(&ticket->t_ref) > 0);
+ atomic_inc(&ticket->t_ref);
+ return ticket;
+}
/*
* Allocate and initialise a new log ticket.
*/
STATIC xlog_ticket_t *
-xlog_ticket_get(xlog_t *log,
+xlog_ticket_alloc(xlog_t *log,
int unit_bytes,
int cnt,
char client,
@@ -3308,6 +3311,7 @@ xlog_ticket_get(xlog_t *log,
unit_bytes += 2*BBSIZE;
}
+ atomic_set(&tic->t_ref, 1);
tic->t_unit_res = unit_bytes;
tic->t_curr_res = unit_bytes;
tic->t_cnt = cnt;
@@ -3323,7 +3327,7 @@ xlog_ticket_get(xlog_t *log,
xlog_tic_reset_res(tic);
return tic;
-} /* xlog_ticket_get */
+}
/******************************************************************************
@@ -3452,7 +3456,7 @@ xlog_verify_iclog(xlog_t *log,
ptr = iclog->ic_datap;
base_ptr = ptr;
ophead = (xlog_op_header_t *)ptr;
- xhdr = (xlog_in_core_2_t *)&iclog->ic_header;
+ xhdr = iclog->ic_data;
for (i = 0; i < len; i++) {
ophead = (xlog_op_header_t *)ptr;
@@ -3558,7 +3562,8 @@ xfs_log_force_umount(
if (!log ||
log->l_flags & XLOG_ACTIVE_RECOVERY) {
mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
- XFS_BUF_DONE(mp->m_sb_bp);
+ if (mp->m_sb_bp)
+ XFS_BUF_DONE(mp->m_sb_bp);
return 0;
}
@@ -3579,7 +3584,9 @@ xfs_log_force_umount(
spin_lock(&log->l_icloglock);
spin_lock(&log->l_grant_lock);
mp->m_flags |= XFS_MOUNT_FS_SHUTDOWN;
- XFS_BUF_DONE(mp->m_sb_bp);
+ if (mp->m_sb_bp)
+ XFS_BUF_DONE(mp->m_sb_bp);
+
/*
* This flag is sort of redundant because of the mount flag, but
* it's good to maintain the separation between the log and the rest
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index d47b91f10822..8a3e84e900a3 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -134,6 +134,7 @@ typedef struct xfs_log_callback {
#ifdef __KERNEL__
/* Log manager interfaces */
struct xfs_mount;
+struct xlog_ticket;
xfs_lsn_t xfs_log_done(struct xfs_mount *mp,
xfs_log_ticket_t ticket,
void **iclog,
@@ -177,6 +178,9 @@ int xfs_log_need_covered(struct xfs_mount *mp);
void xlog_iodone(struct xfs_buf *);
+struct xlog_ticket * xfs_log_ticket_get(struct xlog_ticket *ticket);
+void xfs_log_ticket_put(struct xlog_ticket *ticket);
+
#endif
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index e7d8f84443fa..654167be0efb 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -245,6 +245,7 @@ typedef struct xlog_ticket {
struct xlog_ticket *t_next; /* :4|8 */
struct xlog_ticket *t_prev; /* :4|8 */
xlog_tid_t t_tid; /* transaction identifier : 4 */
+ atomic_t t_ref; /* ticket reference count : 4 */
int t_curr_res; /* current reservation in bytes : 4 */
int t_unit_res; /* unit reservation in bytes : 4 */
char t_ocnt; /* original count : 1 */
@@ -309,6 +310,16 @@ typedef struct xlog_rec_ext_header {
} xlog_rec_ext_header_t;
#ifdef __KERNEL__
+
+/*
+ * Quite misnamed, because this union lays out the actual on-disk log buffer.
+ */
+typedef union xlog_in_core2 {
+ xlog_rec_header_t hic_header;
+ xlog_rec_ext_header_t hic_xheader;
+ char hic_sector[XLOG_HEADER_SIZE];
+} xlog_in_core_2_t;
+
/*
* - A log record header is 512 bytes. There is plenty of room to grow the
* xlog_rec_header_t into the reserved space.
@@ -338,7 +349,7 @@ typedef struct xlog_rec_ext_header {
* We'll put all the read-only and l_icloglock fields in the first cacheline,
* and move everything else out to subsequent cachelines.
*/
-typedef struct xlog_iclog_fields {
+typedef struct xlog_in_core {
sv_t ic_force_wait;
sv_t ic_write_wait;
struct xlog_in_core *ic_next;
@@ -361,41 +372,11 @@ typedef struct xlog_iclog_fields {
/* reference counts need their own cacheline */
atomic_t ic_refcnt ____cacheline_aligned_in_smp;
-} xlog_iclog_fields_t;
-
-typedef union xlog_in_core2 {
- xlog_rec_header_t hic_header;
- xlog_rec_ext_header_t hic_xheader;
- char hic_sector[XLOG_HEADER_SIZE];
-} xlog_in_core_2_t;
-
-typedef struct xlog_in_core {
- xlog_iclog_fields_t hic_fields;
- xlog_in_core_2_t *hic_data;
+ xlog_in_core_2_t *ic_data;
+#define ic_header ic_data->hic_header
} xlog_in_core_t;
/*
- * Defines to save our code from this glop.
- */
-#define ic_force_wait hic_fields.ic_force_wait
-#define ic_write_wait hic_fields.ic_write_wait
-#define ic_next hic_fields.ic_next
-#define ic_prev hic_fields.ic_prev
-#define ic_bp hic_fields.ic_bp
-#define ic_log hic_fields.ic_log
-#define ic_callback hic_fields.ic_callback
-#define ic_callback_lock hic_fields.ic_callback_lock
-#define ic_callback_tail hic_fields.ic_callback_tail
-#define ic_trace hic_fields.ic_trace
-#define ic_size hic_fields.ic_size
-#define ic_offset hic_fields.ic_offset
-#define ic_refcnt hic_fields.ic_refcnt
-#define ic_bwritecnt hic_fields.ic_bwritecnt
-#define ic_state hic_fields.ic_state
-#define ic_datap hic_fields.ic_datap
-#define ic_header hic_data->hic_header
-
-/*
* The reservation head lsn is not made up of a cycle number and block number.
* Instead, it uses a cycle number and byte number. Logs don't expect to
* overflow 31 bits worth of byte offset, so using a byte number will mean
@@ -404,6 +385,7 @@ typedef struct xlog_in_core {
typedef struct log {
/* The following fields don't need locking */
struct xfs_mount *l_mp; /* mount point */
+ struct xfs_ail *l_ailp; /* AIL log is working with */
struct xfs_buf *l_xbuf; /* extra buffer for log
* wrapping */
struct xfs_buftarg *l_targ; /* buftarg of log */
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 70e3ba32e6be..35cca98bd94c 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -36,7 +36,6 @@
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_inode_item.h"
-#include "xfs_imap.h"
#include "xfs_alloc.h"
#include "xfs_ialloc.h"
#include "xfs_log_priv.h"
@@ -54,10 +53,8 @@ STATIC void xlog_recover_insert_item_backq(xlog_recover_item_t **q,
xlog_recover_item_t *item);
#if defined(DEBUG)
STATIC void xlog_recover_check_summary(xlog_t *);
-STATIC void xlog_recover_check_ail(xfs_mount_t *, xfs_log_item_t *, int);
#else
#define xlog_recover_check_summary(log)
-#define xlog_recover_check_ail(mp, lip, gen)
#endif
@@ -270,21 +267,16 @@ STATIC void
xlog_recover_iodone(
struct xfs_buf *bp)
{
- xfs_mount_t *mp;
-
- ASSERT(XFS_BUF_FSPRIVATE(bp, void *));
-
if (XFS_BUF_GETERROR(bp)) {
/*
* We're not going to bother about retrying
* this during recovery. One strike!
*/
- mp = XFS_BUF_FSPRIVATE(bp, xfs_mount_t *);
xfs_ioerror_alert("xlog_recover_iodone",
- mp, bp, XFS_BUF_ADDR(bp));
- xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
+ bp->b_mount, bp, XFS_BUF_ADDR(bp));
+ xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
}
- XFS_BUF_SET_FSPRIVATE(bp, NULL);
+ bp->b_mount = NULL;
XFS_BUF_CLR_IODONE_FUNC(bp);
xfs_biodone(bp);
}
@@ -2228,9 +2220,8 @@ xlog_recover_do_buffer_trans(
XFS_BUF_STALE(bp);
error = xfs_bwrite(mp, bp);
} else {
- ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL ||
- XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp);
- XFS_BUF_SET_FSPRIVATE(bp, mp);
+ ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
+ bp->b_mount = mp;
XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
xfs_bdwrite(mp, bp);
}
@@ -2247,7 +2238,6 @@ xlog_recover_do_inode_trans(
xfs_inode_log_format_t *in_f;
xfs_mount_t *mp;
xfs_buf_t *bp;
- xfs_imap_t imap;
xfs_dinode_t *dip;
xfs_ino_t ino;
int len;
@@ -2275,54 +2265,35 @@ xlog_recover_do_inode_trans(
}
ino = in_f->ilf_ino;
mp = log->l_mp;
- if (ITEM_TYPE(item) == XFS_LI_INODE) {
- imap.im_blkno = (xfs_daddr_t)in_f->ilf_blkno;
- imap.im_len = in_f->ilf_len;
- imap.im_boffset = in_f->ilf_boffset;
- } else {
- /*
- * It's an old inode format record. We don't know where
- * its cluster is located on disk, and we can't allow
- * xfs_imap() to figure it out because the inode btrees
- * are not ready to be used. Therefore do not pass the
- * XFS_IMAP_LOOKUP flag to xfs_imap(). This will give
- * us only the single block in which the inode lives
- * rather than its cluster, so we must make sure to
- * invalidate the buffer when we write it out below.
- */
- imap.im_blkno = 0;
- error = xfs_imap(log->l_mp, NULL, ino, &imap, 0);
- if (error)
- goto error;
- }
/*
* Inode buffers can be freed, look out for it,
* and do not replay the inode.
*/
- if (xlog_check_buffer_cancelled(log, imap.im_blkno, imap.im_len, 0)) {
+ if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
+ in_f->ilf_len, 0)) {
error = 0;
goto error;
}
- bp = xfs_buf_read_flags(mp->m_ddev_targp, imap.im_blkno, imap.im_len,
- XFS_BUF_LOCK);
+ bp = xfs_buf_read_flags(mp->m_ddev_targp, in_f->ilf_blkno,
+ in_f->ilf_len, XFS_BUF_LOCK);
if (XFS_BUF_ISERROR(bp)) {
xfs_ioerror_alert("xlog_recover_do..(read#2)", mp,
- bp, imap.im_blkno);
+ bp, in_f->ilf_blkno);
error = XFS_BUF_GETERROR(bp);
xfs_buf_relse(bp);
goto error;
}
error = 0;
ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
- dip = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
+ dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset);
/*
* Make sure the place we're flushing out to really looks
* like an inode!
*/
- if (unlikely(be16_to_cpu(dip->di_core.di_magic) != XFS_DINODE_MAGIC)) {
+ if (unlikely(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)) {
xfs_buf_relse(bp);
xfs_fs_cmn_err(CE_ALERT, mp,
"xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld",
@@ -2345,12 +2316,12 @@ xlog_recover_do_inode_trans(
}
/* Skip replay when the on disk inode is newer than the log one */
- if (dicp->di_flushiter < be16_to_cpu(dip->di_core.di_flushiter)) {
+ if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
/*
* Deal with the wrap case, DI_MAX_FLUSH is less
* than smaller numbers
*/
- if (be16_to_cpu(dip->di_core.di_flushiter) == DI_MAX_FLUSH &&
+ if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) {
/* do nothing */
} else {
@@ -2410,7 +2381,7 @@ xlog_recover_do_inode_trans(
error = EFSCORRUPTED;
goto error;
}
- if (unlikely(item->ri_buf[1].i_len > sizeof(xfs_dinode_core_t))) {
+ if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) {
XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)",
XFS_ERRLEVEL_LOW, mp, dicp);
xfs_buf_relse(bp);
@@ -2422,23 +2393,24 @@ xlog_recover_do_inode_trans(
}
/* The core is in in-core format */
- xfs_dinode_to_disk(&dip->di_core,
- (xfs_icdinode_t *)item->ri_buf[1].i_addr);
+ xfs_dinode_to_disk(dip, (xfs_icdinode_t *)item->ri_buf[1].i_addr);
/* the rest is in on-disk format */
- if (item->ri_buf[1].i_len > sizeof(xfs_dinode_core_t)) {
- memcpy((xfs_caddr_t) dip + sizeof(xfs_dinode_core_t),
- item->ri_buf[1].i_addr + sizeof(xfs_dinode_core_t),
- item->ri_buf[1].i_len - sizeof(xfs_dinode_core_t));
+ if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) {
+ memcpy((xfs_caddr_t) dip + sizeof(struct xfs_icdinode),
+ item->ri_buf[1].i_addr + sizeof(struct xfs_icdinode),
+ item->ri_buf[1].i_len - sizeof(struct xfs_icdinode));
}
fields = in_f->ilf_fields;
switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) {
case XFS_ILOG_DEV:
- dip->di_u.di_dev = cpu_to_be32(in_f->ilf_u.ilfu_rdev);
+ xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev);
break;
case XFS_ILOG_UUID:
- dip->di_u.di_muuid = in_f->ilf_u.ilfu_uuid;
+ memcpy(XFS_DFORK_DPTR(dip),
+ &in_f->ilf_u.ilfu_uuid,
+ sizeof(uuid_t));
break;
}
@@ -2454,12 +2426,12 @@ xlog_recover_do_inode_trans(
switch (fields & XFS_ILOG_DFORK) {
case XFS_ILOG_DDATA:
case XFS_ILOG_DEXT:
- memcpy(&dip->di_u, src, len);
+ memcpy(XFS_DFORK_DPTR(dip), src, len);
break;
case XFS_ILOG_DBROOT:
- xfs_bmbt_to_bmdr((xfs_bmbt_block_t *)src, len,
- &(dip->di_u.di_bmbt),
+ xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len,
+ (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip),
XFS_DFORK_DSIZE(dip, mp));
break;
@@ -2496,8 +2468,8 @@ xlog_recover_do_inode_trans(
case XFS_ILOG_ABROOT:
dest = XFS_DFORK_APTR(dip);
- xfs_bmbt_to_bmdr((xfs_bmbt_block_t *)src, len,
- (xfs_bmdr_block_t*)dest,
+ xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src,
+ len, (xfs_bmdr_block_t*)dest,
XFS_DFORK_ASIZE(dip, mp));
break;
@@ -2512,9 +2484,8 @@ xlog_recover_do_inode_trans(
write_inode_buffer:
if (ITEM_TYPE(item) == XFS_LI_INODE) {
- ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL ||
- XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp);
- XFS_BUF_SET_FSPRIVATE(bp, mp);
+ ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
+ bp->b_mount = mp;
XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
xfs_bdwrite(mp, bp);
} else {
@@ -2645,9 +2616,8 @@ xlog_recover_do_dquot_trans(
memcpy(ddq, recddq, item->ri_buf[1].i_len);
ASSERT(dq_f->qlf_size == 2);
- ASSERT(XFS_BUF_FSPRIVATE(bp, void *) == NULL ||
- XFS_BUF_FSPRIVATE(bp, xfs_mount_t *) == mp);
- XFS_BUF_SET_FSPRIVATE(bp, mp);
+ ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
+ bp->b_mount = mp;
XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
xfs_bdwrite(mp, bp);
@@ -2689,11 +2659,11 @@ xlog_recover_do_efi_trans(
efip->efi_next_extent = efi_formatp->efi_nextents;
efip->efi_flags |= XFS_EFI_COMMITTED;
- spin_lock(&mp->m_ail_lock);
+ spin_lock(&log->l_ailp->xa_lock);
/*
- * xfs_trans_update_ail() drops the AIL lock.
+ * xfs_trans_ail_update() drops the AIL lock.
*/
- xfs_trans_update_ail(mp, (xfs_log_item_t *)efip, lsn);
+ xfs_trans_ail_update(log->l_ailp, (xfs_log_item_t *)efip, lsn);
return 0;
}
@@ -2712,12 +2682,12 @@ xlog_recover_do_efd_trans(
xlog_recover_item_t *item,
int pass)
{
- xfs_mount_t *mp;
xfs_efd_log_format_t *efd_formatp;
xfs_efi_log_item_t *efip = NULL;
xfs_log_item_t *lip;
- int gen;
__uint64_t efi_id;
+ struct xfs_ail_cursor cur;
+ struct xfs_ail *ailp = log->l_ailp;
if (pass == XLOG_RECOVER_PASS1) {
return;
@@ -2734,25 +2704,26 @@ xlog_recover_do_efd_trans(
* Search for the efi with the id in the efd format structure
* in the AIL.
*/
- mp = log->l_mp;
- spin_lock(&mp->m_ail_lock);
- lip = xfs_trans_first_ail(mp, &gen);
+ spin_lock(&ailp->xa_lock);
+ lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
while (lip != NULL) {
if (lip->li_type == XFS_LI_EFI) {
efip = (xfs_efi_log_item_t *)lip;
if (efip->efi_format.efi_id == efi_id) {
/*
- * xfs_trans_delete_ail() drops the
+ * xfs_trans_ail_delete() drops the
* AIL lock.
*/
- xfs_trans_delete_ail(mp, lip);
+ xfs_trans_ail_delete(ailp, lip);
xfs_efi_item_free(efip);
- return;
+ spin_lock(&ailp->xa_lock);
+ break;
}
}
- lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
+ lip = xfs_trans_ail_cursor_next(ailp, &cur);
}
- spin_unlock(&mp->m_ail_lock);
+ xfs_trans_ail_cursor_done(ailp, &cur);
+ spin_unlock(&ailp->xa_lock);
}
/*
@@ -3036,33 +3007,6 @@ abort_error:
}
/*
- * Verify that once we've encountered something other than an EFI
- * in the AIL that there are no more EFIs in the AIL.
- */
-#if defined(DEBUG)
-STATIC void
-xlog_recover_check_ail(
- xfs_mount_t *mp,
- xfs_log_item_t *lip,
- int gen)
-{
- int orig_gen = gen;
-
- do {
- ASSERT(lip->li_type != XFS_LI_EFI);
- lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
- /*
- * The check will be bogus if we restart from the
- * beginning of the AIL, so ASSERT that we don't.
- * We never should since we're holding the AIL lock
- * the entire time.
- */
- ASSERT(gen == orig_gen);
- } while (lip != NULL);
-}
-#endif /* DEBUG */
-
-/*
* When this is called, all of the EFIs which did not have
* corresponding EFDs should be in the AIL. What we do now
* is free the extents associated with each one.
@@ -3086,20 +3030,23 @@ xlog_recover_process_efis(
{
xfs_log_item_t *lip;
xfs_efi_log_item_t *efip;
- int gen;
- xfs_mount_t *mp;
int error = 0;
+ struct xfs_ail_cursor cur;
+ struct xfs_ail *ailp;
- mp = log->l_mp;
- spin_lock(&mp->m_ail_lock);
-
- lip = xfs_trans_first_ail(mp, &gen);
+ ailp = log->l_ailp;
+ spin_lock(&ailp->xa_lock);
+ lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
while (lip != NULL) {
/*
* We're done when we see something other than an EFI.
+ * There should be no EFIs left in the AIL now.
*/
if (lip->li_type != XFS_LI_EFI) {
- xlog_recover_check_ail(mp, lip, gen);
+#ifdef DEBUG
+ for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
+ ASSERT(lip->li_type != XFS_LI_EFI);
+#endif
break;
}
@@ -3108,18 +3055,20 @@ xlog_recover_process_efis(
*/
efip = (xfs_efi_log_item_t *)lip;
if (efip->efi_flags & XFS_EFI_RECOVERED) {
- lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
+ lip = xfs_trans_ail_cursor_next(ailp, &cur);
continue;
}
- spin_unlock(&mp->m_ail_lock);
- error = xlog_recover_process_efi(mp, efip);
+ spin_unlock(&ailp->xa_lock);
+ error = xlog_recover_process_efi(log->l_mp, efip);
+ spin_lock(&ailp->xa_lock);
if (error)
- return error;
- spin_lock(&mp->m_ail_lock);
- lip = xfs_trans_next_ail(mp, lip, &gen, NULL);
+ goto out;
+ lip = xfs_trans_ail_cursor_next(ailp, &cur);
}
- spin_unlock(&mp->m_ail_lock);
+out:
+ xfs_trans_ail_cursor_done(ailp, &cur);
+ spin_unlock(&ailp->xa_lock);
return error;
}
@@ -3140,19 +3089,16 @@ xlog_recover_clear_agi_bucket(
int error;
tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
- error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp), 0, 0, 0);
- if (!error)
- error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
- XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), 0, &agibp);
+ error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp),
+ 0, 0, 0);
if (error)
goto out_abort;
- error = EINVAL;
- agi = XFS_BUF_TO_AGI(agibp);
- if (be32_to_cpu(agi->agi_magicnum) != XFS_AGI_MAGIC)
+ error = xfs_read_agi(mp, tp, agno, &agibp);
+ if (error)
goto out_abort;
+ agi = XFS_BUF_TO_AGI(agibp);
agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
offset = offsetof(xfs_agi_t, agi_unlinked) +
(sizeof(xfs_agino_t) * bucket);
@@ -3172,6 +3118,62 @@ out_error:
return;
}
+STATIC xfs_agino_t
+xlog_recover_process_one_iunlink(
+ struct xfs_mount *mp,
+ xfs_agnumber_t agno,
+ xfs_agino_t agino,
+ int bucket)
+{
+ struct xfs_buf *ibp;
+ struct xfs_dinode *dip;
+ struct xfs_inode *ip;
+ xfs_ino_t ino;
+ int error;
+
+ ino = XFS_AGINO_TO_INO(mp, agno, agino);
+ error = xfs_iget(mp, NULL, ino, 0, 0, &ip, 0);
+ if (error)
+ goto fail;
+
+ /*
+ * Get the on disk inode to find the next inode in the bucket.
+ */
+ error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XFS_BUF_LOCK);
+ if (error)
+ goto fail_iput;
+
+ ASSERT(ip->i_d.di_nlink == 0);
+ ASSERT(ip->i_d.di_mode != 0);
+
+ /* setup for the next pass */
+ agino = be32_to_cpu(dip->di_next_unlinked);
+ xfs_buf_relse(ibp);
+
+ /*
+ * Prevent any DMAPI event from being sent when the reference on
+ * the inode is dropped.
+ */
+ ip->i_d.di_dmevmask = 0;
+
+ IRELE(ip);
+ return agino;
+
+ fail_iput:
+ IRELE(ip);
+ fail:
+ /*
+ * We can't read in the inode this bucket points to, or this inode
+ * is messed up. Just ditch this bucket of inodes. We will lose
+ * some inodes and space, but at least we won't hang.
+ *
+ * Call xlog_recover_clear_agi_bucket() to perform a transaction to
+ * clear the inode pointer in the bucket.
+ */
+ xlog_recover_clear_agi_bucket(mp, agno, bucket);
+ return NULLAGINO;
+}
+
/*
* xlog_iunlink_recover
*
@@ -3192,11 +3194,7 @@ xlog_recover_process_iunlinks(
xfs_agnumber_t agno;
xfs_agi_t *agi;
xfs_buf_t *agibp;
- xfs_buf_t *ibp;
- xfs_dinode_t *dip;
- xfs_inode_t *ip;
xfs_agino_t agino;
- xfs_ino_t ino;
int bucket;
int error;
uint mp_dmevmask;
@@ -3213,22 +3211,21 @@ xlog_recover_process_iunlinks(
/*
* Find the agi for this ag.
*/
- agibp = xfs_buf_read(mp->m_ddev_targp,
- XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), 0);
- if (XFS_BUF_ISERROR(agibp)) {
- xfs_ioerror_alert("xlog_recover_process_iunlinks(#1)",
- log->l_mp, agibp,
- XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)));
+ error = xfs_read_agi(mp, NULL, agno, &agibp);
+ if (error) {
+ /*
+ * AGI is b0rked. Don't process it.
+ *
+ * We should probably mark the filesystem as corrupt
+ * after we've recovered all the AGs we can.
+ */
+ continue;
}
agi = XFS_BUF_TO_AGI(agibp);
- ASSERT(XFS_AGI_MAGIC == be32_to_cpu(agi->agi_magicnum));
for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
-
agino = be32_to_cpu(agi->agi_unlinked[bucket]);
while (agino != NULLAGINO) {
-
/*
* Release the agi buffer so that it can
* be acquired in the normal course of the
@@ -3236,87 +3233,17 @@ xlog_recover_process_iunlinks(
*/
xfs_buf_relse(agibp);
- ino = XFS_AGINO_TO_INO(mp, agno, agino);
- error = xfs_iget(mp, NULL, ino, 0, 0, &ip, 0);
- ASSERT(error || (ip != NULL));
-
- if (!error) {
- /*
- * Get the on disk inode to find the
- * next inode in the bucket.
- */
- error = xfs_itobp(mp, NULL, ip, &dip,
- &ibp, 0, 0,
- XFS_BUF_LOCK);
- ASSERT(error || (dip != NULL));
- }
-
- if (!error) {
- ASSERT(ip->i_d.di_nlink == 0);
-
- /* setup for the next pass */
- agino = be32_to_cpu(
- dip->di_next_unlinked);
- xfs_buf_relse(ibp);
- /*
- * Prevent any DMAPI event from
- * being sent when the
- * reference on the inode is
- * dropped.
- */
- ip->i_d.di_dmevmask = 0;
-
- /*
- * If this is a new inode, handle
- * it specially. Otherwise,
- * just drop our reference to the
- * inode. If there are no
- * other references, this will
- * send the inode to
- * xfs_inactive() which will
- * truncate the file and free
- * the inode.
- */
- if (ip->i_d.di_mode == 0)
- xfs_iput_new(ip, 0);
- else
- IRELE(ip);
- } else {
- /*
- * We can't read in the inode
- * this bucket points to, or
- * this inode is messed up. Just
- * ditch this bucket of inodes. We
- * will lose some inodes and space,
- * but at least we won't hang. Call
- * xlog_recover_clear_agi_bucket()
- * to perform a transaction to clear
- * the inode pointer in the bucket.
- */
- xlog_recover_clear_agi_bucket(mp, agno,
- bucket);
-
- agino = NULLAGINO;
- }
+ agino = xlog_recover_process_one_iunlink(mp,
+ agno, agino, bucket);
/*
* Reacquire the agibuffer and continue around
- * the loop.
+ * the loop. This should never fail as we know
+ * the buffer was good earlier on.
*/
- agibp = xfs_buf_read(mp->m_ddev_targp,
- XFS_AG_DADDR(mp, agno,
- XFS_AGI_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), 0);
- if (XFS_BUF_ISERROR(agibp)) {
- xfs_ioerror_alert(
- "xlog_recover_process_iunlinks(#2)",
- log->l_mp, agibp,
- XFS_AG_DADDR(mp, agno,
- XFS_AGI_DADDR(mp)));
- }
+ error = xfs_read_agi(mp, NULL, agno, &agibp);
+ ASSERT(error == 0);
agi = XFS_BUF_TO_AGI(agibp);
- ASSERT(XFS_AGI_MAGIC == be32_to_cpu(
- agi->agi_magicnum));
}
}
@@ -3367,7 +3294,6 @@ xlog_pack_data(
int size = iclog->ic_offset + roundoff;
__be32 cycle_lsn;
xfs_caddr_t dp;
- xlog_in_core_2_t *xhdr;
xlog_pack_data_checksum(log, iclog, size);
@@ -3382,7 +3308,8 @@ xlog_pack_data(
}
if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
- xhdr = (xlog_in_core_2_t *)&iclog->ic_header;
+ xlog_in_core_2_t *xhdr = iclog->ic_data;
+
for ( ; i < BTOBB(size); i++) {
j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
@@ -3440,7 +3367,6 @@ xlog_unpack_data(
xlog_t *log)
{
int i, j, k;
- xlog_in_core_2_t *xhdr;
for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
@@ -3449,7 +3375,7 @@ xlog_unpack_data(
}
if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
- xhdr = (xlog_in_core_2_t *)rhead;
+ xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
@@ -4003,11 +3929,8 @@ xlog_recover_check_summary(
{
xfs_mount_t *mp;
xfs_agf_t *agfp;
- xfs_agi_t *agip;
xfs_buf_t *agfbp;
xfs_buf_t *agibp;
- xfs_daddr_t agfdaddr;
- xfs_daddr_t agidaddr;
xfs_buf_t *sbbp;
#ifdef XFS_LOUD_RECOVERY
xfs_sb_t *sbp;
@@ -4016,6 +3939,7 @@ xlog_recover_check_summary(
__uint64_t freeblks;
__uint64_t itotal;
__uint64_t ifree;
+ int error;
mp = log->l_mp;
@@ -4023,37 +3947,27 @@ xlog_recover_check_summary(
itotal = 0LL;
ifree = 0LL;
for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
- agfdaddr = XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp));
- agfbp = xfs_buf_read(mp->m_ddev_targp, agfdaddr,
- XFS_FSS_TO_BB(mp, 1), 0);
- if (XFS_BUF_ISERROR(agfbp)) {
- xfs_ioerror_alert("xlog_recover_check_summary(agf)",
- mp, agfbp, agfdaddr);
- }
- agfp = XFS_BUF_TO_AGF(agfbp);
- ASSERT(XFS_AGF_MAGIC == be32_to_cpu(agfp->agf_magicnum));
- ASSERT(XFS_AGF_GOOD_VERSION(be32_to_cpu(agfp->agf_versionnum)));
- ASSERT(be32_to_cpu(agfp->agf_seqno) == agno);
-
- freeblks += be32_to_cpu(agfp->agf_freeblks) +
- be32_to_cpu(agfp->agf_flcount);
- xfs_buf_relse(agfbp);
-
- agidaddr = XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp));
- agibp = xfs_buf_read(mp->m_ddev_targp, agidaddr,
- XFS_FSS_TO_BB(mp, 1), 0);
- if (XFS_BUF_ISERROR(agibp)) {
- xfs_ioerror_alert("xlog_recover_check_summary(agi)",
- mp, agibp, agidaddr);
+ error = xfs_read_agf(mp, NULL, agno, 0, &agfbp);
+ if (error) {
+ xfs_fs_cmn_err(CE_ALERT, mp,
+ "xlog_recover_check_summary(agf)"
+ "agf read failed agno %d error %d",
+ agno, error);
+ } else {
+ agfp = XFS_BUF_TO_AGF(agfbp);
+ freeblks += be32_to_cpu(agfp->agf_freeblks) +
+ be32_to_cpu(agfp->agf_flcount);
+ xfs_buf_relse(agfbp);
}
- agip = XFS_BUF_TO_AGI(agibp);
- ASSERT(XFS_AGI_MAGIC == be32_to_cpu(agip->agi_magicnum));
- ASSERT(XFS_AGI_GOOD_VERSION(be32_to_cpu(agip->agi_versionnum)));
- ASSERT(be32_to_cpu(agip->agi_seqno) == agno);
- itotal += be32_to_cpu(agip->agi_count);
- ifree += be32_to_cpu(agip->agi_freecount);
- xfs_buf_relse(agibp);
+ error = xfs_read_agi(mp, NULL, agno, &agibp);
+ if (!error) {
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp);
+
+ itotal += be32_to_cpu(agi->agi_count);
+ ifree += be32_to_cpu(agi->agi_freecount);
+ xfs_buf_relse(agibp);
+ }
}
sbbp = xfs_getsb(mp, 0);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 15f5dd22fbb2..3c97c6463a4e 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -567,8 +567,6 @@ xfs_readsb(xfs_mount_t *mp, int flags)
STATIC void
xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
{
- int i;
-
mp->m_agfrotor = mp->m_agirotor = 0;
spin_lock_init(&mp->m_agirotor_lock);
mp->m_maxagi = mp->m_sb.sb_agcount;
@@ -577,12 +575,10 @@ xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
mp->m_sectbb_log = sbp->sb_sectlog - BBSHIFT;
mp->m_agno_log = xfs_highbit32(sbp->sb_agcount - 1) + 1;
mp->m_agino_log = sbp->sb_inopblog + sbp->sb_agblklog;
- mp->m_litino = sbp->sb_inodesize -
- ((uint)sizeof(xfs_dinode_core_t) + (uint)sizeof(xfs_agino_t));
+ mp->m_litino = sbp->sb_inodesize - sizeof(struct xfs_dinode);
mp->m_blockmask = sbp->sb_blocksize - 1;
mp->m_blockwsize = sbp->sb_blocksize >> XFS_WORDLOG;
mp->m_blockwmask = mp->m_blockwsize - 1;
- INIT_LIST_HEAD(&mp->m_del_inodes);
/*
* Setup for attributes, in case they get created.
@@ -605,24 +601,20 @@ xfs_mount_common(xfs_mount_t *mp, xfs_sb_t *sbp)
}
ASSERT(mp->m_attroffset < XFS_LITINO(mp));
- for (i = 0; i < 2; i++) {
- mp->m_alloc_mxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize,
- xfs_alloc, i == 0);
- mp->m_alloc_mnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize,
- xfs_alloc, i == 0);
- }
- for (i = 0; i < 2; i++) {
- mp->m_bmap_dmxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize,
- xfs_bmbt, i == 0);
- mp->m_bmap_dmnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize,
- xfs_bmbt, i == 0);
- }
- for (i = 0; i < 2; i++) {
- mp->m_inobt_mxr[i] = XFS_BTREE_BLOCK_MAXRECS(sbp->sb_blocksize,
- xfs_inobt, i == 0);
- mp->m_inobt_mnr[i] = XFS_BTREE_BLOCK_MINRECS(sbp->sb_blocksize,
- xfs_inobt, i == 0);
- }
+ mp->m_alloc_mxr[0] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 1);
+ mp->m_alloc_mxr[1] = xfs_allocbt_maxrecs(mp, sbp->sb_blocksize, 0);
+ mp->m_alloc_mnr[0] = mp->m_alloc_mxr[0] / 2;
+ mp->m_alloc_mnr[1] = mp->m_alloc_mxr[1] / 2;
+
+ mp->m_inobt_mxr[0] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 1);
+ mp->m_inobt_mxr[1] = xfs_inobt_maxrecs(mp, sbp->sb_blocksize, 0);
+ mp->m_inobt_mnr[0] = mp->m_inobt_mxr[0] / 2;
+ mp->m_inobt_mnr[1] = mp->m_inobt_mxr[1] / 2;
+
+ mp->m_bmap_dmxr[0] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 1);
+ mp->m_bmap_dmxr[1] = xfs_bmbt_maxrecs(mp, sbp->sb_blocksize, 0);
+ mp->m_bmap_dmnr[0] = mp->m_bmap_dmxr[0] / 2;
+ mp->m_bmap_dmnr[1] = mp->m_bmap_dmxr[1] / 2;
mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
@@ -1228,6 +1220,16 @@ xfs_unmountfs(
__uint64_t resblks;
int error;
+ /*
+ * Release dquot that rootinode, rbmino and rsumino might be holding,
+ * and release the quota inodes.
+ */
+ XFS_QM_UNMOUNT(mp);
+
+ if (mp->m_rbmip)
+ IRELE(mp->m_rbmip);
+ if (mp->m_rsumip)
+ IRELE(mp->m_rsumip);
IRELE(mp->m_rootip);
/*
@@ -1241,7 +1243,7 @@ xfs_unmountfs(
* need to force the log first.
*/
xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
- xfs_iflush_all(mp);
+ xfs_reclaim_inodes(mp, 0, XFS_IFLUSH_ASYNC);
XFS_QM_DQPURGEALL(mp, XFS_QMOPT_QUOTALL | XFS_QMOPT_UMOUNTING);
@@ -1288,11 +1290,6 @@ xfs_unmountfs(
xfs_unmountfs_wait(mp); /* wait for async bufs */
xfs_log_unmount(mp); /* Done! No more fs ops. */
- /*
- * All inodes from this mount point should be freed.
- */
- ASSERT(mp->m_inodes == NULL);
-
if ((mp->m_flags & XFS_MOUNT_NOUUID) == 0)
uuid_table_remove(&mp->m_sb.sb_uuid);
@@ -1365,24 +1362,6 @@ xfs_log_sbcount(
return error;
}
-STATIC void
-xfs_mark_shared_ro(
- xfs_mount_t *mp,
- xfs_buf_t *bp)
-{
- xfs_dsb_t *sb = XFS_BUF_TO_SBP(bp);
- __uint16_t version;
-
- if (!(sb->sb_flags & XFS_SBF_READONLY))
- sb->sb_flags |= XFS_SBF_READONLY;
-
- version = be16_to_cpu(sb->sb_versionnum);
- if ((version & XFS_SB_VERSION_NUMBITS) != XFS_SB_VERSION_4 ||
- !(version & XFS_SB_VERSION_SHAREDBIT))
- version |= XFS_SB_VERSION_SHAREDBIT;
- sb->sb_versionnum = cpu_to_be16(version);
-}
-
int
xfs_unmountfs_writesb(xfs_mount_t *mp)
{
@@ -1398,12 +1377,6 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
sbp = xfs_getsb(mp, 0);
- /*
- * mark shared-readonly if desired
- */
- if (mp->m_mk_sharedro)
- xfs_mark_shared_ro(mp, sbp);
-
XFS_BUF_UNDONE(sbp);
XFS_BUF_UNREAD(sbp);
XFS_BUF_UNDELAYWRITE(sbp);
@@ -1415,8 +1388,6 @@ xfs_unmountfs_writesb(xfs_mount_t *mp)
if (error)
xfs_ioerror_alert("xfs_unmountfs_writesb",
mp, sbp, XFS_BUF_ADDR(sbp));
- if (error && mp->m_mk_sharedro)
- xfs_fs_cmn_err(CE_ALERT, mp, "Superblock write error detected while unmounting. Filesystem may not be marked shared readonly");
xfs_buf_relse(sbp);
}
return error;
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index f3c1024b1241..c1e028467327 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -18,7 +18,6 @@
#ifndef __XFS_MOUNT_H__
#define __XFS_MOUNT_H__
-
typedef struct xfs_trans_reservations {
uint tr_write; /* extent alloc trans */
uint tr_itruncate; /* truncate trans */
@@ -44,14 +43,16 @@ typedef struct xfs_trans_reservations {
} xfs_trans_reservations_t;
#ifndef __KERNEL__
-/*
- * Moved here from xfs_ag.h to avoid reordering header files
- */
+
#define XFS_DADDR_TO_AGNO(mp,d) \
((xfs_agnumber_t)(XFS_BB_TO_FSBT(mp, d) / (mp)->m_sb.sb_agblocks))
#define XFS_DADDR_TO_AGBNO(mp,d) \
((xfs_agblock_t)(XFS_BB_TO_FSBT(mp, d) % (mp)->m_sb.sb_agblocks))
-#else
+
+#else /* __KERNEL__ */
+
+#include "xfs_sync.h"
+
struct cred;
struct log;
struct xfs_mount_args;
@@ -62,6 +63,7 @@ struct xfs_extdelta;
struct xfs_swapext;
struct xfs_mru_cache;
struct xfs_nameops;
+struct xfs_ail;
/*
* Prototypes and functions for the Data Migration subsystem.
@@ -115,7 +117,7 @@ struct xfs_quotainfo;
typedef int (*xfs_qminit_t)(struct xfs_mount *, uint *, uint *);
typedef int (*xfs_qmmount_t)(struct xfs_mount *, uint, uint);
-typedef int (*xfs_qmunmount_t)(struct xfs_mount *);
+typedef void (*xfs_qmunmount_t)(struct xfs_mount *);
typedef void (*xfs_qmdone_t)(struct xfs_mount *);
typedef void (*xfs_dqrele_t)(struct xfs_dquot *);
typedef int (*xfs_dqattach_t)(struct xfs_inode *, uint);
@@ -132,7 +134,7 @@ typedef struct xfs_dquot * (*xfs_dqvopchown_t)(
struct xfs_dquot **, struct xfs_dquot *);
typedef int (*xfs_dqvopchownresv_t)(struct xfs_trans *, struct xfs_inode *,
struct xfs_dquot *, struct xfs_dquot *, uint);
-typedef void (*xfs_dqstatvfs_t)(struct xfs_inode *, bhv_statvfs_t *);
+typedef void (*xfs_dqstatvfs_t)(struct xfs_inode *, struct kstatfs *);
typedef int (*xfs_dqsync_t)(struct xfs_mount *, int flags);
typedef int (*xfs_quotactl_t)(struct xfs_mount *, int, int, xfs_caddr_t);
@@ -223,18 +225,10 @@ extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int);
#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0)
#endif
-typedef struct xfs_ail {
- struct list_head xa_ail;
- uint xa_gen;
- struct task_struct *xa_task;
- xfs_lsn_t xa_target;
-} xfs_ail_t;
-
typedef struct xfs_mount {
struct super_block *m_super;
xfs_tid_t m_tid; /* next unused tid for fs */
- spinlock_t m_ail_lock; /* fs AIL mutex */
- xfs_ail_t m_ail; /* fs active log item list */
+ struct xfs_ail *m_ail; /* fs active log item list */
xfs_sb_t m_sb; /* copy of fs superblock */
spinlock_t m_sb_lock; /* sb counter lock */
struct xfs_buf *m_sb_bp; /* buffer for superblock */
@@ -247,10 +241,6 @@ typedef struct xfs_mount {
xfs_agnumber_t m_agirotor; /* last ag dir inode alloced */
spinlock_t m_agirotor_lock;/* .. and lock protecting it */
xfs_agnumber_t m_maxagi; /* highest inode alloc group */
- struct xfs_inode *m_inodes; /* active inode list */
- struct list_head m_del_inodes; /* inodes to reclaim */
- mutex_t m_ilock; /* inode list mutex */
- uint m_ireclaims; /* count of calls to reclaim*/
uint m_readio_log; /* min read size log bytes */
uint m_readio_blocks; /* min read size blocks */
uint m_writeio_log; /* min write size log bytes */
@@ -267,7 +257,6 @@ typedef struct xfs_mount {
xfs_buftarg_t *m_ddev_targp; /* saves taking the address */
xfs_buftarg_t *m_logdev_targp;/* ptr to log device */
xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */
- __uint8_t m_dircook_elog; /* log d-cookie entry bits */
__uint8_t m_blkbit_log; /* blocklog + NBBY */
__uint8_t m_blkbb_log; /* blocklog - BBSHIFT */
__uint8_t m_agno_log; /* log #ag's */
@@ -276,12 +265,12 @@ typedef struct xfs_mount {
uint m_blockmask; /* sb_blocksize-1 */
uint m_blockwsize; /* sb_blocksize in words */
uint m_blockwmask; /* blockwsize-1 */
- uint m_alloc_mxr[2]; /* XFS_ALLOC_BLOCK_MAXRECS */
- uint m_alloc_mnr[2]; /* XFS_ALLOC_BLOCK_MINRECS */
- uint m_bmap_dmxr[2]; /* XFS_BMAP_BLOCK_DMAXRECS */
- uint m_bmap_dmnr[2]; /* XFS_BMAP_BLOCK_DMINRECS */
- uint m_inobt_mxr[2]; /* XFS_INOBT_BLOCK_MAXRECS */
- uint m_inobt_mnr[2]; /* XFS_INOBT_BLOCK_MINRECS */
+ uint m_alloc_mxr[2]; /* max alloc btree records */
+ uint m_alloc_mnr[2]; /* min alloc btree records */
+ uint m_bmap_dmxr[2]; /* max bmap btree records */
+ uint m_bmap_dmnr[2]; /* min bmap btree records */
+ uint m_inobt_mxr[2]; /* max inobt btree records */
+ uint m_inobt_mnr[2]; /* min inobt btree records */
uint m_ag_maxlevels; /* XFS_AG_MAXLEVELS */
uint m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
uint m_in_maxlevels; /* XFS_IN_MAXLEVELS */
@@ -312,9 +301,6 @@ typedef struct xfs_mount {
int m_sinoalign; /* stripe unit inode alignment */
int m_attr_magicpct;/* 37% of the blocksize */
int m_dir_magicpct; /* 37% of the dir blocksize */
- __uint8_t m_mk_sharedro; /* mark shared ro on unmount */
- __uint8_t m_inode_quiesce;/* call quiesce on new inodes.
- field governed by m_ilock */
__uint8_t m_sectbb_log; /* sectlog - BBSHIFT */
const struct xfs_nameops *m_dirnameops; /* vector of dir name ops */
int m_dirblksize; /* directory block sz--bytes */
@@ -362,7 +348,6 @@ typedef struct xfs_mount {
#define XFS_MOUNT_ATTR2 (1ULL << 8) /* allow use of attr2 format */
#define XFS_MOUNT_GRPID (1ULL << 9) /* group-ID assigned from directory */
#define XFS_MOUNT_NORECOVERY (1ULL << 10) /* no recovery - dirty fs */
-#define XFS_MOUNT_SHARED (1ULL << 11) /* shared mount */
#define XFS_MOUNT_DFLT_IOSIZE (1ULL << 12) /* set default i/o size */
#define XFS_MOUNT_OSYNCISOSYNC (1ULL << 13) /* o_sync is REALLY o_sync */
/* osyncisdsync is now default*/
@@ -439,6 +424,16 @@ void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
#define xfs_force_shutdown(m,f) \
xfs_do_force_shutdown(m, f, __FILE__, __LINE__)
+#define SHUTDOWN_META_IO_ERROR 0x0001 /* write attempt to metadata failed */
+#define SHUTDOWN_LOG_IO_ERROR 0x0002 /* write attempt to the log failed */
+#define SHUTDOWN_FORCE_UMOUNT 0x0004 /* shutdown from a forced unmount */
+#define SHUTDOWN_CORRUPT_INCORE 0x0008 /* corrupt in-memory data structures */
+#define SHUTDOWN_REMOTE_REQ 0x0010 /* shutdown came from remote cell */
+#define SHUTDOWN_DEVICE_REQ 0x0020 /* failed all paths to the device */
+
+#define xfs_test_for_freeze(mp) ((mp)->m_super->s_frozen)
+#define xfs_wait_for_freeze(mp,l) vfs_check_frozen((mp)->m_super, (l))
+
/*
* Flags for xfs_mountfs
*/
@@ -508,14 +503,12 @@ typedef struct xfs_mod_sb {
#define XFS_MOUNT_ILOCK(mp) mutex_lock(&((mp)->m_ilock))
#define XFS_MOUNT_IUNLOCK(mp) mutex_unlock(&((mp)->m_ilock))
-extern void xfs_mod_sb(xfs_trans_t *, __int64_t);
extern int xfs_log_sbcount(xfs_mount_t *, uint);
extern int xfs_mountfs(xfs_mount_t *mp);
extern void xfs_mountfs_check_barriers(xfs_mount_t *mp);
extern void xfs_unmountfs(xfs_mount_t *);
extern int xfs_unmountfs_writesb(xfs_mount_t *);
-extern int xfs_unmount_flush(xfs_mount_t *, int);
extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int);
extern int xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t,
int64_t, int);
@@ -525,20 +518,20 @@ extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
extern int xfs_readsb(xfs_mount_t *, int);
extern void xfs_freesb(xfs_mount_t *);
extern int xfs_fs_writable(xfs_mount_t *);
-extern int xfs_syncsub(xfs_mount_t *, int, int *);
-extern int xfs_sync_inodes(xfs_mount_t *, int, int *);
-extern xfs_agnumber_t xfs_initialize_perag(xfs_mount_t *, xfs_agnumber_t);
-extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
-extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
extern int xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);
-extern int xfs_dmops_get(struct xfs_mount *, struct xfs_mount_args *);
+extern int xfs_dmops_get(struct xfs_mount *);
extern void xfs_dmops_put(struct xfs_mount *);
-extern int xfs_qmops_get(struct xfs_mount *, struct xfs_mount_args *);
+extern int xfs_qmops_get(struct xfs_mount *);
extern void xfs_qmops_put(struct xfs_mount *);
extern struct xfs_dmops xfs_dmcore_xfs;
#endif /* __KERNEL__ */
+extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
+extern xfs_agnumber_t xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t);
+extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
+extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
+
#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_qmops.c b/fs/xfs/xfs_qmops.c
index a294e58db8dd..27f80581520a 100644
--- a/fs/xfs/xfs_qmops.c
+++ b/fs/xfs/xfs_qmops.c
@@ -28,7 +28,6 @@
#include "xfs_mount.h"
#include "xfs_quota.h"
#include "xfs_error.h"
-#include "xfs_clnt.h"
STATIC struct xfs_dquot *
@@ -131,9 +130,9 @@ static struct xfs_qmops xfs_qmcore_stub = {
};
int
-xfs_qmops_get(struct xfs_mount *mp, struct xfs_mount_args *args)
+xfs_qmops_get(struct xfs_mount *mp)
{
- if (args->flags & (XFSMNT_UQUOTA | XFSMNT_PQUOTA | XFSMNT_GQUOTA)) {
+ if (XFS_IS_QUOTA_RUNNING(mp)) {
#ifdef CONFIG_XFS_QUOTA
mp->m_qm_ops = &xfs_qmcore_xfs;
#else
diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h
index 12c4ec775af8..48965ecaa155 100644
--- a/fs/xfs/xfs_quota.h
+++ b/fs/xfs/xfs_quota.h
@@ -84,11 +84,9 @@ typedef struct xfs_dqblk {
#define XFS_DQ_USER 0x0001 /* a user quota */
#define XFS_DQ_PROJ 0x0002 /* project quota */
#define XFS_DQ_GROUP 0x0004 /* a group quota */
-#define XFS_DQ_FLOCKED 0x0008 /* flush lock taken */
-#define XFS_DQ_DIRTY 0x0010 /* dquot is dirty */
-#define XFS_DQ_WANT 0x0020 /* for lookup/reclaim race */
-#define XFS_DQ_INACTIVE 0x0040 /* dq off mplist & hashlist */
-#define XFS_DQ_MARKER 0x0080 /* sentinel */
+#define XFS_DQ_DIRTY 0x0008 /* dquot is dirty */
+#define XFS_DQ_WANT 0x0010 /* for lookup/reclaim race */
+#define XFS_DQ_INACTIVE 0x0020 /* dq off mplist & hashlist */
#define XFS_DQ_ALLTYPES (XFS_DQ_USER|XFS_DQ_PROJ|XFS_DQ_GROUP)
diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c
index c903130be7fd..86471bb40fd4 100644
--- a/fs/xfs/xfs_rename.c
+++ b/fs/xfs/xfs_rename.c
@@ -42,31 +42,6 @@
/*
- * Given an array of up to 4 inode pointers, unlock the pointed to inodes.
- * If there are fewer than 4 entries in the array, the empty entries will
- * be at the end and will have NULL pointers in them.
- */
-STATIC void
-xfs_rename_unlock4(
- xfs_inode_t **i_tab,
- uint lock_mode)
-{
- int i;
-
- xfs_iunlock(i_tab[0], lock_mode);
- for (i = 1; i < 4; i++) {
- if (i_tab[i] == NULL)
- break;
-
- /*
- * Watch out for duplicate entries in the table.
- */
- if (i_tab[i] != i_tab[i-1])
- xfs_iunlock(i_tab[i], lock_mode);
- }
-}
-
-/*
* Enter all inodes for a rename transaction into a sorted array.
*/
STATIC void
@@ -205,19 +180,6 @@ xfs_rename(
xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL);
/*
- * If we are using project inheritance, we only allow renames
- * into our tree when the project IDs are the same; else the
- * tree quota mechanism would be circumvented.
- */
- if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
- (target_dp->i_d.di_projid != src_ip->i_d.di_projid))) {
- error = XFS_ERROR(EXDEV);
- xfs_rename_unlock4(inodes, XFS_ILOCK_EXCL);
- xfs_trans_cancel(tp, cancel_flags);
- goto std_return;
- }
-
- /*
* Join all the inodes to the transaction. From this point on,
* we can rely on either trans_commit or trans_cancel to unlock
* them. Note that we need to add a vnode reference to the
@@ -242,6 +204,17 @@ xfs_rename(
}
/*
+ * If we are using project inheritance, we only allow renames
+ * into our tree when the project IDs are the same; else the
+ * tree quota mechanism would be circumvented.
+ */
+ if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) &&
+ (target_dp->i_d.di_projid != src_ip->i_d.di_projid))) {
+ error = XFS_ERROR(EXDEV);
+ goto error_return;
+ }
+
+ /*
* Set up the target.
*/
if (target_ip == NULL) {
@@ -367,19 +340,11 @@ xfs_rename(
&first_block, &free_list, spaceres);
if (error)
goto abort_return;
- xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
- /*
- * Update the generation counts on all the directory inodes
- * that we're modifying.
- */
- src_dp->i_gen++;
+ xfs_ichgtime(src_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
xfs_trans_log_inode(tp, src_dp, XFS_ILOG_CORE);
-
- if (new_parent) {
- target_dp->i_gen++;
+ if (new_parent)
xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);
- }
/*
* If this is a synchronous mount, make sure that the
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index e2f68de16159..edf12c7b834c 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -85,7 +85,6 @@ xfs_growfs_rt_alloc(
{
xfs_fileoff_t bno; /* block number in file */
xfs_buf_t *bp; /* temporary buffer for zeroing */
- int cancelflags; /* flags for xfs_trans_cancel */
int committed; /* transaction committed flag */
xfs_daddr_t d; /* disk block address */
int error; /* error return value */
@@ -96,15 +95,16 @@ xfs_growfs_rt_alloc(
xfs_bmbt_irec_t map; /* block map output */
int nmap; /* number of block maps */
int resblks; /* space reservation */
- xfs_trans_t *tp; /* transaction pointer */
/*
* Allocate space to the file, as necessary.
*/
while (oblocks < nblocks) {
+ int cancelflags = 0;
+ xfs_trans_t *tp;
+
tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC);
resblks = XFS_GROWFSRT_SPACE_RES(mp, nblocks - oblocks);
- cancelflags = 0;
/*
* Reserve space & log for one extent added to the file.
*/
@@ -171,7 +171,9 @@ xfs_growfs_rt_alloc(
mp->m_bsize, 0);
if (bp == NULL) {
error = XFS_ERROR(EIO);
- goto error_cancel;
+error_cancel:
+ xfs_trans_cancel(tp, cancelflags);
+ goto error;
}
memset(XFS_BUF_PTR(bp), 0, mp->m_sb.sb_blocksize);
xfs_trans_log_buf(tp, bp, 0, mp->m_sb.sb_blocksize - 1);
@@ -188,8 +190,6 @@ xfs_growfs_rt_alloc(
oblocks = map.br_startoff + map.br_blockcount;
}
return 0;
-error_cancel:
- xfs_trans_cancel(tp, cancelflags);
error:
return error;
}
@@ -1856,7 +1856,6 @@ xfs_growfs_rt(
{
xfs_rtblock_t bmbno; /* bitmap block number */
xfs_buf_t *bp; /* temporary buffer */
- int cancelflags; /* flags for xfs_trans_cancel */
int error; /* error return value */
xfs_inode_t *ip; /* bitmap inode, used as lock */
xfs_mount_t *nmp; /* new (fake) mount structure */
@@ -1872,13 +1871,13 @@ xfs_growfs_rt(
xfs_extlen_t rsumblocks; /* current number of rt summary blks */
xfs_sb_t *sbp; /* old superblock */
xfs_fsblock_t sumbno; /* summary block number */
- xfs_trans_t *tp; /* transaction pointer */
sbp = &mp->m_sb;
- cancelflags = 0;
/*
* Initial error checking.
*/
+ if (!capable(CAP_SYS_ADMIN))
+ return XFS_ERROR(EPERM);
if (mp->m_rtdev_targp == NULL || mp->m_rbmip == NULL ||
(nrblocks = in->newblocks) <= sbp->sb_rblocks ||
(sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize)))
@@ -1942,6 +1941,9 @@ xfs_growfs_rt(
((sbp->sb_rextents & ((1 << mp->m_blkbit_log) - 1)) != 0);
bmbno < nrbmblocks;
bmbno++) {
+ xfs_trans_t *tp;
+ int cancelflags = 0;
+
*nmp = *mp;
nsbp = &nmp->m_sb;
/*
@@ -1967,16 +1969,15 @@ xfs_growfs_rt(
* Start a transaction, get the log reservation.
*/
tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_FREE);
- cancelflags = 0;
if ((error = xfs_trans_reserve(tp, 0,
XFS_GROWRTFREE_LOG_RES(nmp), 0, 0, 0)))
- break;
+ goto error_cancel;
/*
* Lock out other callers by grabbing the bitmap inode lock.
*/
if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rbmino, 0,
XFS_ILOCK_EXCL, &ip)))
- break;
+ goto error_cancel;
ASSERT(ip == mp->m_rbmip);
/*
* Update the bitmap inode's size.
@@ -1990,7 +1991,7 @@ xfs_growfs_rt(
*/
if ((error = xfs_trans_iget(mp, tp, mp->m_sb.sb_rsumino, 0,
XFS_ILOCK_EXCL, &ip)))
- break;
+ goto error_cancel;
ASSERT(ip == mp->m_rsumip);
/*
* Update the summary inode's size.
@@ -2005,7 +2006,7 @@ xfs_growfs_rt(
mp->m_rsumlevels != nmp->m_rsumlevels) {
error = xfs_rtcopy_summary(mp, nmp, tp);
if (error)
- break;
+ goto error_cancel;
}
/*
* Update superblock fields.
@@ -2031,8 +2032,11 @@ xfs_growfs_rt(
bp = NULL;
error = xfs_rtfree_range(nmp, tp, sbp->sb_rextents,
nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno);
- if (error)
+ if (error) {
+error_cancel:
+ xfs_trans_cancel(tp, cancelflags);
break;
+ }
/*
* Mark more blocks free in the superblock.
*/
@@ -2045,15 +2049,10 @@ xfs_growfs_rt(
mp->m_rsumsize = nrsumsize;
error = xfs_trans_commit(tp, 0);
- if (error) {
- tp = NULL;
+ if (error)
break;
- }
}
- if (error && tp)
- xfs_trans_cancel(tp, cancelflags);
-
/*
* Free the fake mp structure.
*/
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c
index 3a82576dde9a..36f3a21c54d2 100644
--- a/fs/xfs/xfs_rw.c
+++ b/fs/xfs/xfs_rw.c
@@ -406,7 +406,7 @@ xfs_bwrite(
* XXXsup how does this work for quotas.
*/
XFS_BUF_SET_BDSTRAT_FUNC(bp, xfs_bdstrat_cb);
- XFS_BUF_SET_FSPRIVATE3(bp, mp);
+ bp->b_mount = mp;
XFS_BUF_WRITE(bp);
if ((error = XFS_bwrite(bp))) {
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index 3f8cf1587f4c..1ed71916e4c9 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -79,6 +79,7 @@ struct xfs_mount;
#define XFS_SB_VERSION2_LAZYSBCOUNTBIT 0x00000002 /* Superblk counters */
#define XFS_SB_VERSION2_RESERVED4BIT 0x00000004
#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */
+#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */
#define XFS_SB_VERSION2_OKREALFBITS \
(XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
@@ -296,30 +297,34 @@ typedef enum {
#define XFS_SB_VERSION_NUM(sbp) ((sbp)->sb_versionnum & XFS_SB_VERSION_NUMBITS)
-#ifdef __KERNEL__
static inline int xfs_sb_good_version(xfs_sb_t *sbp)
{
- return (((sbp->sb_versionnum >= XFS_SB_VERSION_1) && \
- (sbp->sb_versionnum <= XFS_SB_VERSION_3)) || \
- ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
- !((sbp->sb_versionnum & ~XFS_SB_VERSION_OKREALBITS) || \
- ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && \
- (sbp->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS))) && \
- (sbp->sb_shared_vn <= XFS_SB_MAX_SHARED_VN)));
-}
+ /* We always support version 1-3 */
+ if (sbp->sb_versionnum >= XFS_SB_VERSION_1 &&
+ sbp->sb_versionnum <= XFS_SB_VERSION_3)
+ return 1;
+
+ /* We support version 4 if all feature bits are supported */
+ if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) {
+ if ((sbp->sb_versionnum & ~XFS_SB_VERSION_OKREALBITS) ||
+ ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) &&
+ (sbp->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS)))
+ return 0;
+
+#ifdef __KERNEL__
+ if (sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN)
+ return 0;
#else
-static inline int xfs_sb_good_version(xfs_sb_t *sbp)
-{
- return (((sbp->sb_versionnum >= XFS_SB_VERSION_1) && \
- (sbp->sb_versionnum <= XFS_SB_VERSION_3)) || \
- ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
- !((sbp->sb_versionnum & ~XFS_SB_VERSION_OKREALBITS) || \
- ((sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT) && \
- (sbp->sb_features2 & ~XFS_SB_VERSION2_OKREALBITS))) && \
- (!(sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT) || \
- (sbp->sb_shared_vn <= XFS_SB_MAX_SHARED_VN))));
+ if ((sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT) &&
+ sbp->sb_shared_vn > XFS_SB_MAX_SHARED_VN)
+ return 0;
+#endif
+
+ return 1;
+ }
+
+ return 0;
}
-#endif /* __KERNEL__ */
/*
* Detect a mismatched features2 field. Older kernels read/wrote
@@ -332,123 +337,127 @@ static inline int xfs_sb_has_mismatched_features2(xfs_sb_t *sbp)
static inline unsigned xfs_sb_version_tonew(unsigned v)
{
- return ((((v) == XFS_SB_VERSION_1) ? \
- 0 : \
- (((v) == XFS_SB_VERSION_2) ? \
- XFS_SB_VERSION_ATTRBIT : \
- (XFS_SB_VERSION_ATTRBIT | XFS_SB_VERSION_NLINKBIT))) | \
- XFS_SB_VERSION_4);
+ if (v == XFS_SB_VERSION_1)
+ return XFS_SB_VERSION_4;
+
+ if (v == XFS_SB_VERSION_2)
+ return XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT;
+
+ return XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT |
+ XFS_SB_VERSION_NLINKBIT;
}
static inline unsigned xfs_sb_version_toold(unsigned v)
{
- return (((v) & (XFS_SB_VERSION_QUOTABIT | XFS_SB_VERSION_ALIGNBIT)) ? \
- 0 : \
- (((v) & XFS_SB_VERSION_NLINKBIT) ? \
- XFS_SB_VERSION_3 : \
- (((v) & XFS_SB_VERSION_ATTRBIT) ? \
- XFS_SB_VERSION_2 : \
- XFS_SB_VERSION_1)));
+ if (v & (XFS_SB_VERSION_QUOTABIT | XFS_SB_VERSION_ALIGNBIT))
+ return 0;
+ if (v & XFS_SB_VERSION_NLINKBIT)
+ return XFS_SB_VERSION_3;
+ if (v & XFS_SB_VERSION_ATTRBIT)
+ return XFS_SB_VERSION_2;
+ return XFS_SB_VERSION_1;
}
static inline int xfs_sb_version_hasattr(xfs_sb_t *sbp)
{
- return ((sbp)->sb_versionnum == XFS_SB_VERSION_2) || \
- ((sbp)->sb_versionnum == XFS_SB_VERSION_3) || \
- ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
- ((sbp)->sb_versionnum & XFS_SB_VERSION_ATTRBIT));
+ return sbp->sb_versionnum == XFS_SB_VERSION_2 ||
+ sbp->sb_versionnum == XFS_SB_VERSION_3 ||
+ (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+ (sbp->sb_versionnum & XFS_SB_VERSION_ATTRBIT));
}
static inline void xfs_sb_version_addattr(xfs_sb_t *sbp)
{
- (sbp)->sb_versionnum = (((sbp)->sb_versionnum == XFS_SB_VERSION_1) ? \
- XFS_SB_VERSION_2 : \
- ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) ? \
- ((sbp)->sb_versionnum | XFS_SB_VERSION_ATTRBIT) : \
- (XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT)));
+ if (sbp->sb_versionnum == XFS_SB_VERSION_1)
+ sbp->sb_versionnum = XFS_SB_VERSION_2;
+ else if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4)
+ sbp->sb_versionnum |= XFS_SB_VERSION_ATTRBIT;
+ else
+ sbp->sb_versionnum = XFS_SB_VERSION_4 | XFS_SB_VERSION_ATTRBIT;
}
static inline int xfs_sb_version_hasnlink(xfs_sb_t *sbp)
{
- return ((sbp)->sb_versionnum == XFS_SB_VERSION_3) || \
- ((XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
- ((sbp)->sb_versionnum & XFS_SB_VERSION_NLINKBIT));
+ return sbp->sb_versionnum == XFS_SB_VERSION_3 ||
+ (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+ (sbp->sb_versionnum & XFS_SB_VERSION_NLINKBIT));
}
static inline void xfs_sb_version_addnlink(xfs_sb_t *sbp)
{
- (sbp)->sb_versionnum = ((sbp)->sb_versionnum <= XFS_SB_VERSION_2 ? \
- XFS_SB_VERSION_3 : \
- ((sbp)->sb_versionnum | XFS_SB_VERSION_NLINKBIT));
+ if (sbp->sb_versionnum <= XFS_SB_VERSION_2)
+ sbp->sb_versionnum = XFS_SB_VERSION_3;
+ else
+ sbp->sb_versionnum |= XFS_SB_VERSION_NLINKBIT;
}
static inline int xfs_sb_version_hasquota(xfs_sb_t *sbp)
{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
- ((sbp)->sb_versionnum & XFS_SB_VERSION_QUOTABIT);
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+ (sbp->sb_versionnum & XFS_SB_VERSION_QUOTABIT);
}
static inline void xfs_sb_version_addquota(xfs_sb_t *sbp)
{
- (sbp)->sb_versionnum = \
- (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 ? \
- ((sbp)->sb_versionnum | XFS_SB_VERSION_QUOTABIT) : \
- (xfs_sb_version_tonew((sbp)->sb_versionnum) | \
- XFS_SB_VERSION_QUOTABIT));
+ if (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4)
+ sbp->sb_versionnum |= XFS_SB_VERSION_QUOTABIT;
+ else
+ sbp->sb_versionnum = xfs_sb_version_tonew(sbp->sb_versionnum) |
+ XFS_SB_VERSION_QUOTABIT;
}
static inline int xfs_sb_version_hasalign(xfs_sb_t *sbp)
{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
- ((sbp)->sb_versionnum & XFS_SB_VERSION_ALIGNBIT);
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+ (sbp->sb_versionnum & XFS_SB_VERSION_ALIGNBIT);
}
static inline int xfs_sb_version_hasdalign(xfs_sb_t *sbp)
{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
- ((sbp)->sb_versionnum & XFS_SB_VERSION_DALIGNBIT);
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+ (sbp->sb_versionnum & XFS_SB_VERSION_DALIGNBIT);
}
static inline int xfs_sb_version_hasshared(xfs_sb_t *sbp)
{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
- ((sbp)->sb_versionnum & XFS_SB_VERSION_SHAREDBIT);
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+ (sbp->sb_versionnum & XFS_SB_VERSION_SHAREDBIT);
}
static inline int xfs_sb_version_hasdirv2(xfs_sb_t *sbp)
{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
- ((sbp)->sb_versionnum & XFS_SB_VERSION_DIRV2BIT);
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+ (sbp->sb_versionnum & XFS_SB_VERSION_DIRV2BIT);
}
static inline int xfs_sb_version_haslogv2(xfs_sb_t *sbp)
{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
- ((sbp)->sb_versionnum & XFS_SB_VERSION_LOGV2BIT);
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+ (sbp->sb_versionnum & XFS_SB_VERSION_LOGV2BIT);
}
static inline int xfs_sb_version_hasextflgbit(xfs_sb_t *sbp)
{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
- ((sbp)->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT);
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+ (sbp->sb_versionnum & XFS_SB_VERSION_EXTFLGBIT);
}
static inline int xfs_sb_version_hassector(xfs_sb_t *sbp)
{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
- ((sbp)->sb_versionnum & XFS_SB_VERSION_SECTORBIT);
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+ (sbp->sb_versionnum & XFS_SB_VERSION_SECTORBIT);
}
static inline int xfs_sb_version_hasasciici(xfs_sb_t *sbp)
{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
(sbp->sb_versionnum & XFS_SB_VERSION_BORGBIT);
}
static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp)
{
- return (XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4) && \
- ((sbp)->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT);
+ return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_4 &&
+ (sbp->sb_versionnum & XFS_SB_VERSION_MOREBITSBIT);
}
/*
@@ -463,22 +472,20 @@ static inline int xfs_sb_version_hasmorebits(xfs_sb_t *sbp)
static inline int xfs_sb_version_haslazysbcount(xfs_sb_t *sbp)
{
- return (xfs_sb_version_hasmorebits(sbp) && \
- ((sbp)->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT));
+ return xfs_sb_version_hasmorebits(sbp) &&
+ (sbp->sb_features2 & XFS_SB_VERSION2_LAZYSBCOUNTBIT);
}
static inline int xfs_sb_version_hasattr2(xfs_sb_t *sbp)
{
- return (xfs_sb_version_hasmorebits(sbp)) && \
- ((sbp)->sb_features2 & XFS_SB_VERSION2_ATTR2BIT);
+ return xfs_sb_version_hasmorebits(sbp) &&
+ (sbp->sb_features2 & XFS_SB_VERSION2_ATTR2BIT);
}
static inline void xfs_sb_version_addattr2(xfs_sb_t *sbp)
{
- ((sbp)->sb_versionnum = \
- ((sbp)->sb_versionnum | XFS_SB_VERSION_MOREBITSBIT), \
- ((sbp)->sb_features2 = \
- ((sbp)->sb_features2 | XFS_SB_VERSION2_ATTR2BIT)));
+ sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
+ sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT;
}
static inline void xfs_sb_version_removeattr2(xfs_sb_t *sbp)
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index 4e1c22a23be5..8570b826fedd 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -290,7 +290,7 @@ xfs_trans_dup(
ASSERT(tp->t_ticket != NULL);
ntp->t_flags = XFS_TRANS_PERM_LOG_RES | (tp->t_flags & XFS_TRANS_RESERVE);
- ntp->t_ticket = tp->t_ticket;
+ ntp->t_ticket = xfs_log_ticket_get(tp->t_ticket);
ntp->t_blk_res = tp->t_blk_res - tp->t_blk_res_used;
tp->t_blk_res = tp->t_blk_res_used;
ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used;
@@ -1260,6 +1260,13 @@ xfs_trans_roll(
trans = *tpp;
/*
+ * transaction commit worked ok so we can drop the extra ticket
+ * reference that we gained in xfs_trans_dup()
+ */
+ xfs_log_ticket_put(trans->t_ticket);
+
+
+ /*
* Reserve space in the log for the next transaction.
* This also pushes items in the "AIL", the list of logged items,
* out to disk if they are taking up space at the tail of the log
@@ -1383,11 +1390,12 @@ xfs_trans_chunk_committed(
xfs_log_item_desc_t *lidp;
xfs_log_item_t *lip;
xfs_lsn_t item_lsn;
- struct xfs_mount *mp;
int i;
lidp = licp->lic_descs;
for (i = 0; i < licp->lic_unused; i++, lidp++) {
+ struct xfs_ail *ailp;
+
if (xfs_lic_isfree(licp, i)) {
continue;
}
@@ -1424,19 +1432,19 @@ xfs_trans_chunk_committed(
* This would cause the earlier transaction to fail
* the test below.
*/
- mp = lip->li_mountp;
- spin_lock(&mp->m_ail_lock);
+ ailp = lip->li_ailp;
+ spin_lock(&ailp->xa_lock);
if (XFS_LSN_CMP(item_lsn, lip->li_lsn) > 0) {
/*
* This will set the item's lsn to item_lsn
* and update the position of the item in
* the AIL.
*
- * xfs_trans_update_ail() drops the AIL lock.
+ * xfs_trans_ail_update() drops the AIL lock.
*/
- xfs_trans_update_ail(mp, lip, item_lsn);
+ xfs_trans_ail_update(ailp, lip, item_lsn);
} else {
- spin_unlock(&mp->m_ail_lock);
+ spin_unlock(&ailp->xa_lock);
}
/*
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 74c80bd2b0ec..d6fe4a88d79f 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -18,6 +18,8 @@
#ifndef __XFS_TRANS_H__
#define __XFS_TRANS_H__
+struct xfs_log_item;
+
/*
* This is the structure written in the log at the head of
* every transaction. It identifies the type and id of the
@@ -98,76 +100,6 @@ typedef struct xfs_trans_header {
#define XFS_TRANS_TYPE_MAX 41
/* new transaction types need to be reflected in xfs_logprint(8) */
-
-#ifdef __KERNEL__
-struct xfs_buf;
-struct xfs_buftarg;
-struct xfs_efd_log_item;
-struct xfs_efi_log_item;
-struct xfs_inode;
-struct xfs_item_ops;
-struct xfs_log_iovec;
-struct xfs_log_item;
-struct xfs_log_item_desc;
-struct xfs_mount;
-struct xfs_trans;
-struct xfs_dquot_acct;
-
-typedef struct xfs_log_item {
- struct list_head li_ail; /* AIL pointers */
- xfs_lsn_t li_lsn; /* last on-disk lsn */
- struct xfs_log_item_desc *li_desc; /* ptr to current desc*/
- struct xfs_mount *li_mountp; /* ptr to fs mount */
- uint li_type; /* item type */
- uint li_flags; /* misc flags */
- struct xfs_log_item *li_bio_list; /* buffer item list */
- void (*li_cb)(struct xfs_buf *,
- struct xfs_log_item *);
- /* buffer item iodone */
- /* callback func */
- struct xfs_item_ops *li_ops; /* function list */
-} xfs_log_item_t;
-
-#define XFS_LI_IN_AIL 0x1
-#define XFS_LI_ABORTED 0x2
-
-typedef struct xfs_item_ops {
- uint (*iop_size)(xfs_log_item_t *);
- void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
- void (*iop_pin)(xfs_log_item_t *);
- void (*iop_unpin)(xfs_log_item_t *, int);
- void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *);
- uint (*iop_trylock)(xfs_log_item_t *);
- void (*iop_unlock)(xfs_log_item_t *);
- xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
- void (*iop_push)(xfs_log_item_t *);
- void (*iop_pushbuf)(xfs_log_item_t *);
- void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
-} xfs_item_ops_t;
-
-#define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip)
-#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp)
-#define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip)
-#define IOP_UNPIN(ip, flags) (*(ip)->li_ops->iop_unpin)(ip, flags)
-#define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp)
-#define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip)
-#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip)
-#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn)
-#define IOP_PUSH(ip) (*(ip)->li_ops->iop_push)(ip)
-#define IOP_PUSHBUF(ip) (*(ip)->li_ops->iop_pushbuf)(ip)
-#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn)
-
-/*
- * Return values for the IOP_TRYLOCK() routines.
- */
-#define XFS_ITEM_SUCCESS 0
-#define XFS_ITEM_PINNED 1
-#define XFS_ITEM_LOCKED 2
-#define XFS_ITEM_FLUSHING 3
-#define XFS_ITEM_PUSHBUF 4
-
-#endif /* __KERNEL__ */
-
/*
* This structure is used to track log items associated with
* a transaction. It points to the log item and keeps some
@@ -176,7 +108,7 @@ typedef struct xfs_item_ops {
* once we get to commit processing (see xfs_trans_commit()).
*/
typedef struct xfs_log_item_desc {
- xfs_log_item_t *lid_item;
+ struct xfs_log_item *lid_item;
ushort lid_size;
unsigned char lid_flags;
unsigned char lid_index;
@@ -276,94 +208,6 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
(xfs_caddr_t)(((xfs_log_item_chunk_t*)0)->lic_descs));
}
-#ifdef __KERNEL__
-/*
- * This structure is used to maintain a list of block ranges that have been
- * freed in the transaction. The ranges are listed in the perag[] busy list
- * between when they're freed and the transaction is committed to disk.
- */
-
-typedef struct xfs_log_busy_slot {
- xfs_agnumber_t lbc_ag;
- ushort lbc_idx; /* index in perag.busy[] */
-} xfs_log_busy_slot_t;
-
-#define XFS_LBC_NUM_SLOTS 31
-typedef struct xfs_log_busy_chunk {
- struct xfs_log_busy_chunk *lbc_next;
- uint lbc_free; /* free slots bitmask */
- ushort lbc_unused; /* first unused */
- xfs_log_busy_slot_t lbc_busy[XFS_LBC_NUM_SLOTS];
-} xfs_log_busy_chunk_t;
-
-#define XFS_LBC_MAX_SLOT (XFS_LBC_NUM_SLOTS - 1)
-#define XFS_LBC_FREEMASK ((1U << XFS_LBC_NUM_SLOTS) - 1)
-
-#define XFS_LBC_INIT(cp) ((cp)->lbc_free = XFS_LBC_FREEMASK)
-#define XFS_LBC_CLAIM(cp, slot) ((cp)->lbc_free &= ~(1 << (slot)))
-#define XFS_LBC_SLOT(cp, slot) (&((cp)->lbc_busy[(slot)]))
-#define XFS_LBC_VACANCY(cp) (((cp)->lbc_free) & XFS_LBC_FREEMASK)
-#define XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot)))
-
-/*
- * This is the type of function which can be given to xfs_trans_callback()
- * to be called upon the transaction's commit to disk.
- */
-typedef void (*xfs_trans_callback_t)(struct xfs_trans *, void *);
-
-/*
- * This is the structure maintained for every active transaction.
- */
-typedef struct xfs_trans {
- unsigned int t_magic; /* magic number */
- xfs_log_callback_t t_logcb; /* log callback struct */
- unsigned int t_type; /* transaction type */
- unsigned int t_log_res; /* amt of log space resvd */
- unsigned int t_log_count; /* count for perm log res */
- unsigned int t_blk_res; /* # of blocks resvd */
- unsigned int t_blk_res_used; /* # of resvd blocks used */
- unsigned int t_rtx_res; /* # of rt extents resvd */
- unsigned int t_rtx_res_used; /* # of resvd rt extents used */
- xfs_log_ticket_t t_ticket; /* log mgr ticket */
- xfs_lsn_t t_lsn; /* log seq num of start of
- * transaction. */
- xfs_lsn_t t_commit_lsn; /* log seq num of end of
- * transaction. */
- struct xfs_mount *t_mountp; /* ptr to fs mount struct */
- struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */
- xfs_trans_callback_t t_callback; /* transaction callback */
- void *t_callarg; /* callback arg */
- unsigned int t_flags; /* misc flags */
- int64_t t_icount_delta; /* superblock icount change */
- int64_t t_ifree_delta; /* superblock ifree change */
- int64_t t_fdblocks_delta; /* superblock fdblocks chg */
- int64_t t_res_fdblocks_delta; /* on-disk only chg */
- int64_t t_frextents_delta;/* superblock freextents chg*/
- int64_t t_res_frextents_delta; /* on-disk only chg */
-#ifdef DEBUG
- int64_t t_ag_freeblks_delta; /* debugging counter */
- int64_t t_ag_flist_delta; /* debugging counter */
- int64_t t_ag_btree_delta; /* debugging counter */
-#endif
- int64_t t_dblocks_delta;/* superblock dblocks change */
- int64_t t_agcount_delta;/* superblock agcount change */
- int64_t t_imaxpct_delta;/* superblock imaxpct change */
- int64_t t_rextsize_delta;/* superblock rextsize chg */
- int64_t t_rbmblocks_delta;/* superblock rbmblocks chg */
- int64_t t_rblocks_delta;/* superblock rblocks change */
- int64_t t_rextents_delta;/* superblocks rextents chg */
- int64_t t_rextslog_delta;/* superblocks rextslog chg */
- unsigned int t_items_free; /* log item descs free */
- xfs_log_item_chunk_t t_items; /* first log item desc chunk */
- xfs_trans_header_t t_header; /* header for in-log trans */
- unsigned int t_busy_free; /* busy descs free */
- xfs_log_busy_chunk_t t_busy; /* busy/async free blocks */
- unsigned long t_pflags; /* saved process flags state */
-} xfs_trans_t;
-
-#endif /* __KERNEL__ */
-
-
#define XFS_TRANS_MAGIC 0x5452414E /* 'TRAN' */
/*
* Values for t_flags.
@@ -906,6 +750,157 @@ typedef struct xfs_trans {
#define XFS_DQUOT_REF 1
#ifdef __KERNEL__
+
+struct xfs_buf;
+struct xfs_buftarg;
+struct xfs_efd_log_item;
+struct xfs_efi_log_item;
+struct xfs_inode;
+struct xfs_item_ops;
+struct xfs_log_iovec;
+struct xfs_log_item_desc;
+struct xfs_mount;
+struct xfs_trans;
+struct xfs_dquot_acct;
+
+typedef struct xfs_log_item {
+ struct list_head li_ail; /* AIL pointers */
+ xfs_lsn_t li_lsn; /* last on-disk lsn */
+ struct xfs_log_item_desc *li_desc; /* ptr to current desc*/
+ struct xfs_mount *li_mountp; /* ptr to fs mount */
+ struct xfs_ail *li_ailp; /* ptr to AIL */
+ uint li_type; /* item type */
+ uint li_flags; /* misc flags */
+ struct xfs_log_item *li_bio_list; /* buffer item list */
+ void (*li_cb)(struct xfs_buf *,
+ struct xfs_log_item *);
+ /* buffer item iodone */
+ /* callback func */
+ struct xfs_item_ops *li_ops; /* function list */
+} xfs_log_item_t;
+
+#define XFS_LI_IN_AIL 0x1
+#define XFS_LI_ABORTED 0x2
+
+typedef struct xfs_item_ops {
+ uint (*iop_size)(xfs_log_item_t *);
+ void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
+ void (*iop_pin)(xfs_log_item_t *);
+ void (*iop_unpin)(xfs_log_item_t *, int);
+ void (*iop_unpin_remove)(xfs_log_item_t *, struct xfs_trans *);
+ uint (*iop_trylock)(xfs_log_item_t *);
+ void (*iop_unlock)(xfs_log_item_t *);
+ xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
+ void (*iop_push)(xfs_log_item_t *);
+ void (*iop_pushbuf)(xfs_log_item_t *);
+ void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
+} xfs_item_ops_t;
+
+#define IOP_SIZE(ip) (*(ip)->li_ops->iop_size)(ip)
+#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp)
+#define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip)
+#define IOP_UNPIN(ip, flags) (*(ip)->li_ops->iop_unpin)(ip, flags)
+#define IOP_UNPIN_REMOVE(ip,tp) (*(ip)->li_ops->iop_unpin_remove)(ip, tp)
+#define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip)
+#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip)
+#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn)
+#define IOP_PUSH(ip) (*(ip)->li_ops->iop_push)(ip)
+#define IOP_PUSHBUF(ip) (*(ip)->li_ops->iop_pushbuf)(ip)
+#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn)
+
+/*
+ * Return values for the IOP_TRYLOCK() routines.
+ */
+#define XFS_ITEM_SUCCESS 0
+#define XFS_ITEM_PINNED 1
+#define XFS_ITEM_LOCKED 2
+#define XFS_ITEM_FLUSHING 3
+#define XFS_ITEM_PUSHBUF 4
+
+/*
+ * This structure is used to maintain a list of block ranges that have been
+ * freed in the transaction. The ranges are listed in the perag[] busy list
+ * between when they're freed and the transaction is committed to disk.
+ */
+
+typedef struct xfs_log_busy_slot {
+ xfs_agnumber_t lbc_ag;
+ ushort lbc_idx; /* index in perag.busy[] */
+} xfs_log_busy_slot_t;
+
+#define XFS_LBC_NUM_SLOTS 31
+typedef struct xfs_log_busy_chunk {
+ struct xfs_log_busy_chunk *lbc_next;
+ uint lbc_free; /* free slots bitmask */
+ ushort lbc_unused; /* first unused */
+ xfs_log_busy_slot_t lbc_busy[XFS_LBC_NUM_SLOTS];
+} xfs_log_busy_chunk_t;
+
+#define XFS_LBC_MAX_SLOT (XFS_LBC_NUM_SLOTS - 1)
+#define XFS_LBC_FREEMASK ((1U << XFS_LBC_NUM_SLOTS) - 1)
+
+#define XFS_LBC_INIT(cp) ((cp)->lbc_free = XFS_LBC_FREEMASK)
+#define XFS_LBC_CLAIM(cp, slot) ((cp)->lbc_free &= ~(1 << (slot)))
+#define XFS_LBC_SLOT(cp, slot) (&((cp)->lbc_busy[(slot)]))
+#define XFS_LBC_VACANCY(cp) (((cp)->lbc_free) & XFS_LBC_FREEMASK)
+#define XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot)))
+
+/*
+ * This is the type of function which can be given to xfs_trans_callback()
+ * to be called upon the transaction's commit to disk.
+ */
+typedef void (*xfs_trans_callback_t)(struct xfs_trans *, void *);
+
+/*
+ * This is the structure maintained for every active transaction.
+ */
+typedef struct xfs_trans {
+ unsigned int t_magic; /* magic number */
+ xfs_log_callback_t t_logcb; /* log callback struct */
+ unsigned int t_type; /* transaction type */
+ unsigned int t_log_res; /* amt of log space resvd */
+ unsigned int t_log_count; /* count for perm log res */
+ unsigned int t_blk_res; /* # of blocks resvd */
+ unsigned int t_blk_res_used; /* # of resvd blocks used */
+ unsigned int t_rtx_res; /* # of rt extents resvd */
+ unsigned int t_rtx_res_used; /* # of resvd rt extents used */
+ xfs_log_ticket_t t_ticket; /* log mgr ticket */
+ xfs_lsn_t t_lsn; /* log seq num of start of
+ * transaction. */
+ xfs_lsn_t t_commit_lsn; /* log seq num of end of
+ * transaction. */
+ struct xfs_mount *t_mountp; /* ptr to fs mount struct */
+ struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */
+ xfs_trans_callback_t t_callback; /* transaction callback */
+ void *t_callarg; /* callback arg */
+ unsigned int t_flags; /* misc flags */
+ int64_t t_icount_delta; /* superblock icount change */
+ int64_t t_ifree_delta; /* superblock ifree change */
+ int64_t t_fdblocks_delta; /* superblock fdblocks chg */
+ int64_t t_res_fdblocks_delta; /* on-disk only chg */
+ int64_t t_frextents_delta;/* superblock freextents chg*/
+ int64_t t_res_frextents_delta; /* on-disk only chg */
+#ifdef DEBUG
+ int64_t t_ag_freeblks_delta; /* debugging counter */
+ int64_t t_ag_flist_delta; /* debugging counter */
+ int64_t t_ag_btree_delta; /* debugging counter */
+#endif
+ int64_t t_dblocks_delta;/* superblock dblocks change */
+ int64_t t_agcount_delta;/* superblock agcount change */
+ int64_t t_imaxpct_delta;/* superblock imaxpct change */
+ int64_t t_rextsize_delta;/* superblock rextsize chg */
+ int64_t t_rbmblocks_delta;/* superblock rbmblocks chg */
+ int64_t t_rblocks_delta;/* superblock rblocks change */
+ int64_t t_rextents_delta;/* superblocks rextents chg */
+ int64_t t_rextslog_delta;/* superblocks rextslog chg */
+ unsigned int t_items_free; /* log item descs free */
+ xfs_log_item_chunk_t t_items; /* first log item desc chunk */
+ xfs_trans_header_t t_header; /* header for in-log trans */
+ unsigned int t_busy_free; /* busy descs free */
+ xfs_log_busy_chunk_t t_busy; /* busy/async free blocks */
+ unsigned long t_pflags; /* saved process flags state */
+} xfs_trans_t;
+
/*
* XFS transaction mechanism exported interfaces that are
* actually macros.
@@ -928,7 +923,6 @@ typedef struct xfs_trans {
/*
* XFS transaction mechanism exported interfaces.
*/
-void xfs_trans_init(struct xfs_mount *);
xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint);
xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint);
xfs_trans_t *xfs_trans_dup(xfs_trans_t *);
@@ -975,13 +969,8 @@ int _xfs_trans_commit(xfs_trans_t *,
int *);
#define xfs_trans_commit(tp, flags) _xfs_trans_commit(tp, flags, NULL)
void xfs_trans_cancel(xfs_trans_t *, int);
-int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
int xfs_trans_ail_init(struct xfs_mount *);
void xfs_trans_ail_destroy(struct xfs_mount *);
-void xfs_trans_push_ail(struct xfs_mount *, xfs_lsn_t);
-xfs_lsn_t xfs_trans_tail_ail(struct xfs_mount *);
-void xfs_trans_unlocked_item(struct xfs_mount *,
- xfs_log_item_t *);
xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp,
xfs_agnumber_t ag,
xfs_extlen_t idx);
@@ -990,4 +979,7 @@ extern kmem_zone_t *xfs_trans_zone;
#endif /* __KERNEL__ */
+void xfs_trans_init(struct xfs_mount *);
+int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *);
+
#endif /* __XFS_TRANS_H__ */
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 1f77c00af566..2d47f10f8bed 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
+ * Copyright (c) 2008 Dave Chinner
* All Rights Reserved.
*
* This program is free software; you can redistribute it and/or
@@ -28,13 +29,13 @@
#include "xfs_trans_priv.h"
#include "xfs_error.h"
-STATIC void xfs_ail_insert(xfs_ail_t *, xfs_log_item_t *);
-STATIC xfs_log_item_t * xfs_ail_delete(xfs_ail_t *, xfs_log_item_t *);
-STATIC xfs_log_item_t * xfs_ail_min(xfs_ail_t *);
-STATIC xfs_log_item_t * xfs_ail_next(xfs_ail_t *, xfs_log_item_t *);
+STATIC void xfs_ail_insert(struct xfs_ail *, xfs_log_item_t *);
+STATIC xfs_log_item_t * xfs_ail_delete(struct xfs_ail *, xfs_log_item_t *);
+STATIC xfs_log_item_t * xfs_ail_min(struct xfs_ail *);
+STATIC xfs_log_item_t * xfs_ail_next(struct xfs_ail *, xfs_log_item_t *);
#ifdef DEBUG
-STATIC void xfs_ail_check(xfs_ail_t *, xfs_log_item_t *);
+STATIC void xfs_ail_check(struct xfs_ail *, xfs_log_item_t *);
#else
#define xfs_ail_check(a,l)
#endif /* DEBUG */
@@ -50,20 +51,20 @@ STATIC void xfs_ail_check(xfs_ail_t *, xfs_log_item_t *);
* lsn of the last item in the AIL.
*/
xfs_lsn_t
-xfs_trans_tail_ail(
- xfs_mount_t *mp)
+xfs_trans_ail_tail(
+ struct xfs_ail *ailp)
{
xfs_lsn_t lsn;
xfs_log_item_t *lip;
- spin_lock(&mp->m_ail_lock);
- lip = xfs_ail_min(&mp->m_ail);
+ spin_lock(&ailp->xa_lock);
+ lip = xfs_ail_min(ailp);
if (lip == NULL) {
lsn = (xfs_lsn_t)0;
} else {
lsn = lip->li_lsn;
}
- spin_unlock(&mp->m_ail_lock);
+ spin_unlock(&ailp->xa_lock);
return lsn;
}
@@ -85,16 +86,125 @@ xfs_trans_tail_ail(
* any of the objects, so the lock is not needed.
*/
void
-xfs_trans_push_ail(
- xfs_mount_t *mp,
- xfs_lsn_t threshold_lsn)
+xfs_trans_ail_push(
+ struct xfs_ail *ailp,
+ xfs_lsn_t threshold_lsn)
{
- xfs_log_item_t *lip;
+ xfs_log_item_t *lip;
+
+ lip = xfs_ail_min(ailp);
+ if (lip && !XFS_FORCED_SHUTDOWN(ailp->xa_mount)) {
+ if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0)
+ xfsaild_wakeup(ailp, threshold_lsn);
+ }
+}
+
+/*
+ * AIL traversal cursor initialisation.
+ *
+ * The cursor keeps track of where our current traversal is up
+ * to by tracking the next item in the list for us. However, for
+ * this to be safe, removing an object from the AIL needs to invalidate
+ * any cursor that points to it. hence the traversal cursor needs to
+ * be linked to the struct xfs_ail so that deletion can search all the
+ * active cursors for invalidation.
+ *
+ * We don't link the push cursor because it is embedded in the struct
+ * xfs_ail and hence easily findable.
+ */
+STATIC void
+xfs_trans_ail_cursor_init(
+ struct xfs_ail *ailp,
+ struct xfs_ail_cursor *cur)
+{
+ cur->item = NULL;
+ if (cur == &ailp->xa_cursors)
+ return;
+
+ cur->next = ailp->xa_cursors.next;
+ ailp->xa_cursors.next = cur;
+}
+
+/*
+ * Set the cursor to the next item, because when we look
+ * up the cursor the current item may have been freed.
+ */
+STATIC void
+xfs_trans_ail_cursor_set(
+ struct xfs_ail *ailp,
+ struct xfs_ail_cursor *cur,
+ struct xfs_log_item *lip)
+{
+ if (lip)
+ cur->item = xfs_ail_next(ailp, lip);
+}
+
+/*
+ * Get the next item in the traversal and advance the cursor.
+ * If the cursor was invalidated (indicated by a lip of 1),
+ * restart the traversal.
+ */
+struct xfs_log_item *
+xfs_trans_ail_cursor_next(
+ struct xfs_ail *ailp,
+ struct xfs_ail_cursor *cur)
+{
+ struct xfs_log_item *lip = cur->item;
+
+ if ((__psint_t)lip & 1)
+ lip = xfs_ail_min(ailp);
+ xfs_trans_ail_cursor_set(ailp, cur, lip);
+ return lip;
+}
+
+/*
+ * Now that the traversal is complete, we need to remove the cursor
+ * from the list of traversing cursors. Avoid removing the embedded
+ * push cursor, but use the fact it is always present to make the
+ * list deletion simple.
+ */
+void
+xfs_trans_ail_cursor_done(
+ struct xfs_ail *ailp,
+ struct xfs_ail_cursor *done)
+{
+ struct xfs_ail_cursor *prev = NULL;
+ struct xfs_ail_cursor *cur;
+
+ done->item = NULL;
+ if (done == &ailp->xa_cursors)
+ return;
+ prev = &ailp->xa_cursors;
+ for (cur = prev->next; cur; prev = cur, cur = prev->next) {
+ if (cur == done) {
+ prev->next = cur->next;
+ break;
+ }
+ }
+ ASSERT(cur);
+}
+
+/*
+ * Invalidate any cursor that is pointing to this item. This is
+ * called when an item is removed from the AIL. Any cursor pointing
+ * to this object is now invalid and the traversal needs to be
+ * terminated so it doesn't reference a freed object. We set the
+ * cursor item to a value of 1 so we can distinguish between an
+ * invalidation and the end of the list when getting the next item
+ * from the cursor.
+ */
+STATIC void
+xfs_trans_ail_cursor_clear(
+ struct xfs_ail *ailp,
+ struct xfs_log_item *lip)
+{
+ struct xfs_ail_cursor *cur;
- lip = xfs_ail_min(&mp->m_ail);
- if (lip && !XFS_FORCED_SHUTDOWN(mp)) {
- if (XFS_LSN_CMP(threshold_lsn, mp->m_ail.xa_target) > 0)
- xfsaild_wakeup(mp, threshold_lsn);
+ /* need to search all cursors */
+ for (cur = &ailp->xa_cursors; cur; cur = cur->next) {
+ if (cur->item == lip)
+ cur->item = (struct xfs_log_item *)
+ ((__psint_t)cur->item | 1);
}
}
@@ -103,25 +213,27 @@ xfs_trans_push_ail(
* Return the current tree generation number for use
* in calls to xfs_trans_next_ail().
*/
-STATIC xfs_log_item_t *
-xfs_trans_first_push_ail(
- xfs_mount_t *mp,
- int *gen,
- xfs_lsn_t lsn)
+xfs_log_item_t *
+xfs_trans_ail_cursor_first(
+ struct xfs_ail *ailp,
+ struct xfs_ail_cursor *cur,
+ xfs_lsn_t lsn)
{
- xfs_log_item_t *lip;
+ xfs_log_item_t *lip;
- lip = xfs_ail_min(&mp->m_ail);
- *gen = (int)mp->m_ail.xa_gen;
+ xfs_trans_ail_cursor_init(ailp, cur);
+ lip = xfs_ail_min(ailp);
if (lsn == 0)
- return lip;
+ goto out;
- list_for_each_entry(lip, &mp->m_ail.xa_ail, li_ail) {
+ list_for_each_entry(lip, &ailp->xa_ail, li_ail) {
if (XFS_LSN_CMP(lip->li_lsn, lsn) >= 0)
- return lip;
+ goto out;
}
-
- return NULL;
+ lip = NULL;
+out:
+ xfs_trans_ail_cursor_set(ailp, cur, lip);
+ return lip;
}
/*
@@ -129,29 +241,29 @@ xfs_trans_first_push_ail(
*/
long
xfsaild_push(
- xfs_mount_t *mp,
+ struct xfs_ail *ailp,
xfs_lsn_t *last_lsn)
{
long tout = 1000; /* milliseconds */
xfs_lsn_t last_pushed_lsn = *last_lsn;
- xfs_lsn_t target = mp->m_ail.xa_target;
+ xfs_lsn_t target = ailp->xa_target;
xfs_lsn_t lsn;
xfs_log_item_t *lip;
- int gen;
- int restarts;
int flush_log, count, stuck;
+ xfs_mount_t *mp = ailp->xa_mount;
+ struct xfs_ail_cursor *cur = &ailp->xa_cursors;
-#define XFS_TRANS_PUSH_AIL_RESTARTS 10
-
- spin_lock(&mp->m_ail_lock);
- lip = xfs_trans_first_push_ail(mp, &gen, *last_lsn);
+ spin_lock(&ailp->xa_lock);
+ xfs_trans_ail_cursor_init(ailp, cur);
+ lip = xfs_trans_ail_cursor_first(ailp, cur, *last_lsn);
if (!lip || XFS_FORCED_SHUTDOWN(mp)) {
/*
* AIL is empty or our push has reached the end.
*/
- spin_unlock(&mp->m_ail_lock);
+ xfs_trans_ail_cursor_done(ailp, cur);
+ spin_unlock(&ailp->xa_lock);
last_pushed_lsn = 0;
- goto out;
+ return tout;
}
XFS_STATS_INC(xs_push_ail);
@@ -169,7 +281,7 @@ xfsaild_push(
*/
tout = 10;
lsn = lip->li_lsn;
- flush_log = stuck = count = restarts = 0;
+ flush_log = stuck = count = 0;
while ((XFS_LSN_CMP(lip->li_lsn, target) < 0)) {
int lock_result;
/*
@@ -184,7 +296,7 @@ xfsaild_push(
* skip to the next item in the list.
*/
lock_result = IOP_TRYLOCK(lip);
- spin_unlock(&mp->m_ail_lock);
+ spin_unlock(&ailp->xa_lock);
switch (lock_result) {
case XFS_ITEM_SUCCESS:
XFS_STATS_INC(xs_push_ail_success);
@@ -221,7 +333,7 @@ xfsaild_push(
break;
}
- spin_lock(&mp->m_ail_lock);
+ spin_lock(&ailp->xa_lock);
/* should we bother continuing? */
if (XFS_FORCED_SHUTDOWN(mp))
break;
@@ -244,14 +356,13 @@ xfsaild_push(
if (stuck > 100)
break;
- lip = xfs_trans_next_ail(mp, lip, &gen, &restarts);
+ lip = xfs_trans_ail_cursor_next(ailp, cur);
if (lip == NULL)
break;
- if (restarts > XFS_TRANS_PUSH_AIL_RESTARTS)
- break;
lsn = lip->li_lsn;
}
- spin_unlock(&mp->m_ail_lock);
+ xfs_trans_ail_cursor_done(ailp, cur);
+ spin_unlock(&ailp->xa_lock);
if (flush_log) {
/*
@@ -274,8 +385,7 @@ xfsaild_push(
*/
tout += 20;
last_pushed_lsn = 0;
- } else if ((restarts > XFS_TRANS_PUSH_AIL_RESTARTS) ||
- ((stuck * 100) / count > 90)) {
+ } else if ((stuck * 100) / count > 90) {
/*
* Either there is a lot of contention on the AIL or we
* are stuck due to operations in progress. "Stuck" in this
@@ -287,7 +397,6 @@ xfsaild_push(
*/
tout += 10;
}
-out:
*last_lsn = last_pushed_lsn;
return tout;
} /* xfsaild_push */
@@ -303,7 +412,7 @@ out:
*/
void
xfs_trans_unlocked_item(
- xfs_mount_t *mp,
+ struct xfs_ail *ailp,
xfs_log_item_t *lip)
{
xfs_log_item_t *min_lip;
@@ -315,7 +424,7 @@ xfs_trans_unlocked_item(
* over some potentially valid data.
*/
if (!(lip->li_flags & XFS_LI_IN_AIL) ||
- XFS_FORCED_SHUTDOWN(mp)) {
+ XFS_FORCED_SHUTDOWN(ailp->xa_mount)) {
return;
}
@@ -331,10 +440,10 @@ xfs_trans_unlocked_item(
* the call to xfs_log_move_tail() doesn't do anything if there's
* not enough free space to wake people up so we're safe calling it.
*/
- min_lip = xfs_ail_min(&mp->m_ail);
+ min_lip = xfs_ail_min(ailp);
if (min_lip == lip)
- xfs_log_move_tail(mp, 1);
+ xfs_log_move_tail(ailp->xa_mount, 1);
} /* xfs_trans_unlocked_item */
@@ -347,41 +456,37 @@ xfs_trans_unlocked_item(
* we move in the AIL is the minimum one, update the tail lsn in the
* log manager.
*
- * Increment the AIL's generation count to indicate that the tree
- * has changed.
- *
* This function must be called with the AIL lock held. The lock
* is dropped before returning.
*/
void
-xfs_trans_update_ail(
- xfs_mount_t *mp,
+xfs_trans_ail_update(
+ struct xfs_ail *ailp,
xfs_log_item_t *lip,
- xfs_lsn_t lsn) __releases(mp->m_ail_lock)
+ xfs_lsn_t lsn) __releases(ailp->xa_lock)
{
- xfs_log_item_t *dlip=NULL;
+ xfs_log_item_t *dlip = NULL;
xfs_log_item_t *mlip; /* ptr to minimum lip */
- mlip = xfs_ail_min(&mp->m_ail);
+ mlip = xfs_ail_min(ailp);
if (lip->li_flags & XFS_LI_IN_AIL) {
- dlip = xfs_ail_delete(&mp->m_ail, lip);
+ dlip = xfs_ail_delete(ailp, lip);
ASSERT(dlip == lip);
+ xfs_trans_ail_cursor_clear(ailp, dlip);
} else {
lip->li_flags |= XFS_LI_IN_AIL;
}
lip->li_lsn = lsn;
-
- xfs_ail_insert(&mp->m_ail, lip);
- mp->m_ail.xa_gen++;
+ xfs_ail_insert(ailp, lip);
if (mlip == dlip) {
- mlip = xfs_ail_min(&mp->m_ail);
- spin_unlock(&mp->m_ail_lock);
- xfs_log_move_tail(mp, mlip->li_lsn);
+ mlip = xfs_ail_min(ailp);
+ spin_unlock(&ailp->xa_lock);
+ xfs_log_move_tail(ailp->xa_mount, mlip->li_lsn);
} else {
- spin_unlock(&mp->m_ail_lock);
+ spin_unlock(&ailp->xa_lock);
}
@@ -403,29 +508,30 @@ xfs_trans_update_ail(
* is dropped before returning.
*/
void
-xfs_trans_delete_ail(
- xfs_mount_t *mp,
- xfs_log_item_t *lip) __releases(mp->m_ail_lock)
+xfs_trans_ail_delete(
+ struct xfs_ail *ailp,
+ xfs_log_item_t *lip) __releases(ailp->xa_lock)
{
xfs_log_item_t *dlip;
xfs_log_item_t *mlip;
if (lip->li_flags & XFS_LI_IN_AIL) {
- mlip = xfs_ail_min(&mp->m_ail);
- dlip = xfs_ail_delete(&mp->m_ail, lip);
+ mlip = xfs_ail_min(ailp);
+ dlip = xfs_ail_delete(ailp, lip);
ASSERT(dlip == lip);
+ xfs_trans_ail_cursor_clear(ailp, dlip);
lip->li_flags &= ~XFS_LI_IN_AIL;
lip->li_lsn = 0;
- mp->m_ail.xa_gen++;
if (mlip == dlip) {
- mlip = xfs_ail_min(&mp->m_ail);
- spin_unlock(&mp->m_ail_lock);
- xfs_log_move_tail(mp, (mlip ? mlip->li_lsn : 0));
+ mlip = xfs_ail_min(ailp);
+ spin_unlock(&ailp->xa_lock);
+ xfs_log_move_tail(ailp->xa_mount,
+ (mlip ? mlip->li_lsn : 0));
} else {
- spin_unlock(&mp->m_ail_lock);
+ spin_unlock(&ailp->xa_lock);
}
}
else {
@@ -433,13 +539,13 @@ xfs_trans_delete_ail(
* If the file system is not being shutdown, we are in
* serious trouble if we get to this stage.
*/
- if (XFS_FORCED_SHUTDOWN(mp))
- spin_unlock(&mp->m_ail_lock);
- else {
+ struct xfs_mount *mp = ailp->xa_mount;
+
+ spin_unlock(&ailp->xa_lock);
+ if (!XFS_FORCED_SHUTDOWN(mp)) {
xfs_cmn_err(XFS_PTAG_AILDELETE, CE_ALERT, mp,
"%s: attempting to delete a log item that is not in the AIL",
__func__);
- spin_unlock(&mp->m_ail_lock);
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
}
}
@@ -448,56 +554,6 @@ xfs_trans_delete_ail(
/*
- * Return the item in the AIL with the smallest lsn.
- * Return the current tree generation number for use
- * in calls to xfs_trans_next_ail().
- */
-xfs_log_item_t *
-xfs_trans_first_ail(
- xfs_mount_t *mp,
- int *gen)
-{
- xfs_log_item_t *lip;
-
- lip = xfs_ail_min(&mp->m_ail);
- *gen = (int)mp->m_ail.xa_gen;
-
- return lip;
-}
-
-/*
- * If the generation count of the tree has not changed since the
- * caller last took something from the AIL, then return the elmt
- * in the tree which follows the one given. If the count has changed,
- * then return the minimum elmt of the AIL and bump the restarts counter
- * if one is given.
- */
-xfs_log_item_t *
-xfs_trans_next_ail(
- xfs_mount_t *mp,
- xfs_log_item_t *lip,
- int *gen,
- int *restarts)
-{
- xfs_log_item_t *nlip;
-
- ASSERT(mp && lip && gen);
- if (mp->m_ail.xa_gen == *gen) {
- nlip = xfs_ail_next(&mp->m_ail, lip);
- } else {
- nlip = xfs_ail_min(&mp->m_ail);
- *gen = (int)mp->m_ail.xa_gen;
- if (restarts != NULL) {
- XFS_STATS_INC(xs_push_ail_restarts);
- (*restarts)++;
- }
- }
-
- return (nlip);
-}
-
-
-/*
* The active item list (AIL) is a doubly linked list of log
* items sorted by ascending lsn. The base of the list is
* a forw/back pointer pair embedded in the xfs mount structure.
@@ -515,15 +571,35 @@ int
xfs_trans_ail_init(
xfs_mount_t *mp)
{
- INIT_LIST_HEAD(&mp->m_ail.xa_ail);
- return xfsaild_start(mp);
+ struct xfs_ail *ailp;
+ int error;
+
+ ailp = kmem_zalloc(sizeof(struct xfs_ail), KM_MAYFAIL);
+ if (!ailp)
+ return ENOMEM;
+
+ ailp->xa_mount = mp;
+ INIT_LIST_HEAD(&ailp->xa_ail);
+ spin_lock_init(&ailp->xa_lock);
+ error = xfsaild_start(ailp);
+ if (error)
+ goto out_free_ailp;
+ mp->m_ail = ailp;
+ return 0;
+
+out_free_ailp:
+ kmem_free(ailp);
+ return error;
}
void
xfs_trans_ail_destroy(
xfs_mount_t *mp)
{
- xfsaild_stop(mp);
+ struct xfs_ail *ailp = mp->m_ail;
+
+ xfsaild_stop(ailp);
+ kmem_free(ailp);
}
/*
@@ -534,7 +610,7 @@ xfs_trans_ail_destroy(
*/
STATIC void
xfs_ail_insert(
- xfs_ail_t *ailp,
+ struct xfs_ail *ailp,
xfs_log_item_t *lip)
/* ARGSUSED */
{
@@ -568,7 +644,7 @@ xfs_ail_insert(
/*ARGSUSED*/
STATIC xfs_log_item_t *
xfs_ail_delete(
- xfs_ail_t *ailp,
+ struct xfs_ail *ailp,
xfs_log_item_t *lip)
/* ARGSUSED */
{
@@ -585,7 +661,7 @@ xfs_ail_delete(
*/
STATIC xfs_log_item_t *
xfs_ail_min(
- xfs_ail_t *ailp)
+ struct xfs_ail *ailp)
/* ARGSUSED */
{
if (list_empty(&ailp->xa_ail))
@@ -601,7 +677,7 @@ xfs_ail_min(
*/
STATIC xfs_log_item_t *
xfs_ail_next(
- xfs_ail_t *ailp,
+ struct xfs_ail *ailp,
xfs_log_item_t *lip)
/* ARGSUSED */
{
@@ -617,7 +693,7 @@ xfs_ail_next(
*/
STATIC void
xfs_ail_check(
- xfs_ail_t *ailp,
+ struct xfs_ail *ailp,
xfs_log_item_t *lip)
{
xfs_log_item_t *prev_lip;
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 4e855b5ced66..8ee2f8c8b0a6 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -527,9 +527,8 @@ xfs_trans_brelse(xfs_trans_t *tp,
lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
if (lip->li_type == XFS_LI_BUF) {
bip = XFS_BUF_FSPRIVATE(bp,xfs_buf_log_item_t*);
- xfs_trans_unlocked_item(
- bip->bli_item.li_mountp,
- lip);
+ xfs_trans_unlocked_item(bip->bli_item.li_ailp,
+ lip);
}
}
xfs_buf_relse(bp);
@@ -626,7 +625,7 @@ xfs_trans_brelse(xfs_trans_t *tp,
* tell the AIL that the buffer is being unlocked.
*/
if (bip != NULL) {
- xfs_trans_unlocked_item(bip->bli_item.li_mountp,
+ xfs_trans_unlocked_item(bip->bli_item.li_ailp,
(xfs_log_item_t*)bip);
}
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c
index 2a1c0f071f91..23d276af2e0c 100644
--- a/fs/xfs/xfs_trans_inode.c
+++ b/fs/xfs/xfs_trans_inode.c
@@ -85,7 +85,6 @@ xfs_trans_iget(
{
int error;
xfs_inode_t *ip;
- xfs_inode_log_item_t *iip;
/*
* If the transaction pointer is NULL, just call the normal
@@ -138,34 +137,7 @@ xfs_trans_iget(
}
ASSERT(ip != NULL);
- /*
- * Get a log_item_desc to point at the new item.
- */
- if (ip->i_itemp == NULL)
- xfs_inode_item_init(ip, mp);
- iip = ip->i_itemp;
- (void) xfs_trans_add_item(tp, (xfs_log_item_t *)(iip));
-
- xfs_trans_inode_broot_debug(ip);
-
- /*
- * If the IO lock has been acquired, mark that in
- * the inode log item so we'll know to unlock it
- * when the transaction commits.
- */
- ASSERT(iip->ili_flags == 0);
- if (lock_flags & XFS_IOLOCK_EXCL) {
- iip->ili_flags |= XFS_ILI_IOLOCKED_EXCL;
- } else if (lock_flags & XFS_IOLOCK_SHARED) {
- iip->ili_flags |= XFS_ILI_IOLOCKED_SHARED;
- }
-
- /*
- * Initialize i_transp so we can find it with xfs_inode_incore()
- * above.
- */
- ip->i_transp = tp;
-
+ xfs_trans_ijoin(tp, ip, lock_flags);
*ipp = ip;
return 0;
}
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
index 3c666e8317f8..e110bf57d7f4 100644
--- a/fs/xfs/xfs_trans_item.c
+++ b/fs/xfs/xfs_trans_item.c
@@ -22,6 +22,14 @@
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
+/* XXX: from here down needed until struct xfs_trans has its own ailp */
+#include "xfs_bit.h"
+#include "xfs_buf_item.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_dmapi.h"
+#include "xfs_mount.h"
STATIC int xfs_trans_unlock_chunk(xfs_log_item_chunk_t *,
int, int, xfs_lsn_t);
@@ -79,6 +87,7 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
lidp->lid_size = 0;
lip->li_desc = lidp;
lip->li_mountp = tp->t_mountp;
+ lip->li_ailp = tp->t_mountp->m_ail;
return lidp;
}
@@ -120,6 +129,7 @@ xfs_trans_add_item(xfs_trans_t *tp, xfs_log_item_t *lip)
lidp->lid_size = 0;
lip->li_desc = lidp;
lip->li_mountp = tp->t_mountp;
+ lip->li_ailp = tp->t_mountp->m_ail;
return lidp;
}
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 3c748c456ed4..73e2ad397432 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -44,25 +44,93 @@ xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp,
xfs_extlen_t idx);
/*
- * From xfs_trans_ail.c
+ * AIL traversal cursor.
+ *
+ * Rather than using a generation number for detecting changes in the ail, use
+ * a cursor that is protected by the ail lock. The aild cursor exists in the
+ * struct xfs_ail, but other traversals can declare it on the stack and link it
+ * to the ail list.
+ *
+ * When an object is deleted from or moved in the AIL, the cursor list is
+ * searched to see if the object is a designated cursor item. If it is, it is
+ * deleted from the cursor so that the next time the cursor is used traversal
+ * will return to the start.
+ *
+ * This means a traversal colliding with a removal will cause a restart of the
+ * list scan, rather than any insertion or deletion anywhere in the list. The
+ * low bit of the item pointer is set if the cursor has been invalidated so
+ * that we can tell the difference between invalidation and reaching the end
+ * of the list to trigger traversal restarts.
*/
-void xfs_trans_update_ail(struct xfs_mount *mp,
- struct xfs_log_item *lip, xfs_lsn_t lsn)
- __releases(mp->m_ail_lock);
-void xfs_trans_delete_ail(struct xfs_mount *mp,
- struct xfs_log_item *lip)
- __releases(mp->m_ail_lock);
-struct xfs_log_item *xfs_trans_first_ail(struct xfs_mount *, int *);
-struct xfs_log_item *xfs_trans_next_ail(struct xfs_mount *,
- struct xfs_log_item *, int *, int *);
+struct xfs_ail_cursor {
+ struct xfs_ail_cursor *next;
+ struct xfs_log_item *item;
+};
+/*
+ * Private AIL structures.
+ *
+ * Eventually we need to drive the locking in here as well.
+ */
+struct xfs_ail {
+ struct xfs_mount *xa_mount;
+ struct list_head xa_ail;
+ uint xa_gen;
+ struct task_struct *xa_task;
+ xfs_lsn_t xa_target;
+ struct xfs_ail_cursor xa_cursors;
+ spinlock_t xa_lock;
+};
/*
- * AIL push thread support
+ * From xfs_trans_ail.c
*/
-long xfsaild_push(struct xfs_mount *, xfs_lsn_t *);
-void xfsaild_wakeup(struct xfs_mount *, xfs_lsn_t);
-int xfsaild_start(struct xfs_mount *);
-void xfsaild_stop(struct xfs_mount *);
+void xfs_trans_ail_update(struct xfs_ail *ailp,
+ struct xfs_log_item *lip, xfs_lsn_t lsn)
+ __releases(ailp->xa_lock);
+void xfs_trans_ail_delete(struct xfs_ail *ailp,
+ struct xfs_log_item *lip)
+ __releases(ailp->xa_lock);
+void xfs_trans_ail_push(struct xfs_ail *, xfs_lsn_t);
+void xfs_trans_unlocked_item(struct xfs_ail *,
+ xfs_log_item_t *);
+
+xfs_lsn_t xfs_trans_ail_tail(struct xfs_ail *ailp);
+
+struct xfs_log_item *xfs_trans_ail_cursor_first(struct xfs_ail *ailp,
+ struct xfs_ail_cursor *cur,
+ xfs_lsn_t lsn);
+struct xfs_log_item *xfs_trans_ail_cursor_next(struct xfs_ail *ailp,
+ struct xfs_ail_cursor *cur);
+void xfs_trans_ail_cursor_done(struct xfs_ail *ailp,
+ struct xfs_ail_cursor *cur);
+
+long xfsaild_push(struct xfs_ail *, xfs_lsn_t *);
+void xfsaild_wakeup(struct xfs_ail *, xfs_lsn_t);
+int xfsaild_start(struct xfs_ail *);
+void xfsaild_stop(struct xfs_ail *);
+#if BITS_PER_LONG != 64
+static inline void
+xfs_trans_ail_copy_lsn(
+ struct xfs_ail *ailp,
+ xfs_lsn_t *dst,
+ xfs_lsn_t *src)
+{
+ ASSERT(sizeof(xfs_lsn_t) == 8); /* don't lock if it shrinks */
+ spin_lock(&ailp->xa_lock);
+ *dst = *src;
+ spin_unlock(&ailp->xa_lock);
+}
+#else
+static inline void
+xfs_trans_ail_copy_lsn(
+ struct xfs_ail *ailp,
+ xfs_lsn_t *dst,
+ xfs_lsn_t *src)
+{
+ ASSERT(sizeof(xfs_lsn_t) == 8);
+ *dst = *src;
+}
+#endif
#endif /* __XFS_TRANS_PRIV_H__ */
diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c
index 35d4d414bcc2..fcc2285d03ed 100644
--- a/fs/xfs/xfs_utils.c
+++ b/fs/xfs/xfs_utils.c
@@ -172,6 +172,12 @@ xfs_dir_ialloc(
*ipp = NULL;
return code;
}
+
+ /*
+ * transaction commit worked ok so we can drop the extra ticket
+ * reference that we gained in xfs_trans_dup()
+ */
+ xfs_log_ticket_put(tp->t_ticket);
code = xfs_trans_reserve(tp, 0, log_res, 0,
XFS_TRANS_PERM_LOG_RES, log_count);
/*
@@ -268,9 +274,9 @@ xfs_bump_ino_vers2(
xfs_mount_t *mp;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
- ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1);
+ ASSERT(ip->i_d.di_version == 1);
- ip->i_d.di_version = XFS_DINODE_VERSION_2;
+ ip->i_d.di_version = 2;
ip->i_d.di_onlink = 0;
memset(&(ip->i_d.di_pad[0]), 0, sizeof(ip->i_d.di_pad));
mp = tp->t_mountp;
@@ -302,7 +308,7 @@ xfs_bumplink(
ASSERT(ip->i_d.di_nlink > 0);
ip->i_d.di_nlink++;
inc_nlink(VFS_I(ip));
- if ((ip->i_d.di_version == XFS_DINODE_VERSION_1) &&
+ if ((ip->i_d.di_version == 1) &&
(ip->i_d.di_nlink > XFS_MAXLINK_1)) {
/*
* The inode has increased its number of links beyond
diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c
deleted file mode 100644
index 439dd3939dda..000000000000
--- a/fs/xfs/xfs_vfsops.c
+++ /dev/null
@@ -1,757 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_bit.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_dir2.h"
-#include "xfs_dmapi.h"
-#include "xfs_mount.h"
-#include "xfs_da_btree.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_dir2_sf.h"
-#include "xfs_attr_sf.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_inode_item.h"
-#include "xfs_btree.h"
-#include "xfs_alloc.h"
-#include "xfs_ialloc.h"
-#include "xfs_quota.h"
-#include "xfs_error.h"
-#include "xfs_bmap.h"
-#include "xfs_rw.h"
-#include "xfs_buf_item.h"
-#include "xfs_log_priv.h"
-#include "xfs_dir2_trace.h"
-#include "xfs_extfree_item.h"
-#include "xfs_acl.h"
-#include "xfs_attr.h"
-#include "xfs_clnt.h"
-#include "xfs_mru_cache.h"
-#include "xfs_filestream.h"
-#include "xfs_fsops.h"
-#include "xfs_vnodeops.h"
-#include "xfs_vfsops.h"
-#include "xfs_utils.h"
-
-
-STATIC void
-xfs_quiesce_fs(
- xfs_mount_t *mp)
-{
- int count = 0, pincount;
-
- xfs_flush_buftarg(mp->m_ddev_targp, 0);
- xfs_finish_reclaim_all(mp, 0);
-
- /* This loop must run at least twice.
- * The first instance of the loop will flush
- * most meta data but that will generate more
- * meta data (typically directory updates).
- * Which then must be flushed and logged before
- * we can write the unmount record.
- */
- do {
- xfs_syncsub(mp, SYNC_INODE_QUIESCE, NULL);
- pincount = xfs_flush_buftarg(mp->m_ddev_targp, 1);
- if (!pincount) {
- delay(50);
- count++;
- }
- } while (count < 2);
-}
-
-/*
- * Second stage of a quiesce. The data is already synced, now we have to take
- * care of the metadata. New transactions are already blocked, so we need to
- * wait for any remaining transactions to drain out before proceding.
- */
-void
-xfs_attr_quiesce(
- xfs_mount_t *mp)
-{
- int error = 0;
-
- /* wait for all modifications to complete */
- while (atomic_read(&mp->m_active_trans) > 0)
- delay(100);
-
- /* flush inodes and push all remaining buffers out to disk */
- xfs_quiesce_fs(mp);
-
- ASSERT_ALWAYS(atomic_read(&mp->m_active_trans) == 0);
-
- /* Push the superblock and write an unmount record */
- error = xfs_log_sbcount(mp, 1);
- if (error)
- xfs_fs_cmn_err(CE_WARN, mp,
- "xfs_attr_quiesce: failed to log sb changes. "
- "Frozen image may not be consistent.");
- xfs_log_unmount_write(mp);
- xfs_unmountfs_writesb(mp);
-}
-
-/*
- * xfs_unmount_flush implements a set of flush operation on special
- * inodes, which are needed as a separate set of operations so that
- * they can be called as part of relocation process.
- */
-int
-xfs_unmount_flush(
- xfs_mount_t *mp, /* Mount structure we are getting
- rid of. */
- int relocation) /* Called from vfs relocation. */
-{
- xfs_inode_t *rip = mp->m_rootip;
- xfs_inode_t *rbmip;
- xfs_inode_t *rsumip = NULL;
- int error;
-
- xfs_ilock(rip, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
- xfs_iflock(rip);
-
- /*
- * Flush out the real time inodes.
- */
- if ((rbmip = mp->m_rbmip) != NULL) {
- xfs_ilock(rbmip, XFS_ILOCK_EXCL);
- xfs_iflock(rbmip);
- error = xfs_iflush(rbmip, XFS_IFLUSH_SYNC);
- xfs_iunlock(rbmip, XFS_ILOCK_EXCL);
-
- if (error == EFSCORRUPTED)
- goto fscorrupt_out;
-
- ASSERT(vn_count(VFS_I(rbmip)) == 1);
-
- rsumip = mp->m_rsumip;
- xfs_ilock(rsumip, XFS_ILOCK_EXCL);
- xfs_iflock(rsumip);
- error = xfs_iflush(rsumip, XFS_IFLUSH_SYNC);
- xfs_iunlock(rsumip, XFS_ILOCK_EXCL);
-
- if (error == EFSCORRUPTED)
- goto fscorrupt_out;
-
- ASSERT(vn_count(VFS_I(rsumip)) == 1);
- }
-
- /*
- * Synchronously flush root inode to disk
- */
- error = xfs_iflush(rip, XFS_IFLUSH_SYNC);
- if (error == EFSCORRUPTED)
- goto fscorrupt_out2;
-
- if (vn_count(VFS_I(rip)) != 1 && !relocation) {
- xfs_iunlock(rip, XFS_ILOCK_EXCL);
- return XFS_ERROR(EBUSY);
- }
-
- /*
- * Release dquot that rootinode, rbmino and rsumino might be holding,
- * flush and purge the quota inodes.
- */
- error = XFS_QM_UNMOUNT(mp);
- if (error == EFSCORRUPTED)
- goto fscorrupt_out2;
-
- if (rbmip) {
- IRELE(rbmip);
- IRELE(rsumip);
- }
-
- xfs_iunlock(rip, XFS_ILOCK_EXCL);
- return 0;
-
-fscorrupt_out:
- xfs_ifunlock(rip);
-
-fscorrupt_out2:
- xfs_iunlock(rip, XFS_ILOCK_EXCL);
-
- return XFS_ERROR(EFSCORRUPTED);
-}
-
-/*
- * xfs_sync flushes any pending I/O to file system vfsp.
- *
- * This routine is called by vfs_sync() to make sure that things make it
- * out to disk eventually, on sync() system calls to flush out everything,
- * and when the file system is unmounted. For the vfs_sync() case, all
- * we really need to do is sync out the log to make all of our meta-data
- * updates permanent (except for timestamps). For calls from pflushd(),
- * dirty pages are kept moving by calling pdflush() on the inodes
- * containing them. We also flush the inodes that we can lock without
- * sleeping and the superblock if we can lock it without sleeping from
- * vfs_sync() so that items at the tail of the log are always moving out.
- *
- * Flags:
- * SYNC_BDFLUSH - We're being called from vfs_sync() so we don't want
- * to sleep if we can help it. All we really need
- * to do is ensure that the log is synced at least
- * periodically. We also push the inodes and
- * superblock if we can lock them without sleeping
- * and they are not pinned.
- * SYNC_ATTR - We need to flush the inodes. If SYNC_BDFLUSH is not
- * set, then we really want to lock each inode and flush
- * it.
- * SYNC_WAIT - All the flushes that take place in this call should
- * be synchronous.
- * SYNC_DELWRI - This tells us to push dirty pages associated with
- * inodes. SYNC_WAIT and SYNC_BDFLUSH are used to
- * determine if they should be flushed sync, async, or
- * delwri.
- * SYNC_CLOSE - This flag is passed when the system is being
- * unmounted. We should sync and invalidate everything.
- * SYNC_FSDATA - This indicates that the caller would like to make
- * sure the superblock is safe on disk. We can ensure
- * this by simply making sure the log gets flushed
- * if SYNC_BDFLUSH is set, and by actually writing it
- * out otherwise.
- * SYNC_IOWAIT - The caller wants us to wait for all data I/O to complete
- * before we return (including direct I/O). Forms the drain
- * side of the write barrier needed to safely quiesce the
- * filesystem.
- *
- */
-int
-xfs_sync(
- xfs_mount_t *mp,
- int flags)
-{
- int error;
-
- /*
- * Get the Quota Manager to flush the dquots.
- *
- * If XFS quota support is not enabled or this filesystem
- * instance does not use quotas XFS_QM_DQSYNC will always
- * return zero.
- */
- error = XFS_QM_DQSYNC(mp, flags);
- if (error) {
- /*
- * If we got an IO error, we will be shutting down.
- * So, there's nothing more for us to do here.
- */
- ASSERT(error != EIO || XFS_FORCED_SHUTDOWN(mp));
- if (XFS_FORCED_SHUTDOWN(mp))
- return XFS_ERROR(error);
- }
-
- if (flags & SYNC_IOWAIT)
- xfs_filestream_flush(mp);
-
- return xfs_syncsub(mp, flags, NULL);
-}
-
-/*
- * xfs sync routine for internal use
- *
- * This routine supports all of the flags defined for the generic vfs_sync
- * interface as explained above under xfs_sync.
- *
- */
-int
-xfs_sync_inodes(
- xfs_mount_t *mp,
- int flags,
- int *bypassed)
-{
- xfs_inode_t *ip = NULL;
- struct inode *vp = NULL;
- int error;
- int last_error;
- uint64_t fflag;
- uint lock_flags;
- uint base_lock_flags;
- boolean_t mount_locked;
- boolean_t vnode_refed;
- int preempt;
- xfs_iptr_t *ipointer;
-#ifdef DEBUG
- boolean_t ipointer_in = B_FALSE;
-
-#define IPOINTER_SET ipointer_in = B_TRUE
-#define IPOINTER_CLR ipointer_in = B_FALSE
-#else
-#define IPOINTER_SET
-#define IPOINTER_CLR
-#endif
-
-
-/* Insert a marker record into the inode list after inode ip. The list
- * must be locked when this is called. After the call the list will no
- * longer be locked.
- */
-#define IPOINTER_INSERT(ip, mp) { \
- ASSERT(ipointer_in == B_FALSE); \
- ipointer->ip_mnext = ip->i_mnext; \
- ipointer->ip_mprev = ip; \
- ip->i_mnext = (xfs_inode_t *)ipointer; \
- ipointer->ip_mnext->i_mprev = (xfs_inode_t *)ipointer; \
- preempt = 0; \
- XFS_MOUNT_IUNLOCK(mp); \
- mount_locked = B_FALSE; \
- IPOINTER_SET; \
- }
-
-/* Remove the marker from the inode list. If the marker was the only item
- * in the list then there are no remaining inodes and we should zero out
- * the whole list. If we are the current head of the list then move the head
- * past us.
- */
-#define IPOINTER_REMOVE(ip, mp) { \
- ASSERT(ipointer_in == B_TRUE); \
- if (ipointer->ip_mnext != (xfs_inode_t *)ipointer) { \
- ip = ipointer->ip_mnext; \
- ip->i_mprev = ipointer->ip_mprev; \
- ipointer->ip_mprev->i_mnext = ip; \
- if (mp->m_inodes == (xfs_inode_t *)ipointer) { \
- mp->m_inodes = ip; \
- } \
- } else { \
- ASSERT(mp->m_inodes == (xfs_inode_t *)ipointer); \
- mp->m_inodes = NULL; \
- ip = NULL; \
- } \
- IPOINTER_CLR; \
- }
-
-#define XFS_PREEMPT_MASK 0x7f
-
- ASSERT(!(flags & SYNC_BDFLUSH));
-
- if (bypassed)
- *bypassed = 0;
- if (mp->m_flags & XFS_MOUNT_RDONLY)
- return 0;
- error = 0;
- last_error = 0;
- preempt = 0;
-
- /* Allocate a reference marker */
- ipointer = (xfs_iptr_t *)kmem_zalloc(sizeof(xfs_iptr_t), KM_SLEEP);
-
- fflag = XFS_B_ASYNC; /* default is don't wait */
- if (flags & SYNC_DELWRI)
- fflag = XFS_B_DELWRI;
- if (flags & SYNC_WAIT)
- fflag = 0; /* synchronous overrides all */
-
- base_lock_flags = XFS_ILOCK_SHARED;
- if (flags & (SYNC_DELWRI | SYNC_CLOSE)) {
- /*
- * We need the I/O lock if we're going to call any of
- * the flush/inval routines.
- */
- base_lock_flags |= XFS_IOLOCK_SHARED;
- }
-
- XFS_MOUNT_ILOCK(mp);
-
- ip = mp->m_inodes;
-
- mount_locked = B_TRUE;
- vnode_refed = B_FALSE;
-
- IPOINTER_CLR;
-
- do {
- ASSERT(ipointer_in == B_FALSE);
- ASSERT(vnode_refed == B_FALSE);
-
- lock_flags = base_lock_flags;
-
- /*
- * There were no inodes in the list, just break out
- * of the loop.
- */
- if (ip == NULL) {
- break;
- }
-
- /*
- * We found another sync thread marker - skip it
- */
- if (ip->i_mount == NULL) {
- ip = ip->i_mnext;
- continue;
- }
-
- vp = VFS_I(ip);
-
- /*
- * If the vnode is gone then this is being torn down,
- * call reclaim if it is flushed, else let regular flush
- * code deal with it later in the loop.
- */
-
- if (vp == NULL) {
- /* Skip ones already in reclaim */
- if (ip->i_flags & XFS_IRECLAIM) {
- ip = ip->i_mnext;
- continue;
- }
- if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0) {
- ip = ip->i_mnext;
- } else if ((xfs_ipincount(ip) == 0) &&
- xfs_iflock_nowait(ip)) {
- IPOINTER_INSERT(ip, mp);
-
- xfs_finish_reclaim(ip, 1,
- XFS_IFLUSH_DELWRI_ELSE_ASYNC);
-
- XFS_MOUNT_ILOCK(mp);
- mount_locked = B_TRUE;
- IPOINTER_REMOVE(ip, mp);
- } else {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- ip = ip->i_mnext;
- }
- continue;
- }
-
- if (VN_BAD(vp)) {
- ip = ip->i_mnext;
- continue;
- }
-
- if (XFS_FORCED_SHUTDOWN(mp) && !(flags & SYNC_CLOSE)) {
- XFS_MOUNT_IUNLOCK(mp);
- kmem_free(ipointer);
- return 0;
- }
-
- /*
- * Try to lock without sleeping. We're out of order with
- * the inode list lock here, so if we fail we need to drop
- * the mount lock and try again. If we're called from
- * bdflush() here, then don't bother.
- *
- * The inode lock here actually coordinates with the
- * almost spurious inode lock in xfs_ireclaim() to prevent
- * the vnode we handle here without a reference from
- * being freed while we reference it. If we lock the inode
- * while it's on the mount list here, then the spurious inode
- * lock in xfs_ireclaim() after the inode is pulled from
- * the mount list will sleep until we release it here.
- * This keeps the vnode from being freed while we reference
- * it.
- */
- if (xfs_ilock_nowait(ip, lock_flags) == 0) {
- if (vp == NULL) {
- ip = ip->i_mnext;
- continue;
- }
-
- vp = vn_grab(vp);
- if (vp == NULL) {
- ip = ip->i_mnext;
- continue;
- }
-
- IPOINTER_INSERT(ip, mp);
- xfs_ilock(ip, lock_flags);
-
- ASSERT(vp == VFS_I(ip));
- ASSERT(ip->i_mount == mp);
-
- vnode_refed = B_TRUE;
- }
-
- /* From here on in the loop we may have a marker record
- * in the inode list.
- */
-
- /*
- * If we have to flush data or wait for I/O completion
- * we need to drop the ilock that we currently hold.
- * If we need to drop the lock, insert a marker if we
- * have not already done so.
- */
- if ((flags & (SYNC_CLOSE|SYNC_IOWAIT)) ||
- ((flags & SYNC_DELWRI) && VN_DIRTY(vp))) {
- if (mount_locked) {
- IPOINTER_INSERT(ip, mp);
- }
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
- if (flags & SYNC_CLOSE) {
- /* Shutdown case. Flush and invalidate. */
- if (XFS_FORCED_SHUTDOWN(mp))
- xfs_tosspages(ip, 0, -1,
- FI_REMAPF);
- else
- error = xfs_flushinval_pages(ip,
- 0, -1, FI_REMAPF);
- } else if ((flags & SYNC_DELWRI) && VN_DIRTY(vp)) {
- error = xfs_flush_pages(ip, 0,
- -1, fflag, FI_NONE);
- }
-
- /*
- * When freezing, we need to wait ensure all I/O (including direct
- * I/O) is complete to ensure no further data modification can take
- * place after this point
- */
- if (flags & SYNC_IOWAIT)
- vn_iowait(ip);
-
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- }
-
- if ((flags & SYNC_ATTR) &&
- (ip->i_update_core ||
- (ip->i_itemp && ip->i_itemp->ili_format.ilf_fields))) {
- if (mount_locked)
- IPOINTER_INSERT(ip, mp);
-
- if (flags & SYNC_WAIT) {
- xfs_iflock(ip);
- error = xfs_iflush(ip, XFS_IFLUSH_SYNC);
-
- /*
- * If we can't acquire the flush lock, then the inode
- * is already being flushed so don't bother waiting.
- *
- * If we can lock it then do a delwri flush so we can
- * combine multiple inode flushes in each disk write.
- */
- } else if (xfs_iflock_nowait(ip)) {
- error = xfs_iflush(ip, XFS_IFLUSH_DELWRI);
- } else if (bypassed) {
- (*bypassed)++;
- }
- }
-
- if (lock_flags != 0) {
- xfs_iunlock(ip, lock_flags);
- }
-
- if (vnode_refed) {
- /*
- * If we had to take a reference on the vnode
- * above, then wait until after we've unlocked
- * the inode to release the reference. This is
- * because we can be already holding the inode
- * lock when IRELE() calls xfs_inactive().
- *
- * Make sure to drop the mount lock before calling
- * IRELE() so that we don't trip over ourselves if
- * we have to go for the mount lock again in the
- * inactive code.
- */
- if (mount_locked) {
- IPOINTER_INSERT(ip, mp);
- }
-
- IRELE(ip);
-
- vnode_refed = B_FALSE;
- }
-
- if (error) {
- last_error = error;
- }
-
- /*
- * bail out if the filesystem is corrupted.
- */
- if (error == EFSCORRUPTED) {
- if (!mount_locked) {
- XFS_MOUNT_ILOCK(mp);
- IPOINTER_REMOVE(ip, mp);
- }
- XFS_MOUNT_IUNLOCK(mp);
- ASSERT(ipointer_in == B_FALSE);
- kmem_free(ipointer);
- return XFS_ERROR(error);
- }
-
- /* Let other threads have a chance at the mount lock
- * if we have looped many times without dropping the
- * lock.
- */
- if ((++preempt & XFS_PREEMPT_MASK) == 0) {
- if (mount_locked) {
- IPOINTER_INSERT(ip, mp);
- }
- }
-
- if (mount_locked == B_FALSE) {
- XFS_MOUNT_ILOCK(mp);
- mount_locked = B_TRUE;
- IPOINTER_REMOVE(ip, mp);
- continue;
- }
-
- ASSERT(ipointer_in == B_FALSE);
- ip = ip->i_mnext;
-
- } while (ip != mp->m_inodes);
-
- XFS_MOUNT_IUNLOCK(mp);
-
- ASSERT(ipointer_in == B_FALSE);
-
- kmem_free(ipointer);
- return XFS_ERROR(last_error);
-}
-
-/*
- * xfs sync routine for internal use
- *
- * This routine supports all of the flags defined for the generic vfs_sync
- * interface as explained above under xfs_sync.
- *
- */
-int
-xfs_syncsub(
- xfs_mount_t *mp,
- int flags,
- int *bypassed)
-{
- int error = 0;
- int last_error = 0;
- uint log_flags = XFS_LOG_FORCE;
- xfs_buf_t *bp;
- xfs_buf_log_item_t *bip;
-
- /*
- * Sync out the log. This ensures that the log is periodically
- * flushed even if there is not enough activity to fill it up.
- */
- if (flags & SYNC_WAIT)
- log_flags |= XFS_LOG_SYNC;
-
- xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
-
- if (flags & (SYNC_ATTR|SYNC_DELWRI)) {
- if (flags & SYNC_BDFLUSH)
- xfs_finish_reclaim_all(mp, 1);
- else
- error = xfs_sync_inodes(mp, flags, bypassed);
- }
-
- /*
- * Flushing out dirty data above probably generated more
- * log activity, so if this isn't vfs_sync() then flush
- * the log again.
- */
- if (flags & SYNC_DELWRI) {
- xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
- }
-
- if (flags & SYNC_FSDATA) {
- /*
- * If this is vfs_sync() then only sync the superblock
- * if we can lock it without sleeping and it is not pinned.
- */
- if (flags & SYNC_BDFLUSH) {
- bp = xfs_getsb(mp, XFS_BUF_TRYLOCK);
- if (bp != NULL) {
- bip = XFS_BUF_FSPRIVATE(bp,xfs_buf_log_item_t*);
- if ((bip != NULL) &&
- xfs_buf_item_dirty(bip)) {
- if (!(XFS_BUF_ISPINNED(bp))) {
- XFS_BUF_ASYNC(bp);
- error = xfs_bwrite(mp, bp);
- } else {
- xfs_buf_relse(bp);
- }
- } else {
- xfs_buf_relse(bp);
- }
- }
- } else {
- bp = xfs_getsb(mp, 0);
- /*
- * If the buffer is pinned then push on the log so
- * we won't get stuck waiting in the write for
- * someone, maybe ourselves, to flush the log.
- * Even though we just pushed the log above, we
- * did not have the superblock buffer locked at
- * that point so it can become pinned in between
- * there and here.
- */
- if (XFS_BUF_ISPINNED(bp))
- xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
- if (flags & SYNC_WAIT)
- XFS_BUF_UNASYNC(bp);
- else
- XFS_BUF_ASYNC(bp);
- error = xfs_bwrite(mp, bp);
- }
- if (error) {
- last_error = error;
- }
- }
-
- /*
- * Now check to see if the log needs a "dummy" transaction.
- */
- if (!(flags & SYNC_REMOUNT) && xfs_log_need_covered(mp)) {
- xfs_trans_t *tp;
- xfs_inode_t *ip;
-
- /*
- * Put a dummy transaction in the log to tell
- * recovery that all others are OK.
- */
- tp = xfs_trans_alloc(mp, XFS_TRANS_DUMMY1);
- if ((error = xfs_trans_reserve(tp, 0,
- XFS_ICHANGE_LOG_RES(mp),
- 0, 0, 0))) {
- xfs_trans_cancel(tp, 0);
- return error;
- }
-
- ip = mp->m_rootip;
- xfs_ilock(ip, XFS_ILOCK_EXCL);
-
- xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
- xfs_trans_ihold(tp, ip);
- xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
- error = xfs_trans_commit(tp, 0);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- xfs_log_force(mp, (xfs_lsn_t)0, log_flags);
- }
-
- /*
- * When shutting down, we need to insure that the AIL is pushed
- * to disk or the filesystem can appear corrupt from the PROM.
- */
- if ((flags & (SYNC_CLOSE|SYNC_WAIT)) == (SYNC_CLOSE|SYNC_WAIT)) {
- XFS_bflush(mp->m_ddev_targp);
- if (mp->m_rtdev_targp) {
- XFS_bflush(mp->m_rtdev_targp);
- }
- }
-
- return XFS_ERROR(last_error);
-}
diff --git a/fs/xfs/xfs_vfsops.h b/fs/xfs/xfs_vfsops.h
deleted file mode 100644
index a74b05087da4..000000000000
--- a/fs/xfs/xfs_vfsops.h
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef _XFS_VFSOPS_H
-#define _XFS_VFSOPS_H 1
-
-struct cred;
-struct xfs_fid;
-struct inode;
-struct kstatfs;
-struct xfs_mount;
-struct xfs_mount_args;
-
-int xfs_sync(struct xfs_mount *mp, int flags);
-void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
- int lnnum);
-void xfs_attr_quiesce(struct xfs_mount *mp);
-
-#endif /* _XFS_VFSOPS_H */
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 8b6812f66a15..f07bf8768c3a 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -54,33 +54,10 @@
#include "xfs_vnodeops.h"
int
-xfs_open(
- xfs_inode_t *ip)
-{
- int mode;
-
- if (XFS_FORCED_SHUTDOWN(ip->i_mount))
- return XFS_ERROR(EIO);
-
- /*
- * If it's a directory with any blocks, read-ahead block 0
- * as we're almost certain to have the next operation be a read there.
- */
- if (S_ISDIR(ip->i_d.di_mode) && ip->i_d.di_nextents > 0) {
- mode = xfs_ilock_map_shared(ip);
- if (ip->i_d.di_nextents > 0)
- (void)xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
- xfs_iunlock(ip, mode);
- }
- return 0;
-}
-
-int
xfs_setattr(
struct xfs_inode *ip,
struct iattr *iattr,
- int flags,
- cred_t *credp)
+ int flags)
{
xfs_mount_t *mp = ip->i_mount;
struct inode *inode = VFS_I(ip);
@@ -93,7 +70,6 @@ xfs_setattr(
gid_t gid=0, igid=0;
int timeflags = 0;
struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2;
- int file_owner;
int need_iolock = 1;
xfs_itrace_entry(ip);
@@ -104,6 +80,10 @@ xfs_setattr(
if (XFS_FORCED_SHUTDOWN(mp))
return XFS_ERROR(EIO);
+ code = -inode_change_ok(inode, iattr);
+ if (code)
+ return code;
+
olddquot1 = olddquot2 = NULL;
udqp = gdqp = NULL;
@@ -181,62 +161,8 @@ xfs_setattr(
xfs_ilock(ip, lock_flags);
- /* boolean: are we the file owner? */
- file_owner = (current_fsuid() == ip->i_d.di_uid);
-
- /*
- * Change various properties of a file.
- * Only the owner or users with CAP_FOWNER
- * capability may do these things.
- */
- if (mask & (ATTR_MODE|ATTR_UID|ATTR_GID)) {
- /*
- * CAP_FOWNER overrides the following restrictions:
- *
- * The user ID of the calling process must be equal
- * to the file owner ID, except in cases where the
- * CAP_FSETID capability is applicable.
- */
- if (!file_owner && !capable(CAP_FOWNER)) {
- code = XFS_ERROR(EPERM);
- goto error_return;
- }
-
- /*
- * CAP_FSETID overrides the following restrictions:
- *
- * The effective user ID of the calling process shall match
- * the file owner when setting the set-user-ID and
- * set-group-ID bits on that file.
- *
- * The effective group ID or one of the supplementary group
- * IDs of the calling process shall match the group owner of
- * the file when setting the set-group-ID bit on that file
- */
- if (mask & ATTR_MODE) {
- mode_t m = 0;
-
- if ((iattr->ia_mode & S_ISUID) && !file_owner)
- m |= S_ISUID;
- if ((iattr->ia_mode & S_ISGID) &&
- !in_group_p((gid_t)ip->i_d.di_gid))
- m |= S_ISGID;
-#if 0
- /* Linux allows this, Irix doesn't. */
- if ((iattr->ia_mode & S_ISVTX) && !S_ISDIR(ip->i_d.di_mode))
- m |= S_ISVTX;
-#endif
- if (m && !capable(CAP_FSETID))
- iattr->ia_mode &= ~m;
- }
- }
-
/*
* Change file ownership. Must be the owner or privileged.
- * If the system was configured with the "restricted_chown"
- * option, the owner is not permitted to give away the file,
- * and can change the group id only to a group of which he
- * or she is a member.
*/
if (mask & (ATTR_UID|ATTR_GID)) {
/*
@@ -251,23 +177,6 @@ xfs_setattr(
uid = (mask & ATTR_UID) ? iattr->ia_uid : iuid;
/*
- * CAP_CHOWN overrides the following restrictions:
- *
- * If _POSIX_CHOWN_RESTRICTED is defined, this capability
- * shall override the restriction that a process cannot
- * change the user ID of a file it owns and the restriction
- * that the group ID supplied to the chown() function
- * shall be equal to either the group ID or one of the
- * supplementary group IDs of the calling process.
- */
- if (restricted_chown &&
- (iuid != uid || (igid != gid &&
- !in_group_p((gid_t)gid))) &&
- !capable(CAP_CHOWN)) {
- code = XFS_ERROR(EPERM);
- goto error_return;
- }
- /*
* Do a quota reservation only if uid/gid is actually
* going to change.
*/
@@ -304,36 +213,22 @@ xfs_setattr(
code = XFS_ERROR(EINVAL);
goto error_return;
}
+
/*
* Make sure that the dquots are attached to the inode.
*/
- if ((code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED)))
+ code = XFS_QM_DQATTACH(mp, ip, XFS_QMOPT_ILOCKED);
+ if (code)
goto error_return;
- }
-
- /*
- * Change file access or modified times.
- */
- if (mask & (ATTR_ATIME|ATTR_MTIME)) {
- if (!file_owner) {
- if ((mask & (ATTR_MTIME_SET|ATTR_ATIME_SET)) &&
- !capable(CAP_FOWNER)) {
- code = XFS_ERROR(EPERM);
- goto error_return;
- }
- }
- }
- /*
- * Now we can make the changes. Before we join the inode
- * to the transaction, if ATTR_SIZE is set then take care of
- * the part of the truncation that must be done without the
- * inode lock. This needs to be done before joining the inode
- * to the transaction, because the inode cannot be unlocked
- * once it is a part of the transaction.
- */
- if (mask & ATTR_SIZE) {
- code = 0;
+ /*
+ * Now we can make the changes. Before we join the inode
+ * to the transaction, if ATTR_SIZE is set then take care of
+ * the part of the truncation that must be done without the
+ * inode lock. This needs to be done before joining the inode
+ * to the transaction, because the inode cannot be unlocked
+ * once it is a part of the transaction.
+ */
if (iattr->ia_size > ip->i_size) {
/*
* Do the first part of growing a file: zero any data
@@ -366,7 +261,7 @@ xfs_setattr(
}
/* wait for all I/O to complete */
- vn_iowait(ip);
+ xfs_ioend_wait(ip);
if (!code)
code = xfs_itruncate_data(ip, iattr->ia_size);
@@ -388,17 +283,10 @@ xfs_setattr(
}
commit_flags = XFS_TRANS_RELEASE_LOG_RES;
xfs_ilock(ip, XFS_ILOCK_EXCL);
- }
- if (tp) {
xfs_trans_ijoin(tp, ip, lock_flags);
xfs_trans_ihold(tp, ip);
- }
- /*
- * Truncate file. Must have write permission and not be a directory.
- */
- if (mask & ATTR_SIZE) {
/*
* Only change the c/mtime if we are changing the size
* or we are explicitly asked to change it. This handles
@@ -438,28 +326,13 @@ xfs_setattr(
*/
xfs_iflags_set(ip, XFS_ITRUNCATED);
}
- }
-
- /*
- * Change file access modes.
- */
- if (mask & ATTR_MODE) {
- ip->i_d.di_mode &= S_IFMT;
- ip->i_d.di_mode |= iattr->ia_mode & ~S_IFMT;
-
- inode->i_mode &= S_IFMT;
- inode->i_mode |= iattr->ia_mode & ~S_IFMT;
-
- xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE);
- timeflags |= XFS_ICHGTIME_CHG;
+ } else if (tp) {
+ xfs_trans_ijoin(tp, ip, lock_flags);
+ xfs_trans_ihold(tp, ip);
}
/*
* Change file ownership. Must be the owner or privileged.
- * If the system was configured with the "restricted_chown"
- * option, the owner is not permitted to give away the file,
- * and can change the group id only to a group of which he
- * or she is a member.
*/
if (mask & (ATTR_UID|ATTR_GID)) {
/*
@@ -503,6 +376,24 @@ xfs_setattr(
timeflags |= XFS_ICHGTIME_CHG;
}
+ /*
+ * Change file access modes.
+ */
+ if (mask & ATTR_MODE) {
+ umode_t mode = iattr->ia_mode;
+
+ if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID))
+ mode &= ~S_ISGID;
+
+ ip->i_d.di_mode &= S_IFMT;
+ ip->i_d.di_mode |= mode & ~S_IFMT;
+
+ inode->i_mode &= S_IFMT;
+ inode->i_mode |= mode & ~S_IFMT;
+
+ xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+ timeflags |= XFS_ICHGTIME_CHG;
+ }
/*
* Change file access or modified times.
@@ -713,7 +604,7 @@ xfs_fsync(
return XFS_ERROR(EIO);
/* capture size updates in I/O completion before writing the inode. */
- error = filemap_fdatawait(VFS_I(ip)->i_mapping);
+ error = xfs_wait_on_pages(ip, 0, -1);
if (error)
return XFS_ERROR(error);
@@ -1029,6 +920,12 @@ xfs_inactive_symlink_rmt(
goto error0;
}
/*
+ * transaction commit worked ok so we can drop the extra ticket
+ * reference that we gained in xfs_trans_dup()
+ */
+ xfs_log_ticket_put(tp->t_ticket);
+
+ /*
* Remove the memory for extent descriptions (just bookkeeping).
*/
if (ip->i_df.if_bytes)
@@ -1625,8 +1522,6 @@ xfs_create(
xfs_trans_set_sync(tp);
}
- dp->i_gen++;
-
/*
* Attach the dquot(s) to the inodes and modify them incore.
* These ids of the inode couldn't have changed since the new
@@ -1993,13 +1888,6 @@ xfs_remove(
}
xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
- /*
- * Bump the in memory generation count on the parent
- * directory so that other can know that it has changed.
- */
- dp->i_gen++;
- xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
-
if (is_dir) {
/*
* Drop the link from ip's "..".
@@ -2009,7 +1897,7 @@ xfs_remove(
goto out_bmap_cancel;
/*
- * Drop the link from dp to ip.
+ * Drop the "." link from ip to self.
*/
error = xfs_droplink(tp, ip);
if (error)
@@ -2017,14 +1905,14 @@ xfs_remove(
} else {
/*
* When removing a non-directory we need to log the parent
- * inode here for the i_gen update. For a directory this is
- * done implicitly by the xfs_droplink call for the ".." entry.
+ * inode here. For a directory this is done implicitly
+ * by the xfs_droplink call for the ".." entry.
*/
xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
}
/*
- * Drop the "." link from ip to self.
+ * Drop the link from dp to ip.
*/
error = xfs_droplink(tp, ip);
if (error)
@@ -2178,7 +2066,6 @@ xfs_link(
if (error)
goto abort_return;
xfs_ichgtime(tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
- tdp->i_gen++;
xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE);
error = xfs_bumplink(tp, sip);
@@ -2355,18 +2242,10 @@ xfs_mkdir(
}
xfs_ichgtime(dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
- /*
- * Bump the in memory version number of the parent directory
- * so that other processes accessing it will recognize that
- * the directory has changed.
- */
- dp->i_gen++;
-
error = xfs_dir_init(tp, cdp, dp);
if (error)
goto error2;
- cdp->i_gen = 1;
error = xfs_bumplink(tp, dp);
if (error)
goto error2;
@@ -2653,13 +2532,6 @@ xfs_symlink(
xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE);
/*
- * Bump the in memory version number of the parent directory
- * so that other processes accessing it will recognize that
- * the directory has changed.
- */
- dp->i_gen++;
-
- /*
* If this is a synchronous mount, make sure that the
* symlink transaction goes to disk before returning to
* the user.
@@ -2809,7 +2681,7 @@ xfs_reclaim(
return 0;
}
- vn_iowait(ip);
+ xfs_ioend_wait(ip);
ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
@@ -2833,122 +2705,10 @@ xfs_reclaim(
if (!ip->i_update_core && (ip->i_itemp == NULL)) {
xfs_ilock(ip, XFS_ILOCK_EXCL);
xfs_iflock(ip);
- return xfs_finish_reclaim(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
- } else {
- xfs_mount_t *mp = ip->i_mount;
-
- /* Protect sync and unpin from us */
- XFS_MOUNT_ILOCK(mp);
- spin_lock(&ip->i_flags_lock);
- __xfs_iflags_set(ip, XFS_IRECLAIMABLE);
- VFS_I(ip)->i_private = NULL;
- ip->i_vnode = NULL;
- spin_unlock(&ip->i_flags_lock);
- list_add_tail(&ip->i_reclaim, &mp->m_del_inodes);
- XFS_MOUNT_IUNLOCK(mp);
- }
- return 0;
-}
-
-int
-xfs_finish_reclaim(
- xfs_inode_t *ip,
- int locked,
- int sync_mode)
-{
- xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino);
- struct inode *vp = VFS_I(ip);
-
- if (vp && VN_BAD(vp))
- goto reclaim;
-
- /* The hash lock here protects a thread in xfs_iget_core from
- * racing with us on linking the inode back with a vnode.
- * Once we have the XFS_IRECLAIM flag set it will not touch
- * us.
- */
- write_lock(&pag->pag_ici_lock);
- spin_lock(&ip->i_flags_lock);
- if (__xfs_iflags_test(ip, XFS_IRECLAIM) ||
- (!__xfs_iflags_test(ip, XFS_IRECLAIMABLE) && vp == NULL)) {
- spin_unlock(&ip->i_flags_lock);
- write_unlock(&pag->pag_ici_lock);
- if (locked) {
- xfs_ifunlock(ip);
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- }
- return 1;
- }
- __xfs_iflags_set(ip, XFS_IRECLAIM);
- spin_unlock(&ip->i_flags_lock);
- write_unlock(&pag->pag_ici_lock);
- xfs_put_perag(ip->i_mount, pag);
-
- /*
- * If the inode is still dirty, then flush it out. If the inode
- * is not in the AIL, then it will be OK to flush it delwri as
- * long as xfs_iflush() does not keep any references to the inode.
- * We leave that decision up to xfs_iflush() since it has the
- * knowledge of whether it's OK to simply do a delwri flush of
- * the inode or whether we need to wait until the inode is
- * pulled from the AIL.
- * We get the flush lock regardless, though, just to make sure
- * we don't free it while it is being flushed.
- */
- if (!locked) {
- xfs_ilock(ip, XFS_ILOCK_EXCL);
- xfs_iflock(ip);
+ xfs_iflags_set(ip, XFS_IRECLAIMABLE);
+ return xfs_reclaim_inode(ip, 1, XFS_IFLUSH_DELWRI_ELSE_SYNC);
}
-
- /*
- * In the case of a forced shutdown we rely on xfs_iflush() to
- * wait for the inode to be unpinned before returning an error.
- */
- if (xfs_iflush(ip, sync_mode) == 0) {
- /* synchronize with xfs_iflush_done */
- xfs_iflock(ip);
- xfs_ifunlock(ip);
- }
-
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
-
- reclaim:
- xfs_ireclaim(ip);
- return 0;
-}
-
-int
-xfs_finish_reclaim_all(xfs_mount_t *mp, int noblock)
-{
- int purged;
- xfs_inode_t *ip, *n;
- int done = 0;
-
- while (!done) {
- purged = 0;
- XFS_MOUNT_ILOCK(mp);
- list_for_each_entry_safe(ip, n, &mp->m_del_inodes, i_reclaim) {
- if (noblock) {
- if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL) == 0)
- continue;
- if (xfs_ipincount(ip) ||
- !xfs_iflock_nowait(ip)) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- continue;
- }
- }
- XFS_MOUNT_IUNLOCK(mp);
- if (xfs_finish_reclaim(ip, noblock,
- XFS_IFLUSH_DELWRI_ELSE_ASYNC))
- delay(1);
- purged = 1;
- break;
- }
-
- done = !purged;
- }
-
- XFS_MOUNT_IUNLOCK(mp);
+ xfs_inode_set_reclaim_tag(ip);
return 0;
}
@@ -3197,6 +2957,8 @@ xfs_zero_remaining_bytes(
bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize,
XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp);
+ if (!bp)
+ return XFS_ERROR(ENOMEM);
for (offset = startoff; offset <= endoff; offset = lastoffset + 1) {
offset_fsb = XFS_B_TO_FSBT(mp, offset);
@@ -3312,7 +3074,8 @@ xfs_free_file_space(
need_iolock = 0;
if (need_iolock) {
xfs_ilock(ip, XFS_IOLOCK_EXCL);
- vn_iowait(ip); /* wait for the completion of any pending DIOs */
+ /* wait for the completion of any pending DIOs */
+ xfs_ioend_wait(ip);
}
rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
@@ -3474,7 +3237,6 @@ xfs_change_file_space(
int cmd,
xfs_flock64_t *bf,
xfs_off_t offset,
- cred_t *credp,
int attr_flags)
{
xfs_mount_t *mp = ip->i_mount;
@@ -3562,7 +3324,7 @@ xfs_change_file_space(
iattr.ia_valid = ATTR_SIZE;
iattr.ia_size = startoffset;
- error = xfs_setattr(ip, &iattr, attr_flags, credp);
+ error = xfs_setattr(ip, &iattr, attr_flags);
if (error)
return error;
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 7b0c2ab88333..76df328c61b4 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -14,9 +14,7 @@ struct xfs_inode;
struct xfs_iomap;
-int xfs_open(struct xfs_inode *ip);
-int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags,
- cred_t *credp);
+int xfs_setattr(struct xfs_inode *ip, struct iattr *vap, int flags);
#define XFS_ATTR_DMI 0x01 /* invocation from a DMI function */
#define XFS_ATTR_NONBLOCK 0x02 /* return EAGAIN if operation would block */
#define XFS_ATTR_NOLOCK 0x04 /* Don't grab any conflicting locks */
@@ -44,8 +42,7 @@ int xfs_inode_flush(struct xfs_inode *ip, int flags);
int xfs_set_dmattrs(struct xfs_inode *ip, u_int evmask, u_int16_t state);
int xfs_reclaim(struct xfs_inode *ip);
int xfs_change_file_space(struct xfs_inode *ip, int cmd,
- xfs_flock64_t *bf, xfs_off_t offset,
- cred_t *credp, int attr_flags);
+ xfs_flock64_t *bf, xfs_off_t offset, int attr_flags);
int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
struct xfs_inode *src_ip, struct xfs_inode *target_dp,
struct xfs_name *target_name, struct xfs_inode *target_ip);
@@ -56,8 +53,6 @@ int xfs_attr_set(struct xfs_inode *dp, const char *name, char *value,
int xfs_attr_remove(struct xfs_inode *dp, const char *name, int flags);
int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
int flags, struct attrlist_cursor_kern *cursor);
-int xfs_ioctl(struct xfs_inode *ip, struct file *filp,
- int ioflags, unsigned int cmd, void __user *arg);
ssize_t xfs_read(struct xfs_inode *ip, struct kiocb *iocb,
const struct iovec *iovp, unsigned int segs,
loff_t *offset, int ioflags);
@@ -78,5 +73,6 @@ int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first,
xfs_off_t last, int fiopt);
int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first,
xfs_off_t last, uint64_t flags, int fiopt);
+int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last);
#endif /* _XFS_VNODEOPS_H */