aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/binfmt_aout.c5
-rw-r--r--fs/binfmt_elf.c5
-rw-r--r--fs/binfmt_elf_fdpic.c6
-rw-r--r--fs/binfmt_em86.c4
-rw-r--r--fs/binfmt_flat.c5
-rw-r--r--fs/binfmt_misc.c4
-rw-r--r--fs/binfmt_script.c4
-rw-r--r--fs/binfmt_som.c5
-rw-r--r--fs/bio.c6
-rw-r--r--fs/block_dev.c129
-rw-r--r--fs/btrfs/disk-io.c8
-rw-r--r--fs/btrfs/file.c3
-rw-r--r--fs/btrfs/ioctl.c2
-rw-r--r--fs/buffer.c157
-rw-r--r--fs/ceph/export.c2
-rw-r--r--fs/cifs/Kconfig10
-rw-r--r--fs/cifs/cifs_debug.h72
-rw-r--r--fs/cifs/cifsacl.c765
-rw-r--r--fs/cifs/cifsacl.h66
-rw-r--r--fs/cifs/cifsfs.c17
-rw-r--r--fs/cifs/cifsglob.h36
-rw-r--r--fs/cifs/cifsproto.h10
-rw-r--r--fs/cifs/connect.c310
-rw-r--r--fs/cifs/dir.c43
-rw-r--r--fs/cifs/file.c206
-rw-r--r--fs/cifs/inode.c7
-rw-r--r--fs/cifs/netmisc.c14
-rw-r--r--fs/cifs/readdir.c55
-rw-r--r--fs/cifs/smb1ops.c35
-rw-r--r--fs/cifs/smb2file.c12
-rw-r--r--fs/cifs/smb2ops.c103
-rw-r--r--fs/cifs/smb2pdu.c5
-rw-r--r--fs/cifs/smb2proto.h4
-rw-r--r--fs/cifs/smb2transport.c13
-rw-r--r--fs/compat_ioctl.c3
-rw-r--r--fs/coredump.c4
-rw-r--r--fs/debugfs/inode.c1
-rw-r--r--fs/devpts/inode.c61
-rw-r--r--fs/direct-io.c8
-rw-r--r--fs/dlm/Kconfig2
-rw-r--r--fs/dlm/dlm_internal.h1
-rw-r--r--fs/dlm/lock.c16
-rw-r--r--fs/dlm/lowcomms.c5
-rw-r--r--fs/dlm/recover.c37
-rw-r--r--fs/eventpoll.c38
-rw-r--r--fs/exec.c37
-rw-r--r--fs/ext3/balloc.c5
-rw-r--r--fs/ext4/ialloc.c19
-rw-r--r--fs/file.c25
-rw-r--r--fs/fs-writeback.c2
-rw-r--r--fs/fs_struct.c24
-rw-r--r--fs/gfs2/file.c14
-rw-r--r--fs/gfs2/glock.c2
-rw-r--r--fs/gfs2/lops.c16
-rw-r--r--fs/gfs2/quota.c7
-rw-r--r--fs/gfs2/rgrp.c33
-rw-r--r--fs/gfs2/super.c3
-rw-r--r--fs/gfs2/trans.c8
-rw-r--r--fs/hugetlbfs/inode.c109
-rw-r--r--fs/inode.c18
-rw-r--r--fs/internal.h1
-rw-r--r--fs/jbd/transaction.c2
-rw-r--r--fs/jffs2/file.c39
-rw-r--r--fs/namei.c5
-rw-r--r--fs/nfs/dir.c7
-rw-r--r--fs/nfs/dns_resolve.c5
-rw-r--r--fs/nfs/inode.c5
-rw-r--r--fs/nfs/internal.h6
-rw-r--r--fs/nfs/mount_clnt.c2
-rw-r--r--fs/nfs/namespace.c19
-rw-r--r--fs/nfs/nfs4namespace.c3
-rw-r--r--fs/nfs/nfs4proc.c46
-rw-r--r--fs/nfs/pnfs.c4
-rw-r--r--fs/nfs/super.c51
-rw-r--r--fs/nfs/unlink.c2
-rw-r--r--fs/nilfs2/page.c2
-rw-r--r--fs/notify/fanotify/fanotify.c1
-rw-r--r--fs/notify/fanotify/fanotify_user.c3
-rw-r--r--fs/ocfs2/file.c5
-rw-r--r--fs/proc/array.c4
-rw-r--r--fs/proc/base.c124
-rw-r--r--fs/proc/proc_sysctl.c9
-rw-r--r--fs/pstore/inode.c7
-rw-r--r--fs/pstore/internal.h2
-rw-r--r--fs/pstore/platform.c14
-rw-r--r--fs/pstore/ram.c9
-rw-r--r--fs/reiserfs/inode.c10
-rw-r--r--fs/reiserfs/stree.c4
-rw-r--r--fs/reiserfs/super.c60
-rw-r--r--fs/splice.c5
-rw-r--r--fs/sysfs/file.c4
-rw-r--r--fs/ubifs/find.c12
-rw-r--r--fs/ubifs/lprops.c6
-rw-r--r--fs/ubifs/ubifs.h3
-rw-r--r--fs/xfs/Kconfig1
-rw-r--r--fs/xfs/Makefile4
-rw-r--r--fs/xfs/uuid.h6
-rw-r--r--fs/xfs/xfs_ag.h5
-rw-r--r--fs/xfs/xfs_alloc.c183
-rw-r--r--fs/xfs/xfs_alloc.h6
-rw-r--r--fs/xfs/xfs_alloc_btree.c79
-rw-r--r--fs/xfs/xfs_alloc_btree.h2
-rw-r--r--fs/xfs/xfs_aops.c137
-rw-r--r--fs/xfs/xfs_attr.c103
-rw-r--r--fs/xfs/xfs_attr_leaf.c163
-rw-r--r--fs/xfs/xfs_attr_leaf.h6
-rw-r--r--fs/xfs/xfs_bmap.c127
-rw-r--r--fs/xfs/xfs_bmap.h9
-rw-r--r--fs/xfs/xfs_bmap_btree.c63
-rw-r--r--fs/xfs/xfs_bmap_btree.h1
-rw-r--r--fs/xfs/xfs_btree.c111
-rw-r--r--fs/xfs/xfs_btree.h22
-rw-r--r--fs/xfs/xfs_buf.c73
-rw-r--r--fs/xfs/xfs_buf.h27
-rw-r--r--fs/xfs/xfs_buf_item.c18
-rw-r--r--fs/xfs/xfs_cksum.h63
-rw-r--r--fs/xfs/xfs_da_btree.c141
-rw-r--r--fs/xfs/xfs_da_btree.h10
-rw-r--r--fs/xfs/xfs_dfrag.c13
-rw-r--r--fs/xfs/xfs_dir2_block.c436
-rw-r--r--fs/xfs/xfs_dir2_data.c170
-rw-r--r--fs/xfs/xfs_dir2_leaf.c172
-rw-r--r--fs/xfs/xfs_dir2_node.c288
-rw-r--r--fs/xfs/xfs_dir2_priv.h19
-rw-r--r--fs/xfs/xfs_dquot.c134
-rw-r--r--fs/xfs/xfs_dquot.h2
-rw-r--r--fs/xfs/xfs_export.c1
-rw-r--r--fs/xfs/xfs_file.c42
-rw-r--r--fs/xfs/xfs_fs.h33
-rw-r--r--fs/xfs/xfs_fs_subr.c96
-rw-r--r--fs/xfs/xfs_fsops.c158
-rw-r--r--fs/xfs/xfs_globals.c4
-rw-r--r--fs/xfs/xfs_ialloc.c84
-rw-r--r--fs/xfs/xfs_ialloc.h4
-rw-r--r--fs/xfs/xfs_ialloc_btree.c55
-rw-r--r--fs/xfs/xfs_ialloc_btree.h2
-rw-r--r--fs/xfs/xfs_icache.c (renamed from fs/xfs/xfs_sync.c)914
-rw-r--r--fs/xfs/xfs_icache.h (renamed from fs/xfs/xfs_sync.h)28
-rw-r--r--fs/xfs/xfs_iget.c705
-rw-r--r--fs/xfs/xfs_inode.c440
-rw-r--r--fs/xfs/xfs_inode.h12
-rw-r--r--fs/xfs/xfs_ioctl.c23
-rw-r--r--fs/xfs/xfs_iomap.c35
-rw-r--r--fs/xfs/xfs_iops.c8
-rw-r--r--fs/xfs/xfs_itable.c4
-rw-r--r--fs/xfs/xfs_linux.h2
-rw-r--r--fs/xfs/xfs_log.c260
-rw-r--r--fs/xfs/xfs_log.h4
-rw-r--r--fs/xfs/xfs_log_priv.h12
-rw-r--r--fs/xfs/xfs_log_recover.c148
-rw-r--r--fs/xfs/xfs_mount.c163
-rw-r--r--fs/xfs/xfs_mount.h13
-rw-r--r--fs/xfs/xfs_qm.c22
-rw-r--r--fs/xfs/xfs_qm_syscalls.c6
-rw-r--r--fs/xfs/xfs_rtalloc.c16
-rw-r--r--fs/xfs/xfs_sb.h7
-rw-r--r--fs/xfs/xfs_super.c148
-rw-r--r--fs/xfs/xfs_super.h1
-rw-r--r--fs/xfs/xfs_sysctl.c9
-rw-r--r--fs/xfs/xfs_sysctl.h1
-rw-r--r--fs/xfs/xfs_trace.h60
-rw-r--r--fs/xfs/xfs_trans.h19
-rw-r--r--fs/xfs/xfs_trans_buf.c9
-rw-r--r--fs/xfs/xfs_vnodeops.c168
-rw-r--r--fs/xfs/xfs_vnodeops.h9
165 files changed, 5528 insertions, 3898 deletions
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 0e7a6f81ae36..6043567b95c2 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -30,7 +30,7 @@
#include <asm/cacheflush.h>
#include <asm/a.out-core.h>
-static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs);
+static int load_aout_binary(struct linux_binprm *);
static int load_aout_library(struct file*);
#ifdef CONFIG_COREDUMP
@@ -201,8 +201,9 @@ static unsigned long __user *create_aout_tables(char __user *p, struct linux_bin
* libraries. There is no binary dependent code anywhere else.
*/
-static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
+static int load_aout_binary(struct linux_binprm * bprm)
{
+ struct pt_regs *regs = current_pt_regs();
struct exec ex;
unsigned long error;
unsigned long fd_offset;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index fbd9f60bd763..6d7d1647a68c 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -44,7 +44,7 @@
#define user_siginfo_t siginfo_t
#endif
-static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs);
+static int load_elf_binary(struct linux_binprm *bprm);
static int load_elf_library(struct file *);
static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *,
int, int, unsigned long);
@@ -558,7 +558,7 @@ static unsigned long randomize_stack_top(unsigned long stack_top)
#endif
}
-static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
+static int load_elf_binary(struct linux_binprm *bprm)
{
struct file *interpreter = NULL; /* to shut gcc up */
unsigned long load_addr = 0, load_bias = 0;
@@ -575,6 +575,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
unsigned long reloc_func_desc __maybe_unused = 0;
int executable_stack = EXSTACK_DEFAULT;
unsigned long def_flags = 0;
+ struct pt_regs *regs = current_pt_regs();
struct {
struct elfhdr elf_ex;
struct elfhdr interp_elf_ex;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index a46049154107..dc84732e554f 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -56,7 +56,7 @@ typedef char *elf_caddr_t;
MODULE_LICENSE("GPL");
-static int load_elf_fdpic_binary(struct linux_binprm *, struct pt_regs *);
+static int load_elf_fdpic_binary(struct linux_binprm *);
static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *, struct file *);
static int elf_fdpic_map_file(struct elf_fdpic_params *, struct file *,
struct mm_struct *, const char *);
@@ -164,10 +164,10 @@ static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params,
/*
* load an fdpic binary into various bits of memory
*/
-static int load_elf_fdpic_binary(struct linux_binprm *bprm,
- struct pt_regs *regs)
+static int load_elf_fdpic_binary(struct linux_binprm *bprm)
{
struct elf_fdpic_params exec_params, interp_params;
+ struct pt_regs *regs = current_pt_regs();
struct elf_phdr *phdr;
unsigned long stack_size, entryaddr;
#ifdef ELF_FDPIC_PLAT_INIT
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c
index 2790c7e1912e..4e6cce57d113 100644
--- a/fs/binfmt_em86.c
+++ b/fs/binfmt_em86.c
@@ -22,7 +22,7 @@
#define EM86_INTERP "/usr/bin/em86"
#define EM86_I_NAME "em86"
-static int load_em86(struct linux_binprm *bprm,struct pt_regs *regs)
+static int load_em86(struct linux_binprm *bprm)
{
char *interp, *i_name, *i_arg;
struct file * file;
@@ -90,7 +90,7 @@ static int load_em86(struct linux_binprm *bprm,struct pt_regs *regs)
if (retval < 0)
return retval;
- return search_binary_handler(bprm, regs);
+ return search_binary_handler(bprm);
}
static struct linux_binfmt em86_format = {
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index e280352b28f9..b56371981d16 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -88,7 +88,7 @@ struct lib_info {
static int load_flat_shared_library(int id, struct lib_info *p);
#endif
-static int load_flat_binary(struct linux_binprm *, struct pt_regs * regs);
+static int load_flat_binary(struct linux_binprm *);
static int flat_core_dump(struct coredump_params *cprm);
static struct linux_binfmt flat_format = {
@@ -858,9 +858,10 @@ out:
* libraries. There is no binary dependent code anywhere else.
*/
-static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
+static int load_flat_binary(struct linux_binprm * bprm)
{
struct lib_info libinfo;
+ struct pt_regs *regs = current_pt_regs();
unsigned long p = bprm->p;
unsigned long stack_len;
unsigned long start_addr;
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 790b3cddca67..b0b70fbea06c 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -104,7 +104,7 @@ static Node *check_file(struct linux_binprm *bprm)
/*
* the loader itself
*/
-static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
+static int load_misc_binary(struct linux_binprm *bprm)
{
Node *fmt;
struct file * interp_file = NULL;
@@ -199,7 +199,7 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
bprm->recursion_depth++;
- retval = search_binary_handler (bprm, regs);
+ retval = search_binary_handler(bprm);
if (retval < 0)
goto _error;
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index d3b8c1f63155..8c954997e7f7 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -14,7 +14,7 @@
#include <linux/err.h>
#include <linux/fs.h>
-static int load_script(struct linux_binprm *bprm,struct pt_regs *regs)
+static int load_script(struct linux_binprm *bprm)
{
const char *i_arg, *i_name;
char *cp;
@@ -95,7 +95,7 @@ static int load_script(struct linux_binprm *bprm,struct pt_regs *regs)
retval = prepare_binprm(bprm);
if (retval < 0)
return retval;
- return search_binary_handler(bprm,regs);
+ return search_binary_handler(bprm);
}
static struct linux_binfmt script_format = {
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index 4517aaff61b4..4e00ed68d4a6 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -35,7 +35,7 @@
#include <linux/elf.h>
-static int load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs);
+static int load_som_binary(struct linux_binprm * bprm);
static int load_som_library(struct file *);
/*
@@ -180,13 +180,14 @@ out:
*/
static int
-load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
+load_som_binary(struct linux_binprm * bprm)
{
int retval;
unsigned int size;
unsigned long som_entry;
struct som_hdr *som_ex;
struct som_exec_auxhdr *hpuxhdr;
+ struct pt_regs *regs = current_pt_regs();
/* Get the exec-header */
som_ex = (struct som_hdr *) bprm->buf;
diff --git a/fs/bio.c b/fs/bio.c
index 9298c65ad9c7..b96fc6ce4855 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -75,6 +75,7 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
unsigned int sz = sizeof(struct bio) + extra_size;
struct kmem_cache *slab = NULL;
struct bio_slab *bslab, *new_bio_slabs;
+ unsigned int new_bio_slab_max;
unsigned int i, entry = -1;
mutex_lock(&bio_slab_lock);
@@ -97,12 +98,13 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
goto out_unlock;
if (bio_slab_nr == bio_slab_max && entry == -1) {
- bio_slab_max <<= 1;
+ new_bio_slab_max = bio_slab_max << 1;
new_bio_slabs = krealloc(bio_slabs,
- bio_slab_max * sizeof(struct bio_slab),
+ new_bio_slab_max * sizeof(struct bio_slab),
GFP_KERNEL);
if (!new_bio_slabs)
goto out_unlock;
+ bio_slab_max = new_bio_slab_max;
bio_slabs = new_bio_slabs;
}
if (entry == -1)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index b3c1d3dae77d..ab3a456f6650 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -70,19 +70,6 @@ static void bdev_inode_switch_bdi(struct inode *inode,
spin_unlock(&dst->wb.list_lock);
}
-sector_t blkdev_max_block(struct block_device *bdev)
-{
- sector_t retval = ~((sector_t)0);
- loff_t sz = i_size_read(bdev->bd_inode);
-
- if (sz) {
- unsigned int size = block_size(bdev);
- unsigned int sizebits = blksize_bits(size);
- retval = (sz >> sizebits);
- }
- return retval;
-}
-
/* Kill _all_ buffers and pagecache , dirty or not.. */
void kill_bdev(struct block_device *bdev)
{
@@ -116,8 +103,6 @@ EXPORT_SYMBOL(invalidate_bdev);
int set_blocksize(struct block_device *bdev, int size)
{
- struct address_space *mapping;
-
/* Size must be a power of two, and between 512 and PAGE_SIZE */
if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
return -EINVAL;
@@ -126,19 +111,6 @@ int set_blocksize(struct block_device *bdev, int size)
if (size < bdev_logical_block_size(bdev))
return -EINVAL;
- /* Prevent starting I/O or mapping the device */
- percpu_down_write(&bdev->bd_block_size_semaphore);
-
- /* Check that the block device is not memory mapped */
- mapping = bdev->bd_inode->i_mapping;
- mutex_lock(&mapping->i_mmap_mutex);
- if (mapping_mapped(mapping)) {
- mutex_unlock(&mapping->i_mmap_mutex);
- percpu_up_write(&bdev->bd_block_size_semaphore);
- return -EBUSY;
- }
- mutex_unlock(&mapping->i_mmap_mutex);
-
/* Don't change the size if it is same as current */
if (bdev->bd_block_size != size) {
sync_blockdev(bdev);
@@ -146,9 +118,6 @@ int set_blocksize(struct block_device *bdev, int size)
bdev->bd_inode->i_blkbits = blksize_bits(size);
kill_bdev(bdev);
}
-
- percpu_up_write(&bdev->bd_block_size_semaphore);
-
return 0;
}
@@ -181,52 +150,12 @@ static int
blkdev_get_block(struct inode *inode, sector_t iblock,
struct buffer_head *bh, int create)
{
- if (iblock >= blkdev_max_block(I_BDEV(inode))) {
- if (create)
- return -EIO;
-
- /*
- * for reads, we're just trying to fill a partial page.
- * return a hole, they will have to call get_block again
- * before they can fill it, and they will get -EIO at that
- * time
- */
- return 0;
- }
bh->b_bdev = I_BDEV(inode);
bh->b_blocknr = iblock;
set_buffer_mapped(bh);
return 0;
}
-static int
-blkdev_get_blocks(struct inode *inode, sector_t iblock,
- struct buffer_head *bh, int create)
-{
- sector_t end_block = blkdev_max_block(I_BDEV(inode));
- unsigned long max_blocks = bh->b_size >> inode->i_blkbits;
-
- if ((iblock + max_blocks) > end_block) {
- max_blocks = end_block - iblock;
- if ((long)max_blocks <= 0) {
- if (create)
- return -EIO; /* write fully beyond EOF */
- /*
- * It is a read which is fully beyond EOF. We return
- * a !buffer_mapped buffer
- */
- max_blocks = 0;
- }
- }
-
- bh->b_bdev = I_BDEV(inode);
- bh->b_blocknr = iblock;
- bh->b_size = max_blocks << inode->i_blkbits;
- if (max_blocks)
- set_buffer_mapped(bh);
- return 0;
-}
-
static ssize_t
blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
loff_t offset, unsigned long nr_segs)
@@ -235,7 +164,7 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
struct inode *inode = file->f_mapping->host;
return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset,
- nr_segs, blkdev_get_blocks, NULL, NULL, 0);
+ nr_segs, blkdev_get_block, NULL, NULL, 0);
}
int __sync_blockdev(struct block_device *bdev, int wait)
@@ -459,12 +388,6 @@ static struct inode *bdev_alloc_inode(struct super_block *sb)
struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
if (!ei)
return NULL;
-
- if (unlikely(percpu_init_rwsem(&ei->bdev.bd_block_size_semaphore))) {
- kmem_cache_free(bdev_cachep, ei);
- return NULL;
- }
-
return &ei->vfs_inode;
}
@@ -473,8 +396,6 @@ static void bdev_i_callback(struct rcu_head *head)
struct inode *inode = container_of(head, struct inode, i_rcu);
struct bdev_inode *bdi = BDEV_I(inode);
- percpu_free_rwsem(&bdi->bdev.bd_block_size_semaphore);
-
kmem_cache_free(bdev_cachep, bdi);
}
@@ -1593,22 +1514,6 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
return blkdev_ioctl(bdev, mode, cmd, arg);
}
-ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
-{
- ssize_t ret;
- struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
-
- percpu_down_read(&bdev->bd_block_size_semaphore);
-
- ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
-
- percpu_up_read(&bdev->bd_block_size_semaphore);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(blkdev_aio_read);
-
/*
* Write data to the block device. Only intended for the block device itself
* and the raw driver which basically is a fake block device.
@@ -1620,16 +1525,12 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
unsigned long nr_segs, loff_t pos)
{
struct file *file = iocb->ki_filp;
- struct block_device *bdev = I_BDEV(file->f_mapping->host);
struct blk_plug plug;
ssize_t ret;
BUG_ON(iocb->ki_pos != pos);
blk_start_plug(&plug);
-
- percpu_down_read(&bdev->bd_block_size_semaphore);
-
ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
if (ret > 0 || ret == -EIOCBQUEUED) {
ssize_t err;
@@ -1638,27 +1539,25 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
if (err < 0 && ret > 0)
ret = err;
}
-
- percpu_up_read(&bdev->bd_block_size_semaphore);
-
blk_finish_plug(&plug);
-
return ret;
}
EXPORT_SYMBOL_GPL(blkdev_aio_write);
-static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
+static ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos)
{
- int ret;
- struct block_device *bdev = I_BDEV(file->f_mapping->host);
-
- percpu_down_read(&bdev->bd_block_size_semaphore);
-
- ret = generic_file_mmap(file, vma);
+ struct file *file = iocb->ki_filp;
+ struct inode *bd_inode = file->f_mapping->host;
+ loff_t size = i_size_read(bd_inode);
- percpu_up_read(&bdev->bd_block_size_semaphore);
+ if (pos >= size)
+ return 0;
- return ret;
+ size -= pos;
+ if (size < INT_MAX)
+ nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size);
+ return generic_file_aio_read(iocb, iov, nr_segs, pos);
}
/*
@@ -1691,9 +1590,9 @@ const struct file_operations def_blk_fops = {
.llseek = block_llseek,
.read = do_sync_read,
.write = do_sync_write,
- .aio_read = blkdev_aio_read,
+ .aio_read = blkdev_aio_read,
.aio_write = blkdev_aio_write,
- .mmap = blkdev_mmap,
+ .mmap = generic_file_mmap,
.fsync = blkdev_fsync,
.unlocked_ioctl = block_ioctl,
#ifdef CONFIG_COMPAT
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7cda51995c1e..22a0439e5a86 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3416,8 +3416,8 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
num_dirty = root->fs_info->dirty_metadata_bytes;
if (num_dirty > thresh) {
- balance_dirty_pages_ratelimited_nr(
- root->fs_info->btree_inode->i_mapping, 1);
+ balance_dirty_pages_ratelimited(
+ root->fs_info->btree_inode->i_mapping);
}
return;
}
@@ -3437,8 +3437,8 @@ void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
num_dirty = root->fs_info->dirty_metadata_bytes;
if (num_dirty > thresh) {
- balance_dirty_pages_ratelimited_nr(
- root->fs_info->btree_inode->i_mapping, 1);
+ balance_dirty_pages_ratelimited(
+ root->fs_info->btree_inode->i_mapping);
}
return;
}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9ab1bed88116..a8ee75cb96ee 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1346,8 +1346,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
cond_resched();
- balance_dirty_pages_ratelimited_nr(inode->i_mapping,
- dirty_pages);
+ balance_dirty_pages_ratelimited(inode->i_mapping);
if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
btrfs_btree_balance_dirty(root, 1);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8fcf9a59c28d..5b3429ab8ec1 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1225,7 +1225,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
}
defrag_count += ret;
- balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret);
+ balance_dirty_pages_ratelimited(inode->i_mapping);
mutex_unlock(&inode->i_mutex);
if (newer_than) {
diff --git a/fs/buffer.c b/fs/buffer.c
index b5f044283edb..6e9ed48064fc 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -555,7 +555,7 @@ void emergency_thaw_all(void)
*/
int sync_mapping_buffers(struct address_space *mapping)
{
- struct address_space *buffer_mapping = mapping->assoc_mapping;
+ struct address_space *buffer_mapping = mapping->private_data;
if (buffer_mapping == NULL || list_empty(&mapping->private_list))
return 0;
@@ -588,10 +588,10 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
struct address_space *buffer_mapping = bh->b_page->mapping;
mark_buffer_dirty(bh);
- if (!mapping->assoc_mapping) {
- mapping->assoc_mapping = buffer_mapping;
+ if (!mapping->private_data) {
+ mapping->private_data = buffer_mapping;
} else {
- BUG_ON(mapping->assoc_mapping != buffer_mapping);
+ BUG_ON(mapping->private_data != buffer_mapping);
}
if (!bh->b_assoc_map) {
spin_lock(&buffer_mapping->private_lock);
@@ -788,7 +788,7 @@ void invalidate_inode_buffers(struct inode *inode)
if (inode_has_buffers(inode)) {
struct address_space *mapping = &inode->i_data;
struct list_head *list = &mapping->private_list;
- struct address_space *buffer_mapping = mapping->assoc_mapping;
+ struct address_space *buffer_mapping = mapping->private_data;
spin_lock(&buffer_mapping->private_lock);
while (!list_empty(list))
@@ -811,7 +811,7 @@ int remove_inode_buffers(struct inode *inode)
if (inode_has_buffers(inode)) {
struct address_space *mapping = &inode->i_data;
struct list_head *list = &mapping->private_list;
- struct address_space *buffer_mapping = mapping->assoc_mapping;
+ struct address_space *buffer_mapping = mapping->private_data;
spin_lock(&buffer_mapping->private_lock);
while (!list_empty(list)) {
@@ -911,6 +911,18 @@ link_dev_buffers(struct page *page, struct buffer_head *head)
attach_page_buffers(page, head);
}
+static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
+{
+ sector_t retval = ~((sector_t)0);
+ loff_t sz = i_size_read(bdev->bd_inode);
+
+ if (sz) {
+ unsigned int sizebits = blksize_bits(size);
+ retval = (sz >> sizebits);
+ }
+ return retval;
+}
+
/*
* Initialise the state of a blockdev page's buffers.
*/
@@ -921,7 +933,7 @@ init_page_buffers(struct page *page, struct block_device *bdev,
struct buffer_head *head = page_buffers(page);
struct buffer_head *bh = head;
int uptodate = PageUptodate(page);
- sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode));
+ sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size);
do {
if (!buffer_mapped(bh)) {
@@ -1553,6 +1565,28 @@ void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
EXPORT_SYMBOL(unmap_underlying_metadata);
/*
+ * Size is a power-of-two in the range 512..PAGE_SIZE,
+ * and the case we care about most is PAGE_SIZE.
+ *
+ * So this *could* possibly be written with those
+ * constraints in mind (relevant mostly if some
+ * architecture has a slow bit-scan instruction)
+ */
+static inline int block_size_bits(unsigned int blocksize)
+{
+ return ilog2(blocksize);
+}
+
+static struct buffer_head *create_page_buffers(struct page *page, struct inode *inode, unsigned int b_state)
+{
+ BUG_ON(!PageLocked(page));
+
+ if (!page_has_buffers(page))
+ create_empty_buffers(page, 1 << ACCESS_ONCE(inode->i_blkbits), b_state);
+ return page_buffers(page);
+}
+
+/*
* NOTE! All mapped/uptodate combinations are valid:
*
* Mapped Uptodate Meaning
@@ -1589,19 +1623,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
sector_t block;
sector_t last_block;
struct buffer_head *bh, *head;
- const unsigned blocksize = 1 << inode->i_blkbits;
+ unsigned int blocksize, bbits;
int nr_underway = 0;
int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
WRITE_SYNC : WRITE);
- BUG_ON(!PageLocked(page));
-
- last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
-
- if (!page_has_buffers(page)) {
- create_empty_buffers(page, blocksize,
+ head = create_page_buffers(page, inode,
(1 << BH_Dirty)|(1 << BH_Uptodate));
- }
/*
* Be very careful. We have no exclusion from __set_page_dirty_buffers
@@ -1613,9 +1641,12 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
* handle that here by just cleaning them.
*/
- block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
- head = page_buffers(page);
bh = head;
+ blocksize = bh->b_size;
+ bbits = block_size_bits(blocksize);
+
+ block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
+ last_block = (i_size_read(inode) - 1) >> bbits;
/*
* Get all the dirty buffers mapped to disk addresses and
@@ -1806,12 +1837,10 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
BUG_ON(to > PAGE_CACHE_SIZE);
BUG_ON(from > to);
- blocksize = 1 << inode->i_blkbits;
- if (!page_has_buffers(page))
- create_empty_buffers(page, blocksize, 0);
- head = page_buffers(page);
+ head = create_page_buffers(page, inode, 0);
+ blocksize = head->b_size;
+ bbits = block_size_bits(blocksize);
- bbits = inode->i_blkbits;
block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
for(bh = head, block_start = 0; bh != head || !block_start;
@@ -1881,11 +1910,11 @@ static int __block_commit_write(struct inode *inode, struct page *page,
unsigned blocksize;
struct buffer_head *bh, *head;
- blocksize = 1 << inode->i_blkbits;
+ bh = head = page_buffers(page);
+ blocksize = bh->b_size;
- for(bh = head = page_buffers(page), block_start = 0;
- bh != head || !block_start;
- block_start=block_end, bh = bh->b_this_page) {
+ block_start = 0;
+ do {
block_end = block_start + blocksize;
if (block_end <= from || block_start >= to) {
if (!buffer_uptodate(bh))
@@ -1895,7 +1924,10 @@ static int __block_commit_write(struct inode *inode, struct page *page,
mark_buffer_dirty(bh);
}
clear_buffer_new(bh);
- }
+
+ block_start = block_end;
+ bh = bh->b_this_page;
+ } while (bh != head);
/*
* If this is a partial write which happened to make all buffers
@@ -2020,7 +2052,6 @@ EXPORT_SYMBOL(generic_write_end);
int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
unsigned long from)
{
- struct inode *inode = page->mapping->host;
unsigned block_start, block_end, blocksize;
unsigned to;
struct buffer_head *bh, *head;
@@ -2029,13 +2060,13 @@ int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
if (!page_has_buffers(page))
return 0;
- blocksize = 1 << inode->i_blkbits;
+ head = page_buffers(page);
+ blocksize = head->b_size;
to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
to = from + to;
if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
return 0;
- head = page_buffers(page);
bh = head;
block_start = 0;
do {
@@ -2068,18 +2099,16 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
struct inode *inode = page->mapping->host;
sector_t iblock, lblock;
struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
- unsigned int blocksize;
+ unsigned int blocksize, bbits;
int nr, i;
int fully_mapped = 1;
- BUG_ON(!PageLocked(page));
- blocksize = 1 << inode->i_blkbits;
- if (!page_has_buffers(page))
- create_empty_buffers(page, blocksize, 0);
- head = page_buffers(page);
+ head = create_page_buffers(page, inode, 0);
+ blocksize = head->b_size;
+ bbits = block_size_bits(blocksize);
- iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
- lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
+ iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
+ lblock = (i_size_read(inode)+blocksize-1) >> bbits;
bh = head;
nr = 0;
i = 0;
@@ -2864,6 +2893,55 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
bio_put(bio);
}
+/*
+ * This allows us to do IO even on the odd last sectors
+ * of a device, even if the bh block size is some multiple
+ * of the physical sector size.
+ *
+ * We'll just truncate the bio to the size of the device,
+ * and clear the end of the buffer head manually.
+ *
+ * Truly out-of-range accesses will turn into actual IO
+ * errors, this only handles the "we need to be able to
+ * do IO at the final sector" case.
+ */
+static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh)
+{
+ sector_t maxsector;
+ unsigned bytes;
+
+ maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
+ if (!maxsector)
+ return;
+
+ /*
+ * If the *whole* IO is past the end of the device,
+ * let it through, and the IO layer will turn it into
+ * an EIO.
+ */
+ if (unlikely(bio->bi_sector >= maxsector))
+ return;
+
+ maxsector -= bio->bi_sector;
+ bytes = bio->bi_size;
+ if (likely((bytes >> 9) <= maxsector))
+ return;
+
+ /* Uhhuh. We've got a bh that straddles the device size! */
+ bytes = maxsector << 9;
+
+ /* Truncate the bio.. */
+ bio->bi_size = bytes;
+ bio->bi_io_vec[0].bv_len = bytes;
+
+ /* ..and clear the end of the buffer for reads */
+ if ((rw & RW_MASK) == READ) {
+ void *kaddr = kmap_atomic(bh->b_page);
+ memset(kaddr + bh_offset(bh) + bytes, 0, bh->b_size - bytes);
+ kunmap_atomic(kaddr);
+ }
+}
+
int submit_bh(int rw, struct buffer_head * bh)
{
struct bio *bio;
@@ -2900,6 +2978,9 @@ int submit_bh(int rw, struct buffer_head * bh)
bio->bi_end_io = end_bio_bh_io_sync;
bio->bi_private = bh;
+ /* Take care of bh's that straddle the end of the device */
+ guard_bh_eod(rw, bio, bh);
+
bio_get(bio);
submit_bio(rw, bio);
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 02ce90972d81..9349bb37a2fe 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -90,6 +90,8 @@ static int ceph_encode_fh(struct inode *inode, u32 *rawfh, int *max_len,
*max_len = handle_length;
type = 255;
}
+ if (dentry)
+ dput(dentry);
return type;
}
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 2075ddfffa73..21ff76c22a17 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -122,9 +122,17 @@ config CIFS_ACL
Allows fetching CIFS/NTFS ACL from the server. The DACL blob
is handed over to the application/caller.
+config CIFS_DEBUG
+ bool "Enable CIFS debugging routines"
+ default y
+ depends on CIFS
+ help
+ Enabling this option adds helpful debugging messages to
+ the cifs code which increases the size of the cifs module.
+ If unsure, say Y.
config CIFS_DEBUG2
bool "Enable additional CIFS debugging routines"
- depends on CIFS
+ depends on CIFS_DEBUG
help
Enabling this option adds a few more debugging routines
to the cifs code which slightly increases the size of
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h
index c0c68bb492d7..86e92ef2abc1 100644
--- a/fs/cifs/cifs_debug.h
+++ b/fs/cifs/cifs_debug.h
@@ -18,7 +18,6 @@
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*
*/
-#define CIFS_DEBUG /* BB temporary */
#ifndef _H_CIFS_DEBUG
#define _H_CIFS_DEBUG
@@ -37,49 +36,43 @@ void dump_smb(void *, int);
#define CIFS_RC 0x02
#define CIFS_TIMER 0x04
+extern int cifsFYI;
+extern int cifsERROR;
+
/*
* debug ON
* --------
*/
-#ifdef CIFS_DEBUG
+#ifdef CONFIG_CIFS_DEBUG
/* information message: e.g., configuration, major event */
-extern int cifsFYI;
-#define cifsfyi(fmt, arg...) \
+#define cifsfyi(fmt, ...) \
do { \
if (cifsFYI & CIFS_INFO) \
- printk(KERN_DEBUG "%s: " fmt "\n", __FILE__, ##arg); \
+ printk(KERN_DEBUG "%s: " fmt "\n", \
+ __FILE__, ##__VA_ARGS__); \
} while (0)
-#define cFYI(set, fmt, arg...) \
-do { \
- if (set) \
- cifsfyi(fmt, ##arg); \
+#define cFYI(set, fmt, ...) \
+do { \
+ if (set) \
+ cifsfyi(fmt, ##__VA_ARGS__); \
} while (0)
-#define cifswarn(fmt, arg...) \
- printk(KERN_WARNING fmt "\n", ##arg)
+#define cifswarn(fmt, ...) \
+ printk(KERN_WARNING fmt "\n", ##__VA_ARGS__)
-/* debug event message: */
-extern int cifsERROR;
-
-#define cEVENT(fmt, arg...) \
+/* error event message: e.g., i/o error */
+#define cifserror(fmt, ...) \
do { \
if (cifsERROR) \
- printk(KERN_EVENT "%s: " fmt "\n", __FILE__, ##arg); \
-} while (0)
-
-/* error event message: e.g., i/o error */
-#define cifserror(fmt, arg...) \
-do { \
- if (cifsERROR) \
- printk(KERN_ERR "CIFS VFS: " fmt "\n", ##arg); \
+ printk(KERN_ERR "CIFS VFS: " fmt "\n", ##__VA_ARGS__); \
} while (0)
-#define cERROR(set, fmt, arg...) \
-do { \
- if (set) \
- cifserror(fmt, ##arg); \
+#define cERROR(set, fmt, ...) \
+do { \
+ if (set) \
+ cifserror(fmt, ##__VA_ARGS__); \
} while (0)
/*
@@ -87,10 +80,27 @@ do { \
* ---------
*/
#else /* _CIFS_DEBUG */
-#define cERROR(set, fmt, arg...)
-#define cEVENT(fmt, arg...)
-#define cFYI(set, fmt, arg...)
-#define cifserror(fmt, arg...)
+#define cifsfyi(fmt, ...) \
+do { \
+ if (0) \
+ printk(KERN_DEBUG "%s: " fmt "\n", \
+ __FILE__, ##__VA_ARGS__); \
+} while (0)
+#define cFYI(set, fmt, ...) \
+do { \
+ if (0 && set) \
+ cifsfyi(fmt, ##__VA_ARGS__); \
+} while (0)
+#define cifserror(fmt, ...) \
+do { \
+ if (0) \
+ printk(KERN_ERR "CIFS VFS: " fmt "\n", ##__VA_ARGS__); \
+} while (0)
+#define cERROR(set, fmt, ...) \
+do { \
+ if (0 && set) \
+ cifserror(fmt, ##__VA_ARGS__); \
+} while (0)
#endif /* _CIFS_DEBUG */
#endif /* _H_CIFS_DEBUG */
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c
index fc783e264420..75c1ee699143 100644
--- a/fs/cifs/cifsacl.c
+++ b/fs/cifs/cifsacl.c
@@ -42,135 +42,27 @@ static const struct cifs_sid sid_authusers = {
/* group users */
static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} };
-const struct cred *root_cred;
-
-static void
-shrink_idmap_tree(struct rb_root *root, int nr_to_scan, int *nr_rem,
- int *nr_del)
-{
- struct rb_node *node;
- struct rb_node *tmp;
- struct cifs_sid_id *psidid;
-
- node = rb_first(root);
- while (node) {
- tmp = node;
- node = rb_next(tmp);
- psidid = rb_entry(tmp, struct cifs_sid_id, rbnode);
- if (nr_to_scan == 0 || *nr_del == nr_to_scan)
- ++(*nr_rem);
- else {
- if (time_after(jiffies, psidid->time + SID_MAP_EXPIRE)
- && psidid->refcount == 0) {
- rb_erase(tmp, root);
- ++(*nr_del);
- } else
- ++(*nr_rem);
- }
- }
-}
-
-/*
- * Run idmap cache shrinker.
- */
-static int
-cifs_idmap_shrinker(struct shrinker *shrink, struct shrink_control *sc)
-{
- int nr_to_scan = sc->nr_to_scan;
- int nr_del = 0;
- int nr_rem = 0;
- struct rb_root *root;
-
- root = &uidtree;
- spin_lock(&siduidlock);
- shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del);
- spin_unlock(&siduidlock);
-
- root = &gidtree;
- spin_lock(&sidgidlock);
- shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del);
- spin_unlock(&sidgidlock);
-
- root = &siduidtree;
- spin_lock(&uidsidlock);
- shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del);
- spin_unlock(&uidsidlock);
-
- root = &sidgidtree;
- spin_lock(&gidsidlock);
- shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del);
- spin_unlock(&gidsidlock);
-
- return nr_rem;
-}
-
-static void
-sid_rb_insert(struct rb_root *root, unsigned long cid,
- struct cifs_sid_id **psidid, char *typestr)
-{
- char *strptr;
- struct rb_node *node = root->rb_node;
- struct rb_node *parent = NULL;
- struct rb_node **linkto = &(root->rb_node);
- struct cifs_sid_id *lsidid;
-
- while (node) {
- lsidid = rb_entry(node, struct cifs_sid_id, rbnode);
- parent = node;
- if (cid > lsidid->id) {
- linkto = &(node->rb_left);
- node = node->rb_left;
- }
- if (cid < lsidid->id) {
- linkto = &(node->rb_right);
- node = node->rb_right;
- }
- }
-
- (*psidid)->id = cid;
- (*psidid)->time = jiffies - (SID_MAP_RETRY + 1);
- (*psidid)->refcount = 0;
-
- sprintf((*psidid)->sidstr, "%s", typestr);
- strptr = (*psidid)->sidstr + strlen((*psidid)->sidstr);
- sprintf(strptr, "%ld", cid);
-
- clear_bit(SID_ID_PENDING, &(*psidid)->state);
- clear_bit(SID_ID_MAPPED, &(*psidid)->state);
-
- rb_link_node(&(*psidid)->rbnode, parent, linkto);
- rb_insert_color(&(*psidid)->rbnode, root);
-}
-
-static struct cifs_sid_id *
-sid_rb_search(struct rb_root *root, unsigned long cid)
-{
- struct rb_node *node = root->rb_node;
- struct cifs_sid_id *lsidid;
-
- while (node) {
- lsidid = rb_entry(node, struct cifs_sid_id, rbnode);
- if (cid > lsidid->id)
- node = node->rb_left;
- else if (cid < lsidid->id)
- node = node->rb_right;
- else /* node found */
- return lsidid;
- }
-
- return NULL;
-}
-
-static struct shrinker cifs_shrinker = {
- .shrink = cifs_idmap_shrinker,
- .seeks = DEFAULT_SEEKS,
-};
+static const struct cred *root_cred;
static int
cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
{
char *payload;
+ /*
+ * If the payload is less than or equal to the size of a pointer, then
+ * an allocation here is wasteful. Just copy the data directly to the
+ * payload.value union member instead.
+ *
+ * With this however, you must check the datalen before trying to
+ * dereference payload.data!
+ */
+ if (prep->datalen <= sizeof(key->payload)) {
+ key->payload.value = 0;
+ memcpy(&key->payload.value, prep->data, prep->datalen);
+ key->datalen = prep->datalen;
+ return 0;
+ }
payload = kmalloc(prep->datalen, GFP_KERNEL);
if (!payload)
return -ENOMEM;
@@ -184,10 +76,11 @@ cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
static inline void
cifs_idmap_key_destroy(struct key *key)
{
- kfree(key->payload.data);
+ if (key->datalen > sizeof(key->payload))
+ kfree(key->payload.data);
}
-struct key_type cifs_idmap_key_type = {
+static struct key_type cifs_idmap_key_type = {
.name = "cifs.idmap",
.instantiate = cifs_idmap_key_instantiate,
.destroy = cifs_idmap_key_destroy,
@@ -195,214 +88,174 @@ struct key_type cifs_idmap_key_type = {
.match = user_match,
};
-static void
-sid_to_str(struct cifs_sid *sidptr, char *sidstr)
+static char *
+sid_to_key_str(struct cifs_sid *sidptr, unsigned int type)
{
- int i;
- unsigned long saval;
- char *strptr;
+ int i, len;
+ unsigned int saval;
+ char *sidstr, *strptr;
+ unsigned long long id_auth_val;
+
+ /* 3 bytes for prefix */
+ sidstr = kmalloc(3 + SID_STRING_BASE_SIZE +
+ (SID_STRING_SUBAUTH_SIZE * sidptr->num_subauth),
+ GFP_KERNEL);
+ if (!sidstr)
+ return sidstr;
strptr = sidstr;
+ len = sprintf(strptr, "%cs:S-%hhu", type == SIDOWNER ? 'o' : 'g',
+ sidptr->revision);
+ strptr += len;
+
+ /* The authority field is a single big-endian 48-bit number */
+ id_auth_val = (unsigned long long)sidptr->authority[5];
+ id_auth_val |= (unsigned long long)sidptr->authority[4] << 8;
+ id_auth_val |= (unsigned long long)sidptr->authority[3] << 16;
+ id_auth_val |= (unsigned long long)sidptr->authority[2] << 24;
+ id_auth_val |= (unsigned long long)sidptr->authority[1] << 32;
+ id_auth_val |= (unsigned long long)sidptr->authority[0] << 40;
- sprintf(strptr, "%s", "S");
- strptr = sidstr + strlen(sidstr);
-
- sprintf(strptr, "-%d", sidptr->revision);
- strptr = sidstr + strlen(sidstr);
+ /*
+ * MS-DTYP states that if the authority is >= 2^32, then it should be
+ * expressed as a hex value.
+ */
+ if (id_auth_val <= UINT_MAX)
+ len = sprintf(strptr, "-%llu", id_auth_val);
+ else
+ len = sprintf(strptr, "-0x%llx", id_auth_val);
- for (i = 0; i < 6; ++i) {
- if (sidptr->authority[i]) {
- sprintf(strptr, "-%d", sidptr->authority[i]);
- strptr = sidstr + strlen(sidstr);
- }
- }
+ strptr += len;
for (i = 0; i < sidptr->num_subauth; ++i) {
saval = le32_to_cpu(sidptr->sub_auth[i]);
- sprintf(strptr, "-%ld", saval);
- strptr = sidstr + strlen(sidstr);
+ len = sprintf(strptr, "-%u", saval);
+ strptr += len;
}
+
+ return sidstr;
}
-static void
-id_rb_insert(struct rb_root *root, struct cifs_sid *sidptr,
- struct cifs_sid_id **psidid, char *typestr)
+/*
+ * if the two SIDs (roughly equivalent to a UUID for a user or group) are
+ * the same returns zero, if they do not match returns non-zero.
+ */
+static int
+compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid)
{
- int rc;
- char *strptr;
- struct rb_node *node = root->rb_node;
- struct rb_node *parent = NULL;
- struct rb_node **linkto = &(root->rb_node);
- struct cifs_sid_id *lsidid;
-
- while (node) {
- lsidid = rb_entry(node, struct cifs_sid_id, rbnode);
- parent = node;
- rc = compare_sids(sidptr, &((lsidid)->sid));
- if (rc > 0) {
- linkto = &(node->rb_left);
- node = node->rb_left;
- } else if (rc < 0) {
- linkto = &(node->rb_right);
- node = node->rb_right;
- }
- }
-
- memcpy(&(*psidid)->sid, sidptr, sizeof(struct cifs_sid));
- (*psidid)->time = jiffies - (SID_MAP_RETRY + 1);
- (*psidid)->refcount = 0;
+ int i;
+ int num_subauth, num_sat, num_saw;
- sprintf((*psidid)->sidstr, "%s", typestr);
- strptr = (*psidid)->sidstr + strlen((*psidid)->sidstr);
- sid_to_str(&(*psidid)->sid, strptr);
+ if ((!ctsid) || (!cwsid))
+ return 1;
- clear_bit(SID_ID_PENDING, &(*psidid)->state);
- clear_bit(SID_ID_MAPPED, &(*psidid)->state);
+ /* compare the revision */
+ if (ctsid->revision != cwsid->revision) {
+ if (ctsid->revision > cwsid->revision)
+ return 1;
+ else
+ return -1;
+ }
- rb_link_node(&(*psidid)->rbnode, parent, linkto);
- rb_insert_color(&(*psidid)->rbnode, root);
-}
+ /* compare all of the six auth values */
+ for (i = 0; i < NUM_AUTHS; ++i) {
+ if (ctsid->authority[i] != cwsid->authority[i]) {
+ if (ctsid->authority[i] > cwsid->authority[i])
+ return 1;
+ else
+ return -1;
+ }
+ }
-static struct cifs_sid_id *
-id_rb_search(struct rb_root *root, struct cifs_sid *sidptr)
-{
- int rc;
- struct rb_node *node = root->rb_node;
- struct cifs_sid_id *lsidid;
-
- while (node) {
- lsidid = rb_entry(node, struct cifs_sid_id, rbnode);
- rc = compare_sids(sidptr, &((lsidid)->sid));
- if (rc > 0) {
- node = node->rb_left;
- } else if (rc < 0) {
- node = node->rb_right;
- } else /* node found */
- return lsidid;
+ /* compare all of the subauth values if any */
+ num_sat = ctsid->num_subauth;
+ num_saw = cwsid->num_subauth;
+ num_subauth = num_sat < num_saw ? num_sat : num_saw;
+ if (num_subauth) {
+ for (i = 0; i < num_subauth; ++i) {
+ if (ctsid->sub_auth[i] != cwsid->sub_auth[i]) {
+ if (le32_to_cpu(ctsid->sub_auth[i]) >
+ le32_to_cpu(cwsid->sub_auth[i]))
+ return 1;
+ else
+ return -1;
+ }
+ }
}
- return NULL;
+ return 0; /* sids compare/match */
}
-static int
-sidid_pending_wait(void *unused)
+static void
+cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src)
{
- schedule();
- return signal_pending(current) ? -ERESTARTSYS : 0;
+ int i;
+
+ dst->revision = src->revision;
+ dst->num_subauth = min_t(u8, src->num_subauth, SID_MAX_SUB_AUTHORITIES);
+ for (i = 0; i < NUM_AUTHS; ++i)
+ dst->authority[i] = src->authority[i];
+ for (i = 0; i < dst->num_subauth; ++i)
+ dst->sub_auth[i] = src->sub_auth[i];
}
static int
-id_to_sid(unsigned long cid, uint sidtype, struct cifs_sid *ssid)
+id_to_sid(unsigned int cid, uint sidtype, struct cifs_sid *ssid)
{
- int rc = 0;
+ int rc;
struct key *sidkey;
+ struct cifs_sid *ksid;
+ unsigned int ksid_size;
+ char desc[3 + 10 + 1]; /* 3 byte prefix + 10 bytes for value + NULL */
const struct cred *saved_cred;
- struct cifs_sid *lsid;
- struct cifs_sid_id *psidid, *npsidid;
- struct rb_root *cidtree;
- spinlock_t *cidlock;
-
- if (sidtype == SIDOWNER) {
- cidlock = &siduidlock;
- cidtree = &uidtree;
- } else if (sidtype == SIDGROUP) {
- cidlock = &sidgidlock;
- cidtree = &gidtree;
- } else
- return -EINVAL;
- spin_lock(cidlock);
- psidid = sid_rb_search(cidtree, cid);
-
- if (!psidid) { /* node does not exist, allocate one & attempt adding */
- spin_unlock(cidlock);
- npsidid = kzalloc(sizeof(struct cifs_sid_id), GFP_KERNEL);
- if (!npsidid)
- return -ENOMEM;
-
- npsidid->sidstr = kmalloc(SIDLEN, GFP_KERNEL);
- if (!npsidid->sidstr) {
- kfree(npsidid);
- return -ENOMEM;
- }
+ rc = snprintf(desc, sizeof(desc), "%ci:%u",
+ sidtype == SIDOWNER ? 'o' : 'g', cid);
+ if (rc >= sizeof(desc))
+ return -EINVAL;
- spin_lock(cidlock);
- psidid = sid_rb_search(cidtree, cid);
- if (psidid) { /* node happened to get inserted meanwhile */
- ++psidid->refcount;
- spin_unlock(cidlock);
- kfree(npsidid->sidstr);
- kfree(npsidid);
- } else {
- psidid = npsidid;
- sid_rb_insert(cidtree, cid, &psidid,
- sidtype == SIDOWNER ? "oi:" : "gi:");
- ++psidid->refcount;
- spin_unlock(cidlock);
- }
- } else {
- ++psidid->refcount;
- spin_unlock(cidlock);
+ rc = 0;
+ saved_cred = override_creds(root_cred);
+ sidkey = request_key(&cifs_idmap_key_type, desc, "");
+ if (IS_ERR(sidkey)) {
+ rc = -EINVAL;
+ cFYI(1, "%s: Can't map %cid %u to a SID", __func__,
+ sidtype == SIDOWNER ? 'u' : 'g', cid);
+ goto out_revert_creds;
+ } else if (sidkey->datalen < CIFS_SID_BASE_SIZE) {
+ rc = -EIO;
+ cFYI(1, "%s: Downcall contained malformed key "
+ "(datalen=%hu)", __func__, sidkey->datalen);
+ goto invalidate_key;
}
/*
- * If we are here, it is safe to access psidid and its fields
- * since a reference was taken earlier while holding the spinlock.
- * A reference on the node is put without holding the spinlock
- * and it is OK to do so in this case, shrinker will not erase
- * this node until all references are put and we do not access
- * any fields of the node after a reference is put .
+ * A sid is usually too large to be embedded in payload.value, but if
+ * there are no subauthorities and the host has 8-byte pointers, then
+ * it could be.
*/
- if (test_bit(SID_ID_MAPPED, &psidid->state)) {
- memcpy(ssid, &psidid->sid, sizeof(struct cifs_sid));
- psidid->time = jiffies; /* update ts for accessing */
- goto id_sid_out;
- }
-
- if (time_after(psidid->time + SID_MAP_RETRY, jiffies)) {
- rc = -EINVAL;
- goto id_sid_out;
+ ksid = sidkey->datalen <= sizeof(sidkey->payload) ?
+ (struct cifs_sid *)&sidkey->payload.value :
+ (struct cifs_sid *)sidkey->payload.data;
+
+ ksid_size = CIFS_SID_BASE_SIZE + (ksid->num_subauth * sizeof(__le32));
+ if (ksid_size > sidkey->datalen) {
+ rc = -EIO;
+ cFYI(1, "%s: Downcall contained malformed key (datalen=%hu, "
+ "ksid_size=%u)", __func__, sidkey->datalen, ksid_size);
+ goto invalidate_key;
}
- if (!test_and_set_bit(SID_ID_PENDING, &psidid->state)) {
- saved_cred = override_creds(root_cred);
- sidkey = request_key(&cifs_idmap_key_type, psidid->sidstr, "");
- if (IS_ERR(sidkey)) {
- rc = -EINVAL;
- cFYI(1, "%s: Can't map and id to a SID", __func__);
- } else {
- lsid = (struct cifs_sid *)sidkey->payload.data;
- memcpy(&psidid->sid, lsid,
- sidkey->datalen < sizeof(struct cifs_sid) ?
- sidkey->datalen : sizeof(struct cifs_sid));
- memcpy(ssid, &psidid->sid,
- sidkey->datalen < sizeof(struct cifs_sid) ?
- sidkey->datalen : sizeof(struct cifs_sid));
- set_bit(SID_ID_MAPPED, &psidid->state);
- key_put(sidkey);
- kfree(psidid->sidstr);
- }
- psidid->time = jiffies; /* update ts for accessing */
- revert_creds(saved_cred);
- clear_bit(SID_ID_PENDING, &psidid->state);
- wake_up_bit(&psidid->state, SID_ID_PENDING);
- } else {
- rc = wait_on_bit(&psidid->state, SID_ID_PENDING,
- sidid_pending_wait, TASK_INTERRUPTIBLE);
- if (rc) {
- cFYI(1, "%s: sidid_pending_wait interrupted %d",
- __func__, rc);
- --psidid->refcount;
- return rc;
- }
- if (test_bit(SID_ID_MAPPED, &psidid->state))
- memcpy(ssid, &psidid->sid, sizeof(struct cifs_sid));
- else
- rc = -EINVAL;
- }
-id_sid_out:
- --psidid->refcount;
+ cifs_copy_sid(ssid, ksid);
+out_key_put:
+ key_put(sidkey);
+out_revert_creds:
+ revert_creds(saved_cred);
return rc;
+
+invalidate_key:
+ key_invalidate(sidkey);
+ goto out_key_put;
}
static int
@@ -410,111 +263,67 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid,
struct cifs_fattr *fattr, uint sidtype)
{
int rc;
- unsigned long cid;
- struct key *idkey;
+ struct key *sidkey;
+ char *sidstr;
const struct cred *saved_cred;
- struct cifs_sid_id *psidid, *npsidid;
- struct rb_root *cidtree;
- spinlock_t *cidlock;
-
- if (sidtype == SIDOWNER) {
- cid = cifs_sb->mnt_uid; /* default uid, in case upcall fails */
- cidlock = &siduidlock;
- cidtree = &uidtree;
- } else if (sidtype == SIDGROUP) {
- cid = cifs_sb->mnt_gid; /* default gid, in case upcall fails */
- cidlock = &sidgidlock;
- cidtree = &gidtree;
- } else
- return -ENOENT;
-
- spin_lock(cidlock);
- psidid = id_rb_search(cidtree, psid);
-
- if (!psidid) { /* node does not exist, allocate one & attempt adding */
- spin_unlock(cidlock);
- npsidid = kzalloc(sizeof(struct cifs_sid_id), GFP_KERNEL);
- if (!npsidid)
- return -ENOMEM;
-
- npsidid->sidstr = kmalloc(SIDLEN, GFP_KERNEL);
- if (!npsidid->sidstr) {
- kfree(npsidid);
- return -ENOMEM;
- }
-
- spin_lock(cidlock);
- psidid = id_rb_search(cidtree, psid);
- if (psidid) { /* node happened to get inserted meanwhile */
- ++psidid->refcount;
- spin_unlock(cidlock);
- kfree(npsidid->sidstr);
- kfree(npsidid);
- } else {
- psidid = npsidid;
- id_rb_insert(cidtree, psid, &psidid,
- sidtype == SIDOWNER ? "os:" : "gs:");
- ++psidid->refcount;
- spin_unlock(cidlock);
- }
- } else {
- ++psidid->refcount;
- spin_unlock(cidlock);
- }
+ uid_t fuid = cifs_sb->mnt_uid;
+ gid_t fgid = cifs_sb->mnt_gid;
/*
- * If we are here, it is safe to access psidid and its fields
- * since a reference was taken earlier while holding the spinlock.
- * A reference on the node is put without holding the spinlock
- * and it is OK to do so in this case, shrinker will not erase
- * this node until all references are put and we do not access
- * any fields of the node after a reference is put .
+ * If we have too many subauthorities, then something is really wrong.
+ * Just return an error.
*/
- if (test_bit(SID_ID_MAPPED, &psidid->state)) {
- cid = psidid->id;
- psidid->time = jiffies; /* update ts for accessing */
- goto sid_to_id_out;
+ if (unlikely(psid->num_subauth > SID_MAX_SUB_AUTHORITIES)) {
+ cFYI(1, "%s: %u subauthorities is too many!", __func__,
+ psid->num_subauth);
+ return -EIO;
}
- if (time_after(psidid->time + SID_MAP_RETRY, jiffies))
- goto sid_to_id_out;
-
- if (!test_and_set_bit(SID_ID_PENDING, &psidid->state)) {
- saved_cred = override_creds(root_cred);
- idkey = request_key(&cifs_idmap_key_type, psidid->sidstr, "");
- if (IS_ERR(idkey))
- cFYI(1, "%s: Can't map SID to an id", __func__);
- else {
- cid = *(unsigned long *)idkey->payload.value;
- psidid->id = cid;
- set_bit(SID_ID_MAPPED, &psidid->state);
- key_put(idkey);
- kfree(psidid->sidstr);
- }
- revert_creds(saved_cred);
- psidid->time = jiffies; /* update ts for accessing */
- clear_bit(SID_ID_PENDING, &psidid->state);
- wake_up_bit(&psidid->state, SID_ID_PENDING);
- } else {
- rc = wait_on_bit(&psidid->state, SID_ID_PENDING,
- sidid_pending_wait, TASK_INTERRUPTIBLE);
- if (rc) {
- cFYI(1, "%s: sidid_pending_wait interrupted %d",
- __func__, rc);
- --psidid->refcount; /* decremented without spinlock */
- return rc;
- }
- if (test_bit(SID_ID_MAPPED, &psidid->state))
- cid = psidid->id;
+ sidstr = sid_to_key_str(psid, sidtype);
+ if (!sidstr)
+ return -ENOMEM;
+
+ saved_cred = override_creds(root_cred);
+ sidkey = request_key(&cifs_idmap_key_type, sidstr, "");
+ if (IS_ERR(sidkey)) {
+ rc = -EINVAL;
+ cFYI(1, "%s: Can't map SID %s to a %cid", __func__, sidstr,
+ sidtype == SIDOWNER ? 'u' : 'g');
+ goto out_revert_creds;
+ }
+
+ /*
+ * FIXME: Here we assume that uid_t and gid_t are same size. It's
+ * probably a safe assumption but might be better to check based on
+ * sidtype.
+ */
+ if (sidkey->datalen != sizeof(uid_t)) {
+ rc = -EIO;
+ cFYI(1, "%s: Downcall contained malformed key "
+ "(datalen=%hu)", __func__, sidkey->datalen);
+ key_invalidate(sidkey);
+ goto out_key_put;
}
-sid_to_id_out:
- --psidid->refcount; /* decremented without spinlock */
if (sidtype == SIDOWNER)
- fattr->cf_uid = cid;
+ memcpy(&fuid, &sidkey->payload.value, sizeof(uid_t));
else
- fattr->cf_gid = cid;
+ memcpy(&fgid, &sidkey->payload.value, sizeof(gid_t));
+
+out_key_put:
+ key_put(sidkey);
+out_revert_creds:
+ revert_creds(saved_cred);
+ kfree(sidstr);
+ /*
+ * Note that we return 0 here unconditionally. If the mapping
+ * fails then we just fall back to using the mnt_uid/mnt_gid.
+ */
+ if (sidtype == SIDOWNER)
+ fattr->cf_uid = fuid;
+ else
+ fattr->cf_gid = fgid;
return 0;
}
@@ -561,17 +370,6 @@ init_cifs_idmap(void)
cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING;
root_cred = cred;
- spin_lock_init(&siduidlock);
- uidtree = RB_ROOT;
- spin_lock_init(&sidgidlock);
- gidtree = RB_ROOT;
-
- spin_lock_init(&uidsidlock);
- siduidtree = RB_ROOT;
- spin_lock_init(&gidsidlock);
- sidgidtree = RB_ROOT;
- register_shrinker(&cifs_shrinker);
-
cFYI(1, "cifs idmap keyring: %d", key_serial(keyring));
return 0;
@@ -588,95 +386,13 @@ exit_cifs_idmap(void)
key_revoke(root_cred->thread_keyring);
unregister_key_type(&cifs_idmap_key_type);
put_cred(root_cred);
- unregister_shrinker(&cifs_shrinker);
cFYI(1, "Unregistered %s key type", cifs_idmap_key_type.name);
}
-void
-cifs_destroy_idmaptrees(void)
-{
- struct rb_root *root;
- struct rb_node *node;
-
- root = &uidtree;
- spin_lock(&siduidlock);
- while ((node = rb_first(root)))
- rb_erase(node, root);
- spin_unlock(&siduidlock);
-
- root = &gidtree;
- spin_lock(&sidgidlock);
- while ((node = rb_first(root)))
- rb_erase(node, root);
- spin_unlock(&sidgidlock);
-
- root = &siduidtree;
- spin_lock(&uidsidlock);
- while ((node = rb_first(root)))
- rb_erase(node, root);
- spin_unlock(&uidsidlock);
-
- root = &sidgidtree;
- spin_lock(&gidsidlock);
- while ((node = rb_first(root)))
- rb_erase(node, root);
- spin_unlock(&gidsidlock);
-}
-
-/* if the two SIDs (roughly equivalent to a UUID for a user or group) are
- the same returns 1, if they do not match returns 0 */
-int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid)
-{
- int i;
- int num_subauth, num_sat, num_saw;
-
- if ((!ctsid) || (!cwsid))
- return 1;
-
- /* compare the revision */
- if (ctsid->revision != cwsid->revision) {
- if (ctsid->revision > cwsid->revision)
- return 1;
- else
- return -1;
- }
-
- /* compare all of the six auth values */
- for (i = 0; i < 6; ++i) {
- if (ctsid->authority[i] != cwsid->authority[i]) {
- if (ctsid->authority[i] > cwsid->authority[i])
- return 1;
- else
- return -1;
- }
- }
-
- /* compare all of the subauth values if any */
- num_sat = ctsid->num_subauth;
- num_saw = cwsid->num_subauth;
- num_subauth = num_sat < num_saw ? num_sat : num_saw;
- if (num_subauth) {
- for (i = 0; i < num_subauth; ++i) {
- if (ctsid->sub_auth[i] != cwsid->sub_auth[i]) {
- if (le32_to_cpu(ctsid->sub_auth[i]) >
- le32_to_cpu(cwsid->sub_auth[i]))
- return 1;
- else
- return -1;
- }
- }
- }
-
- return 0; /* sids compare/match */
-}
-
-
/* copy ntsd, owner sid, and group sid from a security descriptor to another */
static void copy_sec_desc(const struct cifs_ntsd *pntsd,
struct cifs_ntsd *pnntsd, __u32 sidsoffset)
{
- int i;
-
struct cifs_sid *owner_sid_ptr, *group_sid_ptr;
struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr;
@@ -692,26 +408,14 @@ static void copy_sec_desc(const struct cifs_ntsd *pntsd,
owner_sid_ptr = (struct cifs_sid *)((char *)pntsd +
le32_to_cpu(pntsd->osidoffset));
nowner_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset);
-
- nowner_sid_ptr->revision = owner_sid_ptr->revision;
- nowner_sid_ptr->num_subauth = owner_sid_ptr->num_subauth;
- for (i = 0; i < 6; i++)
- nowner_sid_ptr->authority[i] = owner_sid_ptr->authority[i];
- for (i = 0; i < 5; i++)
- nowner_sid_ptr->sub_auth[i] = owner_sid_ptr->sub_auth[i];
+ cifs_copy_sid(nowner_sid_ptr, owner_sid_ptr);
/* copy group sid */
group_sid_ptr = (struct cifs_sid *)((char *)pntsd +
le32_to_cpu(pntsd->gsidoffset));
ngroup_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset +
sizeof(struct cifs_sid));
-
- ngroup_sid_ptr->revision = group_sid_ptr->revision;
- ngroup_sid_ptr->num_subauth = group_sid_ptr->num_subauth;
- for (i = 0; i < 6; i++)
- ngroup_sid_ptr->authority[i] = group_sid_ptr->authority[i];
- for (i = 0; i < 5; i++)
- ngroup_sid_ptr->sub_auth[i] = group_sid_ptr->sub_auth[i];
+ cifs_copy_sid(ngroup_sid_ptr, group_sid_ptr);
return;
}
@@ -818,7 +522,7 @@ static __u16 fill_ace_for_sid(struct cifs_ace *pntace,
pntace->sid.revision = psid->revision;
pntace->sid.num_subauth = psid->num_subauth;
- for (i = 0; i < 6; i++)
+ for (i = 0; i < NUM_AUTHS; i++)
pntace->sid.authority[i] = psid->authority[i];
for (i = 0; i < psid->num_subauth; i++)
pntace->sid.sub_auth[i] = psid->sub_auth[i];
@@ -994,8 +698,8 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl)
return -EINVAL;
}
- if (psid->num_subauth) {
#ifdef CONFIG_CIFS_DEBUG2
+ if (psid->num_subauth) {
int i;
cFYI(1, "SID revision %d num_auth %d",
psid->revision, psid->num_subauth);
@@ -1009,8 +713,8 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl)
num auths and therefore go off the end */
cFYI(1, "RID 0x%x",
le32_to_cpu(psid->sub_auth[psid->num_subauth-1]));
-#endif
}
+#endif
return 0;
}
@@ -1120,8 +824,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
kfree(nowner_sid_ptr);
return rc;
}
- memcpy(owner_sid_ptr, nowner_sid_ptr,
- sizeof(struct cifs_sid));
+ cifs_copy_sid(owner_sid_ptr, nowner_sid_ptr);
kfree(nowner_sid_ptr);
*aclflag = CIFS_ACL_OWNER;
}
@@ -1139,8 +842,7 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
kfree(ngroup_sid_ptr);
return rc;
}
- memcpy(group_sid_ptr, ngroup_sid_ptr,
- sizeof(struct cifs_sid));
+ cifs_copy_sid(group_sid_ptr, ngroup_sid_ptr);
kfree(ngroup_sid_ptr);
*aclflag = CIFS_ACL_GROUP;
}
@@ -1316,42 +1018,39 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode,
/* Get the security descriptor */
pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen);
-
- /* Add three ACEs for owner, group, everyone getting rid of
- other ACEs as chmod disables ACEs and set the security descriptor */
-
if (IS_ERR(pntsd)) {
rc = PTR_ERR(pntsd);
cERROR(1, "%s: error %d getting sec desc", __func__, rc);
- } else {
- /* allocate memory for the smb header,
- set security descriptor request security descriptor
- parameters, and secuirty descriptor itself */
-
- secdesclen = secdesclen < DEFSECDESCLEN ?
- DEFSECDESCLEN : secdesclen;
- pnntsd = kmalloc(secdesclen, GFP_KERNEL);
- if (!pnntsd) {
- cERROR(1, "Unable to allocate security descriptor");
- kfree(pntsd);
- return -ENOMEM;
- }
+ goto out;
+ }
- rc = build_sec_desc(pntsd, pnntsd, secdesclen, nmode, uid, gid,
- &aclflag);
+ /*
+ * Add three ACEs for owner, group, everyone getting rid of other ACEs
+ * as chmod disables ACEs and set the security descriptor. Allocate
+ * memory for the smb header, set security descriptor request security
+ * descriptor parameters, and security descriptor itself
+ */
+ secdesclen = max_t(u32, secdesclen, DEFAULT_SEC_DESC_LEN);
+ pnntsd = kmalloc(secdesclen, GFP_KERNEL);
+ if (!pnntsd) {
+ cERROR(1, "Unable to allocate security descriptor");
+ kfree(pntsd);
+ return -ENOMEM;
+ }
- cFYI(DBG2, "build_sec_desc rc: %d", rc);
+ rc = build_sec_desc(pntsd, pnntsd, secdesclen, nmode, uid, gid,
+ &aclflag);
- if (!rc) {
- /* Set the security descriptor */
- rc = set_cifs_acl(pnntsd, secdesclen, inode,
- path, aclflag);
- cFYI(DBG2, "set_cifs_acl rc: %d", rc);
- }
+ cFYI(DBG2, "build_sec_desc rc: %d", rc);
- kfree(pnntsd);
- kfree(pntsd);
+ if (!rc) {
+ /* Set the security descriptor */
+ rc = set_cifs_acl(pnntsd, secdesclen, inode, path, aclflag);
+ cFYI(DBG2, "set_cifs_acl rc: %d", rc);
}
+ kfree(pnntsd);
+ kfree(pntsd);
+out:
return rc;
}
diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h
index 5c902c7ce524..4f3884835267 100644
--- a/fs/cifs/cifsacl.h
+++ b/fs/cifs/cifsacl.h
@@ -23,11 +23,8 @@
#define _CIFSACL_H
-#define NUM_AUTHS 6 /* number of authority fields */
-#define NUM_SUBAUTHS 5 /* number of sub authority fields */
-#define NUM_WK_SIDS 7 /* number of well known sids */
-#define SIDNAMELENGTH 20 /* long enough for the ones we care about */
-#define DEFSECDESCLEN 192 /* sec desc len contaiting a dacl with three aces */
+#define NUM_AUTHS (6) /* number of authority fields */
+#define SID_MAX_SUB_AUTHORITIES (15) /* max number of sub authority fields */
#define READ_BIT 0x4
#define WRITE_BIT 0x2
@@ -41,12 +38,32 @@
#define SIDOWNER 1
#define SIDGROUP 2
-#define SIDLEN 150 /* S- 1 revision- 6 authorities- max 5 sub authorities */
-#define SID_ID_MAPPED 0
-#define SID_ID_PENDING 1
-#define SID_MAP_EXPIRE (3600 * HZ) /* map entry expires after one hour */
-#define SID_MAP_RETRY (300 * HZ) /* wait 5 minutes for next attempt to map */
+/*
+ * Security Descriptor length containing DACL with 3 ACEs (one each for
+ * owner, group and world).
+ */
+#define DEFAULT_SEC_DESC_LEN (sizeof(struct cifs_ntsd) + \
+ sizeof(struct cifs_acl) + \
+ (sizeof(struct cifs_ace) * 3))
+
+/*
+ * Maximum size of a string representation of a SID:
+ *
+ * The fields are unsigned values in decimal. So:
+ *
+ * u8: max 3 bytes in decimal
+ * u32: max 10 bytes in decimal
+ *
+ * "S-" + 3 bytes for version field + 15 for authority field + NULL terminator
+ *
+ * For authority field, max is when all 6 values are non-zero and it must be
+ * represented in hex. So "-0x" + 12 hex digits.
+ *
+ * Add 11 bytes for each subauthority field (10 bytes each + 1 for '-')
+ */
+#define SID_STRING_BASE_SIZE (2 + 3 + 15 + 1)
+#define SID_STRING_SUBAUTH_SIZE (11) /* size of a single subauth string */
struct cifs_ntsd {
__le16 revision; /* revision level */
@@ -60,10 +77,13 @@ struct cifs_ntsd {
struct cifs_sid {
__u8 revision; /* revision level */
__u8 num_subauth;
- __u8 authority[6];
- __le32 sub_auth[5]; /* sub_auth[num_subauth] */
+ __u8 authority[NUM_AUTHS];
+ __le32 sub_auth[SID_MAX_SUB_AUTHORITIES]; /* sub_auth[num_subauth] */
} __attribute__((packed));
+/* size of a struct cifs_sid, sans sub_auth array */
+#define CIFS_SID_BASE_SIZE (1 + 1 + NUM_AUTHS)
+
struct cifs_acl {
__le16 revision; /* revision level */
__le16 size;
@@ -78,26 +98,4 @@ struct cifs_ace {
struct cifs_sid sid; /* ie UUID of user or group who gets these perms */
} __attribute__((packed));
-struct cifs_wksid {
- struct cifs_sid cifssid;
- char sidname[SIDNAMELENGTH];
-} __attribute__((packed));
-
-struct cifs_sid_id {
- unsigned int refcount; /* increment with spinlock, decrement without */
- unsigned long id;
- unsigned long time;
- unsigned long state;
- char *sidstr;
- struct rb_node rbnode;
- struct cifs_sid sid;
-};
-
-#ifdef __KERNEL__
-extern struct key_type cifs_idmap_key_type;
-extern const struct cred *root_cred;
-#endif /* KERNEL */
-
-extern int compare_sids(const struct cifs_sid *, const struct cifs_sid *);
-
#endif /* _CIFSACL_H */
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index e7931cc55d0c..210f0af83fc4 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -64,24 +64,23 @@ unsigned int global_secflags = CIFSSEC_DEF;
unsigned int sign_CIFS_PDUs = 1;
static const struct super_operations cifs_super_ops;
unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE;
-module_param(CIFSMaxBufSize, int, 0);
+module_param(CIFSMaxBufSize, uint, 0);
MODULE_PARM_DESC(CIFSMaxBufSize, "Network buffer size (not including header). "
"Default: 16384 Range: 8192 to 130048");
unsigned int cifs_min_rcv = CIFS_MIN_RCV_POOL;
-module_param(cifs_min_rcv, int, 0);
+module_param(cifs_min_rcv, uint, 0);
MODULE_PARM_DESC(cifs_min_rcv, "Network buffers in pool. Default: 4 Range: "
"1 to 64");
unsigned int cifs_min_small = 30;
-module_param(cifs_min_small, int, 0);
+module_param(cifs_min_small, uint, 0);
MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 "
"Range: 2 to 256");
unsigned int cifs_max_pending = CIFS_MAX_REQ;
-module_param(cifs_max_pending, int, 0444);
+module_param(cifs_max_pending, uint, 0444);
MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. "
"Default: 32767 Range: 2 to 32767.");
module_param(enable_oplocks, bool, 0644);
-MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks (bool). Default:"
- "y/Y/1");
+MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1");
extern mempool_t *cifs_sm_req_poolp;
extern mempool_t *cifs_req_poolp;
@@ -230,6 +229,7 @@ cifs_alloc_inode(struct super_block *sb)
cifs_set_oplock_level(cifs_inode, 0);
cifs_inode->delete_pending = false;
cifs_inode->invalid_mapping = false;
+ cifs_inode->leave_pages_clean = false;
cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */
cifs_inode->server_eof = 0;
cifs_inode->uniqueid = 0;
@@ -540,8 +540,8 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
char *s, *p;
char sep;
- full_path = build_path_to_root(vol, cifs_sb,
- cifs_sb_master_tcon(cifs_sb));
+ full_path = cifs_build_path_to_root(vol, cifs_sb,
+ cifs_sb_master_tcon(cifs_sb));
if (full_path == NULL)
return ERR_PTR(-ENOMEM);
@@ -1205,7 +1205,6 @@ exit_cifs(void)
unregister_filesystem(&cifs_fs_type);
cifs_dfs_release_automount_timer();
#ifdef CONFIG_CIFS_ACL
- cifs_destroy_idmaptrees();
exit_cifs_idmap();
#endif
#ifdef CONFIG_CIFS_UPCALL
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index f5af2527fc69..aea1eec64911 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -178,6 +178,7 @@ struct smb_rqst {
enum smb_version {
Smb_1 = 1,
+ Smb_20,
Smb_21,
Smb_30,
};
@@ -280,9 +281,6 @@ struct smb_version_operations {
/* set attributes */
int (*set_file_info)(struct inode *, const char *, FILE_BASIC_INFO *,
const unsigned int);
- /* build a full path to the root of the mount */
- char * (*build_path_to_root)(struct smb_vol *, struct cifs_sb_info *,
- struct cifs_tcon *);
/* check if we can send an echo or nor */
bool (*can_echo)(struct TCP_Server_Info *);
/* send echo request */
@@ -369,6 +367,8 @@ struct smb_version_operations {
void (*set_lease_key)(struct inode *, struct cifs_fid *fid);
/* generate new lease key */
void (*new_lease_key)(struct cifs_fid *fid);
+ int (*calc_signature)(struct smb_rqst *rqst,
+ struct TCP_Server_Info *server);
};
struct smb_version_values {
@@ -396,7 +396,6 @@ struct smb_vol {
char *password;
char *domainname;
char *UNC;
- char *UNCip;
char *iocharset; /* local code page for mapping to and from Unicode */
char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */
char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */
@@ -444,11 +443,11 @@ struct smb_vol {
unsigned int rsize;
unsigned int wsize;
bool sockopt_tcp_nodelay:1;
- unsigned short int port;
unsigned long actimeo; /* attribute cache timeout (jiffies) */
struct smb_version_operations *ops;
struct smb_version_values *vals;
char *prepath;
+ struct sockaddr_storage dstaddr; /* destination address */
struct sockaddr_storage srcaddr; /* allow binding to a local IP */
struct nls_table *local_nls;
};
@@ -1031,6 +1030,7 @@ struct cifsInodeInfo {
bool clientCanCacheAll; /* read and writebehind oplock */
bool delete_pending; /* DELETE_ON_CLOSE is set */
bool invalid_mapping; /* pagecache is invalid */
+ bool leave_pages_clean; /* protected by i_mutex; if set, do not mark pages dirty */
unsigned long time; /* jiffies of last update of inode */
u64 server_eof; /* current file size on server -- protected by i_lock */
u64 uniqueid; /* server inode number */
@@ -1067,30 +1067,16 @@ static inline char CIFS_DIR_SEP(const struct cifs_sb_info *cifs_sb)
static inline void
convert_delimiter(char *path, char delim)
{
- int i;
- char old_delim;
-
- if (path == NULL)
- return;
+ char old_delim, *pos;
if (delim == '/')
old_delim = '\\';
else
old_delim = '/';
- for (i = 0; path[i] != '\0'; i++) {
- if (path[i] == old_delim)
- path[i] = delim;
- }
-}
-
-static inline char *
-build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb,
- struct cifs_tcon *tcon)
-{
- if (!vol->ops->build_path_to_root)
- return NULL;
- return vol->ops->build_path_to_root(vol, cifs_sb, tcon);
+ pos = path;
+ while ((pos = strchr(pos, old_delim)))
+ *pos = delim;
}
#ifdef CONFIG_CIFS_STATS
@@ -1362,7 +1348,7 @@ require use of the stronger protocol */
#define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */
#define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */
-#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_NTLMSSP)
+#define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMSSP)
#define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2)
#define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP)
/*
@@ -1506,6 +1492,6 @@ extern struct smb_version_values smb20_values;
extern struct smb_version_operations smb21_operations;
extern struct smb_version_values smb21_values;
#define SMB30_VERSION_STRING "3.0"
-/*extern struct smb_version_operations smb30_operations; */ /* not needed yet */
+extern struct smb_version_operations smb30_operations;
extern struct smb_version_values smb30_values;
#endif /* _CIFS_GLOB_H */
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 5144e9fbeb8c..1988c1baa224 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -58,8 +58,10 @@ do { \
} while (0)
extern int init_cifs_idmap(void);
extern void exit_cifs_idmap(void);
-extern void cifs_destroy_idmaptrees(void);
extern char *build_path_from_dentry(struct dentry *);
+extern char *cifs_build_path_to_root(struct smb_vol *vol,
+ struct cifs_sb_info *cifs_sb,
+ struct cifs_tcon *tcon);
extern char *build_wildcard_path_from_dentry(struct dentry *direntry);
extern char *cifs_compose_mount_options(const char *sb_mountdata,
const char *fullpath, const struct dfs_info3_param *ref,
@@ -107,9 +109,7 @@ extern unsigned int smbCalcSize(void *buf);
extern int decode_negTokenInit(unsigned char *security_blob, int length,
struct TCP_Server_Info *server);
extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len);
-extern int cifs_set_port(struct sockaddr *addr, const unsigned short int port);
-extern int cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len,
- const unsigned short int port);
+extern void cifs_set_port(struct sockaddr *addr, const unsigned short int port);
extern int map_smb_to_linux_error(char *buf, bool logErr);
extern void header_assemble(struct smb_hdr *, char /* command */ ,
const struct cifs_tcon *, int /* length of
@@ -185,7 +185,7 @@ extern void cifs_mark_open_files_invalid(struct cifs_tcon *tcon);
extern bool cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset,
__u64 length, __u8 type,
struct cifsLockInfo **conf_lock,
- bool rw_check);
+ int rw_check);
extern void cifs_add_pending_open(struct cifs_fid *fid,
struct tcon_link *tlink,
struct cifs_pending_open *open);
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 5c670b998ffb..7635b5db26a7 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -186,6 +186,7 @@ static const match_table_t cifs_mount_option_tokens = {
{ Opt_user, "user=%s" },
{ Opt_user, "username=%s" },
{ Opt_blank_pass, "pass=" },
+ { Opt_blank_pass, "password=" },
{ Opt_pass, "pass=%s" },
{ Opt_pass, "password=%s" },
{ Opt_blank_ip, "ip=" },
@@ -274,6 +275,7 @@ static const match_table_t cifs_cacheflavor_tokens = {
static const match_table_t cifs_smb_version_tokens = {
{ Smb_1, SMB1_VERSION_STRING },
+ { Smb_20, SMB20_VERSION_STRING},
{ Smb_21, SMB21_VERSION_STRING },
{ Smb_30, SMB30_VERSION_STRING },
};
@@ -1074,12 +1076,16 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol)
vol->vals = &smb1_values;
break;
#ifdef CONFIG_CIFS_SMB2
+ case Smb_20:
+ vol->ops = &smb21_operations; /* currently identical with 2.1 */
+ vol->vals = &smb20_values;
+ break;
case Smb_21:
vol->ops = &smb21_operations;
vol->vals = &smb21_values;
break;
case Smb_30:
- vol->ops = &smb21_operations; /* currently identical with 2.1 */
+ vol->ops = &smb30_operations;
vol->vals = &smb30_values;
break;
#endif
@@ -1090,6 +1096,52 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol)
return 0;
}
+/*
+ * Parse a devname into substrings and populate the vol->UNC and vol->prepath
+ * fields with the result. Returns 0 on success and an error otherwise.
+ */
+static int
+cifs_parse_devname(const char *devname, struct smb_vol *vol)
+{
+ char *pos;
+ const char *delims = "/\\";
+ size_t len;
+
+ /* make sure we have a valid UNC double delimiter prefix */
+ len = strspn(devname, delims);
+ if (len != 2)
+ return -EINVAL;
+
+ /* find delimiter between host and sharename */
+ pos = strpbrk(devname + 2, delims);
+ if (!pos)
+ return -EINVAL;
+
+ /* skip past delimiter */
+ ++pos;
+
+ /* now go until next delimiter or end of string */
+ len = strcspn(pos, delims);
+
+ /* move "pos" up to the next delimiter or the terminating NUL */
+ pos += len;
+ vol->UNC = kstrndup(devname, pos - devname, GFP_KERNEL);
+ if (!vol->UNC)
+ return -ENOMEM;
+
+ convert_delimiter(vol->UNC, '\\');
+
+ /* If pos is at the NUL, or at a bogus trailing delimiter, then no prepath */
+ if (!*pos++ || !*pos)
+ return 0;
+
+ vol->prepath = kstrdup(pos, GFP_KERNEL);
+ if (!vol->prepath)
+ return -ENOMEM;
+
+ return 0;
+}
+
static int
cifs_parse_mount_options(const char *mountdata, const char *devname,
struct smb_vol *vol)
@@ -1108,11 +1160,17 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
char *string = NULL;
char *tmp_end, *value;
char delim;
+ bool got_ip = false;
+ unsigned short port = 0;
+ struct sockaddr *dstaddr = (struct sockaddr *)&vol->dstaddr;
separator[0] = ',';
separator[1] = 0;
delim = separator[0];
+ /* ensure we always start with zeroed-out smb_vol */
+ memset(vol, 0, sizeof(*vol));
+
/*
* does not have to be perfect mapping since field is
* informational, only used for servers that do not support
@@ -1169,6 +1227,16 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
vol->backupuid_specified = false; /* no backup intent for a user */
vol->backupgid_specified = false; /* no backup intent for a group */
+ /*
+ * For now, we ignore -EINVAL errors under the assumption that the
+ * unc= and prefixpath= options will be usable.
+ */
+ if (cifs_parse_devname(devname, vol) == -ENOMEM) {
+ printk(KERN_ERR "CIFS: Unable to allocate memory to parse "
+ "device string.\n");
+ goto out_nomem;
+ }
+
while ((data = strsep(&options, separator)) != NULL) {
substring_t args[MAX_OPT_ARGS];
unsigned long option;
@@ -1416,12 +1484,12 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
vol->dir_mode = option;
break;
case Opt_port:
- if (get_option_ul(args, &option)) {
- cERROR(1, "%s: Invalid port value",
- __func__);
+ if (get_option_ul(args, &option) ||
+ option > USHRT_MAX) {
+ cERROR(1, "%s: Invalid port value", __func__);
goto cifs_parse_mount_err;
}
- vol->port = option;
+ port = (unsigned short)option;
break;
case Opt_rsize:
if (get_option_ul(args, &option)) {
@@ -1537,53 +1605,48 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
vol->password[j] = '\0';
break;
case Opt_blank_ip:
- vol->UNCip = NULL;
+ /* FIXME: should this be an error instead? */
+ got_ip = false;
break;
case Opt_ip:
string = match_strdup(args);
if (string == NULL)
goto out_nomem;
- if (strnlen(string, INET6_ADDRSTRLEN) >
- INET6_ADDRSTRLEN) {
- printk(KERN_WARNING "CIFS: ip address "
- "too long\n");
- goto cifs_parse_mount_err;
- }
- vol->UNCip = kstrdup(string, GFP_KERNEL);
- if (!vol->UNCip) {
- printk(KERN_WARNING "CIFS: no memory "
- "for UNC IP\n");
+ if (!cifs_convert_address(dstaddr, string,
+ strlen(string))) {
+ printk(KERN_ERR "CIFS: bad ip= option (%s).\n",
+ string);
goto cifs_parse_mount_err;
}
+ got_ip = true;
break;
case Opt_unc:
- string = match_strdup(args);
- if (string == NULL)
+ string = vol->UNC;
+ vol->UNC = match_strdup(args);
+ if (vol->UNC == NULL) {
+ kfree(string);
goto out_nomem;
-
- temp_len = strnlen(string, 300);
- if (temp_len == 300) {
- printk(KERN_WARNING "CIFS: UNC name too long\n");
- goto cifs_parse_mount_err;
}
- vol->UNC = kmalloc(temp_len+1, GFP_KERNEL);
- if (vol->UNC == NULL) {
- printk(KERN_WARNING "CIFS: no memory for UNC\n");
- goto cifs_parse_mount_err;
- }
- strcpy(vol->UNC, string);
-
- if (strncmp(string, "//", 2) == 0) {
- vol->UNC[0] = '\\';
- vol->UNC[1] = '\\';
- } else if (strncmp(string, "\\\\", 2) != 0) {
- printk(KERN_WARNING "CIFS: UNC Path does not "
- "begin with // or \\\\\n");
+ convert_delimiter(vol->UNC, '\\');
+ if (vol->UNC[0] != '\\' || vol->UNC[1] != '\\') {
+ kfree(string);
+ printk(KERN_ERR "CIFS: UNC Path does not "
+ "begin with // or \\\\\n");
goto cifs_parse_mount_err;
}
+ /* Compare old unc= option to new one */
+ if (!string || strcmp(string, vol->UNC))
+ printk(KERN_WARNING "CIFS: the value of the "
+ "unc= mount option does not match the "
+ "device string. Using the unc= option "
+ "for now. In 3.10, that option will "
+ "be ignored and the contents of the "
+ "device string will be used "
+ "instead. (%s != %s)\n", string,
+ vol->UNC);
break;
case Opt_domain:
string = match_strdup(args);
@@ -1618,31 +1681,26 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
}
break;
case Opt_prefixpath:
- string = match_strdup(args);
- if (string == NULL)
- goto out_nomem;
-
- temp_len = strnlen(string, 1024);
- if (string[0] != '/')
- temp_len++; /* missing leading slash */
- if (temp_len > 1024) {
- printk(KERN_WARNING "CIFS: prefix too long\n");
- goto cifs_parse_mount_err;
- }
+ /* skip over any leading delimiter */
+ if (*args[0].from == '/' || *args[0].from == '\\')
+ args[0].from++;
- vol->prepath = kmalloc(temp_len+1, GFP_KERNEL);
+ string = vol->prepath;
+ vol->prepath = match_strdup(args);
if (vol->prepath == NULL) {
- printk(KERN_WARNING "CIFS: no memory "
- "for path prefix\n");
- goto cifs_parse_mount_err;
+ kfree(string);
+ goto out_nomem;
}
-
- if (string[0] != '/') {
- vol->prepath[0] = '/';
- strcpy(vol->prepath+1, string);
- } else
- strcpy(vol->prepath, string);
-
+ /* Compare old prefixpath= option to new one */
+ if (!string || strcmp(string, vol->prepath))
+ printk(KERN_WARNING "CIFS: the value of the "
+ "prefixpath= mount option does not "
+ "match the device string. Using the "
+ "prefixpath= option for now. In 3.10, "
+ "that option will be ignored and the "
+ "contents of the device string will be "
+ "used instead.(%s != %s)\n", string,
+ vol->prepath);
break;
case Opt_iocharset:
string = match_strdup(args);
@@ -1799,9 +1857,30 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
goto cifs_parse_mount_err;
}
#endif
+ if (!vol->UNC) {
+ cERROR(1, "CIFS mount error: No usable UNC path provided in "
+ "device string or in unc= option!");
+ goto cifs_parse_mount_err;
+ }
- if (vol->UNCip == NULL)
- vol->UNCip = &vol->UNC[2];
+ /* make sure UNC has a share name */
+ if (!strchr(vol->UNC + 3, '\\')) {
+ cERROR(1, "Malformed UNC. Unable to find share name.");
+ goto cifs_parse_mount_err;
+ }
+
+ if (!got_ip) {
+ /* No ip= option specified? Try to get it from UNC */
+ if (!cifs_convert_address(dstaddr, &vol->UNC[2],
+ strlen(&vol->UNC[2]))) {
+ printk(KERN_ERR "Unable to determine destination "
+ "address.\n");
+ goto cifs_parse_mount_err;
+ }
+ }
+
+ /* set the port that we got earlier */
+ cifs_set_port(dstaddr, port);
if (uid_specified)
vol->override_uid = override_uid;
@@ -1972,9 +2051,10 @@ match_security(struct TCP_Server_Info *server, struct smb_vol *vol)
return true;
}
-static int match_server(struct TCP_Server_Info *server, struct sockaddr *addr,
- struct smb_vol *vol)
+static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol)
{
+ struct sockaddr *addr = (struct sockaddr *)&vol->dstaddr;
+
if ((server->vals != vol->vals) || (server->ops != vol->ops))
return 0;
@@ -1995,13 +2075,13 @@ static int match_server(struct TCP_Server_Info *server, struct sockaddr *addr,
}
static struct TCP_Server_Info *
-cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol)
+cifs_find_tcp_session(struct smb_vol *vol)
{
struct TCP_Server_Info *server;
spin_lock(&cifs_tcp_ses_lock);
list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) {
- if (!match_server(server, addr, vol))
+ if (!match_server(server, vol))
continue;
++server->srv_count;
@@ -2051,40 +2131,12 @@ static struct TCP_Server_Info *
cifs_get_tcp_session(struct smb_vol *volume_info)
{
struct TCP_Server_Info *tcp_ses = NULL;
- struct sockaddr_storage addr;
- struct sockaddr_in *sin_server = (struct sockaddr_in *) &addr;
- struct sockaddr_in6 *sin_server6 = (struct sockaddr_in6 *) &addr;
int rc;
- memset(&addr, 0, sizeof(struct sockaddr_storage));
-
- cFYI(1, "UNC: %s ip: %s", volume_info->UNC, volume_info->UNCip);
-
- if (volume_info->UNCip && volume_info->UNC) {
- rc = cifs_fill_sockaddr((struct sockaddr *)&addr,
- volume_info->UNCip,
- strlen(volume_info->UNCip),
- volume_info->port);
- if (!rc) {
- /* we failed translating address */
- rc = -EINVAL;
- goto out_err;
- }
- } else if (volume_info->UNCip) {
- /* BB using ip addr as tcp_ses name to connect to the
- DFS root below */
- cERROR(1, "Connecting to DFS root not implemented yet");
- rc = -EINVAL;
- goto out_err;
- } else /* which tcp_sess DFS root would we conect to */ {
- cERROR(1, "CIFS mount error: No UNC path (e.g. -o "
- "unc=//192.168.1.100/public) specified");
- rc = -EINVAL;
- goto out_err;
- }
+ cFYI(1, "UNC: %s", volume_info->UNC);
/* see if we already have a matching tcp_ses */
- tcp_ses = cifs_find_tcp_session((struct sockaddr *)&addr, volume_info);
+ tcp_ses = cifs_find_tcp_session(volume_info);
if (tcp_ses)
return tcp_ses;
@@ -2129,27 +2181,18 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
INIT_LIST_HEAD(&tcp_ses->tcp_ses_list);
INIT_LIST_HEAD(&tcp_ses->smb_ses_list);
INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request);
-
+ memcpy(&tcp_ses->srcaddr, &volume_info->srcaddr,
+ sizeof(tcp_ses->srcaddr));
+ memcpy(&tcp_ses->dstaddr, &volume_info->dstaddr,
+ sizeof(tcp_ses->dstaddr));
/*
* at this point we are the only ones with the pointer
* to the struct since the kernel thread not created yet
* no need to spinlock this init of tcpStatus or srv_count
*/
tcp_ses->tcpStatus = CifsNew;
- memcpy(&tcp_ses->srcaddr, &volume_info->srcaddr,
- sizeof(tcp_ses->srcaddr));
++tcp_ses->srv_count;
- if (addr.ss_family == AF_INET6) {
- cFYI(1, "attempting ipv6 connect");
- /* BB should we allow ipv6 on port 139? */
- /* other OS never observed in Wild doing 139 with v6 */
- memcpy(&tcp_ses->dstaddr, sin_server6,
- sizeof(struct sockaddr_in6));
- } else
- memcpy(&tcp_ses->dstaddr, sin_server,
- sizeof(struct sockaddr_in));
-
rc = ip_connect(tcp_ses);
if (rc < 0) {
cERROR(1, "Error connecting to socket. Aborting operation");
@@ -2397,8 +2440,6 @@ cifs_set_cifscreds(struct smb_vol *vol __attribute__((unused)),
}
#endif /* CONFIG_KEYS */
-static bool warned_on_ntlm; /* globals init to false automatically */
-
static struct cifs_ses *
cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
{
@@ -2475,14 +2516,6 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
ses->cred_uid = volume_info->cred_uid;
ses->linux_uid = volume_info->linux_uid;
- /* ntlmv2 is much stronger than ntlm security, and has been broadly
- supported for many years, time to update default security mechanism */
- if ((volume_info->secFlg == 0) && warned_on_ntlm == false) {
- warned_on_ntlm = true;
- cERROR(1, "default security mechanism requested. The default "
- "security mechanism will be upgraded from ntlm to "
- "ntlmv2 in kernel release 3.3");
- }
ses->overrideSecFlg = volume_info->secFlg;
mutex_lock(&ses->session_mutex);
@@ -2598,13 +2631,6 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info)
}
}
- if (strchr(volume_info->UNC + 3, '\\') == NULL
- && strchr(volume_info->UNC + 3, '/') == NULL) {
- cERROR(1, "Missing share name");
- rc = -ENODEV;
- goto out_fail;
- }
-
/*
* BB Do we need to wrap session_mutex around this TCon call and Unix
* SetFS as we do on SessSetup and reconnect?
@@ -2718,11 +2744,8 @@ cifs_match_super(struct super_block *sb, void *data)
struct cifs_ses *ses;
struct cifs_tcon *tcon;
struct tcon_link *tlink;
- struct sockaddr_storage addr;
int rc = 0;
- memset(&addr, 0, sizeof(struct sockaddr_storage));
-
spin_lock(&cifs_tcp_ses_lock);
cifs_sb = CIFS_SB(sb);
tlink = cifs_get_tlink(cifs_sb_master_tlink(cifs_sb));
@@ -2736,17 +2759,7 @@ cifs_match_super(struct super_block *sb, void *data)
volume_info = mnt_data->vol;
- if (!volume_info->UNCip || !volume_info->UNC)
- goto out;
-
- rc = cifs_fill_sockaddr((struct sockaddr *)&addr,
- volume_info->UNCip,
- strlen(volume_info->UNCip),
- volume_info->port);
- if (!rc)
- goto out;
-
- if (!match_server(tcp_srv, (struct sockaddr *)&addr, volume_info) ||
+ if (!match_server(tcp_srv, volume_info) ||
!match_session(ses, volume_info) ||
!match_tcon(tcon, volume_info->UNC)) {
rc = 0;
@@ -3261,8 +3274,6 @@ cleanup_volume_info_contents(struct smb_vol *volume_info)
{
kfree(volume_info->username);
kzfree(volume_info->password);
- if (volume_info->UNCip != volume_info->UNC + 2)
- kfree(volume_info->UNCip);
kfree(volume_info->UNC);
kfree(volume_info->domainname);
kfree(volume_info->iocharset);
@@ -3280,14 +3291,16 @@ cifs_cleanup_volume_info(struct smb_vol *volume_info)
#ifdef CONFIG_CIFS_DFS_UPCALL
-/* build_path_to_root returns full path to root when
- * we do not have an exiting connection (tcon) */
+/*
+ * build_unc_path_to_root returns the full UNC path to the root when we do not
+ * have an existing connection (tcon)
+ */
static char *
build_unc_path_to_root(const struct smb_vol *vol,
const struct cifs_sb_info *cifs_sb)
{
char *full_path, *pos;
- unsigned int pplen = vol->prepath ? strlen(vol->prepath) : 0;
+ unsigned int pplen = vol->prepath ? strlen(vol->prepath) + 1 : 0;
unsigned int unc_len = strnlen(vol->UNC, MAX_TREE_SIZE + 1);
full_path = kmalloc(unc_len + pplen + 1, GFP_KERNEL);
@@ -3298,6 +3311,7 @@ build_unc_path_to_root(const struct smb_vol *vol,
pos = full_path + unc_len;
if (pplen) {
+ *pos++ = CIFS_DIR_SEP(cifs_sb);
strncpy(pos, vol->prepath, pplen);
pos += pplen;
}
@@ -3353,7 +3367,6 @@ expand_dfs_referral(const unsigned int xid, struct cifs_ses *ses,
mdata = NULL;
} else {
cleanup_volume_info_contents(volume_info);
- memset(volume_info, '\0', sizeof(*volume_info));
rc = cifs_setup_volume_info(volume_info, mdata,
fake_devname);
}
@@ -3375,7 +3388,6 @@ cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data,
if (cifs_parse_mount_options(mount_data, devname, volume_info))
return -EINVAL;
-
if (volume_info->nullauth) {
cFYI(1, "Anonymous login");
kfree(volume_info->username);
@@ -3412,7 +3424,7 @@ cifs_get_volume_info(char *mount_data, const char *devname)
int rc;
struct smb_vol *volume_info;
- volume_info = kzalloc(sizeof(struct smb_vol), GFP_KERNEL);
+ volume_info = kmalloc(sizeof(struct smb_vol), GFP_KERNEL);
if (!volume_info)
return ERR_PTR(-ENOMEM);
@@ -3537,8 +3549,10 @@ remote_path_check:
rc = -ENOSYS;
goto mount_fail_check;
}
- /* build_path_to_root works only when we have a valid tcon */
- full_path = build_path_to_root(volume_info, cifs_sb, tcon);
+ /*
+ * cifs_build_path_to_root works only when we have a valid tcon
+ */
+ full_path = cifs_build_path_to_root(volume_info, cifs_sb, tcon);
if (full_path == NULL) {
rc = -ENOMEM;
goto mount_fail_check;
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 7c0a81283645..8719bbe0dcc3 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -44,6 +44,38 @@ renew_parental_timestamps(struct dentry *direntry)
} while (!IS_ROOT(direntry));
}
+char *
+cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb,
+ struct cifs_tcon *tcon)
+{
+ int pplen = vol->prepath ? strlen(vol->prepath) + 1 : 0;
+ int dfsplen;
+ char *full_path = NULL;
+
+ /* if no prefix path, simply set path to the root of share to "" */
+ if (pplen == 0) {
+ full_path = kzalloc(1, GFP_KERNEL);
+ return full_path;
+ }
+
+ if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
+ dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
+ else
+ dfsplen = 0;
+
+ full_path = kmalloc(dfsplen + pplen + 1, GFP_KERNEL);
+ if (full_path == NULL)
+ return full_path;
+
+ if (dfsplen)
+ strncpy(full_path, tcon->treeName, dfsplen);
+ full_path[dfsplen] = CIFS_DIR_SEP(cifs_sb);
+ strncpy(full_path + dfsplen + 1, vol->prepath, pplen);
+ convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb));
+ full_path[dfsplen + pplen] = 0; /* add trailing null */
+ return full_path;
+}
+
/* Note: caller must free return buffer */
char *
build_path_from_dentry(struct dentry *direntry)
@@ -398,7 +430,16 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
* in network traffic in the other paths.
*/
if (!(oflags & O_CREAT)) {
- struct dentry *res = cifs_lookup(inode, direntry, 0);
+ struct dentry *res;
+
+ /*
+ * Check for hashed negative dentry. We have already revalidated
+ * the dentry and it is fine. No need to perform another lookup.
+ */
+ if (!d_unhashed(direntry))
+ return -ENOENT;
+
+ res = cifs_lookup(inode, direntry, 0);
if (IS_ERR(res))
return PTR_ERR(res);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index edb25b4bbb95..0a6677ba212b 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -505,16 +505,36 @@ out:
return rc;
}
+static int cifs_push_posix_locks(struct cifsFileInfo *cfile);
+
/*
* Try to reacquire byte range locks that were released when session
- * to server was lost
+ * to server was lost.
*/
-static int cifs_relock_file(struct cifsFileInfo *cifsFile)
+static int
+cifs_relock_file(struct cifsFileInfo *cfile)
{
+ struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb);
+ struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
+ struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
int rc = 0;
- /* BB list all locks open on this file and relock */
+ /* we are going to update can_cache_brlcks here - need a write access */
+ down_write(&cinode->lock_sem);
+ if (cinode->can_cache_brlcks) {
+ /* can cache locks - no need to push them */
+ up_write(&cinode->lock_sem);
+ return rc;
+ }
+ if (cap_unix(tcon->ses) &&
+ (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
+ ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
+ rc = cifs_push_posix_locks(cfile);
+ else
+ rc = tcon->ses->server->ops->push_mand_locks(cfile);
+
+ up_write(&cinode->lock_sem);
return rc;
}
@@ -739,10 +759,15 @@ cifs_del_lock_waiters(struct cifsLockInfo *lock)
}
}
+#define CIFS_LOCK_OP 0
+#define CIFS_READ_OP 1
+#define CIFS_WRITE_OP 2
+
+/* @rw_check : 0 - no op, 1 - read, 2 - write */
static bool
cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset,
__u64 length, __u8 type, struct cifsFileInfo *cfile,
- struct cifsLockInfo **conf_lock, bool rw_check)
+ struct cifsLockInfo **conf_lock, int rw_check)
{
struct cifsLockInfo *li;
struct cifsFileInfo *cur_cfile = fdlocks->cfile;
@@ -752,9 +777,13 @@ cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset,
if (offset + length <= li->offset ||
offset >= li->offset + li->length)
continue;
- if (rw_check && server->ops->compare_fids(cfile, cur_cfile) &&
- current->tgid == li->pid)
- continue;
+ if (rw_check != CIFS_LOCK_OP && current->tgid == li->pid &&
+ server->ops->compare_fids(cfile, cur_cfile)) {
+ /* shared lock prevents write op through the same fid */
+ if (!(li->type & server->vals->shared_lock_type) ||
+ rw_check != CIFS_WRITE_OP)
+ continue;
+ }
if ((type & server->vals->shared_lock_type) &&
((server->ops->compare_fids(cfile, cur_cfile) &&
current->tgid == li->pid) || type == li->type))
@@ -769,7 +798,7 @@ cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset,
bool
cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, __u64 length,
__u8 type, struct cifsLockInfo **conf_lock,
- bool rw_check)
+ int rw_check)
{
bool rc = false;
struct cifs_fid_locks *cur;
@@ -805,7 +834,7 @@ cifs_lock_test(struct cifsFileInfo *cfile, __u64 offset, __u64 length,
down_read(&cinode->lock_sem);
exist = cifs_find_lock_conflict(cfile, offset, length, type,
- &conf_lock, false);
+ &conf_lock, CIFS_LOCK_OP);
if (exist) {
flock->fl_start = conf_lock->offset;
flock->fl_end = conf_lock->offset + conf_lock->length - 1;
@@ -852,7 +881,7 @@ try_again:
down_write(&cinode->lock_sem);
exist = cifs_find_lock_conflict(cfile, lock->offset, lock->length,
- lock->type, &conf_lock, false);
+ lock->type, &conf_lock, CIFS_LOCK_OP);
if (!exist && cinode->can_cache_brlcks) {
list_add_tail(&lock->llist, &cfile->llist->locks);
up_write(&cinode->lock_sem);
@@ -948,7 +977,6 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
int rc = 0, stored_rc;
struct cifsLockInfo *li, *tmp;
struct cifs_tcon *tcon;
- struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
unsigned int num, max_num, max_buf;
LOCKING_ANDX_RANGE *buf, *cur;
int types[] = {LOCKING_ANDX_LARGE_FILES,
@@ -958,21 +986,12 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
xid = get_xid();
tcon = tlink_tcon(cfile->tlink);
- /* we are going to update can_cache_brlcks here - need a write access */
- down_write(&cinode->lock_sem);
- if (!cinode->can_cache_brlcks) {
- up_write(&cinode->lock_sem);
- free_xid(xid);
- return rc;
- }
-
/*
* Accessing maxBuf is racy with cifs_reconnect - need to store value
* and check it for zero before using.
*/
max_buf = tcon->ses->server->maxBuf;
if (!max_buf) {
- up_write(&cinode->lock_sem);
free_xid(xid);
return -EINVAL;
}
@@ -981,7 +1000,6 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
sizeof(LOCKING_ANDX_RANGE);
buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL);
if (!buf) {
- up_write(&cinode->lock_sem);
free_xid(xid);
return -ENOMEM;
}
@@ -1018,9 +1036,6 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
}
}
- cinode->can_cache_brlcks = false;
- up_write(&cinode->lock_sem);
-
kfree(buf);
free_xid(xid);
return rc;
@@ -1043,7 +1058,6 @@ struct lock_to_push {
static int
cifs_push_posix_locks(struct cifsFileInfo *cfile)
{
- struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
struct file_lock *flock, **before;
unsigned int count = 0, i = 0;
@@ -1054,14 +1068,6 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
xid = get_xid();
- /* we are going to update can_cache_brlcks here - need a write access */
- down_write(&cinode->lock_sem);
- if (!cinode->can_cache_brlcks) {
- up_write(&cinode->lock_sem);
- free_xid(xid);
- return rc;
- }
-
lock_flocks();
cifs_for_each_lock(cfile->dentry->d_inode, before) {
if ((*before)->fl_flags & FL_POSIX)
@@ -1127,9 +1133,6 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
}
out:
- cinode->can_cache_brlcks = false;
- up_write(&cinode->lock_sem);
-
free_xid(xid);
return rc;
err_out:
@@ -1144,14 +1147,27 @@ static int
cifs_push_locks(struct cifsFileInfo *cfile)
{
struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb);
+ struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
+ int rc = 0;
+
+ /* we are going to update can_cache_brlcks here - need a write access */
+ down_write(&cinode->lock_sem);
+ if (!cinode->can_cache_brlcks) {
+ up_write(&cinode->lock_sem);
+ return rc;
+ }
if (cap_unix(tcon->ses) &&
(CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
- return cifs_push_posix_locks(cfile);
+ rc = cifs_push_posix_locks(cfile);
+ else
+ rc = tcon->ses->server->ops->push_mand_locks(cfile);
- return tcon->ses->server->ops->push_mand_locks(cfile);
+ cinode->can_cache_brlcks = false;
+ up_write(&cinode->lock_sem);
+ return rc;
}
static void
@@ -1436,16 +1452,18 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
return -ENOMEM;
rc = cifs_lock_add_if(cfile, lock, wait_flag);
- if (rc < 0)
+ if (rc < 0) {
kfree(lock);
- if (rc <= 0)
+ return rc;
+ }
+ if (!rc)
goto out;
rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length,
type, 1, 0, wait_flag);
if (rc) {
kfree(lock);
- goto out;
+ return rc;
}
cifs_lock_add(cfile, lock);
@@ -1794,7 +1812,6 @@ static int cifs_writepages(struct address_space *mapping,
struct TCP_Server_Info *server;
struct page *page;
int rc = 0;
- loff_t isize = i_size_read(mapping->host);
/*
* If wsize is smaller than the page cache size, default to writing
@@ -1899,7 +1916,7 @@ retry:
*/
set_page_writeback(page);
- if (page_offset(page) >= isize) {
+ if (page_offset(page) >= i_size_read(mapping->host)) {
done = true;
unlock_page(page);
end_page_writeback(page);
@@ -1932,7 +1949,8 @@ retry:
wdata->offset = page_offset(wdata->pages[0]);
wdata->pagesz = PAGE_CACHE_SIZE;
wdata->tailsz =
- min(isize - page_offset(wdata->pages[nr_pages - 1]),
+ min(i_size_read(mapping->host) -
+ page_offset(wdata->pages[nr_pages - 1]),
(loff_t)PAGE_CACHE_SIZE);
wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) +
wdata->tailsz;
@@ -2085,7 +2103,15 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
} else {
rc = copied;
pos += copied;
- set_page_dirty(page);
+ /*
+ * When we use strict cache mode and cifs_strict_writev was run
+ * with level II oplock (indicated by leave_pages_clean field of
+ * CIFS_I(inode)), we can leave pages clean - cifs_strict_writev
+ * sent the data to the server itself.
+ */
+ if (!CIFS_I(inode)->leave_pages_clean ||
+ !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO))
+ set_page_dirty(page);
}
if (rc > 0) {
@@ -2436,8 +2462,8 @@ ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov,
}
static ssize_t
-cifs_writev(struct kiocb *iocb, const struct iovec *iov,
- unsigned long nr_segs, loff_t pos)
+cifs_pagecache_writev(struct kiocb *iocb, const struct iovec *iov,
+ unsigned long nr_segs, loff_t pos, bool cache_ex)
{
struct file *file = iocb->ki_filp;
struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data;
@@ -2457,10 +2483,14 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov,
down_read(&cinode->lock_sem);
if (!cifs_find_lock_conflict(cfile, pos, iov_length(iov, nr_segs),
server->vals->exclusive_lock_type, NULL,
- true)) {
+ CIFS_WRITE_OP)) {
mutex_lock(&inode->i_mutex);
+ if (!cache_ex)
+ cinode->leave_pages_clean = true;
rc = __generic_file_aio_write(iocb, iov, nr_segs,
- &iocb->ki_pos);
+ &iocb->ki_pos);
+ if (!cache_ex)
+ cinode->leave_pages_clean = false;
mutex_unlock(&inode->i_mutex);
}
@@ -2487,42 +2517,62 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
struct cifsFileInfo *cfile = (struct cifsFileInfo *)
iocb->ki_filp->private_data;
struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
-
-#ifdef CONFIG_CIFS_SMB2
+ ssize_t written, written2;
/*
- * If we have an oplock for read and want to write a data to the file
- * we need to store it in the page cache and then push it to the server
- * to be sure the next read will get a valid data.
+ * We need to store clientCanCacheAll here to prevent race
+ * conditions - this value can be changed during an execution
+ * of generic_file_aio_write. For CIFS it can be changed from
+ * true to false only, but for SMB2 it can be changed both from
+ * true to false and vice versa. So, we can end up with data
+ * stored in the cache, not marked dirty and not sent to the
+ * server if this value changes its state from false to true
+ * after cifs_write_end.
*/
- if (!cinode->clientCanCacheAll && cinode->clientCanCacheRead) {
- ssize_t written;
- int rc;
-
- written = generic_file_aio_write(iocb, iov, nr_segs, pos);
- rc = filemap_fdatawrite(inode->i_mapping);
- if (rc)
- return (ssize_t)rc;
+ bool cache_ex = cinode->clientCanCacheAll;
+ bool cache_read = cinode->clientCanCacheRead;
+ int rc;
+ loff_t saved_pos;
- return written;
+ if (cache_ex) {
+ if (cap_unix(tcon->ses) &&
+ ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0) &&
+ (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(
+ tcon->fsUnixInfo.Capability)))
+ return generic_file_aio_write(iocb, iov, nr_segs, pos);
+ return cifs_pagecache_writev(iocb, iov, nr_segs, pos, cache_ex);
}
-#endif
/*
- * For non-oplocked files in strict cache mode we need to write the data
- * to the server exactly from the pos to pos+len-1 rather than flush all
- * affected pages because it may cause a error with mandatory locks on
- * these pages but not on the region from pos to ppos+len-1.
+ * For files without exclusive oplock in strict cache mode we need to
+ * write the data to the server exactly from the pos to pos+len-1 rather
+ * than flush all affected pages because it may cause an error with
+ * mandatory locks on these pages but not on the region from pos to
+ * pos+len-1.
*/
+ written = cifs_user_writev(iocb, iov, nr_segs, pos);
+ if (!cache_read || written <= 0)
+ return written;
- if (!cinode->clientCanCacheAll)
- return cifs_user_writev(iocb, iov, nr_segs, pos);
-
+ saved_pos = iocb->ki_pos;
+ iocb->ki_pos = pos;
+ /* we have a read oplock - need to store the data in the page cache */
if (cap_unix(tcon->ses) &&
- (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) &&
- ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
- return generic_file_aio_write(iocb, iov, nr_segs, pos);
-
- return cifs_writev(iocb, iov, nr_segs, pos);
+ ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0) &&
+ (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(
+ tcon->fsUnixInfo.Capability)))
+ written2 = generic_file_aio_write(iocb, iov, nr_segs, pos);
+ else
+ written2 = cifs_pagecache_writev(iocb, iov, nr_segs, pos,
+ cache_ex);
+ /* errors occurred during writing - invalidate the page cache */
+ if (written2 < 0) {
+ rc = cifs_invalidate_mapping(inode);
+ if (rc)
+ written = (ssize_t)rc;
+ else
+ iocb->ki_pos = saved_pos;
+ }
+ return written;
}
static struct cifs_readdata *
@@ -2892,7 +2942,7 @@ cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
down_read(&cinode->lock_sem);
if (!cifs_find_lock_conflict(cfile, pos, iov_length(iov, nr_segs),
tcon->ses->server->vals->shared_lock_type,
- NULL, true))
+ NULL, CIFS_READ_OP))
rc = generic_file_aio_read(iocb, iov, nr_segs, pos);
up_read(&cinode->lock_sem);
return rc;
@@ -3536,7 +3586,7 @@ void cifs_oplock_break(struct work_struct *work)
if (cinode->clientCanCacheRead == 0) {
rc = filemap_fdatawait(inode->i_mapping);
mapping_set_error(inode->i_mapping, rc);
- invalidate_remote_inode(inode);
+ cifs_invalidate_mapping(inode);
}
cFYI(1, "Oplock flush inode %p rc %d", inode, rc);
}
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index afdff79651f1..ed6208ff85a7 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1791,11 +1791,12 @@ int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
stat->ino = CIFS_I(inode)->uniqueid;
/*
- * If on a multiuser mount without unix extensions, and the admin hasn't
- * overridden them, set the ownership to the fsuid/fsgid of the current
- * process.
+ * If on a multiuser mount without unix extensions or cifsacl being
+ * enabled, and the admin hasn't overridden them, set the ownership
+ * to the fsuid/fsgid of the current process.
*/
if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) &&
+ !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) &&
!tcon->unix_ext) {
if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID))
stat->uid = current_fsuid();
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index d5ce9e26696c..a82bc51fdc82 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -204,7 +204,7 @@ cifs_convert_address(struct sockaddr *dst, const char *src, int len)
return rc;
}
-int
+void
cifs_set_port(struct sockaddr *addr, const unsigned short int port)
{
switch (addr->sa_family) {
@@ -214,19 +214,7 @@ cifs_set_port(struct sockaddr *addr, const unsigned short int port)
case AF_INET6:
((struct sockaddr_in6 *)addr)->sin6_port = htons(port);
break;
- default:
- return 0;
}
- return 1;
-}
-
-int
-cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len,
- const unsigned short int port)
-{
- if (!cifs_convert_address(dst, src, len))
- return 0;
- return cifs_set_port(dst, port);
}
/*****************************************************************************
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index f9b5d3d6cf33..6002fdc920ae 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -66,18 +66,20 @@ static inline void dump_cifs_file_struct(struct file *file, char *label)
#endif /* DEBUG2 */
/*
+ * Attempt to preload the dcache with the results from the FIND_FIRST/NEXT
+ *
* Find the dentry that matches "name". If there isn't one, create one. If it's
* a negative dentry or the uniqueid changed, then drop it and recreate it.
*/
-static struct dentry *
-cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
+static void
+cifs_prime_dcache(struct dentry *parent, struct qstr *name,
struct cifs_fattr *fattr)
{
struct dentry *dentry, *alias;
struct inode *inode;
struct super_block *sb = parent->d_inode->i_sb;
- cFYI(1, "For %s", name->name);
+ cFYI(1, "%s: for %s", __func__, name->name);
if (parent->d_op && parent->d_op->d_hash)
parent->d_op->d_hash(parent, parent->d_inode, name);
@@ -86,35 +88,33 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name,
dentry = d_lookup(parent, name);
if (dentry) {
+ int err;
+
inode = dentry->d_inode;
/* update inode in place if i_ino didn't change */
if (inode && CIFS_I(inode)->uniqueid == fattr->cf_uniqueid) {
cifs_fattr_to_inode(inode, fattr);
- return dentry;
+ goto out;
}
- d_drop(dentry);
+ err = d_invalidate(dentry);
dput(dentry);
+ if (err)
+ return;
}
dentry = d_alloc(parent, name);
- if (dentry == NULL)
- return NULL;
+ if (!dentry)
+ return;
inode = cifs_iget(sb, fattr);
- if (!inode) {
- dput(dentry);
- return NULL;
- }
+ if (!inode)
+ goto out;
alias = d_materialise_unique(dentry, inode);
- if (alias != NULL) {
- dput(dentry);
- if (IS_ERR(alias))
- return NULL;
- dentry = alias;
- }
-
- return dentry;
+ if (alias && !IS_ERR(alias))
+ dput(alias);
+out:
+ dput(dentry);
}
static void
@@ -134,6 +134,16 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
if (fattr->cf_cifsattrs & ATTR_READONLY)
fattr->cf_mode &= ~S_IWUGO;
+ /*
+ * We of course don't get ACL info in FIND_FIRST/NEXT results, so
+ * mark it for revalidation so that "ls -l" will look right. It might
+ * be super-slow, but if we don't do this then the ownership of files
+ * may look wrong since the inodes may not have timed out by the time
+ * "ls" does a stat() call on them.
+ */
+ if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL)
+ fattr->cf_flags |= CIFS_FATTR_NEED_REVAL;
+
if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL &&
fattr->cf_cifsattrs & ATTR_SYSTEM) {
if (fattr->cf_eof == 0) {
@@ -649,7 +659,6 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir,
struct cifs_sb_info *cifs_sb = CIFS_SB(sb);
struct cifs_dirent de = { NULL, };
struct cifs_fattr fattr;
- struct dentry *dentry;
struct qstr name;
int rc = 0;
ino_t ino;
@@ -720,13 +729,11 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir,
*/
fattr.cf_flags |= CIFS_FATTR_NEED_REVAL;
- ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid);
- dentry = cifs_readdir_lookup(file->f_dentry, &name, &fattr);
+ cifs_prime_dcache(file->f_dentry, &name, &fattr);
+ ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid);
rc = filldir(dirent, name.name, name.len, file->f_pos, ino,
fattr.cf_dtype);
-
- dput(dentry);
return rc;
}
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 56cc4be87807..a5d234c8d5d9 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -575,37 +575,6 @@ cifs_query_file_info(const unsigned int xid, struct cifs_tcon *tcon,
return CIFSSMBQFileInfo(xid, tcon, fid->netfid, data);
}
-static char *
-cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb,
- struct cifs_tcon *tcon)
-{
- int pplen = vol->prepath ? strlen(vol->prepath) : 0;
- int dfsplen;
- char *full_path = NULL;
-
- /* if no prefix path, simply set path to the root of share to "" */
- if (pplen == 0) {
- full_path = kzalloc(1, GFP_KERNEL);
- return full_path;
- }
-
- if (tcon->Flags & SMB_SHARE_IS_IN_DFS)
- dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1);
- else
- dfsplen = 0;
-
- full_path = kmalloc(dfsplen + pplen + 1, GFP_KERNEL);
- if (full_path == NULL)
- return full_path;
-
- if (dfsplen)
- strncpy(full_path, tcon->treeName, dfsplen);
- strncpy(full_path + dfsplen, vol->prepath, pplen);
- convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb));
- full_path[dfsplen + pplen] = 0; /* add trailing null */
- return full_path;
-}
-
static void
cifs_clear_stats(struct cifs_tcon *tcon)
{
@@ -766,7 +735,6 @@ smb_set_file_info(struct inode *inode, const char *full_path,
struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
struct tcon_link *tlink = NULL;
struct cifs_tcon *tcon;
- FILE_BASIC_INFO info_buf;
/* if the file is already open for write, just use that fileid */
open_file = find_writable_file(cinode, true);
@@ -817,7 +785,7 @@ smb_set_file_info(struct inode *inode, const char *full_path,
netpid = current->tgid;
set_via_filehandle:
- rc = CIFSSMBSetFileInfo(xid, tcon, &info_buf, netfid, netpid);
+ rc = CIFSSMBSetFileInfo(xid, tcon, buf, netfid, netpid);
if (!rc)
cinode->cifsAttrs = le32_to_cpu(buf->Attributes);
@@ -944,7 +912,6 @@ struct smb_version_operations smb1_operations = {
.set_path_size = CIFSSMBSetEOF,
.set_file_size = CIFSSMBSetFileSize,
.set_file_info = smb_set_file_info,
- .build_path_to_root = cifs_build_path_to_root,
.echo = CIFSSMBEcho,
.mkdir = CIFSSMBMkDir,
.mkdir_setinfo = cifs_mkdir_setinfo,
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index a93eec30a50d..71e6aed4b382 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -260,13 +260,6 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile)
struct cifs_fid_locks *fdlocks;
xid = get_xid();
- /* we are going to update can_cache_brlcks here - need a write access */
- down_write(&cinode->lock_sem);
- if (!cinode->can_cache_brlcks) {
- up_write(&cinode->lock_sem);
- free_xid(xid);
- return rc;
- }
/*
* Accessing maxBuf is racy with cifs_reconnect - need to store value
@@ -274,7 +267,6 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile)
*/
max_buf = tlink_tcon(cfile->tlink)->ses->server->maxBuf;
if (!max_buf) {
- up_write(&cinode->lock_sem);
free_xid(xid);
return -EINVAL;
}
@@ -282,7 +274,6 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile)
max_num = max_buf / sizeof(struct smb2_lock_element);
buf = kzalloc(max_num * sizeof(struct smb2_lock_element), GFP_KERNEL);
if (!buf) {
- up_write(&cinode->lock_sem);
free_xid(xid);
return -ENOMEM;
}
@@ -293,10 +284,7 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile)
rc = stored_rc;
}
- cinode->can_cache_brlcks = false;
kfree(buf);
-
- up_write(&cinode->lock_sem);
free_xid(xid);
return rc;
}
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 4d9dbe0b7385..d79de7bc4435 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -262,23 +262,6 @@ smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon,
return rc;
}
-static char *
-smb2_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb,
- struct cifs_tcon *tcon)
-{
- int pplen = vol->prepath ? strlen(vol->prepath) : 0;
- char *full_path = NULL;
-
- /* if no prefix path, simply set path to the root of share to "" */
- if (pplen == 0) {
- full_path = kzalloc(2, GFP_KERNEL);
- return full_path;
- }
-
- cERROR(1, "prefixpath is not supported for SMB2 now");
- return NULL;
-}
-
static bool
smb2_can_echo(struct TCP_Server_Info *server)
{
@@ -613,7 +596,6 @@ struct smb_version_operations smb21_operations = {
.set_path_size = smb2_set_path_size,
.set_file_size = smb2_set_file_size,
.set_file_info = smb2_set_file_info,
- .build_path_to_root = smb2_build_path_to_root,
.mkdir = smb2_mkdir,
.mkdir_setinfo = smb2_mkdir_setinfo,
.rmdir = smb2_rmdir,
@@ -641,6 +623,91 @@ struct smb_version_operations smb21_operations = {
.get_lease_key = smb2_get_lease_key,
.set_lease_key = smb2_set_lease_key,
.new_lease_key = smb2_new_lease_key,
+ .calc_signature = smb2_calc_signature,
+};
+
+
+struct smb_version_operations smb30_operations = {
+ .compare_fids = smb2_compare_fids,
+ .setup_request = smb2_setup_request,
+ .setup_async_request = smb2_setup_async_request,
+ .check_receive = smb2_check_receive,
+ .add_credits = smb2_add_credits,
+ .set_credits = smb2_set_credits,
+ .get_credits_field = smb2_get_credits_field,
+ .get_credits = smb2_get_credits,
+ .get_next_mid = smb2_get_next_mid,
+ .read_data_offset = smb2_read_data_offset,
+ .read_data_length = smb2_read_data_length,
+ .map_error = map_smb2_to_linux_error,
+ .find_mid = smb2_find_mid,
+ .check_message = smb2_check_message,
+ .dump_detail = smb2_dump_detail,
+ .clear_stats = smb2_clear_stats,
+ .print_stats = smb2_print_stats,
+ .is_oplock_break = smb2_is_valid_oplock_break,
+ .need_neg = smb2_need_neg,
+ .negotiate = smb2_negotiate,
+ .negotiate_wsize = smb2_negotiate_wsize,
+ .negotiate_rsize = smb2_negotiate_rsize,
+ .sess_setup = SMB2_sess_setup,
+ .logoff = SMB2_logoff,
+ .tree_connect = SMB2_tcon,
+ .tree_disconnect = SMB2_tdis,
+ .is_path_accessible = smb2_is_path_accessible,
+ .can_echo = smb2_can_echo,
+ .echo = SMB2_echo,
+ .query_path_info = smb2_query_path_info,
+ .get_srv_inum = smb2_get_srv_inum,
+ .query_file_info = smb2_query_file_info,
+ .set_path_size = smb2_set_path_size,
+ .set_file_size = smb2_set_file_size,
+ .set_file_info = smb2_set_file_info,
+ .mkdir = smb2_mkdir,
+ .mkdir_setinfo = smb2_mkdir_setinfo,
+ .rmdir = smb2_rmdir,
+ .unlink = smb2_unlink,
+ .rename = smb2_rename_path,
+ .create_hardlink = smb2_create_hardlink,
+ .open = smb2_open_file,
+ .set_fid = smb2_set_fid,
+ .close = smb2_close_file,
+ .flush = smb2_flush_file,
+ .async_readv = smb2_async_readv,
+ .async_writev = smb2_async_writev,
+ .sync_read = smb2_sync_read,
+ .sync_write = smb2_sync_write,
+ .query_dir_first = smb2_query_dir_first,
+ .query_dir_next = smb2_query_dir_next,
+ .close_dir = smb2_close_dir,
+ .calc_smb_size = smb2_calc_size,
+ .is_status_pending = smb2_is_status_pending,
+ .oplock_response = smb2_oplock_response,
+ .queryfs = smb2_queryfs,
+ .mand_lock = smb2_mand_lock,
+ .mand_unlock_range = smb2_unlock_range,
+ .push_mand_locks = smb2_push_mandatory_locks,
+ .get_lease_key = smb2_get_lease_key,
+ .set_lease_key = smb2_set_lease_key,
+ .new_lease_key = smb2_new_lease_key,
+ .calc_signature = smb3_calc_signature,
+};
+
+struct smb_version_values smb20_values = {
+ .version_string = SMB20_VERSION_STRING,
+ .protocol_id = SMB20_PROT_ID,
+ .req_capabilities = 0, /* MBZ */
+ .large_lock_type = 0,
+ .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
+ .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
+ .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK,
+ .header_size = sizeof(struct smb2_hdr),
+ .max_header_size = MAX_SMB2_HDR_SIZE,
+ .read_rsp_size = sizeof(struct smb2_read_rsp) - 1,
+ .lock_cmd = SMB2_LOCK,
+ .cap_unix = 0,
+ .cap_nt_find = SMB2_NT_FIND,
+ .cap_large_files = SMB2_LARGE_FILES,
};
struct smb_version_values smb21_values = {
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index cf33622cdac8..41d9d0725f0f 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -425,7 +425,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses)
}
cFYI(1, "sec_flags 0x%x", sec_flags);
- if (sec_flags & CIFSSEC_MUST_SIGN) {
+ if ((sec_flags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) {
cFYI(1, "Signing required");
if (!(server->sec_mode & (SMB2_NEGOTIATE_SIGNING_REQUIRED |
SMB2_NEGOTIATE_SIGNING_ENABLED))) {
@@ -612,7 +612,8 @@ ssetup_ntlmssp_authenticate:
/* BB add code to build os and lm fields */
- rc = SendReceive2(xid, ses, iov, 2, &resp_buftype, CIFS_LOG_ERROR);
+ rc = SendReceive2(xid, ses, iov, 2, &resp_buftype,
+ CIFS_LOG_ERROR | CIFS_NEG_OP);
kfree(security_blob);
rsp = (struct smb2_sess_setup_rsp *)iov[0].iov_base;
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 7d25f8b14f93..2aa3535e38ce 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -47,6 +47,10 @@ extern struct mid_q_entry *smb2_setup_request(struct cifs_ses *ses,
struct smb_rqst *rqst);
extern struct mid_q_entry *smb2_setup_async_request(
struct TCP_Server_Info *server, struct smb_rqst *rqst);
+extern int smb2_calc_signature(struct smb_rqst *rqst,
+ struct TCP_Server_Info *server);
+extern int smb3_calc_signature(struct smb_rqst *rqst,
+ struct TCP_Server_Info *server);
extern void smb2_echo_request(struct work_struct *work);
extern __le32 smb2_get_lease_state(struct cifsInodeInfo *cinode);
extern __u8 smb2_map_lease_to_oplock(__le32 lease_state);
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 2a5fdf26f79f..8dd73e61d762 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -39,7 +39,7 @@
#include "smb2status.h"
#include "smb2glob.h"
-static int
+int
smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
{
int i, rc;
@@ -116,6 +116,13 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
return rc;
}
+int
+smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
+{
+ cFYI(1, "smb3 signatures not supported yet");
+ return -EOPNOTSUPP;
+}
+
/* must be called with server->srv_mutex held */
static int
smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server)
@@ -132,7 +139,7 @@ smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server)
return rc;
}
- rc = smb2_calc_signature(rqst, server);
+ rc = server->ops->calc_signature(rqst, server);
return rc;
}
@@ -168,7 +175,7 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
memset(smb2_pdu->Signature, 0, SMB2_SIGNATURE_SIZE);
mutex_lock(&server->srv_mutex);
- rc = smb2_calc_signature(rqst, server);
+ rc = server->ops->calc_signature(rqst, server);
mutex_unlock(&server->srv_mutex);
if (rc)
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 4c6285fff598..e2f57a007029 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -844,6 +844,9 @@ COMPATIBLE_IOCTL(TIOCGDEV)
COMPATIBLE_IOCTL(TIOCCBRK)
COMPATIBLE_IOCTL(TIOCGSID)
COMPATIBLE_IOCTL(TIOCGICOUNT)
+COMPATIBLE_IOCTL(TIOCGPKT)
+COMPATIBLE_IOCTL(TIOCGPTLCK)
+COMPATIBLE_IOCTL(TIOCGEXCL)
/* Little t */
COMPATIBLE_IOCTL(TIOCGETD)
COMPATIBLE_IOCTL(TIOCSETD)
diff --git a/fs/coredump.c b/fs/coredump.c
index ce47379bfa61..177493272a61 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -458,7 +458,7 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
return err;
}
-void do_coredump(siginfo_t *siginfo, struct pt_regs *regs)
+void do_coredump(siginfo_t *siginfo)
{
struct core_state core_state;
struct core_name cn;
@@ -474,7 +474,7 @@ void do_coredump(siginfo_t *siginfo, struct pt_regs *regs)
static atomic_t core_dump_count = ATOMIC_INIT(0);
struct coredump_params cprm = {
.siginfo = siginfo,
- .regs = regs,
+ .regs = signal_pt_regs(),
.limit = rlimit(RLIMIT_CORE),
/*
* We must use the same mm->flags while dumping core to avoid
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index b607d92cdf24..153bb1e42e63 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -59,7 +59,6 @@ static struct inode *debugfs_get_inode(struct super_block *sb, umode_t mode, dev
case S_IFDIR:
inode->i_op = &simple_dir_inode_operations;
inode->i_fop = &simple_dir_operations;
- inode->i_private = NULL;
/* directory inodes start off with i_nlink == 2
* (for "." entry) */
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 14afbabe6546..472e6befc54d 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -545,37 +545,38 @@ void devpts_kill_index(struct inode *ptmx_inode, int idx)
mutex_unlock(&allocated_ptys_lock);
}
-int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
+/**
+ * devpts_pty_new -- create a new inode in /dev/pts/
+ * @ptmx_inode: inode of the master
+ * @device: major+minor of the node to be created
+ * @index: used as a name of the node
+ * @priv: what's given back by devpts_get_priv
+ *
+ * The created inode is returned. Remove it from /dev/pts/ by devpts_pty_kill.
+ */
+struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index,
+ void *priv)
{
- /* tty layer puts index from devpts_new_index() in here */
- int number = tty->index;
- struct tty_driver *driver = tty->driver;
- dev_t device = MKDEV(driver->major, driver->minor_start+number);
struct dentry *dentry;
struct super_block *sb = pts_sb_from_inode(ptmx_inode);
- struct inode *inode = new_inode(sb);
+ struct inode *inode;
struct dentry *root = sb->s_root;
struct pts_fs_info *fsi = DEVPTS_SB(sb);
struct pts_mount_opts *opts = &fsi->mount_opts;
- int ret = 0;
char s[12];
- /* We're supposed to be given the slave end of a pty */
- BUG_ON(driver->type != TTY_DRIVER_TYPE_PTY);
- BUG_ON(driver->subtype != PTY_TYPE_SLAVE);
-
+ inode = new_inode(sb);
if (!inode)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
- inode->i_ino = number + 3;
+ inode->i_ino = index + 3;
inode->i_uid = opts->setuid ? opts->uid : current_fsuid();
inode->i_gid = opts->setgid ? opts->gid : current_fsgid();
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
init_special_inode(inode, S_IFCHR|opts->mode, device);
- inode->i_private = tty;
- tty->driver_data = inode;
+ inode->i_private = priv;
- sprintf(s, "%d", number);
+ sprintf(s, "%d", index);
mutex_lock(&root->d_inode->i_mutex);
@@ -585,18 +586,24 @@ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty)
fsnotify_create(root->d_inode, dentry);
} else {
iput(inode);
- ret = -ENOMEM;
+ inode = ERR_PTR(-ENOMEM);
}
mutex_unlock(&root->d_inode->i_mutex);
- return ret;
+ return inode;
}
-struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number)
+/**
+ * devpts_get_priv -- get private data for a slave
+ * @pts_inode: inode of the slave
+ *
+ * Returns whatever was passed as priv in devpts_pty_new for a given inode.
+ */
+void *devpts_get_priv(struct inode *pts_inode)
{
struct dentry *dentry;
- struct tty_struct *tty;
+ void *priv = NULL;
BUG_ON(pts_inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR));
@@ -605,18 +612,22 @@ struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number)
if (!dentry)
return NULL;
- tty = NULL;
if (pts_inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
- tty = (struct tty_struct *)pts_inode->i_private;
+ priv = pts_inode->i_private;
dput(dentry);
- return tty;
+ return priv;
}
-void devpts_pty_kill(struct tty_struct *tty)
+/**
+ * devpts_pty_kill -- remove inode from /dev/pts/
+ * @inode: inode of the slave to be removed
+ *
+ * This is an inverse operation of devpts_pty_new.
+ */
+void devpts_pty_kill(struct inode *inode)
{
- struct inode *inode = tty->driver_data;
struct super_block *sb = pts_sb_from_inode(inode);
struct dentry *root = sb->s_root;
struct dentry *dentry;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index f86c720dba0e..cf5b44b10c67 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -540,6 +540,7 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
sector_t fs_endblk; /* Into file, in filesystem-sized blocks */
unsigned long fs_count; /* Number of filesystem-sized blocks */
int create;
+ unsigned int i_blkbits = sdio->blkbits + sdio->blkfactor;
/*
* If there was a memory error and we've overwritten all the
@@ -554,7 +555,7 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
fs_count = fs_endblk - fs_startblk + 1;
map_bh->b_state = 0;
- map_bh->b_size = fs_count << dio->inode->i_blkbits;
+ map_bh->b_size = fs_count << i_blkbits;
/*
* For writes inside i_size on a DIO_SKIP_HOLES filesystem we
@@ -1053,7 +1054,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
int seg;
size_t size;
unsigned long addr;
- unsigned blkbits = inode->i_blkbits;
+ unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits);
+ unsigned blkbits = i_blkbits;
unsigned blocksize_mask = (1 << blkbits) - 1;
ssize_t retval = -EINVAL;
loff_t end = offset;
@@ -1149,7 +1151,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
dio->inode = inode;
dio->rw = rw;
sdio.blkbits = blkbits;
- sdio.blkfactor = inode->i_blkbits - blkbits;
+ sdio.blkfactor = i_blkbits - blkbits;
sdio.block_in_file = offset >> blkbits;
sdio.get_block = get_block;
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
index 1897eb1b4b6a..e4242c3f8486 100644
--- a/fs/dlm/Kconfig
+++ b/fs/dlm/Kconfig
@@ -1,6 +1,6 @@
menuconfig DLM
tristate "Distributed Lock Manager (DLM)"
- depends on EXPERIMENTAL && INET
+ depends on INET
depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n)
select IP_SCTP
help
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h
index 871c1abf6029..77c0f70f8fe8 100644
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -337,6 +337,7 @@ enum rsb_flags {
RSB_NEW_MASTER2,
RSB_RECOVER_CONVERT,
RSB_RECOVER_GRANT,
+ RSB_RECOVER_LVB_INVAL,
};
static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag)
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index b56950758188..a579f30f237d 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -5393,6 +5393,13 @@ static void purge_dead_list(struct dlm_ls *ls, struct dlm_rsb *r,
if ((lkb->lkb_nodeid == nodeid_gone) ||
dlm_is_removed(ls, lkb->lkb_nodeid)) {
+ /* tell recover_lvb to invalidate the lvb
+ because a node holding EX/PW failed */
+ if ((lkb->lkb_exflags & DLM_LKF_VALBLK) &&
+ (lkb->lkb_grmode >= DLM_LOCK_PW)) {
+ rsb_set_flag(r, RSB_RECOVER_LVB_INVAL);
+ }
+
del_lkb(r, lkb);
/* this put should free the lkb */
@@ -6025,15 +6032,18 @@ static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
return error;
}
-/* The force flag allows the unlock to go ahead even if the lkb isn't granted.
- Regardless of what rsb queue the lock is on, it's removed and freed. */
+/* The FORCEUNLOCK flag allows the unlock to go ahead even if the lkb isn't
+ granted. Regardless of what rsb queue the lock is on, it's removed and
+ freed. The IVVALBLK flag causes the lvb on the resource to be invalidated
+ if our lock is PW/EX (it's ignored if our granted mode is smaller.) */
static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb)
{
struct dlm_args args;
int error;
- set_unlock_args(DLM_LKF_FORCEUNLOCK, lkb->lkb_ua, &args);
+ set_unlock_args(DLM_LKF_FORCEUNLOCK | DLM_LKF_IVVALBLK,
+ lkb->lkb_ua, &args);
error = unlock_lock(ls, lkb, &args);
if (error == -DLM_EUNLOCK)
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 331ea4f94efd..dd87a31bcc21 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -1385,7 +1385,6 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
struct connection *con;
struct writequeue_entry *e;
int offset = 0;
- int users = 0;
con = nodeid2con(nodeid, allocation);
if (!con)
@@ -1399,7 +1398,7 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
} else {
offset = e->end;
e->end += len;
- users = e->users++;
+ e->users++;
}
spin_unlock(&con->writequeue_lock);
@@ -1414,7 +1413,7 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc)
spin_lock(&con->writequeue_lock);
offset = e->end;
e->end += len;
- users = e->users++;
+ e->users++;
list_add_tail(&e->list, &con->writequeue);
spin_unlock(&con->writequeue_lock);
goto got_one;
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c
index 4a7a76e42fc3..aedea28a86a1 100644
--- a/fs/dlm/recover.c
+++ b/fs/dlm/recover.c
@@ -717,8 +717,14 @@ void dlm_recovered_lock(struct dlm_rsb *r)
* the VALNOTVALID flag if necessary, and determining the correct lvb contents
* based on the lvb's of the locks held on the rsb.
*
- * RSB_VALNOTVALID is set if there are only NL/CR locks on the rsb. If it
- * was already set prior to recovery, it's not cleared, regardless of locks.
+ * RSB_VALNOTVALID is set in two cases:
+ *
+ * 1. we are master, but not new, and we purged an EX/PW lock held by a
+ * failed node (in dlm_recover_purge which set RSB_RECOVER_LVB_INVAL)
+ *
+ * 2. we are a new master, and there are only NL/CR locks left.
+ * (We could probably improve this by only invalidating in this way when
+ * the previous master left uncleanly. VMS docs mention that.)
*
* The LVB contents are only considered for changing when this is a new master
* of the rsb (NEW_MASTER2). Then, the rsb's lvb is taken from any lkb with
@@ -734,6 +740,19 @@ static void recover_lvb(struct dlm_rsb *r)
int big_lock_exists = 0;
int lvblen = r->res_ls->ls_lvblen;
+ if (!rsb_flag(r, RSB_NEW_MASTER2) &&
+ rsb_flag(r, RSB_RECOVER_LVB_INVAL)) {
+ /* case 1 above */
+ rsb_set_flag(r, RSB_VALNOTVALID);
+ return;
+ }
+
+ if (!rsb_flag(r, RSB_NEW_MASTER2))
+ return;
+
+ /* we are the new master, so figure out if VALNOTVALID should
+ be set, and set the rsb lvb from the best lkb available. */
+
list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) {
if (!(lkb->lkb_exflags & DLM_LKF_VALBLK))
continue;
@@ -772,13 +791,10 @@ static void recover_lvb(struct dlm_rsb *r)
if (!lock_lvb_exists)
goto out;
+ /* lvb is invalidated if only NL/CR locks remain */
if (!big_lock_exists)
rsb_set_flag(r, RSB_VALNOTVALID);
- /* don't mess with the lvb unless we're the new master */
- if (!rsb_flag(r, RSB_NEW_MASTER2))
- goto out;
-
if (!r->res_lvbptr) {
r->res_lvbptr = dlm_allocate_lvb(r->res_ls);
if (!r->res_lvbptr)
@@ -852,12 +868,19 @@ void dlm_recover_rsbs(struct dlm_ls *ls)
if (is_master(r)) {
if (rsb_flag(r, RSB_RECOVER_CONVERT))
recover_conversion(r);
+
+ /* recover lvb before granting locks so the updated
+ lvb/VALNOTVALID is presented in the completion */
+ recover_lvb(r);
+
if (rsb_flag(r, RSB_NEW_MASTER2))
recover_grant(r);
- recover_lvb(r);
count++;
+ } else {
+ rsb_clear_flag(r, RSB_VALNOTVALID);
}
rsb_clear_flag(r, RSB_RECOVER_CONVERT);
+ rsb_clear_flag(r, RSB_RECOVER_LVB_INVAL);
rsb_clear_flag(r, RSB_NEW_MASTER2);
unlock_rsb(r);
}
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index da72250ddc1c..cd96649bfe62 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -346,7 +346,7 @@ static inline struct epitem *ep_item_from_epqueue(poll_table *p)
/* Tells if the epoll_ctl(2) operation needs an event copy from userspace */
static inline int ep_op_has_event(int op)
{
- return op == EPOLL_CTL_ADD || op == EPOLL_CTL_MOD;
+ return op != EPOLL_CTL_DEL;
}
/* Initialize the poll safe wake up structure */
@@ -676,34 +676,6 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
return 0;
}
-/*
- * Disables a "struct epitem" in the eventpoll set. Returns -EBUSY if the item
- * had no event flags set, indicating that another thread may be currently
- * handling that item's events (in the case that EPOLLONESHOT was being
- * used). Otherwise a zero result indicates that the item has been disabled
- * from receiving events. A disabled item may be re-enabled via
- * EPOLL_CTL_MOD. Must be called with "mtx" held.
- */
-static int ep_disable(struct eventpoll *ep, struct epitem *epi)
-{
- int result = 0;
- unsigned long flags;
-
- spin_lock_irqsave(&ep->lock, flags);
- if (epi->event.events & ~EP_PRIVATE_BITS) {
- if (ep_is_linked(&epi->rdllink))
- list_del_init(&epi->rdllink);
- /* Ensure ep_poll_callback will not add epi back onto ready
- list: */
- epi->event.events &= EP_PRIVATE_BITS;
- }
- else
- result = -EBUSY;
- spin_unlock_irqrestore(&ep->lock, flags);
-
- return result;
-}
-
static void ep_free(struct eventpoll *ep)
{
struct rb_node *rbp;
@@ -1048,6 +1020,8 @@ static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi)
rb_insert_color(&epi->rbn, &ep->rbr);
}
+
+
#define PATH_ARR_SIZE 5
/*
* These are the number paths of length 1 to 5, that we are allowing to emanate
@@ -1813,12 +1787,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
} else
error = -ENOENT;
break;
- case EPOLL_CTL_DISABLE:
- if (epi)
- error = ep_disable(ep, epi);
- else
- error = -ENOENT;
- break;
}
mutex_unlock(&ep->mtx);
diff --git a/fs/exec.c b/fs/exec.c
index 0039055b1fc6..721a29929511 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1349,7 +1349,7 @@ EXPORT_SYMBOL(remove_arg_zero);
/*
* cycle the list of binary formats handler, until one recognizes the image
*/
-int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
+int search_binary_handler(struct linux_binprm *bprm)
{
unsigned int depth = bprm->recursion_depth;
int try,retval;
@@ -1374,13 +1374,13 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
for (try=0; try<2; try++) {
read_lock(&binfmt_lock);
list_for_each_entry(fmt, &formats, lh) {
- int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary;
+ int (*fn)(struct linux_binprm *) = fmt->load_binary;
if (!fn)
continue;
if (!try_module_get(fmt->module))
continue;
read_unlock(&binfmt_lock);
- retval = fn(bprm, regs);
+ retval = fn(bprm);
/*
* Restore the depth counter to its starting value
* in this call, so we don't have to rely on every
@@ -1439,8 +1439,7 @@ EXPORT_SYMBOL(search_binary_handler);
*/
static int do_execve_common(const char *filename,
struct user_arg_ptr argv,
- struct user_arg_ptr envp,
- struct pt_regs *regs)
+ struct user_arg_ptr envp)
{
struct linux_binprm *bprm;
struct file *file;
@@ -1524,7 +1523,7 @@ static int do_execve_common(const char *filename,
if (retval < 0)
goto out;
- retval = search_binary_handler(bprm,regs);
+ retval = search_binary_handler(bprm);
if (retval < 0)
goto out;
@@ -1566,19 +1565,17 @@ out_ret:
int do_execve(const char *filename,
const char __user *const __user *__argv,
- const char __user *const __user *__envp,
- struct pt_regs *regs)
+ const char __user *const __user *__envp)
{
struct user_arg_ptr argv = { .ptr.native = __argv };
struct user_arg_ptr envp = { .ptr.native = __envp };
- return do_execve_common(filename, argv, envp, regs);
+ return do_execve_common(filename, argv, envp);
}
#ifdef CONFIG_COMPAT
-int compat_do_execve(const char *filename,
+static int compat_do_execve(const char *filename,
const compat_uptr_t __user *__argv,
- const compat_uptr_t __user *__envp,
- struct pt_regs *regs)
+ const compat_uptr_t __user *__envp)
{
struct user_arg_ptr argv = {
.is_compat = true,
@@ -1588,7 +1585,7 @@ int compat_do_execve(const char *filename,
.is_compat = true,
.ptr.compat = __envp,
};
- return do_execve_common(filename, argv, envp, regs);
+ return do_execve_common(filename, argv, envp);
}
#endif
@@ -1669,7 +1666,7 @@ SYSCALL_DEFINE3(execve,
struct filename *path = getname(filename);
int error = PTR_ERR(path);
if (!IS_ERR(path)) {
- error = do_execve(path->name, argv, envp, current_pt_regs());
+ error = do_execve(path->name, argv, envp);
putname(path);
}
return error;
@@ -1682,8 +1679,7 @@ asmlinkage long compat_sys_execve(const char __user * filename,
struct filename *path = getname(filename);
int error = PTR_ERR(path);
if (!IS_ERR(path)) {
- error = compat_do_execve(path->name, argv, envp,
- current_pt_regs());
+ error = compat_do_execve(path->name, argv, envp);
putname(path);
}
return error;
@@ -1696,12 +1692,9 @@ int kernel_execve(const char *filename,
const char *const argv[],
const char *const envp[])
{
- struct pt_regs *p = current_pt_regs();
- int ret;
-
- ret = do_execve(filename,
+ int ret = do_execve(filename,
(const char __user *const __user *)argv,
- (const char __user *const __user *)envp, p);
+ (const char __user *const __user *)envp);
if (ret < 0)
return ret;
@@ -1709,6 +1702,6 @@ int kernel_execve(const char *filename,
* We were successful. We won't be returning to our caller, but
* instead to user space by manipulating the kernel stack.
*/
- ret_from_kernel_execve(p);
+ ret_from_kernel_execve(current_pt_regs());
}
#endif
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 7320a66e958f..22548f56197b 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -2101,8 +2101,9 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
end = start + (range->len >> sb->s_blocksize_bits) - 1;
minlen = range->minlen >> sb->s_blocksize_bits;
- if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb)) ||
- unlikely(start >= max_blks))
+ if (minlen > EXT3_BLOCKS_PER_GROUP(sb) ||
+ start >= max_blks ||
+ range->len < sb->s_blocksize)
return -EINVAL;
if (end >= max_blks)
end = max_blks - 1;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 4facdd29a350..3a100e7a62a8 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -725,6 +725,10 @@ repeat_in_this_group:
"inode=%lu", ino + 1);
continue;
}
+ BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
+ err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
+ if (err)
+ goto fail;
ext4_lock_group(sb, group);
ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
ext4_unlock_group(sb, group);
@@ -738,6 +742,11 @@ repeat_in_this_group:
goto out;
got:
+ BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
+ err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
+ if (err)
+ goto fail;
+
/* We may have to initialize the block bitmap if it isn't already */
if (ext4_has_group_desc_csum(sb) &&
gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
@@ -771,11 +780,6 @@ got:
goto fail;
}
- BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
- err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
- if (err)
- goto fail;
-
BUFFER_TRACE(group_desc_bh, "get_write_access");
err = ext4_journal_get_write_access(handle, group_desc_bh);
if (err)
@@ -823,11 +827,6 @@ got:
}
ext4_unlock_group(sb, group);
- BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
- err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
- if (err)
- goto fail;
-
BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
if (err)
diff --git a/fs/file.c b/fs/file.c
index d3b5fa80b71b..15cb8618e95d 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -519,12 +519,6 @@ struct files_struct init_files = {
.file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock),
};
-void daemonize_descriptors(void)
-{
- atomic_inc(&init_files.count);
- reset_files_struct(&init_files);
-}
-
/*
* allocate a file descriptor, mark it busy.
*/
@@ -685,7 +679,6 @@ void do_close_on_exec(struct files_struct *files)
struct fdtable *fdt;
/* exec unshares first */
- BUG_ON(atomic_read(&files->count) != 1);
spin_lock(&files->file_lock);
for (i = 0; ; i++) {
unsigned long set;
@@ -900,7 +893,7 @@ int replace_fd(unsigned fd, struct file *file, unsigned flags)
return __close_fd(files, fd);
if (fd >= rlimit(RLIMIT_NOFILE))
- return -EMFILE;
+ return -EBADF;
spin_lock(&files->file_lock);
err = expand_files(files, fd);
@@ -926,7 +919,7 @@ SYSCALL_DEFINE3(dup3, unsigned int, oldfd, unsigned int, newfd, int, flags)
return -EINVAL;
if (newfd >= rlimit(RLIMIT_NOFILE))
- return -EMFILE;
+ return -EBADF;
spin_lock(&files->file_lock);
err = expand_files(files, newfd);
@@ -995,16 +988,18 @@ int iterate_fd(struct files_struct *files, unsigned n,
const void *p)
{
struct fdtable *fdt;
- struct file *file;
int res = 0;
if (!files)
return 0;
spin_lock(&files->file_lock);
- fdt = files_fdtable(files);
- while (!res && n < fdt->max_fds) {
- file = rcu_dereference_check_fdtable(files, fdt->fd[n++]);
- if (file)
- res = f(p, file, n);
+ for (fdt = files_fdtable(files); n < fdt->max_fds; n++) {
+ struct file *file;
+ file = rcu_dereference_check_fdtable(files, fdt->fd[n]);
+ if (!file)
+ continue;
+ res = f(p, file, n);
+ if (res)
+ break;
}
spin_unlock(&files->file_lock);
return res;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 51ea267d444c..3e3422f7f0a4 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -228,6 +228,8 @@ static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
static void inode_sync_complete(struct inode *inode)
{
inode->i_state &= ~I_SYNC;
+ /* If inode is clean and unused, put it into LRU now... */
+ inode_add_lru(inode);
/* Waiters must see I_SYNC cleared before being woken up */
smp_mb();
wake_up_bit(&inode->i_state, __I_SYNC);
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 5df4775fea03..fe6ca583bbc0 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -164,27 +164,3 @@ struct fs_struct init_fs = {
.seq = SEQCNT_ZERO,
.umask = 0022,
};
-
-void daemonize_fs_struct(void)
-{
- struct fs_struct *fs = current->fs;
-
- if (fs) {
- int kill;
-
- task_lock(current);
-
- spin_lock(&init_fs.lock);
- init_fs.users++;
- spin_unlock(&init_fs.lock);
-
- spin_lock(&fs->lock);
- current->fs = &init_fs;
- kill = !--fs->users;
- spin_unlock(&fs->lock);
-
- task_unlock(current);
- if (kill)
- free_fs_struct(fs);
- }
-}
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 0def0504afc1..e056b4ce4877 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -516,15 +516,13 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma)
struct gfs2_holder i_gh;
int error;
- gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
- error = gfs2_glock_nq(&i_gh);
- if (error == 0) {
- file_accessed(file);
- gfs2_glock_dq(&i_gh);
- }
- gfs2_holder_uninit(&i_gh);
+ error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY,
+ &i_gh);
if (error)
return error;
+ /* grab lock to update inode */
+ gfs2_glock_dq_uninit(&i_gh);
+ file_accessed(file);
}
vma->vm_ops = &gfs2_vm_ops;
@@ -677,10 +675,8 @@ static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
size_t writesize = iov_length(iov, nr_segs);
struct dentry *dentry = file->f_dentry;
struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
- struct gfs2_sbd *sdp;
int ret;
- sdp = GFS2_SB(file->f_mapping->host);
ret = gfs2_rs_alloc(ip);
if (ret)
return ret;
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index e6c2fd53cab2..0f22d09f358d 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -768,7 +768,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
mapping->host = s->s_bdev->bd_inode;
mapping->flags = 0;
mapping_set_gfp_mask(mapping, GFP_NOFS);
- mapping->assoc_mapping = NULL;
+ mapping->private_data = NULL;
mapping->backing_dev_info = s->s_bdi;
mapping->writeback_index = 0;
}
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index 8ff95a2d54ee..9ceccb1595a3 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -393,12 +393,10 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
struct gfs2_meta_header *mh;
struct gfs2_trans *tr;
- lock_buffer(bd->bd_bh);
- gfs2_log_lock(sdp);
tr = current->journal_info;
tr->tr_touched = 1;
if (!list_empty(&bd->bd_list))
- goto out;
+ return;
set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
mh = (struct gfs2_meta_header *)bd->bd_bh->b_data;
@@ -414,9 +412,6 @@ static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
sdp->sd_log_num_buf++;
list_add(&bd->bd_list, &sdp->sd_log_le_buf);
tr->tr_num_buf_new++;
-out:
- gfs2_log_unlock(sdp);
- unlock_buffer(bd->bd_bh);
}
static void gfs2_check_magic(struct buffer_head *bh)
@@ -621,7 +616,6 @@ static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
{
- struct gfs2_log_descriptor *ld;
struct gfs2_meta_header *mh;
unsigned int offset;
struct list_head *head = &sdp->sd_log_le_revoke;
@@ -634,7 +628,6 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp)
length = gfs2_struct2blk(sdp, sdp->sd_log_num_revoke, sizeof(u64));
page = gfs2_get_log_desc(sdp, GFS2_LOG_DESC_REVOKE, length, sdp->sd_log_num_revoke);
- ld = page_address(page);
offset = sizeof(struct gfs2_log_descriptor);
list_for_each_entry(bd, head, bd_list) {
@@ -777,12 +770,10 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
struct address_space *mapping = bd->bd_bh->b_page->mapping;
struct gfs2_inode *ip = GFS2_I(mapping->host);
- lock_buffer(bd->bd_bh);
- gfs2_log_lock(sdp);
if (tr)
tr->tr_touched = 1;
if (!list_empty(&bd->bd_list))
- goto out;
+ return;
set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags);
set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags);
if (gfs2_is_jdata(ip)) {
@@ -793,9 +784,6 @@ static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
} else {
list_add_tail(&bd->bd_list, &sdp->sd_log_le_ordered);
}
-out:
- gfs2_log_unlock(sdp);
- unlock_buffer(bd->bd_bh);
}
/**
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 40c4b0d42fa8..c5af8e18f27a 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -497,8 +497,11 @@ int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid)
struct gfs2_quota_data **qd;
int error;
- if (ip->i_res == NULL)
- gfs2_rs_alloc(ip);
+ if (ip->i_res == NULL) {
+ error = gfs2_rs_alloc(ip);
+ if (error)
+ return error;
+ }
qd = ip->i_res->rs_qa_qd;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 3cc402ce6fea..38fe18f2f055 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -553,7 +553,6 @@ void gfs2_free_clones(struct gfs2_rgrpd *rgd)
*/
int gfs2_rs_alloc(struct gfs2_inode *ip)
{
- int error = 0;
struct gfs2_blkreserv *res;
if (ip->i_res)
@@ -561,7 +560,7 @@ int gfs2_rs_alloc(struct gfs2_inode *ip)
res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS);
if (!res)
- error = -ENOMEM;
+ return -ENOMEM;
RB_CLEAR_NODE(&res->rs_node);
@@ -571,7 +570,7 @@ int gfs2_rs_alloc(struct gfs2_inode *ip)
else
ip->i_res = res;
up_write(&ip->i_rw_mutex);
- return error;
+ return 0;
}
static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs)
@@ -1263,7 +1262,9 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
int ret = 0;
u64 amt;
u64 trimmed = 0;
+ u64 start, end, minlen;
unsigned int x;
+ unsigned bs_shift = sdp->sd_sb.sb_bsize_shift;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@@ -1271,19 +1272,25 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
if (!blk_queue_discard(q))
return -EOPNOTSUPP;
- if (argp == NULL) {
- r.start = 0;
- r.len = ULLONG_MAX;
- r.minlen = 0;
- } else if (copy_from_user(&r, argp, sizeof(r)))
+ if (copy_from_user(&r, argp, sizeof(r)))
return -EFAULT;
ret = gfs2_rindex_update(sdp);
if (ret)
return ret;
- rgd = gfs2_blk2rgrpd(sdp, r.start, 0);
- rgd_end = gfs2_blk2rgrpd(sdp, r.start + r.len, 0);
+ start = r.start >> bs_shift;
+ end = start + (r.len >> bs_shift);
+ minlen = max_t(u64, r.minlen,
+ q->limits.discard_granularity) >> bs_shift;
+
+ rgd = gfs2_blk2rgrpd(sdp, start, 0);
+ rgd_end = gfs2_blk2rgrpd(sdp, end - 1, 0);
+
+ if (end <= start ||
+ minlen > sdp->sd_max_rg_data ||
+ start > rgd_end->rd_data0 + rgd_end->rd_data)
+ return -EINVAL;
while (1) {
@@ -1295,7 +1302,9 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
/* Trim each bitmap in the rgrp */
for (x = 0; x < rgd->rd_length; x++) {
struct gfs2_bitmap *bi = rgd->rd_bits + x;
- ret = gfs2_rgrp_send_discards(sdp, rgd->rd_data0, NULL, bi, r.minlen, &amt);
+ ret = gfs2_rgrp_send_discards(sdp,
+ rgd->rd_data0, NULL, bi, minlen,
+ &amt);
if (ret) {
gfs2_glock_dq_uninit(&gh);
goto out;
@@ -1324,7 +1333,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp)
out:
r.len = trimmed << 9;
- if (argp && copy_to_user(argp, &r, sizeof(r)))
+ if (copy_to_user(argp, &r, sizeof(r)))
return -EFAULT;
return ret;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index bc737261f234..d6488674d916 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -810,7 +810,8 @@ static void gfs2_dirty_inode(struct inode *inode, int flags)
return;
}
need_unlock = 1;
- }
+ } else if (WARN_ON_ONCE(ip->i_gl->gl_state != LM_ST_EXCLUSIVE))
+ return;
if (current->journal_info == NULL) {
ret = gfs2_trans_begin(sdp, RES_DINODE, 0);
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index adbd27875ef9..413627072f36 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -155,14 +155,22 @@ void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta)
struct gfs2_sbd *sdp = gl->gl_sbd;
struct gfs2_bufdata *bd;
+ lock_buffer(bh);
+ gfs2_log_lock(sdp);
bd = bh->b_private;
if (bd)
gfs2_assert(sdp, bd->bd_gl == gl);
else {
+ gfs2_log_unlock(sdp);
+ unlock_buffer(bh);
gfs2_attach_bufdata(gl, bh, meta);
bd = bh->b_private;
+ lock_buffer(bh);
+ gfs2_log_lock(sdp);
}
lops_add(sdp, bd);
+ gfs2_log_unlock(sdp);
+ unlock_buffer(bh);
}
void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index c98d0665fa5c..78bde32ea951 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -151,8 +151,8 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
- unsigned long start_addr;
struct hstate *h = hstate_file(file);
+ struct vm_unmapped_area_info info;
if (len & ~huge_page_mask(h))
return -EINVAL;
@@ -173,39 +173,13 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
return addr;
}
- if (len > mm->cached_hole_size)
- start_addr = mm->free_area_cache;
- else {
- start_addr = TASK_UNMAPPED_BASE;
- mm->cached_hole_size = 0;
- }
-
-full_search:
- addr = ALIGN(start_addr, huge_page_size(h));
-
- for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
- /* At this point: (!vma || addr < vma->vm_end). */
- if (TASK_SIZE - len < addr) {
- /*
- * Start a new search - just in case we missed
- * some holes.
- */
- if (start_addr != TASK_UNMAPPED_BASE) {
- start_addr = TASK_UNMAPPED_BASE;
- mm->cached_hole_size = 0;
- goto full_search;
- }
- return -ENOMEM;
- }
-
- if (!vma || addr + len <= vma->vm_start) {
- mm->free_area_cache = addr + len;
- return addr;
- }
- if (addr + mm->cached_hole_size < vma->vm_start)
- mm->cached_hole_size = vma->vm_start - addr;
- addr = ALIGN(vma->vm_end, huge_page_size(h));
- }
+ info.flags = 0;
+ info.length = len;
+ info.low_limit = TASK_UNMAPPED_BASE;
+ info.high_limit = TASK_SIZE;
+ info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+ info.align_offset = 0;
+ return vm_unmapped_area(&info);
}
#endif
@@ -608,11 +582,11 @@ static int hugetlbfs_migrate_page(struct address_space *mapping,
int rc;
rc = migrate_huge_page_move_mapping(mapping, newpage, page);
- if (rc)
+ if (rc != MIGRATEPAGE_SUCCESS)
return rc;
migrate_page_copy(newpage, page);
- return 0;
+ return MIGRATEPAGE_SUCCESS;
}
static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -923,7 +897,7 @@ static struct file_system_type hugetlbfs_fs_type = {
.kill_sb = kill_litter_super,
};
-static struct vfsmount *hugetlbfs_vfsmount;
+static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
static int can_do_hugetlb_shm(void)
{
@@ -932,9 +906,22 @@ static int can_do_hugetlb_shm(void)
return capable(CAP_IPC_LOCK) || in_group_p(shm_group);
}
+/*
+ * Translate a log2 page size into an index into hstates[].
+ *
+ * @page_size_log: log2 of the requested huge page size in bytes; 0 selects
+ *                 the default hstate.
+ *
+ * Returns the hstate index, or -1 if no hstate of that size exists.
+ */
+static int get_hstate_idx(int page_size_log)
+{
+	struct hstate *h;
+
+	if (!page_size_log)
+		return default_hstate_idx;
+	/*
+	 * Shift in unsigned long: a plain int shift overflows (undefined
+	 * behaviour) once the requested size reaches 2GB, e.g. 16GB pages.
+	 */
+	h = size_to_hstate(1UL << page_size_log);
+	if (!h)
+		return -1;
+	return h - hstates;
+}
+}
+
struct file *hugetlb_file_setup(const char *name, unsigned long addr,
size_t size, vm_flags_t acctflag,
- struct user_struct **user, int creat_flags)
+ struct user_struct **user,
+ int creat_flags, int page_size_log)
{
int error = -ENOMEM;
struct file *file;
@@ -944,9 +931,14 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
struct qstr quick_string;
struct hstate *hstate;
unsigned long num_pages;
+ int hstate_idx;
+
+ hstate_idx = get_hstate_idx(page_size_log);
+ if (hstate_idx < 0)
+ return ERR_PTR(-ENODEV);
*user = NULL;
- if (!hugetlbfs_vfsmount)
+ if (!hugetlbfs_vfsmount[hstate_idx])
return ERR_PTR(-ENOENT);
if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
@@ -963,7 +955,7 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
}
}
- root = hugetlbfs_vfsmount->mnt_root;
+ root = hugetlbfs_vfsmount[hstate_idx]->mnt_root;
quick_string.name = name;
quick_string.len = strlen(quick_string.name);
quick_string.hash = 0;
@@ -971,7 +963,7 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr,
if (!path.dentry)
goto out_shm_unlock;
- path.mnt = mntget(hugetlbfs_vfsmount);
+ path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]);
error = -ENOSPC;
inode = hugetlbfs_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0);
if (!inode)
@@ -1011,8 +1003,9 @@ out_shm_unlock:
static int __init init_hugetlbfs_fs(void)
{
+ struct hstate *h;
int error;
- struct vfsmount *vfsmount;
+ int i;
error = bdi_init(&hugetlbfs_backing_dev_info);
if (error)
@@ -1029,14 +1022,26 @@ static int __init init_hugetlbfs_fs(void)
if (error)
goto out;
- vfsmount = kern_mount(&hugetlbfs_fs_type);
+ i = 0;
+ for_each_hstate(h) {
+ char buf[50];
+ unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10);
- if (!IS_ERR(vfsmount)) {
- hugetlbfs_vfsmount = vfsmount;
- return 0;
- }
+ snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb);
+ hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type,
+ buf);
- error = PTR_ERR(vfsmount);
+ if (IS_ERR(hugetlbfs_vfsmount[i])) {
+ pr_err("hugetlb: Cannot mount internal hugetlbfs for "
+ "page size %uK", ps_kb);
+ error = PTR_ERR(hugetlbfs_vfsmount[i]);
+ hugetlbfs_vfsmount[i] = NULL;
+ }
+ i++;
+ }
+ /* Non default hstates are optional */
+ if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx]))
+ return 0;
out:
kmem_cache_destroy(hugetlbfs_inode_cachep);
@@ -1047,13 +1052,19 @@ static int __init init_hugetlbfs_fs(void)
static void __exit exit_hugetlbfs_fs(void)
{
+ struct hstate *h;
+ int i;
+
+
/*
* Make sure all delayed rcu free inodes are flushed before we
* destroy cache.
*/
rcu_barrier();
kmem_cache_destroy(hugetlbfs_inode_cachep);
- kern_unmount(hugetlbfs_vfsmount);
+ i = 0;
+ for_each_hstate(h)
+ kern_unmount(hugetlbfs_vfsmount[i++]);
unregister_filesystem(&hugetlbfs_fs_type);
bdi_destroy(&hugetlbfs_backing_dev_info);
}
diff --git a/fs/inode.c b/fs/inode.c
index b03c71957246..14084b72b259 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -165,7 +165,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
mapping->host = inode;
mapping->flags = 0;
mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
- mapping->assoc_mapping = NULL;
+ mapping->private_data = NULL;
mapping->backing_dev_info = &default_backing_dev_info;
mapping->writeback_index = 0;
@@ -408,6 +408,19 @@ static void inode_lru_list_add(struct inode *inode)
spin_unlock(&inode->i_sb->s_inode_lru_lock);
}
+/*
+ * Queue the inode on its superblock's LRU list, but only when it is idle:
+ * none of I_DIRTY, I_SYNC, I_FREEING or I_WILL_FREE is set, its reference
+ * count is zero, and the superblock is still MS_ACTIVE.
+ *
+ * Caller must hold inode->i_lock.
+ */
+void inode_add_lru(struct inode *inode)
+{
+	/* Dirty, under writeback, or on its way out: not an LRU candidate. */
+	if (inode->i_state & (I_DIRTY | I_SYNC | I_FREEING | I_WILL_FREE))
+		return;
+	/* Still referenced by someone. */
+	if (atomic_read(&inode->i_count))
+		return;
+	/* Superblock is no longer active. */
+	if (!(inode->i_sb->s_flags & MS_ACTIVE))
+		return;
+
+	inode_lru_list_add(inode);
+}
+
+
static void inode_lru_list_del(struct inode *inode)
{
spin_lock(&inode->i_sb->s_inode_lru_lock);
@@ -1390,8 +1403,7 @@ static void iput_final(struct inode *inode)
if (!drop && (sb->s_flags & MS_ACTIVE)) {
inode->i_state |= I_REFERENCED;
- if (!(inode->i_state & (I_DIRTY|I_SYNC)))
- inode_lru_list_add(inode);
+ inode_add_lru(inode);
spin_unlock(&inode->i_lock);
return;
}
diff --git a/fs/internal.h b/fs/internal.h
index 916b7cbf3e3e..2f6af7f645eb 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -110,6 +110,7 @@ extern int open_check_o_direct(struct file *f);
* inode.c
*/
extern spinlock_t inode_sb_list_lock;
+extern void inode_add_lru(struct inode *inode);
/*
* fs-writeback.c
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index 8b472c8bbf7a..071d6905f0dd 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -1961,7 +1961,9 @@ retry:
spin_unlock(&journal->j_list_lock);
jbd_unlock_bh_state(bh);
spin_unlock(&journal->j_state_lock);
+ unlock_buffer(bh);
log_wait_commit(journal, tid);
+ lock_buffer(bh);
goto retry;
}
/*
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 60ef3fb707ff..1506673c087e 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -138,33 +138,39 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
struct page *pg;
struct inode *inode = mapping->host;
struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode);
+ struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
+ struct jffs2_raw_inode ri;
+ uint32_t alloc_len = 0;
pgoff_t index = pos >> PAGE_CACHE_SHIFT;
uint32_t pageofs = index << PAGE_CACHE_SHIFT;
int ret = 0;
+ jffs2_dbg(1, "%s()\n", __func__);
+
+ if (pageofs > inode->i_size) {
+ ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len,
+ ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
+ if (ret)
+ return ret;
+ }
+
+ mutex_lock(&f->sem);
pg = grab_cache_page_write_begin(mapping, index, flags);
- if (!pg)
+ if (!pg) {
+ if (alloc_len)
+ jffs2_complete_reservation(c);
+ mutex_unlock(&f->sem);
return -ENOMEM;
+ }
*pagep = pg;
- jffs2_dbg(1, "%s()\n", __func__);
-
- if (pageofs > inode->i_size) {
+ if (alloc_len) {
/* Make new hole frag from old EOF to new page */
- struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
- struct jffs2_raw_inode ri;
struct jffs2_full_dnode *fn;
- uint32_t alloc_len;
jffs2_dbg(1, "Writing new hole frag 0x%x-0x%x between current EOF and new page\n",
(unsigned int)inode->i_size, pageofs);
- ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len,
- ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE);
- if (ret)
- goto out_page;
-
- mutex_lock(&f->sem);
memset(&ri, 0, sizeof(ri));
ri.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK);
@@ -191,7 +197,6 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
if (IS_ERR(fn)) {
ret = PTR_ERR(fn);
jffs2_complete_reservation(c);
- mutex_unlock(&f->sem);
goto out_page;
}
ret = jffs2_add_full_dnode_to_inode(c, f, fn);
@@ -206,12 +211,10 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
jffs2_mark_node_obsolete(c, fn->raw);
jffs2_free_full_dnode(fn);
jffs2_complete_reservation(c);
- mutex_unlock(&f->sem);
goto out_page;
}
jffs2_complete_reservation(c);
inode->i_size = pageofs;
- mutex_unlock(&f->sem);
}
/*
@@ -220,18 +223,18 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
* case of a short-copy.
*/
if (!PageUptodate(pg)) {
- mutex_lock(&f->sem);
ret = jffs2_do_readpage_nolock(inode, pg);
- mutex_unlock(&f->sem);
if (ret)
goto out_page;
}
+ mutex_unlock(&f->sem);
jffs2_dbg(1, "end write_begin(). pg->flags %lx\n", pg->flags);
return ret;
out_page:
unlock_page(pg);
page_cache_release(pg);
+ mutex_unlock(&f->sem);
return ret;
}
diff --git a/fs/namei.c b/fs/namei.c
index 937f9d50c84b..5f4cdf3ad913 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2131,6 +2131,11 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
if (!len)
return ERR_PTR(-EACCES);
+ if (unlikely(name[0] == '.')) {
+ if (len < 2 || (len == 2 && name[1] == '.'))
+ return ERR_PTR(-EACCES);
+ }
+
while (len--) {
c = *(const unsigned char *)name++;
if (c == '/' || c == '\0')
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index ce8cb926526b..b9e66b7e0c14 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -450,7 +450,8 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
nfs_refresh_inode(dentry->d_inode, entry->fattr);
goto out;
} else {
- d_drop(dentry);
+ if (d_invalidate(dentry) != 0)
+ goto out;
dput(dentry);
}
}
@@ -1100,6 +1101,8 @@ out_set_verifier:
out_zap_parent:
nfs_zap_caches(dir);
out_bad:
+ nfs_free_fattr(fattr);
+ nfs_free_fhandle(fhandle);
nfs_mark_for_revalidate(dir);
if (inode && S_ISDIR(inode->i_mode)) {
/* Purge readdir caches. */
@@ -1112,8 +1115,6 @@ out_zap_parent:
shrink_dcache_parent(dentry);
}
d_drop(dentry);
- nfs_free_fattr(fattr);
- nfs_free_fhandle(fhandle);
dput(parent);
dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n",
__func__, dentry->d_parent->d_name.name,
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c
index 31c26c4dcc23..ca4b11ec87a2 100644
--- a/fs/nfs/dns_resolve.c
+++ b/fs/nfs/dns_resolve.c
@@ -217,7 +217,7 @@ static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen)
{
char buf1[NFS_DNS_HOSTNAME_MAXLEN+1];
struct nfs_dns_ent key, *item;
- unsigned long ttl;
+ unsigned int ttl;
ssize_t len;
int ret = -EINVAL;
@@ -240,7 +240,8 @@ static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen)
key.namelen = len;
memset(&key.h, 0, sizeof(key.h));
- ttl = get_expiry(&buf);
+ if (get_uint(&buf, &ttl) < 0)
+ goto out;
if (ttl == 0)
goto out;
key.h.expiry_time = ttl + seconds_since_boot();
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 5c7325c5c5e6..6fa01aea2488 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -685,7 +685,10 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
if (ctx->cred != NULL)
put_rpccred(ctx->cred);
dput(ctx->dentry);
- nfs_sb_deactive(sb);
+ if (is_sync)
+ nfs_sb_deactive(sb);
+ else
+ nfs_sb_deactive_async(sb);
kfree(ctx->mdsthreshold);
kfree(ctx);
}
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 59b133c5d652..05521cadac2e 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -351,10 +351,12 @@ extern int __init register_nfs_fs(void);
extern void __exit unregister_nfs_fs(void);
extern void nfs_sb_active(struct super_block *sb);
extern void nfs_sb_deactive(struct super_block *sb);
+extern void nfs_sb_deactive_async(struct super_block *sb);
/* namespace.c */
+#define NFS_PATH_CANONICAL 1
extern char *nfs_path(char **p, struct dentry *dentry,
- char *buffer, ssize_t buflen);
+ char *buffer, ssize_t buflen, unsigned flags);
extern struct vfsmount *nfs_d_automount(struct path *path);
struct vfsmount *nfs_submount(struct nfs_server *, struct dentry *,
struct nfs_fh *, struct nfs_fattr *);
@@ -498,7 +500,7 @@ static inline char *nfs_devname(struct dentry *dentry,
char *buffer, ssize_t buflen)
{
char *dummy;
- return nfs_path(&dummy, dentry, buffer, buflen);
+ return nfs_path(&dummy, dentry, buffer, buflen, NFS_PATH_CANONICAL);
}
/*
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c
index 8e65c7f1f87c..015f71f8f62c 100644
--- a/fs/nfs/mount_clnt.c
+++ b/fs/nfs/mount_clnt.c
@@ -181,7 +181,7 @@ int nfs_mount(struct nfs_mount_request *info)
else
msg.rpc_proc = &mnt_clnt->cl_procinfo[MOUNTPROC_MNT];
- status = rpc_call_sync(mnt_clnt, &msg, 0);
+ status = rpc_call_sync(mnt_clnt, &msg, RPC_TASK_SOFT|RPC_TASK_TIMEOUT);
rpc_shutdown_client(mnt_clnt);
if (status < 0)
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 655925373b91..dd057bc6b65b 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -33,6 +33,7 @@ int nfs_mountpoint_expiry_timeout = 500 * HZ;
* @dentry - pointer to dentry
* @buffer - result buffer
* @buflen - length of buffer
+ * @flags - options (see below)
*
* Helper function for constructing the server pathname
* by arbitrary hashed dentry.
@@ -40,8 +41,14 @@ int nfs_mountpoint_expiry_timeout = 500 * HZ;
* This is mainly for use in figuring out the path on the
* server side when automounting on top of an existing partition
* and in generating /proc/mounts and friends.
+ *
+ * Supported flags:
+ * NFS_PATH_CANONICAL: ensure there is exactly one slash after
+ * the original device (export) name
+ * (if unset, the original name is returned verbatim)
*/
-char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen)
+char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen,
+ unsigned flags)
{
char *end;
int namelen;
@@ -74,7 +81,7 @@ rename_retry:
rcu_read_unlock();
goto rename_retry;
}
- if (*end != '/') {
+ if ((flags & NFS_PATH_CANONICAL) && *end != '/') {
if (--buflen < 0) {
spin_unlock(&dentry->d_lock);
rcu_read_unlock();
@@ -91,9 +98,11 @@ rename_retry:
return end;
}
namelen = strlen(base);
- /* Strip off excess slashes in base string */
- while (namelen > 0 && base[namelen - 1] == '/')
- namelen--;
+ if (flags & NFS_PATH_CANONICAL) {
+ /* Strip off excess slashes in base string */
+ while (namelen > 0 && base[namelen - 1] == '/')
+ namelen--;
+ }
buflen -= namelen;
if (buflen < 0) {
spin_unlock(&dentry->d_lock);
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 79fbb61ce202..1e09eb78543b 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -81,7 +81,8 @@ static char *nfs_path_component(const char *nfspath, const char *end)
static char *nfs4_path(struct dentry *dentry, char *buffer, ssize_t buflen)
{
char *limit;
- char *path = nfs_path(&limit, dentry, buffer, buflen);
+ char *path = nfs_path(&limit, dentry, buffer, buflen,
+ NFS_PATH_CANONICAL);
if (!IS_ERR(path)) {
char *path_component = nfs_path_component(path, limit);
if (path_component)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 68b21d81b7ac..5eec4429970c 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -339,8 +339,7 @@ static int nfs4_handle_exception(struct nfs_server *server, int errorcode, struc
dprintk("%s ERROR: %d Reset session\n", __func__,
errorcode);
nfs4_schedule_session_recovery(clp->cl_session, errorcode);
- exception->retry = 1;
- break;
+ goto wait_on_recovery;
#endif /* defined(CONFIG_NFS_V4_1) */
case -NFS4ERR_FILE_OPEN:
if (exception->timeout > HZ) {
@@ -1572,9 +1571,11 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
data->timestamp = jiffies;
if (nfs4_setup_sequence(data->o_arg.server,
&data->o_arg.seq_args,
- &data->o_res.seq_res, task))
- return;
- rpc_call_start(task);
+ &data->o_res.seq_res,
+ task) != 0)
+ nfs_release_seqid(data->o_arg.seqid);
+ else
+ rpc_call_start(task);
return;
unlock_no_action:
rcu_read_unlock();
@@ -1748,7 +1749,7 @@ static int nfs4_opendata_access(struct rpc_cred *cred,
/* even though OPEN succeeded, access is denied. Close the file */
nfs4_close_state(state, fmode);
- return -NFS4ERR_ACCESS;
+ return -EACCES;
}
/*
@@ -2196,7 +2197,7 @@ static void nfs4_free_closedata(void *data)
nfs4_put_open_state(calldata->state);
nfs_free_seqid(calldata->arg.seqid);
nfs4_put_state_owner(sp);
- nfs_sb_deactive(sb);
+ nfs_sb_deactive_async(sb);
kfree(calldata);
}
@@ -2296,9 +2297,10 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
if (nfs4_setup_sequence(NFS_SERVER(inode),
&calldata->arg.seq_args,
&calldata->res.seq_res,
- task))
- goto out;
- rpc_call_start(task);
+ task) != 0)
+ nfs_release_seqid(calldata->arg.seqid);
+ else
+ rpc_call_start(task);
out:
dprintk("%s: done!\n", __func__);
}
@@ -4529,6 +4531,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN)
rpc_restart_call_prepare(task);
}
+ nfs_release_seqid(calldata->arg.seqid);
}
static void nfs4_locku_prepare(struct rpc_task *task, void *data)
@@ -4545,9 +4548,11 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
calldata->timestamp = jiffies;
if (nfs4_setup_sequence(calldata->server,
&calldata->arg.seq_args,
- &calldata->res.seq_res, task))
- return;
- rpc_call_start(task);
+ &calldata->res.seq_res,
+ task) != 0)
+ nfs_release_seqid(calldata->arg.seqid);
+ else
+ rpc_call_start(task);
}
static const struct rpc_call_ops nfs4_locku_ops = {
@@ -4692,7 +4697,7 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
/* Do we need to do an open_to_lock_owner? */
if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) {
if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0)
- return;
+ goto out_release_lock_seqid;
data->arg.open_stateid = &state->stateid;
data->arg.new_lock_owner = 1;
data->res.open_seqid = data->arg.open_seqid;
@@ -4701,10 +4706,15 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
data->timestamp = jiffies;
if (nfs4_setup_sequence(data->server,
&data->arg.seq_args,
- &data->res.seq_res, task))
+ &data->res.seq_res,
+ task) == 0) {
+ rpc_call_start(task);
return;
- rpc_call_start(task);
- dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status);
+ }
+ nfs_release_seqid(data->arg.open_seqid);
+out_release_lock_seqid:
+ nfs_release_seqid(data->arg.lock_seqid);
+ dprintk("%s: done!, ret = %d\n", __func__, task->tk_status);
}
static void nfs4_recover_lock_prepare(struct rpc_task *task, void *calldata)
@@ -5667,7 +5677,7 @@ static void nfs4_add_and_init_slots(struct nfs4_slot_table *tbl,
tbl->slots = new;
tbl->max_slots = max_slots;
}
- tbl->highest_used_slotid = -1; /* no slot is currently used */
+ tbl->highest_used_slotid = NFS4_NO_SLOT;
for (i = 0; i < tbl->max_slots; i++)
tbl->slots[i].seq_nr = ivalue;
spin_unlock(&tbl->slot_tbl_lock);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index fe624c91bd00..2878f97bd78d 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -925,8 +925,8 @@ pnfs_find_alloc_layout(struct inode *ino,
if (likely(nfsi->layout == NULL)) { /* Won the race? */
nfsi->layout = new;
return new;
- }
- pnfs_free_layout_hdr(new);
+ } else if (new != NULL)
+ pnfs_free_layout_hdr(new);
out_existing:
pnfs_get_layout_hdr(nfsi->layout);
return nfsi->layout;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index e831bce49766..652d3f7176a9 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -54,6 +54,7 @@
#include <linux/parser.h>
#include <linux/nsproxy.h>
#include <linux/rcupdate.h>
+#include <linux/kthread.h>
#include <asm/uaccess.h>
@@ -415,6 +416,54 @@ void nfs_sb_deactive(struct super_block *sb)
}
EXPORT_SYMBOL_GPL(nfs_sb_deactive);
+/*
+ * Thread function started by nfs_deactivate_super_async(): @ptr is the
+ * super_block whose reference is handed to deactivate_super(), after which
+ * the thread leaves through module_put_and_exit().
+ */
+static int nfs_deactivate_super_async_work(void *ptr)
+{
+	deactivate_super((struct super_block *)ptr);
+	module_put_and_exit(0);
+	return 0;
+}
+
+/*
+ * Same effect as deactivate_super(), but the final unmount is done from a
+ * kthread instead of the caller's context.
+ *
+ * atomic_add_unless() decrements sb->s_active unless it is exactly 1.  When
+ * it is 1 the count is left untouched and a thread is started that calls
+ * deactivate_super() on @sb.  If the thread cannot be created, the call is
+ * made synchronously as a fallback.
+ */
+static void nfs_deactivate_super_async(struct super_block *sb)
+{
+	struct task_struct *task;
+	char buf[INET6_ADDRSTRLEN + 1];
+	struct nfs_server *server = NFS_SB(sb);
+	struct nfs_client *clp = server->nfs_client;
+
+	if (!atomic_add_unless(&sb->s_active, -1, 1)) {
+		rcu_read_lock();
+		/*
+		 * Copy the peer address via an explicit "%s": the address
+		 * text must never be interpreted as a format string.
+		 */
+		snprintf(buf, sizeof(buf), "%s",
+			 rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+		rcu_read_unlock();
+
+		/* Module reference taken on behalf of the new thread. */
+		__module_get(THIS_MODULE);
+		task = kthread_run(nfs_deactivate_super_async_work, sb,
+				"%s-deactivate-super", buf);
+		if (IS_ERR(task)) {
+			pr_err("%s: kthread_run: %ld\n",
+				__func__, PTR_ERR(task));
+			/* make synchronous call and hope for the best */
+			deactivate_super(sb);
+			/* No thread was started, so drop its reference here. */
+			module_put(THIS_MODULE);
+		}
+	}
+}
+
+/*
+ * Decrement the per-server 'active' counter of @sb; once it reaches zero,
+ * release the superblock through nfs_deactivate_super_async().
+ */
+void nfs_sb_deactive_async(struct super_block *sb)
+{
+	if (!atomic_dec_and_test(&NFS_SB(sb)->active))
+		return;
+
+	nfs_deactivate_super_async(sb);
+}
+EXPORT_SYMBOL_GPL(nfs_sb_deactive_async);
+
/*
* Deliver file system statistics to userspace
*/
@@ -771,7 +820,7 @@ int nfs_show_devname(struct seq_file *m, struct dentry *root)
int err = 0;
if (!page)
return -ENOMEM;
- devname = nfs_path(&dummy, root, page, PAGE_SIZE);
+ devname = nfs_path(&dummy, root, page, PAGE_SIZE, 0);
if (IS_ERR(devname))
err = PTR_ERR(devname);
else
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 13cea637eff8..3f79c77153b8 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -95,7 +95,7 @@ static void nfs_async_unlink_release(void *calldata)
nfs_dec_sillycount(data->dir);
nfs_free_unlinkdata(data);
- nfs_sb_deactive(sb);
+ nfs_sb_deactive_async(sb);
}
static void nfs_unlink_prepare(struct rpc_task *task, void *calldata)
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 3e7b2a0dc0c8..07f76db04ec7 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -431,7 +431,7 @@ void nilfs_mapping_init(struct address_space *mapping, struct inode *inode,
mapping->host = inode;
mapping->flags = 0;
mapping_set_gfp_mask(mapping, GFP_NOFS);
- mapping->assoc_mapping = NULL;
+ mapping->private_data = NULL;
mapping->backing_dev_info = bdi;
mapping->a_ops = &empty_aops;
}
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index f35794b97e8e..a50636025364 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -21,6 +21,7 @@ static bool should_merge(struct fsnotify_event *old, struct fsnotify_event *new)
if ((old->path.mnt == new->path.mnt) &&
(old->path.dentry == new->path.dentry))
return true;
+ break;
case (FSNOTIFY_EVENT_NONE):
return true;
default:
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 721d692fa8d4..6fcaeb8c902e 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -258,7 +258,8 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
if (ret)
goto out_close_fd;
- fd_install(fd, f);
+ if (fd != FAN_NOFD)
+ fd_install(fd, f);
return fanotify_event_metadata.event_len;
out_close_fd:
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 5a4ee77cec51..dda089804942 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2513,18 +2513,15 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
ret = sd.num_spliced;
if (ret > 0) {
- unsigned long nr_pages;
int err;
- nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-
err = generic_write_sync(out, *ppos, ret);
if (err)
ret = err;
else
*ppos += ret;
- balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
+ balance_dirty_pages_ratelimited(mapping);
}
return ret;
diff --git a/fs/proc/array.c b/fs/proc/array.c
index c1c207c36cae..d3696708fc1a 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -438,7 +438,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
min_flt += sig->min_flt;
maj_flt += sig->maj_flt;
- thread_group_times(task, &utime, &stime);
+ thread_group_cputime_adjusted(task, &utime, &stime);
gtime += sig->gtime;
}
@@ -454,7 +454,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
if (!whole) {
min_flt = task->min_flt;
maj_flt = task->maj_flt;
- task_times(task, &utime, &stime);
+ task_cputime_adjusted(task, &utime, &stime);
gtime = task->gtime;
}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 144a96732dd7..aa63d25157b8 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -873,12 +873,119 @@ static const struct file_operations proc_environ_operations = {
.release = mem_release,
};
+/*
+ * Read handler for the legacy oom_adj file.
+ *
+ * Converts the task's oom_score_adj back to the oom_adj scale and returns it
+ * to userspace as a decimal string followed by a newline.  If the sighand
+ * lock cannot be taken, OOM_ADJUST_MIN is reported.  Returns -ESRCH when the
+ * task cannot be looked up.
+ */
+static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count,
+			    loff_t *ppos)
+{
+	struct task_struct *task;
+	unsigned long flags;
+	char buffer[PROC_NUMBUF];
+	int oom_adj = OOM_ADJUST_MIN;
+	size_t len;
+
+	task = get_proc_task(file->f_path.dentry->d_inode);
+	if (!task)
+		return -ESRCH;
+
+	if (lock_task_sighand(task, &flags)) {
+		int score = task->signal->oom_score_adj;
+
+		/* The maximum score always maps to the maximum oom_adj. */
+		oom_adj = (score == OOM_SCORE_ADJ_MAX) ?
+			OOM_ADJUST_MAX :
+			(score * -OOM_DISABLE) / OOM_SCORE_ADJ_MAX;
+		unlock_task_sighand(task, &flags);
+	}
+	put_task_struct(task);
+
+	len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj);
+	return simple_read_from_buffer(buf, count, ppos, buffer, len);
+}
+
+/*
+ * Write handler for the legacy oom_adj file.
+ *
+ * Parses an integer from at most sizeof(buffer) - 1 bytes of user input,
+ * validates it against [OOM_ADJUST_MIN, OOM_ADJUST_MAX] (OOM_DISABLE is also
+ * accepted), rescales it to the oom_score_adj range and stores it in
+ * task->signal->oom_score_adj.
+ *
+ * Returns the (possibly truncated) byte count on success, or a negative
+ * errno: -EFAULT on a failed user copy, the kstrtoint() error on a parse
+ * failure, -EINVAL for an out-of-range value or a task without an mm,
+ * -ESRCH when the task or its sighand is unavailable, and -EACCES when
+ * lowering the value without CAP_SYS_RESOURCE.
+ */
+static ssize_t oom_adj_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct task_struct *task;
+ char buffer[PROC_NUMBUF];
+ int oom_adj;
+ unsigned long flags;
+ int err;
+
+ /* Zero-fill so the copied text is always NUL-terminated. */
+ memset(buffer, 0, sizeof(buffer));
+ if (count > sizeof(buffer) - 1)
+ count = sizeof(buffer) - 1;
+ if (copy_from_user(buffer, buf, count)) {
+ err = -EFAULT;
+ goto out;
+ }
+
+ err = kstrtoint(strstrip(buffer), 0, &oom_adj);
+ if (err)
+ goto out;
+ /* OOM_DISABLE is accepted even if it lies outside the MIN..MAX range. */
+ if ((oom_adj < OOM_ADJUST_MIN || oom_adj > OOM_ADJUST_MAX) &&
+ oom_adj != OOM_DISABLE) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ task = get_proc_task(file->f_path.dentry->d_inode);
+ if (!task) {
+ err = -ESRCH;
+ goto out;
+ }
+
+ /* task_lock is taken first, the sighand lock nests inside it. */
+ task_lock(task);
+ if (!task->mm) {
+ err = -EINVAL;
+ goto err_task_lock;
+ }
+
+ if (!lock_task_sighand(task, &flags)) {
+ err = -ESRCH;
+ goto err_task_lock;
+ }
+
+ /*
+ * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum
+ * value is always attainable.
+ */
+ if (oom_adj == OOM_ADJUST_MAX)
+ oom_adj = OOM_SCORE_ADJ_MAX;
+ else
+ oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE;
+
+ /* From here on oom_adj holds the rescaled oom_score_adj value. */
+ if (oom_adj < task->signal->oom_score_adj &&
+ !capable(CAP_SYS_RESOURCE)) {
+ err = -EACCES;
+ goto err_sighand;
+ }
+
+ /*
+ * /proc/pid/oom_adj is provided for legacy purposes, ask users to use
+ * /proc/pid/oom_score_adj instead.
+ */
+ printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n",
+ current->comm, task_pid_nr(current), task_pid_nr(task),
+ task_pid_nr(task));
+
+ task->signal->oom_score_adj = oom_adj;
+ trace_oom_score_adj_update(task);
+ /* Unwind in reverse order of acquisition. */
+err_sighand:
+ unlock_task_sighand(task, &flags);
+err_task_lock:
+ task_unlock(task);
+ put_task_struct(task);
+out:
+ /* err is 0 on the success path, so the consumed byte count is returned. */
+ return err < 0 ? err : count;
+}
+
+/* File operations for the legacy "oom_adj" entry registered under /proc. */
+static const struct file_operations proc_oom_adj_operations = {
+	.llseek		= generic_file_llseek,
+	.read		= oom_adj_read,
+	.write		= oom_adj_write,
+};
+
static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode);
char buffer[PROC_NUMBUF];
- int oom_score_adj = OOM_SCORE_ADJ_MIN;
+ short oom_score_adj = OOM_SCORE_ADJ_MIN;
unsigned long flags;
size_t len;
@@ -889,7 +996,7 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
unlock_task_sighand(task, &flags);
}
put_task_struct(task);
- len = snprintf(buffer, sizeof(buffer), "%d\n", oom_score_adj);
+ len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj);
return simple_read_from_buffer(buf, count, ppos, buffer, len);
}
@@ -936,15 +1043,15 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
goto err_task_lock;
}
- if (oom_score_adj < task->signal->oom_score_adj_min &&
+ if ((short)oom_score_adj < task->signal->oom_score_adj_min &&
!capable(CAP_SYS_RESOURCE)) {
err = -EACCES;
goto err_sighand;
}
- task->signal->oom_score_adj = oom_score_adj;
+ task->signal->oom_score_adj = (short)oom_score_adj;
if (has_capability_noaudit(current, CAP_SYS_RESOURCE))
- task->signal->oom_score_adj_min = oom_score_adj;
+ task->signal->oom_score_adj_min = (short)oom_score_adj;
trace_oom_score_adj_update(task);
err_sighand:
@@ -1770,8 +1877,9 @@ static struct dentry *proc_map_files_lookup(struct inode *dir,
if (!vma)
goto out_no_vma;
- result = proc_map_files_instantiate(dir, dentry, task,
- (void *)(unsigned long)vma->vm_file->f_mode);
+ if (vma->vm_file)
+ result = proc_map_files_instantiate(dir, dentry, task,
+ (void *)(unsigned long)vma->vm_file->f_mode);
out_no_vma:
up_read(&mm->mmap_sem);
@@ -2598,6 +2706,7 @@ static const struct pid_entry tgid_base_stuff[] = {
REG("cgroup", S_IRUGO, proc_cgroup_operations),
#endif
INF("oom_score", S_IRUGO, proc_oom_score),
+ REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
#ifdef CONFIG_AUDITSYSCALL
REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
@@ -2964,6 +3073,7 @@ static const struct pid_entry tid_base_stuff[] = {
REG("cgroup", S_IRUGO, proc_cgroup_operations),
#endif
INF("oom_score", S_IRUGO, proc_oom_score),
+ REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations),
REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations),
#ifdef CONFIG_AUDITSYSCALL
REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations),
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c
index a781bdf06694..701580ddfcc3 100644
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -378,12 +378,13 @@ static int test_perm(int mode, int op)
return -EACCES;
}
-static int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
+static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, int op)
{
+ struct ctl_table_root *root = head->root;
int mode;
if (root->permissions)
- mode = root->permissions(root, current->nsproxy, table);
+ mode = root->permissions(head, table);
else
mode = table->mode;
@@ -491,7 +492,7 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf,
* and won't be until we finish.
*/
error = -EPERM;
- if (sysctl_perm(head->root, table, write ? MAY_WRITE : MAY_READ))
+ if (sysctl_perm(head, table, write ? MAY_WRITE : MAY_READ))
goto out;
/* if that can happen at all, it should be -EINVAL, not -EISDIR */
@@ -717,7 +718,7 @@ static int proc_sys_permission(struct inode *inode, int mask)
if (!table) /* global root - r-xr-xr-x */
error = mask & MAY_WRITE ? -EACCES : 0;
else /* Use the permissions on the sysctl table entry */
- error = sysctl_perm(head->root, table, mask & ~MAY_NOT_BLOCK);
+ error = sysctl_perm(head, table, mask & ~MAY_NOT_BLOCK);
sysctl_head_finish(head);
return error;
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 4ab572e6d277..ed1d8c7212da 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -49,6 +49,7 @@ struct pstore_private {
struct pstore_info *psi;
enum pstore_type_id type;
u64 id;
+ int count;
ssize_t size;
char data[];
};
@@ -175,7 +176,8 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry)
struct pstore_private *p = dentry->d_inode->i_private;
if (p->psi->erase)
- p->psi->erase(p->type, p->id, p->psi);
+ p->psi->erase(p->type, p->id, p->count,
+ dentry->d_inode->i_ctime, p->psi);
return simple_unlink(dir, dentry);
}
@@ -270,7 +272,7 @@ int pstore_is_mounted(void)
* Load it up with "size" bytes of data from "buf".
* Set the mtime & ctime to the date that this record was originally stored.
*/
-int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id,
+int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
char *data, size_t size, struct timespec time,
struct pstore_info *psi)
{
@@ -306,6 +308,7 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id,
goto fail_alloc;
private->type = type;
private->id = id;
+ private->count = count;
private->psi = psi;
switch (type) {
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index 4847f588b7d5..937d820f273c 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -50,7 +50,7 @@ extern struct pstore_info *psinfo;
extern void pstore_set_kmsg_bytes(int);
extern void pstore_get_records(int);
extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id,
- char *data, size_t size,
+ int count, char *data, size_t size,
struct timespec time, struct pstore_info *psi);
extern int pstore_is_mounted(void);
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index a40da07e93d6..5ea2e77ff023 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -136,7 +136,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
break;
ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part,
- hsize + len, psinfo);
+ oopscount, hsize + len, psinfo);
if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted())
pstore_new_entry = 1;
@@ -161,6 +161,7 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c)
while (s < e) {
unsigned long flags;
+ u64 id;
if (c > psinfo->bufsize)
c = psinfo->bufsize;
@@ -172,7 +173,7 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c)
spin_lock_irqsave(&psinfo->buf_lock, flags);
}
memcpy(psinfo->buf, s, c);
- psinfo->write(PSTORE_TYPE_CONSOLE, 0, NULL, 0, c, psinfo);
+ psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, 0, c, psinfo);
spin_unlock_irqrestore(&psinfo->buf_lock, flags);
s += c;
c = e - s;
@@ -196,7 +197,7 @@ static void pstore_register_console(void) {}
static int pstore_write_compat(enum pstore_type_id type,
enum kmsg_dump_reason reason,
- u64 *id, unsigned int part,
+ u64 *id, unsigned int part, int count,
size_t size, struct pstore_info *psi)
{
return psi->write_buf(type, reason, id, part, psinfo->buf, size, psi);
@@ -266,6 +267,7 @@ void pstore_get_records(int quiet)
char *buf = NULL;
ssize_t size;
u64 id;
+ int count;
enum pstore_type_id type;
struct timespec time;
int failed = 0, rc;
@@ -277,9 +279,9 @@ void pstore_get_records(int quiet)
if (psi->open && psi->open(psi))
goto out;
- while ((size = psi->read(&id, &type, &time, &buf, psi)) > 0) {
- rc = pstore_mkfile(type, psi->name, id, buf, (size_t)size,
- time, psi);
+ while ((size = psi->read(&id, &type, &count, &time, &buf, psi)) > 0) {
+ rc = pstore_mkfile(type, psi->name, id, count, buf,
+ (size_t)size, time, psi);
kfree(buf);
buf = NULL;
if (rc && (rc != -EEXIST || !quiet))
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 1a4f6da58eab..2bfa36e0ffe8 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -132,9 +132,8 @@ ramoops_get_next_prz(struct persistent_ram_zone *przs[], uint *c, uint max,
}
static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
- struct timespec *time,
- char **buf,
- struct pstore_info *psi)
+ int *count, struct timespec *time,
+ char **buf, struct pstore_info *psi)
{
ssize_t size;
struct ramoops_context *cxt = psi->data;
@@ -236,8 +235,8 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
return 0;
}
-static int ramoops_pstore_erase(enum pstore_type_id type, u64 id,
- struct pstore_info *psi)
+static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count,
+ struct timespec time, struct pstore_info *psi)
{
struct ramoops_context *cxt = psi->data;
struct persistent_ram_zone *prz;
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index f27f01a98aa2..d83736fbc26c 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1782,8 +1782,9 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
BUG_ON(!th->t_trans_id);
- dquot_initialize(inode);
+ reiserfs_write_unlock(inode->i_sb);
err = dquot_alloc_inode(inode);
+ reiserfs_write_lock(inode->i_sb);
if (err)
goto out_end_trans;
if (!dir->i_nlink) {
@@ -1979,8 +1980,10 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th,
out_end_trans:
journal_end(th, th->t_super, th->t_blocks_allocated);
+ reiserfs_write_unlock(inode->i_sb);
/* Drop can be outside and it needs more credits so it's better to have it outside */
dquot_drop(inode);
+ reiserfs_write_lock(inode->i_sb);
inode->i_flags |= S_NOQUOTA;
make_bad_inode(inode);
@@ -3103,10 +3106,9 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
/* must be turned off for recursive notify_change calls */
ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID);
- depth = reiserfs_write_lock_once(inode->i_sb);
if (is_quota_modification(inode, attr))
dquot_initialize(inode);
-
+ depth = reiserfs_write_lock_once(inode->i_sb);
if (attr->ia_valid & ATTR_SIZE) {
/* version 2 items will be caught by the s_maxbytes check
** done for us in vmtruncate
@@ -3170,7 +3172,9 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr)
error = journal_begin(&th, inode->i_sb, jbegin_count);
if (error)
goto out;
+ reiserfs_write_unlock_once(inode->i_sb, depth);
error = dquot_transfer(inode, attr);
+ depth = reiserfs_write_lock_once(inode->i_sb);
if (error) {
journal_end(&th, inode->i_sb, jbegin_count);
goto out;
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c
index f8afa4b162b8..2f40a4c70a4d 100644
--- a/fs/reiserfs/stree.c
+++ b/fs/reiserfs/stree.c
@@ -1968,7 +1968,9 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree
key2type(&(key->on_disk_key)));
#endif
+ reiserfs_write_unlock(inode->i_sb);
retval = dquot_alloc_space_nodirty(inode, pasted_size);
+ reiserfs_write_lock(inode->i_sb);
if (retval) {
pathrelse(search_path);
return retval;
@@ -2061,9 +2063,11 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th,
"reiserquota insert_item(): allocating %u id=%u type=%c",
quota_bytes, inode->i_uid, head2type(ih));
#endif
+ reiserfs_write_unlock(inode->i_sb);
/* We can't dirty inode here. It would be immediately written but
* appropriate stat item isn't inserted yet... */
retval = dquot_alloc_space_nodirty(inode, quota_bytes);
+ reiserfs_write_lock(inode->i_sb);
if (retval) {
pathrelse(path);
return retval;
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 1078ae179993..418bdc3a57da 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -298,7 +298,9 @@ static int finish_unfinished(struct super_block *s)
retval = remove_save_link_only(s, &save_link_key, 0);
continue;
}
+ reiserfs_write_unlock(s);
dquot_initialize(inode);
+ reiserfs_write_lock(s);
if (truncate && S_ISDIR(inode->i_mode)) {
/* We got a truncate request for a dir which is impossible.
@@ -1335,7 +1337,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
kfree(qf_names[i]);
#endif
err = -EINVAL;
- goto out_err;
+ goto out_unlock;
}
#ifdef CONFIG_QUOTA
handle_quota_files(s, qf_names, &qfmt);
@@ -1379,7 +1381,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
if (blocks) {
err = reiserfs_resize(s, blocks);
if (err != 0)
- goto out_err;
+ goto out_unlock;
}
if (*mount_flags & MS_RDONLY) {
@@ -1389,9 +1391,15 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
/* it is read-only already */
goto out_ok;
+ /*
+ * Drop write lock. Quota will retake it when needed and lock
+ * ordering requires calling dquot_suspend() without it.
+ */
+ reiserfs_write_unlock(s);
err = dquot_suspend(s, -1);
if (err < 0)
goto out_err;
+ reiserfs_write_lock(s);
/* try to remount file system with read-only permissions */
if (sb_umount_state(rs) == REISERFS_VALID_FS
@@ -1401,7 +1409,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
err = journal_begin(&th, s, 10);
if (err)
- goto out_err;
+ goto out_unlock;
/* Mounting a rw partition read-only. */
reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
@@ -1416,7 +1424,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
if (reiserfs_is_journal_aborted(journal)) {
err = journal->j_errno;
- goto out_err;
+ goto out_unlock;
}
handle_data_mode(s, mount_options);
@@ -1425,7 +1433,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
s->s_flags &= ~MS_RDONLY; /* now it is safe to call journal_begin */
err = journal_begin(&th, s, 10);
if (err)
- goto out_err;
+ goto out_unlock;
/* Mount a partition which is read-only, read-write */
reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1);
@@ -1442,10 +1450,16 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
SB_JOURNAL(s)->j_must_wait = 1;
err = journal_end(&th, s, 10);
if (err)
- goto out_err;
+ goto out_unlock;
if (!(*mount_flags & MS_RDONLY)) {
+ /*
+ * Drop write lock. Quota will retake it when needed and lock
+ * ordering requires calling dquot_resume() without it.
+ */
+ reiserfs_write_unlock(s);
dquot_resume(s, -1);
+ reiserfs_write_lock(s);
finish_unfinished(s);
reiserfs_xattr_init(s, *mount_flags);
}
@@ -1455,9 +1469,10 @@ out_ok:
reiserfs_write_unlock(s);
return 0;
+out_unlock:
+ reiserfs_write_unlock(s);
out_err:
kfree(new_opts);
- reiserfs_write_unlock(s);
return err;
}
@@ -2095,13 +2110,15 @@ static int reiserfs_write_dquot(struct dquot *dquot)
REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
if (ret)
goto out;
+ reiserfs_write_unlock(dquot->dq_sb);
ret = dquot_commit(dquot);
+ reiserfs_write_lock(dquot->dq_sb);
err =
journal_end(&th, dquot->dq_sb,
REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
if (!ret && err)
ret = err;
- out:
+out:
reiserfs_write_unlock(dquot->dq_sb);
return ret;
}
@@ -2117,13 +2134,15 @@ static int reiserfs_acquire_dquot(struct dquot *dquot)
REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb));
if (ret)
goto out;
+ reiserfs_write_unlock(dquot->dq_sb);
ret = dquot_acquire(dquot);
+ reiserfs_write_lock(dquot->dq_sb);
err =
journal_end(&th, dquot->dq_sb,
REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb));
if (!ret && err)
ret = err;
- out:
+out:
reiserfs_write_unlock(dquot->dq_sb);
return ret;
}
@@ -2137,19 +2156,21 @@ static int reiserfs_release_dquot(struct dquot *dquot)
ret =
journal_begin(&th, dquot->dq_sb,
REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb));
+ reiserfs_write_unlock(dquot->dq_sb);
if (ret) {
/* Release dquot anyway to avoid endless cycle in dqput() */
dquot_release(dquot);
goto out;
}
ret = dquot_release(dquot);
+ reiserfs_write_lock(dquot->dq_sb);
err =
journal_end(&th, dquot->dq_sb,
REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb));
if (!ret && err)
ret = err;
- out:
reiserfs_write_unlock(dquot->dq_sb);
+out:
return ret;
}
@@ -2174,11 +2195,13 @@ static int reiserfs_write_info(struct super_block *sb, int type)
ret = journal_begin(&th, sb, 2);
if (ret)
goto out;
+ reiserfs_write_unlock(sb);
ret = dquot_commit_info(sb, type);
+ reiserfs_write_lock(sb);
err = journal_end(&th, sb, 2);
if (!ret && err)
ret = err;
- out:
+out:
reiserfs_write_unlock(sb);
return ret;
}
@@ -2203,8 +2226,11 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
struct reiserfs_transaction_handle th;
int opt = type == USRQUOTA ? REISERFS_USRQUOTA : REISERFS_GRPQUOTA;
- if (!(REISERFS_SB(sb)->s_mount_opt & (1 << opt)))
- return -EINVAL;
+ reiserfs_write_lock(sb);
+ if (!(REISERFS_SB(sb)->s_mount_opt & (1 << opt))) {
+ err = -EINVAL;
+ goto out;
+ }
/* Quotafile not on the same filesystem? */
if (path->dentry->d_sb != sb) {
@@ -2246,8 +2272,10 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
if (err)
goto out;
}
- err = dquot_quota_on(sb, type, format_id, path);
+ reiserfs_write_unlock(sb);
+ return dquot_quota_on(sb, type, format_id, path);
out:
+ reiserfs_write_unlock(sb);
return err;
}
@@ -2320,7 +2348,9 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
tocopy = sb->s_blocksize - offset < towrite ?
sb->s_blocksize - offset : towrite;
tmp_bh.b_state = 0;
+ reiserfs_write_lock(sb);
err = reiserfs_get_block(inode, blk, &tmp_bh, GET_BLOCK_CREATE);
+ reiserfs_write_unlock(sb);
if (err)
goto out;
if (offset || tocopy != sb->s_blocksize)
@@ -2336,10 +2366,12 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type,
flush_dcache_page(bh->b_page);
set_buffer_uptodate(bh);
unlock_buffer(bh);
+ reiserfs_write_lock(sb);
reiserfs_prepare_for_journal(sb, bh, 1);
journal_mark_dirty(current->journal_info, sb, bh);
if (!journal_quota)
reiserfs_add_ordered_list(inode, bh);
+ reiserfs_write_unlock(sb);
brelse(bh);
offset = 0;
towrite -= tocopy;
diff --git a/fs/splice.c b/fs/splice.c
index 13e5b4776e7a..8890604e3fcd 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1024,17 +1024,14 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
ret = sd.num_spliced;
if (ret > 0) {
- unsigned long nr_pages;
int err;
- nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
-
err = generic_write_sync(out, *ppos, ret);
if (err)
ret = err;
else
*ppos += ret;
- balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
+ balance_dirty_pages_ratelimited(mapping);
}
sb_end_write(inode->i_sb);
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 00012e31829d..602f56db0442 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -485,8 +485,8 @@ const struct file_operations sysfs_file_operations = {
.poll = sysfs_poll,
};
-int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr,
- const void **pns)
+static int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr,
+ const void **pns)
{
struct sysfs_dirent *dir_sd = kobj->sd;
const struct sysfs_ops *ops;
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index 28ec13af28d9..2dcf3d473fec 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -681,8 +681,16 @@ int ubifs_find_free_leb_for_idx(struct ubifs_info *c)
if (!lprops) {
lprops = ubifs_fast_find_freeable(c);
if (!lprops) {
- ubifs_assert(c->freeable_cnt == 0);
- if (c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) {
+ /*
+ * The first condition means the following: go scan the
+ * LPT if there are uncategorized lprops, which means
+ * there may be freeable LEBs there (UBIFS does not
+ * store the information about freeable LEBs in the
+ * master node).
+ */
+ if (c->in_a_category_cnt != c->main_lebs ||
+ c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) {
+ ubifs_assert(c->freeable_cnt == 0);
lprops = scan_for_leb_for_idx(c);
if (IS_ERR(lprops)) {
err = PTR_ERR(lprops);
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c
index e5a2a35a46dc..46190a7c42a6 100644
--- a/fs/ubifs/lprops.c
+++ b/fs/ubifs/lprops.c
@@ -300,8 +300,11 @@ void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops,
default:
ubifs_assert(0);
}
+
lprops->flags &= ~LPROPS_CAT_MASK;
lprops->flags |= cat;
+ c->in_a_category_cnt += 1;
+ ubifs_assert(c->in_a_category_cnt <= c->main_lebs);
}
/**
@@ -334,6 +337,9 @@ static void ubifs_remove_from_cat(struct ubifs_info *c,
default:
ubifs_assert(0);
}
+
+ c->in_a_category_cnt -= 1;
+ ubifs_assert(c->in_a_category_cnt >= 0);
}
/**
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 5486346d0a3f..d133c276fe05 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1183,6 +1183,8 @@ struct ubifs_debug_info;
* @freeable_list: list of freeable non-index LEBs (free + dirty == @leb_size)
* @frdi_idx_list: list of freeable index LEBs (free + dirty == @leb_size)
* @freeable_cnt: number of freeable LEBs in @freeable_list
+ * @in_a_category_cnt: count of lprops which are in a certain category, which
+ * basically means that they were loaded from the flash
*
* @ltab_lnum: LEB number of LPT's own lprops table
* @ltab_offs: offset of LPT's own lprops table
@@ -1412,6 +1414,7 @@ struct ubifs_info {
struct list_head freeable_list;
struct list_head frdi_idx_list;
int freeable_cnt;
+ int in_a_category_cnt;
int ltab_lnum;
int ltab_offs;
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig
index 6100ec0fa1d4..5a7ffe54f5d5 100644
--- a/fs/xfs/Kconfig
+++ b/fs/xfs/Kconfig
@@ -2,6 +2,7 @@ config XFS_FS
tristate "XFS filesystem support"
depends on BLOCK
select EXPORTFS
+ select LIBCRC32C
help
XFS is a high performance journaling filesystem which originated
on the SGI IRIX platform. It is completely multi-threaded, can
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index d2bf974b1a2f..d02201df855b 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -37,9 +37,8 @@ xfs-y += xfs_aops.o \
xfs_file.o \
xfs_filestream.o \
xfs_fsops.o \
- xfs_fs_subr.o \
xfs_globals.o \
- xfs_iget.o \
+ xfs_icache.o \
xfs_ioctl.o \
xfs_iomap.o \
xfs_iops.o \
@@ -47,7 +46,6 @@ xfs-y += xfs_aops.o \
xfs_message.o \
xfs_mru_cache.o \
xfs_super.o \
- xfs_sync.o \
xfs_xattr.o \
xfs_rename.o \
xfs_utils.o \
diff --git a/fs/xfs/uuid.h b/fs/xfs/uuid.h
index 4732d71262cc..104db0f3bed6 100644
--- a/fs/xfs/uuid.h
+++ b/fs/xfs/uuid.h
@@ -26,4 +26,10 @@ extern int uuid_is_nil(uuid_t *uuid);
extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2);
extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]);
+static inline void
+uuid_copy(uuid_t *dst, uuid_t *src)
+{
+ memcpy(dst, src, sizeof(uuid_t));
+}
+
#endif /* __XFS_SUPPORT_UUID_H__ */
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 44d65c1533c0..f2aeedb6a579 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -108,6 +108,8 @@ typedef struct xfs_agf {
extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
+extern const struct xfs_buf_ops xfs_agf_buf_ops;
+
/*
* Size of the unlinked inode hash table in the agi.
*/
@@ -161,6 +163,8 @@ typedef struct xfs_agi {
extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_agnumber_t agno, struct xfs_buf **bpp);
+extern const struct xfs_buf_ops xfs_agi_buf_ops;
+
/*
* The third a.g. block contains the a.g. freelist, an array
* of block pointers to blocks owned by the allocation btree code.
@@ -233,6 +237,7 @@ typedef struct xfs_perag {
#define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup
in xfs_inode_ag_iterator */
#define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */
+#define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */
#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels)
#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 4f33c32affe3..393055fe3aef 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -430,6 +430,60 @@ xfs_alloc_fixup_trees(
return 0;
}
+static void
+xfs_agfl_verify(
+ struct xfs_buf *bp)
+{
+#ifdef WHEN_CRCS_COME_ALONG
+ /*
+ * we cannot actually do any verification of the AGFL because mkfs does
+ * not initialise the AGFL to zero or NULL. Hence the only valid part of
+ * the AGFL is what the AGF says is active. We can't get to the AGF, so
+ * we can't verify just those entries are valid.
+ *
+ * This problem goes away when the CRC format change comes along as that
+ * requires the AGFL to be initialised by mkfs. At that point, we can
+ * verify the blocks in the agfl -active or not- lie within the bounds
+ * of the AG. Until then, just leave this check ifdef'd out.
+ */
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
+ int agfl_ok = 1;
+
+ int i;
+
+ for (i = 0; i < XFS_AGFL_SIZE(mp); i++) {
+ if (be32_to_cpu(agfl->agfl_bno[i]) == NULLAGBLOCK ||
+ be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks)
+ agfl_ok = 0;
+ }
+
+ if (!agfl_ok) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agfl);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
+#endif
+}
+
+static void
+xfs_agfl_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_agfl_verify(bp);
+}
+
+static void
+xfs_agfl_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_agfl_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_agfl_buf_ops = {
+ .verify_read = xfs_agfl_read_verify,
+ .verify_write = xfs_agfl_write_verify,
+};
+
/*
* Read in the allocation group free block array.
*/
@@ -447,7 +501,7 @@ xfs_alloc_read_agfl(
error = xfs_trans_read_buf(
mp, tp, mp->m_ddev_targp,
XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), 0, &bp);
+ XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
if (error)
return error;
ASSERT(!xfs_buf_geterror(bp));
@@ -1866,6 +1920,7 @@ xfs_alloc_fix_freelist(
/*
* Initialize the args structure.
*/
+ memset(&targs, 0, sizeof(targs));
targs.tp = tp;
targs.mp = mp;
targs.agbp = agbp;
@@ -2090,6 +2145,63 @@ xfs_alloc_put_freelist(
return 0;
}
+static void
+xfs_agf_verify(
+ struct xfs_buf *bp)
+ {
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_agf *agf;
+ int agf_ok;
+
+ agf = XFS_BUF_TO_AGF(bp);
+
+ agf_ok = agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
+ XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
+ be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
+ be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
+ be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
+ be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp);
+
+ /*
+ * during growfs operations, the perag is not fully initialised,
+ * so we can't use it for any useful checking. growfs ensures we can't
+ * use it by using uncached buffers that don't have the perag attached
+ * so we can detect and avoid this problem.
+ */
+ if (bp->b_pag)
+ agf_ok = agf_ok && be32_to_cpu(agf->agf_seqno) ==
+ bp->b_pag->pag_agno;
+
+ if (xfs_sb_version_haslazysbcount(&mp->m_sb))
+ agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <=
+ be32_to_cpu(agf->agf_length);
+
+ if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
+ XFS_RANDOM_ALLOC_READ_AGF))) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agf);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
+}
+
+static void
+xfs_agf_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_agf_verify(bp);
+}
+
+static void
+xfs_agf_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_agf_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_agf_buf_ops = {
+ .verify_read = xfs_agf_read_verify,
+ .verify_write = xfs_agf_write_verify,
+};
+
/*
* Read in the allocation group header (free/alloc section).
*/
@@ -2101,44 +2213,19 @@ xfs_read_agf(
int flags, /* XFS_BUF_ */
struct xfs_buf **bpp) /* buffer for the ag freelist header */
{
- struct xfs_agf *agf; /* ag freelist header */
- int agf_ok; /* set if agf is consistent */
int error;
ASSERT(agno != NULLAGNUMBER);
error = xfs_trans_read_buf(
mp, tp, mp->m_ddev_targp,
XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), flags, bpp);
+ XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops);
if (error)
return error;
if (!*bpp)
return 0;
ASSERT(!(*bpp)->b_error);
- agf = XFS_BUF_TO_AGF(*bpp);
-
- /*
- * Validate the magic number of the agf block.
- */
- agf_ok =
- agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) &&
- XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
- be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
- be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) &&
- be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) &&
- be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp) &&
- be32_to_cpu(agf->agf_seqno) == agno;
- if (xfs_sb_version_haslazysbcount(&mp->m_sb))
- agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <=
- be32_to_cpu(agf->agf_length);
- if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF,
- XFS_RANDOM_ALLOC_READ_AGF))) {
- XFS_CORRUPTION_ERROR("xfs_alloc_read_agf",
- XFS_ERRLEVEL_LOW, mp, agf);
- xfs_trans_brelse(tp, *bpp);
- return XFS_ERROR(EFSCORRUPTED);
- }
xfs_buf_set_ref(*bpp, XFS_AGF_REF);
return 0;
}
@@ -2207,7 +2294,7 @@ xfs_alloc_read_agf(
* group or loop over the allocation groups to find the result.
*/
int /* error */
-__xfs_alloc_vextent(
+xfs_alloc_vextent(
xfs_alloc_arg_t *args) /* allocation argument structure */
{
xfs_agblock_t agsize; /* allocation group size */
@@ -2417,46 +2504,6 @@ error0:
return error;
}
-static void
-xfs_alloc_vextent_worker(
- struct work_struct *work)
-{
- struct xfs_alloc_arg *args = container_of(work,
- struct xfs_alloc_arg, work);
- unsigned long pflags;
-
- /* we are in a transaction context here */
- current_set_flags_nested(&pflags, PF_FSTRANS);
-
- args->result = __xfs_alloc_vextent(args);
- complete(args->done);
-
- current_restore_flags_nested(&pflags, PF_FSTRANS);
-}
-
-/*
- * Data allocation requests often come in with little stack to work on. Push
- * them off to a worker thread so there is lots of stack to use. Metadata
- * requests, OTOH, are generally from low stack usage paths, so avoid the
- * context switch overhead here.
- */
-int
-xfs_alloc_vextent(
- struct xfs_alloc_arg *args)
-{
- DECLARE_COMPLETION_ONSTACK(done);
-
- if (!args->userdata)
- return __xfs_alloc_vextent(args);
-
-
- args->done = &done;
- INIT_WORK_ONSTACK(&args->work, xfs_alloc_vextent_worker);
- queue_work(xfs_alloc_wq, &args->work);
- wait_for_completion(&done);
- return args->result;
-}
-
/*
* Free an extent.
* Just break up the extent address and hand off to xfs_free_ag_extent
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 93be4a667ca1..99d0a6101558 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -120,9 +120,6 @@ typedef struct xfs_alloc_arg {
char isfl; /* set if is freelist blocks - !acctg */
char userdata; /* set if this is user data */
xfs_fsblock_t firstblock; /* io first block allocated */
- struct completion *done;
- struct work_struct work;
- int result;
} xfs_alloc_arg_t;
/*
@@ -234,4 +231,7 @@ xfs_alloc_get_rec(
xfs_extlen_t *len, /* output: length of extent */
int *stat); /* output: success/failure */
+extern const struct xfs_buf_ops xfs_agf_buf_ops;
+extern const struct xfs_buf_ops xfs_agfl_buf_ops;
+
#endif /* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index f1647caace8f..b1ddef6b2689 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -121,6 +121,8 @@ xfs_allocbt_free_block(
xfs_extent_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1,
XFS_EXTENT_BUSY_SKIP_DISCARD);
xfs_trans_agbtree_delta(cur->bc_tp, -1);
+
+ xfs_trans_binval(cur->bc_tp, bp);
return 0;
}
@@ -270,6 +272,82 @@ xfs_allocbt_key_diff(
return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock;
}
+static void
+xfs_allocbt_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ struct xfs_perag *pag = bp->b_pag;
+ unsigned int level;
+ int sblock_ok; /* block passes checks */
+
+ /*
+ * magic number and level verification
+ *
+ * During growfs operations, we can't verify the exact level as the
+ * perag is not fully initialised and hence not attached to the buffer.
+ * In this case, check against the maximum tree depth.
+ */
+ level = be16_to_cpu(block->bb_level);
+ switch (block->bb_magic) {
+ case cpu_to_be32(XFS_ABTB_MAGIC):
+ if (pag)
+ sblock_ok = level < pag->pagf_levels[XFS_BTNUM_BNOi];
+ else
+ sblock_ok = level < mp->m_ag_maxlevels;
+ break;
+ case cpu_to_be32(XFS_ABTC_MAGIC):
+ if (pag)
+ sblock_ok = level < pag->pagf_levels[XFS_BTNUM_CNTi];
+ else
+ sblock_ok = level < mp->m_ag_maxlevels;
+ break;
+ default:
+ sblock_ok = 0;
+ break;
+ }
+
+ /* numrecs verification */
+ sblock_ok = sblock_ok &&
+ be16_to_cpu(block->bb_numrecs) <= mp->m_alloc_mxr[level != 0];
+
+ /* sibling pointer verification */
+ sblock_ok = sblock_ok &&
+ (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
+ be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) &&
+ block->bb_u.s.bb_leftsib &&
+ (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
+ be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) &&
+ block->bb_u.s.bb_rightsib;
+
+ if (!sblock_ok) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
+}
+
+static void
+xfs_allocbt_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_allocbt_verify(bp);
+}
+
+static void
+xfs_allocbt_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_allocbt_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_allocbt_buf_ops = {
+ .verify_read = xfs_allocbt_read_verify,
+ .verify_write = xfs_allocbt_write_verify,
+};
+
+
#ifdef DEBUG
STATIC int
xfs_allocbt_keys_inorder(
@@ -325,6 +403,7 @@ static const struct xfs_btree_ops xfs_allocbt_ops = {
.init_rec_from_cur = xfs_allocbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur,
.key_diff = xfs_allocbt_key_diff,
+ .buf_ops = &xfs_allocbt_buf_ops,
#ifdef DEBUG
.keys_inorder = xfs_allocbt_keys_inorder,
.recs_inorder = xfs_allocbt_recs_inorder,
diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h
index 359fb86ed876..7e89a2b429dd 100644
--- a/fs/xfs/xfs_alloc_btree.h
+++ b/fs/xfs/xfs_alloc_btree.h
@@ -93,4 +93,6 @@ extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *,
xfs_agnumber_t, xfs_btnum_t);
extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int);
+extern const struct xfs_buf_ops xfs_allocbt_buf_ops;
+
#endif /* __XFS_ALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index e562dd43f41f..4111a40ebe1a 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -124,7 +124,7 @@ xfs_setfilesize_trans_alloc(
ioend->io_append_trans = tp;
/*
- * We will pass freeze protection with a transaction. So tell lockdep
+ * We may pass freeze protection with a transaction. So tell lockdep
* we released it.
*/
rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
@@ -149,11 +149,13 @@ xfs_setfilesize(
xfs_fsize_t isize;
/*
- * The transaction was allocated in the I/O submission thread,
- * thus we need to mark ourselves as beeing in a transaction
- * manually.
+ * The transaction may have been allocated in the I/O submission thread,
+ * thus we need to mark ourselves as being in a transaction manually.
+ * Similarly for freeze protection.
*/
current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
+ rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
+ 0, 1, _THIS_IP_);
xfs_ilock(ip, XFS_ILOCK_EXCL);
isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size);
@@ -187,7 +189,8 @@ xfs_finish_ioend(
if (ioend->io_type == XFS_IO_UNWRITTEN)
queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
- else if (ioend->io_append_trans)
+ else if (ioend->io_append_trans ||
+ (ioend->io_isdirect && xfs_ioend_is_append(ioend)))
queue_work(mp->m_data_workqueue, &ioend->io_work);
else
xfs_destroy_ioend(ioend);
@@ -205,15 +208,6 @@ xfs_end_io(
struct xfs_inode *ip = XFS_I(ioend->io_inode);
int error = 0;
- if (ioend->io_append_trans) {
- /*
- * We've got freeze protection passed with the transaction.
- * Tell lockdep about it.
- */
- rwsem_acquire_read(
- &ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
- 0, 1, _THIS_IP_);
- }
if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
ioend->io_error = -EIO;
goto done;
@@ -226,35 +220,31 @@ xfs_end_io(
* range to normal written extens after the data I/O has finished.
*/
if (ioend->io_type == XFS_IO_UNWRITTEN) {
+ error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
+ ioend->io_size);
+ } else if (ioend->io_isdirect && xfs_ioend_is_append(ioend)) {
/*
- * For buffered I/O we never preallocate a transaction when
- * doing the unwritten extent conversion, but for direct I/O
- * we do not know if we are converting an unwritten extent
- * or not at the point where we preallocate the transaction.
+ * For direct I/O we do not know if we need to allocate blocks
+ * or not so we can't preallocate an append transaction as that
+ * results in nested reservations and log space deadlocks. Hence
+ * allocate the transaction here. While this is sub-optimal and
+ * can block IO completion for some time, we're stuck with doing
+ * it this way until we can pass the ioend to the direct IO
+ * allocation callbacks and avoid nesting that way.
*/
- if (ioend->io_append_trans) {
- ASSERT(ioend->io_isdirect);
-
- current_set_flags_nested(
- &ioend->io_append_trans->t_pflags, PF_FSTRANS);
- xfs_trans_cancel(ioend->io_append_trans, 0);
- }
-
- error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
- ioend->io_size);
- if (error) {
- ioend->io_error = -error;
+ error = xfs_setfilesize_trans_alloc(ioend);
+ if (error)
goto done;
- }
+ error = xfs_setfilesize(ioend);
} else if (ioend->io_append_trans) {
error = xfs_setfilesize(ioend);
- if (error)
- ioend->io_error = -error;
} else {
ASSERT(!xfs_ioend_is_append(ioend));
}
done:
+ if (error)
+ ioend->io_error = -error;
xfs_destroy_ioend(ioend);
}
@@ -481,11 +471,17 @@ static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh)
*
* The fix is two passes across the ioend list - one to start writeback on the
* buffer_heads, and then submit them for I/O on the second pass.
+ *
+ * If @fail is non-zero, it means that we have a situation where some part of
+ * the submission process has failed after we have marked pages for writeback
+ * and unlocked them. In this situation, we need to fail the ioend chain rather
+ * than submit it to IO. This typically only happens on a filesystem shutdown.
*/
STATIC void
xfs_submit_ioend(
struct writeback_control *wbc,
- xfs_ioend_t *ioend)
+ xfs_ioend_t *ioend,
+ int fail)
{
xfs_ioend_t *head = ioend;
xfs_ioend_t *next;
@@ -506,6 +502,18 @@ xfs_submit_ioend(
next = ioend->io_list;
bio = NULL;
+ /*
+ * If we are failing the IO now, just mark the ioend with an
+ * error and finish it. This will run IO completion immediately
+ * as there is only one reference to the ioend at this point in
+ * time.
+ */
+ if (fail) {
+ ioend->io_error = -fail;
+ xfs_finish_ioend(ioend);
+ continue;
+ }
+
for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) {
if (!bio) {
@@ -1060,7 +1068,18 @@ xfs_vm_writepage(
xfs_start_page_writeback(page, 1, count);
- if (ioend && imap_valid) {
+ /* if there is no IO to be submitted for this page, we are done */
+ if (!ioend)
+ return 0;
+
+ ASSERT(iohead);
+
+ /*
+ * Any errors from this point onwards need to be reported through the IO
+ * completion path as we have marked the initial page as under writeback
+ * and unlocked it.
+ */
+ if (imap_valid) {
xfs_off_t end_index;
end_index = imap.br_startoff + imap.br_blockcount;
@@ -1079,20 +1098,15 @@ xfs_vm_writepage(
wbc, end_index);
}
- if (iohead) {
- /*
- * Reserve log space if we might write beyond the on-disk
- * inode size.
- */
- if (ioend->io_type != XFS_IO_UNWRITTEN &&
- xfs_ioend_is_append(ioend)) {
- err = xfs_setfilesize_trans_alloc(ioend);
- if (err)
- goto error;
- }
- xfs_submit_ioend(wbc, iohead);
- }
+ /*
+ * Reserve log space if we might write beyond the on-disk inode size.
+ */
+ err = 0;
+ if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend))
+ err = xfs_setfilesize_trans_alloc(ioend);
+
+ xfs_submit_ioend(wbc, iohead, err);
return 0;
@@ -1408,25 +1422,21 @@ xfs_vm_direct_IO(
size_t size = iov_length(iov, nr_segs);
/*
- * We need to preallocate a transaction for a size update
- * here. In the case that this write both updates the size
- * and converts at least on unwritten extent we will cancel
- * the still clean transaction after the I/O has finished.
+ * We cannot preallocate a size update transaction here as we
+ * don't know whether allocation is necessary or not. Hence we
+ * can only tell IO completion that one is necessary if we are
+ * not doing unwritten extent conversion.
*/
iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT);
- if (offset + size > XFS_I(inode)->i_d.di_size) {
- ret = xfs_setfilesize_trans_alloc(ioend);
- if (ret)
- goto out_destroy_ioend;
+ if (offset + size > XFS_I(inode)->i_d.di_size)
ioend->io_isdirect = 1;
- }
ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
offset, nr_segs,
xfs_get_blocks_direct,
xfs_end_io_direct_write, NULL, 0);
if (ret != -EIOCBQUEUED && iocb->private)
- goto out_trans_cancel;
+ goto out_destroy_ioend;
} else {
ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov,
offset, nr_segs,
@@ -1436,15 +1446,6 @@ xfs_vm_direct_IO(
return ret;
-out_trans_cancel:
- if (ioend->io_append_trans) {
- current_set_flags_nested(&ioend->io_append_trans->t_pflags,
- PF_FSTRANS);
- rwsem_acquire_read(
- &inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
- 0, 1, _THIS_IP_);
- xfs_trans_cancel(ioend->io_append_trans, 0);
- }
out_destroy_ioend:
xfs_destroy_ioend(ioend);
return ret;
@@ -1617,7 +1618,7 @@ xfs_vm_bmap(
trace_xfs_vm_bmap(XFS_I(inode));
xfs_ilock(ip, XFS_IOLOCK_SHARED);
- xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF);
+ filemap_write_and_wait(mapping);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
return generic_block_bmap(mapping, block, xfs_get_blocks);
}
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 0ca1f0be62d2..aaf472532b3c 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -903,11 +903,9 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
*/
dp = args->dp;
args->blkno = 0;
- error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
- XFS_ATTR_FORK);
+ error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
if (error)
- return(error);
- ASSERT(bp != NULL);
+ return error;
/*
* Look up the given attribute in the leaf block. Figure out if
@@ -1031,12 +1029,12 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
* Read in the block containing the "old" attr, then
* remove the "old" attr from that block (neat, huh!)
*/
- error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1,
- &bp, XFS_ATTR_FORK);
+ error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno,
+ -1, &bp);
if (error)
- return(error);
- ASSERT(bp != NULL);
- (void)xfs_attr_leaf_remove(bp, args);
+ return error;
+
+ xfs_attr_leaf_remove(bp, args);
/*
* If the result is small enough, shrink it all into the inode.
@@ -1100,20 +1098,17 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
*/
dp = args->dp;
args->blkno = 0;
- error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
- XFS_ATTR_FORK);
- if (error) {
- return(error);
- }
+ error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+ if (error)
+ return error;
- ASSERT(bp != NULL);
error = xfs_attr_leaf_lookup_int(bp, args);
if (error == ENOATTR) {
xfs_trans_brelse(args->trans, bp);
return(error);
}
- (void)xfs_attr_leaf_remove(bp, args);
+ xfs_attr_leaf_remove(bp, args);
/*
* If the result is small enough, shrink it all into the inode.
@@ -1155,12 +1150,12 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
struct xfs_buf *bp;
int error;
+ trace_xfs_attr_leaf_get(args);
+
args->blkno = 0;
- error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
- XFS_ATTR_FORK);
+ error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
if (error)
- return(error);
- ASSERT(bp != NULL);
+ return error;
error = xfs_attr_leaf_lookup_int(bp, args);
if (error != EEXIST) {
@@ -1181,22 +1176,15 @@ xfs_attr_leaf_get(xfs_da_args_t *args)
STATIC int
xfs_attr_leaf_list(xfs_attr_list_context_t *context)
{
- xfs_attr_leafblock_t *leaf;
int error;
struct xfs_buf *bp;
+ trace_xfs_attr_leaf_list(context);
+
context->cursor->blkno = 0;
- error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK);
+ error = xfs_attr_leaf_read(NULL, context->dp, 0, -1, &bp);
if (error)
return XFS_ERROR(error);
- ASSERT(bp != NULL);
- leaf = bp->b_addr;
- if (unlikely(leaf->hdr.info.magic != cpu_to_be16(XFS_ATTR_LEAF_MAGIC))) {
- XFS_CORRUPTION_ERROR("xfs_attr_leaf_list", XFS_ERRLEVEL_LOW,
- context->dp->i_mount, leaf);
- xfs_trans_brelse(NULL, bp);
- return XFS_ERROR(EFSCORRUPTED);
- }
error = xfs_attr_leaf_list_int(bp, context);
xfs_trans_brelse(NULL, bp);
@@ -1600,12 +1588,9 @@ xfs_attr_node_removename(xfs_da_args_t *args)
ASSERT(state->path.blk[0].bp);
state->path.blk[0].bp = NULL;
- error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp,
- XFS_ATTR_FORK);
+ error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp);
if (error)
goto out;
- ASSERT((((xfs_attr_leafblock_t *)bp->b_addr)->hdr.info.magic) ==
- cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) {
xfs_bmap_init(args->flist, args->firstblock);
@@ -1653,6 +1638,8 @@ xfs_attr_fillstate(xfs_da_state_t *state)
xfs_da_state_blk_t *blk;
int level;
+ trace_xfs_attr_fillstate(state->args);
+
/*
* Roll down the "path" in the state structure, storing the on-disk
* block number for those buffers in the "path".
@@ -1699,6 +1686,8 @@ xfs_attr_refillstate(xfs_da_state_t *state)
xfs_da_state_blk_t *blk;
int level, error;
+ trace_xfs_attr_refillstate(state->args);
+
/*
* Roll down the "path" in the state structure, storing the on-disk
* block number for those buffers in the "path".
@@ -1707,7 +1696,7 @@ xfs_attr_refillstate(xfs_da_state_t *state)
ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
if (blk->disk_blkno) {
- error = xfs_da_read_buf(state->args->trans,
+ error = xfs_da_node_read(state->args->trans,
state->args->dp,
blk->blkno, blk->disk_blkno,
&blk->bp, XFS_ATTR_FORK);
@@ -1726,7 +1715,7 @@ xfs_attr_refillstate(xfs_da_state_t *state)
ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH));
for (blk = path->blk, level = 0; level < path->active; blk++, level++) {
if (blk->disk_blkno) {
- error = xfs_da_read_buf(state->args->trans,
+ error = xfs_da_node_read(state->args->trans,
state->args->dp,
blk->blkno, blk->disk_blkno,
&blk->bp, XFS_ATTR_FORK);
@@ -1755,6 +1744,8 @@ xfs_attr_node_get(xfs_da_args_t *args)
int error, retval;
int i;
+ trace_xfs_attr_node_get(args);
+
state = xfs_da_state_alloc();
state->args = args;
state->mp = args->dp->i_mount;
@@ -1804,6 +1795,8 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
int error, i;
struct xfs_buf *bp;
+ trace_xfs_attr_node_list(context);
+
cursor = context->cursor;
cursor->initted = 1;
@@ -1814,7 +1807,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
*/
bp = NULL;
if (cursor->blkno > 0) {
- error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1,
+ error = xfs_da_node_read(NULL, context->dp, cursor->blkno, -1,
&bp, XFS_ATTR_FORK);
if ((error != 0) && (error != EFSCORRUPTED))
return(error);
@@ -1856,17 +1849,11 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
if (bp == NULL) {
cursor->blkno = 0;
for (;;) {
- error = xfs_da_read_buf(NULL, context->dp,
+ error = xfs_da_node_read(NULL, context->dp,
cursor->blkno, -1, &bp,
XFS_ATTR_FORK);
if (error)
return(error);
- if (unlikely(bp == NULL)) {
- XFS_ERROR_REPORT("xfs_attr_node_list(2)",
- XFS_ERRLEVEL_LOW,
- context->dp->i_mount);
- return(XFS_ERROR(EFSCORRUPTED));
- }
node = bp->b_addr;
if (node->hdr.info.magic ==
cpu_to_be16(XFS_ATTR_LEAF_MAGIC))
@@ -1907,14 +1894,6 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
*/
for (;;) {
leaf = bp->b_addr;
- if (unlikely(leaf->hdr.info.magic !=
- cpu_to_be16(XFS_ATTR_LEAF_MAGIC))) {
- XFS_CORRUPTION_ERROR("xfs_attr_node_list(4)",
- XFS_ERRLEVEL_LOW,
- context->dp->i_mount, leaf);
- xfs_trans_brelse(NULL, bp);
- return(XFS_ERROR(EFSCORRUPTED));
- }
error = xfs_attr_leaf_list_int(bp, context);
if (error) {
xfs_trans_brelse(NULL, bp);
@@ -1924,16 +1903,10 @@ xfs_attr_node_list(xfs_attr_list_context_t *context)
break;
cursor->blkno = be32_to_cpu(leaf->hdr.info.forw);
xfs_trans_brelse(NULL, bp);
- error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1,
- &bp, XFS_ATTR_FORK);
+ error = xfs_attr_leaf_read(NULL, context->dp, cursor->blkno, -1,
+ &bp);
if (error)
- return(error);
- if (unlikely((bp == NULL))) {
- XFS_ERROR_REPORT("xfs_attr_node_list(5)",
- XFS_ERRLEVEL_LOW,
- context->dp->i_mount);
- return(XFS_ERROR(EFSCORRUPTED));
- }
+ return error;
}
xfs_trans_brelse(NULL, bp);
return(0);
@@ -1959,6 +1932,8 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
int nmap, error, tmp, valuelen, blkcnt, i;
xfs_dablk_t lblkno;
+ trace_xfs_attr_rmtval_get(args);
+
ASSERT(!(args->flags & ATTR_KERNOVAL));
mp = args->dp->i_mount;
@@ -1980,7 +1955,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args)
dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock);
blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount);
error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
- dblkno, blkcnt, 0, &bp);
+ dblkno, blkcnt, 0, &bp, NULL);
if (error)
return(error);
@@ -2014,6 +1989,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args)
xfs_dablk_t lblkno;
int blkcnt, valuelen, nmap, error, tmp, committed;
+ trace_xfs_attr_rmtval_set(args);
+
dp = args->dp;
mp = dp->i_mount;
src = args->value;
@@ -2143,6 +2120,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args)
xfs_dablk_t lblkno;
int valuelen, blkcnt, nmap, error, done, committed;
+ trace_xfs_attr_rmtval_remove(args);
+
mp = args->dp->i_mount;
/*
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index d330111ca738..ee24993c7d12 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -57,7 +57,8 @@ STATIC int xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t which_block,
struct xfs_buf **bpp);
STATIC int xfs_attr_leaf_add_work(struct xfs_buf *leaf_buffer,
xfs_da_args_t *args, int freemap_index);
-STATIC void xfs_attr_leaf_compact(xfs_trans_t *tp, struct xfs_buf *leaf_buffer);
+STATIC void xfs_attr_leaf_compact(struct xfs_da_args *args,
+ struct xfs_buf *leaf_buffer);
STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state,
xfs_da_state_blk_t *blk1,
xfs_da_state_blk_t *blk2);
@@ -87,6 +88,52 @@ STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf,
xfs_mount_t *mp);
STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index);
+static void
+xfs_attr_leaf_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_attr_leaf_hdr *hdr = bp->b_addr;
+ int block_ok = 0;
+
+ block_ok = hdr->info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC);
+ if (!block_ok) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
+}
+
+static void
+xfs_attr_leaf_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_attr_leaf_verify(bp);
+}
+
+static void
+xfs_attr_leaf_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_attr_leaf_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_attr_leaf_buf_ops = {
+ .verify_read = xfs_attr_leaf_read_verify,
+ .verify_write = xfs_attr_leaf_write_verify,
+};
+
+int
+xfs_attr_leaf_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t bno,
+ xfs_daddr_t mappedbno,
+ struct xfs_buf **bpp)
+{
+ return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
+ XFS_ATTR_FORK, &xfs_attr_leaf_buf_ops);
+}
+
/*========================================================================
* Namespace helper routines
*========================================================================*/
@@ -869,17 +916,16 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args)
error = xfs_da_grow_inode(args, &blkno);
if (error)
goto out;
- error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1,
- XFS_ATTR_FORK);
+ error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp1);
if (error)
goto out;
- ASSERT(bp1 != NULL);
+
bp2 = NULL;
error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp2,
XFS_ATTR_FORK);
if (error)
goto out;
- ASSERT(bp2 != NULL);
+ bp2->b_ops = bp1->b_ops;
memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(dp->i_mount));
bp1 = NULL;
xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1);
@@ -933,7 +979,7 @@ xfs_attr_leaf_create(
XFS_ATTR_FORK);
if (error)
return(error);
- ASSERT(bp != NULL);
+ bp->b_ops = &xfs_attr_leaf_buf_ops;
leaf = bp->b_addr;
memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount));
hdr = &leaf->hdr;
@@ -1071,7 +1117,7 @@ xfs_attr_leaf_add(
* Compact the entries to coalesce free space.
* This may change the hdr->count via dropping INCOMPLETE entries.
*/
- xfs_attr_leaf_compact(args->trans, bp);
+ xfs_attr_leaf_compact(args, bp);
/*
* After compaction, the block is guaranteed to have only one
@@ -1102,6 +1148,8 @@ xfs_attr_leaf_add_work(
xfs_mount_t *mp;
int tmp, i;
+ trace_xfs_attr_leaf_add_work(args);
+
leaf = bp->b_addr;
ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
hdr = &leaf->hdr;
@@ -1214,15 +1262,17 @@ xfs_attr_leaf_add_work(
*/
STATIC void
xfs_attr_leaf_compact(
- struct xfs_trans *trans,
- struct xfs_buf *bp)
+ struct xfs_da_args *args,
+ struct xfs_buf *bp)
{
- xfs_attr_leafblock_t *leaf_s, *leaf_d;
- xfs_attr_leaf_hdr_t *hdr_s, *hdr_d;
- xfs_mount_t *mp;
- char *tmpbuffer;
+ xfs_attr_leafblock_t *leaf_s, *leaf_d;
+ xfs_attr_leaf_hdr_t *hdr_s, *hdr_d;
+ struct xfs_trans *trans = args->trans;
+ struct xfs_mount *mp = trans->t_mountp;
+ char *tmpbuffer;
+
+ trace_xfs_attr_leaf_compact(args);
- mp = trans->t_mountp;
tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP);
ASSERT(tmpbuffer != NULL);
memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp));
@@ -1291,6 +1341,7 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
leaf2 = blk2->bp->b_addr;
ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
+ ASSERT(leaf2->hdr.count == 0);
args = state->args;
trace_xfs_attr_leaf_rebalance(args);
@@ -1344,9 +1395,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
max = be16_to_cpu(hdr2->firstused)
- sizeof(xfs_attr_leaf_hdr_t);
max -= be16_to_cpu(hdr2->count) * sizeof(xfs_attr_leaf_entry_t);
- if (space > max) {
- xfs_attr_leaf_compact(args->trans, blk2->bp);
- }
+ if (space > max)
+ xfs_attr_leaf_compact(args, blk2->bp);
/*
* Move high entries from leaf1 to low end of leaf2.
@@ -1361,6 +1411,7 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
* I assert that since all callers pass in an empty
* second buffer, this code should never execute.
*/
+ ASSERT(0);
/*
* Figure the total bytes to be added to the destination leaf.
@@ -1376,9 +1427,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
max = be16_to_cpu(hdr1->firstused)
- sizeof(xfs_attr_leaf_hdr_t);
max -= be16_to_cpu(hdr1->count) * sizeof(xfs_attr_leaf_entry_t);
- if (space > max) {
- xfs_attr_leaf_compact(args->trans, blk1->bp);
- }
+ if (space > max)
+ xfs_attr_leaf_compact(args, blk1->bp);
/*
* Move low entries from leaf2 to high end of leaf1.
@@ -1422,10 +1472,24 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
args->index2 = 0;
args->blkno2 = blk2->blkno;
} else {
+ /*
+ * On a double leaf split, the original attr location
+ * is already stored in blkno2/index2, so don't
+ * overwrite it, otherwise we corrupt the tree.
+ */
blk2->index = blk1->index
- be16_to_cpu(leaf1->hdr.count);
- args->index = args->index2 = blk2->index;
- args->blkno = args->blkno2 = blk2->blkno;
+ args->index = blk2->index;
+ args->blkno = blk2->blkno;
+ if (!state->extravalid) {
+ /*
+ * set the new attr location to match the old
+ * one and let the higher level split code
+ * decide where in the leaf to place it.
+ */
+ args->index2 = blk2->index;
+ args->blkno2 = blk2->blkno;
+ }
}
} else {
ASSERT(state->inleaf == 1);
@@ -1561,6 +1625,8 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
xfs_dablk_t blkno;
struct xfs_buf *bp;
+ trace_xfs_attr_leaf_toosmall(state->args);
+
/*
* Check for the degenerate case of the block being over 50% full.
* If so, it's not worth even looking to see if we might be able
@@ -1620,18 +1686,16 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action)
blkno = be32_to_cpu(info->back);
if (blkno == 0)
continue;
- error = xfs_da_read_buf(state->args->trans, state->args->dp,
- blkno, -1, &bp, XFS_ATTR_FORK);
+ error = xfs_attr_leaf_read(state->args->trans, state->args->dp,
+ blkno, -1, &bp);
if (error)
return(error);
- ASSERT(bp != NULL);
leaf = (xfs_attr_leafblock_t *)info;
count = be16_to_cpu(leaf->hdr.count);
bytes = state->blocksize - (state->blocksize>>2);
bytes -= be16_to_cpu(leaf->hdr.usedbytes);
leaf = bp->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
count += be16_to_cpu(leaf->hdr.count);
bytes -= be16_to_cpu(leaf->hdr.usedbytes);
bytes -= count * sizeof(xfs_attr_leaf_entry_t);
@@ -1686,6 +1750,8 @@ xfs_attr_leaf_remove(
int tablesize, tmp, i;
xfs_mount_t *mp;
+ trace_xfs_attr_leaf_remove(args);
+
leaf = bp->b_addr;
ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
hdr = &leaf->hdr;
@@ -2495,15 +2561,11 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args)
/*
* Set up the operation.
*/
- error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
- XFS_ATTR_FORK);
- if (error) {
+ error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+ if (error)
return(error);
- }
- ASSERT(bp != NULL);
leaf = bp->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
ASSERT(args->index < be16_to_cpu(leaf->hdr.count));
ASSERT(args->index >= 0);
entry = &leaf->entries[ args->index ];
@@ -2560,15 +2622,11 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args)
/*
* Set up the operation.
*/
- error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp,
- XFS_ATTR_FORK);
- if (error) {
+ error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
+ if (error)
return(error);
- }
- ASSERT(bp != NULL);
leaf = bp->b_addr;
- ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
ASSERT(args->index < be16_to_cpu(leaf->hdr.count));
ASSERT(args->index >= 0);
entry = &leaf->entries[ args->index ];
@@ -2617,35 +2675,28 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args)
/*
* Read the block containing the "old" attr
*/
- error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp1,
- XFS_ATTR_FORK);
- if (error) {
- return(error);
- }
- ASSERT(bp1 != NULL);
+ error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1);
+ if (error)
+ return error;
/*
* Read the block containing the "new" attr, if it is different
*/
if (args->blkno2 != args->blkno) {
- error = xfs_da_read_buf(args->trans, args->dp, args->blkno2,
- -1, &bp2, XFS_ATTR_FORK);
- if (error) {
- return(error);
- }
- ASSERT(bp2 != NULL);
+ error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno2,
+ -1, &bp2);
+ if (error)
+ return error;
} else {
bp2 = bp1;
}
leaf1 = bp1->b_addr;
- ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
ASSERT(args->index < be16_to_cpu(leaf1->hdr.count));
ASSERT(args->index >= 0);
entry1 = &leaf1->entries[ args->index ];
leaf2 = bp2->b_addr;
- ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC));
ASSERT(args->index2 < be16_to_cpu(leaf2->hdr.count));
ASSERT(args->index2 >= 0);
entry2 = &leaf2->entries[ args->index2 ];
@@ -2730,7 +2781,7 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp)
* the extents in reverse order the extent containing
* block 0 must still be there.
*/
- error = xfs_da_read_buf(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK);
+ error = xfs_da_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK);
if (error)
return(error);
blkno = XFS_BUF_ADDR(bp);
@@ -2815,7 +2866,7 @@ xfs_attr_node_inactive(
* traversal of the tree so we may deal with many blocks
* before we come back to this one.
*/
- error = xfs_da_read_buf(*trans, dp, child_fsb, -2, &child_bp,
+ error = xfs_da_node_read(*trans, dp, child_fsb, -2, &child_bp,
XFS_ATTR_FORK);
if (error)
return(error);
@@ -2856,8 +2907,8 @@ xfs_attr_node_inactive(
* child block number.
*/
if ((i+1) < count) {
- error = xfs_da_read_buf(*trans, dp, 0, parent_blkno,
- &bp, XFS_ATTR_FORK);
+ error = xfs_da_node_read(*trans, dp, 0, parent_blkno,
+ &bp, XFS_ATTR_FORK);
if (error)
return(error);
child_fsb = be32_to_cpu(node->btree[i+1].before);
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h
index dea17722945e..77de139a58f0 100644
--- a/fs/xfs/xfs_attr_leaf.h
+++ b/fs/xfs/xfs_attr_leaf.h
@@ -261,4 +261,10 @@ int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp,
struct xfs_buf *leaf2_bp);
int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize,
int *local);
+int xfs_attr_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_dablk_t bno, xfs_daddr_t mappedbno,
+ struct xfs_buf **bpp);
+
+extern const struct xfs_buf_ops xfs_attr_leaf_buf_ops;
+
#endif /* __XFS_ATTR_LEAF_H__ */
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 848ffa77707b..0e92d12765d2 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -2437,6 +2437,7 @@ xfs_bmap_btalloc(
* Normal allocation, done through xfs_alloc_vextent.
*/
tryagain = isaligned = 0;
+ memset(&args, 0, sizeof(args));
args.tp = ap->tp;
args.mp = mp;
args.fsbno = ap->blkno;
@@ -2661,8 +2662,9 @@ xfs_bmap_btree_to_extents(
if ((error = xfs_btree_check_lptr(cur, cbno, 1)))
return error;
#endif
- if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp,
- XFS_BMAP_BTREE_REF)))
+ error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF,
+ &xfs_bmbt_buf_ops);
+ if (error)
return error;
cblock = XFS_BUF_TO_BLOCK(cbp);
if ((error = xfs_btree_check_block(cur, cblock, 0, cbp)))
@@ -3082,6 +3084,7 @@ xfs_bmap_extents_to_btree(
* Convert to a btree with two levels, one record in root.
*/
XFS_IFORK_FMT_SET(ip, whichfork, XFS_DINODE_FMT_BTREE);
+ memset(&args, 0, sizeof(args));
args.tp = tp;
args.mp = mp;
args.firstblock = *firstblock;
@@ -3121,6 +3124,7 @@ xfs_bmap_extents_to_btree(
/*
* Fill in the child block.
*/
+ abp->b_ops = &xfs_bmbt_buf_ops;
ablock = XFS_BUF_TO_BLOCK(abp);
ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC);
ablock->bb_level = 0;
@@ -3237,6 +3241,7 @@ xfs_bmap_local_to_extents(
xfs_buf_t *bp; /* buffer for extent block */
xfs_bmbt_rec_host_t *ep;/* extent record pointer */
+ memset(&args, 0, sizeof(args));
args.tp = tp;
args.mp = ip->i_mount;
args.firstblock = *firstblock;
@@ -3266,6 +3271,7 @@ xfs_bmap_local_to_extents(
ASSERT(args.len == 1);
*firstblock = args.fsbno;
bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
+ bp->b_ops = &xfs_bmbt_buf_ops;
memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1);
xfs_bmap_forkoff_reset(args.mp, ip, whichfork);
@@ -4075,8 +4081,9 @@ xfs_bmap_read_extents(
* pointer (leftmost) at each level.
*/
while (level-- > 0) {
- if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
- XFS_BMAP_BTREE_REF)))
+ error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+ XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
+ if (error)
return error;
block = XFS_BUF_TO_BLOCK(bp);
XFS_WANT_CORRUPTED_GOTO(
@@ -4121,7 +4128,8 @@ xfs_bmap_read_extents(
*/
nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
if (nextbno != NULLFSBLOCK)
- xfs_btree_reada_bufl(mp, nextbno, 1);
+ xfs_btree_reada_bufl(mp, nextbno, 1,
+ &xfs_bmbt_buf_ops);
/*
* Copy records into the extent records.
*/
@@ -4153,8 +4161,9 @@ xfs_bmap_read_extents(
*/
if (bno == NULLFSBLOCK)
break;
- if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
- XFS_BMAP_BTREE_REF)))
+ error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+ XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops);
+ if (error)
return error;
block = XFS_BUF_TO_BLOCK(bp);
}
@@ -4616,12 +4625,11 @@ xfs_bmapi_delay(
STATIC int
-xfs_bmapi_allocate(
- struct xfs_bmalloca *bma,
- int flags)
+__xfs_bmapi_allocate(
+ struct xfs_bmalloca *bma)
{
struct xfs_mount *mp = bma->ip->i_mount;
- int whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
+ int whichfork = (bma->flags & XFS_BMAPI_ATTRFORK) ?
XFS_ATTR_FORK : XFS_DATA_FORK;
struct xfs_ifork *ifp = XFS_IFORK_PTR(bma->ip, whichfork);
int tmp_logflags = 0;
@@ -4654,24 +4662,27 @@ xfs_bmapi_allocate(
* Indicate if this is the first user data in the file, or just any
* user data.
*/
- if (!(flags & XFS_BMAPI_METADATA)) {
+ if (!(bma->flags & XFS_BMAPI_METADATA)) {
bma->userdata = (bma->offset == 0) ?
XFS_ALLOC_INITIAL_USER_DATA : XFS_ALLOC_USERDATA;
}
- bma->minlen = (flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
+ bma->minlen = (bma->flags & XFS_BMAPI_CONTIG) ? bma->length : 1;
/*
* Only want to do the alignment at the eof if it is userdata and
* allocation length is larger than a stripe unit.
*/
if (mp->m_dalign && bma->length >= mp->m_dalign &&
- !(flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) {
+ !(bma->flags & XFS_BMAPI_METADATA) && whichfork == XFS_DATA_FORK) {
error = xfs_bmap_isaeof(bma, whichfork);
if (error)
return error;
}
+ if (bma->flags & XFS_BMAPI_STACK_SWITCH)
+ bma->stack_switch = 1;
+
error = xfs_bmap_alloc(bma);
if (error)
return error;
@@ -4706,7 +4717,7 @@ xfs_bmapi_allocate(
* A wasdelay extent has been initialized, so shouldn't be flagged
* as unwritten.
*/
- if (!bma->wasdel && (flags & XFS_BMAPI_PREALLOC) &&
+ if (!bma->wasdel && (bma->flags & XFS_BMAPI_PREALLOC) &&
xfs_sb_version_hasextflgbit(&mp->m_sb))
bma->got.br_state = XFS_EXT_UNWRITTEN;
@@ -4734,6 +4745,45 @@ xfs_bmapi_allocate(
return 0;
}
+static void
+xfs_bmapi_allocate_worker(
+ struct work_struct *work)
+{
+ struct xfs_bmalloca *args = container_of(work,
+ struct xfs_bmalloca, work);
+ unsigned long pflags;
+
+ /* we are in a transaction context here */
+ current_set_flags_nested(&pflags, PF_FSTRANS);
+
+ args->result = __xfs_bmapi_allocate(args);
+ complete(args->done);
+
+ current_restore_flags_nested(&pflags, PF_FSTRANS);
+}
+
+/*
+ * Some allocation requests often come in with little stack to work on. Push
+ * them off to a worker thread so there is lots of stack to use. Otherwise just
+ * call directly to avoid the context switch overhead here.
+ */
+int
+xfs_bmapi_allocate(
+ struct xfs_bmalloca *args)
+{
+ DECLARE_COMPLETION_ONSTACK(done);
+
+ if (!args->stack_switch)
+ return __xfs_bmapi_allocate(args);
+
+
+ args->done = &done;
+ INIT_WORK_ONSTACK(&args->work, xfs_bmapi_allocate_worker);
+ queue_work(xfs_alloc_wq, &args->work);
+ wait_for_completion(&done);
+ return args->result;
+}
+
STATIC int
xfs_bmapi_convert_unwritten(
struct xfs_bmalloca *bma,
@@ -4919,6 +4969,7 @@ xfs_bmapi_write(
bma.conv = !!(flags & XFS_BMAPI_CONVERT);
bma.wasdel = wasdelay;
bma.offset = bno;
+ bma.flags = flags;
/*
* There's a 32/64 bit type mismatch between the
@@ -4934,7 +4985,7 @@ xfs_bmapi_write(
ASSERT(len > 0);
ASSERT(bma.length > 0);
- error = xfs_bmapi_allocate(&bma, flags);
+ error = xfs_bmapi_allocate(&bma);
if (error)
goto error0;
if (bma.blkno == NULLFSBLOCK)
@@ -5554,7 +5605,7 @@ xfs_getbmap(
xfs_ilock(ip, XFS_IOLOCK_SHARED);
if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) {
- error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF);
+ error = -filemap_write_and_wait(VFS_I(ip)->i_mapping);
if (error)
goto out_unlock_iolock;
}
@@ -5823,15 +5874,16 @@ xfs_bmap_check_leaf_extents(
*/
while (level-- > 0) {
/* See if buf is in cur first */
+ bp_release = 0;
bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
- if (bp) {
- bp_release = 0;
- } else {
+ if (!bp) {
bp_release = 1;
+ error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
+ XFS_BMAP_BTREE_REF,
+ &xfs_bmbt_buf_ops);
+ if (error)
+ goto error_norelse;
}
- if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
- XFS_BMAP_BTREE_REF)))
- goto error_norelse;
block = XFS_BUF_TO_BLOCK(bp);
XFS_WANT_CORRUPTED_GOTO(
xfs_bmap_sanity_check(mp, bp, level),
@@ -5908,15 +5960,16 @@ xfs_bmap_check_leaf_extents(
if (bno == NULLFSBLOCK)
break;
+ bp_release = 0;
bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno));
- if (bp) {
- bp_release = 0;
- } else {
+ if (!bp) {
bp_release = 1;
+ error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
+ XFS_BMAP_BTREE_REF,
+ &xfs_bmbt_buf_ops);
+ if (error)
+ goto error_norelse;
}
- if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp,
- XFS_BMAP_BTREE_REF)))
- goto error_norelse;
block = XFS_BUF_TO_BLOCK(bp);
}
if (bp_release) {
@@ -6007,7 +6060,9 @@ xfs_bmap_count_tree(
struct xfs_btree_block *block, *nextblock;
int numrecs;
- if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF)))
+ error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF,
+ &xfs_bmbt_buf_ops);
+ if (error)
return error;
*count += 1;
block = XFS_BUF_TO_BLOCK(bp);
@@ -6016,8 +6071,10 @@ xfs_bmap_count_tree(
/* Not at node above leaves, count this level of nodes */
nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib);
while (nextbno != NULLFSBLOCK) {
- if ((error = xfs_btree_read_bufl(mp, tp, nextbno,
- 0, &nbp, XFS_BMAP_BTREE_REF)))
+ error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp,
+ XFS_BMAP_BTREE_REF,
+ &xfs_bmbt_buf_ops);
+ if (error)
return error;
*count += 1;
nextblock = XFS_BUF_TO_BLOCK(nbp);
@@ -6046,8 +6103,10 @@ xfs_bmap_count_tree(
if (nextbno == NULLFSBLOCK)
break;
bno = nextbno;
- if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
- XFS_BMAP_BTREE_REF)))
+ error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp,
+ XFS_BMAP_BTREE_REF,
+ &xfs_bmbt_buf_ops);
+ if (error)
return error;
*count += 1;
block = XFS_BUF_TO_BLOCK(bp);
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h
index 803b56d7ce16..5f469c3516eb 100644
--- a/fs/xfs/xfs_bmap.h
+++ b/fs/xfs/xfs_bmap.h
@@ -77,6 +77,7 @@ typedef struct xfs_bmap_free
* from written to unwritten, otherwise convert from unwritten to written.
*/
#define XFS_BMAPI_CONVERT 0x040
+#define XFS_BMAPI_STACK_SWITCH 0x080
#define XFS_BMAPI_FLAGS \
{ XFS_BMAPI_ENTIRE, "ENTIRE" }, \
@@ -85,7 +86,8 @@ typedef struct xfs_bmap_free
{ XFS_BMAPI_PREALLOC, "PREALLOC" }, \
{ XFS_BMAPI_IGSTATE, "IGSTATE" }, \
{ XFS_BMAPI_CONTIG, "CONTIG" }, \
- { XFS_BMAPI_CONVERT, "CONVERT" }
+ { XFS_BMAPI_CONVERT, "CONVERT" }, \
+ { XFS_BMAPI_STACK_SWITCH, "STACK_SWITCH" }
static inline int xfs_bmapi_aflag(int w)
@@ -133,6 +135,11 @@ typedef struct xfs_bmalloca {
char userdata;/* set if is user data */
char aeof; /* allocated space at eof */
char conv; /* overwriting unwritten extents */
+ char stack_switch;
+ int flags;
+ struct completion *done;
+ struct work_struct work;
+ int result;
} xfs_bmalloca_t;
/*
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index 862084a47a7e..061b45cbe614 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -36,6 +36,7 @@
#include "xfs_bmap.h"
#include "xfs_error.h"
#include "xfs_quota.h"
+#include "xfs_trace.h"
/*
* Determine the extent state.
@@ -707,6 +708,67 @@ xfs_bmbt_key_diff(
cur->bc_rec.b.br_startoff;
}
+static void
+xfs_bmbt_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ unsigned int level;
+ int lblock_ok; /* block passes checks */
+
+ /* magic number and level verification.
+ *
+ * We don't know what fork we belong to, so just verify that the level
+ * is less than the maximum of the two. Later checks will be more
+ * precise.
+ */
+ level = be16_to_cpu(block->bb_level);
+ lblock_ok = block->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC) &&
+ level < max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]);
+
+ /* numrecs verification */
+ lblock_ok = lblock_ok &&
+ be16_to_cpu(block->bb_numrecs) <= mp->m_bmap_dmxr[level != 0];
+
+ /* sibling pointer verification */
+ lblock_ok = lblock_ok &&
+ block->bb_u.l.bb_leftsib &&
+ (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) ||
+ XFS_FSB_SANITY_CHECK(mp,
+ be64_to_cpu(block->bb_u.l.bb_leftsib))) &&
+ block->bb_u.l.bb_rightsib &&
+ (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) ||
+ XFS_FSB_SANITY_CHECK(mp,
+ be64_to_cpu(block->bb_u.l.bb_rightsib)));
+
+ if (!lblock_ok) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
+}
+
+static void
+xfs_bmbt_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_bmbt_verify(bp);
+}
+
+static void
+xfs_bmbt_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_bmbt_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_bmbt_buf_ops = {
+ .verify_read = xfs_bmbt_read_verify,
+ .verify_write = xfs_bmbt_write_verify,
+};
+
+
#ifdef DEBUG
STATIC int
xfs_bmbt_keys_inorder(
@@ -746,6 +808,7 @@ static const struct xfs_btree_ops xfs_bmbt_ops = {
.init_rec_from_cur = xfs_bmbt_init_rec_from_cur,
.init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur,
.key_diff = xfs_bmbt_key_diff,
+ .buf_ops = &xfs_bmbt_buf_ops,
#ifdef DEBUG
.keys_inorder = xfs_bmbt_keys_inorder,
.recs_inorder = xfs_bmbt_recs_inorder,
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 0e66c4ea0f85..88469ca08696 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -236,5 +236,6 @@ extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
struct xfs_trans *, struct xfs_inode *, int);
+extern const struct xfs_buf_ops xfs_bmbt_buf_ops;
#endif /* __XFS_BMAP_BTREE_H__ */
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index e53e317b1582..db010408d701 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -266,9 +266,13 @@ xfs_btree_dup_cursor(
for (i = 0; i < new->bc_nlevels; i++) {
new->bc_ptrs[i] = cur->bc_ptrs[i];
new->bc_ra[i] = cur->bc_ra[i];
- if ((bp = cur->bc_bufs[i])) {
- if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
- XFS_BUF_ADDR(bp), mp->m_bsize, 0, &bp))) {
+ bp = cur->bc_bufs[i];
+ if (bp) {
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
+ XFS_BUF_ADDR(bp), mp->m_bsize,
+ 0, &bp,
+ cur->bc_ops->buf_ops);
+ if (error) {
xfs_btree_del_cursor(new, error);
*ncur = NULL;
return error;
@@ -609,25 +613,26 @@ xfs_btree_offsets(
* Get a buffer for the block, return it read in.
* Long-form addressing.
*/
-int /* error */
+int
xfs_btree_read_bufl(
- xfs_mount_t *mp, /* file system mount point */
- xfs_trans_t *tp, /* transaction pointer */
- xfs_fsblock_t fsbno, /* file system block number */
- uint lock, /* lock flags for read_buf */
- xfs_buf_t **bpp, /* buffer for fsbno */
- int refval) /* ref count value for buffer */
-{
- xfs_buf_t *bp; /* return value */
+ struct xfs_mount *mp, /* file system mount point */
+ struct xfs_trans *tp, /* transaction pointer */
+ xfs_fsblock_t fsbno, /* file system block number */
+ uint lock, /* lock flags for read_buf */
+ struct xfs_buf **bpp, /* buffer for fsbno */
+ int refval, /* ref count value for buffer */
+ const struct xfs_buf_ops *ops)
+{
+ struct xfs_buf *bp; /* return value */
xfs_daddr_t d; /* real disk block address */
- int error;
+ int error;
ASSERT(fsbno != NULLFSBLOCK);
d = XFS_FSB_TO_DADDR(mp, fsbno);
- if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
- mp->m_bsize, lock, &bp))) {
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d,
+ mp->m_bsize, lock, &bp, ops);
+ if (error)
return error;
- }
ASSERT(!xfs_buf_geterror(bp));
if (bp)
xfs_buf_set_ref(bp, refval);
@@ -642,15 +647,16 @@ xfs_btree_read_bufl(
/* ARGSUSED */
void
xfs_btree_reada_bufl(
- xfs_mount_t *mp, /* file system mount point */
- xfs_fsblock_t fsbno, /* file system block number */
- xfs_extlen_t count) /* count of filesystem blocks */
+ struct xfs_mount *mp, /* file system mount point */
+ xfs_fsblock_t fsbno, /* file system block number */
+ xfs_extlen_t count, /* count of filesystem blocks */
+ const struct xfs_buf_ops *ops)
{
xfs_daddr_t d;
ASSERT(fsbno != NULLFSBLOCK);
d = XFS_FSB_TO_DADDR(mp, fsbno);
- xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count);
+ xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
}
/*
@@ -660,17 +666,18 @@ xfs_btree_reada_bufl(
/* ARGSUSED */
void
xfs_btree_reada_bufs(
- xfs_mount_t *mp, /* file system mount point */
- xfs_agnumber_t agno, /* allocation group number */
- xfs_agblock_t agbno, /* allocation group block number */
- xfs_extlen_t count) /* count of filesystem blocks */
+ struct xfs_mount *mp, /* file system mount point */
+ xfs_agnumber_t agno, /* allocation group number */
+ xfs_agblock_t agbno, /* allocation group block number */
+ xfs_extlen_t count, /* count of filesystem blocks */
+ const struct xfs_buf_ops *ops)
{
xfs_daddr_t d;
ASSERT(agno != NULLAGNUMBER);
ASSERT(agbno != NULLAGBLOCK);
d = XFS_AGB_TO_DADDR(mp, agno, agbno);
- xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count);
+ xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops);
}
STATIC int
@@ -684,12 +691,14 @@ xfs_btree_readahead_lblock(
xfs_dfsbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib);
if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) {
- xfs_btree_reada_bufl(cur->bc_mp, left, 1);
+ xfs_btree_reada_bufl(cur->bc_mp, left, 1,
+ cur->bc_ops->buf_ops);
rval++;
}
if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) {
- xfs_btree_reada_bufl(cur->bc_mp, right, 1);
+ xfs_btree_reada_bufl(cur->bc_mp, right, 1,
+ cur->bc_ops->buf_ops);
rval++;
}
@@ -709,13 +718,13 @@ xfs_btree_readahead_sblock(
if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
- left, 1);
+ left, 1, cur->bc_ops->buf_ops);
rval++;
}
if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno,
- right, 1);
+ right, 1, cur->bc_ops->buf_ops);
rval++;
}
@@ -853,18 +862,22 @@ xfs_btree_set_sibling(
}
}
-STATIC void
+void
xfs_btree_init_block(
- struct xfs_btree_cur *cur,
- int level,
- int numrecs,
- struct xfs_btree_block *new) /* new block */
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ __u32 magic,
+ __u16 level,
+ __u16 numrecs,
+ unsigned int flags)
{
- new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]);
+ struct xfs_btree_block *new = XFS_BUF_TO_BLOCK(bp);
+
+ new->bb_magic = cpu_to_be32(magic);
new->bb_level = cpu_to_be16(level);
new->bb_numrecs = cpu_to_be16(numrecs);
- if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+ if (flags & XFS_BTREE_LONG_PTRS) {
new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO);
new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO);
} else {
@@ -873,6 +886,17 @@ xfs_btree_init_block(
}
}
+STATIC void
+xfs_btree_init_block_cur(
+ struct xfs_btree_cur *cur,
+ int level,
+ int numrecs,
+ struct xfs_buf *bp)
+{
+ xfs_btree_init_block(cur->bc_mp, bp, xfs_magics[cur->bc_btnum],
+ level, numrecs, cur->bc_flags);
+}
+
/*
* Return true if ptr is the last record in the btree and
* we need to track updates to this record. The decision
@@ -972,6 +996,7 @@ xfs_btree_get_buf_block(
if (!*bpp)
return ENOMEM;
+ (*bpp)->b_ops = cur->bc_ops->buf_ops;
*block = XFS_BUF_TO_BLOCK(*bpp);
return 0;
}
@@ -998,19 +1023,15 @@ xfs_btree_read_buf_block(
d = xfs_btree_ptr_to_daddr(cur, ptr);
error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d,
- mp->m_bsize, flags, bpp);
+ mp->m_bsize, flags, bpp,
+ cur->bc_ops->buf_ops);
if (error)
return error;
ASSERT(!xfs_buf_geterror(*bpp));
-
xfs_btree_set_refs(cur, *bpp);
*block = XFS_BUF_TO_BLOCK(*bpp);
-
- error = xfs_btree_check_block(cur, *block, level, *bpp);
- if (error)
- xfs_trans_brelse(cur->bc_tp, *bpp);
- return error;
+ return 0;
}
/*
@@ -2183,7 +2204,7 @@ xfs_btree_split(
goto error0;
/* Fill in the btree header for the new right block. */
- xfs_btree_init_block(cur, xfs_btree_get_level(left), 0, right);
+ xfs_btree_init_block_cur(cur, xfs_btree_get_level(left), 0, rbp);
/*
* Split the entries between the old and the new block evenly.
@@ -2492,7 +2513,7 @@ xfs_btree_new_root(
nptr = 2;
}
/* Fill in the new block's btree header and log it. */
- xfs_btree_init_block(cur, cur->bc_nlevels, 2, new);
+ xfs_btree_init_block_cur(cur, cur->bc_nlevels, 2, nbp);
xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
!xfs_btree_ptr_is_null(cur, &rptr));
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index 5b240de104c0..f932897194eb 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -188,6 +188,8 @@ struct xfs_btree_ops {
__int64_t (*key_diff)(struct xfs_btree_cur *cur,
union xfs_btree_key *key);
+ const struct xfs_buf_ops *buf_ops;
+
#ifdef DEBUG
/* check that k1 is lower than k2 */
int (*keys_inorder)(struct xfs_btree_cur *cur,
@@ -355,7 +357,8 @@ xfs_btree_read_bufl(
xfs_fsblock_t fsbno, /* file system block number */
uint lock, /* lock flags for read_buf */
struct xfs_buf **bpp, /* buffer for fsbno */
- int refval);/* ref count value for buffer */
+ int refval, /* ref count value for buffer */
+ const struct xfs_buf_ops *ops);
/*
* Read-ahead the block, don't wait for it, don't return a buffer.
@@ -365,7 +368,8 @@ void /* error */
xfs_btree_reada_bufl(
struct xfs_mount *mp, /* file system mount point */
xfs_fsblock_t fsbno, /* file system block number */
- xfs_extlen_t count); /* count of filesystem blocks */
+ xfs_extlen_t count, /* count of filesystem blocks */
+ const struct xfs_buf_ops *ops);
/*
* Read-ahead the block, don't wait for it, don't return a buffer.
@@ -376,8 +380,20 @@ xfs_btree_reada_bufs(
struct xfs_mount *mp, /* file system mount point */
xfs_agnumber_t agno, /* allocation group number */
xfs_agblock_t agbno, /* allocation group block number */
- xfs_extlen_t count); /* count of filesystem blocks */
+ xfs_extlen_t count, /* count of filesystem blocks */
+ const struct xfs_buf_ops *ops);
+/*
+ * Initialise a new btree block header
+ */
+void
+xfs_btree_init_block(
+ struct xfs_mount *mp,
+ struct xfs_buf *bp,
+ __u32 magic,
+ __u16 level,
+ __u16 numrecs,
+ unsigned int flags);
/*
* Common btree core entry points.
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 933b7930b863..26673a0b20e7 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -569,7 +569,9 @@ found:
*/
if (bp->b_flags & XBF_STALE) {
ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0);
+ ASSERT(bp->b_iodone == NULL);
bp->b_flags &= _XBF_KMEM | _XBF_PAGES;
+ bp->b_ops = NULL;
}
trace_xfs_buf_find(bp, flags, _RET_IP_);
@@ -654,7 +656,8 @@ xfs_buf_read_map(
struct xfs_buftarg *target,
struct xfs_buf_map *map,
int nmaps,
- xfs_buf_flags_t flags)
+ xfs_buf_flags_t flags,
+ const struct xfs_buf_ops *ops)
{
struct xfs_buf *bp;
@@ -666,6 +669,7 @@ xfs_buf_read_map(
if (!XFS_BUF_ISDONE(bp)) {
XFS_STATS_INC(xb_get_read);
+ bp->b_ops = ops;
_xfs_buf_read(bp, flags);
} else if (flags & XBF_ASYNC) {
/*
@@ -691,13 +695,14 @@ void
xfs_buf_readahead_map(
struct xfs_buftarg *target,
struct xfs_buf_map *map,
- int nmaps)
+ int nmaps,
+ const struct xfs_buf_ops *ops)
{
if (bdi_read_congested(target->bt_bdi))
return;
xfs_buf_read_map(target, map, nmaps,
- XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD);
+ XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, ops);
}
/*
@@ -709,10 +714,10 @@ xfs_buf_read_uncached(
struct xfs_buftarg *target,
xfs_daddr_t daddr,
size_t numblks,
- int flags)
+ int flags,
+ const struct xfs_buf_ops *ops)
{
- xfs_buf_t *bp;
- int error;
+ struct xfs_buf *bp;
bp = xfs_buf_get_uncached(target, numblks, flags);
if (!bp)
@@ -723,13 +728,10 @@ xfs_buf_read_uncached(
bp->b_bn = daddr;
bp->b_maps[0].bm_bn = daddr;
bp->b_flags |= XBF_READ;
+ bp->b_ops = ops;
xfsbdstrat(target->bt_mount, bp);
- error = xfs_buf_iowait(bp);
- if (error) {
- xfs_buf_relse(bp);
- return NULL;
- }
+ xfs_buf_iowait(bp);
return bp;
}
@@ -999,27 +1001,37 @@ STATIC void
xfs_buf_iodone_work(
struct work_struct *work)
{
- xfs_buf_t *bp =
+ struct xfs_buf *bp =
container_of(work, xfs_buf_t, b_iodone_work);
+ bool read = !!(bp->b_flags & XBF_READ);
+
+ bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
+ if (read && bp->b_ops)
+ bp->b_ops->verify_read(bp);
if (bp->b_iodone)
(*(bp->b_iodone))(bp);
else if (bp->b_flags & XBF_ASYNC)
xfs_buf_relse(bp);
+ else {
+ ASSERT(read && bp->b_ops);
+ complete(&bp->b_iowait);
+ }
}
void
xfs_buf_ioend(
- xfs_buf_t *bp,
- int schedule)
+ struct xfs_buf *bp,
+ int schedule)
{
+ bool read = !!(bp->b_flags & XBF_READ);
+
trace_xfs_buf_iodone(bp, _RET_IP_);
- bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
if (bp->b_error == 0)
bp->b_flags |= XBF_DONE;
- if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) {
+ if (bp->b_iodone || (read && bp->b_ops) || (bp->b_flags & XBF_ASYNC)) {
if (schedule) {
INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work);
queue_work(xfslogd_workqueue, &bp->b_iodone_work);
@@ -1027,6 +1039,7 @@ xfs_buf_ioend(
xfs_buf_iodone_work(&bp->b_iodone_work);
}
} else {
+ bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD);
complete(&bp->b_iowait);
}
}
@@ -1197,9 +1210,14 @@ xfs_buf_bio_end_io(
{
xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private;
- xfs_buf_ioerror(bp, -error);
+ /*
+ * don't overwrite existing errors - otherwise we can lose errors on
+ * buffers that require multiple bios to complete.
+ */
+ if (!bp->b_error)
+ xfs_buf_ioerror(bp, -error);
- if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
+ if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ))
invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp));
_xfs_buf_ioend(bp, 1);
@@ -1279,6 +1297,11 @@ next_chunk:
if (size)
goto next_chunk;
} else {
+ /*
+ * This is guaranteed not to be the last io reference count
+ * because the caller (xfs_buf_iorequest) holds a count itself.
+ */
+ atomic_dec(&bp->b_io_remaining);
xfs_buf_ioerror(bp, EIO);
bio_put(bio);
}
@@ -1304,6 +1327,20 @@ _xfs_buf_ioapply(
rw |= REQ_FUA;
if (bp->b_flags & XBF_FLUSH)
rw |= REQ_FLUSH;
+
+ /*
+ * Run the write verifier callback function if it exists. If
+ * this function fails it will mark the buffer with an error and
+ * the IO should not be dispatched.
+ */
+ if (bp->b_ops) {
+ bp->b_ops->verify_write(bp);
+ if (bp->b_error) {
+ xfs_force_shutdown(bp->b_target->bt_mount,
+ SHUTDOWN_CORRUPT_INCORE);
+ return;
+ }
+ }
} else if (bp->b_flags & XBF_READ_AHEAD) {
rw = READA;
} else {
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 7c0b6a0a1557..23f5642480bb 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -100,6 +100,7 @@ typedef struct xfs_buftarg {
struct xfs_buf;
typedef void (*xfs_buf_iodone_t)(struct xfs_buf *);
+
#define XB_PAGES 2
struct xfs_buf_map {
@@ -110,6 +111,11 @@ struct xfs_buf_map {
#define DEFINE_SINGLE_BUF_MAP(map, blkno, numblk) \
struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) };
+struct xfs_buf_ops {
+ void (*verify_read)(struct xfs_buf *);
+ void (*verify_write)(struct xfs_buf *);
+};
+
typedef struct xfs_buf {
/*
* first cacheline holds all the fields needed for an uncontended cache
@@ -153,13 +159,13 @@ typedef struct xfs_buf {
unsigned int b_page_count; /* size of page array */
unsigned int b_offset; /* page offset in first page */
unsigned short b_error; /* error code on I/O */
+ const struct xfs_buf_ops *b_ops;
#ifdef XFS_BUF_LOCK_TRACKING
int b_last_holder;
#endif
} xfs_buf_t;
-
/* Finding and Reading Buffers */
struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target,
struct xfs_buf_map *map, int nmaps,
@@ -196,9 +202,11 @@ struct xfs_buf *xfs_buf_get_map(struct xfs_buftarg *target,
xfs_buf_flags_t flags);
struct xfs_buf *xfs_buf_read_map(struct xfs_buftarg *target,
struct xfs_buf_map *map, int nmaps,
- xfs_buf_flags_t flags);
+ xfs_buf_flags_t flags,
+ const struct xfs_buf_ops *ops);
void xfs_buf_readahead_map(struct xfs_buftarg *target,
- struct xfs_buf_map *map, int nmaps);
+ struct xfs_buf_map *map, int nmaps,
+ const struct xfs_buf_ops *ops);
static inline struct xfs_buf *
xfs_buf_get(
@@ -216,20 +224,22 @@ xfs_buf_read(
struct xfs_buftarg *target,
xfs_daddr_t blkno,
size_t numblks,
- xfs_buf_flags_t flags)
+ xfs_buf_flags_t flags,
+ const struct xfs_buf_ops *ops)
{
DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
- return xfs_buf_read_map(target, &map, 1, flags);
+ return xfs_buf_read_map(target, &map, 1, flags, ops);
}
static inline void
xfs_buf_readahead(
struct xfs_buftarg *target,
xfs_daddr_t blkno,
- size_t numblks)
+ size_t numblks,
+ const struct xfs_buf_ops *ops)
{
DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
- return xfs_buf_readahead_map(target, &map, 1);
+ return xfs_buf_readahead_map(target, &map, 1, ops);
}
struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks);
@@ -239,7 +249,8 @@ int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length);
struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks,
int flags);
struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target,
- xfs_daddr_t daddr, size_t numblks, int flags);
+ xfs_daddr_t daddr, size_t numblks, int flags,
+ const struct xfs_buf_ops *ops);
void xfs_buf_hold(struct xfs_buf *bp);
/* Releasing Buffers */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index a8d0ed911196..becf4a97efc6 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -526,7 +526,25 @@ xfs_buf_item_unpin(
}
xfs_buf_relse(bp);
} else if (freed && remove) {
+ /*
+ * There are currently two references to the buffer - the active
+ * LRU reference and the buf log item. What we are about to do
+ * here - simulate a failed IO completion - requires 3
+ * references.
+ *
+ * The LRU reference is removed by the xfs_buf_stale() call. The
+ * buf item reference is removed by the xfs_buf_iodone()
+ * callback that is run by xfs_buf_do_callbacks() during ioend
+ * processing (via the bp->b_iodone callback), and then finally
+ * the ioend processing will drop the IO reference if the buffer
+ * is marked XBF_ASYNC.
+ *
+ * Hence we need to take an additional reference here so that IO
+ * completion processing doesn't free the buffer prematurely.
+ */
xfs_buf_lock(bp);
+ xfs_buf_hold(bp);
+ bp->b_flags |= XBF_ASYNC;
xfs_buf_ioerror(bp, EIO);
XFS_BUF_UNDONE(bp);
xfs_buf_stale(bp);
diff --git a/fs/xfs/xfs_cksum.h b/fs/xfs/xfs_cksum.h
new file mode 100644
index 000000000000..fad1676ad8cd
--- /dev/null
+++ b/fs/xfs/xfs_cksum.h
@@ -0,0 +1,63 @@
+#ifndef _XFS_CKSUM_H
+#define _XFS_CKSUM_H 1
+
+#define XFS_CRC_SEED (~(__uint32_t)0)
+
+/*
+ * Calculate the intermediate checksum for a buffer that has the CRC field
+ * inside it. The offset of the 32bit crc fields is passed as the
+ * cksum_offset parameter.
+ */
+static inline __uint32_t
+xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset)
+{
+ __uint32_t zero = 0;
+ __uint32_t crc;
+
+ /* Calculate CRC up to the checksum. */
+ crc = crc32c(XFS_CRC_SEED, buffer, cksum_offset);
+
+ /* Skip checksum field */
+ crc = crc32c(crc, &zero, sizeof(__u32));
+
+ /* Calculate the rest of the CRC. */
+ return crc32c(crc, &buffer[cksum_offset + sizeof(__be32)],
+ length - (cksum_offset + sizeof(__be32)));
+}
+
+/*
+ * Convert the intermediate checksum to the final ondisk format.
+ *
+ * The CRC32c calculation uses LE format even on BE machines, but returns the
+ * result in host endian format. Hence we need to byte swap it back to LE format
+ * so that it is consistent on disk.
+ */
+static inline __le32
+xfs_end_cksum(__uint32_t crc)
+{
+ return ~cpu_to_le32(crc);
+}
+
+/*
+ * Helper to generate the checksum for a buffer.
+ */
+static inline void
+xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset)
+{
+ __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
+
+ *(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc);
+}
+
+/*
+ * Helper to verify the checksum for a buffer.
+ */
+static inline int
+xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset)
+{
+ __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset);
+
+ return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc);
+}
+
+#endif /* _XFS_CKSUM_H */
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index 7bfb7dd334fc..4d7696a02418 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -91,6 +91,84 @@ STATIC int xfs_da_blk_unlink(xfs_da_state_t *state,
xfs_da_state_blk_t *save_blk);
STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state);
+static void
+xfs_da_node_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_da_node_hdr *hdr = bp->b_addr;
+ int block_ok = 0;
+
+ block_ok = hdr->info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC);
+ block_ok = block_ok &&
+ be16_to_cpu(hdr->level) > 0 &&
+ be16_to_cpu(hdr->count) > 0 ;
+ if (!block_ok) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
+
+}
+
+static void
+xfs_da_node_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_da_node_verify(bp);
+}
+
+/*
+ * leaf/node format detection on trees is sketchy, so a node read can be done on
+ * leaf level blocks when detection identifies the tree as a node format tree
+ * incorrectly. In this case, we need to swap the verifier to match the correct
+ * format of the block being read.
+ */
+static void
+xfs_da_node_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_da_blkinfo *info = bp->b_addr;
+
+ switch (be16_to_cpu(info->magic)) {
+ case XFS_DA_NODE_MAGIC:
+ xfs_da_node_verify(bp);
+ break;
+ case XFS_ATTR_LEAF_MAGIC:
+ bp->b_ops = &xfs_attr_leaf_buf_ops;
+ bp->b_ops->verify_read(bp);
+ return;
+ case XFS_DIR2_LEAFN_MAGIC:
+ bp->b_ops = &xfs_dir2_leafn_buf_ops;
+ bp->b_ops->verify_read(bp);
+ return;
+ default:
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW,
+ mp, info);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ break;
+ }
+}
+
+const struct xfs_buf_ops xfs_da_node_buf_ops = {
+ .verify_read = xfs_da_node_read_verify,
+ .verify_write = xfs_da_node_write_verify,
+};
+
+
+int
+xfs_da_node_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t bno,
+ xfs_daddr_t mappedbno,
+ struct xfs_buf **bpp,
+ int which_fork)
+{
+ return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp,
+ which_fork, &xfs_da_node_buf_ops);
+}
+
/*========================================================================
* Routines used for growing the Btree.
*========================================================================*/
@@ -125,6 +203,7 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level,
xfs_trans_log_buf(tp, bp,
XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr)));
+ bp->b_ops = &xfs_da_node_buf_ops;
*bpp = bp;
return(0);
}
@@ -324,6 +403,8 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1,
}
memcpy(node, oldroot, size);
xfs_trans_log_buf(tp, bp, 0, size - 1);
+
+ bp->b_ops = blk1->bp->b_ops;
blk1->bp = bp;
blk1->blkno = blkno;
@@ -746,7 +827,7 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
*/
child = be32_to_cpu(oldroot->btree[0].before);
ASSERT(child != 0);
- error = xfs_da_read_buf(args->trans, args->dp, child, -1, &bp,
+ error = xfs_da_node_read(args->trans, args->dp, child, -1, &bp,
args->whichfork);
if (error)
return(error);
@@ -754,7 +835,14 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk)
xfs_da_blkinfo_onlychild_validate(bp->b_addr,
be16_to_cpu(oldroot->hdr.level));
+ /*
+ * This could be copying a leaf back into the root block in the case of
+ * there only being a single leaf block left in the tree. Hence we have
+ * to update the b_ops pointer as well to match the buffer type change
+ * that could occur.
+ */
memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize);
+ root_blk->bp->b_ops = bp->b_ops;
xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1);
error = xfs_da_shrink_inode(args, child, bp);
return(error);
@@ -779,6 +867,8 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
xfs_dablk_t blkno;
struct xfs_buf *bp;
+ trace_xfs_da_node_toosmall(state->args);
+
/*
* Check for the degenerate case of the block being over 50% full.
* If so, it's not worth even looking to see if we might be able
@@ -835,7 +925,7 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action)
blkno = be32_to_cpu(info->back);
if (blkno == 0)
continue;
- error = xfs_da_read_buf(state->args->trans, state->args->dp,
+ error = xfs_da_node_read(state->args->trans, state->args->dp,
blkno, -1, &bp, state->args->whichfork);
if (error)
return(error);
@@ -900,6 +990,8 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path)
xfs_dahash_t lasthash=0;
int level, count;
+ trace_xfs_da_fixhashpath(state->args);
+
level = path->active-1;
blk = &path->blk[ level ];
switch (blk->magic) {
@@ -1079,7 +1171,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result)
* Read the next node down in the tree.
*/
blk->blkno = blkno;
- error = xfs_da_read_buf(args->trans, args->dp, blkno,
+ error = xfs_da_node_read(args->trans, args->dp, blkno,
-1, &blk->bp, args->whichfork);
if (error) {
blk->blkno = 0;
@@ -1241,7 +1333,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
new_info->forw = cpu_to_be32(old_blk->blkno);
new_info->back = old_info->back;
if (old_info->back) {
- error = xfs_da_read_buf(args->trans, args->dp,
+ error = xfs_da_node_read(args->trans, args->dp,
be32_to_cpu(old_info->back),
-1, &bp, args->whichfork);
if (error)
@@ -1262,7 +1354,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
new_info->forw = old_info->forw;
new_info->back = cpu_to_be32(old_blk->blkno);
if (old_info->forw) {
- error = xfs_da_read_buf(args->trans, args->dp,
+ error = xfs_da_node_read(args->trans, args->dp,
be32_to_cpu(old_info->forw),
-1, &bp, args->whichfork);
if (error)
@@ -1362,7 +1454,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
trace_xfs_da_unlink_back(args);
save_info->back = drop_info->back;
if (drop_info->back) {
- error = xfs_da_read_buf(args->trans, args->dp,
+ error = xfs_da_node_read(args->trans, args->dp,
be32_to_cpu(drop_info->back),
-1, &bp, args->whichfork);
if (error)
@@ -1379,7 +1471,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk,
trace_xfs_da_unlink_forward(args);
save_info->forw = drop_info->forw;
if (drop_info->forw) {
- error = xfs_da_read_buf(args->trans, args->dp,
+ error = xfs_da_node_read(args->trans, args->dp,
be32_to_cpu(drop_info->forw),
-1, &bp, args->whichfork);
if (error)
@@ -1417,6 +1509,8 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
xfs_dablk_t blkno=0;
int level, error;
+ trace_xfs_da_path_shift(state->args);
+
/*
* Roll up the Btree looking for the first block where our
* current index is not at the edge of the block. Note that
@@ -1463,8 +1557,8 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
* Read the next child block.
*/
blk->blkno = blkno;
- error = xfs_da_read_buf(args->trans, args->dp, blkno, -1,
- &blk->bp, args->whichfork);
+ error = xfs_da_node_read(args->trans, args->dp, blkno, -1,
+ &blk->bp, args->whichfork);
if (error)
return(error);
ASSERT(blk->bp != NULL);
@@ -1727,7 +1821,8 @@ xfs_da_swap_lastblock(
* Read the last block in the btree space.
*/
last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs;
- if ((error = xfs_da_read_buf(tp, ip, last_blkno, -1, &last_buf, w)))
+ error = xfs_da_node_read(tp, ip, last_blkno, -1, &last_buf, w);
+ if (error)
return error;
/*
* Copy the last block into the dead buffer and log it.
@@ -1753,7 +1848,8 @@ xfs_da_swap_lastblock(
* If the moved block has a left sibling, fix up the pointers.
*/
if ((sib_blkno = be32_to_cpu(dead_info->back))) {
- if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w)))
+ error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w);
+ if (error)
goto done;
sib_info = sib_buf->b_addr;
if (unlikely(
@@ -1774,7 +1870,8 @@ xfs_da_swap_lastblock(
* If the moved block has a right sibling, fix up the pointers.
*/
if ((sib_blkno = be32_to_cpu(dead_info->forw))) {
- if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w)))
+ error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w);
+ if (error)
goto done;
sib_info = sib_buf->b_addr;
if (unlikely(
@@ -1797,7 +1894,8 @@ xfs_da_swap_lastblock(
* Walk down the tree looking for the parent of the moved block.
*/
for (;;) {
- if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w)))
+ error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w);
+ if (error)
goto done;
par_node = par_buf->b_addr;
if (unlikely(par_node->hdr.info.magic !=
@@ -1847,7 +1945,8 @@ xfs_da_swap_lastblock(
error = XFS_ERROR(EFSCORRUPTED);
goto done;
}
- if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w)))
+ error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w);
+ if (error)
goto done;
par_node = par_buf->b_addr;
if (unlikely(
@@ -2133,7 +2232,8 @@ xfs_da_read_buf(
xfs_dablk_t bno,
xfs_daddr_t mappedbno,
struct xfs_buf **bpp,
- int whichfork)
+ int whichfork,
+ const struct xfs_buf_ops *ops)
{
struct xfs_buf *bp;
struct xfs_buf_map map;
@@ -2155,7 +2255,7 @@ xfs_da_read_buf(
error = xfs_trans_read_buf_map(dp->i_mount, trans,
dp->i_mount->m_ddev_targp,
- mapp, nmap, 0, &bp);
+ mapp, nmap, 0, &bp, ops);
if (error)
goto out_free;
@@ -2211,9 +2311,10 @@ xfs_da_reada_buf(
struct xfs_trans *trans,
struct xfs_inode *dp,
xfs_dablk_t bno,
- int whichfork)
+ xfs_daddr_t mappedbno,
+ int whichfork,
+ const struct xfs_buf_ops *ops)
{
- xfs_daddr_t mappedbno = -1;
struct xfs_buf_map map;
struct xfs_buf_map *mapp;
int nmap;
@@ -2221,7 +2322,7 @@ xfs_da_reada_buf(
mapp = &map;
nmap = 1;
- error = xfs_dabuf_map(trans, dp, bno, -1, whichfork,
+ error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork,
&mapp, &nmap);
if (error) {
/* mapping a hole is not an error, but we don't continue */
@@ -2231,7 +2332,7 @@ xfs_da_reada_buf(
}
mappedbno = mapp[0].bm_bn;
- xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap);
+ xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, ops);
out_free:
if (mapp != &map)
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 132adafb041e..ee5170c46ae1 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -18,7 +18,6 @@
#ifndef __XFS_DA_BTREE_H__
#define __XFS_DA_BTREE_H__
-struct xfs_buf;
struct xfs_bmap_free;
struct xfs_inode;
struct xfs_mount;
@@ -214,6 +213,9 @@ int xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path,
*/
int xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk,
xfs_da_state_blk_t *new_blk);
+int xfs_da_node_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_dablk_t bno, xfs_daddr_t mappedbno,
+ struct xfs_buf **bpp, int which_fork);
/*
* Utility routines.
@@ -226,9 +228,11 @@ int xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp,
struct xfs_buf **bp, int whichfork);
int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp,
xfs_dablk_t bno, xfs_daddr_t mappedbno,
- struct xfs_buf **bpp, int whichfork);
+ struct xfs_buf **bpp, int whichfork,
+ const struct xfs_buf_ops *ops);
xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp,
- xfs_dablk_t bno, int whichfork);
+ xfs_dablk_t bno, xfs_daddr_t mapped_bno,
+ int whichfork, const struct xfs_buf_ops *ops);
int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno,
struct xfs_buf *dead_buf);
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c
index b9b8646e62db..d0e9c74d3d96 100644
--- a/fs/xfs/xfs_dfrag.c
+++ b/fs/xfs/xfs_dfrag.c
@@ -246,12 +246,10 @@ xfs_swap_extents(
goto out_unlock;
}
- if (VN_CACHED(VFS_I(tip)) != 0) {
- error = xfs_flushinval_pages(tip, 0, -1,
- FI_REMAPF_LOCKED);
- if (error)
- goto out_unlock;
- }
+ error = -filemap_write_and_wait(VFS_I(ip)->i_mapping);
+ if (error)
+ goto out_unlock;
+ truncate_pagecache_range(VFS_I(ip), 0, -1);
/* Verify O_DIRECT for ftmp */
if (VN_CACHED(VFS_I(tip)) != 0) {
@@ -315,8 +313,7 @@ xfs_swap_extents(
* are safe. We don't really care if non-io related
* fields change.
*/
-
- xfs_tosspages(ip, 0, -1, FI_REMAPF);
+ truncate_pagecache_range(VFS_I(ip), 0, -1);
tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT);
if ((error = xfs_trans_reserve(tp, 0,
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c
index e93ca8f054f4..7536faaa61e7 100644
--- a/fs/xfs/xfs_dir2_block.c
+++ b/fs/xfs/xfs_dir2_block.c
@@ -56,6 +56,214 @@ xfs_dir_startup(void)
xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2);
}
+static void
+xfs_dir2_block_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_dir2_data_hdr *hdr = bp->b_addr;
+ int block_ok = 0;
+
+ block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
+ block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0;
+
+ if (!block_ok) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
+}
+
+static void
+xfs_dir2_block_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_dir2_block_verify(bp);
+}
+
+static void
+xfs_dir2_block_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_dir2_block_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_dir2_block_buf_ops = {
+ .verify_read = xfs_dir2_block_read_verify,
+ .verify_write = xfs_dir2_block_write_verify,
+};
+
+static int
+xfs_dir2_block_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ struct xfs_buf **bpp)
+{
+ struct xfs_mount *mp = dp->i_mount;
+
+ return xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp,
+ XFS_DATA_FORK, &xfs_dir2_block_buf_ops);
+}
+
+static void
+xfs_dir2_block_need_space(
+ struct xfs_dir2_data_hdr *hdr,
+ struct xfs_dir2_block_tail *btp,
+ struct xfs_dir2_leaf_entry *blp,
+ __be16 **tagpp,
+ struct xfs_dir2_data_unused **dupp,
+ struct xfs_dir2_data_unused **enddupp,
+ int *compact,
+ int len)
+{
+ struct xfs_dir2_data_free *bf;
+ __be16 *tagp = NULL;
+ struct xfs_dir2_data_unused *dup = NULL;
+ struct xfs_dir2_data_unused *enddup = NULL;
+
+ *compact = 0;
+ bf = hdr->bestfree;
+
+ /*
+ * If there are stale entries we'll use one for the leaf.
+ */
+ if (btp->stale) {
+ if (be16_to_cpu(bf[0].length) >= len) {
+ /*
+ * The biggest entry enough to avoid compaction.
+ */
+ dup = (xfs_dir2_data_unused_t *)
+ ((char *)hdr + be16_to_cpu(bf[0].offset));
+ goto out;
+ }
+
+ /*
+ * Will need to compact to make this work.
+ * Tag just before the first leaf entry.
+ */
+ *compact = 1;
+ tagp = (__be16 *)blp - 1;
+
+ /* Data object just before the first leaf entry. */
+ dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
+
+ /*
+ * If it's not free then the data will go where the
+ * leaf data starts now, if it works at all.
+ */
+ if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+ if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) *
+ (uint)sizeof(*blp) < len)
+ dup = NULL;
+ } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len)
+ dup = NULL;
+ else
+ dup = (xfs_dir2_data_unused_t *)blp;
+ goto out;
+ }
+
+ /*
+ * no stale entries, so just use free space.
+ * Tag just before the first leaf entry.
+ */
+ tagp = (__be16 *)blp - 1;
+
+ /* Data object just before the first leaf entry. */
+ enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
+
+ /*
+ * If it's not free then can't do this add without cleaning up:
+ * the space before the first leaf entry needs to be free so it
+ * can be expanded to hold the pointer to the new entry.
+ */
+ if (be16_to_cpu(enddup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
+ /*
+ * Check out the biggest freespace and see if it's the same one.
+ */
+ dup = (xfs_dir2_data_unused_t *)
+ ((char *)hdr + be16_to_cpu(bf[0].offset));
+ if (dup != enddup) {
+ /*
+ * Not the same free entry, just check its length.
+ */
+ if (be16_to_cpu(dup->length) < len)
+ dup = NULL;
+ goto out;
+ }
+
+ /*
+ * It is the biggest freespace, can it hold the leaf too?
+ */
+ if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) {
+ /*
+ * Yes, use the second-largest entry instead if it works.
+ */
+ if (be16_to_cpu(bf[1].length) >= len)
+ dup = (xfs_dir2_data_unused_t *)
+ ((char *)hdr + be16_to_cpu(bf[1].offset));
+ else
+ dup = NULL;
+ }
+ }
+out:
+ *tagpp = tagp;
+ *dupp = dup;
+ *enddupp = enddup;
+}
+
+/*
+ * compact the leaf entries.
+ * Leave the highest-numbered stale entry stale.
+ * XXX should be the one closest to mid but mid is not yet computed.
+ */
+static void
+xfs_dir2_block_compact(
+ struct xfs_trans *tp,
+ struct xfs_buf *bp,
+ struct xfs_dir2_data_hdr *hdr,
+ struct xfs_dir2_block_tail *btp,
+ struct xfs_dir2_leaf_entry *blp,
+ int *needlog,
+ int *lfloghigh,
+ int *lfloglow)
+{
+ int fromidx; /* source leaf index */
+ int toidx; /* target leaf index */
+ int needscan = 0;
+ int highstale; /* high stale index */
+
+ fromidx = toidx = be32_to_cpu(btp->count) - 1;
+ highstale = *lfloghigh = -1;
+ for (; fromidx >= 0; fromidx--) {
+ if (blp[fromidx].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
+ if (highstale == -1)
+ highstale = toidx;
+ else {
+ if (*lfloghigh == -1)
+ *lfloghigh = toidx;
+ continue;
+ }
+ }
+ if (fromidx < toidx)
+ blp[toidx] = blp[fromidx];
+ toidx--;
+ }
+ *lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1);
+ *lfloghigh -= be32_to_cpu(btp->stale) - 1;
+ be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1));
+ xfs_dir2_data_make_free(tp, bp,
+ (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
+ (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)),
+ needlog, &needscan);
+ blp += be32_to_cpu(btp->stale) - 1;
+ btp->stale = cpu_to_be32(1);
+ /*
+ * If we now need to rebuild the bestfree map, do so.
+ * This needs to happen before the next call to use_free.
+ */
+ if (needscan)
+ xfs_dir2_data_freescan(tp->t_mountp, hdr, needlog);
+}
+
/*
* Add an entry to a block directory.
*/
@@ -63,7 +271,6 @@ int /* error */
xfs_dir2_block_addname(
xfs_da_args_t *args) /* directory op arguments */
{
- xfs_dir2_data_free_t *bf; /* bestfree table in block */
xfs_dir2_data_hdr_t *hdr; /* block header */
xfs_dir2_leaf_entry_t *blp; /* block leaf entries */
struct xfs_buf *bp; /* buffer for block */
@@ -94,134 +301,44 @@ xfs_dir2_block_addname(
dp = args->dp;
tp = args->trans;
mp = dp->i_mount;
- /*
- * Read the (one and only) directory block into dabuf bp.
- */
- if ((error =
- xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) {
+
+ /* Read the (one and only) directory block into bp. */
+ error = xfs_dir2_block_read(tp, dp, &bp);
+ if (error)
return error;
- }
- ASSERT(bp != NULL);
- hdr = bp->b_addr;
- /*
- * Check the magic number, corrupted if wrong.
- */
- if (unlikely(hdr->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))) {
- XFS_CORRUPTION_ERROR("xfs_dir2_block_addname",
- XFS_ERRLEVEL_LOW, mp, hdr);
- xfs_trans_brelse(tp, bp);
- return XFS_ERROR(EFSCORRUPTED);
- }
+
len = xfs_dir2_data_entsize(args->namelen);
+
/*
* Set up pointers to parts of the block.
*/
- bf = hdr->bestfree;
+ hdr = bp->b_addr;
btp = xfs_dir2_block_tail_p(mp, hdr);
blp = xfs_dir2_block_leaf_p(btp);
+
/*
- * No stale entries? Need space for entry and new leaf.
- */
- if (!btp->stale) {
- /*
- * Tag just before the first leaf entry.
- */
- tagp = (__be16 *)blp - 1;
- /*
- * Data object just before the first leaf entry.
- */
- enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
- /*
- * If it's not free then can't do this add without cleaning up:
- * the space before the first leaf entry needs to be free so it
- * can be expanded to hold the pointer to the new entry.
- */
- if (be16_to_cpu(enddup->freetag) != XFS_DIR2_DATA_FREE_TAG)
- dup = enddup = NULL;
- /*
- * Check out the biggest freespace and see if it's the same one.
- */
- else {
- dup = (xfs_dir2_data_unused_t *)
- ((char *)hdr + be16_to_cpu(bf[0].offset));
- if (dup == enddup) {
- /*
- * It is the biggest freespace, is it too small
- * to hold the new leaf too?
- */
- if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) {
- /*
- * Yes, we use the second-largest
- * entry instead if it works.
- */
- if (be16_to_cpu(bf[1].length) >= len)
- dup = (xfs_dir2_data_unused_t *)
- ((char *)hdr +
- be16_to_cpu(bf[1].offset));
- else
- dup = NULL;
- }
- } else {
- /*
- * Not the same free entry,
- * just check its length.
- */
- if (be16_to_cpu(dup->length) < len) {
- dup = NULL;
- }
- }
- }
- compact = 0;
- }
- /*
- * If there are stale entries we'll use one for the leaf.
- * Is the biggest entry enough to avoid compaction?
- */
- else if (be16_to_cpu(bf[0].length) >= len) {
- dup = (xfs_dir2_data_unused_t *)
- ((char *)hdr + be16_to_cpu(bf[0].offset));
- compact = 0;
- }
- /*
- * Will need to compact to make this work.
+ * Find out if we can reuse stale entries or whether we need extra
+ * space for entry and new leaf.
*/
- else {
- /*
- * Tag just before the first leaf entry.
- */
- tagp = (__be16 *)blp - 1;
- /*
- * Data object just before the first leaf entry.
- */
- dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp));
- /*
- * If it's not free then the data will go where the
- * leaf data starts now, if it works at all.
- */
- if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
- if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) *
- (uint)sizeof(*blp) < len)
- dup = NULL;
- } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len)
- dup = NULL;
- else
- dup = (xfs_dir2_data_unused_t *)blp;
- compact = 1;
- }
+ xfs_dir2_block_need_space(hdr, btp, blp, &tagp, &dup,
+ &enddup, &compact, len);
+
/*
- * If this isn't a real add, we're done with the buffer.
+ * Done everything we need for a space check now.
*/
- if (args->op_flags & XFS_DA_OP_JUSTCHECK)
+ if (args->op_flags & XFS_DA_OP_JUSTCHECK) {
xfs_trans_brelse(tp, bp);
+ if (!dup)
+ return XFS_ERROR(ENOSPC);
+ return 0;
+ }
+
/*
* If we don't have space for the new entry & leaf ...
*/
if (!dup) {
- /*
- * Not trying to actually do anything, or don't have
- * a space reservation: return no-space.
- */
- if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0)
+ /* Don't have a space reservation: return no-space. */
+ if (args->total == 0)
return XFS_ERROR(ENOSPC);
/*
* Convert to the next larger format.
@@ -232,65 +349,24 @@ xfs_dir2_block_addname(
return error;
return xfs_dir2_leaf_addname(args);
}
- /*
- * Just checking, and it would work, so say so.
- */
- if (args->op_flags & XFS_DA_OP_JUSTCHECK)
- return 0;
+
needlog = needscan = 0;
+
/*
* If need to compact the leaf entries, do it now.
- * Leave the highest-numbered stale entry stale.
- * XXX should be the one closest to mid but mid is not yet computed.
- */
- if (compact) {
- int fromidx; /* source leaf index */
- int toidx; /* target leaf index */
-
- for (fromidx = toidx = be32_to_cpu(btp->count) - 1,
- highstale = lfloghigh = -1;
- fromidx >= 0;
- fromidx--) {
- if (blp[fromidx].address ==
- cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) {
- if (highstale == -1)
- highstale = toidx;
- else {
- if (lfloghigh == -1)
- lfloghigh = toidx;
- continue;
- }
- }
- if (fromidx < toidx)
- blp[toidx] = blp[fromidx];
- toidx--;
- }
- lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1);
- lfloghigh -= be32_to_cpu(btp->stale) - 1;
- be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1));
- xfs_dir2_data_make_free(tp, bp,
- (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr),
- (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)),
- &needlog, &needscan);
- blp += be32_to_cpu(btp->stale) - 1;
- btp->stale = cpu_to_be32(1);
- /*
- * If we now need to rebuild the bestfree map, do so.
- * This needs to happen before the next call to use_free.
- */
- if (needscan) {
- xfs_dir2_data_freescan(mp, hdr, &needlog);
- needscan = 0;
- }
- }
- /*
- * Set leaf logging boundaries to impossible state.
- * For the no-stale case they're set explicitly.
*/
+ if (compact)
+ xfs_dir2_block_compact(tp, bp, hdr, btp, blp, &needlog,
+ &lfloghigh, &lfloglow);
else if (btp->stale) {
+ /*
+ * Set leaf logging boundaries to impossible state.
+ * For the no-stale case they're set explicitly.
+ */
lfloglow = be32_to_cpu(btp->count);
lfloghigh = -1;
}
+
/*
* Find the slot that's first lower than our hash value, -1 if none.
*/
@@ -450,18 +526,13 @@ xfs_dir2_block_getdents(
/*
* If the block number in the offset is out of range, we're done.
*/
- if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) {
+ if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk)
return 0;
- }
- /*
- * Can't read the block, give up, else get dabuf in bp.
- */
- error = xfs_da_read_buf(NULL, dp, mp->m_dirdatablk, -1,
- &bp, XFS_DATA_FORK);
+
+ error = xfs_dir2_block_read(NULL, dp, &bp);
if (error)
return error;
- ASSERT(bp != NULL);
/*
* Extract the byte offset we start at from the seek pointer.
* We'll skip entries before this.
@@ -637,14 +708,11 @@ xfs_dir2_block_lookup_int(
dp = args->dp;
tp = args->trans;
mp = dp->i_mount;
- /*
- * Read the buffer, return error if we can't get it.
- */
- if ((error =
- xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) {
+
+ error = xfs_dir2_block_read(tp, dp, &bp);
+ if (error)
return error;
- }
- ASSERT(bp != NULL);
+
hdr = bp->b_addr;
xfs_dir2_data_check(dp, bp);
btp = xfs_dir2_block_tail_p(mp, hdr);
@@ -917,10 +985,10 @@ xfs_dir2_leaf_to_block(
/*
* Read the data block if we don't already have it, give up if it fails.
*/
- if (dbp == NULL &&
- (error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &dbp,
- XFS_DATA_FORK))) {
- return error;
+ if (!dbp) {
+ error = xfs_dir2_data_read(tp, dp, mp->m_dirdatablk, -1, &dbp);
+ if (error)
+ return error;
}
hdr = dbp->b_addr;
ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC));
@@ -944,6 +1012,7 @@ xfs_dir2_leaf_to_block(
/*
* Start converting it to block form.
*/
+ dbp->b_ops = &xfs_dir2_block_buf_ops;
hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
needlog = 1;
needscan = 0;
@@ -1073,6 +1142,7 @@ xfs_dir2_sf_to_block(
kmem_free(sfp);
return error;
}
+ bp->b_ops = &xfs_dir2_block_buf_ops;
hdr = bp->b_addr;
hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC);
/*
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c
index 44ffd4d6bc91..ffcf1774152e 100644
--- a/fs/xfs/xfs_dir2_data.c
+++ b/fs/xfs/xfs_dir2_data.c
@@ -34,14 +34,13 @@
STATIC xfs_dir2_data_free_t *
xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup);
-#ifdef DEBUG
/*
* Check the consistency of the data block.
* The input can also be a block-format directory.
- * Pop an assert if we find anything bad.
+ * Return 0 if the buffer is good, otherwise an error.
*/
-void
-xfs_dir2_data_check(
+int
+__xfs_dir2_data_check(
struct xfs_inode *dp, /* incore inode pointer */
struct xfs_buf *bp) /* data block's buffer */
{
@@ -64,18 +63,23 @@ xfs_dir2_data_check(
int stale; /* count of stale leaves */
struct xfs_name name;
- mp = dp->i_mount;
+ mp = bp->b_target->bt_mount;
hdr = bp->b_addr;
bf = hdr->bestfree;
p = (char *)(hdr + 1);
- if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) {
+ switch (hdr->magic) {
+ case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
btp = xfs_dir2_block_tail_p(mp, hdr);
lep = xfs_dir2_block_leaf_p(btp);
endp = (char *)lep;
- } else {
- ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC));
+ break;
+ case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
endp = (char *)hdr + mp->m_dirblksize;
+ break;
+ default:
+ XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp);
+ return EFSCORRUPTED;
}
count = lastfree = freeseen = 0;
@@ -83,19 +87,22 @@ xfs_dir2_data_check(
* Account for zero bestfree entries.
*/
if (!bf[0].length) {
- ASSERT(!bf[0].offset);
+ XFS_WANT_CORRUPTED_RETURN(!bf[0].offset);
freeseen |= 1 << 0;
}
if (!bf[1].length) {
- ASSERT(!bf[1].offset);
+ XFS_WANT_CORRUPTED_RETURN(!bf[1].offset);
freeseen |= 1 << 1;
}
if (!bf[2].length) {
- ASSERT(!bf[2].offset);
+ XFS_WANT_CORRUPTED_RETURN(!bf[2].offset);
freeseen |= 1 << 2;
}
- ASSERT(be16_to_cpu(bf[0].length) >= be16_to_cpu(bf[1].length));
- ASSERT(be16_to_cpu(bf[1].length) >= be16_to_cpu(bf[2].length));
+
+ XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[0].length) >=
+ be16_to_cpu(bf[1].length));
+ XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[1].length) >=
+ be16_to_cpu(bf[2].length));
/*
* Loop over the data/unused entries.
*/
@@ -107,17 +114,20 @@ xfs_dir2_data_check(
* doesn't need to be there.
*/
if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
- ASSERT(lastfree == 0);
- ASSERT(be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) ==
- (char *)dup - (char *)hdr);
+ XFS_WANT_CORRUPTED_RETURN(lastfree == 0);
+ XFS_WANT_CORRUPTED_RETURN(
+ be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) ==
+ (char *)dup - (char *)hdr);
dfp = xfs_dir2_data_freefind(hdr, dup);
if (dfp) {
i = (int)(dfp - bf);
- ASSERT((freeseen & (1 << i)) == 0);
+ XFS_WANT_CORRUPTED_RETURN(
+ (freeseen & (1 << i)) == 0);
freeseen |= 1 << i;
} else {
- ASSERT(be16_to_cpu(dup->length) <=
- be16_to_cpu(bf[2].length));
+ XFS_WANT_CORRUPTED_RETURN(
+ be16_to_cpu(dup->length) <=
+ be16_to_cpu(bf[2].length));
}
p += be16_to_cpu(dup->length);
lastfree = 1;
@@ -130,10 +140,12 @@ xfs_dir2_data_check(
* The linear search is crude but this is DEBUG code.
*/
dep = (xfs_dir2_data_entry_t *)p;
- ASSERT(dep->namelen != 0);
- ASSERT(xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)) == 0);
- ASSERT(be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) ==
- (char *)dep - (char *)hdr);
+ XFS_WANT_CORRUPTED_RETURN(dep->namelen != 0);
+ XFS_WANT_CORRUPTED_RETURN(
+ !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)));
+ XFS_WANT_CORRUPTED_RETURN(
+ be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) ==
+ (char *)dep - (char *)hdr);
count++;
lastfree = 0;
if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) {
@@ -148,27 +160,122 @@ xfs_dir2_data_check(
be32_to_cpu(lep[i].hashval) == hash)
break;
}
- ASSERT(i < be32_to_cpu(btp->count));
+ XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count));
}
p += xfs_dir2_data_entsize(dep->namelen);
}
/*
* Need to have seen all the entries and all the bestfree slots.
*/
- ASSERT(freeseen == 7);
+ XFS_WANT_CORRUPTED_RETURN(freeseen == 7);
if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) {
for (i = stale = 0; i < be32_to_cpu(btp->count); i++) {
if (lep[i].address ==
cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
stale++;
if (i > 0)
- ASSERT(be32_to_cpu(lep[i].hashval) >= be32_to_cpu(lep[i - 1].hashval));
+ XFS_WANT_CORRUPTED_RETURN(
+ be32_to_cpu(lep[i].hashval) >=
+ be32_to_cpu(lep[i - 1].hashval));
}
- ASSERT(count == be32_to_cpu(btp->count) - be32_to_cpu(btp->stale));
- ASSERT(stale == be32_to_cpu(btp->stale));
+ XFS_WANT_CORRUPTED_RETURN(count ==
+ be32_to_cpu(btp->count) - be32_to_cpu(btp->stale));
+ XFS_WANT_CORRUPTED_RETURN(stale == be32_to_cpu(btp->stale));
}
+ return 0;
+}
+
+static void
+xfs_dir2_data_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_dir2_data_hdr *hdr = bp->b_addr;
+ int block_ok = 0;
+
+ block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC);
+ block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0;
+
+ if (!block_ok) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
+}
+
+/*
+ * Readahead of the first block of the directory when it is opened is completely
+ * oblivious to the format of the directory. Hence we can either get a block
+ * format buffer or a data format buffer on readahead.
+ */
+static void
+xfs_dir2_data_reada_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_dir2_data_hdr *hdr = bp->b_addr;
+
+ switch (hdr->magic) {
+ case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
+ bp->b_ops = &xfs_dir2_block_buf_ops;
+ bp->b_ops->verify_read(bp);
+ return;
+ case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
+ xfs_dir2_data_verify(bp);
+ return;
+ default:
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ break;
+ }
+}
+
+static void
+xfs_dir2_data_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_dir2_data_verify(bp);
+}
+
+static void
+xfs_dir2_data_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_dir2_data_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_dir2_data_buf_ops = {
+ .verify_read = xfs_dir2_data_read_verify,
+ .verify_write = xfs_dir2_data_write_verify,
+};
+
+static const struct xfs_buf_ops xfs_dir2_data_reada_buf_ops = {
+ .verify_read = xfs_dir2_data_reada_verify,
+ .verify_write = xfs_dir2_data_write_verify,
+};
+
+
+int
+xfs_dir2_data_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t bno,
+ xfs_daddr_t mapped_bno,
+ struct xfs_buf **bpp)
+{
+ return xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp,
+ XFS_DATA_FORK, &xfs_dir2_data_buf_ops);
+}
+
+int
+xfs_dir2_data_readahead(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t bno,
+ xfs_daddr_t mapped_bno)
+{
+ return xfs_da_reada_buf(tp, dp, bno, mapped_bno,
+ XFS_DATA_FORK, &xfs_dir2_data_reada_buf_ops);
}
-#endif
/*
* Given a data block and an unused entry from that block,
@@ -409,10 +516,9 @@ xfs_dir2_data_init(
*/
error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, blkno), -1, &bp,
XFS_DATA_FORK);
- if (error) {
+ if (error)
return error;
- }
- ASSERT(bp != NULL);
+ bp->b_ops = &xfs_dir2_data_buf_ops;
/*
* Initialize the header.
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 0b296253bd01..60cd2fa4e047 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -48,6 +48,83 @@ static void xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_buf *bp,
int first, int last);
static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp);
+static void
+xfs_dir2_leaf_verify(
+ struct xfs_buf *bp,
+ __be16 magic)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_dir2_leaf_hdr *hdr = bp->b_addr;
+ int block_ok = 0;
+
+ block_ok = hdr->info.magic == magic;
+ if (!block_ok) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
+}
+
+static void
+xfs_dir2_leaf1_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
+}
+
+static void
+xfs_dir2_leaf1_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC));
+}
+
+void
+xfs_dir2_leafn_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
+}
+
+void
+xfs_dir2_leafn_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC));
+}
+
+static const struct xfs_buf_ops xfs_dir2_leaf1_buf_ops = {
+ .verify_read = xfs_dir2_leaf1_read_verify,
+ .verify_write = xfs_dir2_leaf1_write_verify,
+};
+
+const struct xfs_buf_ops xfs_dir2_leafn_buf_ops = {
+ .verify_read = xfs_dir2_leafn_read_verify,
+ .verify_write = xfs_dir2_leafn_write_verify,
+};
+
+static int
+xfs_dir2_leaf_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t fbno,
+ xfs_daddr_t mappedbno,
+ struct xfs_buf **bpp)
+{
+ return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+ XFS_DATA_FORK, &xfs_dir2_leaf1_buf_ops);
+}
+
+int
+xfs_dir2_leafn_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t fbno,
+ xfs_daddr_t mappedbno,
+ struct xfs_buf **bpp)
+{
+ return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+ XFS_DATA_FORK, &xfs_dir2_leafn_buf_ops);
+}
/*
* Convert a block form directory to a leaf form directory.
@@ -125,6 +202,7 @@ xfs_dir2_block_to_leaf(
/*
* Fix up the block header, make it a data block.
*/
+ dbp->b_ops = &xfs_dir2_data_buf_ops;
hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC);
if (needscan)
xfs_dir2_data_freescan(mp, hdr, &needlog);
@@ -311,15 +389,11 @@ xfs_dir2_leaf_addname(
dp = args->dp;
tp = args->trans;
mp = dp->i_mount;
- /*
- * Read the leaf block.
- */
- error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp,
- XFS_DATA_FORK);
- if (error) {
+
+ error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp);
+ if (error)
return error;
- }
- ASSERT(lbp != NULL);
+
/*
* Look up the entry by hash value and name.
* We know it's not there, our caller has already done a lookup.
@@ -494,22 +568,21 @@ xfs_dir2_leaf_addname(
hdr = dbp->b_addr;
bestsp[use_block] = hdr->bestfree[0].length;
grown = 1;
- }
- /*
- * Already had space in some data block.
- * Just read that one in.
- */
- else {
- if ((error =
- xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, use_block),
- -1, &dbp, XFS_DATA_FORK))) {
+ } else {
+ /*
+ * Already had space in some data block.
+ * Just read that one in.
+ */
+ error = xfs_dir2_data_read(tp, dp,
+ xfs_dir2_db_to_da(mp, use_block),
+ -1, &dbp);
+ if (error) {
xfs_trans_brelse(tp, lbp);
return error;
}
hdr = dbp->b_addr;
grown = 0;
}
- xfs_dir2_data_check(dp, dbp);
/*
* Point to the biggest freespace in our data block.
*/
@@ -892,10 +965,9 @@ xfs_dir2_leaf_readbuf(
* Read the directory block starting at the first mapping.
*/
mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff);
- error = xfs_da_read_buf(NULL, dp, map->br_startoff,
+ error = xfs_dir2_data_read(NULL, dp, map->br_startoff,
map->br_blockcount >= mp->m_dirblkfsbs ?
- XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1,
- &bp, XFS_DATA_FORK);
+ XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, &bp);
/*
* Should just skip over the data block instead of giving up.
@@ -922,11 +994,11 @@ xfs_dir2_leaf_readbuf(
*/
if (i > mip->ra_current &&
map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) {
- xfs_buf_readahead(mp->m_ddev_targp,
+ xfs_dir2_data_readahead(NULL, dp,
+ map[mip->ra_index].br_startoff + mip->ra_offset,
XFS_FSB_TO_DADDR(mp,
map[mip->ra_index].br_startblock +
- mip->ra_offset),
- (int)BTOBB(mp->m_dirblksize));
+ mip->ra_offset));
mip->ra_current = i;
}
@@ -935,10 +1007,9 @@ xfs_dir2_leaf_readbuf(
* use our mapping, but this is a very rare case.
*/
else if (i > mip->ra_current) {
- xfs_da_reada_buf(NULL, dp,
+ xfs_dir2_data_readahead(NULL, dp,
map[mip->ra_index].br_startoff +
- mip->ra_offset,
- XFS_DATA_FORK);
+ mip->ra_offset, -1);
mip->ra_current = i;
}
@@ -1177,15 +1248,14 @@ xfs_dir2_leaf_init(
* Get the buffer for the block.
*/
error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp,
- XFS_DATA_FORK);
- if (error) {
+ XFS_DATA_FORK);
+ if (error)
return error;
- }
- ASSERT(bp != NULL);
- leaf = bp->b_addr;
+
/*
* Initialize the header.
*/
+ leaf = bp->b_addr;
leaf->hdr.info.magic = cpu_to_be16(magic);
leaf->hdr.info.forw = 0;
leaf->hdr.info.back = 0;
@@ -1198,10 +1268,12 @@ xfs_dir2_leaf_init(
* the block.
*/
if (magic == XFS_DIR2_LEAF1_MAGIC) {
+ bp->b_ops = &xfs_dir2_leaf1_buf_ops;
ltp = xfs_dir2_leaf_tail_p(mp, leaf);
ltp->bestcount = 0;
xfs_dir2_leaf_log_tail(tp, bp);
- }
+ } else
+ bp->b_ops = &xfs_dir2_leafn_buf_ops;
*bpp = bp;
return 0;
}
@@ -1372,13 +1444,11 @@ xfs_dir2_leaf_lookup_int(
dp = args->dp;
tp = args->trans;
mp = dp->i_mount;
- /*
- * Read the leaf block into the buffer.
- */
- error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp,
- XFS_DATA_FORK);
+
+ error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp);
if (error)
return error;
+
*lbpp = lbp;
leaf = lbp->b_addr;
xfs_dir2_leaf_check(dp, lbp);
@@ -1409,14 +1479,13 @@ xfs_dir2_leaf_lookup_int(
if (newdb != curdb) {
if (dbp)
xfs_trans_brelse(tp, dbp);
- error = xfs_da_read_buf(tp, dp,
- xfs_dir2_db_to_da(mp, newdb),
- -1, &dbp, XFS_DATA_FORK);
+ error = xfs_dir2_data_read(tp, dp,
+ xfs_dir2_db_to_da(mp, newdb),
+ -1, &dbp);
if (error) {
xfs_trans_brelse(tp, lbp);
return error;
}
- xfs_dir2_data_check(dp, dbp);
curdb = newdb;
}
/*
@@ -1451,9 +1520,9 @@ xfs_dir2_leaf_lookup_int(
ASSERT(cidb != -1);
if (cidb != curdb) {
xfs_trans_brelse(tp, dbp);
- error = xfs_da_read_buf(tp, dp,
- xfs_dir2_db_to_da(mp, cidb),
- -1, &dbp, XFS_DATA_FORK);
+ error = xfs_dir2_data_read(tp, dp,
+ xfs_dir2_db_to_da(mp, cidb),
+ -1, &dbp);
if (error) {
xfs_trans_brelse(tp, lbp);
return error;
@@ -1738,10 +1807,9 @@ xfs_dir2_leaf_trim_data(
/*
* Read the offending data block. We need its buffer.
*/
- if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp,
- XFS_DATA_FORK))) {
+ error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp);
+ if (error)
return error;
- }
leaf = lbp->b_addr;
ltp = xfs_dir2_leaf_tail_p(mp, leaf);
@@ -1864,10 +1932,9 @@ xfs_dir2_node_to_leaf(
/*
* Read the freespace block.
*/
- if ((error = xfs_da_read_buf(tp, dp, mp->m_dirfreeblk, -1, &fbp,
- XFS_DATA_FORK))) {
+ error = xfs_dir2_free_read(tp, dp, mp->m_dirfreeblk, &fbp);
+ if (error)
return error;
- }
free = fbp->b_addr;
ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
ASSERT(!free->hdr.firstdb);
@@ -1890,7 +1957,10 @@ xfs_dir2_node_to_leaf(
xfs_dir2_leaf_compact(args, lbp);
else
xfs_dir2_leaf_log_header(tp, lbp);
+
+ lbp->b_ops = &xfs_dir2_leaf1_buf_ops;
leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAF1_MAGIC);
+
/*
* Set up the leaf tail from the freespace block.
*/
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c
index 6c7052406605..5980f9b7fa9b 100644
--- a/fs/xfs/xfs_dir2_node.c
+++ b/fs/xfs/xfs_dir2_node.c
@@ -55,6 +55,74 @@ static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp,
static int xfs_dir2_node_addname_int(xfs_da_args_t *args,
xfs_da_state_blk_t *fblk);
+static void
+xfs_dir2_free_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_dir2_free_hdr *hdr = bp->b_addr;
+ int block_ok = 0;
+
+ block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC);
+ if (!block_ok) {
+ XFS_CORRUPTION_ERROR("xfs_dir2_free_verify magic",
+ XFS_ERRLEVEL_LOW, mp, hdr);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
+}
+
+static void
+xfs_dir2_free_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_dir2_free_verify(bp);
+}
+
+static void
+xfs_dir2_free_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_dir2_free_verify(bp);
+}
+
+static const struct xfs_buf_ops xfs_dir2_free_buf_ops = {
+ .verify_read = xfs_dir2_free_read_verify,
+ .verify_write = xfs_dir2_free_write_verify,
+};
+
+
+static int
+__xfs_dir2_free_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t fbno,
+ xfs_daddr_t mappedbno,
+ struct xfs_buf **bpp)
+{
+ return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp,
+ XFS_DATA_FORK, &xfs_dir2_free_buf_ops);
+}
+
+int
+xfs_dir2_free_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t fbno,
+ struct xfs_buf **bpp)
+{
+ return __xfs_dir2_free_read(tp, dp, fbno, -1, bpp);
+}
+
+static int
+xfs_dir2_free_try_read(
+ struct xfs_trans *tp,
+ struct xfs_inode *dp,
+ xfs_dablk_t fbno,
+ struct xfs_buf **bpp)
+{
+ return __xfs_dir2_free_read(tp, dp, fbno, -2, bpp);
+}
+
/*
* Log entries from a freespace block.
*/
@@ -131,11 +199,12 @@ xfs_dir2_leaf_to_node(
/*
* Get the buffer for the new freespace block.
*/
- if ((error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp,
- XFS_DATA_FORK))) {
+ error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp,
+ XFS_DATA_FORK);
+ if (error)
return error;
- }
- ASSERT(fbp != NULL);
+ fbp->b_ops = &xfs_dir2_free_buf_ops;
+
free = fbp->b_addr;
leaf = lbp->b_addr;
ltp = xfs_dir2_leaf_tail_p(mp, leaf);
@@ -157,7 +226,10 @@ xfs_dir2_leaf_to_node(
*to = cpu_to_be16(off);
}
free->hdr.nused = cpu_to_be32(n);
+
+ lbp->b_ops = &xfs_dir2_leafn_buf_ops;
leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC);
+
/*
* Log everything.
*/
@@ -394,12 +466,10 @@ xfs_dir2_leafn_lookup_for_addname(
*/
if (curbp)
xfs_trans_brelse(tp, curbp);
- /*
- * Read the free block.
- */
- error = xfs_da_read_buf(tp, dp,
+
+ error = xfs_dir2_free_read(tp, dp,
xfs_dir2_db_to_da(mp, newfdb),
- -1, &curbp, XFS_DATA_FORK);
+ &curbp);
if (error)
return error;
free = curbp->b_addr;
@@ -534,9 +604,9 @@ xfs_dir2_leafn_lookup_for_entry(
ASSERT(state->extravalid);
curbp = state->extrablk.bp;
} else {
- error = xfs_da_read_buf(tp, dp,
+ error = xfs_dir2_data_read(tp, dp,
xfs_dir2_db_to_da(mp, newdb),
- -1, &curbp, XFS_DATA_FORK);
+ -1, &curbp);
if (error)
return error;
}
@@ -568,6 +638,7 @@ xfs_dir2_leafn_lookup_for_entry(
state->extrablk.index = (int)((char *)dep -
(char *)curbp->b_addr);
state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
+ curbp->b_ops = &xfs_dir2_data_buf_ops;
if (cmp == XFS_CMP_EXACT)
return XFS_ERROR(EEXIST);
}
@@ -582,6 +653,7 @@ xfs_dir2_leafn_lookup_for_entry(
state->extrablk.index = -1;
state->extrablk.blkno = curdb;
state->extrablk.magic = XFS_DIR2_DATA_MAGIC;
+ curbp->b_ops = &xfs_dir2_data_buf_ops;
} else {
/* If the curbp is not the CI match block, drop it */
if (state->extrablk.bp != curbp)
@@ -825,6 +897,77 @@ xfs_dir2_leafn_rebalance(
}
}
+static int
+xfs_dir2_data_block_free(
+ xfs_da_args_t *args,
+ struct xfs_dir2_data_hdr *hdr,
+ struct xfs_dir2_free *free,
+ xfs_dir2_db_t fdb,
+ int findex,
+ struct xfs_buf *fbp,
+ int longest)
+{
+ struct xfs_trans *tp = args->trans;
+ int logfree = 0;
+
+ if (!hdr) {
+ /* One less used entry in the free table. */
+ be32_add_cpu(&free->hdr.nused, -1);
+ xfs_dir2_free_log_header(tp, fbp);
+
+ /*
+ * If this was the last entry in the table, we can trim the
+ * table size back. There might be other entries at the end
+ * referring to non-existent data blocks, get those too.
+ */
+ if (findex == be32_to_cpu(free->hdr.nvalid) - 1) {
+ int i; /* free entry index */
+
+ for (i = findex - 1; i >= 0; i--) {
+ if (free->bests[i] != cpu_to_be16(NULLDATAOFF))
+ break;
+ }
+ free->hdr.nvalid = cpu_to_be32(i + 1);
+ logfree = 0;
+ } else {
+ /* Not the last entry, just punch it out. */
+ free->bests[findex] = cpu_to_be16(NULLDATAOFF);
+ logfree = 1;
+ }
+ /*
+ * If there are no useful entries left in the block,
+ * get rid of the block if we can.
+ */
+ if (!free->hdr.nused) {
+ int error;
+
+ error = xfs_dir2_shrink_inode(args, fdb, fbp);
+ if (error == 0) {
+ fbp = NULL;
+ logfree = 0;
+ } else if (error != ENOSPC || args->total != 0)
+ return error;
+ /*
+ * It's possible to get ENOSPC if there is no
+ * space reservation. In this case some one
+ * else will eventually get rid of this block.
+ */
+ }
+ } else {
+ /*
+ * Data block is not empty, just set the free entry to the new
+ * value.
+ */
+ free->bests[findex] = cpu_to_be16(longest);
+ logfree = 1;
+ }
+
+ /* Log the free entry that changed, unless we got rid of it. */
+ if (logfree)
+ xfs_dir2_free_log_bests(tp, fbp, findex, findex);
+ return 0;
+}
+
/*
* Remove an entry from a node directory.
* This removes the leaf entry and the data entry,
@@ -908,17 +1051,16 @@ xfs_dir2_leafn_remove(
xfs_dir2_db_t fdb; /* freeblock block number */
int findex; /* index in freeblock entries */
xfs_dir2_free_t *free; /* freeblock structure */
- int logfree; /* need to log free entry */
/*
* Convert the data block number to a free block,
* read in the free block.
*/
fdb = xfs_dir2_db_to_fdb(mp, db);
- if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb),
- -1, &fbp, XFS_DATA_FORK))) {
+ error = xfs_dir2_free_read(tp, dp, xfs_dir2_db_to_da(mp, fdb),
+ &fbp);
+ if (error)
return error;
- }
free = fbp->b_addr;
ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
ASSERT(be32_to_cpu(free->hdr.firstdb) ==
@@ -954,68 +1096,12 @@ xfs_dir2_leafn_remove(
* If we got rid of the data block, we can eliminate that entry
* in the free block.
*/
- if (hdr == NULL) {
- /*
- * One less used entry in the free table.
- */
- be32_add_cpu(&free->hdr.nused, -1);
- xfs_dir2_free_log_header(tp, fbp);
- /*
- * If this was the last entry in the table, we can
- * trim the table size back. There might be other
- * entries at the end referring to non-existent
- * data blocks, get those too.
- */
- if (findex == be32_to_cpu(free->hdr.nvalid) - 1) {
- int i; /* free entry index */
-
- for (i = findex - 1;
- i >= 0 &&
- free->bests[i] == cpu_to_be16(NULLDATAOFF);
- i--)
- continue;
- free->hdr.nvalid = cpu_to_be32(i + 1);
- logfree = 0;
- }
- /*
- * Not the last entry, just punch it out.
- */
- else {
- free->bests[findex] = cpu_to_be16(NULLDATAOFF);
- logfree = 1;
- }
- /*
- * If there are no useful entries left in the block,
- * get rid of the block if we can.
- */
- if (!free->hdr.nused) {
- error = xfs_dir2_shrink_inode(args, fdb, fbp);
- if (error == 0) {
- fbp = NULL;
- logfree = 0;
- } else if (error != ENOSPC || args->total != 0)
- return error;
- /*
- * It's possible to get ENOSPC if there is no
- * space reservation. In this case some one
- * else will eventually get rid of this block.
- */
- }
- }
- /*
- * Data block is not empty, just set the free entry to
- * the new value.
- */
- else {
- free->bests[findex] = cpu_to_be16(longest);
- logfree = 1;
- }
- /*
- * Log the free entry that changed, unless we got rid of it.
- */
- if (logfree)
- xfs_dir2_free_log_bests(tp, fbp, findex, findex);
+ error = xfs_dir2_data_block_free(args, hdr, free,
+ fdb, findex, fbp, longest);
+ if (error)
+ return error;
}
+
xfs_dir2_leafn_check(dp, bp);
/*
* Return indication of whether this leaf block is empty enough
@@ -1169,12 +1255,11 @@ xfs_dir2_leafn_toosmall(
/*
* Read the sibling leaf block.
*/
- if ((error =
- xfs_da_read_buf(state->args->trans, state->args->dp, blkno,
- -1, &bp, XFS_DATA_FORK))) {
+ error = xfs_dir2_leafn_read(state->args->trans, state->args->dp,
+ blkno, -1, &bp);
+ if (error)
return error;
- }
- ASSERT(bp != NULL);
+
/*
* Count bytes in the two blocks combined.
*/
@@ -1454,14 +1539,13 @@ xfs_dir2_node_addname_int(
* This should be really rare, so there's no reason
* to avoid it.
*/
- if ((error = xfs_da_read_buf(tp, dp,
- xfs_dir2_db_to_da(mp, fbno), -2, &fbp,
- XFS_DATA_FORK))) {
+ error = xfs_dir2_free_try_read(tp, dp,
+ xfs_dir2_db_to_da(mp, fbno),
+ &fbp);
+ if (error)
return error;
- }
- if (unlikely(fbp == NULL)) {
+ if (!fbp)
continue;
- }
free = fbp->b_addr;
ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
findex = 0;
@@ -1520,9 +1604,10 @@ xfs_dir2_node_addname_int(
* that was just allocated.
*/
fbno = xfs_dir2_db_to_fdb(mp, dbno);
- if (unlikely(error = xfs_da_read_buf(tp, dp,
- xfs_dir2_db_to_da(mp, fbno), -2, &fbp,
- XFS_DATA_FORK)))
+ error = xfs_dir2_free_try_read(tp, dp,
+ xfs_dir2_db_to_da(mp, fbno),
+ &fbp);
+ if (error)
return error;
/*
@@ -1561,12 +1646,12 @@ xfs_dir2_node_addname_int(
/*
* Get a buffer for the new block.
*/
- if ((error = xfs_da_get_buf(tp, dp,
- xfs_dir2_db_to_da(mp, fbno),
- -1, &fbp, XFS_DATA_FORK))) {
+ error = xfs_da_get_buf(tp, dp,
+ xfs_dir2_db_to_da(mp, fbno),
+ -1, &fbp, XFS_DATA_FORK);
+ if (error)
return error;
- }
- ASSERT(fbp != NULL);
+ fbp->b_ops = &xfs_dir2_free_buf_ops;
/*
* Initialize the new block to be empty, and remember
@@ -1630,8 +1715,8 @@ xfs_dir2_node_addname_int(
/*
* Read the data block in.
*/
- error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, dbno),
- -1, &dbp, XFS_DATA_FORK);
+ error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, dbno),
+ -1, &dbp);
if (error)
return error;
hdr = dbp->b_addr;
@@ -1917,18 +2002,15 @@ xfs_dir2_node_trim_free(
/*
* Read the freespace block.
*/
- if (unlikely(error = xfs_da_read_buf(tp, dp, (xfs_dablk_t)fo, -2, &bp,
- XFS_DATA_FORK))) {
+ error = xfs_dir2_free_try_read(tp, dp, fo, &bp);
+ if (error)
return error;
- }
-
/*
* There can be holes in freespace. If fo is a hole, there's
* nothing to do.
*/
- if (bp == NULL) {
+ if (!bp)
return 0;
- }
free = bp->b_addr;
ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC));
/*
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h
index 3523d3e15aa8..7da79f6515fd 100644
--- a/fs/xfs/xfs_dir2_priv.h
+++ b/fs/xfs/xfs_dir2_priv.h
@@ -30,6 +30,8 @@ extern int xfs_dir_cilookup_result(struct xfs_da_args *args,
const unsigned char *name, int len);
/* xfs_dir2_block.c */
+extern const struct xfs_buf_ops xfs_dir2_block_buf_ops;
+
extern int xfs_dir2_block_addname(struct xfs_da_args *args);
extern int xfs_dir2_block_getdents(struct xfs_inode *dp, void *dirent,
xfs_off_t *offset, filldir_t filldir);
@@ -41,10 +43,19 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args,
/* xfs_dir2_data.c */
#ifdef DEBUG
-extern void xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
+#define xfs_dir2_data_check(dp,bp) __xfs_dir2_data_check(dp, bp);
#else
#define xfs_dir2_data_check(dp,bp)
#endif
+
+extern const struct xfs_buf_ops xfs_dir2_data_buf_ops;
+
+extern int __xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp);
+extern int xfs_dir2_data_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp);
+extern int xfs_dir2_data_readahead(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_dablk_t bno, xfs_daddr_t mapped_bno);
+
extern struct xfs_dir2_data_free *
xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr,
struct xfs_dir2_data_unused *dup, int *loghead);
@@ -66,6 +77,10 @@ extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp,
xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp);
/* xfs_dir2_leaf.c */
+extern const struct xfs_buf_ops xfs_dir2_leafn_buf_ops;
+
+extern int xfs_dir2_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp);
extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args,
struct xfs_buf *dbp);
extern int xfs_dir2_leaf_addname(struct xfs_da_args *args);
@@ -115,6 +130,8 @@ extern int xfs_dir2_node_removename(struct xfs_da_args *args);
extern int xfs_dir2_node_replace(struct xfs_da_args *args);
extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo,
int *rvalp);
+extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp,
+ xfs_dablk_t fbno, struct xfs_buf **bpp);
/* xfs_dir2_sf.c */
extern xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *sfp);
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index bf27fcca4843..9e1bf5294c91 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -248,7 +248,59 @@ xfs_qm_init_dquot_blk(
xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
}
+/*
+ * Check every on-disk dquot held in a dquot chunk buffer.
+ *
+ * @bp: buffer whose b_addr is treated as an array of struct xfs_dqblk.
+ *
+ * Returns nothing; on the first dquot that fails xfs_qm_dqcheck() the buffer
+ * is flagged with EFSCORRUPTED via xfs_buf_ioerror() and checking stops.
+ * Shared by both the read and the write verifier wrappers below.
+ */
+static void
+xfs_dquot_buf_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr;
+ struct xfs_disk_dquot *ddq;
+ xfs_dqid_t id = 0;
+ int i;
+
+ /*
+ * On the first read of the buffer, verify that each dquot is valid.
+ * We don't know what the id of the dquot is supposed to be, just that
+ * they should be increasing monotonically within the buffer. If the
+ * first id is corrupt, then it will fail on the second dquot in the
+ * buffer so corruptions could point to the wrong dquot in this case.
+ */
+ for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) {
+ int error;
+
+ ddq = &d[i].dd_diskdq;
+
+ /* the first dquot's id is the base; entry i is expected to be id + i */
+ if (i == 0)
+ id = be32_to_cpu(ddq->d_id);
+
+ /*
+ * Type argument is 0 and only XFS_QMOPT_DOWARN is passed (no
+ * XFS_QMOPT_DQREPAIR). The tag string says "read" even when
+ * this runs from the write verifier.
+ */
+ error = xfs_qm_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN,
+ "xfs_dquot_read_verify");
+ if (error) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, d);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ break;
+ }
+ }
+}
+
+/*
+ * Read-side verifier for dquot buffers (the .verify_read hook of
+ * xfs_dquot_buf_ops): runs the common dquot chunk check.
+ */
+static void
+xfs_dquot_buf_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_dquot_buf_verify(bp);
+}
+
+/*
+ * Write-side verifier for dquot buffers (the .verify_write hook of
+ * xfs_dquot_buf_ops): runs the same check as the read side.
+ *
+ * NOTE(review): unlike the read wrapper this is not static, and no
+ * declaration is visible in this part of the patch - confirm whether
+ * external linkage is intended.
+ */
+void
+xfs_dquot_buf_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_dquot_buf_verify(bp);
+}
+/*
+ * Verifier table for dquot chunk buffers. Passed to xfs_trans_read_buf() or
+ * assigned to bp->b_ops by the dquot code; declared extern in xfs_dquot.h.
+ */
+const struct xfs_buf_ops xfs_dquot_buf_ops = {
+ .verify_read = xfs_dquot_buf_read_verify,
+ .verify_write = xfs_dquot_buf_write_verify,
+};
/*
* Allocate a block and fill it with dquots.
@@ -315,6 +367,7 @@ xfs_qm_dqalloc(
error = xfs_buf_geterror(bp);
if (error)
goto error1;
+ bp->b_ops = &xfs_dquot_buf_ops;
/*
* Make a chunk of dquots out of this buffer and log
@@ -359,6 +412,51 @@ xfs_qm_dqalloc(
return (error);
}
+/*
+ * Re-read the chunk holding @dqp without a read verifier and try to repair
+ * every dquot in it.
+ *
+ * @mp:      mount the dquot belongs to
+ * @tp:      transaction the buffer is read under (passed straight through)
+ * @dqp:     in-core dquot; supplies the disk address (q_blkno) and the type
+ *           bits (dq_flags & XFS_DQ_ALLTYPES) used for the repair
+ * @firstid: id expected for the first dquot in the chunk; entry i is checked
+ *           against firstid + i
+ * @bpp:     out: the buffer on success, NULL on any failure
+ *
+ * Returns 0 with *bpp set to the (locked) buffer, the read error if the
+ * buffer could not be read, or EIO if xfs_qm_dqcheck() could not repair one
+ * of the dquots. In both failure cases the caller gets *bpp == NULL, which
+ * is what xfs_qm_dqtobp() asserts on its error path.
+ */
+STATIC int
+xfs_qm_dqrepair(
+ struct xfs_mount *mp,
+ struct xfs_trans *tp,
+ struct xfs_dquot *dqp,
+ xfs_dqid_t firstid,
+ struct xfs_buf **bpp)
+{
+ int error;
+ struct xfs_disk_dquot *ddq;
+ struct xfs_dqblk *d;
+ int i;
+
+ /*
+ * Read the buffer without verification so we get the corrupted
+ * buffer returned to us. make sure we verify it on write, though.
+ */
+ error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno,
+ mp->m_quotainfo->qi_dqchunklen,
+ 0, bpp, NULL);
+
+ if (error) {
+ ASSERT(*bpp == NULL);
+ return XFS_ERROR(error);
+ }
+ (*bpp)->b_ops = &xfs_dquot_buf_ops;
+
+ ASSERT(xfs_buf_islocked(*bpp));
+ d = (struct xfs_dqblk *)(*bpp)->b_addr;
+
+ /* Do the actual repair of dquots in this buffer */
+ for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) {
+ ddq = &d[i].dd_diskdq;
+ error = xfs_qm_dqcheck(mp, ddq, firstid + i,
+ dqp->dq_flags & XFS_DQ_ALLTYPES,
+ XFS_QMOPT_DQREPAIR, "xfs_qm_dqrepair");
+ if (error) {
+ /*
+ * repair failed, we're screwed. Drop the buffer and
+ * clear the caller's pointer so it is not left
+ * referring to a released buffer.
+ */
+ xfs_trans_brelse(tp, *bpp);
+ *bpp = NULL;
+ return XFS_ERROR(EIO);
+ }
+ }
+
+ return 0;
+}
/*
* Maps a dquot to the buffer containing its on-disk version.
@@ -378,7 +476,6 @@ xfs_qm_dqtobp(
xfs_buf_t *bp;
xfs_inode_t *quotip = XFS_DQ_TO_QIP(dqp);
xfs_mount_t *mp = dqp->q_mount;
- xfs_disk_dquot_t *ddq;
xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id);
xfs_trans_t *tp = (tpp ? *tpp : NULL);
@@ -439,33 +536,24 @@ xfs_qm_dqtobp(
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
dqp->q_blkno,
mp->m_quotainfo->qi_dqchunklen,
- 0, &bp);
- if (error || !bp)
- return XFS_ERROR(error);
- }
-
- ASSERT(xfs_buf_islocked(bp));
+ 0, &bp, &xfs_dquot_buf_ops);
- /*
- * calculate the location of the dquot inside the buffer.
- */
- ddq = bp->b_addr + dqp->q_bufoffset;
+ if (error == EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) {
+ xfs_dqid_t firstid = (xfs_dqid_t)map.br_startoff *
+ mp->m_quotainfo->qi_dqperchunk;
+ ASSERT(bp == NULL);
+ error = xfs_qm_dqrepair(mp, tp, dqp, firstid, &bp);
+ }
- /*
- * A simple sanity check in case we got a corrupted dquot...
- */
- error = xfs_qm_dqcheck(mp, ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES,
- flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN),
- "dqtobp");
- if (error) {
- if (!(flags & XFS_QMOPT_DQREPAIR)) {
- xfs_trans_brelse(tp, bp);
- return XFS_ERROR(EIO);
+ if (error) {
+ ASSERT(bp == NULL);
+ return XFS_ERROR(error);
}
}
+ ASSERT(xfs_buf_islocked(bp));
*O_bpp = bp;
- *O_ddpp = ddq;
+ *O_ddpp = bp->b_addr + dqp->q_bufoffset;
return (0);
}
@@ -920,7 +1008,7 @@ xfs_qm_dqflush(
* Get the buffer containing the on-disk dquot
*/
error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno,
- mp->m_quotainfo->qi_dqchunklen, 0, &bp);
+ mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL);
if (error)
goto out_unlock;
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 7d20af27346d..c694a8469c4a 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -161,4 +161,6 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
return dqp;
}
+extern const struct xfs_buf_ops xfs_dquot_buf_ops;
+
#endif /* __XFS_DQUOT_H__ */
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 8c6d1d70278c..a83611849cee 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -29,6 +29,7 @@
#include "xfs_inode.h"
#include "xfs_inode_item.h"
#include "xfs_trace.h"
+#include "xfs_icache.h"
/*
* Note that we only accept fileids which are long enough rather than allow
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index aa473fa640a2..67284edb84d7 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -31,6 +31,8 @@
#include "xfs_error.h"
#include "xfs_vnodeops.h"
#include "xfs_da_btree.h"
+#include "xfs_dir2_format.h"
+#include "xfs_dir2_priv.h"
#include "xfs_ioctl.h"
#include "xfs_trace.h"
@@ -84,7 +86,7 @@ xfs_rw_ilock_demote(
* valid before the operation, it will be read from disk before
* being partially zeroed.
*/
-STATIC int
+int
xfs_iozero(
struct xfs_inode *ip, /* inode */
loff_t pos, /* offset in file */
@@ -255,15 +257,14 @@ xfs_file_aio_read(
xfs_buftarg_t *target =
XFS_IS_REALTIME_INODE(ip) ?
mp->m_rtdev_targp : mp->m_ddev_targp;
- if ((iocb->ki_pos & target->bt_smask) ||
- (size & target->bt_smask)) {
- if (iocb->ki_pos == i_size_read(inode))
+ if ((pos & target->bt_smask) || (size & target->bt_smask)) {
+ if (pos == i_size_read(inode))
return 0;
return -XFS_ERROR(EINVAL);
}
}
- n = mp->m_super->s_maxbytes - iocb->ki_pos;
+ n = mp->m_super->s_maxbytes - pos;
if (n <= 0 || size == 0)
return 0;
@@ -289,20 +290,21 @@ xfs_file_aio_read(
xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
if (inode->i_mapping->nrpages) {
- ret = -xfs_flushinval_pages(ip,
- (iocb->ki_pos & PAGE_CACHE_MASK),
- -1, FI_REMAPF_LOCKED);
+ ret = -filemap_write_and_wait_range(
+ VFS_I(ip)->i_mapping,
+ pos, -1);
if (ret) {
xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
return ret;
}
+ truncate_pagecache_range(VFS_I(ip), pos, -1);
}
xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
}
- trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags);
+ trace_xfs_file_read(ip, size, pos, ioflags);
- ret = generic_file_aio_read(iocb, iovp, nr_segs, iocb->ki_pos);
+ ret = generic_file_aio_read(iocb, iovp, nr_segs, pos);
if (ret > 0)
XFS_STATS_ADD(xs_read_bytes, ret);
@@ -670,10 +672,11 @@ xfs_file_dio_aio_write(
goto out;
if (mapping->nrpages) {
- ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1,
- FI_REMAPF_LOCKED);
+ ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+ pos, -1);
if (ret)
goto out;
+ truncate_pagecache_range(VFS_I(ip), pos, -1);
}
/*
@@ -728,16 +731,17 @@ xfs_file_buffered_aio_write(
write_retry:
trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
ret = generic_file_buffered_write(iocb, iovp, nr_segs,
- pos, &iocb->ki_pos, count, ret);
+ pos, &iocb->ki_pos, count, 0);
+
/*
- * if we just got an ENOSPC, flush the inode now we aren't holding any
- * page locks and retry *once*
+ * If we just got an ENOSPC, try to write back all dirty inodes to
+ * convert delalloc space to free up some of the excess reserved
+ * metadata space.
*/
if (ret == -ENOSPC && !enospc) {
enospc = 1;
- ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
- if (!ret)
- goto write_retry;
+ xfs_flush_inodes(ip->i_mount);
+ goto write_retry;
}
current->backing_dev_info = NULL;
@@ -889,7 +893,7 @@ xfs_dir_open(
*/
mode = xfs_ilock_map_shared(ip);
if (ip->i_d.di_nextents > 0)
- xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK);
+ xfs_dir2_data_readahead(NULL, ip, 0, -1);
xfs_iunlock(ip, mode);
return 0;
}
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h
index c13fed8c394a..6dda3f949b04 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/xfs_fs.h
@@ -233,7 +233,8 @@ typedef struct xfs_fsop_resblks {
#define XFS_FSOP_GEOM_FLAGS_LOGV2 0x0100 /* log format version 2 */
#define XFS_FSOP_GEOM_FLAGS_SECTOR 0x0200 /* sector sizes >1BB */
#define XFS_FSOP_GEOM_FLAGS_ATTR2 0x0400 /* inline attributes rework */
-#define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */
+#define XFS_FSOP_GEOM_FLAGS_PROJID32 0x0800 /* 32-bit project IDs */
+#define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */
#define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */
@@ -339,6 +340,35 @@ typedef struct xfs_error_injection {
/*
+ * Speculative preallocation trimming.
+ */
+#define XFS_EOFBLOCKS_VERSION 1
+struct xfs_eofblocks {
+ __u32 eof_version; /* presumably XFS_EOFBLOCKS_VERSION - confirm in the ioctl handler */
+ __u32 eof_flags; /* XFS_EOF_FLAGS_* bits, see below */
+ uid_t eof_uid; /* uid to match when XFS_EOF_FLAGS_UID is set */
+ gid_t eof_gid; /* gid to match when XFS_EOF_FLAGS_GID is set */
+ prid_t eof_prid; /* project id to match when XFS_EOF_FLAGS_PRID is set */
+ __u32 pad32; /* padding (by name) */
+ __u64 eof_min_file_size; /* size filter for XFS_EOF_FLAGS_MINFILESIZE */
+ __u64 pad64[12]; /* padding (by name) */
+};
+
+/* eof_flags values */
+#define XFS_EOF_FLAGS_SYNC (1 << 0) /* sync/wait mode scan */
+#define XFS_EOF_FLAGS_UID (1 << 1) /* filter by uid */
+#define XFS_EOF_FLAGS_GID (1 << 2) /* filter by gid */
+#define XFS_EOF_FLAGS_PRID (1 << 3) /* filter by project id */
+#define XFS_EOF_FLAGS_MINFILESIZE (1 << 4) /* filter by min file size */
+/* union of every flag bit defined above */
+#define XFS_EOF_FLAGS_VALID \
+ (XFS_EOF_FLAGS_SYNC | \
+ XFS_EOF_FLAGS_UID | \
+ XFS_EOF_FLAGS_GID | \
+ XFS_EOF_FLAGS_PRID | \
+ XFS_EOF_FLAGS_MINFILESIZE)
+
+
+/*
* The user-level Handle Request interface structure.
*/
typedef struct xfs_fsop_handlereq {
@@ -456,6 +486,7 @@ typedef struct xfs_handle {
/* XFS_IOC_GETBIOSIZE ---- deprecated 47 */
#define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap)
#define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64)
+#define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_eofblocks)
/*
* ioctl commands that replace IRIX syssgi()'s
diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c
deleted file mode 100644
index 652b875a9d4c..000000000000
--- a/fs/xfs/xfs_fs_subr.c
+++ /dev/null
@@ -1,96 +0,0 @@
-/*
- * Copyright (c) 2000-2002,2005-2006 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-#include "xfs_vnodeops.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_inode.h"
-#include "xfs_trace.h"
-
-/*
- * note: all filemap functions return negative error codes. These
- * need to be inverted before returning to the xfs core functions.
- */
-void
-xfs_tosspages(
- xfs_inode_t *ip,
- xfs_off_t first,
- xfs_off_t last,
- int fiopt)
-{
- /* can't toss partial tail pages, so mask them out */
- last &= ~(PAGE_SIZE - 1);
- truncate_inode_pages_range(VFS_I(ip)->i_mapping, first, last - 1);
-}
-
-int
-xfs_flushinval_pages(
- xfs_inode_t *ip,
- xfs_off_t first,
- xfs_off_t last,
- int fiopt)
-{
- struct address_space *mapping = VFS_I(ip)->i_mapping;
- int ret = 0;
-
- trace_xfs_pagecache_inval(ip, first, last);
-
- xfs_iflags_clear(ip, XFS_ITRUNCATED);
- ret = filemap_write_and_wait_range(mapping, first,
- last == -1 ? LLONG_MAX : last);
- if (!ret)
- truncate_inode_pages_range(mapping, first, last);
- return -ret;
-}
-
-int
-xfs_flush_pages(
- xfs_inode_t *ip,
- xfs_off_t first,
- xfs_off_t last,
- uint64_t flags,
- int fiopt)
-{
- struct address_space *mapping = VFS_I(ip)->i_mapping;
- int ret = 0;
- int ret2;
-
- xfs_iflags_clear(ip, XFS_ITRUNCATED);
- ret = -filemap_fdatawrite_range(mapping, first,
- last == -1 ? LLONG_MAX : last);
- if (flags & XBF_ASYNC)
- return ret;
- ret2 = xfs_wait_on_pages(ip, first, last);
- if (!ret)
- ret = ret2;
- return ret;
-}
-
-int
-xfs_wait_on_pages(
- xfs_inode_t *ip,
- xfs_off_t first,
- xfs_off_t last)
-{
- struct address_space *mapping = VFS_I(ip)->i_mapping;
-
- if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) {
- return -filemap_fdatawait_range(mapping, first,
- last == -1 ? XFS_ISIZE(ip) - 1 : last);
- }
- return 0;
-}
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index c25b094efbf7..94eaeedc5498 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -97,7 +97,9 @@ xfs_fs_geometry(
(xfs_sb_version_haslazysbcount(&mp->m_sb) ?
XFS_FSOP_GEOM_FLAGS_LAZYSB : 0) |
(xfs_sb_version_hasattr2(&mp->m_sb) ?
- XFS_FSOP_GEOM_FLAGS_ATTR2 : 0);
+ XFS_FSOP_GEOM_FLAGS_ATTR2 : 0) |
+ (xfs_sb_version_hasprojid32bit(&mp->m_sb) ?
+ XFS_FSOP_GEOM_FLAGS_PROJID32 : 0);
geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ?
mp->m_sb.sb_logsectsize : BBSIZE;
geo->rtsectsize = mp->m_sb.sb_blocksize;
@@ -112,18 +114,40 @@ xfs_fs_geometry(
return 0;
}
+/*
+ * Get a zeroed, uncached buffer for a header block that growfs is about to
+ * initialise and write.
+ *
+ * @mp:      mount; the buffer is taken from mp->m_ddev_targp
+ * @blkno:   disk address stored in the buffer
+ * @numblks: length handed to xfs_buf_get_uncached()
+ * @flags:   flags handed to xfs_buf_get_uncached()
+ * @ops:     verifier table attached as bp->b_ops
+ *
+ * Returns the buffer, or NULL if xfs_buf_get_uncached() fails.
+ */
+static struct xfs_buf *
+xfs_growfs_get_hdr_buf(
+ struct xfs_mount *mp,
+ xfs_daddr_t blkno,
+ size_t numblks,
+ int flags,
+ const struct xfs_buf_ops *ops)
+{
+ struct xfs_buf *bp;
+
+ bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags);
+ if (!bp)
+ return NULL;
+
+ /* clear the whole buffer, then record its disk address in both places */
+ xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+ bp->b_bn = blkno;
+ bp->b_maps[0].bm_bn = blkno;
+ bp->b_ops = ops;
+
+ return bp;
+}
+
static int
xfs_growfs_data_private(
xfs_mount_t *mp, /* mount point for filesystem */
xfs_growfs_data_t *in) /* growfs data input struct */
{
xfs_agf_t *agf;
+ struct xfs_agfl *agfl;
xfs_agi_t *agi;
xfs_agnumber_t agno;
xfs_extlen_t agsize;
xfs_extlen_t tmpsize;
xfs_alloc_rec_t *arec;
- struct xfs_btree_block *block;
xfs_buf_t *bp;
int bucket;
int dpct;
@@ -146,9 +170,14 @@ xfs_growfs_data_private(
dpct = pct - mp->m_sb.sb_imax_pct;
bp = xfs_buf_read_uncached(mp->m_ddev_targp,
XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1),
- XFS_FSS_TO_BB(mp, 1), 0);
+ XFS_FSS_TO_BB(mp, 1), 0, NULL);
if (!bp)
return EIO;
+ if (bp->b_error) {
+ int error = bp->b_error;
+ xfs_buf_relse(bp);
+ return error;
+ }
xfs_buf_relse(bp);
new = nb; /* use new as a temporary here */
@@ -186,17 +215,18 @@ xfs_growfs_data_private(
nfree = 0;
for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) {
/*
- * AG freelist header block
+ * AG freespace header block
*/
- bp = xfs_buf_get(mp->m_ddev_targp,
- XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), 0);
+ bp = xfs_growfs_get_hdr_buf(mp,
+ XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0,
+ &xfs_agf_buf_ops);
if (!bp) {
error = ENOMEM;
goto error0;
}
+
agf = XFS_BUF_TO_AGF(bp);
- memset(agf, 0, mp->m_sb.sb_sectsize);
agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC);
agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION);
agf->agf_seqno = cpu_to_be32(agno);
@@ -223,17 +253,39 @@ xfs_growfs_data_private(
goto error0;
/*
+ * AG freelist header block
+ */
+ bp = xfs_growfs_get_hdr_buf(mp,
+ XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0,
+ &xfs_agfl_buf_ops);
+ if (!bp) {
+ error = ENOMEM;
+ goto error0;
+ }
+
+ agfl = XFS_BUF_TO_AGFL(bp);
+ for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++)
+ agfl->agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK);
+
+ error = xfs_bwrite(bp);
+ xfs_buf_relse(bp);
+ if (error)
+ goto error0;
+
+ /*
* AG inode header block
*/
- bp = xfs_buf_get(mp->m_ddev_targp,
- XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), 0);
+ bp = xfs_growfs_get_hdr_buf(mp,
+ XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0,
+ &xfs_agi_buf_ops);
if (!bp) {
error = ENOMEM;
goto error0;
}
+
agi = XFS_BUF_TO_AGI(bp);
- memset(agi, 0, mp->m_sb.sb_sectsize);
agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC);
agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION);
agi->agi_seqno = cpu_to_be32(agno);
@@ -254,24 +306,22 @@ xfs_growfs_data_private(
/*
* BNO btree root block
*/
- bp = xfs_buf_get(mp->m_ddev_targp,
- XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
- BTOBB(mp->m_sb.sb_blocksize), 0);
+ bp = xfs_growfs_get_hdr_buf(mp,
+ XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)),
+ BTOBB(mp->m_sb.sb_blocksize), 0,
+ &xfs_allocbt_buf_ops);
+
if (!bp) {
error = ENOMEM;
goto error0;
}
- block = XFS_BUF_TO_BLOCK(bp);
- memset(block, 0, mp->m_sb.sb_blocksize);
- block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC);
- block->bb_level = 0;
- block->bb_numrecs = cpu_to_be16(1);
- block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
- block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
- arec = XFS_ALLOC_REC_ADDR(mp, block, 1);
+
+ xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, 0);
+ arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
arec->ar_blockcount = cpu_to_be32(
agsize - be32_to_cpu(arec->ar_startblock));
+
error = xfs_bwrite(bp);
xfs_buf_relse(bp);
if (error)
@@ -280,25 +330,22 @@ xfs_growfs_data_private(
/*
* CNT btree root block
*/
- bp = xfs_buf_get(mp->m_ddev_targp,
- XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
- BTOBB(mp->m_sb.sb_blocksize), 0);
+ bp = xfs_growfs_get_hdr_buf(mp,
+ XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)),
+ BTOBB(mp->m_sb.sb_blocksize), 0,
+ &xfs_allocbt_buf_ops);
if (!bp) {
error = ENOMEM;
goto error0;
}
- block = XFS_BUF_TO_BLOCK(bp);
- memset(block, 0, mp->m_sb.sb_blocksize);
- block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC);
- block->bb_level = 0;
- block->bb_numrecs = cpu_to_be16(1);
- block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
- block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
- arec = XFS_ALLOC_REC_ADDR(mp, block, 1);
+
+ xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, 0);
+ arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1);
arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp));
arec->ar_blockcount = cpu_to_be32(
agsize - be32_to_cpu(arec->ar_startblock));
nfree += be32_to_cpu(arec->ar_blockcount);
+
error = xfs_bwrite(bp);
xfs_buf_relse(bp);
if (error)
@@ -307,20 +354,17 @@ xfs_growfs_data_private(
/*
* INO btree root block
*/
- bp = xfs_buf_get(mp->m_ddev_targp,
- XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
- BTOBB(mp->m_sb.sb_blocksize), 0);
+ bp = xfs_growfs_get_hdr_buf(mp,
+ XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)),
+ BTOBB(mp->m_sb.sb_blocksize), 0,
+ &xfs_inobt_buf_ops);
if (!bp) {
error = ENOMEM;
goto error0;
}
- block = XFS_BUF_TO_BLOCK(bp);
- memset(block, 0, mp->m_sb.sb_blocksize);
- block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC);
- block->bb_level = 0;
- block->bb_numrecs = 0;
- block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
- block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
+
+ xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0, 0);
+
error = xfs_bwrite(bp);
xfs_buf_relse(bp);
if (error)
@@ -399,9 +443,28 @@ xfs_growfs_data_private(
/* update secondary superblocks. */
for (agno = 1; agno < nagcount; agno++) {
- error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
+ error = 0;
+ /*
+ * new secondary superblocks need to be zeroed, not read from
+ * disk as the contents of the new area we are growing into are
+ * completely unknown.
+ */
+ if (agno < oagcount) {
+ error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
+ XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
+ XFS_FSS_TO_BB(mp, 1), 0, &bp,
+ &xfs_sb_buf_ops);
+ } else {
+ bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp,
XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)),
- XFS_FSS_TO_BB(mp, 1), 0, &bp);
+ XFS_FSS_TO_BB(mp, 1), 0);
+ if (bp) {
+ bp->b_ops = &xfs_sb_buf_ops;
+ xfs_buf_zero(bp, 0, BBTOB(bp->b_length));
+ } else
+ error = ENOMEM;
+ }
+
if (error) {
xfs_warn(mp,
"error %d reading secondary superblock for ag %d",
@@ -409,6 +472,7 @@ xfs_growfs_data_private(
break;
}
xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, XFS_SB_ALL_BITS);
+
/*
* If we get an error writing out the alternate superblocks,
* just issue a warning and continue. The real work is
@@ -423,7 +487,7 @@ xfs_growfs_data_private(
break; /* no point in continuing */
}
}
- return 0;
+ return error;
error0:
xfs_trans_cancel(tp, XFS_TRANS_ABORT);
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index 76e81cff70b9..5399ef222dd7 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -21,7 +21,8 @@
/*
* Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n,
* other XFS code uses these values. Times are measured in centisecs (i.e.
- * 100ths of a second).
+ * 100ths of a second) with the exception of eofb_timer, which is measured in
+ * seconds.
*/
xfs_param_t xfs_params = {
/* MIN DFLT MAX */
@@ -40,4 +41,5 @@ xfs_param_t xfs_params = {
.rotorstep = { 1, 1, 255 },
.inherit_nodfrg = { 0, 1, 1 },
.fstrm_timer = { 1, 30*100, 3600*100},
+ .eofb_timer = { 1, 300, 3600*24},
};
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c
index 445bf1aef31c..a815412eab80 100644
--- a/fs/xfs/xfs_ialloc.c
+++ b/fs/xfs/xfs_ialloc.c
@@ -200,7 +200,8 @@ xfs_ialloc_inode_init(
*/
d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster));
fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d,
- mp->m_bsize * blks_per_cluster, 0);
+ mp->m_bsize * blks_per_cluster,
+ XBF_UNMAPPED);
if (!fbuf)
return ENOMEM;
/*
@@ -210,6 +211,7 @@ xfs_ialloc_inode_init(
* to log a whole cluster of inodes instead of all the
* individual transactions causing a lot of log traffic.
*/
+ fbuf->b_ops = &xfs_inode_buf_ops;
xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog);
for (i = 0; i < ninodes; i++) {
int ioffset = i << mp->m_sb.sb_inodelog;
@@ -250,6 +252,7 @@ xfs_ialloc_ag_alloc(
/* boundary */
struct xfs_perag *pag;
+ memset(&args, 0, sizeof(args));
args.tp = tp;
args.mp = tp->t_mountp;
@@ -876,9 +879,9 @@ error0:
* This function is designed to be called twice if it has to do an allocation
* to make more free inodes. On the first call, *IO_agbp should be set to NULL.
* If an inode is available without having to performn an allocation, an inode
- * number is returned. In this case, *IO_agbp would be NULL. If an allocation
- * needes to be done, xfs_dialloc would return the current AGI buffer in
- * *IO_agbp. The caller should then commit the current transaction, allocate a
+ * number is returned. In this case, *IO_agbp is set to NULL. If an allocation
+ * needs to be done, xfs_dialloc returns the current AGI buffer in *IO_agbp.
+ * The caller should then commit the current transaction, allocate a
* new transaction, and call xfs_dialloc() again, passing in the previous value
* of *IO_agbp. IO_agbp should be held across the transactions. Since the AGI
* buffer is locked across the two calls, the second call is guaranteed to have
@@ -1471,6 +1474,57 @@ xfs_check_agi_unlinked(
#define xfs_check_agi_unlinked(agi)
#endif
+/*
+ * Check an AGI header buffer.
+ *
+ * @bp: buffer holding the AGI (accessed through XFS_BUF_TO_AGI()).
+ *
+ * Returns nothing; if the checks fail (or XFS_TEST_ERROR() fires for the
+ * XFS_ERRTAG_IALLOC_READ_AGI tag) the buffer is flagged with EFSCORRUPTED.
+ * xfs_check_agi_unlinked() is invoked in every case; it expands to nothing
+ * in non-DEBUG builds.
+ */
+static void
+xfs_agi_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_agi *agi = XFS_BUF_TO_AGI(bp);
+ int agi_ok;
+
+ /*
+ * Validate the magic number of the agi block.
+ */
+ agi_ok = agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC) &&
+ XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum));
+
+ /*
+ * during growfs operations, the perag is not fully initialised,
+ * so we can't use it for any useful checking. growfs ensures we can't
+ * use it by using uncached buffers that don't have the perag attached
+ * so we can detect and avoid this problem.
+ */
+ if (bp->b_pag)
+ agi_ok = agi_ok && be32_to_cpu(agi->agi_seqno) ==
+ bp->b_pag->pag_agno;
+
+ if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
+ XFS_RANDOM_IALLOC_READ_AGI))) {
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
+ xfs_check_agi_unlinked(agi);
+}
+
+/* Read-side hook of xfs_agi_buf_ops: runs the common AGI check. */
+static void
+xfs_agi_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_agi_verify(bp);
+}
+
+/* Write-side hook of xfs_agi_buf_ops: runs the same check as the read side. */
+static void
+xfs_agi_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_agi_verify(bp);
+}
+
+/*
+ * Verifier table for AGI header buffers; xfs_read_agi() passes it to
+ * xfs_trans_read_buf(). Declared extern in xfs_ialloc.h.
+ */
+const struct xfs_buf_ops xfs_agi_buf_ops = {
+ .verify_read = xfs_agi_read_verify,
+ .verify_write = xfs_agi_write_verify,
+};
+
/*
* Read in the allocation group header (inode allocation section)
*/
@@ -1481,38 +1535,18 @@ xfs_read_agi(
xfs_agnumber_t agno, /* allocation group number */
struct xfs_buf **bpp) /* allocation group hdr buf */
{
- struct xfs_agi *agi; /* allocation group header */
- int agi_ok; /* agi is consistent */
int error;
ASSERT(agno != NULLAGNUMBER);
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)),
- XFS_FSS_TO_BB(mp, 1), 0, bpp);
+ XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops);
if (error)
return error;
ASSERT(!xfs_buf_geterror(*bpp));
- agi = XFS_BUF_TO_AGI(*bpp);
-
- /*
- * Validate the magic number of the agi block.
- */
- agi_ok = agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC) &&
- XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)) &&
- be32_to_cpu(agi->agi_seqno) == agno;
- if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI,
- XFS_RANDOM_IALLOC_READ_AGI))) {
- XFS_CORRUPTION_ERROR("xfs_read_agi", XFS_ERRLEVEL_LOW,
- mp, agi);
- xfs_trans_brelse(tp, *bpp);
- return XFS_ERROR(EFSCORRUPTED);
- }
-
xfs_buf_set_ref(*bpp, XFS_AGI_REF);
-
- xfs_check_agi_unlinked(agi);
return 0;
}
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h
index 1fd6ea4e9c91..c8da3df271e6 100644
--- a/fs/xfs/xfs_ialloc.h
+++ b/fs/xfs/xfs_ialloc.h
@@ -147,7 +147,9 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino,
/*
* Get the data from the pointed-to record.
*/
-extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
+int xfs_inobt_get_rec(struct xfs_btree_cur *cur,
xfs_inobt_rec_incore_t *rec, int *stat);
+extern const struct xfs_buf_ops xfs_agi_buf_ops;
+
#endif /* __XFS_IALLOC_H__ */
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c
index 2b8b7a37aa18..bec344b36507 100644
--- a/fs/xfs/xfs_ialloc_btree.c
+++ b/fs/xfs/xfs_ialloc_btree.c
@@ -33,6 +33,7 @@
#include "xfs_ialloc.h"
#include "xfs_alloc.h"
#include "xfs_error.h"
+#include "xfs_trace.h"
STATIC int
@@ -181,6 +182,59 @@ xfs_inobt_key_diff(
cur->bc_rec.i.ir_startino;
}
+/*
+ * Check an inode btree block held in @bp.
+ *
+ * Verifies the magic number, that the level is below mp->m_in_maxlevels,
+ * that the record count fits the limit for that level, and that both
+ * sibling pointers are plausible. Returns nothing; on failure the buffer is
+ * flagged with EFSCORRUPTED and a corruption trace/report is emitted.
+ *
+ * NOTE(review): added without 'static' and no prototype is visible in this
+ * part of the patch - confirm whether external linkage is intended.
+ */
+void
+xfs_inobt_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp);
+ unsigned int level;
+ int sblock_ok; /* block passes checks */
+
+ /* magic number and level verification */
+ level = be16_to_cpu(block->bb_level);
+ sblock_ok = block->bb_magic == cpu_to_be32(XFS_IBT_MAGIC) &&
+ level < mp->m_in_maxlevels;
+
+ /*
+ * numrecs verification: m_inobt_mxr[] is indexed with 0 for level 0
+ * and 1 for any higher level.
+ */
+ sblock_ok = sblock_ok &&
+ be16_to_cpu(block->bb_numrecs) <= mp->m_inobt_mxr[level != 0];
+
+ /*
+ * sibling pointer verification: each pointer must be NULLAGBLOCK or
+ * lie below sb_agblocks, and must also be non-zero.
+ */
+ sblock_ok = sblock_ok &&
+ (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) ||
+ be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) &&
+ block->bb_u.s.bb_leftsib &&
+ (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) ||
+ be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) &&
+ block->bb_u.s.bb_rightsib;
+
+ if (!sblock_ok) {
+ trace_xfs_btree_corrupt(bp, _RET_IP_);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block);
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ }
+}
+
+/* Read-side hook of xfs_inobt_buf_ops: runs the common inobt block check. */
+static void
+xfs_inobt_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_inobt_verify(bp);
+}
+
+/* Write-side hook of xfs_inobt_buf_ops: runs the same check as the read side. */
+static void
+xfs_inobt_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_inobt_verify(bp);
+}
+
+/*
+ * Verifier table for inode btree blocks; referenced from xfs_inobt_ops via
+ * .buf_ops and declared extern in xfs_ialloc_btree.h.
+ */
+const struct xfs_buf_ops xfs_inobt_buf_ops = {
+ .verify_read = xfs_inobt_read_verify,
+ .verify_write = xfs_inobt_write_verify,
+};
+
#ifdef DEBUG
STATIC int
xfs_inobt_keys_inorder(
@@ -218,6 +272,7 @@ static const struct xfs_btree_ops xfs_inobt_ops = {
.init_rec_from_cur = xfs_inobt_init_rec_from_cur,
.init_ptr_from_cur = xfs_inobt_init_ptr_from_cur,
.key_diff = xfs_inobt_key_diff,
+ .buf_ops = &xfs_inobt_buf_ops,
#ifdef DEBUG
.keys_inorder = xfs_inobt_keys_inorder,
.recs_inorder = xfs_inobt_recs_inorder,
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h
index f782ad0c4769..25c0239a8eab 100644
--- a/fs/xfs/xfs_ialloc_btree.h
+++ b/fs/xfs/xfs_ialloc_btree.h
@@ -109,4 +109,6 @@ extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *,
struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t);
extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int);
+extern const struct xfs_buf_ops xfs_inobt_buf_ops;
+
#endif /* __XFS_IALLOC_BTREE_H__ */
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_icache.c
index 9500caf15acf..96e344e3e927 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_icache.c
@@ -19,6 +19,7 @@
#include "xfs_fs.h"
#include "xfs_types.h"
#include "xfs_log.h"
+#include "xfs_log_priv.h"
#include "xfs_inum.h"
#include "xfs_trans.h"
#include "xfs_trans_priv.h"
@@ -35,11 +36,425 @@
#include "xfs_quota.h"
#include "xfs_trace.h"
#include "xfs_fsops.h"
+#include "xfs_icache.h"
#include <linux/kthread.h>
#include <linux/freezer.h>
-struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */
+STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp,
+ struct xfs_perag *pag, struct xfs_inode *ip);
+
+/*
+ * Allocate and initialise an xfs_inode.
+ */
+STATIC struct xfs_inode *
+xfs_inode_alloc(
+ struct xfs_mount *mp,
+ xfs_ino_t ino)
+{
+ struct xfs_inode *ip;
+
+ /*
+ * if this didn't occur in transactions, we could use
+ * KM_MAYFAIL and return NULL here on ENOMEM. Set the
+ * code up to do this anyway.
+ */
+ ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
+ if (!ip)
+ return NULL;
+ if (inode_init_always(mp->m_super, VFS_I(ip))) {
+ kmem_zone_free(xfs_inode_zone, ip);
+ return NULL;
+ }
+
+ ASSERT(atomic_read(&ip->i_pincount) == 0);
+ ASSERT(!spin_is_locked(&ip->i_flags_lock));
+ ASSERT(!xfs_isiflocked(ip));
+ ASSERT(ip->i_ino == 0);
+
+ mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+
+ /* initialise the xfs inode */
+ ip->i_ino = ino;
+ ip->i_mount = mp;
+ memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
+ ip->i_afp = NULL;
+ memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
+ ip->i_flags = 0;
+ ip->i_delayed_blks = 0;
+ memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
+
+ return ip;
+}
+
+STATIC void
+xfs_inode_free_callback(
+ struct rcu_head *head)
+{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
+ struct xfs_inode *ip = XFS_I(inode);
+
+ kmem_zone_free(xfs_inode_zone, ip);
+}
+
+STATIC void
+xfs_inode_free(
+ struct xfs_inode *ip)
+{
+ switch (ip->i_d.di_mode & S_IFMT) {
+ case S_IFREG:
+ case S_IFDIR:
+ case S_IFLNK:
+ xfs_idestroy_fork(ip, XFS_DATA_FORK);
+ break;
+ }
+
+ if (ip->i_afp)
+ xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+
+ if (ip->i_itemp) {
+ ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL));
+ xfs_inode_item_destroy(ip);
+ ip->i_itemp = NULL;
+ }
+
+ /* asserts to verify all state is correct here */
+ ASSERT(atomic_read(&ip->i_pincount) == 0);
+ ASSERT(!spin_is_locked(&ip->i_flags_lock));
+ ASSERT(!xfs_isiflocked(ip));
+
+ /*
+ * Because we use RCU freeing we need to ensure the inode always
+ * appears to be reclaimed with an invalid inode number when in the
+ * free state. The ip->i_flags_lock provides the barrier against lookup
+ * races.
+ */
+ spin_lock(&ip->i_flags_lock);
+ ip->i_flags = XFS_IRECLAIM;
+ ip->i_ino = 0;
+ spin_unlock(&ip->i_flags_lock);
+
+ call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
+}
+
+/*
+ * Check the validity of the inode we just found in the cache
+ */
+static int
+xfs_iget_cache_hit(
+ struct xfs_perag *pag,
+ struct xfs_inode *ip,
+ xfs_ino_t ino,
+ int flags,
+ int lock_flags) __releases(RCU)
+{
+ struct inode *inode = VFS_I(ip);
+ struct xfs_mount *mp = ip->i_mount;
+ int error;
+
+ /*
+ * check for re-use of an inode within an RCU grace period due to the
+ * radix tree nodes not being updated yet. We monitor for this by
+ * setting the inode number to zero before freeing the inode structure.
+ * If the inode has been reallocated and set up, then the inode number
+ * will not match, so check for that, too.
+ */
+ spin_lock(&ip->i_flags_lock);
+ if (ip->i_ino != ino) {
+ trace_xfs_iget_skip(ip);
+ XFS_STATS_INC(xs_ig_frecycle);
+ error = EAGAIN;
+ goto out_error;
+ }
+
+
+ /*
+ * If we are racing with another cache hit that is currently
+ * instantiating this inode or currently recycling it out of
+ * reclaimable state, wait for the initialisation to complete
+ * before continuing.
+ *
+ * XXX(hch): eventually we should do something equivalent to
+ * wait_on_inode to wait for these flags to be cleared
+ * instead of polling for it.
+ */
+ if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
+ trace_xfs_iget_skip(ip);
+ XFS_STATS_INC(xs_ig_frecycle);
+ error = EAGAIN;
+ goto out_error;
+ }
+
+ /*
+ * If lookup is racing with unlink return an error immediately.
+ */
+ if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
+ error = ENOENT;
+ goto out_error;
+ }
+
+ /*
+ * If IRECLAIMABLE is set, we've torn down the VFS inode already.
+ * Need to carefully get it back into useable state.
+ */
+ if (ip->i_flags & XFS_IRECLAIMABLE) {
+ trace_xfs_iget_reclaim(ip);
+
+ /*
+ * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
+ * from stomping over us while we recycle the inode. We can't
+ * clear the radix tree reclaimable tag yet as it requires
+ * pag_ici_lock to be held exclusive.
+ */
+ ip->i_flags |= XFS_IRECLAIM;
+
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
+
+ error = -inode_init_always(mp->m_super, inode);
+ if (error) {
+ /*
+ * Re-initializing the inode failed, and we are in deep
+ * trouble. Try to re-add it to the reclaim list.
+ */
+ rcu_read_lock();
+ spin_lock(&ip->i_flags_lock);
+
+ ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
+ ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
+ trace_xfs_iget_reclaim_fail(ip);
+ goto out_error;
+ }
+
+ spin_lock(&pag->pag_ici_lock);
+ spin_lock(&ip->i_flags_lock);
+
+ /*
+ * Clear the per-lifetime state in the inode as we are now
+ * effectively a new inode and need to return to the initial
+ * state before reuse occurs.
+ */
+ ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
+ ip->i_flags |= XFS_INEW;
+ __xfs_inode_clear_reclaim_tag(mp, pag, ip);
+ inode->i_state = I_NEW;
+
+ ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
+ mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
+
+ spin_unlock(&ip->i_flags_lock);
+ spin_unlock(&pag->pag_ici_lock);
+ } else {
+ /* If the VFS inode is being torn down, pause and try again. */
+ if (!igrab(inode)) {
+ trace_xfs_iget_skip(ip);
+ error = EAGAIN;
+ goto out_error;
+ }
+
+ /* We've got a live one. */
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
+ trace_xfs_iget_hit(ip);
+ }
+
+ if (lock_flags != 0)
+ xfs_ilock(ip, lock_flags);
+
+ xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
+ XFS_STATS_INC(xs_ig_found);
+
+ return 0;
+
+out_error:
+ spin_unlock(&ip->i_flags_lock);
+ rcu_read_unlock();
+ return error;
+}
+
+
+static int
+xfs_iget_cache_miss(
+ struct xfs_mount *mp,
+ struct xfs_perag *pag,
+ xfs_trans_t *tp,
+ xfs_ino_t ino,
+ struct xfs_inode **ipp,
+ int flags,
+ int lock_flags)
+{
+ struct xfs_inode *ip;
+ int error;
+ xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino);
+ int iflags;
+
+ ip = xfs_inode_alloc(mp, ino);
+ if (!ip)
+ return ENOMEM;
+
+ error = xfs_iread(mp, tp, ip, flags);
+ if (error)
+ goto out_destroy;
+
+ trace_xfs_iget_miss(ip);
+
+ if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
+ error = ENOENT;
+ goto out_destroy;
+ }
+
+ /*
+ * Preload the radix tree so we can insert safely under the
+ * write spinlock. Note that we cannot sleep inside the preload
+ * region. Since we can be called from transaction context, don't
+ * recurse into the file system.
+ */
+ if (radix_tree_preload(GFP_NOFS)) {
+ error = EAGAIN;
+ goto out_destroy;
+ }
+
+ /*
+ * Because the inode hasn't been added to the radix-tree yet it can't
+ * be found by another thread, so we can do the non-sleeping lock here.
+ */
+ if (lock_flags) {
+ if (!xfs_ilock_nowait(ip, lock_flags))
+ BUG();
+ }
+
+ /*
+ * These values must be set before inserting the inode into the radix
+ * tree as the moment it is inserted a concurrent lookup (allowed by the
+ * RCU locking mechanism) can find it and that lookup must see that this
+ * is an inode currently under construction (i.e. that XFS_INEW is set).
+ * The ip->i_flags_lock that protects the XFS_INEW flag forms the
+ * memory barrier that ensures this detection works correctly at lookup
+ * time.
+ */
+ iflags = XFS_INEW;
+ if (flags & XFS_IGET_DONTCACHE)
+ iflags |= XFS_IDONTCACHE;
+ ip->i_udquot = ip->i_gdquot = NULL;
+ xfs_iflags_set(ip, iflags);
+
+ /* insert the new inode */
+ spin_lock(&pag->pag_ici_lock);
+ error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
+ if (unlikely(error)) {
+ WARN_ON(error != -EEXIST);
+ XFS_STATS_INC(xs_ig_dup);
+ error = EAGAIN;
+ goto out_preload_end;
+ }
+ spin_unlock(&pag->pag_ici_lock);
+ radix_tree_preload_end();
+
+ *ipp = ip;
+ return 0;
+
+out_preload_end:
+ spin_unlock(&pag->pag_ici_lock);
+ radix_tree_preload_end();
+ if (lock_flags)
+ xfs_iunlock(ip, lock_flags);
+out_destroy:
+ __destroy_inode(VFS_I(ip));
+ xfs_inode_free(ip);
+ return error;
+}
+
+/*
+ * Look up an inode by number in the given file system.
+ * The inode is looked up in the cache held in each AG.
+ * If the inode is found in the cache, initialise the vfs inode
+ * if necessary.
+ *
+ * If it is not in core, read it in from the file system's device,
+ * add it to the cache and initialise the vfs inode.
+ *
+ * The inode is locked according to the value of the lock_flags parameter.
+ * This flag parameter indicates how and if the inode's IO lock and inode lock
+ * should be taken.
+ *
+ * mp -- the mount point structure for the current file system. It points
+ * to the inode hash table.
+ * tp -- a pointer to the current transaction if there is one. This is
+ * simply passed through to the xfs_iread() call.
+ * ino -- the number of the inode desired. This is the unique identifier
+ * within the file system for the inode being requested.
+ * lock_flags -- flags indicating how to lock the inode. See the comment
+ * for xfs_ilock() for a list of valid values.
+ */
+int
+xfs_iget(
+ xfs_mount_t *mp,
+ xfs_trans_t *tp,
+ xfs_ino_t ino,
+ uint flags,
+ uint lock_flags,
+ xfs_inode_t **ipp)
+{
+ xfs_inode_t *ip;
+ int error;
+ xfs_perag_t *pag;
+ xfs_agino_t agino;
+
+ /*
+ * xfs_reclaim_inode() uses the ILOCK to ensure an inode
+ * doesn't get freed while it's being referenced during a
+ * radix tree traversal here. It assumes this function
+ * acquires only the ILOCK (and therefore it has no need to
+ * involve the IOLOCK in this synchronization).
+ */
+ ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
+
+ /* reject inode numbers outside existing AGs */
+ if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
+ return EINVAL;
+
+ /* get the perag structure and ensure that it's inode capable */
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
+ agino = XFS_INO_TO_AGINO(mp, ino);
+
+again:
+ error = 0;
+ rcu_read_lock();
+ ip = radix_tree_lookup(&pag->pag_ici_root, agino);
+
+ if (ip) {
+ error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
+ if (error)
+ goto out_error_or_again;
+ } else {
+ rcu_read_unlock();
+ XFS_STATS_INC(xs_ig_missed);
+
+ error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
+ flags, lock_flags);
+ if (error)
+ goto out_error_or_again;
+ }
+ xfs_perag_put(pag);
+
+ *ipp = ip;
+
+ /*
+ * If we have a real type for an on-disk inode, we can set ops(&unlock)
+ * now. If it's a new inode being created, xfs_ialloc will handle it.
+ */
+ if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
+ xfs_setup_inode(ip);
+ return 0;
+
+out_error_or_again:
+ if (error == EAGAIN) {
+ delay(1);
+ goto again;
+ }
+ xfs_perag_put(pag);
+ return error;
+}
/*
* The inode lookup is done in batches to keep the amount of lock traffic and
@@ -101,8 +516,11 @@ xfs_inode_ag_walk(
struct xfs_mount *mp,
struct xfs_perag *pag,
int (*execute)(struct xfs_inode *ip,
- struct xfs_perag *pag, int flags),
- int flags)
+ struct xfs_perag *pag, int flags,
+ void *args),
+ int flags,
+ void *args,
+ int tag)
{
uint32_t first_index;
int last_error = 0;
@@ -121,9 +539,17 @@ restart:
int i;
rcu_read_lock();
- nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
+
+ if (tag == -1)
+ nr_found = radix_tree_gang_lookup(&pag->pag_ici_root,
(void **)batch, first_index,
XFS_LOOKUP_BATCH);
+ else
+ nr_found = radix_tree_gang_lookup_tag(
+ &pag->pag_ici_root,
+ (void **) batch, first_index,
+ XFS_LOOKUP_BATCH, tag);
+
if (!nr_found) {
rcu_read_unlock();
break;
@@ -164,7 +590,7 @@ restart:
for (i = 0; i < nr_found; i++) {
if (!batch[i])
continue;
- error = execute(batch[i], pag, flags);
+ error = execute(batch[i], pag, flags, args);
IRELE(batch[i]);
if (error == EAGAIN) {
skipped++;
@@ -189,12 +615,40 @@ restart:
return last_error;
}
+/*
+ * Background scanning to trim post-EOF preallocated space. This is queued
+ * based on the 'background_prealloc_discard_period' tunable (5m by default).
+ */
+STATIC void
+xfs_queue_eofblocks(
+ struct xfs_mount *mp)
+{
+ rcu_read_lock();
+ if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG))
+ queue_delayed_work(mp->m_eofblocks_workqueue,
+ &mp->m_eofblocks_work,
+ msecs_to_jiffies(xfs_eofb_secs * 1000));
+ rcu_read_unlock();
+}
+
+void
+xfs_eofblocks_worker(
+ struct work_struct *work)
+{
+ struct xfs_mount *mp = container_of(to_delayed_work(work),
+ struct xfs_mount, m_eofblocks_work);
+ xfs_icache_free_eofblocks(mp, NULL);
+ xfs_queue_eofblocks(mp);
+}
+
int
xfs_inode_ag_iterator(
struct xfs_mount *mp,
int (*execute)(struct xfs_inode *ip,
- struct xfs_perag *pag, int flags),
- int flags)
+ struct xfs_perag *pag, int flags,
+ void *args),
+ int flags,
+ void *args)
{
struct xfs_perag *pag;
int error = 0;
@@ -204,7 +658,7 @@ xfs_inode_ag_iterator(
ag = 0;
while ((pag = xfs_perag_get(mp, ag))) {
ag = pag->pag_agno + 1;
- error = xfs_inode_ag_walk(mp, pag, execute, flags);
+ error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1);
xfs_perag_put(pag);
if (error) {
last_error = error;
@@ -215,224 +669,50 @@ xfs_inode_ag_iterator(
return XFS_ERROR(last_error);
}
-STATIC int
-xfs_sync_inode_data(
- struct xfs_inode *ip,
- struct xfs_perag *pag,
- int flags)
-{
- struct inode *inode = VFS_I(ip);
- struct address_space *mapping = inode->i_mapping;
- int error = 0;
-
- if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
- return 0;
-
- if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
- if (flags & SYNC_TRYLOCK)
- return 0;
- xfs_ilock(ip, XFS_IOLOCK_SHARED);
- }
-
- error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
- 0 : XBF_ASYNC, FI_NONE);
- xfs_iunlock(ip, XFS_IOLOCK_SHARED);
- return error;
-}
-
-/*
- * Write out pagecache data for the whole filesystem.
- */
-STATIC int
-xfs_sync_data(
- struct xfs_mount *mp,
- int flags)
-{
- int error;
-
- ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
-
- error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags);
- if (error)
- return XFS_ERROR(error);
-
- xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
- return 0;
-}
-
-STATIC int
-xfs_sync_fsdata(
- struct xfs_mount *mp)
-{
- struct xfs_buf *bp;
- int error;
-
- /*
- * If the buffer is pinned then push on the log so we won't get stuck
- * waiting in the write for someone, maybe ourselves, to flush the log.
- *
- * Even though we just pushed the log above, we did not have the
- * superblock buffer locked at that point so it can become pinned in
- * between there and here.
- */
- bp = xfs_getsb(mp, 0);
- if (xfs_buf_ispinned(bp))
- xfs_log_force(mp, 0);
- error = xfs_bwrite(bp);
- xfs_buf_relse(bp);
- return error;
-}
-
-/*
- * When remounting a filesystem read-only or freezing the filesystem, we have
- * two phases to execute. This first phase is syncing the data before we
- * quiesce the filesystem, and the second is flushing all the inodes out after
- * we've waited for all the transactions created by the first phase to
- * complete. The second phase ensures that the inodes are written to their
- * location on disk rather than just existing in transactions in the log. This
- * means after a quiesce there is no log replay required to write the inodes to
- * disk (this is the main difference between a sync and a quiesce).
- */
-/*
- * First stage of freeze - no writers will make progress now we are here,
- * so we flush delwri and delalloc buffers here, then wait for all I/O to
- * complete. Data is frozen at that point. Metadata is not frozen,
- * transactions can still occur here so don't bother emptying the AIL
- * because it'll just get dirty again.
- */
int
-xfs_quiesce_data(
- struct xfs_mount *mp)
-{
- int error, error2 = 0;
-
- /* force out the log */
- xfs_log_force(mp, XFS_LOG_SYNC);
-
- /* write superblock and hoover up shutdown errors */
- error = xfs_sync_fsdata(mp);
-
- /* mark the log as covered if needed */
- if (xfs_log_need_covered(mp))
- error2 = xfs_fs_log_dummy(mp);
-
- return error ? error : error2;
-}
-
-/*
- * Second stage of a quiesce. The data is already synced, now we have to take
- * care of the metadata. New transactions are already blocked, so we need to
- * wait for any remaining transactions to drain out before proceeding.
- */
-void
-xfs_quiesce_attr(
- struct xfs_mount *mp)
-{
- int error = 0;
-
- /* wait for all modifications to complete */
- while (atomic_read(&mp->m_active_trans) > 0)
- delay(100);
-
- /* reclaim inodes to do any IO before the freeze completes */
- xfs_reclaim_inodes(mp, 0);
- xfs_reclaim_inodes(mp, SYNC_WAIT);
-
- /* flush all pending changes from the AIL */
- xfs_ail_push_all_sync(mp->m_ail);
-
- /*
- * Just warn here till VFS can correctly support
- * read-only remount without racing.
- */
- WARN_ON(atomic_read(&mp->m_active_trans) != 0);
-
- /* Push the superblock and write an unmount record */
- error = xfs_log_sbcount(mp);
- if (error)
- xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
- "Frozen image may not be consistent.");
- xfs_log_unmount_write(mp);
-
- /*
- * At this point we might have modified the superblock again and thus
- * added an item to the AIL, thus flush it again.
- */
- xfs_ail_push_all_sync(mp->m_ail);
-
- /*
- * The superblock buffer is uncached and xfsaild_push() will lock and
- * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait()
- * here but a lock on the superblock buffer will block until iodone()
- * has completed.
- */
- xfs_buf_lock(mp->m_sb_bp);
- xfs_buf_unlock(mp->m_sb_bp);
-}
-
-static void
-xfs_syncd_queue_sync(
- struct xfs_mount *mp)
-{
- queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work,
- msecs_to_jiffies(xfs_syncd_centisecs * 10));
-}
-
-/*
- * Every sync period we need to unpin all items, reclaim inodes and sync
- * disk quotas. We might need to cover the log to indicate that the
- * filesystem is idle and not frozen.
- */
-STATIC void
-xfs_sync_worker(
- struct work_struct *work)
+xfs_inode_ag_iterator_tag(
+ struct xfs_mount *mp,
+ int (*execute)(struct xfs_inode *ip,
+ struct xfs_perag *pag, int flags,
+ void *args),
+ int flags,
+ void *args,
+ int tag)
{
- struct xfs_mount *mp = container_of(to_delayed_work(work),
- struct xfs_mount, m_sync_work);
- int error;
-
- /*
- * We shouldn't write/force the log if we are in the mount/unmount
- * process or on a read only filesystem. The workqueue still needs to be
- * active in both cases, however, because it is used for inode reclaim
- * during these times. Use the MS_ACTIVE flag to avoid doing anything
- * during mount. Doing work during unmount is avoided by calling
- * cancel_delayed_work_sync on this work queue before tearing down
- * the ail and the log in xfs_log_unmount.
- */
- if (!(mp->m_super->s_flags & MS_ACTIVE) &&
- !(mp->m_flags & XFS_MOUNT_RDONLY)) {
- /* dgc: errors ignored here */
- if (mp->m_super->s_writers.frozen == SB_UNFROZEN &&
- xfs_log_need_covered(mp))
- error = xfs_fs_log_dummy(mp);
- else
- xfs_log_force(mp, 0);
+ struct xfs_perag *pag;
+ int error = 0;
+ int last_error = 0;
+ xfs_agnumber_t ag;
- /* start pushing all the metadata that is currently
- * dirty */
- xfs_ail_push_all(mp->m_ail);
+ ag = 0;
+ while ((pag = xfs_perag_get_tag(mp, ag, tag))) {
+ ag = pag->pag_agno + 1;
+ error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag);
+ xfs_perag_put(pag);
+ if (error) {
+ last_error = error;
+ if (error == EFSCORRUPTED)
+ break;
+ }
}
-
- /* queue us up again */
- xfs_syncd_queue_sync(mp);
+ return XFS_ERROR(last_error);
}
/*
* Queue a new inode reclaim pass if there are reclaimable inodes and there
* isn't a reclaim pass already in progress. By default it runs every 5s based
- * on the xfs syncd work default of 30s. Perhaps this should have it's own
+ * on the xfs periodic sync default of 30s. Perhaps this should have its own
* tunable, but that can be done if this method proves to be ineffective or too
* aggressive.
*/
static void
-xfs_syncd_queue_reclaim(
+xfs_reclaim_work_queue(
struct xfs_mount *mp)
{
rcu_read_lock();
if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) {
- queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work,
+ queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work,
msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10));
}
rcu_read_unlock();
@@ -445,7 +725,7 @@ xfs_syncd_queue_reclaim(
* goes low. It scans as quickly as possible avoiding locked inodes or those
* already being flushed, and once done schedules a future pass.
*/
-STATIC void
+void
xfs_reclaim_worker(
struct work_struct *work)
{
@@ -453,65 +733,10 @@ xfs_reclaim_worker(
struct xfs_mount, m_reclaim_work);
xfs_reclaim_inodes(mp, SYNC_TRYLOCK);
- xfs_syncd_queue_reclaim(mp);
+ xfs_reclaim_work_queue(mp);
}
-/*
- * Flush delayed allocate data, attempting to free up reserved space
- * from existing allocations. At this point a new allocation attempt
- * has failed with ENOSPC and we are in the process of scratching our
- * heads, looking about for more room.
- *
- * Queue a new data flush if there isn't one already in progress and
- * wait for completion of the flush. This means that we only ever have one
- * inode flush in progress no matter how many ENOSPC events are occurring and
- * so will prevent the system from bogging down due to every concurrent
- * ENOSPC event scanning all the active inodes in the system for writeback.
- */
-void
-xfs_flush_inodes(
- struct xfs_inode *ip)
-{
- struct xfs_mount *mp = ip->i_mount;
-
- queue_work(xfs_syncd_wq, &mp->m_flush_work);
- flush_work(&mp->m_flush_work);
-}
-
-STATIC void
-xfs_flush_worker(
- struct work_struct *work)
-{
- struct xfs_mount *mp = container_of(work,
- struct xfs_mount, m_flush_work);
-
- xfs_sync_data(mp, SYNC_TRYLOCK);
- xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
-}
-
-int
-xfs_syncd_init(
- struct xfs_mount *mp)
-{
- INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
- INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker);
- INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
-
- xfs_syncd_queue_sync(mp);
-
- return 0;
-}
-
-void
-xfs_syncd_stop(
- struct xfs_mount *mp)
-{
- cancel_delayed_work_sync(&mp->m_sync_work);
- cancel_delayed_work_sync(&mp->m_reclaim_work);
- cancel_work_sync(&mp->m_flush_work);
-}
-
-void
+static void
__xfs_inode_set_reclaim_tag(
struct xfs_perag *pag,
struct xfs_inode *ip)
@@ -529,7 +754,7 @@ __xfs_inode_set_reclaim_tag(
spin_unlock(&ip->i_mount->m_perag_lock);
/* schedule periodic background inode reclaim */
- xfs_syncd_queue_reclaim(ip->i_mount);
+ xfs_reclaim_work_queue(ip->i_mount);
trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno,
-1, _RET_IP_);
@@ -577,7 +802,7 @@ __xfs_inode_clear_reclaim(
}
}
-void
+STATIC void
__xfs_inode_clear_reclaim_tag(
xfs_mount_t *mp,
xfs_perag_t *pag,
@@ -787,9 +1012,9 @@ out:
/*
* We could return EAGAIN here to make reclaim rescan the inode tree in
* a short while. However, this just burns CPU time scanning the tree
- * waiting for IO to complete and xfssyncd never goes back to the idle
- * state. Instead, return 0 to let the next scheduled background reclaim
- * attempt to reclaim the inode again.
+ * waiting for IO to complete and the reclaim work never goes back to
+ * the idle state. Instead, return 0 to let the next scheduled
+ * background reclaim attempt to reclaim the inode again.
*/
return 0;
}
@@ -800,7 +1025,7 @@ out:
* then a shut down during filesystem unmount reclaim walk leak all the
* unreclaimed inodes.
*/
-int
+STATIC int
xfs_reclaim_inodes_ag(
struct xfs_mount *mp,
int flags,
@@ -945,7 +1170,7 @@ xfs_reclaim_inodes_nr(
int nr_to_scan)
{
/* kick background reclaimer and push the AIL */
- xfs_syncd_queue_reclaim(mp);
+ xfs_reclaim_work_queue(mp);
xfs_ail_push_all(mp->m_ail);
xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
@@ -971,3 +1196,146 @@ xfs_reclaim_inodes_count(
return reclaimable;
}
+STATIC int
+xfs_inode_match_id(
+ struct xfs_inode *ip,
+ struct xfs_eofblocks *eofb)
+{
+ if (eofb->eof_flags & XFS_EOF_FLAGS_UID &&
+ ip->i_d.di_uid != eofb->eof_uid)
+ return 0;
+
+ if (eofb->eof_flags & XFS_EOF_FLAGS_GID &&
+ ip->i_d.di_gid != eofb->eof_gid)
+ return 0;
+
+ if (eofb->eof_flags & XFS_EOF_FLAGS_PRID &&
+ xfs_get_projid(ip) != eofb->eof_prid)
+ return 0;
+
+ return 1;
+}
+
+STATIC int
+xfs_inode_free_eofblocks(
+ struct xfs_inode *ip,
+ struct xfs_perag *pag,
+ int flags,
+ void *args)
+{
+ int ret;
+ struct xfs_eofblocks *eofb = args;
+
+ if (!xfs_can_free_eofblocks(ip, false)) {
+ /* inode could be preallocated or append-only */
+ trace_xfs_inode_free_eofblocks_invalid(ip);
+ xfs_inode_clear_eofblocks_tag(ip);
+ return 0;
+ }
+
+ /*
+ * If the mapping is dirty the operation can block and wait for some
+ * time. Unless we are waiting, skip it.
+ */
+ if (!(flags & SYNC_WAIT) &&
+ mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY))
+ return 0;
+
+ if (eofb) {
+ if (!xfs_inode_match_id(ip, eofb))
+ return 0;
+
+ /* skip the inode if the file size is too small */
+ if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
+ XFS_ISIZE(ip) < eofb->eof_min_file_size)
+ return 0;
+ }
+
+ ret = xfs_free_eofblocks(ip->i_mount, ip, true);
+
+ /* don't revisit the inode if we're not waiting */
+ if (ret == EAGAIN && !(flags & SYNC_WAIT))
+ ret = 0;
+
+ return ret;
+}
+
+int
+xfs_icache_free_eofblocks(
+ struct xfs_mount *mp,
+ struct xfs_eofblocks *eofb)
+{
+ int flags = SYNC_TRYLOCK;
+
+ if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC))
+ flags = SYNC_WAIT;
+
+ return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags,
+ eofb, XFS_ICI_EOFBLOCKS_TAG);
+}
+
+void
+xfs_inode_set_eofblocks_tag(
+ xfs_inode_t *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_perag *pag;
+ int tagged;
+
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+ spin_lock(&pag->pag_ici_lock);
+ trace_xfs_inode_set_eofblocks_tag(ip);
+
+ tagged = radix_tree_tagged(&pag->pag_ici_root,
+ XFS_ICI_EOFBLOCKS_TAG);
+ radix_tree_tag_set(&pag->pag_ici_root,
+ XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+ XFS_ICI_EOFBLOCKS_TAG);
+ if (!tagged) {
+ /* propagate the eofblocks tag up into the perag radix tree */
+ spin_lock(&ip->i_mount->m_perag_lock);
+ radix_tree_tag_set(&ip->i_mount->m_perag_tree,
+ XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+ XFS_ICI_EOFBLOCKS_TAG);
+ spin_unlock(&ip->i_mount->m_perag_lock);
+
+ /* kick off background trimming */
+ xfs_queue_eofblocks(ip->i_mount);
+
+ trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno,
+ -1, _RET_IP_);
+ }
+
+ spin_unlock(&pag->pag_ici_lock);
+ xfs_perag_put(pag);
+}
+
+void
+xfs_inode_clear_eofblocks_tag(
+ xfs_inode_t *ip)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ struct xfs_perag *pag;
+
+ pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
+ spin_lock(&pag->pag_ici_lock);
+ trace_xfs_inode_clear_eofblocks_tag(ip);
+
+ radix_tree_tag_clear(&pag->pag_ici_root,
+ XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
+ XFS_ICI_EOFBLOCKS_TAG);
+ if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_EOFBLOCKS_TAG)) {
+ /* clear the eofblocks tag from the perag radix tree */
+ spin_lock(&ip->i_mount->m_perag_lock);
+ radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
+ XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
+ XFS_ICI_EOFBLOCKS_TAG);
+ spin_unlock(&ip->i_mount->m_perag_lock);
+ trace_xfs_perag_clear_eofblocks(ip->i_mount, pag->pag_agno,
+ -1, _RET_IP_);
+ }
+
+ spin_unlock(&pag->pag_ici_lock);
+ xfs_perag_put(pag);
+}
+
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_icache.h
index 941202e7ac6e..e0f138c70a2f 100644
--- a/fs/xfs/xfs_sync.h
+++ b/fs/xfs/xfs_icache.h
@@ -24,28 +24,30 @@ struct xfs_perag;
#define SYNC_WAIT 0x0001 /* wait for i/o to complete */
#define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */
-extern struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */
+int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino,
+ uint flags, uint lock_flags, xfs_inode_t **ipp);
-int xfs_syncd_init(struct xfs_mount *mp);
-void xfs_syncd_stop(struct xfs_mount *mp);
-
-int xfs_quiesce_data(struct xfs_mount *mp);
-void xfs_quiesce_attr(struct xfs_mount *mp);
-
-void xfs_flush_inodes(struct xfs_inode *ip);
+void xfs_reclaim_worker(struct work_struct *work);
int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
int xfs_reclaim_inodes_count(struct xfs_mount *mp);
void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
-void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip);
-void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag,
- struct xfs_inode *ip);
+
+void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip);
+void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip);
+int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *);
+void xfs_eofblocks_worker(struct work_struct *);
int xfs_sync_inode_grab(struct xfs_inode *ip);
int xfs_inode_ag_iterator(struct xfs_mount *mp,
- int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags),
- int flags);
+ int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag,
+ int flags, void *args),
+ int flags, void *args);
+int xfs_inode_ag_iterator_tag(struct xfs_mount *mp,
+ int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag,
+ int flags, void *args),
+ int flags, void *args, int tag);
#endif
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
deleted file mode 100644
index 784a803383ec..000000000000
--- a/fs/xfs/xfs_iget.c
+++ /dev/null
@@ -1,705 +0,0 @@
-/*
- * Copyright (c) 2000-2005 Silicon Graphics, Inc.
- * All Rights Reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it would be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write the Free Software Foundation,
- * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- */
-#include "xfs.h"
-#include "xfs_fs.h"
-#include "xfs_types.h"
-#include "xfs_acl.h"
-#include "xfs_log.h"
-#include "xfs_inum.h"
-#include "xfs_trans.h"
-#include "xfs_sb.h"
-#include "xfs_ag.h"
-#include "xfs_mount.h"
-#include "xfs_bmap_btree.h"
-#include "xfs_alloc_btree.h"
-#include "xfs_ialloc_btree.h"
-#include "xfs_dinode.h"
-#include "xfs_inode.h"
-#include "xfs_btree.h"
-#include "xfs_ialloc.h"
-#include "xfs_quota.h"
-#include "xfs_utils.h"
-#include "xfs_trans_priv.h"
-#include "xfs_inode_item.h"
-#include "xfs_bmap.h"
-#include "xfs_trace.h"
-
-
-/*
- * Allocate and initialise an xfs_inode.
- */
-STATIC struct xfs_inode *
-xfs_inode_alloc(
- struct xfs_mount *mp,
- xfs_ino_t ino)
-{
- struct xfs_inode *ip;
-
- /*
- * if this didn't occur in transactions, we could use
- * KM_MAYFAIL and return NULL here on ENOMEM. Set the
- * code up to do this anyway.
- */
- ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
- if (!ip)
- return NULL;
- if (inode_init_always(mp->m_super, VFS_I(ip))) {
- kmem_zone_free(xfs_inode_zone, ip);
- return NULL;
- }
-
- ASSERT(atomic_read(&ip->i_pincount) == 0);
- ASSERT(!spin_is_locked(&ip->i_flags_lock));
- ASSERT(!xfs_isiflocked(ip));
- ASSERT(ip->i_ino == 0);
-
- mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
-
- /* initialise the xfs inode */
- ip->i_ino = ino;
- ip->i_mount = mp;
- memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
- ip->i_afp = NULL;
- memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
- ip->i_flags = 0;
- ip->i_delayed_blks = 0;
- memset(&ip->i_d, 0, sizeof(xfs_icdinode_t));
-
- return ip;
-}
-
-STATIC void
-xfs_inode_free_callback(
- struct rcu_head *head)
-{
- struct inode *inode = container_of(head, struct inode, i_rcu);
- struct xfs_inode *ip = XFS_I(inode);
-
- kmem_zone_free(xfs_inode_zone, ip);
-}
-
-void
-xfs_inode_free(
- struct xfs_inode *ip)
-{
- switch (ip->i_d.di_mode & S_IFMT) {
- case S_IFREG:
- case S_IFDIR:
- case S_IFLNK:
- xfs_idestroy_fork(ip, XFS_DATA_FORK);
- break;
- }
-
- if (ip->i_afp)
- xfs_idestroy_fork(ip, XFS_ATTR_FORK);
-
- if (ip->i_itemp) {
- ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL));
- xfs_inode_item_destroy(ip);
- ip->i_itemp = NULL;
- }
-
- /* asserts to verify all state is correct here */
- ASSERT(atomic_read(&ip->i_pincount) == 0);
- ASSERT(!spin_is_locked(&ip->i_flags_lock));
- ASSERT(!xfs_isiflocked(ip));
-
- /*
- * Because we use RCU freeing we need to ensure the inode always
- * appears to be reclaimed with an invalid inode number when in the
- * free state. The ip->i_flags_lock provides the barrier against lookup
- * races.
- */
- spin_lock(&ip->i_flags_lock);
- ip->i_flags = XFS_IRECLAIM;
- ip->i_ino = 0;
- spin_unlock(&ip->i_flags_lock);
-
- call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);
-}
-
-/*
- * Check the validity of the inode we just found it the cache
- */
-static int
-xfs_iget_cache_hit(
- struct xfs_perag *pag,
- struct xfs_inode *ip,
- xfs_ino_t ino,
- int flags,
- int lock_flags) __releases(RCU)
-{
- struct inode *inode = VFS_I(ip);
- struct xfs_mount *mp = ip->i_mount;
- int error;
-
- /*
- * check for re-use of an inode within an RCU grace period due to the
- * radix tree nodes not being updated yet. We monitor for this by
- * setting the inode number to zero before freeing the inode structure.
- * If the inode has been reallocated and set up, then the inode number
- * will not match, so check for that, too.
- */
- spin_lock(&ip->i_flags_lock);
- if (ip->i_ino != ino) {
- trace_xfs_iget_skip(ip);
- XFS_STATS_INC(xs_ig_frecycle);
- error = EAGAIN;
- goto out_error;
- }
-
-
- /*
- * If we are racing with another cache hit that is currently
- * instantiating this inode or currently recycling it out of
- * reclaimabe state, wait for the initialisation to complete
- * before continuing.
- *
- * XXX(hch): eventually we should do something equivalent to
- * wait_on_inode to wait for these flags to be cleared
- * instead of polling for it.
- */
- if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) {
- trace_xfs_iget_skip(ip);
- XFS_STATS_INC(xs_ig_frecycle);
- error = EAGAIN;
- goto out_error;
- }
-
- /*
- * If lookup is racing with unlink return an error immediately.
- */
- if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) {
- error = ENOENT;
- goto out_error;
- }
-
- /*
- * If IRECLAIMABLE is set, we've torn down the VFS inode already.
- * Need to carefully get it back into useable state.
- */
- if (ip->i_flags & XFS_IRECLAIMABLE) {
- trace_xfs_iget_reclaim(ip);
-
- /*
- * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode
- * from stomping over us while we recycle the inode. We can't
- * clear the radix tree reclaimable tag yet as it requires
- * pag_ici_lock to be held exclusive.
- */
- ip->i_flags |= XFS_IRECLAIM;
-
- spin_unlock(&ip->i_flags_lock);
- rcu_read_unlock();
-
- error = -inode_init_always(mp->m_super, inode);
- if (error) {
- /*
- * Re-initializing the inode failed, and we are in deep
- * trouble. Try to re-add it to the reclaim list.
- */
- rcu_read_lock();
- spin_lock(&ip->i_flags_lock);
-
- ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM);
- ASSERT(ip->i_flags & XFS_IRECLAIMABLE);
- trace_xfs_iget_reclaim_fail(ip);
- goto out_error;
- }
-
- spin_lock(&pag->pag_ici_lock);
- spin_lock(&ip->i_flags_lock);
-
- /*
- * Clear the per-lifetime state in the inode as we are now
- * effectively a new inode and need to return to the initial
- * state before reuse occurs.
- */
- ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS;
- ip->i_flags |= XFS_INEW;
- __xfs_inode_clear_reclaim_tag(mp, pag, ip);
- inode->i_state = I_NEW;
-
- ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
- mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino);
-
- spin_unlock(&ip->i_flags_lock);
- spin_unlock(&pag->pag_ici_lock);
- } else {
- /* If the VFS inode is being torn down, pause and try again. */
- if (!igrab(inode)) {
- trace_xfs_iget_skip(ip);
- error = EAGAIN;
- goto out_error;
- }
-
- /* We've got a live one. */
- spin_unlock(&ip->i_flags_lock);
- rcu_read_unlock();
- trace_xfs_iget_hit(ip);
- }
-
- if (lock_flags != 0)
- xfs_ilock(ip, lock_flags);
-
- xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE);
- XFS_STATS_INC(xs_ig_found);
-
- return 0;
-
-out_error:
- spin_unlock(&ip->i_flags_lock);
- rcu_read_unlock();
- return error;
-}
-
-
-static int
-xfs_iget_cache_miss(
- struct xfs_mount *mp,
- struct xfs_perag *pag,
- xfs_trans_t *tp,
- xfs_ino_t ino,
- struct xfs_inode **ipp,
- int flags,
- int lock_flags)
-{
- struct xfs_inode *ip;
- int error;
- xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino);
- int iflags;
-
- ip = xfs_inode_alloc(mp, ino);
- if (!ip)
- return ENOMEM;
-
- error = xfs_iread(mp, tp, ip, flags);
- if (error)
- goto out_destroy;
-
- trace_xfs_iget_miss(ip);
-
- if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) {
- error = ENOENT;
- goto out_destroy;
- }
-
- /*
- * Preload the radix tree so we can insert safely under the
- * write spinlock. Note that we cannot sleep inside the preload
- * region. Since we can be called from transaction context, don't
- * recurse into the file system.
- */
- if (radix_tree_preload(GFP_NOFS)) {
- error = EAGAIN;
- goto out_destroy;
- }
-
- /*
- * Because the inode hasn't been added to the radix-tree yet it can't
- * be found by another thread, so we can do the non-sleeping lock here.
- */
- if (lock_flags) {
- if (!xfs_ilock_nowait(ip, lock_flags))
- BUG();
- }
-
- /*
- * These values must be set before inserting the inode into the radix
- * tree as the moment it is inserted a concurrent lookup (allowed by the
- * RCU locking mechanism) can find it and that lookup must see that this
- * is an inode currently under construction (i.e. that XFS_INEW is set).
- * The ip->i_flags_lock that protects the XFS_INEW flag forms the
- * memory barrier that ensures this detection works correctly at lookup
- * time.
- */
- iflags = XFS_INEW;
- if (flags & XFS_IGET_DONTCACHE)
- iflags |= XFS_IDONTCACHE;
- ip->i_udquot = ip->i_gdquot = NULL;
- xfs_iflags_set(ip, iflags);
-
- /* insert the new inode */
- spin_lock(&pag->pag_ici_lock);
- error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
- if (unlikely(error)) {
- WARN_ON(error != -EEXIST);
- XFS_STATS_INC(xs_ig_dup);
- error = EAGAIN;
- goto out_preload_end;
- }
- spin_unlock(&pag->pag_ici_lock);
- radix_tree_preload_end();
-
- *ipp = ip;
- return 0;
-
-out_preload_end:
- spin_unlock(&pag->pag_ici_lock);
- radix_tree_preload_end();
- if (lock_flags)
- xfs_iunlock(ip, lock_flags);
-out_destroy:
- __destroy_inode(VFS_I(ip));
- xfs_inode_free(ip);
- return error;
-}
-
-/*
- * Look up an inode by number in the given file system.
- * The inode is looked up in the cache held in each AG.
- * If the inode is found in the cache, initialise the vfs inode
- * if necessary.
- *
- * If it is not in core, read it in from the file system's device,
- * add it to the cache and initialise the vfs inode.
- *
- * The inode is locked according to the value of the lock_flags parameter.
- * This flag parameter indicates how and if the inode's IO lock and inode lock
- * should be taken.
- *
- * mp -- the mount point structure for the current file system. It points
- * to the inode hash table.
- * tp -- a pointer to the current transaction if there is one. This is
- * simply passed through to the xfs_iread() call.
- * ino -- the number of the inode desired. This is the unique identifier
- * within the file system for the inode being requested.
- * lock_flags -- flags indicating how to lock the inode. See the comment
- * for xfs_ilock() for a list of valid values.
- */
-int
-xfs_iget(
- xfs_mount_t *mp,
- xfs_trans_t *tp,
- xfs_ino_t ino,
- uint flags,
- uint lock_flags,
- xfs_inode_t **ipp)
-{
- xfs_inode_t *ip;
- int error;
- xfs_perag_t *pag;
- xfs_agino_t agino;
-
- /*
- * xfs_reclaim_inode() uses the ILOCK to ensure an inode
- * doesn't get freed while it's being referenced during a
- * radix tree traversal here. It assumes this function
- * aqcuires only the ILOCK (and therefore it has no need to
- * involve the IOLOCK in this synchronization).
- */
- ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0);
-
- /* reject inode numbers outside existing AGs */
- if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount)
- return EINVAL;
-
- /* get the perag structure and ensure that it's inode capable */
- pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
- agino = XFS_INO_TO_AGINO(mp, ino);
-
-again:
- error = 0;
- rcu_read_lock();
- ip = radix_tree_lookup(&pag->pag_ici_root, agino);
-
- if (ip) {
- error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags);
- if (error)
- goto out_error_or_again;
- } else {
- rcu_read_unlock();
- XFS_STATS_INC(xs_ig_missed);
-
- error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip,
- flags, lock_flags);
- if (error)
- goto out_error_or_again;
- }
- xfs_perag_put(pag);
-
- *ipp = ip;
-
- /*
- * If we have a real type for an on-disk inode, we can set ops(&unlock)
- * now. If it's a new inode being created, xfs_ialloc will handle it.
- */
- if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0)
- xfs_setup_inode(ip);
- return 0;
-
-out_error_or_again:
- if (error == EAGAIN) {
- delay(1);
- goto again;
- }
- xfs_perag_put(pag);
- return error;
-}
-
-/*
- * This is a wrapper routine around the xfs_ilock() routine
- * used to centralize some grungy code. It is used in places
- * that wish to lock the inode solely for reading the extents.
- * The reason these places can't just call xfs_ilock(SHARED)
- * is that the inode lock also guards to bringing in of the
- * extents from disk for a file in b-tree format. If the inode
- * is in b-tree format, then we need to lock the inode exclusively
- * until the extents are read in. Locking it exclusively all
- * the time would limit our parallelism unnecessarily, though.
- * What we do instead is check to see if the extents have been
- * read in yet, and only lock the inode exclusively if they
- * have not.
- *
- * The function returns a value which should be given to the
- * corresponding xfs_iunlock_map_shared(). This value is
- * the mode in which the lock was actually taken.
- */
-uint
-xfs_ilock_map_shared(
- xfs_inode_t *ip)
-{
- uint lock_mode;
-
- if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) &&
- ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) {
- lock_mode = XFS_ILOCK_EXCL;
- } else {
- lock_mode = XFS_ILOCK_SHARED;
- }
-
- xfs_ilock(ip, lock_mode);
-
- return lock_mode;
-}
-
-/*
- * This is simply the unlock routine to go with xfs_ilock_map_shared().
- * All it does is call xfs_iunlock() with the given lock_mode.
- */
-void
-xfs_iunlock_map_shared(
- xfs_inode_t *ip,
- unsigned int lock_mode)
-{
- xfs_iunlock(ip, lock_mode);
-}
-
-/*
- * The xfs inode contains 2 locks: a multi-reader lock called the
- * i_iolock and a multi-reader lock called the i_lock. This routine
- * allows either or both of the locks to be obtained.
- *
- * The 2 locks should always be ordered so that the IO lock is
- * obtained first in order to prevent deadlock.
- *
- * ip -- the inode being locked
- * lock_flags -- this parameter indicates the inode's locks
- * to be locked. It can be:
- * XFS_IOLOCK_SHARED,
- * XFS_IOLOCK_EXCL,
- * XFS_ILOCK_SHARED,
- * XFS_ILOCK_EXCL,
- * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
- * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
- * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
- * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
- */
-void
-xfs_ilock(
- xfs_inode_t *ip,
- uint lock_flags)
-{
- /*
- * You can't set both SHARED and EXCL for the same lock,
- * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
- * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
- */
- ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
- (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
- ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
- (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
- ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
-
- if (lock_flags & XFS_IOLOCK_EXCL)
- mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
- else if (lock_flags & XFS_IOLOCK_SHARED)
- mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
-
- if (lock_flags & XFS_ILOCK_EXCL)
- mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
- else if (lock_flags & XFS_ILOCK_SHARED)
- mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
-
- trace_xfs_ilock(ip, lock_flags, _RET_IP_);
-}
-
-/*
- * This is just like xfs_ilock(), except that the caller
- * is guaranteed not to sleep. It returns 1 if it gets
- * the requested locks and 0 otherwise. If the IO lock is
- * obtained but the inode lock cannot be, then the IO lock
- * is dropped before returning.
- *
- * ip -- the inode being locked
- * lock_flags -- this parameter indicates the inode's locks to be
- * to be locked. See the comment for xfs_ilock() for a list
- * of valid values.
- */
-int
-xfs_ilock_nowait(
- xfs_inode_t *ip,
- uint lock_flags)
-{
- /*
- * You can't set both SHARED and EXCL for the same lock,
- * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
- * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
- */
- ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
- (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
- ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
- (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
- ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
-
- if (lock_flags & XFS_IOLOCK_EXCL) {
- if (!mrtryupdate(&ip->i_iolock))
- goto out;
- } else if (lock_flags & XFS_IOLOCK_SHARED) {
- if (!mrtryaccess(&ip->i_iolock))
- goto out;
- }
- if (lock_flags & XFS_ILOCK_EXCL) {
- if (!mrtryupdate(&ip->i_lock))
- goto out_undo_iolock;
- } else if (lock_flags & XFS_ILOCK_SHARED) {
- if (!mrtryaccess(&ip->i_lock))
- goto out_undo_iolock;
- }
- trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
- return 1;
-
- out_undo_iolock:
- if (lock_flags & XFS_IOLOCK_EXCL)
- mrunlock_excl(&ip->i_iolock);
- else if (lock_flags & XFS_IOLOCK_SHARED)
- mrunlock_shared(&ip->i_iolock);
- out:
- return 0;
-}
-
-/*
- * xfs_iunlock() is used to drop the inode locks acquired with
- * xfs_ilock() and xfs_ilock_nowait(). The caller must pass
- * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
- * that we know which locks to drop.
- *
- * ip -- the inode being unlocked
- * lock_flags -- this parameter indicates the inode's locks to be
- * to be unlocked. See the comment for xfs_ilock() for a list
- * of valid values for this parameter.
- *
- */
-void
-xfs_iunlock(
- xfs_inode_t *ip,
- uint lock_flags)
-{
- /*
- * You can't set both SHARED and EXCL for the same lock,
- * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
- * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
- */
- ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
- (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
- ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
- (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
- ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
- ASSERT(lock_flags != 0);
-
- if (lock_flags & XFS_IOLOCK_EXCL)
- mrunlock_excl(&ip->i_iolock);
- else if (lock_flags & XFS_IOLOCK_SHARED)
- mrunlock_shared(&ip->i_iolock);
-
- if (lock_flags & XFS_ILOCK_EXCL)
- mrunlock_excl(&ip->i_lock);
- else if (lock_flags & XFS_ILOCK_SHARED)
- mrunlock_shared(&ip->i_lock);
-
- trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
-}
-
-/*
- * give up write locks. the i/o lock cannot be held nested
- * if it is being demoted.
- */
-void
-xfs_ilock_demote(
- xfs_inode_t *ip,
- uint lock_flags)
-{
- ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
- ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
-
- if (lock_flags & XFS_ILOCK_EXCL)
- mrdemote(&ip->i_lock);
- if (lock_flags & XFS_IOLOCK_EXCL)
- mrdemote(&ip->i_iolock);
-
- trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
-}
-
-#ifdef DEBUG
-int
-xfs_isilocked(
- xfs_inode_t *ip,
- uint lock_flags)
-{
- if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
- if (!(lock_flags & XFS_ILOCK_SHARED))
- return !!ip->i_lock.mr_writer;
- return rwsem_is_locked(&ip->i_lock.mr_lock);
- }
-
- if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
- if (!(lock_flags & XFS_IOLOCK_SHARED))
- return !!ip->i_iolock.mr_writer;
- return rwsem_is_locked(&ip->i_iolock.mr_lock);
- }
-
- ASSERT(0);
- return 0;
-}
-#endif
-
-void
-__xfs_iflock(
- struct xfs_inode *ip)
-{
- wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
- DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
-
- do {
- prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
- if (xfs_isiflocked(ip))
- io_schedule();
- } while (!xfs_iflock_nowait(ip));
-
- finish_wait(wq, &wait.wait);
-}
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 2778258fcfa2..66282dcb821b 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -45,6 +45,7 @@
#include "xfs_filestream.h"
#include "xfs_vnodeops.h"
#include "xfs_trace.h"
+#include "xfs_icache.h"
kmem_zone_t *xfs_ifork_zone;
kmem_zone_t *xfs_inode_zone;
@@ -74,6 +75,256 @@ xfs_get_extsz_hint(
return 0;
}
+/*
+ * This is a wrapper routine around the xfs_ilock() routine used to centralize
+ * some grungy code. It is used in places that wish to lock the inode solely
+ * for reading the extents. The reason these places can't just call
+ * xfs_ilock(SHARED) is that the inode lock also guards the bringing in of the
+ * extents from disk for a file in b-tree format. If the inode is in b-tree
+ * format, then we need to lock the inode exclusively until the extents are read
+ * in. Locking it exclusively all the time would limit our parallelism
+ * unnecessarily, though. What we do instead is check to see if the extents
+ * have been read in yet, and only lock the inode exclusively if they have not.
+ *
+ * The function returns a value which should be given to the corresponding
+ * xfs_iunlock_map_shared(). This value is the mode in which the lock was
+ * actually taken.
+ */
+uint
+xfs_ilock_map_shared(
+ xfs_inode_t *ip)
+{
+ uint lock_mode;
+
+ if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) &&
+ ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) {
+ lock_mode = XFS_ILOCK_EXCL;
+ } else {
+ lock_mode = XFS_ILOCK_SHARED;
+ }
+
+ xfs_ilock(ip, lock_mode);
+
+ return lock_mode;
+}
+
+/*
+ * This is simply the unlock routine to go with xfs_ilock_map_shared().
+ * All it does is call xfs_iunlock() with the given lock_mode.
+ */
+void
+xfs_iunlock_map_shared(
+ xfs_inode_t *ip,
+ unsigned int lock_mode)
+{
+ xfs_iunlock(ip, lock_mode);
+}
+
+/*
+ * The xfs inode contains 2 locks: a multi-reader lock called the
+ * i_iolock and a multi-reader lock called the i_lock. This routine
+ * allows either or both of the locks to be obtained.
+ *
+ * The 2 locks should always be ordered so that the IO lock is
+ * obtained first in order to prevent deadlock.
+ *
+ * ip -- the inode being locked
+ * lock_flags -- this parameter indicates the inode's locks
+ * to be locked. It can be:
+ * XFS_IOLOCK_SHARED,
+ * XFS_IOLOCK_EXCL,
+ * XFS_ILOCK_SHARED,
+ * XFS_ILOCK_EXCL,
+ * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED,
+ * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL,
+ * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED,
+ * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL
+ */
+void
+xfs_ilock(
+ xfs_inode_t *ip,
+ uint lock_flags)
+{
+ trace_xfs_ilock(ip, lock_flags, _RET_IP_);
+
+ /*
+ * You can't set both SHARED and EXCL for the same lock,
+ * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
+ * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
+ */
+ ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+ (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+ ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+ (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+ ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
+
+ if (lock_flags & XFS_IOLOCK_EXCL)
+ mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
+ else if (lock_flags & XFS_IOLOCK_SHARED)
+ mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags));
+
+ if (lock_flags & XFS_ILOCK_EXCL)
+ mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
+ else if (lock_flags & XFS_ILOCK_SHARED)
+ mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags));
+}
+
+/*
+ * This is just like xfs_ilock(), except that the caller
+ * is guaranteed not to sleep. It returns 1 if it gets
+ * the requested locks and 0 otherwise. If the IO lock is
+ * obtained but the inode lock cannot be, then the IO lock
+ * is dropped before returning.
+ *
+ * ip -- the inode being locked
+ * lock_flags -- this parameter indicates the inode's locks
+ * to be locked. See the comment for xfs_ilock() for a list
+ * of valid values.
+ */
+int
+xfs_ilock_nowait(
+ xfs_inode_t *ip,
+ uint lock_flags)
+{
+ trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_);
+
+ /*
+ * You can't set both SHARED and EXCL for the same lock,
+ * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
+ * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
+ */
+ ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+ (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+ ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+ (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+ ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
+
+ if (lock_flags & XFS_IOLOCK_EXCL) {
+ if (!mrtryupdate(&ip->i_iolock))
+ goto out;
+ } else if (lock_flags & XFS_IOLOCK_SHARED) {
+ if (!mrtryaccess(&ip->i_iolock))
+ goto out;
+ }
+ if (lock_flags & XFS_ILOCK_EXCL) {
+ if (!mrtryupdate(&ip->i_lock))
+ goto out_undo_iolock;
+ } else if (lock_flags & XFS_ILOCK_SHARED) {
+ if (!mrtryaccess(&ip->i_lock))
+ goto out_undo_iolock;
+ }
+ return 1;
+
+ out_undo_iolock:
+ if (lock_flags & XFS_IOLOCK_EXCL)
+ mrunlock_excl(&ip->i_iolock);
+ else if (lock_flags & XFS_IOLOCK_SHARED)
+ mrunlock_shared(&ip->i_iolock);
+ out:
+ return 0;
+}
+
+/*
+ * xfs_iunlock() is used to drop the inode locks acquired with
+ * xfs_ilock() and xfs_ilock_nowait(). The caller must pass
+ * in the flags given to xfs_ilock() or xfs_ilock_nowait() so
+ * that we know which locks to drop.
+ *
+ * ip -- the inode being unlocked
+ * lock_flags -- this parameter indicates the inode's locks
+ * to be unlocked. See the comment for xfs_ilock() for a list
+ * of valid values for this parameter.
+ *
+ */
+void
+xfs_iunlock(
+ xfs_inode_t *ip,
+ uint lock_flags)
+{
+ /*
+ * You can't set both SHARED and EXCL for the same lock,
+ * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED,
+ * and XFS_ILOCK_EXCL are valid values to set in lock_flags.
+ */
+ ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) !=
+ (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL));
+ ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) !=
+ (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL));
+ ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0);
+ ASSERT(lock_flags != 0);
+
+ if (lock_flags & XFS_IOLOCK_EXCL)
+ mrunlock_excl(&ip->i_iolock);
+ else if (lock_flags & XFS_IOLOCK_SHARED)
+ mrunlock_shared(&ip->i_iolock);
+
+ if (lock_flags & XFS_ILOCK_EXCL)
+ mrunlock_excl(&ip->i_lock);
+ else if (lock_flags & XFS_ILOCK_SHARED)
+ mrunlock_shared(&ip->i_lock);
+
+ trace_xfs_iunlock(ip, lock_flags, _RET_IP_);
+}
+
+/*
+ * give up write locks. the i/o lock cannot be held nested
+ * if it is being demoted.
+ */
+void
+xfs_ilock_demote(
+ xfs_inode_t *ip,
+ uint lock_flags)
+{
+ ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL));
+ ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0);
+
+ if (lock_flags & XFS_ILOCK_EXCL)
+ mrdemote(&ip->i_lock);
+ if (lock_flags & XFS_IOLOCK_EXCL)
+ mrdemote(&ip->i_iolock);
+
+ trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_);
+}
+
+#ifdef DEBUG
+int
+xfs_isilocked(
+ xfs_inode_t *ip,
+ uint lock_flags)
+{
+ if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
+ if (!(lock_flags & XFS_ILOCK_SHARED))
+ return !!ip->i_lock.mr_writer;
+ return rwsem_is_locked(&ip->i_lock.mr_lock);
+ }
+
+ if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
+ if (!(lock_flags & XFS_IOLOCK_SHARED))
+ return !!ip->i_iolock.mr_writer;
+ return rwsem_is_locked(&ip->i_iolock.mr_lock);
+ }
+
+ ASSERT(0);
+ return 0;
+}
+#endif
+
+void
+__xfs_iflock(
+ struct xfs_inode *ip)
+{
+ wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT);
+ DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT);
+
+ do {
+ prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE);
+ if (xfs_isiflocked(ip))
+ io_schedule();
+ } while (!xfs_iflock_nowait(ip));
+
+ finish_wait(wq, &wait.wait);
+}
+
#ifdef DEBUG
/*
* Make sure that the extents in the given memory buffer
@@ -131,6 +382,65 @@ xfs_inobp_check(
}
#endif
+static void
+xfs_inode_buf_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ int i;
+ int ni;
+
+ /*
+ * Validate the magic number and version of every inode in the buffer
+ */
+ ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
+ for (i = 0; i < ni; i++) {
+ int di_ok;
+ xfs_dinode_t *dip;
+
+ dip = (struct xfs_dinode *)xfs_buf_offset(bp,
+ (i << mp->m_sb.sb_inodelog));
+ di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
+ XFS_DINODE_GOOD_VERSION(dip->di_version);
+ if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
+ XFS_ERRTAG_ITOBP_INOTOBP,
+ XFS_RANDOM_ITOBP_INOTOBP))) {
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+ XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH,
+ mp, dip);
+#ifdef DEBUG
+ xfs_emerg(mp,
+ "bad inode magic/vsn daddr %lld #%d (magic=%x)",
+ (unsigned long long)bp->b_bn, i,
+ be16_to_cpu(dip->di_magic));
+ ASSERT(0);
+#endif
+ }
+ }
+ xfs_inobp_check(mp, bp);
+}
+
+
+static void
+xfs_inode_buf_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_inode_buf_verify(bp);
+}
+
+static void
+xfs_inode_buf_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_inode_buf_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_inode_buf_ops = {
+ .verify_read = xfs_inode_buf_read_verify,
+ .verify_write = xfs_inode_buf_write_verify,
+};
+
+
/*
* This routine is called to map an inode to the buffer containing the on-disk
* version of the inode. It returns a pointer to the buffer containing the
@@ -145,71 +455,33 @@ xfs_imap_to_bp(
struct xfs_mount *mp,
struct xfs_trans *tp,
struct xfs_imap *imap,
- struct xfs_dinode **dipp,
+ struct xfs_dinode **dipp,
struct xfs_buf **bpp,
uint buf_flags,
uint iget_flags)
{
struct xfs_buf *bp;
int error;
- int i;
- int ni;
buf_flags |= XBF_UNMAPPED;
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
- (int)imap->im_len, buf_flags, &bp);
+ (int)imap->im_len, buf_flags, &bp,
+ &xfs_inode_buf_ops);
if (error) {
- if (error != EAGAIN) {
- xfs_warn(mp,
- "%s: xfs_trans_read_buf() returned error %d.",
- __func__, error);
- } else {
+ if (error == EAGAIN) {
ASSERT(buf_flags & XBF_TRYLOCK);
+ return error;
}
- return error;
- }
-
- /*
- * Validate the magic number and version of every inode in the buffer
- * (if DEBUG kernel) or the first inode in the buffer, otherwise.
- */
-#ifdef DEBUG
- ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog;
-#else /* usual case */
- ni = 1;
-#endif
- for (i = 0; i < ni; i++) {
- int di_ok;
- xfs_dinode_t *dip;
+ if (error == EFSCORRUPTED &&
+ (iget_flags & XFS_IGET_UNTRUSTED))
+ return XFS_ERROR(EINVAL);
- dip = (xfs_dinode_t *)xfs_buf_offset(bp,
- (i << mp->m_sb.sb_inodelog));
- di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
- XFS_DINODE_GOOD_VERSION(dip->di_version);
- if (unlikely(XFS_TEST_ERROR(!di_ok, mp,
- XFS_ERRTAG_ITOBP_INOTOBP,
- XFS_RANDOM_ITOBP_INOTOBP))) {
- if (iget_flags & XFS_IGET_UNTRUSTED) {
- xfs_trans_brelse(tp, bp);
- return XFS_ERROR(EINVAL);
- }
- XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH,
- mp, dip);
-#ifdef DEBUG
- xfs_emerg(mp,
- "bad inode magic/vsn daddr %lld #%d (magic=%x)",
- (unsigned long long)imap->im_blkno, i,
- be16_to_cpu(dip->di_magic));
- ASSERT(0);
-#endif
- xfs_trans_brelse(tp, bp);
- return XFS_ERROR(EFSCORRUPTED);
- }
+ xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.",
+ __func__, error);
+ return error;
}
- xfs_inobp_check(mp, bp);
-
*bpp = bp;
*dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset);
return 0;
@@ -853,16 +1125,16 @@ xfs_iread_extents(
* set according to the contents of the given cred structure.
*
* Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc()
- * has a free inode available, call xfs_iget()
- * to obtain the in-core version of the allocated inode. Finally,
- * fill in the inode and log its initial contents. In this case,
- * ialloc_context would be set to NULL and call_again set to false.
+ * has a free inode available, call xfs_iget() to obtain the in-core
+ * version of the allocated inode. Finally, fill in the inode and
+ * log its initial contents. In this case, ialloc_context would be
+ * set to NULL.
*
- * If xfs_dialloc() does not have an available inode,
- * it will replenish its supply by doing an allocation. Since we can
- * only do one allocation within a transaction without deadlocks, we
- * must commit the current transaction before returning the inode itself.
- * In this case, therefore, we will set call_again to true and return.
+ * If xfs_dialloc() does not have an available inode, it will replenish
+ * its supply by doing an allocation. Since we can only do one
+ * allocation within a transaction without deadlocks, we must commit
+ * the current transaction before returning the inode itself.
+ * In this case, therefore, we will set ialloc_context and return.
* The caller should then commit the current transaction, start a new
* transaction, and call xfs_ialloc() again to actually get the inode.
*
@@ -1509,10 +1781,23 @@ xfs_ifree_cluster(
* to mark all the active inodes on the buffer stale.
*/
bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
- mp->m_bsize * blks_per_cluster, 0);
+ mp->m_bsize * blks_per_cluster,
+ XBF_UNMAPPED);
if (!bp)
return ENOMEM;
+
+ /*
+ * This buffer may not have been correctly initialised as we
+ * didn't read it from disk. That's not important because we are
+ * only using it to mark the buffer as stale in the log, and to
+ * attach stale cached inodes on it. That means it will never be
+ * dispatched for IO. If it is, we want to know about it, and we
+ * want it to fail. We can achieve this by adding a write
+ * verifier to the buffer.
+ */
+ bp->b_ops = &xfs_inode_buf_ops;
+
/*
* Walk the inodes already attached to the buffer and mark them
* stale. These will all have the flush locks held, so an
@@ -3660,3 +3945,40 @@ xfs_iext_irec_update_extoffs(
ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
}
}
+
+/*
+ * Test whether it is appropriate to check an inode for and free post EOF
+ * blocks. The 'force' parameter determines whether we should also consider
+ * regular files that are marked preallocated or append-only.
+ */
+bool
+xfs_can_free_eofblocks(struct xfs_inode *ip, bool force)
+{
+ /* prealloc/delalloc exists only on regular files */
+ if (!S_ISREG(ip->i_d.di_mode))
+ return false;
+
+ /*
+ * Zero sized files with no cached pages and delalloc blocks will not
+ * have speculative prealloc/delalloc blocks to remove.
+ */
+ if (VFS_I(ip)->i_size == 0 &&
+ VN_CACHED(VFS_I(ip)) == 0 &&
+ ip->i_delayed_blks == 0)
+ return false;
+
+ /* If we haven't read in the extent list, then don't do it now. */
+ if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
+ return false;
+
+ /*
+ * Do not free real preallocated or append-only files unless the file
+ * has delalloc blocks and we are forced to remove them.
+ */
+ if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
+ if (!force || ip->i_delayed_blks == 0)
+ return false;
+
+ return true;
+}
+
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 94b32f906e79..22baf6ea4fac 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -496,11 +496,10 @@ static inline int xfs_isiflocked(struct xfs_inode *ip)
(((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \
((pip)->i_d.di_mode & S_ISGID))
+
/*
- * xfs_iget.c prototypes.
+ * xfs_inode.c prototypes.
*/
-int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t,
- uint, uint, xfs_inode_t **);
void xfs_ilock(xfs_inode_t *, uint);
int xfs_ilock_nowait(xfs_inode_t *, uint);
void xfs_iunlock(xfs_inode_t *, uint);
@@ -508,11 +507,6 @@ void xfs_ilock_demote(xfs_inode_t *, uint);
int xfs_isilocked(xfs_inode_t *, uint);
uint xfs_ilock_map_shared(xfs_inode_t *);
void xfs_iunlock_map_shared(xfs_inode_t *, uint);
-void xfs_inode_free(struct xfs_inode *ip);
-
-/*
- * xfs_inode.c prototypes.
- */
int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t,
xfs_nlink_t, xfs_dev_t, prid_t, int,
struct xfs_buf **, xfs_inode_t **);
@@ -591,6 +585,7 @@ void xfs_iext_irec_compact(xfs_ifork_t *);
void xfs_iext_irec_compact_pages(xfs_ifork_t *);
void xfs_iext_irec_compact_full(xfs_ifork_t *);
void xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int);
+bool xfs_can_free_eofblocks(struct xfs_inode *, bool);
#define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount))
@@ -603,5 +598,6 @@ void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
extern struct kmem_zone *xfs_ifork_zone;
extern struct kmem_zone *xfs_inode_zone;
extern struct kmem_zone *xfs_ili_zone;
+extern const struct xfs_buf_ops xfs_inode_buf_ops;
#endif /* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 8305f2ac6773..c1c3ef88a260 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -42,6 +42,7 @@
#include "xfs_inode_item.h"
#include "xfs_export.h"
#include "xfs_trace.h"
+#include "xfs_icache.h"
#include <linux/capability.h>
#include <linux/dcache.h>
@@ -70,7 +71,7 @@ xfs_find_handle(
int hsize;
xfs_handle_t handle;
struct inode *inode;
- struct fd f;
+ struct fd f = {0};
struct path path;
int error;
struct xfs_inode *ip;
@@ -1602,6 +1603,26 @@ xfs_file_ioctl(
error = xfs_errortag_clearall(mp, 1);
return -error;
+ case XFS_IOC_FREE_EOFBLOCKS: {
+ struct xfs_eofblocks eofb;
+
+ if (copy_from_user(&eofb, arg, sizeof(eofb)))
+ return -XFS_ERROR(EFAULT);
+
+ if (eofb.eof_version != XFS_EOFBLOCKS_VERSION)
+ return -XFS_ERROR(EINVAL);
+
+ if (eofb.eof_flags & ~XFS_EOF_FLAGS_VALID)
+ return -XFS_ERROR(EINVAL);
+
+ if (memchr_inv(&eofb.pad32, 0, sizeof(eofb.pad32)) ||
+ memchr_inv(eofb.pad64, 0, sizeof(eofb.pad64)))
+ return -XFS_ERROR(EINVAL);
+
+ error = xfs_icache_free_eofblocks(mp, &eofb);
+ return -error;
+ }
+
default:
return -ENOTTY;
}
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 973dff6ad935..add06b4e9a63 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -41,6 +41,7 @@
#include "xfs_utils.h"
#include "xfs_iomap.h"
#include "xfs_trace.h"
+#include "xfs_icache.h"
#define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \
@@ -373,7 +374,7 @@ xfs_iomap_write_delay(
xfs_extlen_t extsz;
int nimaps;
xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS];
- int prealloc, flushed = 0;
+ int prealloc;
int error;
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -434,31 +435,29 @@ retry:
}
/*
- * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For
- * ENOSPC, * flush all other inodes with delalloc blocks to free up
- * some of the excess reserved metadata space. For both cases, retry
+ * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry
* without EOF preallocation.
*/
if (nimaps == 0) {
trace_xfs_delalloc_enospc(ip, offset, count);
- if (flushed)
- return XFS_ERROR(error ? error : ENOSPC);
-
- if (error == ENOSPC) {
- xfs_iunlock(ip, XFS_ILOCK_EXCL);
- xfs_flush_inodes(ip);
- xfs_ilock(ip, XFS_ILOCK_EXCL);
+ if (prealloc) {
+ prealloc = 0;
+ error = 0;
+ goto retry;
}
-
- flushed = 1;
- error = 0;
- prealloc = 0;
- goto retry;
+ return XFS_ERROR(error ? error : ENOSPC);
}
if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
return xfs_alert_fsblock_zero(ip, &imap[0]);
+ /*
+ * Tag the inode as speculatively preallocated so we can reclaim this
+ * space on demand, if necessary.
+ */
+ if (prealloc)
+ xfs_inode_set_eofblocks_tag(ip);
+
*ret_imap = imap[0];
return 0;
}
@@ -584,7 +583,9 @@ xfs_iomap_write_allocate(
* pointer that the caller gave to us.
*/
error = xfs_bmapi_write(tp, ip, map_start_fsb,
- count_fsb, 0, &first_block, 1,
+ count_fsb,
+ XFS_BMAPI_STACK_SWITCH,
+ &first_block, 1,
imap, &nimaps, &free_list);
if (error)
goto trans_cancel;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 4e00cf091d2c..d82efaa2ac73 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -38,6 +38,7 @@
#include "xfs_vnodeops.h"
#include "xfs_inode_item.h"
#include "xfs_trace.h"
+#include "xfs_icache.h"
#include <linux/capability.h>
#include <linux/xattr.h>
@@ -779,8 +780,8 @@ xfs_setattr_size(
* care about here.
*/
if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) {
- error = xfs_flush_pages(ip, ip->i_d.di_size, newsize, 0,
- FI_NONE);
+ error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+ ip->i_d.di_size, newsize);
if (error)
goto out_unlock;
}
@@ -854,6 +855,9 @@ xfs_setattr_size(
* and do not wait the usual (long) time for writeout.
*/
xfs_iflags_set(ip, XFS_ITRUNCATED);
+
+ /* A truncate down always removes post-EOF blocks. */
+ xfs_inode_clear_eofblocks_tag(ip);
}
if (mask & ATTR_CTIME) {
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index 01d10a66e302..2ea7d402188d 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -34,6 +34,7 @@
#include "xfs_error.h"
#include "xfs_btree.h"
#include "xfs_trace.h"
+#include "xfs_icache.h"
STATIC int
xfs_internal_inum(
@@ -395,7 +396,8 @@ xfs_bulkstat(
if (xfs_inobt_maskn(chunkidx, nicluster)
& ~r.ir_free)
xfs_btree_reada_bufs(mp, agno,
- agbno, nbcluster);
+ agbno, nbcluster,
+ &xfs_inode_buf_ops);
}
irbp->ir_startino = r.ir_startino;
irbp->ir_freecount = r.ir_freecount;
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 828662f70d64..fe7e4df85a7b 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -44,6 +44,7 @@
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
+#include <linux/crc32c.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/file.h>
@@ -118,6 +119,7 @@
#define xfs_rotorstep xfs_params.rotorstep.val
#define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val
#define xfs_fstrm_centisecs xfs_params.fstrm_timer.val
+#define xfs_eofb_secs xfs_params.eofb_timer.val
#define current_cpu() (raw_smp_processor_id())
#define current_pid() (current->pid)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 7f4f9370d0e7..46bd9d52ab51 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -34,6 +34,8 @@
#include "xfs_dinode.h"
#include "xfs_inode.h"
#include "xfs_trace.h"
+#include "xfs_fsops.h"
+#include "xfs_cksum.h"
kmem_zone_t *xfs_log_ticket_zone;
@@ -458,7 +460,8 @@ xfs_log_reserve(
tic->t_trans_type = t_type;
*ticp = tic;
- xlog_grant_push_ail(log, tic->t_unit_res * tic->t_cnt);
+ xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt
+ : tic->t_unit_res);
trace_xfs_log_reserve(log, tic);
@@ -679,25 +682,29 @@ out:
}
/*
- * Finish the recovery of the file system. This is separate from
- * the xfs_log_mount() call, because it depends on the code in
- * xfs_mountfs() to read in the root and real-time bitmap inodes
- * between calling xfs_log_mount() and here.
+ * Finish the recovery of the file system. This is separate from the
+ * xfs_log_mount() call, because it depends on the code in xfs_mountfs() to read
+ * in the root and real-time bitmap inodes between calling xfs_log_mount() and
+ * here.
*
- * mp - ubiquitous xfs mount point structure
+ * If we finish recovery successfully, start the background log work. If we are
+ * not doing recovery, then we have a RO filesystem and we don't need to start
+ * it.
*/
int
xfs_log_mount_finish(xfs_mount_t *mp)
{
- int error;
+ int error = 0;
- if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
+ if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
error = xlog_recover_finish(mp->m_log);
- else {
- error = 0;
+ if (!error)
+ xfs_log_work_queue(mp);
+ } else {
ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
}
+
return error;
}
@@ -850,15 +857,49 @@ xfs_log_unmount_write(xfs_mount_t *mp)
} /* xfs_log_unmount_write */
/*
- * Deallocate log structures for unmount/relocation.
+ * Empty the log for unmount/freeze.
+ *
+ * To do this, we first need to shut down the background log work so it is not
+ * trying to cover the log as we clean up. We then need to unpin all objects in
+ * the log so we can then flush them out. Once they have completed their IO and
+ * run the callbacks removing themselves from the AIL, we can write the unmount
+ * record.
+ */
+void
+xfs_log_quiesce(
+ struct xfs_mount *mp)
+{
+ cancel_delayed_work_sync(&mp->m_log->l_work);
+ xfs_log_force(mp, XFS_LOG_SYNC);
+
+ /*
+ * The superblock buffer is uncached and while xfs_ail_push_all_sync()
+ * will push it, xfs_wait_buftarg() will not wait for it. Further,
+ * xfs_buf_iowait() cannot be used because it was pushed with the
+ * XBF_ASYNC flag set, so we need to use a lock/unlock pair to wait for
+ * the IO to complete.
+ */
+ xfs_ail_push_all_sync(mp->m_ail);
+ xfs_wait_buftarg(mp->m_ddev_targp);
+ xfs_buf_lock(mp->m_sb_bp);
+ xfs_buf_unlock(mp->m_sb_bp);
+
+ xfs_log_unmount_write(mp);
+}
+
+/*
+ * Shut down and release the AIL and Log.
*
- * We need to stop the aild from running before we destroy
- * and deallocate the log as the aild references the log.
+ * During unmount, we need to ensure we flush all the dirty metadata objects
+ * from the AIL so that the log is empty before we write the unmount record to
+ * the log. Once this is done, we can tear down the AIL and the log.
*/
void
-xfs_log_unmount(xfs_mount_t *mp)
+xfs_log_unmount(
+ struct xfs_mount *mp)
{
- cancel_delayed_work_sync(&mp->m_sync_work);
+ xfs_log_quiesce(mp);
+
xfs_trans_ail_destroy(mp);
xlog_dealloc_log(mp->m_log);
}
@@ -1090,8 +1131,7 @@ xlog_iodone(xfs_buf_t *bp)
* with it being freed after writing the unmount record to the
* log.
*/
-
-} /* xlog_iodone */
+}
/*
* Return size of each in-core log record buffer.
@@ -1161,6 +1201,40 @@ done:
} /* xlog_get_iclog_buffer_size */
+void
+xfs_log_work_queue(
+ struct xfs_mount *mp)
+{
+ queue_delayed_work(mp->m_log_workqueue, &mp->m_log->l_work,
+ msecs_to_jiffies(xfs_syncd_centisecs * 10));
+}
+
+/*
+ * Every sync period we need to unpin all items in the AIL and push them to
+ * disk. If there is nothing dirty, then we might need to cover the log to
+ * indicate that the filesystem is idle.
+ */
+void
+xfs_log_worker(
+ struct work_struct *work)
+{
+ struct xlog *log = container_of(to_delayed_work(work),
+ struct xlog, l_work);
+ struct xfs_mount *mp = log->l_mp;
+
+ /* dgc: errors ignored - not fatal and nowhere to report them */
+ if (xfs_log_need_covered(mp))
+ xfs_fs_log_dummy(mp);
+ else
+ xfs_log_force(mp, 0);
+
+ /* start pushing all the metadata that is currently dirty */
+ xfs_ail_push_all(mp->m_ail);
+
+ /* queue us up again */
+ xfs_log_work_queue(mp);
+}
+
/*
* This routine initializes some of the log structure for a given mount point.
* Its primary purpose is to fill in enough, so recovery can occur. However,
@@ -1195,6 +1269,7 @@ xlog_alloc_log(
log->l_logBBsize = num_bblks;
log->l_covered_state = XLOG_STATE_COVER_IDLE;
log->l_flags |= XLOG_ACTIVE_RECOVERY;
+ INIT_DELAYED_WORK(&log->l_work, xfs_log_worker);
log->l_prev_block = -1;
/* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */
@@ -1417,6 +1492,84 @@ xlog_grant_push_ail(
}
/*
+ * Stamp cycle number in every block
+ */
+STATIC void
+xlog_pack_data(
+ struct xlog *log,
+ struct xlog_in_core *iclog,
+ int roundoff)
+{
+ int i, j, k;
+ int size = iclog->ic_offset + roundoff;
+ __be32 cycle_lsn;
+ xfs_caddr_t dp;
+
+ cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
+
+ dp = iclog->ic_datap;
+ for (i = 0; i < BTOBB(size); i++) {
+ if (i >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE))
+ break;
+ iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp;
+ *(__be32 *)dp = cycle_lsn;
+ dp += BBSIZE;
+ }
+
+ if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+ xlog_in_core_2_t *xhdr = iclog->ic_data;
+
+ for ( ; i < BTOBB(size); i++) {
+ j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+ k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+ xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp;
+ *(__be32 *)dp = cycle_lsn;
+ dp += BBSIZE;
+ }
+
+ for (i = 1; i < log->l_iclog_heads; i++)
+ xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
+ }
+}
+
+/*
+ * Calculate the checksum for a log buffer.
+ *
+ * This is a little more complicated than it should be because the various
+ * headers and the actual data are non-contiguous.
+ */
+__le32
+xlog_cksum(
+ struct xlog *log,
+ struct xlog_rec_header *rhead,
+ char *dp,
+ int size)
+{
+ __uint32_t crc;
+
+ /* first generate the crc for the record header ... */
+ crc = xfs_start_cksum((char *)rhead,
+ sizeof(struct xlog_rec_header),
+ offsetof(struct xlog_rec_header, h_crc));
+
+ /* ... then for additional cycle data for v2 logs ... */
+ if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+ union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead;
+ int i;
+
+ for (i = 1; i < log->l_iclog_heads; i++) {
+ crc = crc32c(crc, &xhdr[i].hic_xheader,
+ sizeof(struct xlog_rec_ext_header));
+ }
+ }
+
+ /* ... and finally for the payload */
+ crc = crc32c(crc, dp, size);
+
+ return xfs_end_cksum(crc);
+}
+
+/*
* The bdstrat callback function for log bufs. This gives us a central
* place to trap bufs in case we get hit by a log I/O error and need to
* shutdown. Actually, in practice, even when we didn't get a log error,
@@ -1476,7 +1629,6 @@ xlog_sync(
struct xlog *log,
struct xlog_in_core *iclog)
{
- xfs_caddr_t dptr; /* pointer to byte sized element */
xfs_buf_t *bp;
int i;
uint count; /* byte count of bwrite */
@@ -1485,6 +1637,7 @@ xlog_sync(
int split = 0; /* split write into two regions */
int error;
int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb);
+ int size;
XFS_STATS_INC(xs_log_writes);
ASSERT(atomic_read(&iclog->ic_refcnt) == 0);
@@ -1515,13 +1668,10 @@ xlog_sync(
xlog_pack_data(log, iclog, roundoff);
/* real byte length */
- if (v2) {
- iclog->ic_header.h_len =
- cpu_to_be32(iclog->ic_offset + roundoff);
- } else {
- iclog->ic_header.h_len =
- cpu_to_be32(iclog->ic_offset);
- }
+ size = iclog->ic_offset;
+ if (v2)
+ size += roundoff;
+ iclog->ic_header.h_len = cpu_to_be32(size);
bp = iclog->ic_bp;
XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn)));
@@ -1530,12 +1680,36 @@ xlog_sync(
/* Do we need to split this write into 2 parts? */
if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) {
+ char *dptr;
+
split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)));
count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp));
- iclog->ic_bwritecnt = 2; /* split into 2 writes */
+ iclog->ic_bwritecnt = 2;
+
+ /*
+ * Bump the cycle numbers at the start of each block in the
+ * part of the iclog that ends up in the buffer that gets
+ * written to the start of the log.
+ *
+ * Watch out for the header magic number case, though.
+ */
+ dptr = (char *)&iclog->ic_header + count;
+ for (i = 0; i < split; i += BBSIZE) {
+ __uint32_t cycle = be32_to_cpu(*(__be32 *)dptr);
+ if (++cycle == XLOG_HEADER_MAGIC_NUM)
+ cycle++;
+ *(__be32 *)dptr = cpu_to_be32(cycle);
+
+ dptr += BBSIZE;
+ }
} else {
iclog->ic_bwritecnt = 1;
}
+
+ /* calculate the checksum */
+ iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header,
+ iclog->ic_datap, size);
+
bp->b_io_length = BTOBB(count);
bp->b_fspriv = iclog;
XFS_BUF_ZEROFLAGS(bp);
@@ -1589,19 +1763,6 @@ xlog_sync(
bp->b_flags |= XBF_SYNCIO;
if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
bp->b_flags |= XBF_FUA;
- dptr = bp->b_addr;
- /*
- * Bump the cycle numbers at the start of each block
- * since this part of the buffer is at the start of
- * a new cycle. Watch out for the header magic number
- * case, though.
- */
- for (i = 0; i < split; i += BBSIZE) {
- be32_add_cpu((__be32 *)dptr, 1);
- if (be32_to_cpu(*(__be32 *)dptr) == XLOG_HEADER_MAGIC_NUM)
- be32_add_cpu((__be32 *)dptr, 1);
- dptr += BBSIZE;
- }
ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
@@ -1618,7 +1779,6 @@ xlog_sync(
return 0;
} /* xlog_sync */
-
/*
* Deallocate a log structure
*/
@@ -2387,14 +2547,27 @@ xlog_state_do_callback(
/*
- * update the last_sync_lsn before we drop the
+ * Completion of an iclog IO does not imply that
+ * a transaction has completed, as transactions
+ * can be large enough to span many iclogs. We
+ * cannot change the tail of the log half way
+ * through a transaction as this may be the only
+ * transaction in the log and moving the tail to
+ * point to the middle of it will prevent
+ * recovery from finding the start of the
+ * transaction. Hence we should only update the
+ * last_sync_lsn if this iclog contains
+ * transaction completion callbacks on it.
+ *
+ * We have to do this before we drop the
* icloglock to ensure we are the only one that
* can update it.
*/
ASSERT(XFS_LSN_CMP(atomic64_read(&log->l_last_sync_lsn),
be64_to_cpu(iclog->ic_header.h_lsn)) <= 0);
- atomic64_set(&log->l_last_sync_lsn,
- be64_to_cpu(iclog->ic_header.h_lsn));
+ if (iclog->ic_callback)
+ atomic64_set(&log->l_last_sync_lsn,
+ be64_to_cpu(iclog->ic_header.h_lsn));
} else
ioerrors++;
@@ -3700,3 +3873,4 @@ xlog_iclogs_empty(
} while (iclog != log->l_iclog);
return 1;
}
+
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 748d312850e2..5caee96059df 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -181,5 +181,9 @@ int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
xfs_lsn_t *commit_lsn, int flags);
bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
+void xfs_log_work_queue(struct xfs_mount *mp);
+void xfs_log_worker(struct work_struct *work);
+void xfs_log_quiesce(struct xfs_mount *mp);
+
#endif
#endif /* __XFS_LOG_H__ */
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 18a801d76a42..16d8d12ea3b4 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -139,7 +139,6 @@ static inline uint xlog_get_client_id(__be32 i)
/*
* Flags for log structure
*/
-#define XLOG_CHKSUM_MISMATCH 0x1 /* used only during recovery */
#define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */
#define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */
#define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being
@@ -291,7 +290,7 @@ typedef struct xlog_rec_header {
__be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */
__be64 h_lsn; /* lsn of this LR : 8 */
__be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */
- __be32 h_chksum; /* may not be used; non-zero if used : 4 */
+ __le32 h_crc; /* crc of log record : 4 */
__be32 h_prev_block; /* block number to previous LR : 4 */
__be32 h_num_logops; /* number of log operations in this LR : 4 */
__be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE];
@@ -495,6 +494,7 @@ struct xlog {
struct xfs_buf *l_xbuf; /* extra buffer for log
* wrapping */
struct xfs_buftarg *l_targ; /* buftarg of log */
+ struct delayed_work l_work; /* background flush work */
uint l_flags;
uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */
struct list_head *l_buf_cancel_table;
@@ -554,11 +554,9 @@ xlog_recover(
extern int
xlog_recover_finish(
struct xlog *log);
-extern void
-xlog_pack_data(
- struct xlog *log,
- struct xlog_in_core *iclog,
- int);
+
+extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead,
+ char *dp, int size);
extern kmem_zone_t *xfs_log_ticket_zone;
struct xlog_ticket *
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 5da3ace352bf..96fcbb85ff83 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -41,7 +41,9 @@
#include "xfs_trans_priv.h"
#include "xfs_quota.h"
#include "xfs_utils.h"
+#include "xfs_cksum.h"
#include "xfs_trace.h"
+#include "xfs_icache.h"
STATIC int
xlog_find_zeroed(
@@ -2143,7 +2145,7 @@ xlog_recover_buffer_pass2(
buf_flags |= XBF_UNMAPPED;
bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
- buf_flags);
+ buf_flags, NULL);
if (!bp)
return XFS_ERROR(ENOMEM);
error = bp->b_error;
@@ -2236,7 +2238,8 @@ xlog_recover_inode_pass2(
}
trace_xfs_log_recover_inode_recover(log, in_f);
- bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0);
+ bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0,
+ NULL);
if (!bp) {
error = ENOMEM;
goto error;
@@ -2547,7 +2550,8 @@ xlog_recover_dquot_pass2(
ASSERT(dq_f->qlf_len == 1);
error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno,
- XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp);
+ XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp,
+ NULL);
if (error)
return error;
@@ -3213,80 +3217,58 @@ xlog_recover_process_iunlinks(
mp->m_dmevmask = mp_dmevmask;
}
-
-#ifdef DEBUG
-STATIC void
-xlog_pack_data_checksum(
- struct xlog *log,
- struct xlog_in_core *iclog,
- int size)
-{
- int i;
- __be32 *up;
- uint chksum = 0;
-
- up = (__be32 *)iclog->ic_datap;
- /* divide length by 4 to get # words */
- for (i = 0; i < (size >> 2); i++) {
- chksum ^= be32_to_cpu(*up);
- up++;
- }
- iclog->ic_header.h_chksum = cpu_to_be32(chksum);
-}
-#else
-#define xlog_pack_data_checksum(log, iclog, size)
-#endif
-
/*
- * Stamp cycle number in every block
+ * Unpack the log buffer data and crc check it. If the check fails, issue a
+ * warning if and only if the CRC in the header is non-zero. This makes the
+ * check an advisory warning, and the zero CRC check will prevent failure
+ * warnings from being emitted when upgrading the kernel from one that does not
+ * add CRCs by default.
+ *
+ * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log
+ * corruption failure.
*/
-void
-xlog_pack_data(
- struct xlog *log,
- struct xlog_in_core *iclog,
- int roundoff)
+STATIC int
+xlog_unpack_data_crc(
+ struct xlog_rec_header *rhead,
+ xfs_caddr_t dp,
+ struct xlog *log)
{
- int i, j, k;
- int size = iclog->ic_offset + roundoff;
- __be32 cycle_lsn;
- xfs_caddr_t dp;
-
- xlog_pack_data_checksum(log, iclog, size);
-
- cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
-
- dp = iclog->ic_datap;
- for (i = 0; i < BTOBB(size) &&
- i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
- iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp;
- *(__be32 *)dp = cycle_lsn;
- dp += BBSIZE;
- }
-
- if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
- xlog_in_core_2_t *xhdr = iclog->ic_data;
-
- for ( ; i < BTOBB(size); i++) {
- j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
- k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
- xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp;
- *(__be32 *)dp = cycle_lsn;
- dp += BBSIZE;
+ __le32 crc;
+
+ crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
+ if (crc != rhead->h_crc) {
+ if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
+ xfs_alert(log->l_mp,
+ "log record CRC mismatch: found 0x%x, expected 0x%x.\n",
+ le32_to_cpu(rhead->h_crc),
+ le32_to_cpu(crc));
+ xfs_hex_dump(dp, 32);
}
- for (i = 1; i < log->l_iclog_heads; i++) {
- xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
- }
+ /*
+ * If we've detected a log record corruption, then we can't
+ * recover past this point. Abort recovery if we are enforcing
+ * CRC protection by punting an error back up the stack.
+ */
+ if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
+ return EFSCORRUPTED;
}
+
+ return 0;
}
-STATIC void
+STATIC int
xlog_unpack_data(
struct xlog_rec_header *rhead,
xfs_caddr_t dp,
struct xlog *log)
{
int i, j, k;
+ int error;
+
+ error = xlog_unpack_data_crc(rhead, dp, log);
+ if (error)
+ return error;
for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
@@ -3303,6 +3285,8 @@ xlog_unpack_data(
dp += BBSIZE;
}
}
+
+ return 0;
}
STATIC int
@@ -3434,9 +3418,13 @@ xlog_do_recovery_pass(
if (error)
goto bread_err2;
- xlog_unpack_data(rhead, offset, log);
- if ((error = xlog_recover_process_data(log,
- rhash, rhead, offset, pass)))
+ error = xlog_unpack_data(rhead, offset, log);
+ if (error)
+ goto bread_err2;
+
+ error = xlog_recover_process_data(log,
+ rhash, rhead, offset, pass);
+ if (error)
goto bread_err2;
blk_no += bblks + hblks;
}
@@ -3541,14 +3529,19 @@ xlog_do_recovery_pass(
* - order is important.
*/
error = xlog_bread_offset(log, 0,
- bblks - split_bblks, hbp,
+ bblks - split_bblks, dbp,
offset + BBTOB(split_bblks));
if (error)
goto bread_err2;
}
- xlog_unpack_data(rhead, offset, log);
- if ((error = xlog_recover_process_data(log, rhash,
- rhead, offset, pass)))
+
+ error = xlog_unpack_data(rhead, offset, log);
+ if (error)
+ goto bread_err2;
+
+ error = xlog_recover_process_data(log, rhash,
+ rhead, offset, pass);
+ if (error)
goto bread_err2;
blk_no += bblks;
}
@@ -3573,9 +3566,13 @@ xlog_do_recovery_pass(
if (error)
goto bread_err2;
- xlog_unpack_data(rhead, offset, log);
- if ((error = xlog_recover_process_data(log, rhash,
- rhead, offset, pass)))
+ error = xlog_unpack_data(rhead, offset, log);
+ if (error)
+ goto bread_err2;
+
+ error = xlog_recover_process_data(log, rhash,
+ rhead, offset, pass);
+ if (error)
goto bread_err2;
blk_no += bblks + hblks;
}
@@ -3689,13 +3686,14 @@ xlog_do_recover(
/*
* Now that we've finished replaying all buffer and inode
- * updates, re-read in the superblock.
+ * updates, re-read in the superblock and reverify it.
*/
bp = xfs_getsb(log->l_mp, 0);
XFS_BUF_UNDONE(bp);
ASSERT(!(XFS_BUF_ISWRITE(bp)));
XFS_BUF_READ(bp);
XFS_BUF_UNASYNC(bp);
+ bp->b_ops = &xfs_sb_buf_ops;
xfsbdstrat(log->l_mp, bp);
error = xfs_buf_iowait(bp);
if (error) {
@@ -3707,7 +3705,7 @@ xlog_do_recover(
/* Convert superblock from on-disk format */
sbp = &log->l_mp->m_sb;
- xfs_sb_from_disk(log->l_mp, XFS_BUF_TO_SBP(bp));
+ xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
ASSERT(xfs_sb_good_version(sbp));
xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index b2bd3a0e6376..da508463ff10 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -42,6 +42,7 @@
#include "xfs_fsops.h"
#include "xfs_utils.h"
#include "xfs_trace.h"
+#include "xfs_icache.h"
#ifdef HAVE_PERCPU_SB
@@ -303,9 +304,8 @@ STATIC int
xfs_mount_validate_sb(
xfs_mount_t *mp,
xfs_sb_t *sbp,
- int flags)
+ bool check_inprogress)
{
- int loud = !(flags & XFS_MFSI_QUIET);
/*
* If the log device and data device have the
@@ -315,21 +315,18 @@ xfs_mount_validate_sb(
* a volume filesystem in a non-volume manner.
*/
if (sbp->sb_magicnum != XFS_SB_MAGIC) {
- if (loud)
- xfs_warn(mp, "bad magic number");
+ xfs_warn(mp, "bad magic number");
return XFS_ERROR(EWRONGFS);
}
if (!xfs_sb_good_version(sbp)) {
- if (loud)
- xfs_warn(mp, "bad version");
+ xfs_warn(mp, "bad version");
return XFS_ERROR(EWRONGFS);
}
if (unlikely(
sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) {
- if (loud)
- xfs_warn(mp,
+ xfs_warn(mp,
"filesystem is marked as having an external log; "
"specify logdev on the mount command line.");
return XFS_ERROR(EINVAL);
@@ -337,8 +334,7 @@ xfs_mount_validate_sb(
if (unlikely(
sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) {
- if (loud)
- xfs_warn(mp,
+ xfs_warn(mp,
"filesystem is marked as having an internal log; "
"do not specify logdev on the mount command line.");
return XFS_ERROR(EINVAL);
@@ -372,8 +368,7 @@ xfs_mount_validate_sb(
sbp->sb_dblocks == 0 ||
sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) ||
sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) {
- if (loud)
- XFS_CORRUPTION_ERROR("SB sanity check failed",
+ XFS_CORRUPTION_ERROR("SB sanity check failed",
XFS_ERRLEVEL_LOW, mp, sbp);
return XFS_ERROR(EFSCORRUPTED);
}
@@ -382,12 +377,10 @@ xfs_mount_validate_sb(
* Until this is fixed only page-sized or smaller data blocks work.
*/
if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) {
- if (loud) {
- xfs_warn(mp,
+ xfs_warn(mp,
"File system with blocksize %d bytes. "
"Only pagesize (%ld) or less will currently work.",
sbp->sb_blocksize, PAGE_SIZE);
- }
return XFS_ERROR(ENOSYS);
}
@@ -401,23 +394,20 @@ xfs_mount_validate_sb(
case 2048:
break;
default:
- if (loud)
- xfs_warn(mp, "inode size of %d bytes not supported",
+ xfs_warn(mp, "inode size of %d bytes not supported",
sbp->sb_inodesize);
return XFS_ERROR(ENOSYS);
}
if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) ||
xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
- if (loud)
- xfs_warn(mp,
+ xfs_warn(mp,
"file system too large to be mounted on this system.");
return XFS_ERROR(EFBIG);
}
- if (unlikely(sbp->sb_inprogress)) {
- if (loud)
- xfs_warn(mp, "file system busy");
+ if (check_inprogress && sbp->sb_inprogress) {
+ xfs_warn(mp, "Offline file system operation in progress!");
return XFS_ERROR(EFSCORRUPTED);
}
@@ -425,9 +415,7 @@ xfs_mount_validate_sb(
* Version 1 directory format has never worked on Linux.
*/
if (unlikely(!xfs_sb_version_hasdirv2(sbp))) {
- if (loud)
- xfs_warn(mp,
- "file system using version 1 directory format");
+ xfs_warn(mp, "file system using version 1 directory format");
return XFS_ERROR(ENOSYS);
}
@@ -520,11 +508,9 @@ out_unwind:
void
xfs_sb_from_disk(
- struct xfs_mount *mp,
+ struct xfs_sb *to,
xfs_dsb_t *from)
{
- struct xfs_sb *to = &mp->m_sb;
-
to->sb_magicnum = be32_to_cpu(from->sb_magicnum);
to->sb_blocksize = be32_to_cpu(from->sb_blocksize);
to->sb_dblocks = be64_to_cpu(from->sb_dblocks);
@@ -626,6 +612,72 @@ xfs_sb_to_disk(
}
}
+static void
+xfs_sb_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_mount *mp = bp->b_target->bt_mount;
+ struct xfs_sb sb;
+ int error;
+
+ xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
+
+ /*
+ * Only check the in progress field for the primary superblock as
+ * mkfs.xfs doesn't clear it from secondary superblocks.
+ */
+ error = xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR);
+ if (error)
+ xfs_buf_ioerror(bp, error);
+}
+
+static void
+xfs_sb_read_verify(
+ struct xfs_buf *bp)
+{
+ xfs_sb_verify(bp);
+}
+
+/*
+ * We may be probed for a filesystem match, so we may not want to emit
+ * messages when the superblock buffer is not actually an XFS superblock.
+ * If we find an XFS superblock, then run a normal, noisy mount because we are
+ * really going to mount it and want to know about errors.
+ */
+static void
+xfs_sb_quiet_read_verify(
+ struct xfs_buf *bp)
+{
+ struct xfs_sb sb;
+
+ xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp));
+
+ if (sb.sb_magicnum == XFS_SB_MAGIC) {
+ /* XFS filesystem, verify noisily! */
+ xfs_sb_read_verify(bp);
+ return;
+ }
+ /* quietly fail */
+ xfs_buf_ioerror(bp, EFSCORRUPTED);
+}
+
+static void
+xfs_sb_write_verify(
+ struct xfs_buf *bp)
+{
+ xfs_sb_verify(bp);
+}
+
+const struct xfs_buf_ops xfs_sb_buf_ops = {
+ .verify_read = xfs_sb_read_verify,
+ .verify_write = xfs_sb_write_verify,
+};
+
+static const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
+ .verify_read = xfs_sb_quiet_read_verify,
+ .verify_write = xfs_sb_write_verify,
+};
+
/*
* xfs_readsb
*
@@ -651,26 +703,27 @@ xfs_readsb(xfs_mount_t *mp, int flags)
reread:
bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR,
- BTOBB(sector_size), 0);
+ BTOBB(sector_size), 0,
+ loud ? &xfs_sb_buf_ops
+ : &xfs_sb_quiet_buf_ops);
if (!bp) {
if (loud)
xfs_warn(mp, "SB buffer read failed");
return EIO;
}
-
- /*
- * Initialize the mount structure from the superblock.
- * But first do some basic consistency checking.
- */
- xfs_sb_from_disk(mp, XFS_BUF_TO_SBP(bp));
- error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags);
- if (error) {
+ if (bp->b_error) {
+ error = bp->b_error;
if (loud)
xfs_warn(mp, "SB validate failed");
goto release_buf;
}
/*
+ * Initialize the mount structure from the superblock.
+ */
+ xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp));
+
+ /*
* We must be able to do sector-sized and sector-aligned IO.
*/
if (sector_size > mp->m_sb.sb_sectsize) {
@@ -1001,7 +1054,7 @@ xfs_check_sizes(xfs_mount_t *mp)
}
bp = xfs_buf_read_uncached(mp->m_ddev_targp,
d - XFS_FSS_TO_BB(mp, 1),
- XFS_FSS_TO_BB(mp, 1), 0);
+ XFS_FSS_TO_BB(mp, 1), 0, NULL);
if (!bp) {
xfs_warn(mp, "last sector read failed");
return EIO;
@@ -1016,7 +1069,7 @@ xfs_check_sizes(xfs_mount_t *mp)
}
bp = xfs_buf_read_uncached(mp->m_logdev_targp,
d - XFS_FSB_TO_BB(mp, 1),
- XFS_FSB_TO_BB(mp, 1), 0);
+ XFS_FSB_TO_BB(mp, 1), 0, NULL);
if (!bp) {
xfs_warn(mp, "log device read failed");
return EIO;
@@ -1427,6 +1480,8 @@ xfs_unmountfs(
__uint64_t resblks;
int error;
+ cancel_delayed_work_sync(&mp->m_eofblocks_work);
+
xfs_qm_unmount_quotas(mp);
xfs_rtunmount_inodes(mp);
IRELE(mp->m_rootip);
@@ -1450,21 +1505,16 @@ xfs_unmountfs(
/*
* And reclaim all inodes. At this point there should be no dirty
- * inode, and none should be pinned or locked, but use synchronous
- * reclaim just to be sure.
+ * inodes and none should be pinned or locked, but use synchronous
+ * reclaim just to be sure. We can stop background inode reclaim
+ * here as well if it is still running.
*/
+ cancel_delayed_work_sync(&mp->m_reclaim_work);
xfs_reclaim_inodes(mp, SYNC_WAIT);
xfs_qm_unmount(mp);
/*
- * Flush out the log synchronously so that we know for sure
- * that nothing is pinned. This is important because bflush()
- * will skip pinned buffers.
- */
- xfs_log_force(mp, XFS_LOG_SYNC);
-
- /*
* Unreserve any blocks we have so that when we unmount we don't account
* the reserved free space as used. This is really only necessary for
* lazy superblock counting because it trusts the incore superblock
@@ -1489,23 +1539,6 @@ xfs_unmountfs(
xfs_warn(mp, "Unable to update superblock counters. "
"Freespace may not be correct on next mount.");
- /*
- * At this point we might have modified the superblock again and thus
- * added an item to the AIL, thus flush it again.
- */
- xfs_ail_push_all_sync(mp->m_ail);
- xfs_wait_buftarg(mp->m_ddev_targp);
-
- /*
- * The superblock buffer is uncached and xfsaild_push() will lock and
- * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait()
- * here but a lock on the superblock buffer will block until iodone()
- * has completed.
- */
- xfs_buf_lock(mp->m_sb_bp);
- xfs_buf_unlock(mp->m_sb_bp);
-
- xfs_log_unmount_write(mp);
xfs_log_unmount(mp);
xfs_uuid_unmount(mp);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index deee09e534dc..bab8314507e4 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -51,8 +51,6 @@ typedef struct xfs_trans_reservations {
#else /* __KERNEL__ */
-#include "xfs_sync.h"
-
struct xlog;
struct xfs_inode;
struct xfs_mru_cache;
@@ -197,9 +195,9 @@ typedef struct xfs_mount {
struct mutex m_icsb_mutex; /* balancer sync lock */
#endif
struct xfs_mru_cache *m_filestream; /* per-mount filestream data */
- struct delayed_work m_sync_work; /* background sync work */
struct delayed_work m_reclaim_work; /* background inode reclaim */
- struct work_struct m_flush_work; /* background inode flush */
+ struct delayed_work m_eofblocks_work; /* background eof blocks
+ trimming */
__int64_t m_update_flags; /* sb flags we need to update
on the next remount,rw */
struct shrinker m_inode_shrink; /* inode reclaim shrinker */
@@ -209,6 +207,9 @@ typedef struct xfs_mount {
struct workqueue_struct *m_data_workqueue;
struct workqueue_struct *m_unwritten_workqueue;
struct workqueue_struct *m_cil_workqueue;
+ struct workqueue_struct *m_reclaim_workqueue;
+ struct workqueue_struct *m_log_workqueue;
+ struct workqueue_struct *m_eofblocks_workqueue;
} xfs_mount_t;
/*
@@ -387,7 +388,9 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *);
extern void xfs_mod_sb(struct xfs_trans *, __int64_t);
extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t,
xfs_agnumber_t *);
-extern void xfs_sb_from_disk(struct xfs_mount *, struct xfs_dsb *);
+extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
+extern const struct xfs_buf_ops xfs_sb_buf_ops;
+
#endif /* __XFS_MOUNT_H__ */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 2e86fa0cfc0d..60eff4763156 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -40,6 +40,7 @@
#include "xfs_utils.h"
#include "xfs_qm.h"
#include "xfs_trace.h"
+#include "xfs_icache.h"
/*
* The global quota manager. There is only one of these for the entire
@@ -891,7 +892,8 @@ xfs_qm_dqiter_bufs(
while (blkcnt--) {
error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp,
XFS_FSB_TO_DADDR(mp, bno),
- mp->m_quotainfo->qi_dqchunklen, 0, &bp);
+ mp->m_quotainfo->qi_dqchunklen, 0, &bp,
+ &xfs_dquot_buf_ops);
if (error)
break;
@@ -978,7 +980,8 @@ xfs_qm_dqiterate(
while (rablkcnt--) {
xfs_buf_readahead(mp->m_ddev_targp,
XFS_FSB_TO_DADDR(mp, rablkno),
- mp->m_quotainfo->qi_dqchunklen);
+ mp->m_quotainfo->qi_dqchunklen,
+ NULL);
rablkno++;
}
}
@@ -1453,7 +1456,7 @@ xfs_qm_dqreclaim_one(
int error;
if (!xfs_dqlock_nowait(dqp))
- goto out_busy;
+ goto out_move_tail;
/*
* This dquot has acquired a reference in the meantime remove it from
@@ -1476,7 +1479,7 @@ xfs_qm_dqreclaim_one(
* getting flushed to disk, we don't want to reclaim it.
*/
if (!xfs_dqflock_nowait(dqp))
- goto out_busy;
+ goto out_unlock_move_tail;
if (XFS_DQ_IS_DIRTY(dqp)) {
struct xfs_buf *bp = NULL;
@@ -1487,7 +1490,7 @@ xfs_qm_dqreclaim_one(
if (error) {
xfs_warn(mp, "%s: dquot %p flush failed",
__func__, dqp);
- goto out_busy;
+ goto out_unlock_move_tail;
}
xfs_buf_delwri_queue(bp, buffer_list);
@@ -1496,7 +1499,7 @@ xfs_qm_dqreclaim_one(
* Give the dquot another try on the freelist, as the
* flushing will take some time.
*/
- goto out_busy;
+ goto out_unlock_move_tail;
}
xfs_dqfunlock(dqp);
@@ -1515,14 +1518,13 @@ xfs_qm_dqreclaim_one(
XFS_STATS_INC(xs_qm_dqreclaims);
return;
-out_busy:
- xfs_dqunlock(dqp);
-
/*
* Move the dquot to the tail of the list so that we don't spin on it.
*/
+out_unlock_move_tail:
+ xfs_dqunlock(dqp);
+out_move_tail:
list_move_tail(&dqp->q_lru, &qi->qi_lru_list);
-
trace_xfs_dqreclaim_busy(dqp);
XFS_STATS_INC(xs_qm_dqreclaim_misses);
}
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 858a3b186110..5f53e75409b8 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -40,6 +40,7 @@
#include "xfs_utils.h"
#include "xfs_qm.h"
#include "xfs_trace.h"
+#include "xfs_icache.h"
STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
@@ -845,7 +846,8 @@ STATIC int
xfs_dqrele_inode(
struct xfs_inode *ip,
struct xfs_perag *pag,
- int flags)
+ int flags,
+ void *args)
{
/* skip quota inodes */
if (ip == ip->i_mount->m_quotainfo->qi_uquotaip ||
@@ -881,5 +883,5 @@ xfs_qm_dqrele_all_inodes(
uint flags)
{
ASSERT(mp->m_quotainfo);
- xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags);
+ xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, NULL);
}
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index ca28a4ba4b54..98dc670d3ee0 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -38,6 +38,7 @@
#include "xfs_utils.h"
#include "xfs_trace.h"
#include "xfs_buf.h"
+#include "xfs_icache.h"
/*
@@ -869,7 +870,7 @@ xfs_rtbuf_get(
ASSERT(map.br_startblock != NULLFSBLOCK);
error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
XFS_FSB_TO_DADDR(mp, map.br_startblock),
- mp->m_bsize, 0, &bp);
+ mp->m_bsize, 0, &bp, NULL);
if (error)
return error;
ASSERT(!xfs_buf_geterror(bp));
@@ -1872,9 +1873,14 @@ xfs_growfs_rt(
*/
bp = xfs_buf_read_uncached(mp->m_rtdev_targp,
XFS_FSB_TO_BB(mp, nrblocks - 1),
- XFS_FSB_TO_BB(mp, 1), 0);
+ XFS_FSB_TO_BB(mp, 1), 0, NULL);
if (!bp)
return EIO;
+ if (bp->b_error) {
+ error = bp->b_error;
+ xfs_buf_relse(bp);
+ return error;
+ }
xfs_buf_relse(bp);
/*
@@ -2219,9 +2225,11 @@ xfs_rtmount_init(
}
bp = xfs_buf_read_uncached(mp->m_rtdev_targp,
d - XFS_FSB_TO_BB(mp, 1),
- XFS_FSB_TO_BB(mp, 1), 0);
- if (!bp) {
+ XFS_FSB_TO_BB(mp, 1), 0, NULL);
+ if (!bp || bp->b_error) {
xfs_warn(mp, "realtime device size check failed");
+ if (bp)
+ xfs_buf_relse(bp);
return EIO;
}
xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h
index f429d9d5d325..a05b45175fb0 100644
--- a/fs/xfs/xfs_sb.h
+++ b/fs/xfs/xfs_sb.h
@@ -81,6 +81,7 @@ struct xfs_mount;
#define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */
#define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */
#define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */
+#define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */
#define XFS_SB_VERSION2_OKREALFBITS \
(XFS_SB_VERSION2_LAZYSBCOUNTBIT | \
@@ -503,6 +504,12 @@ static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp)
(sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT);
}
+static inline int xfs_sb_version_hascrc(xfs_sb_t *sbp)
+{
+ return (xfs_sb_version_hasmorebits(sbp) &&
+ (sbp->sb_features2 & XFS_SB_VERSION2_CRCBIT));
+}
+
/*
* end of superblock version macros
*/
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 26a09bd7f975..ab8839b26272 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -49,7 +49,7 @@
#include "xfs_extfree_item.h"
#include "xfs_mru_cache.h"
#include "xfs_inode_item.h"
-#include "xfs_sync.h"
+#include "xfs_icache.h"
#include "xfs_trace.h"
#include <linux/namei.h>
@@ -863,8 +863,30 @@ xfs_init_mount_workqueues(
WQ_MEM_RECLAIM, 0, mp->m_fsname);
if (!mp->m_cil_workqueue)
goto out_destroy_unwritten;
+
+ mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s",
+ WQ_NON_REENTRANT, 0, mp->m_fsname);
+ if (!mp->m_reclaim_workqueue)
+ goto out_destroy_cil;
+
+ mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
+ WQ_NON_REENTRANT, 0, mp->m_fsname);
+ if (!mp->m_log_workqueue)
+ goto out_destroy_reclaim;
+
+ mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s",
+ WQ_NON_REENTRANT, 0, mp->m_fsname);
+ if (!mp->m_eofblocks_workqueue)
+ goto out_destroy_log;
+
return 0;
+out_destroy_log:
+ destroy_workqueue(mp->m_log_workqueue);
+out_destroy_reclaim:
+ destroy_workqueue(mp->m_reclaim_workqueue);
+out_destroy_cil:
+ destroy_workqueue(mp->m_cil_workqueue);
out_destroy_unwritten:
destroy_workqueue(mp->m_unwritten_workqueue);
out_destroy_data_iodone_queue:
@@ -877,11 +899,32 @@ STATIC void
xfs_destroy_mount_workqueues(
struct xfs_mount *mp)
{
+ destroy_workqueue(mp->m_eofblocks_workqueue);
+ destroy_workqueue(mp->m_log_workqueue);
+ destroy_workqueue(mp->m_reclaim_workqueue);
destroy_workqueue(mp->m_cil_workqueue);
destroy_workqueue(mp->m_data_workqueue);
destroy_workqueue(mp->m_unwritten_workqueue);
}
+/*
+ * Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK
+ * or a page lock. We use sync_inodes_sb() here to ensure we block while waiting
+ * for IO to complete so that we effectively throttle multiple callers to the
+ * rate at which IO is completing.
+ */
+void
+xfs_flush_inodes(
+ struct xfs_mount *mp)
+{
+ struct super_block *sb = mp->m_super;
+
+ if (down_read_trylock(&sb->s_umount)) {
+ sync_inodes_sb(sb);
+ up_read(&sb->s_umount);
+ }
+}
+
/* Catch misguided souls that try to use this interface on XFS */
STATIC struct inode *
xfs_fs_alloc_inode(
@@ -1006,9 +1049,8 @@ xfs_fs_put_super(
struct xfs_mount *mp = XFS_M(sb);
xfs_filestream_unmount(mp);
- cancel_delayed_work_sync(&mp->m_sync_work);
xfs_unmountfs(mp);
- xfs_syncd_stop(mp);
+
xfs_freesb(mp);
xfs_icsb_destroy_counters(mp);
xfs_destroy_mount_workqueues(mp);
@@ -1023,7 +1065,6 @@ xfs_fs_sync_fs(
int wait)
{
struct xfs_mount *mp = XFS_M(sb);
- int error;
/*
* Doing anything during the async pass would be counterproductive.
@@ -1031,17 +1072,14 @@ xfs_fs_sync_fs(
if (!wait)
return 0;
- error = xfs_quiesce_data(mp);
- if (error)
- return -error;
-
+ xfs_log_force(mp, XFS_LOG_SYNC);
if (laptop_mode) {
/*
* The disk must be active because we're syncing.
- * We schedule xfssyncd now (now that the disk is
+ * We schedule log work now (now that the disk is
* active) instead of later (when it might not be).
*/
- flush_delayed_work(&mp->m_sync_work);
+ flush_delayed_work(&mp->m_log->l_work);
}
return 0;
@@ -1118,6 +1156,48 @@ xfs_restore_resvblks(struct xfs_mount *mp)
xfs_reserve_blocks(mp, &resblks, NULL);
}
+/*
+ * Trigger writeback of all the dirty metadata in the file system.
+ *
+ * This ensures that the metadata is written to their location on disk rather
+ * than just existing in transactions in the log. This means after a quiesce
+ * there is no log replay required to write the inodes to disk - this is the
+ * primary difference between a sync and a quiesce.
+ *
+ * Note: xfs_log_quiesce() stops background log work - the callers must ensure
+ * it is started again when appropriate.
+ */
+void
+xfs_quiesce_attr(
+ struct xfs_mount *mp)
+{
+ int error = 0;
+
+ /* wait for all modifications to complete */
+ while (atomic_read(&mp->m_active_trans) > 0)
+ delay(100);
+
+ /* force the log to unpin objects from the now complete transactions */
+ xfs_log_force(mp, XFS_LOG_SYNC);
+
+ /* reclaim inodes to do any IO before the freeze completes */
+ xfs_reclaim_inodes(mp, 0);
+ xfs_reclaim_inodes(mp, SYNC_WAIT);
+
+ /* Push the superblock and write an unmount record */
+ error = xfs_log_sbcount(mp);
+ if (error)
+ xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. "
+ "Frozen image may not be consistent.");
+ /*
+ * Just warn here till VFS can correctly support
+ * read-only remount without racing.
+ */
+ WARN_ON(atomic_read(&mp->m_active_trans) != 0);
+
+ xfs_log_quiesce(mp);
+}
+
STATIC int
xfs_fs_remount(
struct super_block *sb,
@@ -1198,20 +1278,18 @@ xfs_fs_remount(
* value if it is non-zero, otherwise go with the default.
*/
xfs_restore_resvblks(mp);
+ xfs_log_work_queue(mp);
}
/* rw -> ro */
if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
/*
- * After we have synced the data but before we sync the
- * metadata, we need to free up the reserve block pool so that
- * the used block count in the superblock on disk is correct at
- * the end of the remount. Stash the current reserve pool size
- * so that if we get remounted rw, we can return it to the same
- * size.
+ * Before we sync the metadata, we need to free up the reserve
+ * block pool so that the used block count in the superblock on
+ * disk is correct at the end of the remount. Stash the current
+ * reserve pool size so that if we get remounted rw, we can
+ * return it to the same size.
*/
-
- xfs_quiesce_data(mp);
xfs_save_resvblks(mp);
xfs_quiesce_attr(mp);
mp->m_flags |= XFS_MOUNT_RDONLY;
@@ -1243,6 +1321,7 @@ xfs_fs_unfreeze(
struct xfs_mount *mp = XFS_M(sb);
xfs_restore_resvblks(mp);
+ xfs_log_work_queue(mp);
return 0;
}
@@ -1321,6 +1400,8 @@ xfs_fs_fill_super(
spin_lock_init(&mp->m_sb_lock);
mutex_init(&mp->m_growlock);
atomic_set(&mp->m_active_trans, 0);
+ INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
+ INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
mp->m_super = sb;
sb->s_fs_info = mp;
@@ -1371,10 +1452,6 @@ xfs_fs_fill_super(
/*
* we must configure the block size in the superblock before we run the
* full mount process as the mount process can lookup and cache inodes.
- * For the same reason we must also initialise the syncd and register
- * the inode cache shrinker so that inodes can be reclaimed during
- * operations like a quotacheck that iterate all inodes in the
- * filesystem.
*/
sb->s_magic = XFS_SB_MAGIC;
sb->s_blocksize = mp->m_sb.sb_blocksize;
@@ -1384,13 +1461,9 @@ xfs_fs_fill_super(
sb->s_time_gran = 1;
set_posix_acl_flag(sb);
- error = xfs_syncd_init(mp);
- if (error)
- goto out_filestream_unmount;
-
error = xfs_mountfs(mp);
if (error)
- goto out_syncd_stop;
+ goto out_filestream_unmount;
root = igrab(VFS_I(mp->m_rootip));
if (!root) {
@@ -1408,8 +1481,7 @@ xfs_fs_fill_super(
}
return 0;
- out_syncd_stop:
- xfs_syncd_stop(mp);
+
out_filestream_unmount:
xfs_filestream_unmount(mp);
out_free_sb:
@@ -1429,7 +1501,6 @@ out_destroy_workqueues:
out_unmount:
xfs_filestream_unmount(mp);
xfs_unmountfs(mp);
- xfs_syncd_stop(mp);
goto out_free_sb;
}
@@ -1625,16 +1696,6 @@ STATIC int __init
xfs_init_workqueues(void)
{
/*
- * We never want to the same work item to run twice, reclaiming inodes
- * or idling the log is not going to get any faster by multiple CPUs
- * competing for ressources. Use the default large max_active value
- * so that even lots of filesystems can perform these task in parallel.
- */
- xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_NON_REENTRANT, 0);
- if (!xfs_syncd_wq)
- return -ENOMEM;
-
- /*
* The allocation workqueue can be used in memory reclaim situations
* (writepage path), and parallelism is only limited by the number of
* AGs in all the filesystems mounted. Hence use the default large
@@ -1642,20 +1703,15 @@ xfs_init_workqueues(void)
*/
xfs_alloc_wq = alloc_workqueue("xfsalloc", WQ_MEM_RECLAIM, 0);
if (!xfs_alloc_wq)
- goto out_destroy_syncd;
+ return -ENOMEM;
return 0;
-
-out_destroy_syncd:
- destroy_workqueue(xfs_syncd_wq);
- return -ENOMEM;
}
STATIC void
xfs_destroy_workqueues(void)
{
destroy_workqueue(xfs_alloc_wq);
- destroy_workqueue(xfs_syncd_wq);
}
STATIC int __init
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 9de4a920ba05..bbe3d15a7904 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -74,6 +74,7 @@ struct block_device;
extern __uint64_t xfs_max_file_offset(unsigned int);
+extern void xfs_flush_inodes(struct xfs_mount *mp);
extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *);
extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *);
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index ee2d2adaa438..2801b5ce6cdb 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -202,6 +202,15 @@ static ctl_table xfs_table[] = {
.extra1 = &xfs_params.fstrm_timer.min,
.extra2 = &xfs_params.fstrm_timer.max,
},
+ {
+ .procname = "speculative_prealloc_lifetime",
+ .data = &xfs_params.eofb_timer.val,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &xfs_params.eofb_timer.min,
+ .extra2 = &xfs_params.eofb_timer.max,
+ },
/* please keep this the last entry */
#ifdef CONFIG_PROC_FS
{
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index b9937d450f8e..bd8e157c20ef 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -47,6 +47,7 @@ typedef struct xfs_param {
xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */
xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */
xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */
+ xfs_sysctl_val_t eofb_timer; /* Interval between eofb scan wakeups */
} xfs_param_t;
/*
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 7d36ccf57f93..2e137d4a85ae 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -96,6 +96,8 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_list_full);
DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add);
DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk);
DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list);
+DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list);
DECLARE_EVENT_CLASS(xfs_perag_class,
TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,
@@ -130,6 +132,8 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag);
DEFINE_PERAG_REF_EVENT(xfs_perag_put);
DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
+DEFINE_PERAG_REF_EVENT(xfs_perag_set_eofblocks);
+DEFINE_PERAG_REF_EVENT(xfs_perag_clear_eofblocks);
TRACE_EVENT(xfs_attr_list_node_descend,
TP_PROTO(struct xfs_attr_list_context *ctx,
@@ -585,6 +589,10 @@ DEFINE_INODE_EVENT(xfs_update_time);
DEFINE_INODE_EVENT(xfs_dquot_dqalloc);
DEFINE_INODE_EVENT(xfs_dquot_dqdetach);
+DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag);
+DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag);
+DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
+
DECLARE_EVENT_CLASS(xfs_iref_class,
TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip),
TP_ARGS(ip, caller_ip),
@@ -1496,8 +1504,42 @@ DEFINE_DIR2_EVENT(xfs_dir2_node_replace);
DEFINE_DIR2_EVENT(xfs_dir2_node_removename);
DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf);
+DECLARE_EVENT_CLASS(xfs_attr_class,
+ TP_PROTO(struct xfs_da_args *args),
+ TP_ARGS(args),
+ TP_STRUCT__entry(
+ __field(dev_t, dev)
+ __field(xfs_ino_t, ino)
+ __dynamic_array(char, name, args->namelen)
+ __field(int, namelen)
+ __field(int, valuelen)
+ __field(xfs_dahash_t, hashval)
+ __field(int, op_flags)
+ ),
+ TP_fast_assign(
+ __entry->dev = VFS_I(args->dp)->i_sb->s_dev;
+ __entry->ino = args->dp->i_ino;
+ if (args->namelen)
+ memcpy(__get_str(name), args->name, args->namelen);
+ __entry->namelen = args->namelen;
+ __entry->valuelen = args->valuelen;
+ __entry->hashval = args->hashval;
+ __entry->op_flags = args->op_flags;
+ ),
+ TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d valuelen %d "
+ "hashval 0x%x op_flags %s",
+ MAJOR(__entry->dev), MINOR(__entry->dev),
+ __entry->ino,
+ __entry->namelen,
+ __entry->namelen ? __get_str(name) : NULL,
+ __entry->namelen,
+ __entry->valuelen,
+ __entry->hashval,
+ __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS))
+)
+
#define DEFINE_ATTR_EVENT(name) \
-DEFINE_EVENT(xfs_da_class, name, \
+DEFINE_EVENT(xfs_attr_class, name, \
TP_PROTO(struct xfs_da_args *args), \
TP_ARGS(args))
DEFINE_ATTR_EVENT(xfs_attr_sf_add);
@@ -1511,10 +1553,14 @@ DEFINE_ATTR_EVENT(xfs_attr_sf_to_leaf);
DEFINE_ATTR_EVENT(xfs_attr_leaf_add);
DEFINE_ATTR_EVENT(xfs_attr_leaf_add_old);
DEFINE_ATTR_EVENT(xfs_attr_leaf_add_new);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_add_work);
DEFINE_ATTR_EVENT(xfs_attr_leaf_addname);
DEFINE_ATTR_EVENT(xfs_attr_leaf_create);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_compact);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_get);
DEFINE_ATTR_EVENT(xfs_attr_leaf_lookup);
DEFINE_ATTR_EVENT(xfs_attr_leaf_replace);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_remove);
DEFINE_ATTR_EVENT(xfs_attr_leaf_removename);
DEFINE_ATTR_EVENT(xfs_attr_leaf_split);
DEFINE_ATTR_EVENT(xfs_attr_leaf_split_before);
@@ -1526,12 +1572,21 @@ DEFINE_ATTR_EVENT(xfs_attr_leaf_to_sf);
DEFINE_ATTR_EVENT(xfs_attr_leaf_to_node);
DEFINE_ATTR_EVENT(xfs_attr_leaf_rebalance);
DEFINE_ATTR_EVENT(xfs_attr_leaf_unbalance);
+DEFINE_ATTR_EVENT(xfs_attr_leaf_toosmall);
DEFINE_ATTR_EVENT(xfs_attr_node_addname);
+DEFINE_ATTR_EVENT(xfs_attr_node_get);
DEFINE_ATTR_EVENT(xfs_attr_node_lookup);
DEFINE_ATTR_EVENT(xfs_attr_node_replace);
DEFINE_ATTR_EVENT(xfs_attr_node_removename);
+DEFINE_ATTR_EVENT(xfs_attr_fillstate);
+DEFINE_ATTR_EVENT(xfs_attr_refillstate);
+
+DEFINE_ATTR_EVENT(xfs_attr_rmtval_get);
+DEFINE_ATTR_EVENT(xfs_attr_rmtval_set);
+DEFINE_ATTR_EVENT(xfs_attr_rmtval_remove);
+
#define DEFINE_DA_EVENT(name) \
DEFINE_EVENT(xfs_da_class, name, \
TP_PROTO(struct xfs_da_args *args), \
@@ -1550,9 +1605,12 @@ DEFINE_DA_EVENT(xfs_da_node_split);
DEFINE_DA_EVENT(xfs_da_node_remove);
DEFINE_DA_EVENT(xfs_da_node_rebalance);
DEFINE_DA_EVENT(xfs_da_node_unbalance);
+DEFINE_DA_EVENT(xfs_da_node_toosmall);
DEFINE_DA_EVENT(xfs_da_swap_lastblock);
DEFINE_DA_EVENT(xfs_da_grow_inode);
DEFINE_DA_EVENT(xfs_da_shrink_inode);
+DEFINE_DA_EVENT(xfs_da_fixhashpath);
+DEFINE_DA_EVENT(xfs_da_path_shift);
DECLARE_EVENT_CLASS(xfs_dir2_space_class,
TP_PROTO(struct xfs_da_args *args, int idx),
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index db056544cbb5..c6c0601abd7a 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -464,10 +464,7 @@ xfs_trans_get_buf(
int numblks,
uint flags)
{
- struct xfs_buf_map map = {
- .bm_bn = blkno,
- .bm_len = numblks,
- };
+ DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
return xfs_trans_get_buf_map(tp, target, &map, 1, flags);
}
@@ -476,7 +473,8 @@ int xfs_trans_read_buf_map(struct xfs_mount *mp,
struct xfs_buftarg *target,
struct xfs_buf_map *map, int nmaps,
xfs_buf_flags_t flags,
- struct xfs_buf **bpp);
+ struct xfs_buf **bpp,
+ const struct xfs_buf_ops *ops);
static inline int
xfs_trans_read_buf(
@@ -486,13 +484,12 @@ xfs_trans_read_buf(
xfs_daddr_t blkno,
int numblks,
xfs_buf_flags_t flags,
- struct xfs_buf **bpp)
+ struct xfs_buf **bpp,
+ const struct xfs_buf_ops *ops)
{
- struct xfs_buf_map map = {
- .bm_bn = blkno,
- .bm_len = numblks,
- };
- return xfs_trans_read_buf_map(mp, tp, target, &map, 1, flags, bpp);
+ DEFINE_SINGLE_BUF_MAP(map, blkno, numblks);
+ return xfs_trans_read_buf_map(mp, tp, target, &map, 1,
+ flags, bpp, ops);
}
struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int);
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 6311b99c267f..4fc17d479d42 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -257,7 +257,8 @@ xfs_trans_read_buf_map(
struct xfs_buf_map *map,
int nmaps,
xfs_buf_flags_t flags,
- struct xfs_buf **bpp)
+ struct xfs_buf **bpp,
+ const struct xfs_buf_ops *ops)
{
xfs_buf_t *bp;
xfs_buf_log_item_t *bip;
@@ -265,7 +266,7 @@ xfs_trans_read_buf_map(
*bpp = NULL;
if (!tp) {
- bp = xfs_buf_read_map(target, map, nmaps, flags);
+ bp = xfs_buf_read_map(target, map, nmaps, flags, ops);
if (!bp)
return (flags & XBF_TRYLOCK) ?
EAGAIN : XFS_ERROR(ENOMEM);
@@ -312,7 +313,9 @@ xfs_trans_read_buf_map(
if (!(XFS_BUF_ISDONE(bp))) {
trace_xfs_trans_read_buf_io(bp, _RET_IP_);
ASSERT(!XFS_BUF_ISASYNC(bp));
+ ASSERT(bp->b_iodone == NULL);
XFS_BUF_READ(bp);
+ bp->b_ops = ops;
xfsbdstrat(tp->t_mountp, bp);
error = xfs_buf_iowait(bp);
if (error) {
@@ -349,7 +352,7 @@ xfs_trans_read_buf_map(
return 0;
}
- bp = xfs_buf_read_map(target, map, nmaps, flags);
+ bp = xfs_buf_read_map(target, map, nmaps, flags, ops);
if (bp == NULL) {
*bpp = NULL;
return (flags & XBF_TRYLOCK) ?
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 2a5c637344b4..d95f565a390e 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -47,6 +47,7 @@
#include "xfs_filestream.h"
#include "xfs_vnodeops.h"
#include "xfs_trace.h"
+#include "xfs_icache.h"
/*
* The maximum pathlen is 1024 bytes. Since the minimum file system
@@ -79,7 +80,7 @@ xfs_readlink_bmap(
d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock);
byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount);
- bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0);
+ bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, NULL);
if (!bp)
return XFS_ERROR(ENOMEM);
error = bp->b_error;
@@ -150,7 +151,7 @@ xfs_readlink(
* when the link count isn't zero and by xfs_dm_punch_hole() when
* punching a hole to EOF.
*/
-STATIC int
+int
xfs_free_eofblocks(
xfs_mount_t *mp,
xfs_inode_t *ip,
@@ -199,7 +200,7 @@ xfs_free_eofblocks(
if (need_iolock) {
if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) {
xfs_trans_cancel(tp, 0);
- return 0;
+ return EAGAIN;
}
}
@@ -237,6 +238,8 @@ xfs_free_eofblocks(
} else {
error = xfs_trans_commit(tp,
XFS_TRANS_RELEASE_LOG_RES);
+ if (!error)
+ xfs_inode_clear_eofblocks_tag(ip);
}
xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -425,19 +428,18 @@ xfs_release(
truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED);
if (truncated) {
xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE);
- if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0)
- xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE);
+ if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) {
+ error = -filemap_flush(VFS_I(ip)->i_mapping);
+ if (error)
+ return error;
+ }
}
}
if (ip->i_d.di_nlink == 0)
return 0;
- if ((S_ISREG(ip->i_d.di_mode) &&
- (VFS_I(ip)->i_size > 0 ||
- (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) &&
- (ip->i_df.if_flags & XFS_IFEXTENTS)) &&
- (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) {
+ if (xfs_can_free_eofblocks(ip, false)) {
/*
* If we can't get the iolock just skip truncating the blocks
@@ -464,7 +466,7 @@ xfs_release(
return 0;
error = xfs_free_eofblocks(mp, ip, true);
- if (error)
+ if (error && error != EAGAIN)
return error;
/* delalloc blocks after truncation means it really is dirty */
@@ -513,13 +515,12 @@ xfs_inactive(
goto out;
if (ip->i_d.di_nlink != 0) {
- if ((S_ISREG(ip->i_d.di_mode) &&
- (VFS_I(ip)->i_size > 0 ||
- (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) &&
- (ip->i_df.if_flags & XFS_IFEXTENTS) &&
- (!(ip->i_d.di_flags &
- (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) ||
- ip->i_delayed_blks != 0))) {
+ /*
+ * force is true because we are evicting an inode from the
+ * cache. Post-eof blocks must be freed, lest we end up with
+ * broken free space accounting.
+ */
+ if (xfs_can_free_eofblocks(ip, true)) {
error = xfs_free_eofblocks(mp, ip, false);
if (error)
return VN_INACTIVE_CACHE;
@@ -777,7 +778,7 @@ xfs_create(
XFS_TRANS_PERM_LOG_RES, log_count);
if (error == ENOSPC) {
/* flush outstanding delalloc blocks and retry */
- xfs_flush_inodes(dp);
+ xfs_flush_inodes(mp);
error = xfs_trans_reserve(tp, resblks, log_res, 0,
XFS_TRANS_PERM_LOG_RES, log_count);
}
@@ -1957,12 +1958,11 @@ xfs_free_file_space(
rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
ioffset = offset & ~(rounding - 1);
-
- if (VN_CACHED(VFS_I(ip)) != 0) {
- error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED);
- if (error)
- goto out_unlock_iolock;
- }
+ error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping,
+ ioffset, -1);
+ if (error)
+ goto out_unlock_iolock;
+ truncate_pagecache_range(VFS_I(ip), ioffset, -1);
/*
* Need to zero the stuff we're not freeing, on disk.
@@ -2095,6 +2095,73 @@ xfs_free_file_space(
return error;
}
+
+STATIC int
+xfs_zero_file_space(
+ struct xfs_inode *ip,
+ xfs_off_t offset,
+ xfs_off_t len,
+ int attr_flags)
+{
+ struct xfs_mount *mp = ip->i_mount;
+ uint granularity;
+ xfs_off_t start_boundary;
+ xfs_off_t end_boundary;
+ int error;
+
+ granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE);
+
+ /*
+ * Split [offset, offset + len) at granularity boundaries. The start is
+ * rounded up and the end rounded down, so only the aligned interior is
+ * converted; an already aligned offset or end is left unchanged.
+ */
+ start_boundary = round_up(offset, granularity);
+ end_boundary = round_down(offset + len, granularity);
+
+ ASSERT(start_boundary >= offset);
+ ASSERT(end_boundary <= offset + len);
+
+ if (!(attr_flags & XFS_ATTR_NOLOCK))
+ xfs_ilock(ip, XFS_IOLOCK_EXCL);
+
+ if (start_boundary < end_boundary - 1) {
+ /* drop cached pages over the aligned interior before converting */
+ truncate_pagecache_range(VFS_I(ip), start_boundary,
+ end_boundary - 1);
+ /* convert the interior; NOTE(review): len is 1 byte short - confirm */
+ error = xfs_alloc_file_space(ip, start_boundary,
+ end_boundary - start_boundary - 1,
+ XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT,
+ attr_flags);
+ if (error)
+ goto out_unlock;
+
+ /* interior done; xfs_iozero() the unaligned head, then the tail */
+ if (start_boundary != offset)
+ error = xfs_iozero(ip, offset, start_boundary - offset);
+ if (error)
+ goto out_unlock;
+
+ if (end_boundary != offset + len)
+ error = xfs_iozero(ip, end_boundary,
+ offset + len - end_boundary);
+
+ } else {
+ /*
+ * No aligned interior to convert: the range is smaller than the
+ * granularity or straddles one boundary, so xfs_iozero() all of it.
+ */
+ error = xfs_iozero(ip, offset, len);
+ }
+
+out_unlock:
+ if (!(attr_flags & XFS_ATTR_NOLOCK))
+ xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+ return error;
+
+}
+
/*
* xfs_change_file_space()
* This routine allocates or frees disk space for the given file.
@@ -2120,10 +2187,8 @@ xfs_change_file_space(
xfs_fsize_t fsize;
int setprealloc;
xfs_off_t startoffset;
- xfs_off_t llen;
xfs_trans_t *tp;
struct iattr iattr;
- int prealloc_type;
if (!S_ISREG(ip->i_d.di_mode))
return XFS_ERROR(EINVAL);
@@ -2141,12 +2206,30 @@ xfs_change_file_space(
return XFS_ERROR(EINVAL);
}
- llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len;
+ /*
+ * length of <= 0 for resv/unresv/zero is invalid. length for
+ * alloc/free is ignored completely and we have no idea what userspace
+ * might have set it to, so set it to zero to allow range
+ * checks to pass.
+ */
+ switch (cmd) {
+ case XFS_IOC_ZERO_RANGE:
+ case XFS_IOC_RESVSP:
+ case XFS_IOC_RESVSP64:
+ case XFS_IOC_UNRESVSP:
+ case XFS_IOC_UNRESVSP64:
+ if (bf->l_len <= 0)
+ return XFS_ERROR(EINVAL);
+ break;
+ default:
+ bf->l_len = 0;
+ break;
+ }
if (bf->l_start < 0 ||
bf->l_start > mp->m_super->s_maxbytes ||
- bf->l_start + llen < 0 ||
- bf->l_start + llen > mp->m_super->s_maxbytes)
+ bf->l_start + bf->l_len < 0 ||
+ bf->l_start + bf->l_len >= mp->m_super->s_maxbytes)
return XFS_ERROR(EINVAL);
bf->l_whence = 0;
@@ -2154,29 +2237,20 @@ xfs_change_file_space(
startoffset = bf->l_start;
fsize = XFS_ISIZE(ip);
- /*
- * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve
- * file space.
- * These calls do NOT zero the data space allocated to the file,
- * nor do they change the file size.
- *
- * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file
- * space.
- * These calls cause the new file data to be zeroed and the file
- * size to be changed.
- */
setprealloc = clrprealloc = 0;
- prealloc_type = XFS_BMAPI_PREALLOC;
-
switch (cmd) {
case XFS_IOC_ZERO_RANGE:
- prealloc_type |= XFS_BMAPI_CONVERT;
- xfs_tosspages(ip, startoffset, startoffset + bf->l_len, 0);
- /* FALLTHRU */
+ error = xfs_zero_file_space(ip, startoffset, bf->l_len,
+ attr_flags);
+ if (error)
+ return error;
+ setprealloc = 1;
+ break;
+
case XFS_IOC_RESVSP:
case XFS_IOC_RESVSP64:
error = xfs_alloc_file_space(ip, startoffset, bf->l_len,
- prealloc_type, attr_flags);
+ XFS_BMAPI_PREALLOC, attr_flags);
if (error)
return error;
setprealloc = 1;
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h
index 447e146b2ba6..5163022d9808 100644
--- a/fs/xfs/xfs_vnodeops.h
+++ b/fs/xfs/xfs_vnodeops.h
@@ -48,14 +48,9 @@ int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name,
int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags);
int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize,
int flags, struct attrlist_cursor_kern *cursor);
-void xfs_tosspages(struct xfs_inode *inode, xfs_off_t first,
- xfs_off_t last, int fiopt);
-int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first,
- xfs_off_t last, int fiopt);
-int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first,
- xfs_off_t last, uint64_t flags, int fiopt);
-int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last);
+int xfs_iozero(struct xfs_inode *, loff_t, size_t);
int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
+int xfs_free_eofblocks(struct xfs_mount *, struct xfs_inode *, bool);
#endif /* _XFS_VNODEOPS_H */