diff options
Diffstat (limited to 'fs')
235 files changed, 7668 insertions, 5029 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 332b5ff02fec..f7003cfac63d 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -76,7 +76,7 @@ static const match_table_t tokens = { * Return 0 upon success, -ERRNO upon failure. */ -static int v9fs_parse_options(struct v9fs_session_info *v9ses) +static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) { char *options; substring_t args[MAX_OPT_ARGS]; @@ -90,10 +90,10 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses) v9ses->debug = 0; v9ses->cache = 0; - if (!v9ses->options) + if (!opts) return 0; - options = kstrdup(v9ses->options, GFP_KERNEL); + options = kstrdup(opts, GFP_KERNEL); if (!options) { P9_DPRINTK(P9_DEBUG_ERROR, "failed to allocate copy of option string\n"); @@ -206,24 +206,14 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, v9ses->uid = ~0; v9ses->dfltuid = V9FS_DEFUID; v9ses->dfltgid = V9FS_DEFGID; - if (data) { - v9ses->options = kstrdup(data, GFP_KERNEL); - if (!v9ses->options) { - P9_DPRINTK(P9_DEBUG_ERROR, - "failed to allocate copy of option string\n"); - retval = -ENOMEM; - goto error; - } - } - rc = v9fs_parse_options(v9ses); + rc = v9fs_parse_options(v9ses, data); if (rc < 0) { retval = rc; goto error; } - v9ses->clnt = p9_client_create(dev_name, v9ses->options); - + v9ses->clnt = p9_client_create(dev_name, data); if (IS_ERR(v9ses->clnt)) { retval = PTR_ERR(v9ses->clnt); v9ses->clnt = NULL; @@ -280,7 +270,6 @@ void v9fs_session_close(struct v9fs_session_info *v9ses) __putname(v9ses->uname); __putname(v9ses->aname); - kfree(v9ses->options); } /** diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index a7d567192998..38762bf102a9 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -85,7 +85,6 @@ struct v9fs_session_info { unsigned int afid; unsigned int cache; - char *options; /* copy of mount options */ char *uname; /* user name to mount as */ char *aname; /* name of remote hierarchy being mounted */ unsigned int maxdata; /* max data for client interface */ diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 81f8bbf12f9f..06a223d50a81 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -171,7 +171,6 @@ int v9fs_uflags2omode(int uflags, int extended) /** * v9fs_blank_wstat - helper function to setup a 9P stat structure - * @v9ses: 9P session info (for determining extended mode) * @wstat: structure to initialize * */ @@ -207,65 +206,72 @@ v9fs_blank_wstat(struct p9_wstat *wstat) struct inode *v9fs_get_inode(struct super_block *sb, int mode) { + int err; struct inode *inode; struct v9fs_session_info *v9ses = sb->s_fs_info; P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode); inode = new_inode(sb); - if (inode) { - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); - inode->i_blocks = 0; - inode->i_rdev = 0; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - inode->i_mapping->a_ops = &v9fs_addr_operations; - - switch (mode & S_IFMT) { - case S_IFIFO: - case S_IFBLK: - case S_IFCHR: - case S_IFSOCK: - if (!v9fs_extended(v9ses)) { - P9_DPRINTK(P9_DEBUG_ERROR, - "special files without extended mode\n"); - return ERR_PTR(-EINVAL); - } - init_special_inode(inode, inode->i_mode, - inode->i_rdev); - break; - case S_IFREG: - inode->i_op = &v9fs_file_inode_operations; - inode->i_fop = &v9fs_file_operations; - break; - case S_IFLNK: - if (!v9fs_extended(v9ses)) { - P9_DPRINTK(P9_DEBUG_ERROR, - "extended modes used w/o 9P2000.u\n"); - return ERR_PTR(-EINVAL); - } - inode->i_op = &v9fs_symlink_inode_operations; - break; - case S_IFDIR: - inc_nlink(inode); - if (v9fs_extended(v9ses)) - inode->i_op = &v9fs_dir_inode_operations_ext; - else - inode->i_op = &v9fs_dir_inode_operations; - inode->i_fop = &v9fs_dir_operations; - break; - default: - P9_DPRINTK(P9_DEBUG_ERROR, - "BAD mode 0x%x S_IFMT 0x%x\n", - mode, mode & S_IFMT); - return ERR_PTR(-EINVAL); - } - } else { + if (!inode) { P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n"); return ERR_PTR(-ENOMEM); } + + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); + inode->i_blocks = 0; + inode->i_rdev = 0; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mapping->a_ops = &v9fs_addr_operations; + + switch (mode & S_IFMT) { + case S_IFIFO: + case S_IFBLK: + case S_IFCHR: + case S_IFSOCK: + if (!v9fs_extended(v9ses)) { + P9_DPRINTK(P9_DEBUG_ERROR, + "special files without extended mode\n"); + err = -EINVAL; + goto error; + } + init_special_inode(inode, inode->i_mode, inode->i_rdev); + break; + case S_IFREG: + inode->i_op = &v9fs_file_inode_operations; + inode->i_fop = &v9fs_file_operations; + break; + case S_IFLNK: + if (!v9fs_extended(v9ses)) { + P9_DPRINTK(P9_DEBUG_ERROR, + "extended modes used w/o 9P2000.u\n"); + err = -EINVAL; + goto error; + } + inode->i_op = &v9fs_symlink_inode_operations; + break; + case S_IFDIR: + inc_nlink(inode); + if (v9fs_extended(v9ses)) + inode->i_op = &v9fs_dir_inode_operations_ext; + else + inode->i_op = &v9fs_dir_inode_operations; + inode->i_fop = &v9fs_dir_operations; + break; + default: + P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n", + mode, mode & S_IFMT); + err = -EINVAL; + goto error; + } + return inode; + +error: + iput(inode); + return ERR_PTR(err); } /* @@ -338,30 +344,25 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, ret = NULL; st = p9_client_stat(fid); - if (IS_ERR(st)) { - err = PTR_ERR(st); - st = NULL; - goto error; - } + if (IS_ERR(st)) + return ERR_CAST(st); umode = p9mode2unixmode(v9ses, st->mode); ret = v9fs_get_inode(sb, umode); if (IS_ERR(ret)) { err = PTR_ERR(ret); - ret = NULL; goto error; } v9fs_stat2inode(st, ret, sb); ret->i_ino = v9fs_qid2ino(&st->qid); + p9stat_free(st); kfree(st); return ret; error: + p9stat_free(st); kfree(st); - if (ret) - iput(ret); - return ERR_PTR(err); } @@ -403,9 +404,9 @@ v9fs_open_created(struct inode *inode, struct file *file) * @v9ses: session information * @dir: directory that dentry is being created in * @dentry: dentry that is being created + * @extension: 9p2000.u extension string to support devices, etc. * @perm: create permissions * @mode: open mode - * @extension: 9p2000.u extension string to support devices, etc. * */ static struct p9_fid * @@ -470,7 +471,10 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, dentry->d_op = &v9fs_dentry_operations; d_instantiate(dentry, inode); - v9fs_fid_add(dentry, fid); + err = v9fs_fid_add(dentry, fid); + if (err < 0) + goto error; + return ofid; error: diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 38d695d66a0b..8961f1a8f668 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -81,7 +81,7 @@ static int v9fs_set_super(struct super_block *s, void *data) static void v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, - int flags) + int flags, void *data) { sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize_bits = fls(v9ses->maxdata - 1); @@ -91,6 +91,8 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC | MS_NOATIME; + + save_mount_options(sb, data); } /** @@ -113,14 +115,11 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, struct v9fs_session_info *v9ses = NULL; struct p9_wstat *st = NULL; int mode = S_IRWXUGO | S_ISVTX; - uid_t uid = current_fsuid(); - gid_t gid = current_fsgid(); struct p9_fid *fid; int retval = 0; P9_DPRINTK(P9_DEBUG_VFS, " \n"); - st = NULL; v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL); if (!v9ses) return -ENOMEM; @@ -142,7 +141,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, retval = PTR_ERR(sb); goto free_stat; } - v9fs_fill_super(sb, v9ses, flags); + v9fs_fill_super(sb, v9ses, flags, data); inode = v9fs_get_inode(sb, S_IFDIR | mode); if (IS_ERR(inode)) { @@ -150,9 +149,6 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, goto release_sb; } - inode->i_uid = uid; - inode->i_gid = gid; - root = d_alloc_root(inode); if (!root) { iput(inode); @@ -173,10 +169,8 @@ P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n"); simple_set_mnt(mnt, sb); return 0; -release_sb: - deactivate_locked_super(sb); - free_stat: + p9stat_free(st); kfree(st); clunk_fid: @@ -185,7 +179,12 @@ clunk_fid: close_session: v9fs_session_close(v9ses); kfree(v9ses); + return retval; +release_sb: + p9stat_free(st); + kfree(st); + deactivate_locked_super(sb); return retval; } @@ -207,24 +206,10 @@ static void v9fs_kill_super(struct super_block *s) v9fs_session_close(v9ses); kfree(v9ses); + s->s_fs_info = NULL; P9_DPRINTK(P9_DEBUG_VFS, "exiting kill_super\n"); } -/** - * v9fs_show_options - Show mount options in /proc/mounts - * @m: seq_file to write to - * @mnt: mount descriptor - * - */ - -static int v9fs_show_options(struct seq_file *m, struct vfsmount *mnt) -{ - struct v9fs_session_info *v9ses = mnt->mnt_sb->s_fs_info; - - seq_printf(m, "%s", v9ses->options); - return 0; -} - static void v9fs_umount_begin(struct super_block *sb) { @@ -237,7 +222,7 @@ v9fs_umount_begin(struct super_block *sb) static const struct super_operations v9fs_super_ops = { .statfs = simple_statfs, .clear_inode = v9fs_clear_inode, - .show_options = v9fs_show_options, + .show_options = generic_show_options, .umount_begin = v9fs_umount_begin, }; diff --git a/fs/Kconfig b/fs/Kconfig index 0e7da7bb5d93..455aa207e67e 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -43,6 +43,7 @@ source "fs/xfs/Kconfig" source "fs/gfs2/Kconfig" source "fs/ocfs2/Kconfig" source "fs/btrfs/Kconfig" +source "fs/nilfs2/Kconfig" endif # BLOCK @@ -186,7 +187,6 @@ source "fs/romfs/Kconfig" source "fs/sysv/Kconfig" source "fs/ufs/Kconfig" source "fs/exofs/Kconfig" -source "fs/nilfs2/Kconfig" endif # MISC_FILESYSTEMS diff --git a/fs/afs/file.c b/fs/afs/file.c index 0149dab365e7..681c2a7b013f 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -134,9 +134,16 @@ static int afs_readpage(struct file *file, struct page *page) inode = page->mapping->host; - ASSERT(file != NULL); - key = file->private_data; - ASSERT(key != NULL); + if (file) { + key = file->private_data; + ASSERT(key != NULL); + } else { + key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + goto error_nokey; + } + } _enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index); @@ -207,12 +214,17 @@ static int afs_readpage(struct file *file, struct page *page) unlock_page(page); } + if (!file) + key_put(key); _leave(" = 0"); return 0; error: SetPageError(page); unlock_page(page); + if (!file) + key_put(key); +error_nokey: _leave(" = %d", ret); return ret; } diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index aa39ae83f019..3da18d453488 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -77,7 +77,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) } /* Update the expiry counter if fs is busy */ - if (!may_umount_tree(mnt)) { + if (!may_umount_tree(path.mnt)) { struct autofs_info *ino = autofs4_dentry_ino(top); ino->last_used = jiffies; goto done; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index b7c1603cd4bd..7c1e65d54872 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -501,22 +501,22 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex, } } - /* - * Now fill out the bss section. First pad the last page up - * to the page boundary, and then perform a mmap to make sure - * that there are zero-mapped pages up to and including the - * last bss page. - */ - if (padzero(elf_bss)) { - error = -EFAULT; - goto out_close; - } + if (last_bss > elf_bss) { + /* + * Now fill out the bss section. First pad the last page up + * to the page boundary, and then perform a mmap to make sure + * that there are zero-mapped pages up to and including the + * last bss page. + */ + if (padzero(elf_bss)) { + error = -EFAULT; + goto out_close; + } - /* What we have mapped so far */ - elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1); + /* What we have mapped so far */ + elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1); - /* Map the last of the bss segment */ - if (last_bss > elf_bss) { + /* Map the last of the bss segment */ down_write(¤t->mm->mmap_sem); error = do_brk(elf_bss, last_bss - elf_bss); up_write(¤t->mm->mmap_sem); diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 697f6b5f1313..e92f229e3c6e 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -828,15 +828,22 @@ static int load_flat_shared_library(int id, struct lib_info *libs) if (IS_ERR(bprm.file)) return res; + bprm.cred = prepare_exec_creds(); + res = -ENOMEM; + if (!bprm.cred) + goto out; + res = prepare_binprm(&bprm); if (res <= (unsigned long)-4096) res = load_flat_file(&bprm, libs, id, NULL); - if (bprm.file) { - allow_write_access(bprm.file); - fput(bprm.file); - bprm.file = NULL; - } + + abort_creds(bprm.cred); + +out: + allow_write_access(bprm.file); + fput(bprm.file); + return(res); } diff --git a/fs/block_dev.c b/fs/block_dev.c index 3a6d4fb2a329..3581a4e53942 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -564,6 +564,16 @@ struct block_device *bdget(dev_t dev) EXPORT_SYMBOL(bdget); +/** + * bdgrab -- Grab a reference to an already referenced block device + * @bdev: Block device to grab a reference to. + */ +struct block_device *bdgrab(struct block_device *bdev) +{ + atomic_inc(&bdev->bd_inode->i_count); + return bdev; +} + long nr_blockdev_pages(void) { struct block_device *bdev; @@ -1395,6 +1405,33 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) } /* + * Write data to the block device. Only intended for the block device itself + * and the raw driver which basically is a fake block device. + * + * Does not take i_mutex for the write and thus is not for general purpose + * use. + */ +ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *file = iocb->ki_filp; + ssize_t ret; + + BUG_ON(iocb->ki_pos != pos); + + ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); + if (ret > 0 || ret == -EIOCBQUEUED) { + ssize_t err; + + err = generic_write_sync(file, pos, ret); + if (err < 0 && ret > 0) + ret = err; + } + return ret; +} +EXPORT_SYMBOL_GPL(blkdev_aio_write); + +/* * Try to release a page associated with block device when the system * is under memory pressure. */ @@ -1426,7 +1463,7 @@ const struct file_operations def_blk_fops = { .read = do_sync_read, .write = do_sync_write, .aio_read = generic_file_aio_read, - .aio_write = generic_file_aio_write_nolock, + .aio_write = blkdev_aio_write, .mmap = generic_file_mmap, .fsync = block_fsync, .unlocked_ioctl = block_ioctl, diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 6e4f6c50a120..019e8af449ab 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -424,11 +424,11 @@ int btrfs_requeue_work(struct btrfs_work *work) * list */ if (worker->idle) { - spin_lock_irqsave(&worker->workers->lock, flags); + spin_lock(&worker->workers->lock); worker->idle = 0; list_move_tail(&worker->worker_list, &worker->workers->worker_list); - spin_unlock_irqrestore(&worker->workers->lock, flags); + spin_unlock(&worker->workers->lock); } if (!worker->working) { wake = 1; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 60a45f3a4e91..3fdcc0512d3a 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -557,19 +557,7 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2) btrfs_disk_key_to_cpu(&k1, disk); - if (k1.objectid > k2->objectid) - return 1; - if (k1.objectid < k2->objectid) - return -1; - if (k1.type > k2->type) - return 1; - if (k1.type < k2->type) - return -1; - if (k1.offset > k2->offset) - return 1; - if (k1.offset < k2->offset) - return -1; - return 0; + return btrfs_comp_cpu_keys(&k1, k2); } /* @@ -1052,9 +1040,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, BTRFS_NODEPTRS_PER_BLOCK(root) / 4) return 0; - if (btrfs_header_nritems(mid) > 2) - return 0; - if (btrfs_header_nritems(mid) < 2) err_on_enospc = 1; @@ -1701,6 +1686,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root struct extent_buffer *b; int slot; int ret; + int err; int level; int lowest_unlock = 1; u8 lowest_level = 0; @@ -1737,8 +1723,6 @@ again: p->locks[level] = 1; if (cow) { - int wret; - /* * if we don't really need to cow this block * then we don't want to set the path blocking, @@ -1749,12 +1733,12 @@ again: btrfs_set_path_blocking(p); - wret = btrfs_cow_block(trans, root, b, - p->nodes[level + 1], - p->slots[level + 1], &b); - if (wret) { + err = btrfs_cow_block(trans, root, b, + p->nodes[level + 1], + p->slots[level + 1], &b); + if (err) { free_extent_buffer(b); - ret = wret; + ret = err; goto done; } } @@ -1793,41 +1777,45 @@ cow_done: ret = bin_search(b, key, level, &slot); if (level != 0) { - if (ret && slot > 0) + int dec = 0; + if (ret && slot > 0) { + dec = 1; slot -= 1; + } p->slots[level] = slot; - ret = setup_nodes_for_search(trans, root, p, b, level, + err = setup_nodes_for_search(trans, root, p, b, level, ins_len); - if (ret == -EAGAIN) + if (err == -EAGAIN) goto again; - else if (ret) + if (err) { + ret = err; goto done; + } b = p->nodes[level]; slot = p->slots[level]; unlock_up(p, level, lowest_unlock); - /* this is only true while dropping a snapshot */ if (level == lowest_level) { - ret = 0; + if (dec) + p->slots[level]++; goto done; } - ret = read_block_for_search(trans, root, p, + err = read_block_for_search(trans, root, p, &b, level, slot, key); - if (ret == -EAGAIN) + if (err == -EAGAIN) goto again; - - if (ret == -EIO) + if (err) { + ret = err; goto done; + } if (!p->skip_locking) { - int lret; - btrfs_clear_path_blocking(p, NULL); - lret = btrfs_try_spin_lock(b); + err = btrfs_try_spin_lock(b); - if (!lret) { + if (!err) { btrfs_set_path_blocking(p); btrfs_tree_lock(b); btrfs_clear_path_blocking(p, b); @@ -1837,16 +1825,14 @@ cow_done: p->slots[level] = slot; if (ins_len > 0 && btrfs_leaf_free_space(root, b) < ins_len) { - int sret; - btrfs_set_path_blocking(p); - sret = split_leaf(trans, root, key, - p, ins_len, ret == 0); + err = split_leaf(trans, root, key, + p, ins_len, ret == 0); btrfs_clear_path_blocking(p, NULL); - BUG_ON(sret > 0); - if (sret) { - ret = sret; + BUG_ON(err > 0); + if (err) { + ret = err; goto done; } } @@ -3807,7 +3793,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, } /* delete the leaf if it is mostly empty */ - if (used < BTRFS_LEAF_DATA_SIZE(root) / 2) { + if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) { /* push_leaf_left fixes the path. * make sure the path still points to our leaf * for possible call to del_ptr below @@ -4042,10 +4028,9 @@ out: * calling this function. */ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, - struct btrfs_key *key, int lowest_level, + struct btrfs_key *key, int level, int cache_only, u64 min_trans) { - int level = lowest_level; int slot; struct extent_buffer *c; @@ -4058,11 +4043,40 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, c = path->nodes[level]; next: if (slot >= btrfs_header_nritems(c)) { - level++; - if (level == BTRFS_MAX_LEVEL) + int ret; + int orig_lowest; + struct btrfs_key cur_key; + if (level + 1 >= BTRFS_MAX_LEVEL || + !path->nodes[level + 1]) return 1; - continue; + + if (path->locks[level + 1]) { + level++; + continue; + } + + slot = btrfs_header_nritems(c) - 1; + if (level == 0) + btrfs_item_key_to_cpu(c, &cur_key, slot); + else + btrfs_node_key_to_cpu(c, &cur_key, slot); + + orig_lowest = path->lowest_level; + btrfs_release_path(root, path); + path->lowest_level = level; + ret = btrfs_search_slot(NULL, root, &cur_key, path, + 0, 0); + path->lowest_level = orig_lowest; + if (ret < 0) + return ret; + + c = path->nodes[level]; + slot = path->slots[level]; + if (ret == 0) + slot++; + goto next; } + if (level == 0) btrfs_item_key_to_cpu(c, key, slot); else { @@ -4146,7 +4160,8 @@ again: * advance the path if there are now more items available. */ if (nritems > 0 && path->slots[0] < nritems - 1) { - path->slots[0]++; + if (ret == 0) + path->slots[0]++; ret = 0; goto done; } @@ -4278,10 +4293,10 @@ int btrfs_previous_item(struct btrfs_root *root, path->slots[0]--; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - if (found_key.type == type) - return 0; if (found_key.objectid < min_objectid) break; + if (found_key.type == type) + return 0; if (found_key.objectid == min_objectid && found_key.type < type) break; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 98a873838717..837435ce84ca 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -481,7 +481,7 @@ struct btrfs_shared_data_ref { struct btrfs_extent_inline_ref { u8 type; - u64 offset; + __le64 offset; } __attribute__ ((__packed__)); /* old style backrefs item */ @@ -689,6 +689,7 @@ struct btrfs_space_info { struct list_head block_groups; spinlock_t lock; struct rw_semaphore groups_sem; + atomic_t caching_threads; }; /* @@ -707,6 +708,9 @@ struct btrfs_free_cluster { /* first extent starting offset */ u64 window_start; + /* if this cluster simply points at a bitmap in the block group */ + bool points_to_bitmap; + struct btrfs_block_group_cache *block_group; /* * when a cluster is allocated from a block group, we put the @@ -716,24 +720,37 @@ struct btrfs_free_cluster { struct list_head block_group_list; }; +enum btrfs_caching_type { + BTRFS_CACHE_NO = 0, + BTRFS_CACHE_STARTED = 1, + BTRFS_CACHE_FINISHED = 2, +}; + struct btrfs_block_group_cache { struct btrfs_key key; struct btrfs_block_group_item item; + struct btrfs_fs_info *fs_info; spinlock_t lock; - struct mutex cache_mutex; u64 pinned; u64 reserved; u64 flags; - int cached; + u64 sectorsize; + int extents_thresh; + int free_extents; + int total_bitmaps; int ro; int dirty; + /* cache tracking stuff */ + wait_queue_head_t caching_q; + int cached; + struct btrfs_space_info *space_info; /* free space cache stuff */ spinlock_t tree_lock; - struct rb_root free_space_bytes; struct rb_root free_space_offset; + u64 free_space; /* block group cache stuff */ struct rb_node cache_node; @@ -808,6 +825,7 @@ struct btrfs_fs_info { struct mutex drop_mutex; struct mutex volume_mutex; struct mutex tree_reloc_mutex; + struct rw_semaphore extent_commit_sem; /* * this protects the ordered operations list only while we are @@ -1988,6 +2006,7 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, u64 bytes); void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, u64 bytes); +void btrfs_free_pinned_extents(struct btrfs_fs_info *info); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d28d29c95f7c..15831d5c7367 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1352,6 +1352,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) { int err; + bdi->name = "btrfs"; bdi->capabilities = BDI_CAP_MAP_COPY; err = bdi_init(bdi); if (err) @@ -1639,6 +1640,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, mutex_init(&fs_info->cleaner_mutex); mutex_init(&fs_info->volume_mutex); mutex_init(&fs_info->tree_reloc_mutex); + init_rwsem(&fs_info->extent_commit_sem); btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); btrfs_init_free_cluster(&fs_info->data_alloc_cluster); @@ -1799,6 +1801,11 @@ struct btrfs_root *open_ctree(struct super_block *sb, btrfs_super_chunk_root(disk_super), blocksize, generation); BUG_ON(!chunk_root->node); + if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { + printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n", + sb->s_id); + goto fail_chunk_root; + } btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); chunk_root->commit_root = btrfs_root_node(chunk_root); @@ -1826,6 +1833,11 @@ struct btrfs_root *open_ctree(struct super_block *sb, blocksize, generation); if (!tree_root->node) goto fail_chunk_root; + if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { + printk(KERN_WARNING "btrfs: failed to read tree root on %s\n", + sb->s_id); + goto fail_tree_root; + } btrfs_set_root_node(&tree_root->root_item, tree_root->node); tree_root->commit_root = btrfs_root_node(tree_root); @@ -2322,6 +2334,9 @@ int close_ctree(struct btrfs_root *root) printk(KERN_ERR "btrfs: commit super ret %d\n", ret); } + fs_info->closing = 2; + smp_mb(); + if (fs_info->delalloc_bytes) { printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", (unsigned long long)fs_info->delalloc_bytes); @@ -2343,6 +2358,7 @@ int close_ctree(struct btrfs_root *root) free_extent_buffer(root->fs_info->csum_root->commit_root); btrfs_free_block_groups(root->fs_info); + btrfs_free_pinned_extents(root->fs_info); del_fs_roots(fs_info); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a5aca3997d42..535f85ba104f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -21,6 +21,7 @@ #include <linux/blkdev.h> #include <linux/sort.h> #include <linux/rcupdate.h> +#include <linux/kthread.h> #include "compat.h" #include "hash.h" #include "ctree.h" @@ -61,6 +62,13 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 alloc_bytes, u64 flags, int force); +static noinline int +block_group_cache_done(struct btrfs_block_group_cache *cache) +{ + smp_mb(); + return cache->cached == BTRFS_CACHE_FINISHED; +} + static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) { return (cache->flags & bits) == bits; @@ -146,20 +154,70 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, } /* + * We always set EXTENT_LOCKED for the super mirror extents so we don't + * overwrite them, so those bits need to be unset. Also, if we are unmounting + * with pinned extents still sitting there because we had a block group caching, + * we need to clear those now, since we are done. + */ +void btrfs_free_pinned_extents(struct btrfs_fs_info *info) +{ + u64 start, end, last = 0; + int ret; + + while (1) { + ret = find_first_extent_bit(&info->pinned_extents, last, + &start, &end, + EXTENT_LOCKED|EXTENT_DIRTY); + if (ret) + break; + + clear_extent_bits(&info->pinned_extents, start, end, + EXTENT_LOCKED|EXTENT_DIRTY, GFP_NOFS); + last = end+1; + } +} + +static int remove_sb_from_cache(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + u64 bytenr; + u64 *logical; + int stripe_len; + int i, nr, ret; + + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + bytenr = btrfs_sb_offset(i); + ret = btrfs_rmap_block(&root->fs_info->mapping_tree, + cache->key.objectid, bytenr, + 0, &logical, &nr, &stripe_len); + BUG_ON(ret); + while (nr--) { + try_lock_extent(&fs_info->pinned_extents, + logical[nr], + logical[nr] + stripe_len - 1, GFP_NOFS); + } + kfree(logical); + } + + return 0; +} + +/* * this is only called by cache_block_group, since we could have freed extents * we need to check the pinned_extents for any extents that can't be used yet * since their free space will be released as soon as the transaction commits. */ -static int add_new_free_space(struct btrfs_block_group_cache *block_group, +static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, struct btrfs_fs_info *info, u64 start, u64 end) { - u64 extent_start, extent_end, size; + u64 extent_start, extent_end, size, total_added = 0; int ret; while (start < end) { ret = find_first_extent_bit(&info->pinned_extents, start, &extent_start, &extent_end, - EXTENT_DIRTY); + EXTENT_DIRTY|EXTENT_LOCKED); if (ret) break; @@ -167,6 +225,7 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group, start = extent_end + 1; } else if (extent_start > start && extent_start < end) { size = extent_start - start; + total_added += size; ret = btrfs_add_free_space(block_group, start, size); BUG_ON(ret); @@ -178,84 +237,93 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group, if (start < end) { size = end - start; + total_added += size; ret = btrfs_add_free_space(block_group, start, size); BUG_ON(ret); } - return 0; -} - -static int remove_sb_from_cache(struct btrfs_root *root, - struct btrfs_block_group_cache *cache) -{ - u64 bytenr; - u64 *logical; - int stripe_len; - int i, nr, ret; - - for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { - bytenr = btrfs_sb_offset(i); - ret = btrfs_rmap_block(&root->fs_info->mapping_tree, - cache->key.objectid, bytenr, 0, - &logical, &nr, &stripe_len); - BUG_ON(ret); - while (nr--) { - btrfs_remove_free_space(cache, logical[nr], - stripe_len); - } - kfree(logical); - } - return 0; + return total_added; } -static int cache_block_group(struct btrfs_root *root, - struct btrfs_block_group_cache *block_group) +static int caching_kthread(void *data) { + struct btrfs_block_group_cache *block_group = data; + struct btrfs_fs_info *fs_info = block_group->fs_info; + u64 last = 0; struct btrfs_path *path; int ret = 0; struct btrfs_key key; struct extent_buffer *leaf; int slot; - u64 last; - - if (!block_group) - return 0; - - root = root->fs_info->extent_root; + u64 total_found = 0; - if (block_group->cached) - return 0; + BUG_ON(!fs_info); path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 2; + atomic_inc(&block_group->space_info->caching_threads); + last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); /* - * we get into deadlocks with paths held by callers of this function. - * since the alloc_mutex is protecting things right now, just - * skip the locking here + * We don't want to deadlock with somebody trying to allocate a new + * extent for the extent root while also trying to search the extent + * root to add free space. So we skip locking and search the commit + * root, since its read-only */ path->skip_locking = 1; - last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); + path->search_commit_root = 1; + path->reada = 2; + key.objectid = last; key.offset = 0; btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); +again: + /* need to make sure the commit_root doesn't disappear */ + down_read(&fs_info->extent_commit_sem); + + ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); if (ret < 0) goto err; while (1) { + smp_mb(); + if (block_group->fs_info->closing > 1) { + last = (u64)-1; + break; + } + leaf = path->nodes[0]; slot = path->slots[0]; if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); + ret = btrfs_next_leaf(fs_info->extent_root, path); if (ret < 0) goto err; - if (ret == 0) - continue; - else + else if (ret) break; + + if (need_resched() || + btrfs_transaction_in_commit(fs_info)) { + leaf = path->nodes[0]; + + /* this shouldn't happen, but if the + * leaf is empty just move on. + */ + if (btrfs_header_nritems(leaf) == 0) + break; + /* + * we need to copy the key out so that + * we are sure the next search advances + * us forward in the btree. + */ + btrfs_item_key_to_cpu(leaf, &key, 0); + btrfs_release_path(fs_info->extent_root, path); + up_read(&fs_info->extent_commit_sem); + schedule_timeout(1); + goto again; + } + + continue; } btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid < block_group->key.objectid) @@ -266,24 +334,59 @@ static int cache_block_group(struct btrfs_root *root, break; if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) { - add_new_free_space(block_group, root->fs_info, last, - key.objectid); - + total_found += add_new_free_space(block_group, + fs_info, last, + key.objectid); last = key.objectid + key.offset; } + + if (total_found > (1024 * 1024 * 2)) { + total_found = 0; + wake_up(&block_group->caching_q); + } next: path->slots[0]++; } + ret = 0; - add_new_free_space(block_group, root->fs_info, last, - block_group->key.objectid + - block_group->key.offset); + total_found += add_new_free_space(block_group, fs_info, last, + block_group->key.objectid + + block_group->key.offset); + + spin_lock(&block_group->lock); + block_group->cached = BTRFS_CACHE_FINISHED; + spin_unlock(&block_group->lock); - block_group->cached = 1; - remove_sb_from_cache(root, block_group); - ret = 0; err: btrfs_free_path(path); + up_read(&fs_info->extent_commit_sem); + atomic_dec(&block_group->space_info->caching_threads); + wake_up(&block_group->caching_q); + + return 0; +} + +static int cache_block_group(struct btrfs_block_group_cache *cache) +{ + struct task_struct *tsk; + int ret = 0; + + spin_lock(&cache->lock); + if (cache->cached != BTRFS_CACHE_NO) { + spin_unlock(&cache->lock); + return ret; + } + cache->cached = BTRFS_CACHE_STARTED; + spin_unlock(&cache->lock); + + tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n", + cache->key.objectid); + if (IS_ERR(tsk)) { + ret = PTR_ERR(tsk); + printk(KERN_ERR "error running thread %d\n", ret); + BUG(); + } + return ret; } @@ -1408,7 +1511,8 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, static void btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len) { - blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); + blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, + DISCARD_FL_BARRIER); } #endif @@ -2387,13 +2491,29 @@ fail: } +static struct btrfs_block_group_cache * +next_block_group(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) +{ + struct rb_node *node; + spin_lock(&root->fs_info->block_group_cache_lock); + node = rb_next(&cache->cache_node); + btrfs_put_block_group(cache); + if (node) { + cache = rb_entry(node, struct btrfs_block_group_cache, + cache_node); + atomic_inc(&cache->count); + } else + cache = NULL; + spin_unlock(&root->fs_info->block_group_cache_lock); + return cache; +} + int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - struct btrfs_block_group_cache *cache, *entry; - struct rb_node *n; + struct btrfs_block_group_cache *cache; int err = 0; - int werr = 0; struct btrfs_path *path; u64 last = 0; @@ -2402,39 +2522,35 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, return -ENOMEM; while (1) { - cache = NULL; - spin_lock(&root->fs_info->block_group_cache_lock); - for (n = rb_first(&root->fs_info->block_group_cache_tree); - n; n = rb_next(n)) { - entry = rb_entry(n, struct btrfs_block_group_cache, - cache_node); - if (entry->dirty) { - cache = entry; - break; - } + if (last == 0) { + err = btrfs_run_delayed_refs(trans, root, + (unsigned long)-1); + BUG_ON(err); } - spin_unlock(&root->fs_info->block_group_cache_lock); - if (!cache) - break; + cache = btrfs_lookup_first_block_group(root->fs_info, last); + while (cache) { + if (cache->dirty) + break; + cache = next_block_group(root, cache); + } + if (!cache) { + if (last == 0) + break; + last = 0; + continue; + } cache->dirty = 0; - last += cache->key.offset; + last = cache->key.objectid + cache->key.offset; - err = write_one_cache_group(trans, root, - path, cache); - /* - * if we fail to write the cache group, we want - * to keep it marked dirty in hopes that a later - * write will work - */ - if (err) { - werr = err; - continue; - } + err = write_one_cache_group(trans, root, path, cache); + BUG_ON(err); + btrfs_put_block_group(cache); } + btrfs_free_path(path); - return werr; + return 0; } int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) @@ -2484,6 +2600,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->force_alloc = 0; *space_info = found; list_add_rcu(&found->list, &info->space_info); + atomic_set(&found->caching_threads, 0); return 0; } @@ -2947,13 +3064,9 @@ int btrfs_update_pinned_extents(struct btrfs_root *root, struct btrfs_block_group_cache *cache; struct btrfs_fs_info *fs_info = root->fs_info; - if (pin) { + if (pin) set_extent_dirty(&fs_info->pinned_extents, bytenr, bytenr + num - 1, GFP_NOFS); - } else { - clear_extent_dirty(&fs_info->pinned_extents, - bytenr, bytenr + num - 1, GFP_NOFS); - } while (num > 0) { cache = btrfs_lookup_block_group(fs_info, bytenr); @@ -2969,14 +3082,34 @@ int btrfs_update_pinned_extents(struct btrfs_root *root, spin_unlock(&cache->space_info->lock); fs_info->total_pinned += len; } else { + int unpin = 0; + + /* + * in order to not race with the block group caching, we + * only want to unpin the extent if we are cached. If + * we aren't cached, we want to start async caching this + * block group so we can free the extent the next time + * around. + */ spin_lock(&cache->space_info->lock); spin_lock(&cache->lock); - cache->pinned -= len; - cache->space_info->bytes_pinned -= len; + unpin = (cache->cached == BTRFS_CACHE_FINISHED); + if (likely(unpin)) { + cache->pinned -= len; + cache->space_info->bytes_pinned -= len; + fs_info->total_pinned -= len; + } spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - fs_info->total_pinned -= len; - if (cache->cached) + + if (likely(unpin)) + clear_extent_dirty(&fs_info->pinned_extents, + bytenr, bytenr + len -1, + GFP_NOFS); + else + cache_block_group(cache); + + if (unpin) btrfs_add_free_space(cache, bytenr, len); } btrfs_put_block_group(cache); @@ -3030,6 +3163,7 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy) &start, &end, EXTENT_DIRTY); if (ret) break; + set_extent_dirty(copy, start, end, GFP_NOFS); last = end + 1; } @@ -3058,6 +3192,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, cond_resched(); } + return ret; } @@ -3436,6 +3571,45 @@ static u64 stripe_align(struct btrfs_root *root, u64 val) } /* + * when we wait for progress in the block group caching, its because + * our allocation attempt failed at least once. So, we must sleep + * and let some progress happen before we try again. + * + * This function will sleep at least once waiting for new free space to + * show up, and then it will check the block group free space numbers + * for our min num_bytes. Another option is to have it go ahead + * and look in the rbtree for a free extent of a given size, but this + * is a good start. + */ +static noinline int +wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, + u64 num_bytes) +{ + DEFINE_WAIT(wait); + + prepare_to_wait(&cache->caching_q, &wait, TASK_UNINTERRUPTIBLE); + + if (block_group_cache_done(cache)) { + finish_wait(&cache->caching_q, &wait); + return 0; + } + schedule(); + finish_wait(&cache->caching_q, &wait); + + wait_event(cache->caching_q, block_group_cache_done(cache) || + (cache->free_space >= num_bytes)); + return 0; +} + +enum btrfs_loop_type { + LOOP_CACHED_ONLY = 0, + LOOP_CACHING_NOWAIT = 1, + LOOP_CACHING_WAIT = 2, + LOOP_ALLOC_CHUNK = 3, + LOOP_NO_EMPTY_SIZE = 4, +}; + +/* * walks the btree of allocated extents and find a hole of a given size. * The key ins is changed to record the hole: * ins->objectid == block start @@ -3460,6 +3634,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, struct btrfs_space_info *space_info; int last_ptr_loop = 0; int loop = 0; + bool found_uncached_bg = false; WARN_ON(num_bytes < root->sectorsize); btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); @@ -3491,15 +3666,18 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, search_start = max(search_start, first_logical_byte(root, 0)); search_start = max(search_start, hint_byte); - if (!last_ptr) { + if (!last_ptr) empty_cluster = 0; - loop = 1; - } if (search_start == hint_byte) { block_group = btrfs_lookup_block_group(root->fs_info, search_start); - if (block_group && block_group_bits(block_group, data)) { + /* + * we don't want to use the block group if it doesn't match our + * allocation bits, or if its not cached. + */ + if (block_group && block_group_bits(block_group, data) && + block_group_cache_done(block_group)) { down_read(&space_info->groups_sem); if (list_empty(&block_group->list) || block_group->ro) { @@ -3522,21 +3700,35 @@ search: down_read(&space_info->groups_sem); list_for_each_entry(block_group, &space_info->block_groups, list) { u64 offset; + int cached; atomic_inc(&block_group->count); search_start = block_group->key.objectid; have_block_group: - if (unlikely(!block_group->cached)) { - mutex_lock(&block_group->cache_mutex); - ret = cache_block_group(root, block_group); - mutex_unlock(&block_group->cache_mutex); - if (ret) { - btrfs_put_block_group(block_group); - break; + if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { + /* + * we want to start caching kthreads, but not too many + * right off the bat so we don't overwhelm the system, + * so only start them if there are less than 2 and we're + * in the initial allocation phase. + */ + if (loop > LOOP_CACHING_NOWAIT || + atomic_read(&space_info->caching_threads) < 2) { + ret = cache_block_group(block_group); + BUG_ON(ret); } } + cached = block_group_cache_done(block_group); + if (unlikely(!cached)) { + found_uncached_bg = true; + + /* if we only want cached bgs, loop */ + if (loop == LOOP_CACHED_ONLY) + goto loop; + } + if (unlikely(block_group->ro)) goto loop; @@ -3615,14 +3807,21 @@ refill_cluster: spin_unlock(&last_ptr->refill_lock); goto checks; } + } else if (!cached && loop > LOOP_CACHING_NOWAIT) { + spin_unlock(&last_ptr->refill_lock); + + wait_block_group_cache_progress(block_group, + num_bytes + empty_cluster + empty_size); + goto have_block_group; } + /* * at this point we either didn't find a cluster * or we weren't able to allocate a block from our * cluster. Free the cluster we've been trying * to use, and go to the next block group */ - if (loop < 2) { + if (loop < LOOP_NO_EMPTY_SIZE) { btrfs_return_cluster_to_free_space(NULL, last_ptr); spin_unlock(&last_ptr->refill_lock); @@ -3633,11 +3832,17 @@ refill_cluster: offset = btrfs_find_space_for_alloc(block_group, search_start, num_bytes, empty_size); - if (!offset) + if (!offset && (cached || (!cached && + loop == LOOP_CACHING_NOWAIT))) { goto loop; + } else if (!offset && (!cached && + loop > LOOP_CACHING_NOWAIT)) { + wait_block_group_cache_progress(block_group, + num_bytes + empty_size); + goto have_block_group; + } checks: search_start = stripe_align(root, offset); - /* move on to the next group */ if (search_start + num_bytes >= search_end) { btrfs_add_free_space(block_group, offset, num_bytes); @@ -3683,13 +3888,26 @@ loop: } up_read(&space_info->groups_sem); - /* loop == 0, try to find a clustered alloc in every block group - * loop == 1, try again after forcing a chunk allocation - * loop == 2, set empty_size and empty_cluster to 0 and try again + /* LOOP_CACHED_ONLY, only search fully cached block groups + * LOOP_CACHING_NOWAIT, search partially cached block groups, but + * dont wait foR them to finish caching + * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching + * LOOP_ALLOC_CHUNK, force a chunk allocation and try again + * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try + * again */ - if (!ins->objectid && loop < 3 && - (empty_size || empty_cluster || allowed_chunk_alloc)) { - if (loop >= 2) { + if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && + (found_uncached_bg || empty_size || empty_cluster || + allowed_chunk_alloc)) { + if (found_uncached_bg) { + found_uncached_bg = false; + if (loop < LOOP_CACHING_WAIT) { + loop++; + goto search; + } + } + + if (loop == LOOP_ALLOC_CHUNK) { empty_size = 0; empty_cluster = 0; } @@ -3702,7 +3920,7 @@ loop: space_info->force_alloc = 1; } - if (loop < 3) { + if (loop < LOOP_NO_EMPTY_SIZE) { loop++; goto search; } @@ -3798,7 +4016,7 @@ again: num_bytes, data, 1); goto again; } - if (ret) { + if (ret == -ENOSPC) { struct btrfs_space_info *sinfo; sinfo = __find_space_info(root->fs_info, data); @@ -3806,7 +4024,6 @@ again: "wanted %llu\n", (unsigned long long)data, (unsigned long long)num_bytes); dump_space_info(sinfo, num_bytes); - BUG(); } return ret; @@ -3844,7 +4061,9 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size, empty_size, hint_byte, search_end, ins, data); - update_reserved_extents(root, ins->objectid, ins->offset, 1); + if (!ret) + update_reserved_extents(root, ins->objectid, ins->offset, 1); + return ret; } @@ -4006,9 +4225,9 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, struct btrfs_block_group_cache *block_group; block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); - mutex_lock(&block_group->cache_mutex); - cache_block_group(root, block_group); - mutex_unlock(&block_group->cache_mutex); + cache_block_group(block_group); + wait_event(block_group->caching_q, + block_group_cache_done(block_group)); ret = btrfs_remove_free_space(block_group, ins->objectid, ins->offset); @@ -4039,7 +4258,8 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans, ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes, empty_size, hint_byte, search_end, ins, 0); - BUG_ON(ret); + if (ret) + return ret; if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { if (parent == 0) @@ -6955,11 +7175,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) &info->block_group_cache_tree); spin_unlock(&info->block_group_cache_lock); - btrfs_remove_free_space_cache(block_group); down_write(&block_group->space_info->groups_sem); list_del(&block_group->list); up_write(&block_group->space_info->groups_sem); + if (block_group->cached == BTRFS_CACHE_STARTED) + wait_event(block_group->caching_q, + block_group_cache_done(block_group)); + + btrfs_remove_free_space_cache(block_group); + WARN_ON(atomic_read(&block_group->count) != 1); kfree(block_group); @@ -7025,9 +7250,19 @@ int btrfs_read_block_groups(struct btrfs_root *root) atomic_set(&cache->count, 1); spin_lock_init(&cache->lock); spin_lock_init(&cache->tree_lock); - mutex_init(&cache->cache_mutex); + cache->fs_info = info; + init_waitqueue_head(&cache->caching_q); INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); + + /* + * we only want to have 32k of ram per block group for keeping + * track of free space, and if we pass 1/2 of that we want to + * start converting things over to using bitmaps + */ + cache->extents_thresh = ((1024 * 32) / 2) / + sizeof(struct btrfs_free_space); + read_extent_buffer(leaf, &cache->item, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(cache->item)); @@ -7036,6 +7271,26 @@ int btrfs_read_block_groups(struct btrfs_root *root) key.objectid = found_key.objectid + found_key.offset; btrfs_release_path(root, path); cache->flags = btrfs_block_group_flags(&cache->item); + cache->sectorsize = root->sectorsize; + + remove_sb_from_cache(root, cache); + + /* + * check for two cases, either we are full, and therefore + * don't need to bother with the caching work since we won't + * find any space, or we are empty, and we can just add all + * the space in and be done with it. This saves us _alot_ of + * time, particularly in the full case. + */ + if (found_key.offset == btrfs_block_group_used(&cache->item)) { + cache->cached = BTRFS_CACHE_FINISHED; + } else if (btrfs_block_group_used(&cache->item) == 0) { + cache->cached = BTRFS_CACHE_FINISHED; + add_new_free_space(cache, root->fs_info, + found_key.objectid, + found_key.objectid + + found_key.offset); + } ret = update_space_info(info, cache->flags, found_key.offset, btrfs_block_group_used(&cache->item), @@ -7079,10 +7334,19 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->key.objectid = chunk_offset; cache->key.offset = size; cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + cache->sectorsize = root->sectorsize; + + /* + * we only want to have 32k of ram per block group for keeping track + * of free space, and if we pass 1/2 of that we want to start + * converting things over to using bitmaps + */ + cache->extents_thresh = ((1024 * 32) / 2) / + sizeof(struct btrfs_free_space); atomic_set(&cache->count, 1); spin_lock_init(&cache->lock); spin_lock_init(&cache->tree_lock); - mutex_init(&cache->cache_mutex); + init_waitqueue_head(&cache->caching_q); INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); @@ -7091,6 +7355,12 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->flags = type; btrfs_set_block_group_flags(&cache->item, type); + cache->cached = BTRFS_CACHE_FINISHED; + remove_sb_from_cache(root, cache); + + add_new_free_space(cache, root->fs_info, chunk_offset, + chunk_offset + size); + ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, &cache->space_info); BUG_ON(ret); @@ -7149,7 +7419,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, rb_erase(&block_group->cache_node, &root->fs_info->block_group_cache_tree); spin_unlock(&root->fs_info->block_group_cache_lock); - btrfs_remove_free_space_cache(block_group); + down_write(&block_group->space_info->groups_sem); /* * we must use list_del_init so people can check to see if they @@ -7158,11 +7428,18 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, list_del_init(&block_group->list); up_write(&block_group->space_info->groups_sem); + if (block_group->cached == BTRFS_CACHE_STARTED) + wait_event(block_group->caching_q, + block_group_cache_done(block_group)); + + btrfs_remove_free_space_cache(block_group); + spin_lock(&block_group->space_info->lock); block_group->space_info->total_bytes -= block_group->key.offset; block_group->space_info->bytes_readonly -= block_group->key.offset; spin_unlock(&block_group->space_info->lock); - block_group->space_info->full = 0; + + btrfs_clear_space_info_full(root->fs_info); btrfs_put_block_group(block_group); btrfs_put_block_group(block_group); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 4538e48581a5..5edcee3a617f 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -16,45 +16,46 @@ * Boston, MA 021110-1307, USA. */ +#include <linux/pagemap.h> #include <linux/sched.h> +#include <linux/math64.h> #include "ctree.h" #include "free-space-cache.h" #include "transaction.h" -struct btrfs_free_space { - struct rb_node bytes_index; - struct rb_node offset_index; - u64 offset; - u64 bytes; -}; +#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) +#define MAX_CACHE_BYTES_PER_GIG (32 * 1024) -static int tree_insert_offset(struct rb_root *root, u64 offset, - struct rb_node *node) +static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize, + u64 offset) { - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - struct btrfs_free_space *info; + BUG_ON(offset < bitmap_start); + offset -= bitmap_start; + return (unsigned long)(div64_u64(offset, sectorsize)); +} - while (*p) { - parent = *p; - info = rb_entry(parent, struct btrfs_free_space, offset_index); +static inline unsigned long bytes_to_bits(u64 bytes, u64 sectorsize) +{ + return (unsigned long)(div64_u64(bytes, sectorsize)); +} - if (offset < info->offset) - p = &(*p)->rb_left; - else if (offset > info->offset) - p = &(*p)->rb_right; - else - return -EEXIST; - } +static inline u64 offset_to_bitmap(struct btrfs_block_group_cache *block_group, + u64 offset) +{ + u64 bitmap_start; + u64 bytes_per_bitmap; - rb_link_node(node, parent, p); - rb_insert_color(node, root); + bytes_per_bitmap = BITS_PER_BITMAP * block_group->sectorsize; + bitmap_start = offset - block_group->key.objectid; + bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); + bitmap_start *= bytes_per_bitmap; + bitmap_start += block_group->key.objectid; - return 0; + return bitmap_start; } -static int tree_insert_bytes(struct rb_root *root, u64 bytes, - struct rb_node *node) +static int tree_insert_offset(struct rb_root *root, u64 offset, + struct rb_node *node, int bitmap) { struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; @@ -62,12 +63,34 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes, while (*p) { parent = *p; - info = rb_entry(parent, struct btrfs_free_space, bytes_index); + info = rb_entry(parent, struct btrfs_free_space, offset_index); - if (bytes < info->bytes) + if (offset < info->offset) { p = &(*p)->rb_left; - else + } else if (offset > info->offset) { p = &(*p)->rb_right; + } else { + /* + * we could have a bitmap entry and an extent entry + * share the same offset. If this is the case, we want + * the extent entry to always be found first if we do a + * linear search through the tree, since we want to have + * the quickest allocation time, and allocating from an + * extent is faster than allocating from a bitmap. So + * if we're inserting a bitmap and we find an entry at + * this offset, we want to go right, or after this entry + * logically. If we are inserting an extent and we've + * found a bitmap, we want to go left, or before + * logically. + */ + if (bitmap) { + WARN_ON(info->bitmap); + p = &(*p)->rb_right; + } else { + WARN_ON(!info->bitmap); + p = &(*p)->rb_left; + } + } } rb_link_node(node, parent, p); @@ -79,110 +102,143 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes, /* * searches the tree for the given offset. * - * fuzzy == 1: this is used for allocations where we are given a hint of where - * to look for free space. Because the hint may not be completely on an offset - * mark, or the hint may no longer point to free space we need to fudge our - * results a bit. So we look for free space starting at or after offset with at - * least bytes size. We prefer to find as close to the given offset as we can. - * Also if the offset is within a free space range, then we will return the free - * space that contains the given offset, which means we can return a free space - * chunk with an offset before the provided offset. - * - * fuzzy == 0: this is just a normal tree search. Give us the free space that - * starts at the given offset which is at least bytes size, and if its not there - * return NULL. + * fuzzy - If this is set, then we are trying to make an allocation, and we just + * want a section that has at least bytes size and comes at or after the given + * offset. */ -static struct btrfs_free_space *tree_search_offset(struct rb_root *root, - u64 offset, u64 bytes, - int fuzzy) +static struct btrfs_free_space * +tree_search_offset(struct btrfs_block_group_cache *block_group, + u64 offset, int bitmap_only, int fuzzy) { - struct rb_node *n = root->rb_node; - struct btrfs_free_space *entry, *ret = NULL; + struct rb_node *n = block_group->free_space_offset.rb_node; + struct btrfs_free_space *entry, *prev = NULL; + + /* find entry that is closest to the 'offset' */ + while (1) { + if (!n) { + entry = NULL; + break; + } - while (n) { entry = rb_entry(n, struct btrfs_free_space, offset_index); + prev = entry; - if (offset < entry->offset) { - if (fuzzy && - (!ret || entry->offset < ret->offset) && - (bytes <= entry->bytes)) - ret = entry; + if (offset < entry->offset) n = n->rb_left; - } else if (offset > entry->offset) { - if (fuzzy && - (entry->offset + entry->bytes - 1) >= offset && - bytes <= entry->bytes) { - ret = entry; - break; - } + else if (offset > entry->offset) n = n->rb_right; - } else { - if (bytes > entry->bytes) { - n = n->rb_right; - continue; - } - ret = entry; + else break; - } } - return ret; -} + if (bitmap_only) { + if (!entry) + return NULL; + if (entry->bitmap) + return entry; -/* - * return a chunk at least bytes size, as close to offset that we can get. - */ -static struct btrfs_free_space *tree_search_bytes(struct rb_root *root, - u64 offset, u64 bytes) -{ - struct rb_node *n = root->rb_node; - struct btrfs_free_space *entry, *ret = NULL; - - while (n) { - entry = rb_entry(n, struct btrfs_free_space, bytes_index); + /* + * bitmap entry and extent entry may share same offset, + * in that case, bitmap entry comes after extent entry. + */ + n = rb_next(n); + if (!n) + return NULL; + entry = rb_entry(n, struct btrfs_free_space, offset_index); + if (entry->offset != offset) + return NULL; - if (bytes < entry->bytes) { + WARN_ON(!entry->bitmap); + return entry; + } else if (entry) { + if (entry->bitmap) { /* - * We prefer to get a hole size as close to the size we - * are asking for so we don't take small slivers out of - * huge holes, but we also want to get as close to the - * offset as possible so we don't have a whole lot of - * fragmentation. + * if previous extent entry covers the offset, + * we should return it instead of the bitmap entry */ - if (offset <= entry->offset) { - if (!ret) - ret = entry; - else if (entry->bytes < ret->bytes) - ret = entry; - else if (entry->offset < ret->offset) - ret = entry; + n = &entry->offset_index; + while (1) { + n = rb_prev(n); + if (!n) + break; + prev = rb_entry(n, struct btrfs_free_space, + offset_index); + if (!prev->bitmap) { + if (prev->offset + prev->bytes > offset) + entry = prev; + break; + } } - n = n->rb_left; - } else if (bytes > entry->bytes) { - n = n->rb_right; + } + return entry; + } + + if (!prev) + return NULL; + + /* find last entry before the 'offset' */ + entry = prev; + if (entry->offset > offset) { + n = rb_prev(&entry->offset_index); + if (n) { + entry = rb_entry(n, struct btrfs_free_space, + offset_index); + BUG_ON(entry->offset > offset); } else { - /* - * Ok we may have multiple chunks of the wanted size, - * so we don't want to take the first one we find, we - * want to take the one closest to our given offset, so - * keep searching just in case theres a better match. - */ - n = n->rb_right; - if (offset > entry->offset) - continue; - else if (!ret || entry->offset < ret->offset) - ret = entry; + if (fuzzy) + return entry; + else + return NULL; } } - return ret; + if (entry->bitmap) { + n = &entry->offset_index; + while (1) { + n = rb_prev(n); + if (!n) + break; + prev = rb_entry(n, struct btrfs_free_space, + offset_index); + if (!prev->bitmap) { + if (prev->offset + prev->bytes > offset) + return prev; + break; + } + } + if (entry->offset + BITS_PER_BITMAP * + block_group->sectorsize > offset) + return entry; + } else if (entry->offset + entry->bytes > offset) + return entry; + + if (!fuzzy) + return NULL; + + while (1) { + if (entry->bitmap) { + if (entry->offset + BITS_PER_BITMAP * + block_group->sectorsize > offset) + break; + } else { + if (entry->offset + entry->bytes > offset) + break; + } + + n = rb_next(&entry->offset_index); + if (!n) + return NULL; + entry = rb_entry(n, struct btrfs_free_space, offset_index); + } + return entry; } static void unlink_free_space(struct btrfs_block_group_cache *block_group, struct btrfs_free_space *info) { rb_erase(&info->offset_index, &block_group->free_space_offset); - rb_erase(&info->bytes_index, &block_group->free_space_bytes); + block_group->free_extents--; + block_group->free_space -= info->bytes; } static int link_free_space(struct btrfs_block_group_cache *block_group, @@ -190,17 +246,353 @@ static int link_free_space(struct btrfs_block_group_cache *block_group, { int ret = 0; - - BUG_ON(!info->bytes); + BUG_ON(!info->bitmap && !info->bytes); ret = tree_insert_offset(&block_group->free_space_offset, info->offset, - &info->offset_index); + &info->offset_index, (info->bitmap != NULL)); if (ret) return ret; - ret = tree_insert_bytes(&block_group->free_space_bytes, info->bytes, - &info->bytes_index); - if (ret) - return ret; + block_group->free_space += info->bytes; + block_group->free_extents++; + return ret; +} + +static void recalculate_thresholds(struct btrfs_block_group_cache *block_group) +{ + u64 max_bytes, possible_bytes; + + /* + * The goal is to keep the total amount of memory used per 1gb of space + * at or below 32k, so we need to adjust how much memory we allow to be + * used by extent based free space tracking + */ + max_bytes = MAX_CACHE_BYTES_PER_GIG * + (div64_u64(block_group->key.offset, 1024 * 1024 * 1024)); + + possible_bytes = (block_group->total_bitmaps * PAGE_CACHE_SIZE) + + (sizeof(struct btrfs_free_space) * + block_group->extents_thresh); + + if (possible_bytes > max_bytes) { + int extent_bytes = max_bytes - + (block_group->total_bitmaps * PAGE_CACHE_SIZE); + + if (extent_bytes <= 0) { + block_group->extents_thresh = 0; + return; + } + + block_group->extents_thresh = extent_bytes / + (sizeof(struct btrfs_free_space)); + } +} + +static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info, u64 offset, + u64 bytes) +{ + unsigned long start, end; + unsigned long i; + + start = offset_to_bit(info->offset, block_group->sectorsize, offset); + end = start + bytes_to_bits(bytes, block_group->sectorsize); + BUG_ON(end > BITS_PER_BITMAP); + + for (i = start; i < end; i++) + clear_bit(i, info->bitmap); + + info->bytes -= bytes; + block_group->free_space -= bytes; +} + +static void bitmap_set_bits(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info, u64 offset, + u64 bytes) +{ + unsigned long start, end; + unsigned long i; + + start = offset_to_bit(info->offset, block_group->sectorsize, offset); + end = start + bytes_to_bits(bytes, block_group->sectorsize); + BUG_ON(end > BITS_PER_BITMAP); + + for (i = start; i < end; i++) + set_bit(i, info->bitmap); + + info->bytes += bytes; + block_group->free_space += bytes; +} + +static int search_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *bitmap_info, u64 *offset, + u64 *bytes) +{ + unsigned long found_bits = 0; + unsigned long bits, i; + unsigned long next_zero; + + i = offset_to_bit(bitmap_info->offset, block_group->sectorsize, + max_t(u64, *offset, bitmap_info->offset)); + bits = bytes_to_bits(*bytes, block_group->sectorsize); + + for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i); + i < BITS_PER_BITMAP; + i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) { + next_zero = find_next_zero_bit(bitmap_info->bitmap, + BITS_PER_BITMAP, i); + if ((next_zero - i) >= bits) { + found_bits = next_zero - i; + break; + } + i = next_zero; + } + + if (found_bits) { + *offset = (u64)(i * block_group->sectorsize) + + bitmap_info->offset; + *bytes = (u64)(found_bits) * block_group->sectorsize; + return 0; + } + + return -1; +} + +static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache + *block_group, u64 *offset, + u64 *bytes, int debug) +{ + struct btrfs_free_space *entry; + struct rb_node *node; + int ret; + + if (!block_group->free_space_offset.rb_node) + return NULL; + + entry = tree_search_offset(block_group, + offset_to_bitmap(block_group, *offset), + 0, 1); + if (!entry) + return NULL; + + for (node = &entry->offset_index; node; node = rb_next(node)) { + entry = rb_entry(node, struct btrfs_free_space, offset_index); + if (entry->bytes < *bytes) + continue; + + if (entry->bitmap) { + ret = search_bitmap(block_group, entry, offset, bytes); + if (!ret) + return entry; + continue; + } + + *offset = entry->offset; + *bytes = entry->bytes; + return entry; + } + + return NULL; +} + +static void add_new_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info, u64 offset) +{ + u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize; + int max_bitmaps = (int)div64_u64(block_group->key.offset + + bytes_per_bg - 1, bytes_per_bg); + BUG_ON(block_group->total_bitmaps >= max_bitmaps); + + info->offset = offset_to_bitmap(block_group, offset); + link_free_space(block_group, info); + block_group->total_bitmaps++; + + recalculate_thresholds(block_group); +} + +static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *bitmap_info, + u64 *offset, u64 *bytes) +{ + u64 end; + u64 search_start, search_bytes; + int ret; + +again: + end = bitmap_info->offset + + (u64)(BITS_PER_BITMAP * block_group->sectorsize) - 1; + + /* + * XXX - this can go away after a few releases. + * + * since the only user of btrfs_remove_free_space is the tree logging + * stuff, and the only way to test that is under crash conditions, we + * want to have this debug stuff here just in case somethings not + * working. Search the bitmap for the space we are trying to use to + * make sure its actually there. If its not there then we need to stop + * because something has gone wrong. + */ + search_start = *offset; + search_bytes = *bytes; + ret = search_bitmap(block_group, bitmap_info, &search_start, + &search_bytes); + BUG_ON(ret < 0 || search_start != *offset); + + if (*offset > bitmap_info->offset && *offset + *bytes > end) { + bitmap_clear_bits(block_group, bitmap_info, *offset, + end - *offset + 1); + *bytes -= end - *offset + 1; + *offset = end + 1; + } else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) { + bitmap_clear_bits(block_group, bitmap_info, *offset, *bytes); + *bytes = 0; + } + + if (*bytes) { + struct rb_node *next = rb_next(&bitmap_info->offset_index); + if (!bitmap_info->bytes) { + unlink_free_space(block_group, bitmap_info); + kfree(bitmap_info->bitmap); + kfree(bitmap_info); + block_group->total_bitmaps--; + recalculate_thresholds(block_group); + } + + /* + * no entry after this bitmap, but we still have bytes to + * remove, so something has gone wrong. + */ + if (!next) + return -EINVAL; + + bitmap_info = rb_entry(next, struct btrfs_free_space, + offset_index); + + /* + * if the next entry isn't a bitmap we need to return to let the + * extent stuff do its work. + */ + if (!bitmap_info->bitmap) + return -EAGAIN; + + /* + * Ok the next item is a bitmap, but it may not actually hold + * the information for the rest of this free space stuff, so + * look for it, and if we don't find it return so we can try + * everything over again. + */ + search_start = *offset; + search_bytes = *bytes; + ret = search_bitmap(block_group, bitmap_info, &search_start, + &search_bytes); + if (ret < 0 || search_start != *offset) + return -EAGAIN; + + goto again; + } else if (!bitmap_info->bytes) { + unlink_free_space(block_group, bitmap_info); + kfree(bitmap_info->bitmap); + kfree(bitmap_info); + block_group->total_bitmaps--; + recalculate_thresholds(block_group); + } + + return 0; +} + +static int insert_into_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info) +{ + struct btrfs_free_space *bitmap_info; + int added = 0; + u64 bytes, offset, end; + int ret; + + /* + * If we are below the extents threshold then we can add this as an + * extent, and don't have to deal with the bitmap + */ + if (block_group->free_extents < block_group->extents_thresh && + info->bytes > block_group->sectorsize * 4) + return 0; + + /* + * some block groups are so tiny they can't be enveloped by a bitmap, so + * don't even bother to create a bitmap for this + */ + if (BITS_PER_BITMAP * block_group->sectorsize > + block_group->key.offset) + return 0; + + bytes = info->bytes; + offset = info->offset; + +again: + bitmap_info = tree_search_offset(block_group, + offset_to_bitmap(block_group, offset), + 1, 0); + if (!bitmap_info) { + BUG_ON(added); + goto new_bitmap; + } + + end = bitmap_info->offset + + (u64)(BITS_PER_BITMAP * block_group->sectorsize); + + if (offset >= bitmap_info->offset && offset + bytes > end) { + bitmap_set_bits(block_group, bitmap_info, offset, + end - offset); + bytes -= end - offset; + offset = end; + added = 0; + } else if (offset >= bitmap_info->offset && offset + bytes <= end) { + bitmap_set_bits(block_group, bitmap_info, offset, bytes); + bytes = 0; + } else { + BUG(); + } + + if (!bytes) { + ret = 1; + goto out; + } else + goto again; + +new_bitmap: + if (info && info->bitmap) { + add_new_bitmap(block_group, info, offset); + added = 1; + info = NULL; + goto again; + } else { + spin_unlock(&block_group->tree_lock); + + /* no pre-allocated info, allocate a new one */ + if (!info) { + info = kzalloc(sizeof(struct btrfs_free_space), + GFP_NOFS); + if (!info) { + spin_lock(&block_group->tree_lock); + ret = -ENOMEM; + goto out; + } + } + + /* allocate the bitmap */ + info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); + spin_lock(&block_group->tree_lock); + if (!info->bitmap) { + ret = -ENOMEM; + goto out; + } + goto again; + } + +out: + if (info) { + if (info->bitmap) + kfree(info->bitmap); + kfree(info); + } return ret; } @@ -208,8 +600,8 @@ static int link_free_space(struct btrfs_block_group_cache *block_group, int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, u64 offset, u64 bytes) { - struct btrfs_free_space *right_info; - struct btrfs_free_space *left_info; + struct btrfs_free_space *right_info = NULL; + struct btrfs_free_space *left_info = NULL; struct btrfs_free_space *info = NULL; int ret = 0; @@ -227,18 +619,38 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, * are adding, if there is remove that struct and add a new one to * cover the entire range */ - right_info = tree_search_offset(&block_group->free_space_offset, - offset+bytes, 0, 0); - left_info = tree_search_offset(&block_group->free_space_offset, - offset-1, 0, 1); + right_info = tree_search_offset(block_group, offset + bytes, 0, 0); + if (right_info && rb_prev(&right_info->offset_index)) + left_info = rb_entry(rb_prev(&right_info->offset_index), + struct btrfs_free_space, offset_index); + else + left_info = tree_search_offset(block_group, offset - 1, 0, 0); + + /* + * If there was no extent directly to the left or right of this new + * extent then we know we're going to have to allocate a new extent, so + * before we do that see if we need to drop this into a bitmap + */ + if ((!left_info || left_info->bitmap) && + (!right_info || right_info->bitmap)) { + ret = insert_into_bitmap(block_group, info); + + if (ret < 0) { + goto out; + } else if (ret) { + ret = 0; + goto out; + } + } - if (right_info) { + if (right_info && !right_info->bitmap) { unlink_free_space(block_group, right_info); info->bytes += right_info->bytes; kfree(right_info); } - if (left_info && left_info->offset + left_info->bytes == offset) { + if (left_info && !left_info->bitmap && + left_info->offset + left_info->bytes == offset) { unlink_free_space(block_group, left_info); info->offset = left_info->offset; info->bytes += left_info->bytes; @@ -248,11 +660,11 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, ret = link_free_space(block_group, info); if (ret) kfree(info); - +out: spin_unlock(&block_group->tree_lock); if (ret) { - printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret); + printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret); BUG_ON(ret == -EEXIST); } @@ -263,40 +675,74 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, u64 offset, u64 bytes) { struct btrfs_free_space *info; + struct btrfs_free_space *next_info = NULL; int ret = 0; spin_lock(&block_group->tree_lock); - info = tree_search_offset(&block_group->free_space_offset, offset, 0, - 1); - if (info && info->offset == offset) { - if (info->bytes < bytes) { - printk(KERN_ERR "Found free space at %llu, size %llu," - "trying to use %llu\n", - (unsigned long long)info->offset, - (unsigned long long)info->bytes, - (unsigned long long)bytes); +again: + info = tree_search_offset(block_group, offset, 0, 0); + if (!info) { + /* + * oops didn't find an extent that matched the space we wanted + * to remove, look for a bitmap instead + */ + info = tree_search_offset(block_group, + offset_to_bitmap(block_group, offset), + 1, 0); + if (!info) { + WARN_ON(1); + goto out_lock; + } + } + + if (info->bytes < bytes && rb_next(&info->offset_index)) { + u64 end; + next_info = rb_entry(rb_next(&info->offset_index), + struct btrfs_free_space, + offset_index); + + if (next_info->bitmap) + end = next_info->offset + BITS_PER_BITMAP * + block_group->sectorsize - 1; + else + end = next_info->offset + next_info->bytes; + + if (next_info->bytes < bytes || + next_info->offset > offset || offset > end) { + printk(KERN_CRIT "Found free space at %llu, size %llu," + " trying to use %llu\n", + (unsigned long long)info->offset, + (unsigned long long)info->bytes, + (unsigned long long)bytes); WARN_ON(1); ret = -EINVAL; - spin_unlock(&block_group->tree_lock); - goto out; + goto out_lock; } - unlink_free_space(block_group, info); - if (info->bytes == bytes) { - kfree(info); - spin_unlock(&block_group->tree_lock); - goto out; + info = next_info; + } + + if (info->bytes == bytes) { + unlink_free_space(block_group, info); + if (info->bitmap) { + kfree(info->bitmap); + block_group->total_bitmaps--; } + kfree(info); + goto out_lock; + } + if (!info->bitmap && info->offset == offset) { + unlink_free_space(block_group, info); info->offset += bytes; info->bytes -= bytes; + link_free_space(block_group, info); + goto out_lock; + } - ret = link_free_space(block_group, info); - spin_unlock(&block_group->tree_lock); - BUG_ON(ret); - } else if (info && info->offset < offset && - info->offset + info->bytes >= offset + bytes) { + if (!info->bitmap && info->offset <= offset && + info->offset + info->bytes >= offset + bytes) { u64 old_start = info->offset; /* * we're freeing space in the middle of the info, @@ -312,7 +758,9 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, info->offset = offset + bytes; info->bytes = old_end - info->offset; ret = link_free_space(block_group, info); - BUG_ON(ret); + WARN_ON(ret); + if (ret) + goto out_lock; } else { /* the hole we're creating ends at the end * of the info struct, just free the info @@ -320,32 +768,22 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, kfree(info); } spin_unlock(&block_group->tree_lock); - /* step two, insert a new info struct to cover anything - * before the hole + + /* step two, insert a new info struct to cover + * anything before the hole */ ret = btrfs_add_free_space(block_group, old_start, offset - old_start); - BUG_ON(ret); - } else { - spin_unlock(&block_group->tree_lock); - if (!info) { - printk(KERN_ERR "couldn't find space %llu to free\n", - (unsigned long long)offset); - printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n", - block_group->cached, - (unsigned long long)block_group->key.objectid, - (unsigned long long)block_group->key.offset); - btrfs_dump_free_space(block_group, bytes); - } else if (info) { - printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, " - "but wanted offset=%llu bytes=%llu\n", - (unsigned long long)info->offset, - (unsigned long long)info->bytes, - (unsigned long long)offset, - (unsigned long long)bytes); - } - WARN_ON(1); + WARN_ON(ret); + goto out; } + + ret = remove_from_bitmap(block_group, info, &offset, &bytes); + if (ret == -EAGAIN) + goto again; + BUG_ON(ret); +out_lock: + spin_unlock(&block_group->tree_lock); out: return ret; } @@ -361,10 +799,13 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, info = rb_entry(n, struct btrfs_free_space, offset_index); if (info->bytes >= bytes) count++; - printk(KERN_ERR "entry offset %llu, bytes %llu\n", + printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n", (unsigned long long)info->offset, - (unsigned long long)info->bytes); + (unsigned long long)info->bytes, + (info->bitmap) ? "yes" : "no"); } + printk(KERN_INFO "block group has cluster?: %s\n", + list_empty(&block_group->cluster_list) ? "no" : "yes"); printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" "\n", count); } @@ -397,26 +838,35 @@ __btrfs_return_cluster_to_free_space( { struct btrfs_free_space *entry; struct rb_node *node; + bool bitmap; spin_lock(&cluster->lock); if (cluster->block_group != block_group) goto out; + bitmap = cluster->points_to_bitmap; + cluster->block_group = NULL; cluster->window_start = 0; + list_del_init(&cluster->block_group_list); + cluster->points_to_bitmap = false; + + if (bitmap) + goto out; + node = rb_first(&cluster->root); - while(node) { + while (node) { entry = rb_entry(node, struct btrfs_free_space, offset_index); node = rb_next(&entry->offset_index); rb_erase(&entry->offset_index, &cluster->root); - link_free_space(block_group, entry); + BUG_ON(entry->bitmap); + tree_insert_offset(&block_group->free_space_offset, + entry->offset, &entry->offset_index, 0); } - list_del_init(&cluster->block_group_list); - - btrfs_put_block_group(cluster->block_group); - cluster->block_group = NULL; cluster->root.rb_node = NULL; + out: spin_unlock(&cluster->lock); + btrfs_put_block_group(block_group); return 0; } @@ -425,20 +875,28 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) struct btrfs_free_space *info; struct rb_node *node; struct btrfs_free_cluster *cluster; - struct btrfs_free_cluster *safe; + struct list_head *head; spin_lock(&block_group->tree_lock); - - list_for_each_entry_safe(cluster, safe, &block_group->cluster_list, - block_group_list) { + while ((head = block_group->cluster_list.next) != + &block_group->cluster_list) { + cluster = list_entry(head, struct btrfs_free_cluster, + block_group_list); WARN_ON(cluster->block_group != block_group); __btrfs_return_cluster_to_free_space(block_group, cluster); + if (need_resched()) { + spin_unlock(&block_group->tree_lock); + cond_resched(); + spin_lock(&block_group->tree_lock); + } } - while ((node = rb_last(&block_group->free_space_bytes)) != NULL) { - info = rb_entry(node, struct btrfs_free_space, bytes_index); + while ((node = rb_last(&block_group->free_space_offset)) != NULL) { + info = rb_entry(node, struct btrfs_free_space, offset_index); unlink_free_space(block_group, info); + if (info->bitmap) + kfree(info->bitmap); kfree(info); if (need_resched()) { spin_unlock(&block_group->tree_lock); @@ -446,6 +904,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) spin_lock(&block_group->tree_lock); } } + spin_unlock(&block_group->tree_lock); } @@ -453,25 +912,35 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, u64 offset, u64 bytes, u64 empty_size) { struct btrfs_free_space *entry = NULL; + u64 bytes_search = bytes + empty_size; u64 ret = 0; spin_lock(&block_group->tree_lock); - entry = tree_search_offset(&block_group->free_space_offset, offset, - bytes + empty_size, 1); + entry = find_free_space(block_group, &offset, &bytes_search, 0); if (!entry) - entry = tree_search_bytes(&block_group->free_space_bytes, - offset, bytes + empty_size); - if (entry) { + goto out; + + ret = offset; + if (entry->bitmap) { + bitmap_clear_bits(block_group, entry, offset, bytes); + if (!entry->bytes) { + unlink_free_space(block_group, entry); + kfree(entry->bitmap); + kfree(entry); + block_group->total_bitmaps--; + recalculate_thresholds(block_group); + } + } else { unlink_free_space(block_group, entry); - ret = entry->offset; entry->offset += bytes; entry->bytes -= bytes; - if (!entry->bytes) kfree(entry); else link_free_space(block_group, entry); } + +out: spin_unlock(&block_group->tree_lock); return ret; @@ -517,6 +986,54 @@ int btrfs_return_cluster_to_free_space( return ret; } +static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, + struct btrfs_free_cluster *cluster, + u64 bytes, u64 min_start) +{ + struct btrfs_free_space *entry; + int err; + u64 search_start = cluster->window_start; + u64 search_bytes = bytes; + u64 ret = 0; + + spin_lock(&block_group->tree_lock); + spin_lock(&cluster->lock); + + if (!cluster->points_to_bitmap) + goto out; + + if (cluster->block_group != block_group) + goto out; + + /* + * search_start is the beginning of the bitmap, but at some point it may + * be a good idea to point to the actual start of the free area in the + * bitmap, so do the offset_to_bitmap trick anyway, and set bitmap_only + * to 1 to make sure we get the bitmap entry + */ + entry = tree_search_offset(block_group, + offset_to_bitmap(block_group, search_start), + 1, 0); + if (!entry || !entry->bitmap) + goto out; + + search_start = min_start; + search_bytes = bytes; + + err = search_bitmap(block_group, entry, &search_start, + &search_bytes); + if (err) + goto out; + + ret = search_start; + bitmap_clear_bits(block_group, entry, ret, bytes); +out: + spin_unlock(&cluster->lock); + spin_unlock(&block_group->tree_lock); + + return ret; +} + /* * given a cluster, try to allocate 'bytes' from it, returns 0 * if it couldn't find anything suitably large, or a logical disk offset @@ -530,6 +1047,10 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, struct rb_node *node; u64 ret = 0; + if (cluster->points_to_bitmap) + return btrfs_alloc_from_bitmap(block_group, cluster, bytes, + min_start); + spin_lock(&cluster->lock); if (bytes > cluster->max_size) goto out; @@ -567,9 +1088,73 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, } out: spin_unlock(&cluster->lock); + return ret; } +static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *entry, + struct btrfs_free_cluster *cluster, + u64 offset, u64 bytes, u64 min_bytes) +{ + unsigned long next_zero; + unsigned long i; + unsigned long search_bits; + unsigned long total_bits; + unsigned long found_bits; + unsigned long start = 0; + unsigned long total_found = 0; + bool found = false; + + i = offset_to_bit(entry->offset, block_group->sectorsize, + max_t(u64, offset, entry->offset)); + search_bits = bytes_to_bits(min_bytes, block_group->sectorsize); + total_bits = bytes_to_bits(bytes, block_group->sectorsize); + +again: + found_bits = 0; + for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i); + i < BITS_PER_BITMAP; + i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) { + next_zero = find_next_zero_bit(entry->bitmap, + BITS_PER_BITMAP, i); + if (next_zero - i >= search_bits) { + found_bits = next_zero - i; + break; + } + i = next_zero; + } + + if (!found_bits) + return -1; + + if (!found) { + start = i; + found = true; + } + + total_found += found_bits; + + if (cluster->max_size < found_bits * block_group->sectorsize) + cluster->max_size = found_bits * block_group->sectorsize; + + if (total_found < total_bits) { + i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero); + if (i - start > total_bits * 2) { + total_found = 0; + cluster->max_size = 0; + found = false; + } + goto again; + } + + cluster->window_start = start * block_group->sectorsize + + entry->offset; + cluster->points_to_bitmap = true; + + return 0; +} + /* * here we try to find a cluster of blocks in a block group. The goal * is to find at least bytes free and up to empty_size + bytes free. @@ -587,12 +1172,12 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, struct btrfs_free_space *entry = NULL; struct rb_node *node; struct btrfs_free_space *next; - struct btrfs_free_space *last; + struct btrfs_free_space *last = NULL; u64 min_bytes; u64 window_start; u64 window_free; u64 max_extent = 0; - int total_retries = 0; + bool found_bitmap = false; int ret; /* for metadata, allow allocates with more holes */ @@ -620,31 +1205,80 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, goto out; } again: - min_bytes = min(min_bytes, bytes + empty_size); - entry = tree_search_bytes(&block_group->free_space_bytes, - offset, min_bytes); + entry = tree_search_offset(block_group, offset, found_bitmap, 1); if (!entry) { ret = -ENOSPC; goto out; } + + /* + * If found_bitmap is true, we exhausted our search for extent entries, + * and we just want to search all of the bitmaps that we can find, and + * ignore any extent entries we find. + */ + while (entry->bitmap || found_bitmap || + (!entry->bitmap && entry->bytes < min_bytes)) { + struct rb_node *node = rb_next(&entry->offset_index); + + if (entry->bitmap && entry->bytes > bytes + empty_size) { + ret = btrfs_bitmap_cluster(block_group, entry, cluster, + offset, bytes + empty_size, + min_bytes); + if (!ret) + goto got_it; + } + + if (!node) { + ret = -ENOSPC; + goto out; + } + entry = rb_entry(node, struct btrfs_free_space, offset_index); + } + + /* + * We already searched all the extent entries from the passed in offset + * to the end and didn't find enough space for the cluster, and we also + * didn't find any bitmaps that met our criteria, just go ahead and exit + */ + if (found_bitmap) { + ret = -ENOSPC; + goto out; + } + + cluster->points_to_bitmap = false; window_start = entry->offset; window_free = entry->bytes; last = entry; max_extent = entry->bytes; - while(1) { + while (1) { /* out window is just right, lets fill it */ if (window_free >= bytes + empty_size) break; node = rb_next(&last->offset_index); if (!node) { + if (found_bitmap) + goto again; ret = -ENOSPC; goto out; } next = rb_entry(node, struct btrfs_free_space, offset_index); /* + * we found a bitmap, so if this search doesn't result in a + * cluster, we know to go and search again for the bitmaps and + * start looking for space there + */ + if (next->bitmap) { + if (!found_bitmap) + offset = next->offset; + found_bitmap = true; + last = next; + continue; + } + + /* * we haven't filled the empty size and the window is * very large. reset and try again */ @@ -655,19 +1289,6 @@ again: window_free = entry->bytes; last = entry; max_extent = 0; - total_retries++; - if (total_retries % 64 == 0) { - if (min_bytes >= (bytes + empty_size)) { - ret = -ENOSPC; - goto out; - } - /* - * grow our allocation a bit, we're not having - * much luck - */ - min_bytes *= 2; - goto again; - } } else { last = next; window_free += next->bytes; @@ -685,11 +1306,19 @@ again: * The cluster includes an rbtree, but only uses the offset index * of each free space cache entry. */ - while(1) { + while (1) { node = rb_next(&entry->offset_index); - unlink_free_space(block_group, entry); + if (entry->bitmap && node) { + entry = rb_entry(node, struct btrfs_free_space, + offset_index); + continue; + } else if (entry->bitmap && !node) { + break; + } + + rb_erase(&entry->offset_index, &block_group->free_space_offset); ret = tree_insert_offset(&cluster->root, entry->offset, - &entry->offset_index); + &entry->offset_index, 0); BUG_ON(ret); if (!node || entry == last) @@ -697,8 +1326,10 @@ again: entry = rb_entry(node, struct btrfs_free_space, offset_index); } - ret = 0; + cluster->max_size = max_extent; +got_it: + ret = 0; atomic_inc(&block_group->count); list_add_tail(&cluster->block_group_list, &block_group->cluster_list); cluster->block_group = block_group; @@ -718,6 +1349,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) spin_lock_init(&cluster->refill_lock); cluster->root.rb_node = NULL; cluster->max_size = 0; + cluster->points_to_bitmap = false; INIT_LIST_HEAD(&cluster->block_group_list); cluster->block_group = NULL; } diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 266fb8764054..890a8e79011b 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -19,6 +19,14 @@ #ifndef __BTRFS_FREE_SPACE_CACHE #define __BTRFS_FREE_SPACE_CACHE +struct btrfs_free_space { + struct rb_node offset_index; + u64 offset; + u64 bytes; + unsigned long *bitmap; + struct list_head list; +}; + int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, u64 bytenr, u64 size); int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 791eab19e330..59cba180fe83 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2603,8 +2603,8 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, if (root->ref_cows) btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); path = btrfs_alloc_path(); - path->reada = -1; BUG_ON(!path); + path->reada = -1; /* FIXME, add redo link to tree so we don't leak on crash */ key.objectid = inode->i_ino; @@ -3099,8 +3099,12 @@ static void inode_tree_add(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_inode *entry; - struct rb_node **p = &root->inode_tree.rb_node; - struct rb_node *parent = NULL; + struct rb_node **p; + struct rb_node *parent; + +again: + p = &root->inode_tree.rb_node; + parent = NULL; spin_lock(&root->inode_lock); while (*p) { @@ -3108,13 +3112,16 @@ static void inode_tree_add(struct inode *inode) entry = rb_entry(parent, struct btrfs_inode, rb_node); if (inode->i_ino < entry->vfs_inode.i_ino) - p = &(*p)->rb_left; + p = &parent->rb_left; else if (inode->i_ino > entry->vfs_inode.i_ino) - p = &(*p)->rb_right; + p = &parent->rb_right; else { WARN_ON(!(entry->vfs_inode.i_state & (I_WILL_FREE | I_FREEING | I_CLEAR))); - break; + rb_erase(parent, &root->inode_tree); + RB_CLEAR_NODE(parent); + spin_unlock(&root->inode_lock); + goto again; } } rb_link_node(&BTRFS_I(inode)->rb_node, parent, p); @@ -3126,12 +3133,12 @@ static void inode_tree_del(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; + spin_lock(&root->inode_lock); if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { - spin_lock(&root->inode_lock); rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); - spin_unlock(&root->inode_lock); RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); } + spin_unlock(&root->inode_lock); } static noinline void init_btrfs_i(struct inode *inode) @@ -4785,8 +4792,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, * and the replacement file is large. Start IO on it now so * we don't add too much work to the end of the transaction */ - if (new_inode && old_inode && S_ISREG(old_inode->i_mode) && - new_inode->i_size && + if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size && old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) filemap_flush(old_inode->i_mapping); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 6d6523da0a30..0d126be22b63 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -309,7 +309,7 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c) } printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n", (unsigned long long)btrfs_header_bytenr(c), - btrfs_header_level(c), nr, + level, nr, (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr); for (i = 0; i < nr; i++) { btrfs_node_key_to_cpu(c, &key, i); @@ -326,10 +326,10 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c) btrfs_level_size(root, level - 1), btrfs_node_ptr_generation(c, i)); if (btrfs_is_leaf(next) && - btrfs_header_level(c) != 1) + level != 1) BUG(); if (btrfs_header_level(next) != - btrfs_header_level(c) - 1) + level - 1) BUG(); btrfs_print_tree(root, next); free_extent_buffer(next); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 008397934778..c04f7f212602 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -670,6 +670,8 @@ again: err = ret; goto out; } + if (ret > 0 && path2->slots[level] > 0) + path2->slots[level]--; eb = path2->nodes[level]; WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) != @@ -1609,6 +1611,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, BUG_ON(level == 0); path->lowest_level = level; ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0); + path->lowest_level = 0; if (ret < 0) { btrfs_free_path(path); return ret; @@ -2550,8 +2553,13 @@ int relocate_inode_pages(struct inode *inode, u64 start, u64 len) last_index = (start + len - 1) >> PAGE_CACHE_SHIFT; /* make sure the dirty trick played by the caller work */ - ret = invalidate_inode_pages2_range(inode->i_mapping, - first_index, last_index); + while (1) { + ret = invalidate_inode_pages2_range(inode->i_mapping, + first_index, last_index); + if (ret != -EBUSY) + break; + schedule_timeout(HZ/10); + } if (ret) goto out_unlock; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2dbf1c1f56ee..cdbb5022da52 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -40,6 +40,12 @@ static noinline void put_transaction(struct btrfs_transaction *transaction) } } +static noinline void switch_commit_root(struct btrfs_root *root) +{ + free_extent_buffer(root->commit_root); + root->commit_root = btrfs_root_node(root); +} + /* * either allocate a new transaction or hop into the existing one */ @@ -444,9 +450,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, btrfs_write_dirty_block_groups(trans, root); - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(ret); - while (1) { old_root_bytenr = btrfs_root_bytenr(&root->root_item); if (old_root_bytenr == root->node->start) @@ -457,13 +460,14 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, &root->root_key, &root->root_item); BUG_ON(ret); - btrfs_write_dirty_block_groups(trans, root); - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + ret = btrfs_write_dirty_block_groups(trans, root); BUG_ON(ret); } - free_extent_buffer(root->commit_root); - root->commit_root = btrfs_root_node(root); + + if (root != root->fs_info->extent_root) + switch_commit_root(root); + return 0; } @@ -495,10 +499,12 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, root = list_entry(next, struct btrfs_root, dirty_list); update_cowonly_root(trans, root); - - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - BUG_ON(ret); } + + down_write(&fs_info->extent_commit_sem); + switch_commit_root(fs_info->extent_root); + up_write(&fs_info->extent_commit_sem); + return 0; } @@ -544,8 +550,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, btrfs_update_reloc_root(trans, root); if (root->commit_root != root->node) { - free_extent_buffer(root->commit_root); - root->commit_root = btrfs_root_node(root); + switch_commit_root(root); btrfs_set_root_node(&root->root_item, root->node); } @@ -852,6 +857,16 @@ static void update_super_roots(struct btrfs_root *root) super->root_level = root_item->level; } +int btrfs_transaction_in_commit(struct btrfs_fs_info *info) +{ + int ret = 0; + spin_lock(&info->new_trans_lock); + if (info->running_transaction) + ret = info->running_transaction->in_commit; + spin_unlock(&info->new_trans_lock); + return ret; +} + int btrfs_commit_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root) { @@ -943,9 +958,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, mutex_unlock(&root->fs_info->trans_mutex); - if (flush_on_commit || snap_pending) { - if (flush_on_commit) - btrfs_start_delalloc_inodes(root); + if (flush_on_commit) { + btrfs_start_delalloc_inodes(root); + ret = btrfs_wait_ordered_extents(root, 0); + BUG_ON(ret); + } else if (snap_pending) { ret = btrfs_wait_ordered_extents(root, 1); BUG_ON(ret); } @@ -1009,15 +1026,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, btrfs_set_root_node(&root->fs_info->tree_root->root_item, root->fs_info->tree_root->node); - free_extent_buffer(root->fs_info->tree_root->commit_root); - root->fs_info->tree_root->commit_root = - btrfs_root_node(root->fs_info->tree_root); + switch_commit_root(root->fs_info->tree_root); btrfs_set_root_node(&root->fs_info->chunk_root->root_item, root->fs_info->chunk_root->node); - free_extent_buffer(root->fs_info->chunk_root->commit_root); - root->fs_info->chunk_root->commit_root = - btrfs_root_node(root->fs_info->chunk_root); + switch_commit_root(root->fs_info->chunk_root); update_super_roots(root); @@ -1057,6 +1070,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, cur_trans->commit_done = 1; root->fs_info->last_trans_committed = cur_trans->transid; + wake_up(&cur_trans->commit_wait); put_transaction(cur_trans); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 961c3ee5a2e1..663c67404918 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -107,4 +107,5 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, struct extent_io_tree *dirty_pages); +int btrfs_transaction_in_commit(struct btrfs_fs_info *info); #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c13922206d1b..d91b0de7c502 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -797,7 +797,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, return -ENOENT; inode = read_one_inode(root, key->objectid); - BUG_ON(!dir); + BUG_ON(!inode); ref_ptr = btrfs_item_ptr_offset(eb, slot); ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3ab80e9cd767..5cf405b0828d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -260,7 +260,7 @@ loop_lock: num_run++; batch_run++; - if (bio_sync(cur)) + if (bio_rw_flagged(cur, BIO_RW_SYNCIO)) num_sync_run++; if (need_resched()) { @@ -721,7 +721,8 @@ error: */ static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, - u64 num_bytes, u64 *start) + u64 num_bytes, u64 *start, + u64 *max_avail) { struct btrfs_key key; struct btrfs_root *root = device->dev_root; @@ -758,9 +759,13 @@ static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, 0, 0); if (ret < 0) goto error; - ret = btrfs_previous_item(root, path, 0, key.type); - if (ret < 0) - goto error; + if (ret > 0) { + ret = btrfs_previous_item(root, path, key.objectid, key.type); + if (ret < 0) + goto error; + if (ret > 0) + start_found = 1; + } l = path->nodes[0]; btrfs_item_key_to_cpu(l, &key, path->slots[0]); while (1) { @@ -803,6 +808,10 @@ no_more_items: if (last_byte < search_start) last_byte = search_start; hole_size = key.offset - last_byte; + + if (hole_size > *max_avail) + *max_avail = hole_size; + if (key.offset > last_byte && hole_size >= num_bytes) { *start = last_byte; @@ -1621,6 +1630,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans, device->fs_devices->total_rw_bytes += diff; device->total_bytes = new_size; + device->disk_total_bytes = new_size; btrfs_clear_space_info_full(device->dev_root->fs_info); return btrfs_update_device(trans, device); @@ -2007,7 +2017,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) goto done; if (ret) { ret = 0; - goto done; + break; } l = path->nodes[0]; @@ -2015,7 +2025,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) btrfs_item_key_to_cpu(l, &key, path->slots[0]); if (key.objectid != device->devid) - goto done; + break; dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); length = btrfs_dev_extent_length(l, dev_extent); @@ -2171,6 +2181,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_chunk_size); again: + max_avail = 0; if (!map || map->num_stripes != num_stripes) { kfree(map); map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); @@ -2219,7 +2230,8 @@ again: if (device->in_fs_metadata && avail >= min_free) { ret = find_free_dev_extent(trans, device, - min_free, &dev_offset); + min_free, &dev_offset, + &max_avail); if (ret == 0) { list_move_tail(&device->dev_alloc_list, &private_devs); @@ -2795,26 +2807,6 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, } } - for (i = 0; i > nr; i++) { - struct btrfs_multi_bio *multi; - struct btrfs_bio_stripe *stripe; - int ret; - - length = 1; - ret = btrfs_map_block(map_tree, WRITE, buf[i], - &length, &multi, 0); - BUG_ON(ret); - - stripe = multi->stripes; - for (j = 0; j < multi->num_stripes; j++) { - if (stripe->physical >= physical && - physical < stripe->physical + length) - break; - } - BUG_ON(j >= multi->num_stripes); - kfree(multi); - } - *logical = buf; *naddrs = nr; *stripe_len = map->stripe_len; @@ -2911,7 +2903,7 @@ static noinline int schedule_bio(struct btrfs_root *root, bio->bi_rw |= rw; spin_lock(&device->io_lock); - if (bio_sync(bio)) + if (bio_rw_flagged(bio, BIO_RW_SYNCIO)) pending_bios = &device->pending_sync_bios; else pending_bios = &device->pending_bios; diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index ecfbce836d32..3e2b90eaa239 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -208,7 +208,7 @@ int btrfs_zlib_compress_pages(struct address_space *mapping, *total_in = 0; workspace = find_zlib_workspace(); - if (!workspace) + if (IS_ERR(workspace)) return -1; if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { @@ -366,7 +366,7 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in, char *kaddr; workspace = find_zlib_workspace(); - if (!workspace) + if (IS_ERR(workspace)) return -ENOMEM; data_in = kmap(pages_in[page_in_index]); @@ -547,7 +547,7 @@ int btrfs_zlib_decompress(unsigned char *data_in, return -ENOMEM; workspace = find_zlib_workspace(); - if (!workspace) + if (IS_ERR(workspace)) return -ENOMEM; workspace->inf_strm.next_in = data_in; diff --git a/fs/buffer.c b/fs/buffer.c index a3ef091a45bd..90a98865b0cc 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -281,7 +281,7 @@ static void free_more_memory(void) struct zone *zone; int nid; - wakeup_pdflush(1024); + wakeup_flusher_threads(1024); yield(); for_each_online_node(nid) { @@ -1165,8 +1165,11 @@ void mark_buffer_dirty(struct buffer_head *bh) if (!test_set_buffer_dirty(bh)) { struct page *page = bh->b_page; - if (!TestSetPageDirty(page)) - __set_page_dirty(page, page_mapping(page), 0); + if (!TestSetPageDirty(page)) { + struct address_space *mapping = page_mapping(page); + if (mapping) + __set_page_dirty(page, mapping, 0); + } } } diff --git a/fs/char_dev.c b/fs/char_dev.c index a173551e19d7..3cbc57f932d2 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -31,6 +31,7 @@ * - no readahead or I/O queue unplugging required */ struct backing_dev_info directly_mappable_cdev_bdi = { + .name = "char", .capabilities = ( #ifdef CONFIG_MMU /* permit private copies of the data to be taken */ @@ -237,8 +238,10 @@ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count, } /** - * register_chrdev() - Register a major number for character devices. + * __register_chrdev() - create and register a cdev occupying a range of minors * @major: major device number or 0 for dynamic allocation + * @baseminor: first of the requested range of minor numbers + * @count: the number of minor numbers required * @name: name of this range of devices * @fops: file operations associated with this devices * @@ -254,19 +257,17 @@ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count, * /dev. It only helps to keep track of the different owners of devices. If * your module name has only one type of devices it's ok to use e.g. the name * of the module here. - * - * This function registers a range of 256 minor numbers. The first minor number - * is 0. */ -int register_chrdev(unsigned int major, const char *name, - const struct file_operations *fops) +int __register_chrdev(unsigned int major, unsigned int baseminor, + unsigned int count, const char *name, + const struct file_operations *fops) { struct char_device_struct *cd; struct cdev *cdev; char *s; int err = -ENOMEM; - cd = __register_chrdev_region(major, 0, 256, name); + cd = __register_chrdev_region(major, baseminor, count, name); if (IS_ERR(cd)) return PTR_ERR(cd); @@ -280,7 +281,7 @@ int register_chrdev(unsigned int major, const char *name, for (s = strchr(kobject_name(&cdev->kobj),'/'); s; s = strchr(s, '/')) *s = '!'; - err = cdev_add(cdev, MKDEV(cd->major, 0), 256); + err = cdev_add(cdev, MKDEV(cd->major, baseminor), count); if (err) goto out; @@ -290,7 +291,7 @@ int register_chrdev(unsigned int major, const char *name, out: kobject_put(&cdev->kobj); out2: - kfree(__unregister_chrdev_region(cd->major, 0, 256)); + kfree(__unregister_chrdev_region(cd->major, baseminor, count)); return err; } @@ -316,10 +317,23 @@ void unregister_chrdev_region(dev_t from, unsigned count) } } -void unregister_chrdev(unsigned int major, const char *name) +/** + * __unregister_chrdev - unregister and destroy a cdev + * @major: major device number + * @baseminor: first of the range of minor numbers + * @count: the number of minor numbers this cdev is occupying + * @name: name of this range of devices + * + * Unregister and destroy the cdev occupying the region described by + * @major, @baseminor and @count. This function undoes what + * __register_chrdev() did. + */ +void __unregister_chrdev(unsigned int major, unsigned int baseminor, + unsigned int count, const char *name) { struct char_device_struct *cd; - cd = __unregister_chrdev_region(major, 0, 256); + + cd = __unregister_chrdev_region(major, baseminor, count); if (cd && cd->cdev) cdev_del(cd->cdev); kfree(cd); @@ -568,6 +582,6 @@ EXPORT_SYMBOL(cdev_alloc); EXPORT_SYMBOL(cdev_del); EXPORT_SYMBOL(cdev_add); EXPORT_SYMBOL(cdev_index); -EXPORT_SYMBOL(register_chrdev); -EXPORT_SYMBOL(unregister_chrdev); +EXPORT_SYMBOL(__register_chrdev); +EXPORT_SYMBOL(__unregister_chrdev); EXPORT_SYMBOL(directly_mappable_cdev_bdi); diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index 92888aa90749..145540a316ab 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -1,3 +1,13 @@ +Version 1.60 +------------- +Fix memory leak in reconnect. Fix oops in DFS mount error path. +Set s_maxbytes to smaller (the max that vfs can handle) so that +sendfile will now work over cifs mounts again. Add noforcegid +and noforceuid mount parameters. Fix small mem leak when using +ntlmv2. Fix 2nd mount to same server but with different port to +be allowed (rather than reusing the 1st port) - only when the +user explicitly overrides the port on the 2nd mount. + Version 1.59 ------------ Client uses server inode numbers (which are persistent) rather than diff --git a/fs/cifs/README b/fs/cifs/README index ad92921dbde4..79c1a93400be 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -262,11 +262,11 @@ A partial list of the supported mount options follows: mount. domain Set the SMB/CIFS workgroup name prepended to the username during CIFS session establishment - forceuid Set the default uid for inodes based on the uid - passed in. For mounts to servers + forceuid Set the default uid for inodes to the uid + passed in on mount. For mounts to servers which do support the CIFS Unix extensions, such as a properly configured Samba server, the server provides - the uid, gid and mode so this parameter should not be + the uid, gid and mode so this parameter should not be specified unless the server and clients uid and gid numbering differ. If the server and client are in the same domain (e.g. running winbind or nss_ldap) and @@ -278,11 +278,7 @@ A partial list of the supported mount options follows: of existing files will be the uid (gid) of the person who executed the mount (root, except when mount.cifs is configured setuid for user mounts) unless the "uid=" - (gid) mount option is specified. For the uid (gid) of newly - created files and directories, ie files created since - the last mount of the server share, the expected uid - (gid) is cached as long as the inode remains in - memory on the client. Also note that permission + (gid) mount option is specified. Also note that permission checks (authorization checks) on accesses to a file occur at the server, but there are cases in which an administrator may want to restrict at the client as well. For those @@ -290,12 +286,15 @@ A partial list of the supported mount options follows: (such as Windows), permissions can also be checked at the client, and a crude form of client side permission checking can be enabled by specifying file_mode and dir_mode on - the client. Note that the mount.cifs helper must be - at version 1.10 or higher to support specifying the uid - (or gid) in non-numeric form. - forcegid (similar to above but for the groupid instead of uid) + the client. (default) + forcegid (similar to above but for the groupid instead of uid) (default) + noforceuid Fill in file owner information (uid) by requesting it from + the server if possible. With this option, the value given in + the uid= option (on mount) will only be used if the server + can not support returning uids on inodes. + noforcegid (similar to above but for the group owner, gid, instead of uid) uid Set the default uid for inodes, and indicate to the - cifs kernel driver which local user mounted . If the server + cifs kernel driver which local user mounted. If the server supports the unix extensions the default uid is not used to fill in the owner fields of inodes (files) unless the "forceuid" parameter is specified. diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 3bb11be8b6a8..606912d8f2a8 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -55,7 +55,7 @@ void cifs_dfs_release_automount_timer(void) * i.e. strips from UNC trailing path that is not part of share * name and fixup missing '\' in the begining of DFS node refferal * if neccessary. - * Returns pointer to share name on success or NULL on error. + * Returns pointer to share name on success or ERR_PTR on error. * Caller is responsible for freeing returned string. */ static char *cifs_get_share_name(const char *node_name) @@ -68,7 +68,7 @@ static char *cifs_get_share_name(const char *node_name) UNC = kmalloc(len+2 /*for term null and additional \ if it's missed */, GFP_KERNEL); if (!UNC) - return NULL; + return ERR_PTR(-ENOMEM); /* get share name and server name */ if (node_name[1] != '\\') { @@ -87,7 +87,7 @@ static char *cifs_get_share_name(const char *node_name) cERROR(1, ("%s: no server name end in node name: %s", __func__, node_name)); kfree(UNC); - return NULL; + return ERR_PTR(-EINVAL); } /* find sharename end */ @@ -133,6 +133,12 @@ char *cifs_compose_mount_options(const char *sb_mountdata, return ERR_PTR(-EINVAL); *devname = cifs_get_share_name(ref->node_name); + if (IS_ERR(*devname)) { + rc = PTR_ERR(*devname); + *devname = NULL; + goto compose_mount_options_err; + } + rc = dns_resolve_server_name_to_ip(*devname, &srvIP); if (rc != 0) { cERROR(1, ("%s: Failed to resolve server part of %s to IP: %d", diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 051caecf7d67..8ec7736ce954 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -125,7 +125,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo) if (server->addr.sockAddr.sin_family == AF_INET) sprintf(dp, "ip4=%pI4", &server->addr.sockAddr.sin_addr); else if (server->addr.sockAddr.sin_family == AF_INET6) - sprintf(dp, "ip6=%pi6", &server->addr.sockAddr6.sin6_addr); + sprintf(dp, "ip6=%pI6", &server->addr.sockAddr6.sin6_addr); else goto out; diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index 60e3c4253de0..714a542cbafc 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -44,7 +44,7 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes, int maxwords = maxbytes / 2; char tmp[NLS_MAX_CHARSET_SIZE]; - for (i = 0; from[i] && i < maxwords; i++) { + for (i = 0; i < maxwords && from[i]; i++) { charlen = codepage->uni2char(le16_to_cpu(from[i]), tmp, NLS_MAX_CHARSET_SIZE); if (charlen > 0) diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 6941c22398a6..7dfe0842a6f6 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -607,7 +607,7 @@ static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, return get_cifs_acl_by_path(cifs_sb, path, pacllen); pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->netfid, pacllen); - atomic_dec(&open_file->wrtPending); + cifsFileInfo_put(open_file); return pntsd; } @@ -665,7 +665,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen); rc = set_cifs_acl_by_fid(cifs_sb, open_file->netfid, pnntsd, acllen); - atomic_dec(&open_file->wrtPending); + cifsFileInfo_put(open_file); return rc; } diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 7c9809523f42..7efe1745494d 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -373,6 +373,7 @@ calc_exit_2: compare with the NTLM example */ hmac_md5_final(ses->server->ntlmv2_hash, pctxt); + kfree(pctxt); return rc; } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 44f30504b82d..3610e9958b4c 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -361,13 +361,10 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server) static int cifs_show_options(struct seq_file *s, struct vfsmount *m) { - struct cifs_sb_info *cifs_sb; - struct cifsTconInfo *tcon; - - cifs_sb = CIFS_SB(m->mnt_sb); - tcon = cifs_sb->tcon; + struct cifs_sb_info *cifs_sb = CIFS_SB(m->mnt_sb); + struct cifsTconInfo *tcon = cifs_sb->tcon; - seq_printf(s, ",unc=%s", cifs_sb->tcon->treeName); + seq_printf(s, ",unc=%s", tcon->treeName); if (tcon->ses->userName) seq_printf(s, ",username=%s", tcon->ses->userName); if (tcon->ses->domainName) @@ -376,10 +373,14 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m) seq_printf(s, ",uid=%d", cifs_sb->mnt_uid); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) seq_printf(s, ",forceuid"); + else + seq_printf(s, ",noforceuid"); seq_printf(s, ",gid=%d", cifs_sb->mnt_gid); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) seq_printf(s, ",forcegid"); + else + seq_printf(s, ",noforcegid"); cifs_show_address(s, tcon->ses->server); @@ -985,19 +986,19 @@ static int cifs_oplock_thread(void *dummyarg) if (try_to_freeze()) continue; - spin_lock(&GlobalMid_Lock); - if (list_empty(&GlobalOplock_Q)) { - spin_unlock(&GlobalMid_Lock); + spin_lock(&cifs_oplock_lock); + if (list_empty(&cifs_oplock_list)) { + spin_unlock(&cifs_oplock_lock); set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(39*HZ); } else { - oplock_item = list_entry(GlobalOplock_Q.next, + oplock_item = list_entry(cifs_oplock_list.next, struct oplock_q_entry, qhead); cFYI(1, ("found oplock item to write out")); pTcon = oplock_item->tcon; inode = oplock_item->pinode; netfid = oplock_item->netfid; - spin_unlock(&GlobalMid_Lock); + spin_unlock(&cifs_oplock_lock); DeleteOplockQEntry(oplock_item); /* can not grab inode sem here since it would deadlock when oplock received on delete @@ -1054,7 +1055,7 @@ init_cifs(void) int rc = 0; cifs_proc_init(); INIT_LIST_HEAD(&cifs_tcp_ses_list); - INIT_LIST_HEAD(&GlobalOplock_Q); + INIT_LIST_HEAD(&cifs_oplock_list); #ifdef CONFIG_CIFS_EXPERIMENTAL INIT_LIST_HEAD(&GlobalDnotifyReqList); INIT_LIST_HEAD(&GlobalDnotifyRsp_Q); @@ -1083,6 +1084,7 @@ init_cifs(void) rwlock_init(&GlobalSMBSeslock); rwlock_init(&cifs_tcp_ses_lock); spin_lock_init(&GlobalMid_Lock); + spin_lock_init(&cifs_oplock_lock); if (cifs_max_pending < 2) { cifs_max_pending = 2; diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 6c170948300d..094325e3f714 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -113,5 +113,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* EXPERIMENTAL */ -#define CIFS_VERSION "1.60" +#define CIFS_VERSION "1.61" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 6084d6379c03..6cfc81a32703 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -351,11 +351,24 @@ struct cifsFileInfo { bool closePend:1; /* file is marked to close */ bool invalidHandle:1; /* file closed via session abend */ bool messageMode:1; /* for pipes: message vs byte mode */ - atomic_t wrtPending; /* handle in use - defer close */ + atomic_t count; /* reference count */ struct mutex fh_mutex; /* prevents reopen race after dead ses*/ struct cifs_search_info srch_inf; }; +/* Take a reference on the file private data */ +static inline void cifsFileInfo_get(struct cifsFileInfo *cifs_file) +{ + atomic_inc(&cifs_file->count); +} + +/* Release a reference on the file private data */ +static inline void cifsFileInfo_put(struct cifsFileInfo *cifs_file) +{ + if (atomic_dec_and_test(&cifs_file->count)) + kfree(cifs_file); +} + /* * One of these for each file inode */ @@ -656,7 +669,11 @@ GLOBAL_EXTERN rwlock_t cifs_tcp_ses_lock; */ GLOBAL_EXTERN rwlock_t GlobalSMBSeslock; -GLOBAL_EXTERN struct list_head GlobalOplock_Q; +/* Global list of oplocks */ +GLOBAL_EXTERN struct list_head cifs_oplock_list; + +/* Protects the cifs_oplock_list */ +GLOBAL_EXTERN spinlock_t cifs_oplock_lock; /* Outstanding dir notify requests */ GLOBAL_EXTERN struct list_head GlobalDnotifyReqList; diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 1866bc2927d4..301e307e1279 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -100,110 +100,138 @@ static void mark_open_files_invalid(struct cifsTconInfo *pTcon) to this tcon */ } -/* Allocate and return pointer to an SMB request buffer, and set basic - SMB information in the SMB header. If the return code is zero, this - function must have filled in request_buf pointer */ +/* reconnect the socket, tcon, and smb session if needed */ static int -small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, - void **request_buf) +cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command) { int rc = 0; + struct cifsSesInfo *ses; + struct TCP_Server_Info *server; + struct nls_table *nls_codepage; - /* SMBs NegProt, SessSetup, uLogoff do not have tcon yet so - check for tcp and smb session status done differently - for those three - in the calling routine */ - if (tcon) { - if (tcon->tidStatus == CifsExiting) { - /* only tree disconnect, open, and write, - (and ulogoff which does not have tcon) - are allowed as we start force umount */ - if ((smb_command != SMB_COM_WRITE_ANDX) && - (smb_command != SMB_COM_OPEN_ANDX) && - (smb_command != SMB_COM_TREE_DISCONNECT)) { - cFYI(1, ("can not send cmd %d while umounting", - smb_command)); - return -ENODEV; - } + /* + * SMBs NegProt, SessSetup, uLogoff do not have tcon yet so check for + * tcp and smb session status done differently for those three - in the + * calling routine + */ + if (!tcon) + return 0; + + ses = tcon->ses; + server = ses->server; + + /* + * only tree disconnect, open, and write, (and ulogoff which does not + * have tcon) are allowed as we start force umount + */ + if (tcon->tidStatus == CifsExiting) { + if (smb_command != SMB_COM_WRITE_ANDX && + smb_command != SMB_COM_OPEN_ANDX && + smb_command != SMB_COM_TREE_DISCONNECT) { + cFYI(1, ("can not send cmd %d while umounting", + smb_command)); + return -ENODEV; } - if ((tcon->ses) && (tcon->ses->status != CifsExiting) && - (tcon->ses->server)) { - struct nls_table *nls_codepage; - /* Give Demultiplex thread up to 10 seconds to - reconnect, should be greater than cifs socket - timeout which is 7 seconds */ - while (tcon->ses->server->tcpStatus == - CifsNeedReconnect) { - wait_event_interruptible_timeout(tcon->ses->server->response_q, - (tcon->ses->server->tcpStatus == - CifsGood), 10 * HZ); - if (tcon->ses->server->tcpStatus == - CifsNeedReconnect) { - /* on "soft" mounts we wait once */ - if (!tcon->retry || - (tcon->ses->status == CifsExiting)) { - cFYI(1, ("gave up waiting on " - "reconnect in smb_init")); - return -EHOSTDOWN; - } /* else "hard" mount - keep retrying - until process is killed or server - comes back on-line */ - } else /* TCP session is reestablished now */ - break; - } + } - nls_codepage = load_nls_default(); - /* need to prevent multiple threads trying to - simultaneously reconnect the same SMB session */ - down(&tcon->ses->sesSem); - if (tcon->ses->need_reconnect) - rc = cifs_setup_session(0, tcon->ses, - nls_codepage); - if (!rc && (tcon->need_reconnect)) { - mark_open_files_invalid(tcon); - rc = CIFSTCon(0, tcon->ses, tcon->treeName, - tcon, nls_codepage); - up(&tcon->ses->sesSem); - /* BB FIXME add code to check if wsize needs - update due to negotiated smb buffer size - shrinking */ - if (rc == 0) { - atomic_inc(&tconInfoReconnectCount); - /* tell server Unix caps we support */ - if (tcon->ses->capabilities & CAP_UNIX) - reset_cifs_unix_caps( - 0 /* no xid */, - tcon, - NULL /* we do not know sb */, - NULL /* no vol info */); - } + if (ses->status == CifsExiting) + return -EIO; - cFYI(1, ("reconnect tcon rc = %d", rc)); - /* Removed call to reopen open files here. - It is safer (and faster) to reopen files - one at a time as needed in read and write */ - - /* Check if handle based operation so we - know whether we can continue or not without - returning to caller to reset file handle */ - switch (smb_command) { - case SMB_COM_READ_ANDX: - case SMB_COM_WRITE_ANDX: - case SMB_COM_CLOSE: - case SMB_COM_FIND_CLOSE2: - case SMB_COM_LOCKING_ANDX: { - unload_nls(nls_codepage); - return -EAGAIN; - } - } - } else { - up(&tcon->ses->sesSem); - } - unload_nls(nls_codepage); + /* + * Give demultiplex thread up to 10 seconds to reconnect, should be + * greater than cifs socket timeout which is 7 seconds + */ + while (server->tcpStatus == CifsNeedReconnect) { + wait_event_interruptible_timeout(server->response_q, + (server->tcpStatus == CifsGood), 10 * HZ); - } else { - return -EIO; + /* is TCP session is reestablished now ?*/ + if (server->tcpStatus != CifsNeedReconnect) + break; + + /* + * on "soft" mounts we wait once. Hard mounts keep + * retrying until process is killed or server comes + * back on-line + */ + if (!tcon->retry || ses->status == CifsExiting) { + cFYI(1, ("gave up waiting on reconnect in smb_init")); + return -EHOSTDOWN; } } + + if (!ses->need_reconnect && !tcon->need_reconnect) + return 0; + + nls_codepage = load_nls_default(); + + /* + * need to prevent multiple threads trying to simultaneously + * reconnect the same SMB session + */ + down(&ses->sesSem); + if (ses->need_reconnect) + rc = cifs_setup_session(0, ses, nls_codepage); + + /* do we need to reconnect tcon? */ + if (rc || !tcon->need_reconnect) { + up(&ses->sesSem); + goto out; + } + + mark_open_files_invalid(tcon); + rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage); + up(&ses->sesSem); + cFYI(1, ("reconnect tcon rc = %d", rc)); + + if (rc) + goto out; + + /* + * FIXME: check if wsize needs updated due to negotiated smb buffer + * size shrinking + */ + atomic_inc(&tconInfoReconnectCount); + + /* tell server Unix caps we support */ + if (ses->capabilities & CAP_UNIX) + reset_cifs_unix_caps(0, tcon, NULL, NULL); + + /* + * Removed call to reopen open files here. It is safer (and faster) to + * reopen files one at a time as needed in read and write. + * + * FIXME: what about file locks? don't we need to reclaim them ASAP? + */ + +out: + /* + * Check if handle based operation so we know whether we can continue + * or not without returning to caller to reset file handle + */ + switch (smb_command) { + case SMB_COM_READ_ANDX: + case SMB_COM_WRITE_ANDX: + case SMB_COM_CLOSE: + case SMB_COM_FIND_CLOSE2: + case SMB_COM_LOCKING_ANDX: + rc = -EAGAIN; + } + + unload_nls(nls_codepage); + return rc; +} + +/* Allocate and return pointer to an SMB request buffer, and set basic + SMB information in the SMB header. If the return code is zero, this + function must have filled in request_buf pointer */ +static int +small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, + void **request_buf) +{ + int rc = 0; + + rc = cifs_reconnect_tcon(tcon, smb_command); if (rc) return rc; @@ -256,101 +284,7 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, { int rc = 0; - /* SMBs NegProt, SessSetup, uLogoff do not have tcon yet so - check for tcp and smb session status done differently - for those three - in the calling routine */ - if (tcon) { - if (tcon->tidStatus == CifsExiting) { - /* only tree disconnect, open, and write, - (and ulogoff which does not have tcon) - are allowed as we start force umount */ - if ((smb_command != SMB_COM_WRITE_ANDX) && - (smb_command != SMB_COM_OPEN_ANDX) && - (smb_command != SMB_COM_TREE_DISCONNECT)) { - cFYI(1, ("can not send cmd %d while umounting", - smb_command)); - return -ENODEV; - } - } - - if ((tcon->ses) && (tcon->ses->status != CifsExiting) && - (tcon->ses->server)) { - struct nls_table *nls_codepage; - /* Give Demultiplex thread up to 10 seconds to - reconnect, should be greater than cifs socket - timeout which is 7 seconds */ - while (tcon->ses->server->tcpStatus == - CifsNeedReconnect) { - wait_event_interruptible_timeout(tcon->ses->server->response_q, - (tcon->ses->server->tcpStatus == - CifsGood), 10 * HZ); - if (tcon->ses->server->tcpStatus == - CifsNeedReconnect) { - /* on "soft" mounts we wait once */ - if (!tcon->retry || - (tcon->ses->status == CifsExiting)) { - cFYI(1, ("gave up waiting on " - "reconnect in smb_init")); - return -EHOSTDOWN; - } /* else "hard" mount - keep retrying - until process is killed or server - comes on-line */ - } else /* TCP session is reestablished now */ - break; - } - nls_codepage = load_nls_default(); - /* need to prevent multiple threads trying to - simultaneously reconnect the same SMB session */ - down(&tcon->ses->sesSem); - if (tcon->ses->need_reconnect) - rc = cifs_setup_session(0, tcon->ses, - nls_codepage); - if (!rc && (tcon->need_reconnect)) { - mark_open_files_invalid(tcon); - rc = CIFSTCon(0, tcon->ses, tcon->treeName, - tcon, nls_codepage); - up(&tcon->ses->sesSem); - /* BB FIXME add code to check if wsize needs - update due to negotiated smb buffer size - shrinking */ - if (rc == 0) { - atomic_inc(&tconInfoReconnectCount); - /* tell server Unix caps we support */ - if (tcon->ses->capabilities & CAP_UNIX) - reset_cifs_unix_caps( - 0 /* no xid */, - tcon, - NULL /* do not know sb */, - NULL /* no vol info */); - } - - cFYI(1, ("reconnect tcon rc = %d", rc)); - /* Removed call to reopen open files here. - It is safer (and faster) to reopen files - one at a time as needed in read and write */ - - /* Check if handle based operation so we - know whether we can continue or not without - returning to caller to reset file handle */ - switch (smb_command) { - case SMB_COM_READ_ANDX: - case SMB_COM_WRITE_ANDX: - case SMB_COM_CLOSE: - case SMB_COM_FIND_CLOSE2: - case SMB_COM_LOCKING_ANDX: { - unload_nls(nls_codepage); - return -EAGAIN; - } - } - } else { - up(&tcon->ses->sesSem); - } - unload_nls(nls_codepage); - - } else { - return -EIO; - } - } + rc = cifs_reconnect_tcon(tcon, smb_command); if (rc) return rc; @@ -3961,6 +3895,10 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr, if (is_unicode) { __le16 *tmp = kmalloc(strlen(searchName)*2 + 2, GFP_KERNEL); + if (tmp == NULL) { + rc = -ENOMEM; + goto parse_DFS_referrals_exit; + } cifsConvertToUCS((__le16 *) tmp, searchName, PATH_MAX, nls_codepage, remap); node->path_consumed = cifs_ucs2_bytes(tmp, diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 9bb5c8750736..d49682433c20 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -803,6 +803,10 @@ cifs_parse_mount_options(char *options, const char *devname, char *data; unsigned int temp_len, i, j; char separator[2]; + short int override_uid = -1; + short int override_gid = -1; + bool uid_specified = false; + bool gid_specified = false; separator[0] = ','; separator[1] = 0; @@ -1093,18 +1097,20 @@ cifs_parse_mount_options(char *options, const char *devname, "too long.\n"); return 1; } - } else if (strnicmp(data, "uid", 3) == 0) { - if (value && *value) - vol->linux_uid = - simple_strtoul(value, &value, 0); - } else if (strnicmp(data, "forceuid", 8) == 0) { - vol->override_uid = 1; - } else if (strnicmp(data, "gid", 3) == 0) { - if (value && *value) - vol->linux_gid = - simple_strtoul(value, &value, 0); - } else if (strnicmp(data, "forcegid", 8) == 0) { - vol->override_gid = 1; + } else if (!strnicmp(data, "uid", 3) && value && *value) { + vol->linux_uid = simple_strtoul(value, &value, 0); + uid_specified = true; + } else if (!strnicmp(data, "forceuid", 8)) { + override_uid = 1; + } else if (!strnicmp(data, "noforceuid", 10)) { + override_uid = 0; + } else if (!strnicmp(data, "gid", 3) && value && *value) { + vol->linux_gid = simple_strtoul(value, &value, 0); + gid_specified = true; + } else if (!strnicmp(data, "forcegid", 8)) { + override_gid = 1; + } else if (!strnicmp(data, "noforcegid", 10)) { + override_gid = 0; } else if (strnicmp(data, "file_mode", 4) == 0) { if (value && *value) { vol->file_mode = @@ -1355,11 +1361,23 @@ cifs_parse_mount_options(char *options, const char *devname, if (vol->UNCip == NULL) vol->UNCip = &vol->UNC[2]; + if (uid_specified) + vol->override_uid = override_uid; + else if (override_uid == 1) + printk(KERN_NOTICE "CIFS: ignoring forceuid mount option " + "specified with no uid= option.\n"); + + if (gid_specified) + vol->override_gid = override_gid; + else if (override_gid == 1) + printk(KERN_NOTICE "CIFS: ignoring forcegid mount option " + "specified with no gid= option.\n"); + return 0; } static struct TCP_Server_Info * -cifs_find_tcp_session(struct sockaddr_storage *addr) +cifs_find_tcp_session(struct sockaddr_storage *addr, unsigned short int port) { struct list_head *tmp; struct TCP_Server_Info *server; @@ -1379,16 +1397,37 @@ cifs_find_tcp_session(struct sockaddr_storage *addr) if (server->tcpStatus == CifsNew) continue; - if (addr->ss_family == AF_INET && - (addr4->sin_addr.s_addr != - server->addr.sockAddr.sin_addr.s_addr)) - continue; - else if (addr->ss_family == AF_INET6 && - (!ipv6_addr_equal(&server->addr.sockAddr6.sin6_addr, - &addr6->sin6_addr) || - server->addr.sockAddr6.sin6_scope_id != - addr6->sin6_scope_id)) - continue; + switch (addr->ss_family) { + case AF_INET: + if (addr4->sin_addr.s_addr == + server->addr.sockAddr.sin_addr.s_addr) { + addr4->sin_port = htons(port); + /* user overrode default port? */ + if (addr4->sin_port) { + if (addr4->sin_port != + server->addr.sockAddr.sin_port) + continue; + } + break; + } else + continue; + + case AF_INET6: + if (ipv6_addr_equal(&addr6->sin6_addr, + &server->addr.sockAddr6.sin6_addr) && + (addr6->sin6_scope_id == + server->addr.sockAddr6.sin6_scope_id)) { + addr6->sin6_port = htons(port); + /* user overrode default port? */ + if (addr6->sin6_port) { + if (addr6->sin6_port != + server->addr.sockAddr6.sin6_port) + continue; + } + break; + } else + continue; + } ++server->srv_count; write_unlock(&cifs_tcp_ses_lock); @@ -1457,7 +1496,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) } /* see if we already have a matching tcp_ses */ - tcp_ses = cifs_find_tcp_session(&addr); + tcp_ses = cifs_find_tcp_session(&addr, volume_info->port); if (tcp_ses) return tcp_ses; @@ -2452,10 +2491,10 @@ try_mount_again: tcon->local_lease = volume_info->local_lease; } if (pSesInfo) { - if (pSesInfo->capabilities & CAP_LARGE_FILES) { - sb->s_maxbytes = (u64) 1 << 63; - } else - sb->s_maxbytes = (u64) 1 << 31; /* 2 GB */ + if (pSesInfo->capabilities & CAP_LARGE_FILES) + sb->s_maxbytes = MAX_LFS_FILESIZE; + else + sb->s_maxbytes = MAX_NON_LFS; } /* BB FIXME fix time_gran to be larger for LANMAN sessions */ @@ -2544,11 +2583,20 @@ remote_path_check: if (mount_data != mount_data_global) kfree(mount_data); + mount_data = cifs_compose_mount_options( cifs_sb->mountdata, full_path + 1, referrals, &fake_devname); - kfree(fake_devname); + free_dfs_info_array(referrals, num_referrals); + kfree(fake_devname); + kfree(full_path); + + if (IS_ERR(mount_data)) { + rc = PTR_ERR(mount_data); + mount_data = NULL; + goto mount_fail_check; + } if (tcon) cifs_put_tcon(tcon); @@ -2556,8 +2604,6 @@ remote_path_check: cifs_put_smb_ses(pSesInfo); cleanup_volume_info(&volume_info); - FreeXid(xid); - kfree(full_path); referral_walks_count++; goto try_mount_again; } @@ -2611,9 +2657,9 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, return -EIO; smb_buffer = cifs_buf_get(); - if (smb_buffer == NULL) { + if (smb_buffer == NULL) return -ENOMEM; - } + smb_buffer_response = smb_buffer; header_assemble(smb_buffer, SMB_COM_TREE_CONNECT_ANDX, diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 4326ffd90fa9..a6424cfc0121 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -153,7 +153,7 @@ cifs_fill_fileinfo(struct inode *newinode, __u16 fileHandle, mutex_init(&pCifsFile->fh_mutex); mutex_init(&pCifsFile->lock_mutex); INIT_LIST_HEAD(&pCifsFile->llist); - atomic_set(&pCifsFile->wrtPending, 0); + atomic_set(&pCifsFile->count, 1); /* set the following in open now pCifsFile->pfile = file; */ diff --git a/fs/cifs/file.c b/fs/cifs/file.c index c34b7f8a217b..fa7beac8b80e 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -53,11 +53,9 @@ static inline struct cifsFileInfo *cifs_init_private( private_data->pInode = inode; private_data->invalidHandle = false; private_data->closePend = false; - /* we have to track num writers to the inode, since writepages - does not tell us which handle the write is for so there can - be a close (overlapping with write) of the filehandle that - cifs_writepages chose to use */ - atomic_set(&private_data->wrtPending, 0); + /* Initialize reference count to one. The private data is + freed on the release of the last reference */ + atomic_set(&private_data->count, 1); return private_data; } @@ -643,7 +641,7 @@ int cifs_close(struct inode *inode, struct file *file) if (!pTcon->need_reconnect) { write_unlock(&GlobalSMBSeslock); timeout = 2; - while ((atomic_read(&pSMBFile->wrtPending) != 0) + while ((atomic_read(&pSMBFile->count) != 1) && (timeout <= 2048)) { /* Give write a better chance to get to server ahead of the close. We do not @@ -657,8 +655,6 @@ int cifs_close(struct inode *inode, struct file *file) msleep(timeout); timeout *= 4; } - if (atomic_read(&pSMBFile->wrtPending)) - cERROR(1, ("close with pending write")); if (!pTcon->need_reconnect && !pSMBFile->invalidHandle) rc = CIFSSMBClose(xid, pTcon, @@ -681,24 +677,7 @@ int cifs_close(struct inode *inode, struct file *file) list_del(&pSMBFile->flist); list_del(&pSMBFile->tlist); write_unlock(&GlobalSMBSeslock); - timeout = 10; - /* We waited above to give the SMBWrite a chance to issue - on the wire (so we do not get SMBWrite returning EBADF - if writepages is racing with close. Note that writepages - does not specify a file handle, so it is possible for a file - to be opened twice, and the application close the "wrong" - file handle - in these cases we delay long enough to allow - the SMBWrite to get on the wire before the SMB Close. - We allow total wait here over 45 seconds, more than - oplock break time, and more than enough to allow any write - to complete on the server, or to time out on the client */ - while ((atomic_read(&pSMBFile->wrtPending) != 0) - && (timeout <= 50000)) { - cERROR(1, ("writes pending, delay free of handle")); - msleep(timeout); - timeout *= 8; - } - kfree(file->private_data); + cifsFileInfo_put(file->private_data); file->private_data = NULL; } else rc = -EBADF; @@ -1236,7 +1215,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode) if (!open_file->invalidHandle) { /* found a good file */ /* lock it so it will not be closed on us */ - atomic_inc(&open_file->wrtPending); + cifsFileInfo_get(open_file); read_unlock(&GlobalSMBSeslock); return open_file; } /* else might as well continue, and look for @@ -1276,7 +1255,7 @@ refind_writable: if (open_file->pfile && ((open_file->pfile->f_flags & O_RDWR) || (open_file->pfile->f_flags & O_WRONLY))) { - atomic_inc(&open_file->wrtPending); + cifsFileInfo_get(open_file); if (!open_file->invalidHandle) { /* found a good writable file */ @@ -1293,7 +1272,7 @@ refind_writable: else { /* start over in case this was deleted */ /* since the list could be modified */ read_lock(&GlobalSMBSeslock); - atomic_dec(&open_file->wrtPending); + cifsFileInfo_put(open_file); goto refind_writable; } } @@ -1309,7 +1288,7 @@ refind_writable: read_lock(&GlobalSMBSeslock); /* can not use this handle, no write pending on this one after all */ - atomic_dec(&open_file->wrtPending); + cifsFileInfo_put(open_file); if (open_file->closePend) /* list could have changed */ goto refind_writable; @@ -1373,7 +1352,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) if (open_file) { bytes_written = cifs_write(open_file->pfile, write_data, to-from, &offset); - atomic_dec(&open_file->wrtPending); + cifsFileInfo_put(open_file); /* Does mm or vfs already set times? */ inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb); if ((bytes_written > 0) && (offset)) @@ -1562,7 +1541,7 @@ retry: bytes_to_write, offset, &bytes_written, iov, n_iov, long_op); - atomic_dec(&open_file->wrtPending); + cifsFileInfo_put(open_file); cifs_update_eof(cifsi, offset, bytes_written); if (rc || bytes_written < bytes_to_write) { diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 18afe57b2461..1f09c7619319 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -212,7 +212,7 @@ cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info, * junction to the new submount (ie to setup the fake directory * which represents a DFS referral). */ -void +static void cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb) { struct cifs_sb_info *cifs_sb = CIFS_SB(sb); @@ -388,7 +388,7 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path, } /* Fill a cifs_fattr struct with info from FILE_ALL_INFO */ -void +static void cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, struct cifs_sb_info *cifs_sb, bool adjust_tz) { @@ -513,9 +513,12 @@ int cifs_get_inode_info(struct inode **pinode, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc1) { - /* BB EOPNOSUPP disable SERVER_INUM? */ cFYI(1, ("GetSrvInodeNum rc %d", rc1)); fattr.cf_uniqueid = iunique(sb, ROOT_I); + /* disable serverino if call not supported */ + if (rc1 == -EINVAL) + cifs_sb->mnt_cifs_flags &= + ~CIFS_MOUNT_SERVER_INUM; } } else { fattr.cf_uniqueid = iunique(sb, ROOT_I); @@ -797,7 +800,7 @@ set_via_filehandle: if (open_file == NULL) CIFSSMBClose(xid, pTcon, netfid); else - atomic_dec(&open_file->wrtPending); + cifsFileInfo_put(open_file); out: return rc; } @@ -1632,7 +1635,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs, __u32 npid = open_file->pid; rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid, npid, false); - atomic_dec(&open_file->wrtPending); + cifsFileInfo_put(open_file); cFYI(1, ("SetFSize for attrs rc = %d", rc)); if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { unsigned int bytes_written; @@ -1787,7 +1790,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) u16 nfid = open_file->netfid; u32 npid = open_file->pid; rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid); - atomic_dec(&open_file->wrtPending); + cifsFileInfo_put(open_file); } else { rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args, cifs_sb->local_nls, diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 0ad3e2d116a6..1da4ab250eae 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -119,20 +119,19 @@ AllocOplockQEntry(struct inode *pinode, __u16 fid, struct cifsTconInfo *tcon) temp->pinode = pinode; temp->tcon = tcon; temp->netfid = fid; - spin_lock(&GlobalMid_Lock); - list_add_tail(&temp->qhead, &GlobalOplock_Q); - spin_unlock(&GlobalMid_Lock); + spin_lock(&cifs_oplock_lock); + list_add_tail(&temp->qhead, &cifs_oplock_list); + spin_unlock(&cifs_oplock_lock); } return temp; - } void DeleteOplockQEntry(struct oplock_q_entry *oplockEntry) { - spin_lock(&GlobalMid_Lock); + spin_lock(&cifs_oplock_lock); /* should we check if list empty first? */ list_del(&oplockEntry->qhead); - spin_unlock(&GlobalMid_Lock); + spin_unlock(&cifs_oplock_lock); kmem_cache_free(cifs_oplock_cachep, oplockEntry); } @@ -144,14 +143,14 @@ void DeleteTconOplockQEntries(struct cifsTconInfo *tcon) if (tcon == NULL) return; - spin_lock(&GlobalMid_Lock); - list_for_each_entry(temp, &GlobalOplock_Q, qhead) { + spin_lock(&cifs_oplock_lock); + list_for_each_entry(temp, &cifs_oplock_list, qhead) { if ((temp->tcon) && (temp->tcon == tcon)) { list_del(&temp->qhead); kmem_cache_free(cifs_oplock_cachep, temp); } } - spin_unlock(&GlobalMid_Lock); + spin_unlock(&cifs_oplock_lock); } static int diff --git a/fs/compat.c b/fs/compat.c index 94502dab972a..6d6f98fe64a0 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1485,20 +1485,15 @@ int compat_do_execve(char * filename, if (!bprm) goto out_files; - retval = -ERESTARTNOINTR; - if (mutex_lock_interruptible(¤t->cred_guard_mutex)) + retval = prepare_bprm_creds(bprm); + if (retval) goto out_free; - current->in_execve = 1; - - retval = -ENOMEM; - bprm->cred = prepare_exec_creds(); - if (!bprm->cred) - goto out_unlock; retval = check_unsafe_exec(bprm); if (retval < 0) - goto out_unlock; + goto out_free; clear_in_exec = retval; + current->in_execve = 1; file = open_exec(filename); retval = PTR_ERR(file); @@ -1547,7 +1542,6 @@ int compat_do_execve(char * filename, /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; - mutex_unlock(¤t->cred_guard_mutex); acct_update_integrals(current); free_bprm(bprm); if (displaced) @@ -1567,10 +1561,7 @@ out_file: out_unmark: if (clear_in_exec) current->fs->in_exec = 0; - -out_unlock: current->in_execve = 0; - mutex_unlock(¤t->cred_guard_mutex); out_free: free_bprm(bprm); diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index f28f070a60fc..f91fd51b32e3 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1905,6 +1905,7 @@ COMPATIBLE_IOCTL(FIONCLEX) COMPATIBLE_IOCTL(FIOASYNC) COMPATIBLE_IOCTL(FIONBIO) COMPATIBLE_IOCTL(FIONREAD) /* This is also TIOCINQ */ +COMPATIBLE_IOCTL(FS_IOC_FIEMAP) /* 0x00 */ COMPATIBLE_IOCTL(FIBMAP) COMPATIBLE_IOCTL(FIGETBSZ) diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 4921e7426d95..a2f746066c5d 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -51,6 +51,7 @@ static const struct address_space_operations configfs_aops = { }; static struct backing_dev_info configfs_backing_dev_info = { + .name = "configfs", .ra_pages = 0, /* No readahead */ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; diff --git a/fs/dcache.c b/fs/dcache.c index 9e5cd3c3a6ba..a100fa35a48f 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -32,6 +32,7 @@ #include <linux/swap.h> #include <linux/bootmem.h> #include <linux/fs_struct.h> +#include <linux/hardirq.h> #include "internal.h" int sysctl_vfs_cache_pressure __read_mostly = 100; diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c index ccc9d62c462d..55ea369f43a9 100644 --- a/fs/dlm/netlink.c +++ b/fs/dlm/netlink.c @@ -63,7 +63,7 @@ static int send_data(struct sk_buff *skb) return rv; } - return genlmsg_unicast(skb, listener_nlpid); + return genlmsg_unicast(&init_net, skb, listener_nlpid); } static int user_cmd(struct sk_buff *skb, struct genl_info *info) diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index af737bb56cb7..259525c9abb8 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -1303,6 +1303,13 @@ parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat, } (*new_auth_tok)->session_key.encrypted_key_size = (body_size - (ECRYPTFS_SALT_SIZE + 5)); + if ((*new_auth_tok)->session_key.encrypted_key_size + > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) { + printk(KERN_WARNING "Tag 3 packet contains key larger " + "than ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES\n"); + rc = -EINVAL; + goto out_free; + } if (unlikely(data[(*packet_size)++] != 0x04)) { printk(KERN_WARNING "Unknown version number [%d]\n", data[(*packet_size) - 1]); @@ -1449,6 +1456,12 @@ parse_tag_11_packet(unsigned char *data, unsigned char *contents, rc = -EINVAL; goto out; } + if (unlikely((*tag_11_contents_size) > max_contents_bytes)) { + printk(KERN_ERR "Literal data section in tag 11 packet exceeds " + "expected size\n"); + rc = -EINVAL; + goto out; + } if (data[(*packet_size)++] != 0x62) { printk(KERN_WARNING "Unrecognizable packet\n"); rc = -EINVAL; diff --git a/fs/exec.c b/fs/exec.c index 4a8849e45b21..172ceb6edde4 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -678,8 +678,8 @@ exit: } EXPORT_SYMBOL(open_exec); -int kernel_read(struct file *file, unsigned long offset, - char *addr, unsigned long count) +int kernel_read(struct file *file, loff_t offset, + char *addr, unsigned long count) { mm_segment_t old_fs; loff_t pos = offset; @@ -1016,6 +1016,35 @@ out: EXPORT_SYMBOL(flush_old_exec); /* + * Prepare credentials and lock ->cred_guard_mutex. + * install_exec_creds() commits the new creds and drops the lock. + * Or, if exec fails before, free_bprm() should release ->cred and + * and unlock. + */ +int prepare_bprm_creds(struct linux_binprm *bprm) +{ + if (mutex_lock_interruptible(¤t->cred_guard_mutex)) + return -ERESTARTNOINTR; + + bprm->cred = prepare_exec_creds(); + if (likely(bprm->cred)) + return 0; + + mutex_unlock(¤t->cred_guard_mutex); + return -ENOMEM; +} + +void free_bprm(struct linux_binprm *bprm) +{ + free_arg_pages(bprm); + if (bprm->cred) { + mutex_unlock(¤t->cred_guard_mutex); + abort_creds(bprm->cred); + } + kfree(bprm); +} + +/* * install the new credentials for this executable */ void install_exec_creds(struct linux_binprm *bprm) @@ -1024,12 +1053,13 @@ void install_exec_creds(struct linux_binprm *bprm) commit_creds(bprm->cred); bprm->cred = NULL; - - /* cred_guard_mutex must be held at least to this point to prevent + /* + * cred_guard_mutex must be held at least to this point to prevent * ptrace_attach() from altering our determination of the task's - * credentials; any time after this it may be unlocked */ - + * credentials; any time after this it may be unlocked. + */ security_bprm_committed_creds(bprm); + mutex_unlock(¤t->cred_guard_mutex); } EXPORT_SYMBOL(install_exec_creds); @@ -1246,14 +1276,6 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) EXPORT_SYMBOL(search_binary_handler); -void free_bprm(struct linux_binprm *bprm) -{ - free_arg_pages(bprm); - if (bprm->cred) - abort_creds(bprm->cred); - kfree(bprm); -} - /* * sys_execve() executes a new program. */ @@ -1277,20 +1299,15 @@ int do_execve(char * filename, if (!bprm) goto out_files; - retval = -ERESTARTNOINTR; - if (mutex_lock_interruptible(¤t->cred_guard_mutex)) + retval = prepare_bprm_creds(bprm); + if (retval) goto out_free; - current->in_execve = 1; - - retval = -ENOMEM; - bprm->cred = prepare_exec_creds(); - if (!bprm->cred) - goto out_unlock; retval = check_unsafe_exec(bprm); if (retval < 0) - goto out_unlock; + goto out_free; clear_in_exec = retval; + current->in_execve = 1; file = open_exec(filename); retval = PTR_ERR(file); @@ -1340,7 +1357,6 @@ int do_execve(char * filename, /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; - mutex_unlock(¤t->cred_guard_mutex); acct_update_integrals(current); free_bprm(bprm); if (displaced) @@ -1360,10 +1376,7 @@ out_file: out_unmark: if (clear_in_exec) current->fs->in_exec = 0; - -out_unlock: current->in_execve = 0; - mutex_unlock(¤t->cred_guard_mutex); out_free: free_bprm(bprm); diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index d636e1297cad..a63d44256a70 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c @@ -230,7 +230,7 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl) return error; } -static int +int ext2_check_acl(struct inode *inode, int mask) { struct posix_acl *acl = ext2_get_acl(inode, ACL_TYPE_ACCESS); @@ -246,12 +246,6 @@ ext2_check_acl(struct inode *inode, int mask) return -EAGAIN; } -int -ext2_permission(struct inode *inode, int mask) -{ - return generic_permission(inode, mask, ext2_check_acl); -} - /* * Initialize the ACLs of a new inode. Called from ext2_new_inode. * diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h index ecefe478898f..3ff6cbb9ac44 100644 --- a/fs/ext2/acl.h +++ b/fs/ext2/acl.h @@ -54,13 +54,13 @@ static inline int ext2_acl_count(size_t size) #ifdef CONFIG_EXT2_FS_POSIX_ACL /* acl.c */ -extern int ext2_permission (struct inode *, int); +extern int ext2_check_acl (struct inode *, int); extern int ext2_acl_chmod (struct inode *); extern int ext2_init_acl (struct inode *, struct inode *); #else #include <linux/sched.h> -#define ext2_permission NULL +#define ext2_check_acl NULL #define ext2_get_acl NULL #define ext2_set_acl NULL diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 2b9e47dc9222..a2f3afd1a1c1 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -85,6 +85,6 @@ const struct inode_operations ext2_file_inode_operations = { .removexattr = generic_removexattr, #endif .setattr = ext2_setattr, - .permission = ext2_permission, + .check_acl = ext2_check_acl, .fiemap = ext2_fiemap, }; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index e27130341d4f..1c1638f873a4 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -482,7 +482,7 @@ static int ext2_alloc_branch(struct inode *inode, unlock_buffer(bh); mark_buffer_dirty_inode(bh, inode); /* We used to sync bh here if IS_SYNC(inode). - * But we now rely upon generic_osync_inode() + * But we now rely upon generic_write_sync() * and b_inode_buffers. But not for directories. */ if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index e1dedb0f7873..23701f289e98 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -362,6 +362,10 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, if (dir_de) { if (old_dir != new_dir) ext2_set_link(old_inode, dir_de, dir_page, new_dir, 0); + else { + kunmap(dir_page); + page_cache_release(dir_page); + } inode_dec_link_count(old_dir); } return 0; @@ -396,7 +400,7 @@ const struct inode_operations ext2_dir_inode_operations = { .removexattr = generic_removexattr, #endif .setattr = ext2_setattr, - .permission = ext2_permission, + .check_acl = ext2_check_acl, }; const struct inode_operations ext2_special_inode_operations = { @@ -407,5 +411,5 @@ const struct inode_operations ext2_special_inode_operations = { .removexattr = generic_removexattr, #endif .setattr = ext2_setattr, - .permission = ext2_permission, + .check_acl = ext2_check_acl, }; diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig index fb3c1a21b135..522b15498f45 100644 --- a/fs/ext3/Kconfig +++ b/fs/ext3/Kconfig @@ -29,23 +29,25 @@ config EXT3_FS module will be called ext3. config EXT3_DEFAULTS_TO_ORDERED - bool "Default to 'data=ordered' in ext3 (legacy option)" + bool "Default to 'data=ordered' in ext3" depends on EXT3_FS help - If a filesystem does not explicitly specify a data ordering - mode, and the journal capability allowed it, ext3 used to - historically default to 'data=ordered'. - - That was a rather unfortunate choice, because it leads to all - kinds of latency problems, and the 'data=writeback' mode is more - appropriate these days. - - You should probably always answer 'n' here, and if you really - want to use 'data=ordered' mode, set it in the filesystem itself - with 'tune2fs -o journal_data_ordered'. - - But if you really want to enable the legacy default, you can do - so by answering 'y' to this question. + The journal mode options for ext3 have different tradeoffs + between when data is guaranteed to be on disk and + performance. The use of "data=writeback" can cause + unwritten data to appear in files after an system crash or + power failure, which can be a security issue. However, + "data=ordered" mode can also result in major performance + problems, including seconds-long delays before an fsync() + call returns. For details, see: + + http://ext4.wiki.kernel.org/index.php/Ext3_data_mode_tradeoffs + + If you have been historically happy with ext3's performance, + data=ordered mode will be a safe choice and you should + answer 'y' here. If you understand the reliability and data + privacy issues of data=writeback and are willing to make + that trade off, answer 'n'. config EXT3_FS_XATTR bool "Ext3 extended attributes" diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index e167bae37ef0..c9b0df376b5f 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -238,7 +238,7 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type, return error; } -static int +int ext3_check_acl(struct inode *inode, int mask) { struct posix_acl *acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); @@ -254,12 +254,6 @@ ext3_check_acl(struct inode *inode, int mask) return -EAGAIN; } -int -ext3_permission(struct inode *inode, int mask) -{ - return generic_permission(inode, mask, ext3_check_acl); -} - /* * Initialize the ACLs of a new inode. Called from ext3_new_inode. * diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h index 07d15a3a5969..597334626de9 100644 --- a/fs/ext3/acl.h +++ b/fs/ext3/acl.h @@ -54,13 +54,13 @@ static inline int ext3_acl_count(size_t size) #ifdef CONFIG_EXT3_FS_POSIX_ACL /* acl.c */ -extern int ext3_permission (struct inode *, int); +extern int ext3_check_acl (struct inode *, int); extern int ext3_acl_chmod (struct inode *); extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); #else /* CONFIG_EXT3_FS_POSIX_ACL */ #include <linux/sched.h> -#define ext3_permission NULL +#define ext3_check_acl NULL static inline int ext3_acl_chmod(struct inode *inode) diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index 3d724a95882f..373fa90c796a 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -130,8 +130,7 @@ static int ext3_readdir(struct file * filp, struct buffer_head *bh = NULL; map_bh.b_state = 0; - err = ext3_get_blocks_handle(NULL, inode, blk, 1, - &map_bh, 0, 0); + err = ext3_get_blocks_handle(NULL, inode, blk, 1, &map_bh, 0); if (err > 0) { pgoff_t index = map_bh.b_blocknr >> (PAGE_CACHE_SHIFT - inode->i_blkbits); diff --git a/fs/ext3/file.c b/fs/ext3/file.c index 5b49704b231b..388bbdfa0b4e 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -51,71 +51,12 @@ static int ext3_release_file (struct inode * inode, struct file * filp) return 0; } -static ssize_t -ext3_file_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_path.dentry->d_inode; - ssize_t ret; - int err; - - ret = generic_file_aio_write(iocb, iov, nr_segs, pos); - - /* - * Skip flushing if there was an error, or if nothing was written. - */ - if (ret <= 0) - return ret; - - /* - * If the inode is IS_SYNC, or is O_SYNC and we are doing data - * journalling then we need to make sure that we force the transaction - * to disk to keep all metadata uptodate synchronously. - */ - if (file->f_flags & O_SYNC) { - /* - * If we are non-data-journaled, then the dirty data has - * already been flushed to backing store by generic_osync_inode, - * and the inode has been flushed too if there have been any - * modifications other than mere timestamp updates. - * - * Open question --- do we care about flushing timestamps too - * if the inode is IS_SYNC? - */ - if (!ext3_should_journal_data(inode)) - return ret; - - goto force_commit; - } - - /* - * So we know that there has been no forced data flush. If the inode - * is marked IS_SYNC, we need to force one ourselves. - */ - if (!IS_SYNC(inode)) - return ret; - - /* - * Open question #2 --- should we force data to disk here too? If we - * don't, the only impact is that data=writeback filesystems won't - * flush data to disk automatically on IS_SYNC, only metadata (but - * historically, that is what ext2 has done.) - */ - -force_commit: - err = ext3_force_commit(inode->i_sb); - if (err) - return err; - return ret; -} - const struct file_operations ext3_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, .write = do_sync_write, .aio_read = generic_file_aio_read, - .aio_write = ext3_file_write, + .aio_write = generic_file_aio_write, .unlocked_ioctl = ext3_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext3_compat_ioctl, @@ -137,7 +78,7 @@ const struct inode_operations ext3_file_inode_operations = { .listxattr = ext3_listxattr, .removexattr = generic_removexattr, #endif - .permission = ext3_permission, + .check_acl = ext3_check_acl, .fiemap = ext3_fiemap, }; diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 5f51fed5c750..b49908a167ae 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -788,7 +788,7 @@ err_out: int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, sector_t iblock, unsigned long maxblocks, struct buffer_head *bh_result, - int create, int extend_disksize) + int create) { int err = -EIO; int offsets[4]; @@ -911,13 +911,6 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, if (!err) err = ext3_splice_branch(handle, inode, iblock, partial, indirect_blks, count); - /* - * i_disksize growing is protected by truncate_mutex. Don't forget to - * protect it if you're about to implement concurrent - * ext3_get_block() -bzzz - */ - if (!err && extend_disksize && inode->i_size > ei->i_disksize) - ei->i_disksize = inode->i_size; mutex_unlock(&ei->truncate_mutex); if (err) goto cleanup; @@ -972,7 +965,7 @@ static int ext3_get_block(struct inode *inode, sector_t iblock, } ret = ext3_get_blocks_handle(handle, inode, iblock, - max_blocks, bh_result, create, 0); + max_blocks, bh_result, create); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); ret = 0; @@ -1005,7 +998,7 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode, dummy.b_blocknr = -1000; buffer_trace_init(&dummy.b_history); err = ext3_get_blocks_handle(handle, inode, block, 1, - &dummy, create, 1); + &dummy, create); /* * ext3_get_blocks_handle() returns number of blocks * mapped. 0 in case of a HOLE. @@ -1193,15 +1186,16 @@ write_begin_failed: * i_size_read because we hold i_mutex. * * Add inode to orphan list in case we crash before truncate - * finishes. + * finishes. Do this only if ext3_can_truncate() agrees so + * that orphan processing code is happy. */ - if (pos + len > inode->i_size) + if (pos + len > inode->i_size && ext3_can_truncate(inode)) ext3_orphan_add(handle, inode); ext3_journal_stop(handle); unlock_page(page); page_cache_release(page); if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); + ext3_truncate(inode); } if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) goto retry; @@ -1287,7 +1281,7 @@ static int ext3_ordered_write_end(struct file *file, * There may be allocated blocks outside of i_size because * we failed to copy some data. Prepare for truncate. */ - if (pos + len > inode->i_size) + if (pos + len > inode->i_size && ext3_can_truncate(inode)) ext3_orphan_add(handle, inode); ret2 = ext3_journal_stop(handle); if (!ret) @@ -1296,7 +1290,7 @@ static int ext3_ordered_write_end(struct file *file, page_cache_release(page); if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); + ext3_truncate(inode); return ret ? ret : copied; } @@ -1315,14 +1309,14 @@ static int ext3_writeback_write_end(struct file *file, * There may be allocated blocks outside of i_size because * we failed to copy some data. Prepare for truncate. */ - if (pos + len > inode->i_size) + if (pos + len > inode->i_size && ext3_can_truncate(inode)) ext3_orphan_add(handle, inode); ret = ext3_journal_stop(handle); unlock_page(page); page_cache_release(page); if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); + ext3_truncate(inode); return ret ? ret : copied; } @@ -1358,7 +1352,7 @@ static int ext3_journalled_write_end(struct file *file, * There may be allocated blocks outside of i_size because * we failed to copy some data. Prepare for truncate. */ - if (pos + len > inode->i_size) + if (pos + len > inode->i_size && ext3_can_truncate(inode)) ext3_orphan_add(handle, inode); EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; if (inode->i_size > EXT3_I(inode)->i_disksize) { @@ -1375,7 +1369,7 @@ static int ext3_journalled_write_end(struct file *file, page_cache_release(page); if (pos + len > inode->i_size) - vmtruncate(inode, inode->i_size); + ext3_truncate(inode); return ret ? ret : copied; } diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 6ff7b9730234..aad6400c9b77 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -2445,7 +2445,7 @@ const struct inode_operations ext3_dir_inode_operations = { .listxattr = ext3_listxattr, .removexattr = generic_removexattr, #endif - .permission = ext3_permission, + .check_acl = ext3_check_acl, }; const struct inode_operations ext3_special_inode_operations = { @@ -2456,5 +2456,5 @@ const struct inode_operations ext3_special_inode_operations = { .listxattr = ext3_listxattr, .removexattr = generic_removexattr, #endif - .permission = ext3_permission, + .check_acl = ext3_check_acl, }; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 524b349c6299..a8d80a7f1105 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -543,6 +543,19 @@ static inline void ext3_show_quota_options(struct seq_file *seq, struct super_bl #endif } +static char *data_mode_string(unsigned long mode) +{ + switch (mode) { + case EXT3_MOUNT_JOURNAL_DATA: + return "journal"; + case EXT3_MOUNT_ORDERED_DATA: + return "ordered"; + case EXT3_MOUNT_WRITEBACK_DATA: + return "writeback"; + } + return "unknown"; +} + /* * Show an option if * - it's set to a non-default value OR @@ -616,13 +629,8 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs) if (test_opt(sb, NOBH)) seq_puts(seq, ",nobh"); - if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA) - seq_puts(seq, ",data=journal"); - else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA) - seq_puts(seq, ",data=ordered"); - else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA) - seq_puts(seq, ",data=writeback"); - + seq_printf(seq, ",data=%s", data_mode_string(sbi->s_mount_opt & + EXT3_MOUNT_DATA_FLAGS)); if (test_opt(sb, DATA_ERR_ABORT)) seq_puts(seq, ",data_err=abort"); @@ -1024,12 +1032,18 @@ static int parse_options (char *options, struct super_block *sb, datacheck: if (is_remount) { if ((sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS) - != data_opt) { - printk(KERN_ERR - "EXT3-fs: cannot change data " - "mode on remount\n"); - return 0; - } + == data_opt) + break; + printk(KERN_ERR + "EXT3-fs (device %s): Cannot change " + "data mode on remount. The filesystem " + "is mounted in data=%s mode and you " + "try to remount it in data=%s mode.\n", + sb->s_id, + data_mode_string(sbi->s_mount_opt & + EXT3_MOUNT_DATA_FLAGS), + data_mode_string(data_opt)); + return 0; } else { sbi->s_mount_opt &= ~EXT3_MOUNT_DATA_FLAGS; sbi->s_mount_opt |= data_opt; diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index f6d8967149ca..0df88b2a69b0 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -236,7 +236,7 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, return error; } -static int +int ext4_check_acl(struct inode *inode, int mask) { struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); @@ -252,12 +252,6 @@ ext4_check_acl(struct inode *inode, int mask) return -EAGAIN; } -int -ext4_permission(struct inode *inode, int mask) -{ - return generic_permission(inode, mask, ext4_check_acl); -} - /* * Initialize the ACLs of a new inode. Called from ext4_new_inode. * diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index 949789d2bba6..9d843d5deac4 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -54,13 +54,13 @@ static inline int ext4_acl_count(size_t size) #ifdef CONFIG_EXT4_FS_POSIX_ACL /* acl.c */ -extern int ext4_permission(struct inode *, int); +extern int ext4_check_acl(struct inode *, int); extern int ext4_acl_chmod(struct inode *); extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); #else /* CONFIG_EXT4_FS_POSIX_ACL */ #include <linux/sched.h> -#define ext4_permission NULL +#define ext4_check_acl NULL static inline int ext4_acl_chmod(struct inode *inode) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 3f1873fef1c6..5ca3eca70a1e 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -58,10 +58,7 @@ static ssize_t ext4_file_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_path.dentry->d_inode; - ssize_t ret; - int err; + struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; /* * If we have encountered a bitmap-format file, the size limit @@ -81,53 +78,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov, } } - ret = generic_file_aio_write(iocb, iov, nr_segs, pos); - /* - * Skip flushing if there was an error, or if nothing was written. - */ - if (ret <= 0) - return ret; - - /* - * If the inode is IS_SYNC, or is O_SYNC and we are doing data - * journalling then we need to make sure that we force the transaction - * to disk to keep all metadata uptodate synchronously. - */ - if (file->f_flags & O_SYNC) { - /* - * If we are non-data-journaled, then the dirty data has - * already been flushed to backing store by generic_osync_inode, - * and the inode has been flushed too if there have been any - * modifications other than mere timestamp updates. - * - * Open question --- do we care about flushing timestamps too - * if the inode is IS_SYNC? - */ - if (!ext4_should_journal_data(inode)) - return ret; - - goto force_commit; - } - - /* - * So we know that there has been no forced data flush. If the inode - * is marked IS_SYNC, we need to force one ourselves. - */ - if (!IS_SYNC(inode)) - return ret; - - /* - * Open question #2 --- should we force data to disk here too? If we - * don't, the only impact is that data=writeback filesystems won't - * flush data to disk automatically on IS_SYNC, only metadata (but - * historically, that is what ext2 has done.) - */ - -force_commit: - err = ext4_force_commit(inode->i_sb); - if (err) - return err; - return ret; + return generic_file_aio_write(iocb, iov, nr_segs, pos); } static struct vm_operations_struct ext4_file_vm_ops = { @@ -207,7 +158,7 @@ const struct inode_operations ext4_file_inode_operations = { .listxattr = ext4_listxattr, .removexattr = generic_removexattr, #endif - .permission = ext4_permission, + .check_acl = ext4_check_acl, .fallocate = ext4_fallocate, .fiemap = ext4_fiemap, }; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index de04013d16ff..114abe5d2c1d 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2536,7 +2536,7 @@ const struct inode_operations ext4_dir_inode_operations = { .listxattr = ext4_listxattr, .removexattr = generic_removexattr, #endif - .permission = ext4_permission, + .check_acl = ext4_check_acl, .fiemap = ext4_fiemap, }; @@ -2548,5 +2548,5 @@ const struct inode_operations ext4_special_inode_operations = { .listxattr = ext4_listxattr, .removexattr = generic_removexattr, #endif - .permission = ext4_permission, + .check_acl = ext4_check_acl, }; diff --git a/fs/fat/file.c b/fs/fat/file.c index f042b965c95c..e8c159de236b 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -176,8 +176,26 @@ static int fat_cont_expand(struct inode *inode, loff_t size) inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; mark_inode_dirty(inode); - if (IS_SYNC(inode)) - err = sync_page_range_nolock(inode, mapping, start, count); + if (IS_SYNC(inode)) { + int err2; + + /* + * Opencode syncing since we don't have a file open to use + * standard fsync path. + */ + err = filemap_fdatawrite_range(mapping, start, + start + count - 1); + err2 = sync_mapping_buffers(mapping); + if (!err) + err = err2; + err2 = write_inode_now(inode, 1); + if (!err) + err = err2; + if (!err) { + err = filemap_fdatawait_range(mapping, start, + start + count - 1); + } + } out: return err; } diff --git a/fs/fat/misc.c b/fs/fat/misc.c index a6c20473dfd7..4e35be873e09 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -119,8 +119,8 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster) MSDOS_I(inode)->i_start = new_dclus; MSDOS_I(inode)->i_logstart = new_dclus; /* - * Since generic_osync_inode() synchronize later if - * this is not directory, we don't here. + * Since generic_write_sync() synchronizes regular files later, + * we sync here only directories. */ if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) { ret = fat_sync_inode(inode); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index c54226be5294..628235cf44b5 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -19,171 +19,223 @@ #include <linux/sched.h> #include <linux/fs.h> #include <linux/mm.h> +#include <linux/kthread.h> +#include <linux/freezer.h> #include <linux/writeback.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/buffer_head.h> #include "internal.h" +#define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info) -/** - * writeback_acquire - attempt to get exclusive writeback access to a device - * @bdi: the device's backing_dev_info structure - * - * It is a waste of resources to have more than one pdflush thread blocked on - * a single request queue. Exclusion at the request_queue level is obtained - * via a flag in the request_queue's backing_dev_info.state. - * - * Non-request_queue-backed address_spaces will share default_backing_dev_info, - * unless they implement their own. Which is somewhat inefficient, as this - * may prevent concurrent writeback against multiple devices. +/* + * We don't actually have pdflush, but this one is exported though /proc... */ -static int writeback_acquire(struct backing_dev_info *bdi) +int nr_pdflush_threads; + +/* + * Work items for the bdi_writeback threads + */ +struct bdi_work { + struct list_head list; + struct list_head wait_list; + struct rcu_head rcu_head; + + unsigned long seen; + atomic_t pending; + + struct super_block *sb; + unsigned long nr_pages; + enum writeback_sync_modes sync_mode; + + unsigned long state; +}; + +enum { + WS_USED_B = 0, + WS_ONSTACK_B, +}; + +#define WS_USED (1 << WS_USED_B) +#define WS_ONSTACK (1 << WS_ONSTACK_B) + +static inline bool bdi_work_on_stack(struct bdi_work *work) +{ + return test_bit(WS_ONSTACK_B, &work->state); +} + +static inline void bdi_work_init(struct bdi_work *work, + struct writeback_control *wbc) +{ + INIT_RCU_HEAD(&work->rcu_head); + work->sb = wbc->sb; + work->nr_pages = wbc->nr_to_write; + work->sync_mode = wbc->sync_mode; + work->state = WS_USED; +} + +static inline void bdi_work_init_on_stack(struct bdi_work *work, + struct writeback_control *wbc) { - return !test_and_set_bit(BDI_pdflush, &bdi->state); + bdi_work_init(work, wbc); + work->state |= WS_ONSTACK; } /** * writeback_in_progress - determine whether there is writeback in progress * @bdi: the device's backing_dev_info structure. * - * Determine whether there is writeback in progress against a backing device. + * Determine whether there is writeback waiting to be handled against a + * backing device. */ int writeback_in_progress(struct backing_dev_info *bdi) { - return test_bit(BDI_pdflush, &bdi->state); + return !list_empty(&bdi->work_list); } -/** - * writeback_release - relinquish exclusive writeback access against a device. - * @bdi: the device's backing_dev_info structure - */ -static void writeback_release(struct backing_dev_info *bdi) +static void bdi_work_clear(struct bdi_work *work) { - BUG_ON(!writeback_in_progress(bdi)); - clear_bit(BDI_pdflush, &bdi->state); + clear_bit(WS_USED_B, &work->state); + smp_mb__after_clear_bit(); + wake_up_bit(&work->state, WS_USED_B); } -static noinline void block_dump___mark_inode_dirty(struct inode *inode) +static void bdi_work_free(struct rcu_head *head) { - if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { - struct dentry *dentry; - const char *name = "?"; + struct bdi_work *work = container_of(head, struct bdi_work, rcu_head); - dentry = d_find_alias(inode); - if (dentry) { - spin_lock(&dentry->d_lock); - name = (const char *) dentry->d_name.name; - } - printk(KERN_DEBUG - "%s(%d): dirtied inode %lu (%s) on %s\n", - current->comm, task_pid_nr(current), inode->i_ino, - name, inode->i_sb->s_id); - if (dentry) { - spin_unlock(&dentry->d_lock); - dput(dentry); - } - } + if (!bdi_work_on_stack(work)) + kfree(work); + else + bdi_work_clear(work); } -/** - * __mark_inode_dirty - internal function - * @inode: inode to mark - * @flags: what kind of dirty (i.e. I_DIRTY_SYNC) - * Mark an inode as dirty. Callers should use mark_inode_dirty or - * mark_inode_dirty_sync. - * - * Put the inode on the super block's dirty list. - * - * CAREFUL! We mark it dirty unconditionally, but move it onto the - * dirty list only if it is hashed or if it refers to a blockdev. - * If it was not hashed, it will never be added to the dirty list - * even if it is later hashed, as it will have been marked dirty already. - * - * In short, make sure you hash any inodes _before_ you start marking - * them dirty. - * - * This function *must* be atomic for the I_DIRTY_PAGES case - - * set_page_dirty() is called under spinlock in several places. - * - * Note that for blockdevs, inode->dirtied_when represents the dirtying time of - * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of - * the kernel-internal blockdev inode represents the dirtying time of the - * blockdev's pages. This is why for I_DIRTY_PAGES we always use - * page->mapping->host, so the page-dirtying time is recorded in the internal - * blockdev inode. - */ -void __mark_inode_dirty(struct inode *inode, int flags) +static void wb_work_complete(struct bdi_work *work) { - struct super_block *sb = inode->i_sb; + const enum writeback_sync_modes sync_mode = work->sync_mode; /* - * Don't do this for I_DIRTY_PAGES - that doesn't actually - * dirty the inode itself + * For allocated work, we can clear the done/seen bit right here. + * For on-stack work, we need to postpone both the clear and free + * to after the RCU grace period, since the stack could be invalidated + * as soon as bdi_work_clear() has done the wakeup. */ - if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { - if (sb->s_op->dirty_inode) - sb->s_op->dirty_inode(inode); - } + if (!bdi_work_on_stack(work)) + bdi_work_clear(work); + if (sync_mode == WB_SYNC_NONE || bdi_work_on_stack(work)) + call_rcu(&work->rcu_head, bdi_work_free); +} +static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work) +{ /* - * make sure that changes are seen by all cpus before we test i_state - * -- mikulas + * The caller has retrieved the work arguments from this work, + * drop our reference. If this is the last ref, delete and free it */ - smp_mb(); - - /* avoid the locking if we can */ - if ((inode->i_state & flags) == flags) - return; + if (atomic_dec_and_test(&work->pending)) { + struct backing_dev_info *bdi = wb->bdi; - if (unlikely(block_dump)) - block_dump___mark_inode_dirty(inode); + spin_lock(&bdi->wb_lock); + list_del_rcu(&work->list); + spin_unlock(&bdi->wb_lock); - spin_lock(&inode_lock); - if ((inode->i_state & flags) != flags) { - const int was_dirty = inode->i_state & I_DIRTY; + wb_work_complete(work); + } +} - inode->i_state |= flags; +static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work) +{ + if (work) { + work->seen = bdi->wb_mask; + BUG_ON(!work->seen); + atomic_set(&work->pending, bdi->wb_cnt); + BUG_ON(!bdi->wb_cnt); /* - * If the inode is being synced, just update its dirty state. - * The unlocker will place the inode on the appropriate - * superblock list, based upon its state. + * Make sure stores are seen before it appears on the list */ - if (inode->i_state & I_SYNC) - goto out; + smp_mb(); - /* - * Only add valid (hashed) inodes to the superblock's - * dirty list. Add blockdev inodes as well. - */ - if (!S_ISBLK(inode->i_mode)) { - if (hlist_unhashed(&inode->i_hash)) - goto out; - } - if (inode->i_state & (I_FREEING|I_CLEAR)) - goto out; + spin_lock(&bdi->wb_lock); + list_add_tail_rcu(&work->list, &bdi->work_list); + spin_unlock(&bdi->wb_lock); + } + + /* + * If the default thread isn't there, make sure we add it. When + * it gets created and wakes up, we'll run this work. + */ + if (unlikely(list_empty_careful(&bdi->wb_list))) + wake_up_process(default_backing_dev_info.wb.task); + else { + struct bdi_writeback *wb = &bdi->wb; /* - * If the inode was already on s_dirty/s_io/s_more_io, don't - * reposition it (that would break s_dirty time-ordering). + * If we failed allocating the bdi work item, wake up the wb + * thread always. As a safety precaution, it'll flush out + * everything */ - if (!was_dirty) { - inode->dirtied_when = jiffies; - list_move(&inode->i_list, &sb->s_dirty); - } + if (!wb_has_dirty_io(wb)) { + if (work) + wb_clear_pending(wb, work); + } else if (wb->task) + wake_up_process(wb->task); } -out: - spin_unlock(&inode_lock); } -EXPORT_SYMBOL(__mark_inode_dirty); +/* + * Used for on-stack allocated work items. The caller needs to wait until + * the wb threads have acked the work before it's safe to continue. + */ +static void bdi_wait_on_work_clear(struct bdi_work *work) +{ + wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait, + TASK_UNINTERRUPTIBLE); +} -static int write_inode(struct inode *inode, int sync) +static struct bdi_work *bdi_alloc_work(struct writeback_control *wbc) { - if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) - return inode->i_sb->s_op->write_inode(inode, sync); - return 0; + struct bdi_work *work; + + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (work) + bdi_work_init(work, wbc); + + return work; +} + +void bdi_start_writeback(struct writeback_control *wbc) +{ + const bool must_wait = wbc->sync_mode == WB_SYNC_ALL; + struct bdi_work work_stack, *work = NULL; + + if (!must_wait) + work = bdi_alloc_work(wbc); + + if (!work) { + work = &work_stack; + bdi_work_init_on_stack(work, wbc); + } + + bdi_queue_work(wbc->bdi, work); + + /* + * If the sync mode is WB_SYNC_ALL, block waiting for the work to + * complete. If not, we only need to wait for the work to be started, + * if we allocated it on-stack. We use the same mechanism, if the + * wait bit is set in the bdi_work struct, then threads will not + * clear pending until after they are done. + * + * Note that work == &work_stack if must_wait is true, so we don't + * need to do call_rcu() here ever, since the completion path will + * have done that for us. + */ + if (must_wait || work == &work_stack) { + bdi_wait_on_work_clear(work); + if (work != &work_stack) + call_rcu(&work->rcu_head, bdi_work_free); + } } /* @@ -191,31 +243,32 @@ static int write_inode(struct inode *inode, int sync) * furthest end of its superblock's dirty-inode list. * * Before stamping the inode's ->dirtied_when, we check to see whether it is - * already the most-recently-dirtied inode on the s_dirty list. If that is + * already the most-recently-dirtied inode on the b_dirty list. If that is * the case then the inode must have been redirtied while it was being written * out and we don't reset its dirtied_when. */ static void redirty_tail(struct inode *inode) { - struct super_block *sb = inode->i_sb; + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; - if (!list_empty(&sb->s_dirty)) { - struct inode *tail_inode; + if (!list_empty(&wb->b_dirty)) { + struct inode *tail; - tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list); - if (time_before(inode->dirtied_when, - tail_inode->dirtied_when)) + tail = list_entry(wb->b_dirty.next, struct inode, i_list); + if (time_before(inode->dirtied_when, tail->dirtied_when)) inode->dirtied_when = jiffies; } - list_move(&inode->i_list, &sb->s_dirty); + list_move(&inode->i_list, &wb->b_dirty); } /* - * requeue inode for re-scanning after sb->s_io list is exhausted. + * requeue inode for re-scanning after bdi->b_io list is exhausted. */ static void requeue_io(struct inode *inode) { - list_move(&inode->i_list, &inode->i_sb->s_more_io); + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; + + list_move(&inode->i_list, &wb->b_more_io); } static void inode_sync_complete(struct inode *inode) @@ -262,20 +315,18 @@ static void move_expired_inodes(struct list_head *delaying_queue, /* * Queue all expired dirty inodes for io, eldest first. */ -static void queue_io(struct super_block *sb, - unsigned long *older_than_this) +static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) { - list_splice_init(&sb->s_more_io, sb->s_io.prev); - move_expired_inodes(&sb->s_dirty, &sb->s_io, older_than_this); + list_splice_init(&wb->b_more_io, wb->b_io.prev); + move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); } -int sb_has_dirty_inodes(struct super_block *sb) +static int write_inode(struct inode *inode, int sync) { - return !list_empty(&sb->s_dirty) || - !list_empty(&sb->s_io) || - !list_empty(&sb->s_more_io); + if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) + return inode->i_sb->s_op->write_inode(inode, sync); + return 0; } -EXPORT_SYMBOL(sb_has_dirty_inodes); /* * Wait for writeback on an inode to complete. @@ -322,11 +373,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) if (inode->i_state & I_SYNC) { /* * If this inode is locked for writeback and we are not doing - * writeback-for-data-integrity, move it to s_more_io so that + * writeback-for-data-integrity, move it to b_more_io so that * writeback can proceed with the other inodes on s_io. * * We'll have another go at writing back this inode when we - * completed a full scan of s_io. + * completed a full scan of b_io. */ if (!wait) { requeue_io(inode); @@ -371,11 +422,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) /* * We didn't write back all the pages. nfs_writepages() * sometimes bales out without doing anything. Redirty - * the inode; Move it from s_io onto s_more_io/s_dirty. + * the inode; Move it from b_io onto b_more_io/b_dirty. */ /* * akpm: if the caller was the kupdate function we put - * this inode at the head of s_dirty so it gets first + * this inode at the head of b_dirty so it gets first * consideration. Otherwise, move it to the tail, for * the reasons described there. I'm not really sure * how much sense this makes. Presumably I had a good @@ -385,7 +436,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) if (wbc->for_kupdate) { /* * For the kupdate function we move the inode - * to s_more_io so it will get more writeout as + * to b_more_io so it will get more writeout as * soon as the queue becomes uncongested. */ inode->i_state |= I_DIRTY_PAGES; @@ -434,50 +485,84 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) } /* - * Write out a superblock's list of dirty inodes. A wait will be performed - * upon no inodes, all inodes or the final one, depending upon sync_mode. - * - * If older_than_this is non-NULL, then only write out inodes which - * had their first dirtying at a time earlier than *older_than_this. - * - * If we're a pdflush thread, then implement pdflush collision avoidance - * against the entire list. - * - * If `bdi' is non-zero then we're being asked to writeback a specific queue. - * This function assumes that the blockdev superblock's inodes are backed by - * a variety of queues, so all inodes are searched. For other superblocks, - * assume that all inodes are backed by the same queue. - * - * FIXME: this linear search could get expensive with many fileystems. But - * how to fix? We need to go from an address_space to all inodes which share - * a queue with that address_space. (Easy: have a global "dirty superblocks" - * list). + * For WB_SYNC_NONE writeback, the caller does not have the sb pinned + * before calling writeback. So make sure that we do pin it, so it doesn't + * go away while we are writing inodes from it. * - * The inodes to be written are parked on sb->s_io. They are moved back onto - * sb->s_dirty as they are selected for writing. This way, none can be missed - * on the writer throttling path, and we get decent balancing between many - * throttled threads: we don't want them all piling up on inode_sync_wait. + * Returns 0 if the super was successfully pinned (or pinning wasn't needed), + * 1 if we failed. */ -void generic_sync_sb_inodes(struct super_block *sb, +static int pin_sb_for_writeback(struct writeback_control *wbc, + struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + /* + * Caller must already hold the ref for this + */ + if (wbc->sync_mode == WB_SYNC_ALL) { + WARN_ON(!rwsem_is_locked(&sb->s_umount)); + return 0; + } + + spin_lock(&sb_lock); + sb->s_count++; + if (down_read_trylock(&sb->s_umount)) { + if (sb->s_root) { + spin_unlock(&sb_lock); + return 0; + } + /* + * umounted, drop rwsem again and fall through to failure + */ + up_read(&sb->s_umount); + } + + sb->s_count--; + spin_unlock(&sb_lock); + return 1; +} + +static void unpin_sb_for_writeback(struct writeback_control *wbc, + struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (wbc->sync_mode == WB_SYNC_ALL) + return; + + up_read(&sb->s_umount); + put_super(sb); +} + +static void writeback_inodes_wb(struct bdi_writeback *wb, struct writeback_control *wbc) { + struct super_block *sb = wbc->sb; + const int is_blkdev_sb = sb_is_blkdev_sb(sb); const unsigned long start = jiffies; /* livelock avoidance */ - int sync = wbc->sync_mode == WB_SYNC_ALL; spin_lock(&inode_lock); - if (!wbc->for_kupdate || list_empty(&sb->s_io)) - queue_io(sb, wbc->older_than_this); - while (!list_empty(&sb->s_io)) { - struct inode *inode = list_entry(sb->s_io.prev, + if (!wbc->for_kupdate || list_empty(&wb->b_io)) + queue_io(wb, wbc->older_than_this); + + while (!list_empty(&wb->b_io)) { + struct inode *inode = list_entry(wb->b_io.prev, struct inode, i_list); - struct address_space *mapping = inode->i_mapping; - struct backing_dev_info *bdi = mapping->backing_dev_info; long pages_skipped; - if (!bdi_cap_writeback_dirty(bdi)) { + /* + * super block given and doesn't match, skip this inode + */ + if (sb && sb != inode->i_sb) { + redirty_tail(inode); + continue; + } + + if (!bdi_cap_writeback_dirty(wb->bdi)) { redirty_tail(inode); - if (sb_is_blkdev_sb(sb)) { + if (is_blkdev_sb) { /* * Dirty memory-backed blockdev: the ramdisk * driver does this. Skip just this inode @@ -497,21 +582,14 @@ void generic_sync_sb_inodes(struct super_block *sb, continue; } - if (wbc->nonblocking && bdi_write_congested(bdi)) { + if (wbc->nonblocking && bdi_write_congested(wb->bdi)) { wbc->encountered_congestion = 1; - if (!sb_is_blkdev_sb(sb)) + if (!is_blkdev_sb) break; /* Skip a congested fs */ requeue_io(inode); continue; /* Skip a congested blockdev */ } - if (wbc->bdi && bdi != wbc->bdi) { - if (!sb_is_blkdev_sb(sb)) - break; /* fs has the wrong queue */ - requeue_io(inode); - continue; /* blockdev has wrong queue */ - } - /* * Was this inode dirtied after sync_sb_inodes was called? * This keeps sync from extra jobs and livelock. @@ -519,16 +597,16 @@ void generic_sync_sb_inodes(struct super_block *sb, if (inode_dirtied_after(inode, start)) break; - /* Is another pdflush already flushing this queue? */ - if (current_is_pdflush() && !writeback_acquire(bdi)) - break; + if (pin_sb_for_writeback(wbc, inode)) { + requeue_io(inode); + continue; + } BUG_ON(inode->i_state & (I_FREEING | I_CLEAR)); __iget(inode); pages_skipped = wbc->pages_skipped; writeback_single_inode(inode, wbc); - if (current_is_pdflush()) - writeback_release(bdi); + unpin_sb_for_writeback(wbc, inode); if (wbc->pages_skipped != pages_skipped) { /* * writeback is not making progress due to locked @@ -544,144 +622,571 @@ void generic_sync_sb_inodes(struct super_block *sb, wbc->more_io = 1; break; } - if (!list_empty(&sb->s_more_io)) + if (!list_empty(&wb->b_more_io)) wbc->more_io = 1; } - if (sync) { - struct inode *inode, *old_inode = NULL; + spin_unlock(&inode_lock); + /* Leave any unwritten inodes on b_io */ +} + +void writeback_inodes_wbc(struct writeback_control *wbc) +{ + struct backing_dev_info *bdi = wbc->bdi; + + writeback_inodes_wb(&bdi->wb, wbc); +} + +/* + * The maximum number of pages to writeout in a single bdi flush/kupdate + * operation. We do this so we don't hold I_SYNC against an inode for + * enormous amounts of time, which would block a userspace task which has + * been forced to throttle against that inode. Also, the code reevaluates + * the dirty each time it has written this many pages. + */ +#define MAX_WRITEBACK_PAGES 1024 + +static inline bool over_bground_thresh(void) +{ + unsigned long background_thresh, dirty_thresh; + + get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); + + return (global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) >= background_thresh); +} + +/* + * Explicit flushing or periodic writeback of "old" data. + * + * Define "old": the first time one of an inode's pages is dirtied, we mark the + * dirtying-time in the inode's address_space. So this periodic writeback code + * just walks the superblock inode list, writing back any inodes which are + * older than a specific point in time. + * + * Try to run once per dirty_writeback_interval. But if a writeback event + * takes longer than a dirty_writeback_interval interval, then leave a + * one-second gap. + * + * older_than_this takes precedence over nr_to_write. So we'll only write back + * all dirty pages if they are all attached to "old" mappings. + */ +static long wb_writeback(struct bdi_writeback *wb, long nr_pages, + struct super_block *sb, + enum writeback_sync_modes sync_mode, int for_kupdate) +{ + struct writeback_control wbc = { + .bdi = wb->bdi, + .sb = sb, + .sync_mode = sync_mode, + .older_than_this = NULL, + .for_kupdate = for_kupdate, + .range_cyclic = 1, + }; + unsigned long oldest_jif; + long wrote = 0; + + if (wbc.for_kupdate) { + wbc.older_than_this = &oldest_jif; + oldest_jif = jiffies - + msecs_to_jiffies(dirty_expire_interval * 10); + } + + for (;;) { + /* + * Don't flush anything for non-integrity writeback where + * no nr_pages was given + */ + if (!for_kupdate && nr_pages <= 0 && sync_mode == WB_SYNC_NONE) + break; /* - * Data integrity sync. Must wait for all pages under writeback, - * because there may have been pages dirtied before our sync - * call, but which had writeout started before we write it out. - * In which case, the inode may not be on the dirty list, but - * we still have to wait for that writeout. + * If no specific pages were given and this is just a + * periodic background writeout and we are below the + * background dirty threshold, don't do anything */ - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { - struct address_space *mapping; + if (for_kupdate && nr_pages <= 0 && !over_bground_thresh()) + break; - if (inode->i_state & - (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) - continue; - mapping = inode->i_mapping; - if (mapping->nrpages == 0) + wbc.more_io = 0; + wbc.encountered_congestion = 0; + wbc.nr_to_write = MAX_WRITEBACK_PAGES; + wbc.pages_skipped = 0; + writeback_inodes_wb(wb, &wbc); + nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; + wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; + + /* + * If we ran out of stuff to write, bail unless more_io got set + */ + if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { + if (wbc.more_io && !wbc.for_kupdate) continue; - __iget(inode); - spin_unlock(&inode_lock); + break; + } + } + + return wrote; +} + +/* + * Return the next bdi_work struct that hasn't been processed by this + * wb thread yet + */ +static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi, + struct bdi_writeback *wb) +{ + struct bdi_work *work, *ret = NULL; + + rcu_read_lock(); + + list_for_each_entry_rcu(work, &bdi->work_list, list) { + if (!test_and_clear_bit(wb->nr, &work->seen)) + continue; + + ret = work; + break; + } + + rcu_read_unlock(); + return ret; +} + +static long wb_check_old_data_flush(struct bdi_writeback *wb) +{ + unsigned long expired; + long nr_pages; + + expired = wb->last_old_flush + + msecs_to_jiffies(dirty_writeback_interval * 10); + if (time_before(jiffies, expired)) + return 0; + + wb->last_old_flush = jiffies; + nr_pages = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS) + + (inodes_stat.nr_inodes - inodes_stat.nr_unused); + + if (nr_pages) + return wb_writeback(wb, nr_pages, NULL, WB_SYNC_NONE, 1); + + return 0; +} + +/* + * Retrieve work items and do the writeback they describe + */ +long wb_do_writeback(struct bdi_writeback *wb, int force_wait) +{ + struct backing_dev_info *bdi = wb->bdi; + struct bdi_work *work; + long nr_pages, wrote = 0; + + while ((work = get_next_work_item(bdi, wb)) != NULL) { + enum writeback_sync_modes sync_mode; + + nr_pages = work->nr_pages; + + /* + * Override sync mode, in case we must wait for completion + */ + if (force_wait) + work->sync_mode = sync_mode = WB_SYNC_ALL; + else + sync_mode = work->sync_mode; + + /* + * If this isn't a data integrity operation, just notify + * that we have seen this work and we are now starting it. + */ + if (sync_mode == WB_SYNC_NONE) + wb_clear_pending(wb, work); + + wrote += wb_writeback(wb, nr_pages, work->sb, sync_mode, 0); + + /* + * This is a data integrity writeback, so only do the + * notification when we have completed the work. + */ + if (sync_mode == WB_SYNC_ALL) + wb_clear_pending(wb, work); + } + + /* + * Check for periodic writeback, kupdated() style + */ + wrote += wb_check_old_data_flush(wb); + + return wrote; +} + +/* + * Handle writeback of dirty data for the device backed by this bdi. Also + * wakes up periodically and does kupdated style flushing. + */ +int bdi_writeback_task(struct bdi_writeback *wb) +{ + unsigned long last_active = jiffies; + unsigned long wait_jiffies = -1UL; + long pages_written; + + while (!kthread_should_stop()) { + pages_written = wb_do_writeback(wb, 0); + + if (pages_written) + last_active = jiffies; + else if (wait_jiffies != -1UL) { + unsigned long max_idle; + /* - * We hold a reference to 'inode' so it couldn't have - * been removed from s_inodes list while we dropped the - * inode_lock. We cannot iput the inode now as we can - * be holding the last reference and we cannot iput it - * under inode_lock. So we keep the reference and iput - * it later. + * Longest period of inactivity that we tolerate. If we + * see dirty data again later, the task will get + * recreated automatically. */ - iput(old_inode); - old_inode = inode; + max_idle = max(5UL * 60 * HZ, wait_jiffies); + if (time_after(jiffies, max_idle + last_active)) + break; + } + + wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(wait_jiffies); + try_to_freeze(); + } + + return 0; +} + +/* + * Schedule writeback for all backing devices. Expensive! If this is a data + * integrity operation, writeback will be complete when this returns. If + * we are simply called for WB_SYNC_NONE, then writeback will merely be + * scheduled to run. + */ +static void bdi_writeback_all(struct writeback_control *wbc) +{ + const bool must_wait = wbc->sync_mode == WB_SYNC_ALL; + struct backing_dev_info *bdi; + struct bdi_work *work; + LIST_HEAD(list); + +restart: + spin_lock(&bdi_lock); + + list_for_each_entry(bdi, &bdi_list, bdi_list) { + struct bdi_work *work; + + if (!bdi_has_dirty_io(bdi)) + continue; - filemap_fdatawait(mapping); + /* + * If work allocation fails, do the writes inline. We drop + * the lock and restart the list writeout. This should be OK, + * since this happens rarely and because the writeout should + * eventually make more free memory available. + */ + work = bdi_alloc_work(wbc); + if (!work) { + struct writeback_control __wbc; - cond_resched(); + /* + * Not a data integrity writeout, just continue + */ + if (!must_wait) + continue; - spin_lock(&inode_lock); + spin_unlock(&bdi_lock); + __wbc = *wbc; + __wbc.bdi = bdi; + writeback_inodes_wbc(&__wbc); + goto restart; } - spin_unlock(&inode_lock); - iput(old_inode); - } else - spin_unlock(&inode_lock); + if (must_wait) + list_add_tail(&work->wait_list, &list); + + bdi_queue_work(bdi, work); + } + + spin_unlock(&bdi_lock); - return; /* Leave any unwritten inodes on s_io */ + /* + * If this is for WB_SYNC_ALL, wait for pending work to complete + * before returning. + */ + while (!list_empty(&list)) { + work = list_entry(list.next, struct bdi_work, wait_list); + list_del(&work->wait_list); + bdi_wait_on_work_clear(work); + call_rcu(&work->rcu_head, bdi_work_free); + } } -EXPORT_SYMBOL_GPL(generic_sync_sb_inodes); -static void sync_sb_inodes(struct super_block *sb, - struct writeback_control *wbc) +/* + * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back + * the whole world. + */ +void wakeup_flusher_threads(long nr_pages) { - generic_sync_sb_inodes(sb, wbc); + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .older_than_this = NULL, + .range_cyclic = 1, + }; + + if (nr_pages == 0) + nr_pages = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS); + wbc.nr_to_write = nr_pages; + bdi_writeback_all(&wbc); } -/* - * Start writeback of dirty pagecache data against all unlocked inodes. +static noinline void block_dump___mark_inode_dirty(struct inode *inode) +{ + if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { + struct dentry *dentry; + const char *name = "?"; + + dentry = d_find_alias(inode); + if (dentry) { + spin_lock(&dentry->d_lock); + name = (const char *) dentry->d_name.name; + } + printk(KERN_DEBUG + "%s(%d): dirtied inode %lu (%s) on %s\n", + current->comm, task_pid_nr(current), inode->i_ino, + name, inode->i_sb->s_id); + if (dentry) { + spin_unlock(&dentry->d_lock); + dput(dentry); + } + } +} + +/** + * __mark_inode_dirty - internal function + * @inode: inode to mark + * @flags: what kind of dirty (i.e. I_DIRTY_SYNC) + * Mark an inode as dirty. Callers should use mark_inode_dirty or + * mark_inode_dirty_sync. + * + * Put the inode on the super block's dirty list. + * + * CAREFUL! We mark it dirty unconditionally, but move it onto the + * dirty list only if it is hashed or if it refers to a blockdev. + * If it was not hashed, it will never be added to the dirty list + * even if it is later hashed, as it will have been marked dirty already. * - * Note: - * We don't need to grab a reference to superblock here. If it has non-empty - * ->s_dirty it's hadn't been killed yet and kill_super() won't proceed - * past sync_inodes_sb() until the ->s_dirty/s_io/s_more_io lists are all - * empty. Since __sync_single_inode() regains inode_lock before it finally moves - * inode from superblock lists we are OK. + * In short, make sure you hash any inodes _before_ you start marking + * them dirty. * - * If `older_than_this' is non-zero then only flush inodes which have a - * flushtime older than *older_than_this. + * This function *must* be atomic for the I_DIRTY_PAGES case - + * set_page_dirty() is called under spinlock in several places. * - * If `bdi' is non-zero then we will scan the first inode against each - * superblock until we find the matching ones. One group will be the dirty - * inodes against a filesystem. Then when we hit the dummy blockdev superblock, - * sync_sb_inodes will seekout the blockdev which matches `bdi'. Maybe not - * super-efficient but we're about to do a ton of I/O... + * Note that for blockdevs, inode->dirtied_when represents the dirtying time of + * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of + * the kernel-internal blockdev inode represents the dirtying time of the + * blockdev's pages. This is why for I_DIRTY_PAGES we always use + * page->mapping->host, so the page-dirtying time is recorded in the internal + * blockdev inode. */ -void -writeback_inodes(struct writeback_control *wbc) +void __mark_inode_dirty(struct inode *inode, int flags) { - struct super_block *sb; + struct super_block *sb = inode->i_sb; - might_sleep(); - spin_lock(&sb_lock); -restart: - list_for_each_entry_reverse(sb, &super_blocks, s_list) { - if (sb_has_dirty_inodes(sb)) { - /* we're making our own get_super here */ - sb->s_count++; - spin_unlock(&sb_lock); - /* - * If we can't get the readlock, there's no sense in - * waiting around, most of the time the FS is going to - * be unmounted by the time it is released. - */ - if (down_read_trylock(&sb->s_umount)) { - if (sb->s_root) - sync_sb_inodes(sb, wbc); - up_read(&sb->s_umount); + /* + * Don't do this for I_DIRTY_PAGES - that doesn't actually + * dirty the inode itself + */ + if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { + if (sb->s_op->dirty_inode) + sb->s_op->dirty_inode(inode); + } + + /* + * make sure that changes are seen by all cpus before we test i_state + * -- mikulas + */ + smp_mb(); + + /* avoid the locking if we can */ + if ((inode->i_state & flags) == flags) + return; + + if (unlikely(block_dump)) + block_dump___mark_inode_dirty(inode); + + spin_lock(&inode_lock); + if ((inode->i_state & flags) != flags) { + const int was_dirty = inode->i_state & I_DIRTY; + + inode->i_state |= flags; + + /* + * If the inode is being synced, just update its dirty state. + * The unlocker will place the inode on the appropriate + * superblock list, based upon its state. + */ + if (inode->i_state & I_SYNC) + goto out; + + /* + * Only add valid (hashed) inodes to the superblock's + * dirty list. Add blockdev inodes as well. + */ + if (!S_ISBLK(inode->i_mode)) { + if (hlist_unhashed(&inode->i_hash)) + goto out; + } + if (inode->i_state & (I_FREEING|I_CLEAR)) + goto out; + + /* + * If the inode was already on b_dirty/b_io/b_more_io, don't + * reposition it (that would break b_dirty time-ordering). + */ + if (!was_dirty) { + struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; + struct backing_dev_info *bdi = wb->bdi; + + if (bdi_cap_writeback_dirty(bdi) && + !test_bit(BDI_registered, &bdi->state)) { + WARN_ON(1); + printk(KERN_ERR "bdi-%s not registered\n", + bdi->name); } - spin_lock(&sb_lock); - if (__put_super_and_need_restart(sb)) - goto restart; + + inode->dirtied_when = jiffies; + list_move(&inode->i_list, &wb->b_dirty); } - if (wbc->nr_to_write <= 0) - break; } - spin_unlock(&sb_lock); +out: + spin_unlock(&inode_lock); } +EXPORT_SYMBOL(__mark_inode_dirty); /* - * writeback and wait upon the filesystem's dirty inodes. The caller will - * do this in two passes - one to write, and one to wait. + * Write out a superblock's list of dirty inodes. A wait will be performed + * upon no inodes, all inodes or the final one, depending upon sync_mode. + * + * If older_than_this is non-NULL, then only write out inodes which + * had their first dirtying at a time earlier than *older_than_this. + * + * If we're a pdlfush thread, then implement pdflush collision avoidance + * against the entire list. * - * A finite limit is set on the number of pages which will be written. - * To prevent infinite livelock of sys_sync(). + * If `bdi' is non-zero then we're being asked to writeback a specific queue. + * This function assumes that the blockdev superblock's inodes are backed by + * a variety of queues, so all inodes are searched. For other superblocks, + * assume that all inodes are backed by the same queue. * - * We add in the number of potentially dirty inodes, because each inode write - * can dirty pagecache in the underlying blockdev. + * The inodes to be written are parked on bdi->b_io. They are moved back onto + * bdi->b_dirty as they are selected for writing. This way, none can be missed + * on the writer throttling path, and we get decent balancing between many + * throttled threads: we don't want them all piling up on inode_sync_wait. + */ +static void wait_sb_inodes(struct writeback_control *wbc) +{ + struct inode *inode, *old_inode = NULL; + + /* + * We need to be protected against the filesystem going from + * r/o to r/w or vice versa. + */ + WARN_ON(!rwsem_is_locked(&wbc->sb->s_umount)); + + spin_lock(&inode_lock); + + /* + * Data integrity sync. Must wait for all pages under writeback, + * because there may have been pages dirtied before our sync + * call, but which had writeout started before we write it out. + * In which case, the inode may not be on the dirty list, but + * we still have to wait for that writeout. + */ + list_for_each_entry(inode, &wbc->sb->s_inodes, i_sb_list) { + struct address_space *mapping; + + if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) + continue; + mapping = inode->i_mapping; + if (mapping->nrpages == 0) + continue; + __iget(inode); + spin_unlock(&inode_lock); + /* + * We hold a reference to 'inode' so it couldn't have + * been removed from s_inodes list while we dropped the + * inode_lock. We cannot iput the inode now as we can + * be holding the last reference and we cannot iput it + * under inode_lock. So we keep the reference and iput + * it later. + */ + iput(old_inode); + old_inode = inode; + + filemap_fdatawait(mapping); + + cond_resched(); + + spin_lock(&inode_lock); + } + spin_unlock(&inode_lock); + iput(old_inode); +} + +/** + * writeback_inodes_sb - writeback dirty inodes from given super_block + * @sb: the superblock + * + * Start writeback on some inodes on this super_block. No guarantees are made + * on how many (if any) will be written, and this function does not wait + * for IO completion of submitted IO. The number of pages submitted is + * returned. */ -void sync_inodes_sb(struct super_block *sb, int wait) +long writeback_inodes_sb(struct super_block *sb) { struct writeback_control wbc = { - .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, + .sb = sb, + .sync_mode = WB_SYNC_NONE, .range_start = 0, .range_end = LLONG_MAX, }; + unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); + unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); + long nr_to_write; - if (!wait) { - unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); - unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); - - wbc.nr_to_write = nr_dirty + nr_unstable + + nr_to_write = nr_dirty + nr_unstable + (inodes_stat.nr_inodes - inodes_stat.nr_unused); - } else - wbc.nr_to_write = LONG_MAX; /* doesn't actually matter */ - sync_sb_inodes(sb, &wbc); + wbc.nr_to_write = nr_to_write; + bdi_writeback_all(&wbc); + return nr_to_write - wbc.nr_to_write; } +EXPORT_SYMBOL(writeback_inodes_sb); + +/** + * sync_inodes_sb - sync sb inode pages + * @sb: the superblock + * + * This function writes and waits on any dirty inode belonging to this + * super_block. The number of pages synced is returned. + */ +long sync_inodes_sb(struct super_block *sb) +{ + struct writeback_control wbc = { + .sb = sb, + .sync_mode = WB_SYNC_ALL, + .range_start = 0, + .range_end = LLONG_MAX, + }; + long nr_to_write = LONG_MAX; /* doesn't actually matter */ + + wbc.nr_to_write = nr_to_write; + bdi_writeback_all(&wbc); + wait_sb_inodes(&wbc); + return nr_to_write - wbc.nr_to_write; +} +EXPORT_SYMBOL(sync_inodes_sb); /** * write_inode_now - write an inode to disk @@ -737,57 +1242,3 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc) return ret; } EXPORT_SYMBOL(sync_inode); - -/** - * generic_osync_inode - flush all dirty data for a given inode to disk - * @inode: inode to write - * @mapping: the address_space that should be flushed - * @what: what to write and wait upon - * - * This can be called by file_write functions for files which have the - * O_SYNC flag set, to flush dirty writes to disk. - * - * @what is a bitmask, specifying which part of the inode's data should be - * written and waited upon. - * - * OSYNC_DATA: i_mapping's dirty data - * OSYNC_METADATA: the buffers at i_mapping->private_list - * OSYNC_INODE: the inode itself - */ - -int generic_osync_inode(struct inode *inode, struct address_space *mapping, int what) -{ - int err = 0; - int need_write_inode_now = 0; - int err2; - - if (what & OSYNC_DATA) - err = filemap_fdatawrite(mapping); - if (what & (OSYNC_METADATA|OSYNC_DATA)) { - err2 = sync_mapping_buffers(mapping); - if (!err) - err = err2; - } - if (what & OSYNC_DATA) { - err2 = filemap_fdatawait(mapping); - if (!err) - err = err2; - } - - spin_lock(&inode_lock); - if ((inode->i_state & I_DIRTY) && - ((what & OSYNC_INODE) || (inode->i_state & I_DIRTY_DATASYNC))) - need_write_inode_now = 1; - spin_unlock(&inode_lock); - - if (need_write_inode_now) { - err2 = write_inode_now(inode, 1); - if (!err) - err = err2; - } - else - inode_sync_wait(inode); - - return err; -} -EXPORT_SYMBOL(generic_osync_inode); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index f91ccc4a189d..4567db6f9430 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -801,6 +801,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) { int err; + fc->bdi.name = "fuse"; fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; fc->bdi.unplug_io_fn = default_unplug_io_fn; /* fuse does it's own writeback accounting */ diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile index 3da2f1f4f738..21f7e46da4c0 100644 --- a/fs/gfs2/Makefile +++ b/fs/gfs2/Makefile @@ -1,6 +1,6 @@ EXTRA_CFLAGS := -I$(src) obj-$(CONFIG_GFS2_FS) += gfs2.o -gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \ +gfs2-y := acl.o bmap.o dir.o xattr.o glock.o \ glops.o inode.o log.o lops.o main.o meta_io.o \ aops.o dentry.o export.o file.o \ ops_fstype.o ops_inode.o quota.o \ diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index fa881bdc3d85..3fc4e3ac7d84 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -19,8 +19,7 @@ #include "gfs2.h" #include "incore.h" #include "acl.h" -#include "eaops.h" -#include "eattr.h" +#include "xattr.h" #include "glock.h" #include "inode.h" #include "meta_io.h" @@ -31,8 +30,7 @@ #define ACL_DEFAULT 0 int gfs2_acl_validate_set(struct gfs2_inode *ip, int access, - struct gfs2_ea_request *er, - int *remove, mode_t *mode) + struct gfs2_ea_request *er, int *remove, mode_t *mode) { struct posix_acl *acl; int error; @@ -83,30 +81,20 @@ int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access) return 0; } -static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl, - struct gfs2_ea_location *el, char **data, unsigned int *len) +static int acl_get(struct gfs2_inode *ip, const char *name, + struct posix_acl **acl, struct gfs2_ea_location *el, + char **datap, unsigned int *lenp) { - struct gfs2_ea_request er; - struct gfs2_ea_location el_this; + char *data; + unsigned int len; int error; + el->el_bh = NULL; + if (!ip->i_eattr) return 0; - memset(&er, 0, sizeof(struct gfs2_ea_request)); - if (access) { - er.er_name = GFS2_POSIX_ACL_ACCESS; - er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN; - } else { - er.er_name = GFS2_POSIX_ACL_DEFAULT; - er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN; - } - er.er_type = GFS2_EATYPE_SYS; - - if (!el) - el = &el_this; - - error = gfs2_ea_find(ip, &er, el); + error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, name, el); if (error) return error; if (!el->el_ea) @@ -114,32 +102,31 @@ static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl, if (!GFS2_EA_DATA_LEN(el->el_ea)) goto out; - er.er_data_len = GFS2_EA_DATA_LEN(el->el_ea); - er.er_data = kmalloc(er.er_data_len, GFP_NOFS); + len = GFS2_EA_DATA_LEN(el->el_ea); + data = kmalloc(len, GFP_NOFS); error = -ENOMEM; - if (!er.er_data) + if (!data) goto out; - error = gfs2_ea_get_copy(ip, el, er.er_data); - if (error) + error = gfs2_ea_get_copy(ip, el, data, len); + if (error < 0) goto out_kfree; + error = 0; if (acl) { - *acl = posix_acl_from_xattr(er.er_data, er.er_data_len); + *acl = posix_acl_from_xattr(data, len); if (IS_ERR(*acl)) error = PTR_ERR(*acl); } out_kfree: - if (error || !data) - kfree(er.er_data); - else { - *data = er.er_data; - *len = er.er_data_len; + if (error || !datap) { + kfree(data); + } else { + *datap = data; + *lenp = len; } out: - if (error || el == &el_this) - brelse(el->el_bh); return error; } @@ -153,10 +140,12 @@ out: int gfs2_check_acl(struct inode *inode, int mask) { + struct gfs2_ea_location el; struct posix_acl *acl = NULL; int error; - error = acl_get(GFS2_I(inode), ACL_ACCESS, &acl, NULL, NULL, NULL); + error = acl_get(GFS2_I(inode), GFS2_POSIX_ACL_ACCESS, &acl, &el, NULL, NULL); + brelse(el.el_bh); if (error) return error; @@ -196,10 +185,12 @@ static int munge_mode(struct gfs2_inode *ip, mode_t mode) int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip) { + struct gfs2_ea_location el; struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct posix_acl *acl = NULL, *clone; - struct gfs2_ea_request er; mode_t mode = ip->i_inode.i_mode; + char *data = NULL; + unsigned int len; int error; if (!sdp->sd_args.ar_posix_acl) @@ -207,11 +198,8 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip) if (S_ISLNK(ip->i_inode.i_mode)) return 0; - memset(&er, 0, sizeof(struct gfs2_ea_request)); - er.er_type = GFS2_EATYPE_SYS; - - error = acl_get(dip, ACL_DEFAULT, &acl, NULL, - &er.er_data, &er.er_data_len); + error = acl_get(dip, GFS2_POSIX_ACL_DEFAULT, &acl, &el, &data, &len); + brelse(el.el_bh); if (error) return error; if (!acl) { @@ -229,9 +217,8 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip) acl = clone; if (S_ISDIR(ip->i_inode.i_mode)) { - er.er_name = GFS2_POSIX_ACL_DEFAULT; - er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN; - error = gfs2_system_eaops.eo_set(ip, &er); + error = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SYS, + GFS2_POSIX_ACL_DEFAULT, data, len, 0); if (error) goto out; } @@ -239,21 +226,19 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip) error = posix_acl_create_masq(acl, &mode); if (error < 0) goto out; - if (error > 0) { - er.er_name = GFS2_POSIX_ACL_ACCESS; - er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN; - posix_acl_to_xattr(acl, er.er_data, er.er_data_len); - er.er_mode = mode; - er.er_flags = GFS2_ERF_MODE; - error = gfs2_system_eaops.eo_set(ip, &er); - if (error) - goto out; - } else - munge_mode(ip, mode); + if (error == 0) + goto munge; + posix_acl_to_xattr(acl, data, len); + error = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SYS, + GFS2_POSIX_ACL_ACCESS, data, len, 0); + if (error) + goto out; +munge: + error = munge_mode(ip, mode); out: posix_acl_release(acl); - kfree(er.er_data); + kfree(data); return error; } @@ -265,9 +250,9 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) unsigned int len; int error; - error = acl_get(ip, ACL_ACCESS, &acl, &el, &data, &len); + error = acl_get(ip, GFS2_POSIX_ACL_ACCESS, &acl, &el, &data, &len); if (error) - return error; + goto out_brelse; if (!acl) return gfs2_setattr_simple(ip, attr); @@ -286,8 +271,9 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) out: posix_acl_release(acl); - brelse(el.el_bh); kfree(data); +out_brelse: + brelse(el.el_bh); return error; } diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 03ebb439ace0..7ebae9a4ecc0 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -624,6 +624,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, { struct gfs2_inode *ip = GFS2_I(mapping->host); struct gfs2_sbd *sdp = GFS2_SB(mapping->host); + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); unsigned int data_blocks = 0, ind_blocks = 0, rblocks; int alloc_required; int error = 0; @@ -637,6 +638,14 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, error = gfs2_glock_nq(&ip->i_gh); if (unlikely(error)) goto out_uninit; + if (&ip->i_inode == sdp->sd_rindex) { + error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, + GL_NOCACHE, &m_ip->i_gh); + if (unlikely(error)) { + gfs2_glock_dq(&ip->i_gh); + goto out_uninit; + } + } error = gfs2_write_alloc_required(ip, pos, len, &alloc_required); if (error) @@ -667,6 +676,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, rblocks += data_blocks ? data_blocks : 1; if (ind_blocks || data_blocks) rblocks += RES_STATFS + RES_QUOTA; + if (&ip->i_inode == sdp->sd_rindex) + rblocks += 2 * RES_STATFS; error = gfs2_trans_begin(sdp, rblocks, PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); @@ -712,6 +723,10 @@ out_alloc_put: gfs2_alloc_put(ip); } out_unlock: + if (&ip->i_inode == sdp->sd_rindex) { + gfs2_glock_dq(&m_ip->i_gh); + gfs2_holder_uninit(&m_ip->i_gh); + } gfs2_glock_dq(&ip->i_gh); out_uninit: gfs2_holder_uninit(&ip->i_gh); @@ -725,14 +740,21 @@ out_uninit: static void adjust_fs_space(struct inode *inode) { struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + struct buffer_head *m_bh, *l_bh; u64 fs_total, new_free; /* Total up the file system space, according to the latest rindex. */ fs_total = gfs2_ri_total(sdp); + if (gfs2_meta_inode_buffer(m_ip, &m_bh) != 0) + return; spin_lock(&sdp->sd_statfs_spin); + gfs2_statfs_change_in(m_sc, m_bh->b_data + + sizeof(struct gfs2_dinode)); if (fs_total > (m_sc->sc_total + l_sc->sc_total)) new_free = fs_total - (m_sc->sc_total + l_sc->sc_total); else @@ -741,6 +763,13 @@ static void adjust_fs_space(struct inode *inode) fs_warn(sdp, "File system extended by %llu blocks.\n", (unsigned long long)new_free); gfs2_statfs_change(sdp, new_free, new_free, 0); + + if (gfs2_meta_inode_buffer(l_ip, &l_bh) != 0) + goto out; + update_statfs(sdp, m_bh, l_bh); + brelse(l_bh); +out: + brelse(m_bh); } /** @@ -763,6 +792,7 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); u64 to = pos + copied; void *kaddr; unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode); @@ -794,6 +824,10 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, brelse(dibh); gfs2_trans_end(sdp); + if (inode == sdp->sd_rindex) { + gfs2_glock_dq(&m_ip->i_gh); + gfs2_holder_uninit(&m_ip->i_gh); + } gfs2_glock_dq(&ip->i_gh); gfs2_holder_uninit(&ip->i_gh); return copied; @@ -823,6 +857,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, struct inode *inode = page->mapping->host; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); struct buffer_head *dibh; struct gfs2_alloc *al = ip->i_alloc; unsigned int from = pos & (PAGE_CACHE_SIZE - 1); @@ -865,6 +900,10 @@ failed: gfs2_quota_unlock(ip); gfs2_alloc_put(ip); } + if (inode == sdp->sd_rindex) { + gfs2_glock_dq(&m_ip->i_gh); + gfs2_holder_uninit(&m_ip->i_gh); + } gfs2_glock_dq(&ip->i_gh); gfs2_holder_uninit(&ip->i_gh); return ret; diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index 022c66cd5606..91beddadd388 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -107,8 +107,26 @@ static int gfs2_dhash(struct dentry *dentry, struct qstr *str) return 0; } +static int gfs2_dentry_delete(struct dentry *dentry) +{ + struct gfs2_inode *ginode; + + if (!dentry->d_inode) + return 0; + + ginode = GFS2_I(dentry->d_inode); + if (!ginode->i_iopen_gh.gh_gl) + return 0; + + if (test_bit(GLF_DEMOTE, &ginode->i_iopen_gh.gh_gl->gl_flags)) + return 1; + + return 0; +} + const struct dentry_operations gfs2_dops = { .d_revalidate = gfs2_drevalidate, .d_hash = gfs2_dhash, + .d_delete = gfs2_dentry_delete, }; diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c deleted file mode 100644 index dee9b03e5b37..000000000000 --- a/fs/gfs2/eaops.c +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include <linux/slab.h> -#include <linux/spinlock.h> -#include <linux/completion.h> -#include <linux/buffer_head.h> -#include <linux/capability.h> -#include <linux/xattr.h> -#include <linux/gfs2_ondisk.h> -#include <asm/uaccess.h> - -#include "gfs2.h" -#include "incore.h" -#include "acl.h" -#include "eaops.h" -#include "eattr.h" -#include "util.h" - -/** - * gfs2_ea_name2type - get the type of the ea, and truncate type from the name - * @namep: ea name, possibly with type appended - * - * Returns: GFS2_EATYPE_XXX - */ - -unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name) -{ - unsigned int type; - - if (strncmp(name, "system.", 7) == 0) { - type = GFS2_EATYPE_SYS; - if (truncated_name) - *truncated_name = name + sizeof("system.") - 1; - } else if (strncmp(name, "user.", 5) == 0) { - type = GFS2_EATYPE_USR; - if (truncated_name) - *truncated_name = name + sizeof("user.") - 1; - } else if (strncmp(name, "security.", 9) == 0) { - type = GFS2_EATYPE_SECURITY; - if (truncated_name) - *truncated_name = name + sizeof("security.") - 1; - } else { - type = GFS2_EATYPE_UNUSED; - if (truncated_name) - *truncated_name = NULL; - } - - return type; -} - -static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er) -{ - if (!GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) && - !GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len) && - !capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl == 0 && - (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) || - GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len))) - return -EOPNOTSUPP; - - return gfs2_ea_get_i(ip, er); -} - -static int system_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er) -{ - int remove = 0; - int error; - - if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) { - if (!(er->er_flags & GFS2_ERF_MODE)) { - er->er_mode = ip->i_inode.i_mode; - er->er_flags |= GFS2_ERF_MODE; - } - error = gfs2_acl_validate_set(ip, 1, er, - &remove, &er->er_mode); - if (error) - return error; - error = gfs2_ea_set_i(ip, er); - if (error) - return error; - if (remove) - gfs2_ea_remove_i(ip, er); - return 0; - - } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) { - error = gfs2_acl_validate_set(ip, 0, er, - &remove, NULL); - if (error) - return error; - if (!remove) - error = gfs2_ea_set_i(ip, er); - else { - error = gfs2_ea_remove_i(ip, er); - if (error == -ENODATA) - error = 0; - } - return error; - } - - return -EPERM; -} - -static int system_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er) -{ - if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) { - int error = gfs2_acl_validate_remove(ip, 1); - if (error) - return error; - - } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) { - int error = gfs2_acl_validate_remove(ip, 0); - if (error) - return error; - - } else - return -EPERM; - - return gfs2_ea_remove_i(ip, er); -} - -static const struct gfs2_eattr_operations gfs2_user_eaops = { - .eo_get = gfs2_ea_get_i, - .eo_set = gfs2_ea_set_i, - .eo_remove = gfs2_ea_remove_i, - .eo_name = "user", -}; - -const struct gfs2_eattr_operations gfs2_system_eaops = { - .eo_get = system_eo_get, - .eo_set = system_eo_set, - .eo_remove = system_eo_remove, - .eo_name = "system", -}; - -static const struct gfs2_eattr_operations gfs2_security_eaops = { - .eo_get = gfs2_ea_get_i, - .eo_set = gfs2_ea_set_i, - .eo_remove = gfs2_ea_remove_i, - .eo_name = "security", -}; - -const struct gfs2_eattr_operations *gfs2_ea_ops[] = { - NULL, - &gfs2_user_eaops, - &gfs2_system_eaops, - &gfs2_security_eaops, -}; - diff --git a/fs/gfs2/eaops.h b/fs/gfs2/eaops.h deleted file mode 100644 index da2f7fbbb40d..000000000000 --- a/fs/gfs2/eaops.h +++ /dev/null @@ -1,30 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#ifndef __EAOPS_DOT_H__ -#define __EAOPS_DOT_H__ - -struct gfs2_ea_request; -struct gfs2_inode; - -struct gfs2_eattr_operations { - int (*eo_get) (struct gfs2_inode *ip, struct gfs2_ea_request *er); - int (*eo_set) (struct gfs2_inode *ip, struct gfs2_ea_request *er); - int (*eo_remove) (struct gfs2_inode *ip, struct gfs2_ea_request *er); - char *eo_name; -}; - -unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name); - -extern const struct gfs2_eattr_operations gfs2_system_eaops; - -extern const struct gfs2_eattr_operations *gfs2_ea_ops[]; - -#endif /* __EAOPS_DOT_H__ */ - diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index 9200ef221716..d15876e9aa26 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c @@ -143,17 +143,14 @@ static struct dentry *gfs2_get_parent(struct dentry *child) } static struct dentry *gfs2_get_dentry(struct super_block *sb, - struct gfs2_inum_host *inum) + struct gfs2_inum_host *inum) { struct gfs2_sbd *sdp = sb->s_fs_info; - struct gfs2_holder i_gh, ri_gh, rgd_gh; - struct gfs2_rgrpd *rgd; + struct gfs2_holder i_gh; struct inode *inode; struct dentry *dentry; int error; - /* System files? */ - inode = gfs2_ilookup(sb, inum->no_addr); if (inode) { if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) { @@ -168,29 +165,11 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, if (error) return ERR_PTR(error); - error = gfs2_rindex_hold(sdp, &ri_gh); + error = gfs2_check_blk_type(sdp, inum->no_addr, GFS2_BLKST_DINODE); if (error) goto fail; - error = -EINVAL; - rgd = gfs2_blk2rgrpd(sdp, inum->no_addr); - if (!rgd) - goto fail_rindex; - - error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh); - if (error) - goto fail_rindex; - - error = -ESTALE; - if (gfs2_get_block_type(rgd, inum->no_addr) != GFS2_BLKST_DINODE) - goto fail_rgd; - - gfs2_glock_dq_uninit(&rgd_gh); - gfs2_glock_dq_uninit(&ri_gh); - - inode = gfs2_inode_lookup(sb, DT_UNKNOWN, - inum->no_addr, - 0, 0); + inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0, 0); if (IS_ERR(inode)) { error = PTR_ERR(inode); goto fail; @@ -224,13 +203,6 @@ out_inode: if (!IS_ERR(dentry)) dentry->d_op = &gfs2_dops; return dentry; - -fail_rgd: - gfs2_glock_dq_uninit(&rgd_gh); - -fail_rindex: - gfs2_glock_dq_uninit(&ri_gh); - fail: gfs2_glock_dq_uninit(&i_gh); return ERR_PTR(error); diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 73318a3ce6f1..166f38fbd246 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -38,7 +38,6 @@ #include "rgrp.h" #include "trans.h" #include "util.h" -#include "eaops.h" /** * gfs2_llseek - seek to a location in a file diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 297421c0427a..8b674b1f3a55 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -63,6 +63,7 @@ static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int static DECLARE_RWSEM(gfs2_umount_flush_sem); static struct dentry *gfs2_root; static struct workqueue_struct *glock_workqueue; +struct workqueue_struct *gfs2_delete_workqueue; static LIST_HEAD(lru_list); static atomic_t lru_count = ATOMIC_INIT(0); static DEFINE_SPINLOCK(lru_lock); @@ -167,13 +168,33 @@ static void glock_free(struct gfs2_glock *gl) * */ -static void gfs2_glock_hold(struct gfs2_glock *gl) +void gfs2_glock_hold(struct gfs2_glock *gl) { GLOCK_BUG_ON(gl, atomic_read(&gl->gl_ref) == 0); atomic_inc(&gl->gl_ref); } /** + * demote_ok - Check to see if it's ok to unlock a glock + * @gl: the glock + * + * Returns: 1 if it's ok + */ + +static int demote_ok(const struct gfs2_glock *gl) +{ + const struct gfs2_glock_operations *glops = gl->gl_ops; + + if (gl->gl_state == LM_ST_UNLOCKED) + return 0; + if (!list_empty(&gl->gl_holders)) + return 0; + if (glops->go_demote_ok) + return glops->go_demote_ok(gl); + return 1; +} + +/** * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list * @gl: the glock * @@ -181,8 +202,13 @@ static void gfs2_glock_hold(struct gfs2_glock *gl) static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) { + int may_reclaim; + may_reclaim = (demote_ok(gl) && + (atomic_read(&gl->gl_ref) == 1 || + (gl->gl_name.ln_type == LM_TYPE_INODE && + atomic_read(&gl->gl_ref) <= 2))); spin_lock(&lru_lock); - if (list_empty(&gl->gl_lru) && gl->gl_state != LM_ST_UNLOCKED) { + if (list_empty(&gl->gl_lru) && may_reclaim) { list_add_tail(&gl->gl_lru, &lru_list); atomic_inc(&lru_count); } @@ -190,6 +216,21 @@ static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) } /** + * gfs2_glock_put_nolock() - Decrement reference count on glock + * @gl: The glock to put + * + * This function should only be used if the caller has its own reference + * to the glock, in addition to the one it is dropping. + */ + +void gfs2_glock_put_nolock(struct gfs2_glock *gl) +{ + if (atomic_dec_and_test(&gl->gl_ref)) + GLOCK_BUG_ON(gl, 1); + gfs2_glock_schedule_for_reclaim(gl); +} + +/** * gfs2_glock_put() - Decrement reference count on glock * @gl: The glock to put * @@ -214,9 +255,9 @@ int gfs2_glock_put(struct gfs2_glock *gl) rv = 1; goto out; } - /* 1 for being hashed, 1 for having state != LM_ST_UNLOCKED */ - if (atomic_read(&gl->gl_ref) == 2) - gfs2_glock_schedule_for_reclaim(gl); + spin_lock(&gl->gl_spin); + gfs2_glock_schedule_for_reclaim(gl); + spin_unlock(&gl->gl_spin); write_unlock(gl_lock_addr(gl->gl_hash)); out: return rv; @@ -398,7 +439,7 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state) if (held2) gfs2_glock_hold(gl); else - gfs2_glock_put(gl); + gfs2_glock_put_nolock(gl); } gl->gl_state = new_state; @@ -633,12 +674,35 @@ out: out_sched: gfs2_glock_hold(gl); if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gfs2_glock_put(gl); + gfs2_glock_put_nolock(gl); out_unlock: clear_bit(GLF_LOCK, &gl->gl_flags); goto out; } +static void delete_work_func(struct work_struct *work) +{ + struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete); + struct gfs2_sbd *sdp = gl->gl_sbd; + struct gfs2_inode *ip = NULL; + struct inode *inode; + u64 no_addr = 0; + + spin_lock(&gl->gl_spin); + ip = (struct gfs2_inode *)gl->gl_object; + if (ip) + no_addr = ip->i_no_addr; + spin_unlock(&gl->gl_spin); + if (ip) { + inode = gfs2_ilookup(sdp->sd_vfs, no_addr); + if (inode) { + d_prune_aliases(inode); + iput(inode); + } + } + gfs2_glock_put(gl); +} + static void glock_work_func(struct work_struct *work) { unsigned long delay = 0; @@ -717,6 +781,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, gl->gl_sbd = sdp; gl->gl_aspace = NULL; INIT_DELAYED_WORK(&gl->gl_work, glock_work_func); + INIT_WORK(&gl->gl_delete, delete_work_func); /* If this glock protects actual on-disk data or metadata blocks, create a VFS inode to manage the pages/buffers holding them. */ @@ -858,6 +923,8 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state, gl->gl_demote_state != state) { gl->gl_demote_state = LM_ST_UNLOCKED; } + if (gl->gl_ops->go_callback) + gl->gl_ops->go_callback(gl); trace_gfs2_demote_rq(gl); } @@ -1274,33 +1341,12 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) gfs2_glock_put(gl); } -/** - * demote_ok - Check to see if it's ok to unlock a glock - * @gl: the glock - * - * Returns: 1 if it's ok - */ - -static int demote_ok(const struct gfs2_glock *gl) -{ - const struct gfs2_glock_operations *glops = gl->gl_ops; - - if (gl->gl_state == LM_ST_UNLOCKED) - return 0; - if (!list_empty(&gl->gl_holders)) - return 0; - if (glops->go_demote_ok) - return glops->go_demote_ok(gl); - return 1; -} - static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask) { struct gfs2_glock *gl; int may_demote; int nr_skipped = 0; - int got_ref = 0; LIST_HEAD(skipped); if (nr == 0) @@ -1315,37 +1361,29 @@ static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask) list_del_init(&gl->gl_lru); atomic_dec(&lru_count); + /* Check if glock is about to be freed */ + if (atomic_read(&gl->gl_ref) == 0) + continue; + /* Test for being demotable */ if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { gfs2_glock_hold(gl); - got_ref = 1; spin_unlock(&lru_lock); spin_lock(&gl->gl_spin); may_demote = demote_ok(gl); - spin_unlock(&gl->gl_spin); - clear_bit(GLF_LOCK, &gl->gl_flags); if (may_demote) { handle_callback(gl, LM_ST_UNLOCKED, 0); nr--; - if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gfs2_glock_put(gl); - got_ref = 0; } + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put_nolock(gl); + spin_unlock(&gl->gl_spin); + clear_bit(GLF_LOCK, &gl->gl_flags); spin_lock(&lru_lock); - if (may_demote) - continue; - } - if (list_empty(&gl->gl_lru) && - (atomic_read(&gl->gl_ref) <= (2 + got_ref))) { - nr_skipped++; - list_add(&gl->gl_lru, &skipped); - } - if (got_ref) { - spin_unlock(&lru_lock); - gfs2_glock_put(gl); - spin_lock(&lru_lock); - got_ref = 0; + continue; } + nr_skipped++; + list_add(&gl->gl_lru, &skipped); } list_splice(&skipped, &lru_list); atomic_add(nr_skipped, &lru_count); @@ -1727,6 +1765,11 @@ int __init gfs2_glock_init(void) glock_workqueue = create_workqueue("glock_workqueue"); if (IS_ERR(glock_workqueue)) return PTR_ERR(glock_workqueue); + gfs2_delete_workqueue = create_workqueue("delete_workqueue"); + if (IS_ERR(gfs2_delete_workqueue)) { + destroy_workqueue(glock_workqueue); + return PTR_ERR(gfs2_delete_workqueue); + } register_shrinker(&glock_shrinker); @@ -1737,6 +1780,7 @@ void gfs2_glock_exit(void) { unregister_shrinker(&glock_shrinker); destroy_workqueue(glock_workqueue); + destroy_workqueue(gfs2_delete_workqueue); } static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi) diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index a602a28f6f08..c609894ec0d0 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -143,6 +143,7 @@ struct lm_lockops { #define GLR_TRYFAILED 13 +extern struct workqueue_struct *gfs2_delete_workqueue; static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) { struct gfs2_holder *gh; @@ -191,6 +192,8 @@ static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl) int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, const struct gfs2_glock_operations *glops, int create, struct gfs2_glock **glp); +void gfs2_glock_hold(struct gfs2_glock *gl); +void gfs2_glock_put_nolock(struct gfs2_glock *gl); int gfs2_glock_put(struct gfs2_glock *gl); void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, struct gfs2_holder *gh); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index d5e4ab155ca0..6985eef06c39 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -323,6 +323,7 @@ static void trans_go_sync(struct gfs2_glock *gl) if (gl->gl_state != LM_ST_UNLOCKED && test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { + flush_workqueue(gfs2_delete_workqueue); gfs2_meta_syncfs(sdp); gfs2_log_shutdown(sdp); } @@ -372,6 +373,25 @@ static int trans_go_demote_ok(const struct gfs2_glock *gl) return 0; } +/** + * iopen_go_callback - schedule the dcache entry for the inode to be deleted + * @gl: the glock + * + * gl_spin lock is held while calling this + */ +static void iopen_go_callback(struct gfs2_glock *gl) +{ + struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object; + + if (gl->gl_demote_state == LM_ST_UNLOCKED && + gl->gl_state == LM_ST_SHARED && + ip && test_bit(GIF_USER, &ip->i_flags)) { + gfs2_glock_hold(gl); + if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0) + gfs2_glock_put_nolock(gl); + } +} + const struct gfs2_glock_operations gfs2_meta_glops = { .go_type = LM_TYPE_META, }; @@ -406,6 +426,7 @@ const struct gfs2_glock_operations gfs2_trans_glops = { const struct gfs2_glock_operations gfs2_iopen_glops = { .go_type = LM_TYPE_IOPEN, + .go_callback = iopen_go_callback, }; const struct gfs2_glock_operations gfs2_flock_glops = { diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 225347fbff3c..6edb423f90b3 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -159,6 +159,7 @@ struct gfs2_glock_operations { int (*go_lock) (struct gfs2_holder *gh); void (*go_unlock) (struct gfs2_holder *gh); int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); + void (*go_callback) (struct gfs2_glock *gl); const int go_type; const unsigned long go_min_hold_time; }; @@ -228,6 +229,7 @@ struct gfs2_glock { struct list_head gl_ail_list; atomic_t gl_ail_count; struct delayed_work gl_work; + struct work_struct gl_delete; }; #define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */ @@ -404,6 +406,12 @@ struct gfs2_statfs_change_host { #define GFS2_DATA_WRITEBACK 1 #define GFS2_DATA_ORDERED 2 +#define GFS2_ERRORS_DEFAULT GFS2_ERRORS_WITHDRAW +#define GFS2_ERRORS_WITHDRAW 0 +#define GFS2_ERRORS_CONTINUE 1 /* place holder for future feature */ +#define GFS2_ERRORS_RO 2 /* place holder for future feature */ +#define GFS2_ERRORS_PANIC 3 + struct gfs2_args { char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */ char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */ @@ -420,6 +428,7 @@ struct gfs2_args { unsigned int ar_data:2; /* ordered/writeback */ unsigned int ar_meta:1; /* mount metafs */ unsigned int ar_discard:1; /* discard requests */ + unsigned int ar_errors:2; /* errors=withdraw | panic */ int ar_commit; /* Commit interval */ }; @@ -487,7 +496,6 @@ struct gfs2_sb_host { */ struct lm_lockstruct { - u32 ls_id; unsigned int ls_jid; unsigned int ls_first; unsigned int ls_first_done; @@ -539,18 +547,12 @@ struct gfs2_sbd { struct dentry *sd_root_dir; struct inode *sd_jindex; - struct inode *sd_inum_inode; struct inode *sd_statfs_inode; - struct inode *sd_ir_inode; struct inode *sd_sc_inode; struct inode *sd_qc_inode; struct inode *sd_rindex; struct inode *sd_quota_inode; - /* Inum stuff */ - - struct mutex sd_inum_mutex; - /* StatFS stuff */ spinlock_t sd_statfs_spin; @@ -578,7 +580,6 @@ struct gfs2_sbd { struct gfs2_holder sd_journal_gh; struct gfs2_holder sd_jinode_gh; - struct gfs2_holder sd_ir_gh; struct gfs2_holder sd_sc_gh; struct gfs2_holder sd_qc_gh; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 2f94bd723698..fb15d3b1f409 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -24,7 +24,7 @@ #include "acl.h" #include "bmap.h" #include "dir.h" -#include "eattr.h" +#include "xattr.h" #include "glock.h" #include "glops.h" #include "inode.h" @@ -519,139 +519,6 @@ out: return inode ? inode : ERR_PTR(error); } -static void gfs2_inum_range_in(struct gfs2_inum_range_host *ir, const void *buf) -{ - const struct gfs2_inum_range *str = buf; - - ir->ir_start = be64_to_cpu(str->ir_start); - ir->ir_length = be64_to_cpu(str->ir_length); -} - -static void gfs2_inum_range_out(const struct gfs2_inum_range_host *ir, void *buf) -{ - struct gfs2_inum_range *str = buf; - - str->ir_start = cpu_to_be64(ir->ir_start); - str->ir_length = cpu_to_be64(ir->ir_length); -} - -static int pick_formal_ino_1(struct gfs2_sbd *sdp, u64 *formal_ino) -{ - struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode); - struct buffer_head *bh; - struct gfs2_inum_range_host ir; - int error; - - error = gfs2_trans_begin(sdp, RES_DINODE, 0); - if (error) - return error; - mutex_lock(&sdp->sd_inum_mutex); - - error = gfs2_meta_inode_buffer(ip, &bh); - if (error) { - mutex_unlock(&sdp->sd_inum_mutex); - gfs2_trans_end(sdp); - return error; - } - - gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode)); - - if (ir.ir_length) { - *formal_ino = ir.ir_start++; - ir.ir_length--; - gfs2_trans_add_bh(ip->i_gl, bh, 1); - gfs2_inum_range_out(&ir, - bh->b_data + sizeof(struct gfs2_dinode)); - brelse(bh); - mutex_unlock(&sdp->sd_inum_mutex); - gfs2_trans_end(sdp); - return 0; - } - - brelse(bh); - - mutex_unlock(&sdp->sd_inum_mutex); - gfs2_trans_end(sdp); - - return 1; -} - -static int pick_formal_ino_2(struct gfs2_sbd *sdp, u64 *formal_ino) -{ - struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode); - struct gfs2_inode *m_ip = GFS2_I(sdp->sd_inum_inode); - struct gfs2_holder gh; - struct buffer_head *bh; - struct gfs2_inum_range_host ir; - int error; - - error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); - if (error) - return error; - - error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0); - if (error) - goto out; - mutex_lock(&sdp->sd_inum_mutex); - - error = gfs2_meta_inode_buffer(ip, &bh); - if (error) - goto out_end_trans; - - gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode)); - - if (!ir.ir_length) { - struct buffer_head *m_bh; - u64 x, y; - __be64 z; - - error = gfs2_meta_inode_buffer(m_ip, &m_bh); - if (error) - goto out_brelse; - - z = *(__be64 *)(m_bh->b_data + sizeof(struct gfs2_dinode)); - x = y = be64_to_cpu(z); - ir.ir_start = x; - ir.ir_length = GFS2_INUM_QUANTUM; - x += GFS2_INUM_QUANTUM; - if (x < y) - gfs2_consist_inode(m_ip); - z = cpu_to_be64(x); - gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1); - *(__be64 *)(m_bh->b_data + sizeof(struct gfs2_dinode)) = z; - - brelse(m_bh); - } - - *formal_ino = ir.ir_start++; - ir.ir_length--; - - gfs2_trans_add_bh(ip->i_gl, bh, 1); - gfs2_inum_range_out(&ir, bh->b_data + sizeof(struct gfs2_dinode)); - -out_brelse: - brelse(bh); -out_end_trans: - mutex_unlock(&sdp->sd_inum_mutex); - gfs2_trans_end(sdp); -out: - gfs2_glock_dq_uninit(&gh); - return error; -} - -static int pick_formal_ino(struct gfs2_sbd *sdp, u64 *inum) -{ - int error; - - error = pick_formal_ino_1(sdp, inum); - if (error <= 0) - return error; - - error = pick_formal_ino_2(sdp, inum); - - return error; -} - /** * create_ok - OK to create a new on-disk inode here? * @dip: Directory in which dinode is to be created @@ -731,7 +598,7 @@ static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation) if (error) goto out_ipreserv; - *no_addr = gfs2_alloc_di(dip, generation); + error = gfs2_alloc_di(dip, no_addr, generation); gfs2_trans_end(sdp); @@ -924,7 +791,6 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip) size_t len; void *value; char *name; - struct gfs2_ea_request er; err = security_inode_init_security(&ip->i_inode, &dip->i_inode, &name, &value, &len); @@ -935,16 +801,7 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip) return err; } - memset(&er, 0, sizeof(struct gfs2_ea_request)); - - er.er_type = GFS2_EATYPE_SECURITY; - er.er_name = name; - er.er_data = value; - er.er_name_len = strlen(name); - er.er_data_len = len; - - err = gfs2_ea_set_i(ip, &er); - + err = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SECURITY, name, value, len, 0); kfree(value); kfree(name); @@ -991,13 +848,10 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, if (error) goto fail_gunlock; - error = pick_formal_ino(sdp, &inum.no_formal_ino); - if (error) - goto fail_gunlock; - error = alloc_dinode(dip, &inum.no_addr, &generation); if (error) goto fail_gunlock; + inum.no_formal_ino = generation; error = gfs2_glock_nq_num(sdp, inum.no_addr, &gfs2_inode_glops, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); @@ -1008,9 +862,8 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, if (error) goto fail_gunlock2; - inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), - inum.no_addr, - inum.no_formal_ino, 0); + inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr, + inum.no_formal_ino, 0); if (IS_ERR(inode)) goto fail_gunlock2; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 7bc3c45cd676..52fb6c048981 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -84,7 +84,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) gfs2_tune_init(&sdp->sd_tune); - mutex_init(&sdp->sd_inum_mutex); spin_lock_init(&sdp->sd_statfs_spin); spin_lock_init(&sdp->sd_rindex_spin); @@ -833,21 +832,12 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo) if (error) goto fail; - /* Read in the master inode number inode */ - sdp->sd_inum_inode = gfs2_lookup_simple(master, "inum"); - if (IS_ERR(sdp->sd_inum_inode)) { - error = PTR_ERR(sdp->sd_inum_inode); - fs_err(sdp, "can't read in inum inode: %d\n", error); - goto fail_journal; - } - - /* Read in the master statfs inode */ sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs"); if (IS_ERR(sdp->sd_statfs_inode)) { error = PTR_ERR(sdp->sd_statfs_inode); fs_err(sdp, "can't read in statfs inode: %d\n", error); - goto fail_inum; + goto fail_journal; } /* Read in the resource index inode */ @@ -876,8 +866,6 @@ fail_rindex: iput(sdp->sd_rindex); fail_statfs: iput(sdp->sd_statfs_inode); -fail_inum: - iput(sdp->sd_inum_inode); fail_journal: init_journal(sdp, UNDO); fail: @@ -905,20 +893,12 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo) return error; } - sprintf(buf, "inum_range%u", sdp->sd_jdesc->jd_jid); - sdp->sd_ir_inode = gfs2_lookup_simple(pn, buf); - if (IS_ERR(sdp->sd_ir_inode)) { - error = PTR_ERR(sdp->sd_ir_inode); - fs_err(sdp, "can't find local \"ir\" file: %d\n", error); - goto fail; - } - sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid); sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf); if (IS_ERR(sdp->sd_sc_inode)) { error = PTR_ERR(sdp->sd_sc_inode); fs_err(sdp, "can't find local \"sc\" file: %d\n", error); - goto fail_ir_i; + goto fail; } sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid); @@ -932,27 +912,16 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo) iput(pn); pn = NULL; - ip = GFS2_I(sdp->sd_ir_inode); - error = gfs2_glock_nq_init(ip->i_gl, - LM_ST_EXCLUSIVE, 0, - &sdp->sd_ir_gh); - if (error) { - fs_err(sdp, "can't lock local \"ir\" file: %d\n", error); - goto fail_qc_i; - } - ip = GFS2_I(sdp->sd_sc_inode); - error = gfs2_glock_nq_init(ip->i_gl, - LM_ST_EXCLUSIVE, 0, + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &sdp->sd_sc_gh); if (error) { fs_err(sdp, "can't lock local \"sc\" file: %d\n", error); - goto fail_ir_gh; + goto fail_qc_i; } ip = GFS2_I(sdp->sd_qc_inode); - error = gfs2_glock_nq_init(ip->i_gl, - LM_ST_EXCLUSIVE, 0, + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &sdp->sd_qc_gh); if (error) { fs_err(sdp, "can't lock local \"qc\" file: %d\n", error); @@ -965,14 +934,10 @@ fail_qc_gh: gfs2_glock_dq_uninit(&sdp->sd_qc_gh); fail_ut_gh: gfs2_glock_dq_uninit(&sdp->sd_sc_gh); -fail_ir_gh: - gfs2_glock_dq_uninit(&sdp->sd_ir_gh); fail_qc_i: iput(sdp->sd_qc_inode); fail_ut_i: iput(sdp->sd_sc_inode); -fail_ir_i: - iput(sdp->sd_ir_inode); fail: if (pn) iput(pn); @@ -1063,7 +1028,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) ls->ls_ops = lm; ls->ls_first = 1; - ls->ls_id = 0; for (options = args->ar_hostdata; (o = strsep(&options, ":")); ) { substring_t tmp[MAX_OPT_ARGS]; @@ -1081,10 +1045,7 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) ls->ls_jid = option; break; case Opt_id: - ret = match_int(&tmp[0], &option); - if (ret) - goto hostdata_error; - ls->ls_id = option; + /* Obsolete, but left for backward compat purposes */ break; case Opt_first: ret = match_int(&tmp[0], &option); @@ -1133,6 +1094,17 @@ void gfs2_lm_unmount(struct gfs2_sbd *sdp) lm->lm_unmount(sdp); } +void gfs2_online_uevent(struct gfs2_sbd *sdp) +{ + struct super_block *sb = sdp->sd_vfs; + char ro[20]; + char spectator[20]; + char *envp[] = { ro, spectator, NULL }; + sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0); + sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0); + kobject_uevent_env(&sdp->sd_kobj, KOBJ_ONLINE, envp); +} + /** * fill_super - Read in superblock * @sb: The VFS superblock @@ -1157,6 +1129,7 @@ static int fill_super(struct super_block *sb, void *data, int silent) sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT; sdp->sd_args.ar_data = GFS2_DATA_DEFAULT; sdp->sd_args.ar_commit = 60; + sdp->sd_args.ar_errors = GFS2_ERRORS_DEFAULT; error = gfs2_mount_args(sdp, &sdp->sd_args, data); if (error) { @@ -1174,6 +1147,7 @@ static int fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = GFS2_MAGIC; sb->s_op = &gfs2_super_ops; sb->s_export_op = &gfs2_export_ops; + sb->s_xattr = gfs2_xattr_handlers; sb->s_time_gran = 1; sb->s_maxbytes = MAX_LFS_FILESIZE; @@ -1236,7 +1210,7 @@ static int fill_super(struct super_block *sb, void *data, int silent) } gfs2_glock_dq_uninit(&mount_gh); - + gfs2_online_uevent(sdp); return 0; fail_threads: diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index f8bd20baf99c..c3ac18054057 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -26,8 +26,7 @@ #include "acl.h" #include "bmap.h" #include "dir.h" -#include "eaops.h" -#include "eattr.h" +#include "xattr.h" #include "glock.h" #include "inode.h" #include "meta_io.h" @@ -349,7 +348,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0); if (error) - goto out_rgrp; + goto out_gunlock; error = gfs2_dir_del(dip, &dentry->d_name); if (error) @@ -1302,60 +1301,53 @@ static int gfs2_setxattr(struct dentry *dentry, const char *name, const void *data, size_t size, int flags) { struct inode *inode = dentry->d_inode; - struct gfs2_ea_request er; - - memset(&er, 0, sizeof(struct gfs2_ea_request)); - er.er_type = gfs2_ea_name2type(name, &er.er_name); - if (er.er_type == GFS2_EATYPE_UNUSED) - return -EOPNOTSUPP; - er.er_data = (char *)data; - er.er_name_len = strlen(er.er_name); - er.er_data_len = size; - er.er_flags = flags; - - gfs2_assert_warn(GFS2_SB(inode), !(er.er_flags & GFS2_ERF_MODE)); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; - return gfs2_ea_set(GFS2_I(inode), &er); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + ret = gfs2_glock_nq(&gh); + if (ret == 0) { + ret = generic_setxattr(dentry, name, data, size, flags); + gfs2_glock_dq(&gh); + } + gfs2_holder_uninit(&gh); + return ret; } static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name, void *data, size_t size) { - struct gfs2_ea_request er; - - memset(&er, 0, sizeof(struct gfs2_ea_request)); - er.er_type = gfs2_ea_name2type(name, &er.er_name); - if (er.er_type == GFS2_EATYPE_UNUSED) - return -EOPNOTSUPP; - er.er_data = data; - er.er_name_len = strlen(er.er_name); - er.er_data_len = size; - - return gfs2_ea_get(GFS2_I(dentry->d_inode), &er); -} - -static ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size) -{ - struct gfs2_ea_request er; - - memset(&er, 0, sizeof(struct gfs2_ea_request)); - er.er_data = (size) ? buffer : NULL; - er.er_data_len = size; + struct inode *inode = dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; - return gfs2_ea_list(GFS2_I(dentry->d_inode), &er); + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); + ret = gfs2_glock_nq(&gh); + if (ret == 0) { + ret = generic_getxattr(dentry, name, data, size); + gfs2_glock_dq(&gh); + } + gfs2_holder_uninit(&gh); + return ret; } static int gfs2_removexattr(struct dentry *dentry, const char *name) { - struct gfs2_ea_request er; - - memset(&er, 0, sizeof(struct gfs2_ea_request)); - er.er_type = gfs2_ea_name2type(name, &er.er_name); - if (er.er_type == GFS2_EATYPE_UNUSED) - return -EOPNOTSUPP; - er.er_name_len = strlen(er.er_name); + struct inode *inode = dentry->d_inode; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; - return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + ret = gfs2_glock_nq(&gh); + if (ret == 0) { + ret = generic_removexattr(dentry, name); + gfs2_glock_dq(&gh); + } + gfs2_holder_uninit(&gh); + return ret; } static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index daa4ae341a29..28c590b7c9da 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -285,27 +285,19 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd) } tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes; - if (count[1] + count[2] != tmp) { + if (count[1] != tmp) { if (gfs2_consist_rgrpd(rgd)) fs_err(sdp, "used data mismatch: %u != %u\n", count[1], tmp); return; } - if (count[3] != rgd->rd_dinodes) { + if (count[2] + count[3] != rgd->rd_dinodes) { if (gfs2_consist_rgrpd(rgd)) fs_err(sdp, "used metadata mismatch: %u != %u\n", - count[3], rgd->rd_dinodes); + count[2] + count[3], rgd->rd_dinodes); return; } - - if (count[2] > count[3]) { - if (gfs2_consist_rgrpd(rgd)) - fs_err(sdp, "unlinked inodes > inodes: %u\n", - count[2]); - return; - } - } static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block) @@ -865,7 +857,8 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, goto start_new_extent; if ((start + nr_sects) != blk) { rv = blkdev_issue_discard(bdev, start, - nr_sects, GFP_NOFS); + nr_sects, GFP_NOFS, + DISCARD_FL_BARRIER); if (rv) goto fail; nr_sects = 0; @@ -879,7 +872,8 @@ start_new_extent: } } if (nr_sects) { - rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS); + rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, + DISCARD_FL_BARRIER); if (rv) goto fail; } @@ -961,7 +955,8 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al) * Returns: The inode, if one has been found */ -static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked) +static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, + u64 skip) { struct inode *inode; u32 goal = 0, block; @@ -985,6 +980,8 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked) goal++; if (*last_unlinked != NO_BLOCK && no_addr <= *last_unlinked) continue; + if (no_addr == skip) + continue; *last_unlinked = no_addr; inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, DT_UNKNOWN, no_addr, -1, 1); @@ -1104,7 +1101,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) if (try_rgrp_fit(rgd, al)) goto out; if (rgd->rd_flags & GFS2_RDF_CHECK) - inode = try_rgrp_unlink(rgd, last_unlinked); + inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); if (!rg_locked) gfs2_glock_dq_uninit(&al->al_rgd_gh); if (inode) @@ -1138,7 +1135,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) if (try_rgrp_fit(rgd, al)) goto out; if (rgd->rd_flags & GFS2_RDF_CHECK) - inode = try_rgrp_unlink(rgd, last_unlinked); + inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); if (!rg_locked) gfs2_glock_dq_uninit(&al->al_rgd_gh); if (inode) @@ -1261,7 +1258,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip) * Returns: The block type (GFS2_BLKST_*) */ -unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block) +static unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block) { struct gfs2_bitmap *bi = NULL; u32 length, rgrp_block, buf_block; @@ -1464,6 +1461,16 @@ int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl) return 0; } +static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd) +{ + struct gfs2_sbd *sdp = rgd->rd_sbd; + fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n", + (unsigned long long)rgd->rd_addr); + fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n"); + gfs2_rgrp_dump(NULL, rgd->rd_gl); + rgd->rd_flags |= GFS2_RDF_ERROR; +} + /** * gfs2_alloc_block - Allocate one or more blocks * @ip: the inode to allocate the block for @@ -1525,22 +1532,20 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n) return 0; rgrp_error: - fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n", - (unsigned long long)rgd->rd_addr); - fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n"); - gfs2_rgrp_dump(NULL, rgd->rd_gl); - rgd->rd_flags |= GFS2_RDF_ERROR; + gfs2_rgrp_error(rgd); return -EIO; } /** * gfs2_alloc_di - Allocate a dinode * @dip: the directory that the inode is going in + * @bn: the block number which is allocated + * @generation: the generation number of the inode * - * Returns: the block allocated + * Returns: 0 on success or error */ -u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation) +int gfs2_alloc_di(struct gfs2_inode *dip, u64 *bn, u64 *generation) { struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct gfs2_alloc *al = dip->i_alloc; @@ -1551,16 +1556,21 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation) blk = rgblk_search(rgd, rgd->rd_last_alloc, GFS2_BLKST_FREE, GFS2_BLKST_DINODE, &n); - BUG_ON(blk == BFITNOENT); - rgd->rd_last_alloc = blk; + /* Since all blocks are reserved in advance, this shouldn't happen */ + if (blk == BFITNOENT) + goto rgrp_error; + rgd->rd_last_alloc = blk; block = rgd->rd_data0 + blk; + if (rgd->rd_free == 0) + goto rgrp_error; - gfs2_assert_withdraw(sdp, rgd->rd_free); rgd->rd_free--; rgd->rd_dinodes++; *generation = rgd->rd_igeneration++; + if (*generation == 0) + *generation = rgd->rd_igeneration++; gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); @@ -1573,7 +1583,12 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation) rgd->rd_free_clone--; spin_unlock(&sdp->sd_rindex_spin); trace_gfs2_block_alloc(dip, block, 1, GFS2_BLKST_DINODE); - return block; + *bn = block; + return 0; + +rgrp_error: + gfs2_rgrp_error(rgd); + return -EIO; } /** @@ -1681,6 +1696,46 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) } /** + * gfs2_check_blk_type - Check the type of a block + * @sdp: The superblock + * @no_addr: The block number to check + * @type: The block type we are looking for + * + * Returns: 0 if the block type matches the expected type + * -ESTALE if it doesn't match + * or -ve errno if something went wrong while checking + */ + +int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type) +{ + struct gfs2_rgrpd *rgd; + struct gfs2_holder ri_gh, rgd_gh; + int error; + + error = gfs2_rindex_hold(sdp, &ri_gh); + if (error) + goto fail; + + error = -EINVAL; + rgd = gfs2_blk2rgrpd(sdp, no_addr); + if (!rgd) + goto fail_rindex; + + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh); + if (error) + goto fail_rindex; + + if (gfs2_get_block_type(rgd, no_addr) != type) + error = -ESTALE; + + gfs2_glock_dq_uninit(&rgd_gh); +fail_rindex: + gfs2_glock_dq_uninit(&ri_gh); +fail: + return error; +} + +/** * gfs2_rlist_add - add a RG to a list of RGs * @sdp: the filesystem * @rlist: the list of resource groups diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 1e76ff0f3e00..b4106ddaaa98 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -44,15 +44,15 @@ gfs2_inplace_reserve_i((ip), __FILE__, __LINE__) extern void gfs2_inplace_release(struct gfs2_inode *ip); -extern unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block); - extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); -extern u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation); +extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation); extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); extern void gfs2_unlink_di(struct inode *inode); +extern int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, + unsigned int type); struct gfs2_rgrp_list { unsigned int rl_rgrps; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 0a6801336470..0ec3ec672de1 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -38,7 +38,7 @@ #include "trans.h" #include "util.h" #include "sys.h" -#include "eattr.h" +#include "xattr.h" #define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x) @@ -68,6 +68,8 @@ enum { Opt_discard, Opt_nodiscard, Opt_commit, + Opt_err_withdraw, + Opt_err_panic, Opt_error, }; @@ -97,6 +99,8 @@ static const match_table_t tokens = { {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, {Opt_commit, "commit=%d"}, + {Opt_err_withdraw, "errors=withdraw"}, + {Opt_err_panic, "errors=panic"}, {Opt_error, NULL} }; @@ -152,6 +156,11 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options) args->ar_localcaching = 1; break; case Opt_debug: + if (args->ar_errors == GFS2_ERRORS_PANIC) { + fs_info(sdp, "-o debug and -o errors=panic " + "are mutually exclusive.\n"); + return -EINVAL; + } args->ar_debug = 1; break; case Opt_nodebug: @@ -205,6 +214,17 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options) return rv ? rv : -EINVAL; } break; + case Opt_err_withdraw: + args->ar_errors = GFS2_ERRORS_WITHDRAW; + break; + case Opt_err_panic: + if (args->ar_debug) { + fs_info(sdp, "-o debug and -o errors=panic " + "are mutually exclusive.\n"); + return -EINVAL; + } + args->ar_errors = GFS2_ERRORS_PANIC; + break; case Opt_error: default: fs_info(sdp, "invalid mount option: %s\n", o); @@ -353,7 +373,7 @@ fail: return error; } -static void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf) +void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf) { const struct gfs2_statfs_change *str = buf; @@ -441,6 +461,29 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, brelse(l_bh); } +void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, + struct buffer_head *l_bh) +{ + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; + struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + + gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1); + + spin_lock(&sdp->sd_statfs_spin); + m_sc->sc_total += l_sc->sc_total; + m_sc->sc_free += l_sc->sc_free; + m_sc->sc_dinodes += l_sc->sc_dinodes; + memset(l_sc, 0, sizeof(struct gfs2_statfs_change)); + memset(l_bh->b_data + sizeof(struct gfs2_dinode), + 0, sizeof(struct gfs2_statfs_change)); + spin_unlock(&sdp->sd_statfs_spin); + + gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1); + gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode)); +} + int gfs2_statfs_sync(struct gfs2_sbd *sdp) { struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); @@ -477,19 +520,7 @@ int gfs2_statfs_sync(struct gfs2_sbd *sdp) if (error) goto out_bh2; - gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1); - - spin_lock(&sdp->sd_statfs_spin); - m_sc->sc_total += l_sc->sc_total; - m_sc->sc_free += l_sc->sc_free; - m_sc->sc_dinodes += l_sc->sc_dinodes; - memset(l_sc, 0, sizeof(struct gfs2_statfs_change)); - memset(l_bh->b_data + sizeof(struct gfs2_dinode), - 0, sizeof(struct gfs2_statfs_change)); - spin_unlock(&sdp->sd_statfs_spin); - - gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1); - gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode)); + update_statfs(sdp, m_bh, l_bh); gfs2_trans_end(sdp); @@ -680,6 +711,7 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) struct gfs2_holder t_gh; int error; + flush_workqueue(gfs2_delete_workqueue); gfs2_quota_sync(sdp); gfs2_statfs_sync(sdp); @@ -756,7 +788,6 @@ restart: /* Release stuff */ iput(sdp->sd_jindex); - iput(sdp->sd_inum_inode); iput(sdp->sd_statfs_inode); iput(sdp->sd_rindex); iput(sdp->sd_quota_inode); @@ -767,10 +798,8 @@ restart: if (!sdp->sd_args.ar_spectator) { gfs2_glock_dq_uninit(&sdp->sd_journal_gh); gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); - gfs2_glock_dq_uninit(&sdp->sd_ir_gh); gfs2_glock_dq_uninit(&sdp->sd_sc_gh); gfs2_glock_dq_uninit(&sdp->sd_qc_gh); - iput(sdp->sd_ir_inode); iput(sdp->sd_sc_inode); iput(sdp->sd_qc_inode); } @@ -1072,6 +1101,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) gt->gt_log_flush_secs = args.ar_commit; spin_unlock(>->gt_spin); + gfs2_online_uevent(sdp); return 0; } @@ -1213,6 +1243,22 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) lfsecs = sdp->sd_tune.gt_log_flush_secs; if (lfsecs != 60) seq_printf(s, ",commit=%d", lfsecs); + if (args->ar_errors != GFS2_ERRORS_DEFAULT) { + const char *state; + + switch (args->ar_errors) { + case GFS2_ERRORS_WITHDRAW: + state = "withdraw"; + break; + case GFS2_ERRORS_PANIC: + state = "panic"; + break; + default: + state = "unknown"; + break; + } + seq_printf(s, ",errors=%s", state); + } return 0; } @@ -1240,6 +1286,10 @@ static void gfs2_delete_inode(struct inode *inode) goto out; } + error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED); + if (error) + goto out_truncate; + gfs2_glock_dq_wait(&ip->i_iopen_gh); gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); error = gfs2_glock_nq(&ip->i_iopen_gh); diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index b56413e3e40d..235db3682885 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -25,7 +25,7 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) return x; } -void gfs2_jindex_free(struct gfs2_sbd *sdp); +extern void gfs2_jindex_free(struct gfs2_sbd *sdp); extern int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *data); @@ -36,10 +36,14 @@ extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename, struct gfs2_inode **ipp); extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp); - +extern void gfs2_online_uevent(struct gfs2_sbd *sdp); extern int gfs2_statfs_init(struct gfs2_sbd *sdp); extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, s64 dinodes); +extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, + const void *buf); +extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, + struct buffer_head *l_bh); extern int gfs2_statfs_sync(struct gfs2_sbd *sdp); extern int gfs2_freeze_fs(struct gfs2_sbd *sdp); @@ -50,6 +54,7 @@ extern struct file_system_type gfs2meta_fs_type; extern const struct export_operations gfs2_export_ops; extern const struct super_operations gfs2_super_ops; extern const struct dentry_operations gfs2_dops; +extern struct xattr_handler *gfs2_xattr_handlers[]; #endif /* __SUPER_DOT_H__ */ diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 23419dc3027b..446329728d52 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -16,6 +16,7 @@ #include <linux/kobject.h> #include <asm/uaccess.h> #include <linux/gfs2_ondisk.h> +#include <linux/genhd.h> #include "gfs2.h" #include "incore.h" @@ -319,12 +320,6 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len) return ret; } -static ssize_t lkid_show(struct gfs2_sbd *sdp, char *buf) -{ - struct lm_lockstruct *ls = &sdp->sd_lockstruct; - return sprintf(buf, "%u\n", ls->ls_id); -} - static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf) { struct lm_lockstruct *ls = &sdp->sd_lockstruct; @@ -386,22 +381,20 @@ static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf) #define GDLM_ATTR(_name,_mode,_show,_store) \ static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store) -GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); -GDLM_ATTR(block, 0644, block_show, block_store); -GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); -GDLM_ATTR(id, 0444, lkid_show, NULL); -GDLM_ATTR(jid, 0444, jid_show, NULL); -GDLM_ATTR(first, 0444, lkfirst_show, NULL); -GDLM_ATTR(first_done, 0444, first_done_show, NULL); -GDLM_ATTR(recover, 0200, NULL, recover_store); -GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); -GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); +GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); +GDLM_ATTR(block, 0644, block_show, block_store); +GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); +GDLM_ATTR(jid, 0444, jid_show, NULL); +GDLM_ATTR(first, 0444, lkfirst_show, NULL); +GDLM_ATTR(first_done, 0444, first_done_show, NULL); +GDLM_ATTR(recover, 0600, NULL, recover_store); +GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); +GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); static struct attribute *lock_module_attrs[] = { &gdlm_attr_proto_name.attr, &gdlm_attr_block.attr, &gdlm_attr_withdraw.attr, - &gdlm_attr_id.attr, &gdlm_attr_jid.attr, &gdlm_attr_first.attr, &gdlm_attr_first_done.attr, @@ -519,7 +512,14 @@ static struct attribute_group lock_module_group = { int gfs2_sys_fs_add(struct gfs2_sbd *sdp) { + struct super_block *sb = sdp->sd_vfs; int error; + char ro[20]; + char spectator[20]; + char *envp[] = { ro, spectator, NULL }; + + sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0); + sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0); sdp->sd_kobj.kset = gfs2_kset; error = kobject_init_and_add(&sdp->sd_kobj, &gfs2_ktype, NULL, @@ -535,9 +535,17 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp) if (error) goto fail_tune; - kobject_uevent(&sdp->sd_kobj, KOBJ_ADD); + error = sysfs_create_link(&sdp->sd_kobj, + &disk_to_dev(sb->s_bdev->bd_disk)->kobj, + "device"); + if (error) + goto fail_lock_module; + + kobject_uevent_env(&sdp->sd_kobj, KOBJ_ADD, envp); return 0; +fail_lock_module: + sysfs_remove_group(&sdp->sd_kobj, &lock_module_group); fail_tune: sysfs_remove_group(&sdp->sd_kobj, &tune_group); fail_reg: @@ -549,12 +557,12 @@ fail: void gfs2_sys_fs_del(struct gfs2_sbd *sdp) { + sysfs_remove_link(&sdp->sd_kobj, "device"); sysfs_remove_group(&sdp->sd_kobj, &tune_group); sysfs_remove_group(&sdp->sd_kobj, &lock_module_group); kobject_put(&sdp->sd_kobj); } - static int gfs2_uevent(struct kset *kset, struct kobject *kobj, struct kobj_uevent_env *env) { @@ -563,6 +571,8 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj, add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name); add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); + if (!sdp->sd_args.ar_spectator) + add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid); if (gfs2_uuid_valid(uuid)) { add_uevent_var(env, "UUID=%02X%02X%02X%02X-%02X%02X-%02X%02X-" "%02X%02X-%02X%02X%02X%02X%02X%02X", @@ -578,7 +588,6 @@ static struct kset_uevent_ops gfs2_uevent_ops = { .uevent = gfs2_uevent, }; - int gfs2_sys_init(void) { gfs2_kset = kset_create_and_add("gfs2", &gfs2_uevent_ops, fs_kobj); diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 9d12b1118ba0..f6a7efa34eb9 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -38,24 +38,30 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...) const struct lm_lockops *lm = ls->ls_ops; va_list args; - if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW && + test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags)) return 0; va_start(args, fmt); vprintk(fmt, args); va_end(args); - fs_err(sdp, "about to withdraw this file system\n"); - BUG_ON(sdp->sd_args.ar_debug); + if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) { + fs_err(sdp, "about to withdraw this file system\n"); + BUG_ON(sdp->sd_args.ar_debug); - kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE); + kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE); - if (lm->lm_unmount) { - fs_err(sdp, "telling LM to unmount\n"); - lm->lm_unmount(sdp); + if (lm->lm_unmount) { + fs_err(sdp, "telling LM to unmount\n"); + lm->lm_unmount(sdp); + } + fs_err(sdp, "withdrawn\n"); + dump_stack(); } - fs_err(sdp, "withdrawn\n"); - dump_stack(); + + if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC) + panic("GFS2: fsid=%s: panic requested.\n", sdp->sd_fsname); return -1; } @@ -93,17 +99,24 @@ int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion, gfs2_tune_get(sdp, gt_complain_secs) * HZ)) return -2; - printk(KERN_WARNING - "GFS2: fsid=%s: warning: assertion \"%s\" failed\n" - "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", - sdp->sd_fsname, assertion, - sdp->sd_fsname, function, file, line); + if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) + printk(KERN_WARNING + "GFS2: fsid=%s: warning: assertion \"%s\" failed\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, assertion, + sdp->sd_fsname, function, file, line); if (sdp->sd_args.ar_debug) BUG(); else dump_stack(); + if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC) + panic("GFS2: fsid=%s: warning: assertion \"%s\" failed\n" + "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", + sdp->sd_fsname, assertion, + sdp->sd_fsname, function, file, line); + sdp->sd_last_warning = jiffies; return -1; diff --git a/fs/gfs2/eattr.c b/fs/gfs2/xattr.c index 07ea9529adda..8a0f8ef6ee27 100644 --- a/fs/gfs2/eattr.c +++ b/fs/gfs2/xattr.c @@ -18,8 +18,7 @@ #include "gfs2.h" #include "incore.h" #include "acl.h" -#include "eaops.h" -#include "eattr.h" +#include "xattr.h" #include "glock.h" #include "inode.h" #include "meta_io.h" @@ -38,26 +37,32 @@ * Returns: 1 if the EA should be stuffed */ -static int ea_calc_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er, +static int ea_calc_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize, unsigned int *size) { - *size = GFS2_EAREQ_SIZE_STUFFED(er); - if (*size <= sdp->sd_jbsize) + unsigned int jbsize = sdp->sd_jbsize; + + /* Stuffed */ + *size = ALIGN(sizeof(struct gfs2_ea_header) + nsize + dsize, 8); + + if (*size <= jbsize) return 1; - *size = GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er); + /* Unstuffed */ + *size = ALIGN(sizeof(struct gfs2_ea_header) + nsize + + (sizeof(__be64) * DIV_ROUND_UP(dsize, jbsize)), 8); return 0; } -static int ea_check_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er) +static int ea_check_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize) { unsigned int size; - if (er->er_data_len > GFS2_EA_MAX_DATA_LEN) + if (dsize > GFS2_EA_MAX_DATA_LEN) return -ERANGE; - ea_calc_size(sdp, er, &size); + ea_calc_size(sdp, nsize, dsize, &size); /* This can only happen with 512 byte blocks */ if (size > sdp->sd_jbsize) @@ -151,7 +156,9 @@ out: } struct ea_find { - struct gfs2_ea_request *ef_er; + int type; + const char *name; + size_t namel; struct gfs2_ea_location *ef_el; }; @@ -160,14 +167,13 @@ static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh, void *private) { struct ea_find *ef = private; - struct gfs2_ea_request *er = ef->ef_er; if (ea->ea_type == GFS2_EATYPE_UNUSED) return 0; - if (ea->ea_type == er->er_type) { - if (ea->ea_name_len == er->er_name_len && - !memcmp(GFS2_EA2NAME(ea), er->er_name, ea->ea_name_len)) { + if (ea->ea_type == ef->type) { + if (ea->ea_name_len == ef->namel && + !memcmp(GFS2_EA2NAME(ea), ef->name, ea->ea_name_len)) { struct gfs2_ea_location *el = ef->ef_el; get_bh(bh); el->el_bh = bh; @@ -180,13 +186,15 @@ static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh, return 0; } -int gfs2_ea_find(struct gfs2_inode *ip, struct gfs2_ea_request *er, +int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name, struct gfs2_ea_location *el) { struct ea_find ef; int error; - ef.ef_er = er; + ef.type = type; + ef.name = name; + ef.namel = strlen(name); ef.ef_el = el; memset(el, 0, sizeof(struct gfs2_ea_location)); @@ -344,6 +352,20 @@ struct ea_list { unsigned int ei_size; }; +static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea) +{ + switch (ea->ea_type) { + case GFS2_EATYPE_USR: + return 5 + ea->ea_name_len + 1; + case GFS2_EATYPE_SYS: + return 7 + ea->ea_name_len + 1; + case GFS2_EATYPE_SECURITY: + return 9 + ea->ea_name_len + 1; + default: + return 0; + } +} + static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh, struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, void *private) @@ -392,21 +414,25 @@ static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh, } /** - * gfs2_ea_list - - * @ip: - * @er: + * gfs2_listxattr - List gfs2 extended attributes + * @dentry: The dentry whose inode we are interested in + * @buffer: The buffer to write the results + * @size: The size of the buffer * * Returns: actual size of data on success, -errno on error */ -int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er) +ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size) { + struct gfs2_inode *ip = GFS2_I(dentry->d_inode); + struct gfs2_ea_request er; struct gfs2_holder i_gh; int error; - if (!er->er_data || !er->er_data_len) { - er->er_data = NULL; - er->er_data_len = 0; + memset(&er, 0, sizeof(struct gfs2_ea_request)); + if (size) { + er.er_data = buffer; + er.er_data_len = size; } error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); @@ -414,7 +440,7 @@ int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er) return error; if (ip->i_eattr) { - struct ea_list ei = { .ei_er = er, .ei_size = 0 }; + struct ea_list ei = { .ei_er = &er, .ei_size = 0 }; error = ea_foreach(ip, ea_list_i, &ei); if (!error) @@ -491,84 +517,61 @@ out: } int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el, - char *data) + char *data, size_t size) { + int ret; + size_t len = GFS2_EA_DATA_LEN(el->el_ea); + if (len > size) + return -ERANGE; + if (GFS2_EA_IS_STUFFED(el->el_ea)) { - memcpy(data, GFS2_EA2DATA(el->el_ea), GFS2_EA_DATA_LEN(el->el_ea)); - return 0; - } else - return ea_get_unstuffed(ip, el->el_ea, data); + memcpy(data, GFS2_EA2DATA(el->el_ea), len); + return len; + } + ret = ea_get_unstuffed(ip, el->el_ea, data); + if (ret < 0) + return ret; + return len; } /** - * gfs2_ea_get_i - - * @ip: The GFS2 inode - * @er: The request structure + * gfs2_xattr_get - Get a GFS2 extended attribute + * @inode: The inode + * @type: The type of extended attribute + * @name: The name of the extended attribute + * @buffer: The buffer to write the result into + * @size: The size of the buffer * * Returns: actual size of data on success, -errno on error */ -int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) +int gfs2_xattr_get(struct inode *inode, int type, const char *name, + void *buffer, size_t size) { + struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_ea_location el; int error; if (!ip->i_eattr) return -ENODATA; + if (strlen(name) > GFS2_EA_MAX_NAME_LEN) + return -EINVAL; - error = gfs2_ea_find(ip, er, &el); + error = gfs2_ea_find(ip, type, name, &el); if (error) return error; if (!el.el_ea) return -ENODATA; - - if (er->er_data_len) { - if (GFS2_EA_DATA_LEN(el.el_ea) > er->er_data_len) - error = -ERANGE; - else - error = gfs2_ea_get_copy(ip, &el, er->er_data); - } - if (!error) + if (size) + error = gfs2_ea_get_copy(ip, &el, buffer, size); + else error = GFS2_EA_DATA_LEN(el.el_ea); - brelse(el.el_bh); return error; } /** - * gfs2_ea_get - - * @ip: The GFS2 inode - * @er: The request structure - * - * Returns: actual size of data on success, -errno on error - */ - -int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er) -{ - struct gfs2_holder i_gh; - int error; - - if (!er->er_name_len || - er->er_name_len > GFS2_EA_MAX_NAME_LEN) - return -EINVAL; - if (!er->er_data || !er->er_data_len) { - er->er_data = NULL; - er->er_data_len = 0; - } - - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); - if (error) - return error; - - error = gfs2_ea_ops[er->er_type]->eo_get(ip, er); - - gfs2_glock_dq_uninit(&i_gh); - - return error; -} - -/** * ea_alloc_blk - allocates a new block for extended attributes. * @ip: A pointer to the inode that's getting extended attributes * @bhp: Pointer to pointer to a struct buffer_head @@ -713,12 +716,6 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { - if (er->er_flags & GFS2_ERF_MODE) { - gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), - (ip->i_inode.i_mode & S_IFMT) == - (er->er_mode & S_IFMT)); - ip->i_inode.i_mode = er->er_mode; - } ip->i_inode.i_ctime = CURRENT_TIME; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); @@ -762,15 +759,23 @@ static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er, * Returns: errno */ -static int ea_init(struct gfs2_inode *ip, struct gfs2_ea_request *er) +static int ea_init(struct gfs2_inode *ip, int type, const char *name, + const void *data, size_t size) { + struct gfs2_ea_request er; unsigned int jbsize = GFS2_SB(&ip->i_inode)->sd_jbsize; unsigned int blks = 1; - if (GFS2_EAREQ_SIZE_STUFFED(er) > jbsize) - blks += DIV_ROUND_UP(er->er_data_len, jbsize); + er.er_type = type; + er.er_name = name; + er.er_name_len = strlen(name); + er.er_data = (void *)data; + er.er_data_len = size; + + if (GFS2_EAREQ_SIZE_STUFFED(&er) > jbsize) + blks += DIV_ROUND_UP(er.er_data_len, jbsize); - return ea_alloc_skeleton(ip, er, blks, ea_init_i, NULL); + return ea_alloc_skeleton(ip, &er, blks, ea_init_i, NULL); } static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea) @@ -848,12 +853,6 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh, error = gfs2_meta_inode_buffer(ip, &dibh); if (error) goto out; - - if (er->er_flags & GFS2_ERF_MODE) { - gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), - (ip->i_inode.i_mode & S_IFMT) == (er->er_mode & S_IFMT)); - ip->i_inode.i_mode = er->er_mode; - } ip->i_inode.i_ctime = CURRENT_TIME; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); @@ -894,7 +893,8 @@ static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh, int stuffed; int error; - stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er, &size); + stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er->er_name_len, + es->es_er->er_data_len, &size); if (ea->ea_type == GFS2_EATYPE_UNUSED) { if (GFS2_EA_REC_LEN(ea) < size) @@ -1005,15 +1005,22 @@ out: return error; } -static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er, - struct gfs2_ea_location *el) +static int ea_set_i(struct gfs2_inode *ip, int type, const char *name, + const void *value, size_t size, struct gfs2_ea_location *el) { + struct gfs2_ea_request er; struct ea_set es; unsigned int blks = 2; int error; + er.er_type = type; + er.er_name = name; + er.er_data = (void *)value; + er.er_name_len = strlen(name); + er.er_data_len = size; + memset(&es, 0, sizeof(struct ea_set)); - es.es_er = er; + es.es_er = &er; es.es_el = el; error = ea_foreach(ip, ea_set_simple, &es); @@ -1024,10 +1031,10 @@ static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er, if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT)) blks++; - if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize) - blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize); + if (GFS2_EAREQ_SIZE_STUFFED(&er) > GFS2_SB(&ip->i_inode)->sd_jbsize) + blks += DIV_ROUND_UP(er.er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize); - return ea_alloc_skeleton(ip, er, blks, ea_set_block, el); + return ea_alloc_skeleton(ip, &er, blks, ea_set_block, el); } static int ea_set_remove_unstuffed(struct gfs2_inode *ip, @@ -1039,75 +1046,7 @@ static int ea_set_remove_unstuffed(struct gfs2_inode *ip, GFS2_EA2NEXT(el->el_prev) == el->el_ea); } - return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev,0); -} - -int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) -{ - struct gfs2_ea_location el; - int error; - - if (!ip->i_eattr) { - if (er->er_flags & XATTR_REPLACE) - return -ENODATA; - return ea_init(ip, er); - } - - error = gfs2_ea_find(ip, er, &el); - if (error) - return error; - - if (el.el_ea) { - if (ip->i_diskflags & GFS2_DIF_APPENDONLY) { - brelse(el.el_bh); - return -EPERM; - } - - error = -EEXIST; - if (!(er->er_flags & XATTR_CREATE)) { - int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea); - error = ea_set_i(ip, er, &el); - if (!error && unstuffed) - ea_set_remove_unstuffed(ip, &el); - } - - brelse(el.el_bh); - } else { - error = -ENODATA; - if (!(er->er_flags & XATTR_REPLACE)) - error = ea_set_i(ip, er, NULL); - } - - return error; -} - -int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er) -{ - struct gfs2_holder i_gh; - int error; - - if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN) - return -EINVAL; - if (!er->er_data || !er->er_data_len) { - er->er_data = NULL; - er->er_data_len = 0; - } - error = ea_check_size(GFS2_SB(&ip->i_inode), er); - if (error) - return error; - - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); - if (error) - return error; - - if (IS_IMMUTABLE(&ip->i_inode)) - error = -EPERM; - else - error = gfs2_ea_ops[er->er_type]->eo_set(ip, er); - - gfs2_glock_dq_uninit(&i_gh); - - return error; + return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev, 0); } static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) @@ -1131,8 +1070,9 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) if (GFS2_EA_IS_LAST(ea)) prev->ea_flags |= GFS2_EAFLAG_LAST; - } else + } else { ea->ea_type = GFS2_EATYPE_UNUSED; + } error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { @@ -1147,15 +1087,29 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) return error; } -int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) +/** + * gfs2_xattr_remove - Remove a GFS2 extended attribute + * @inode: The inode + * @type: The type of the extended attribute + * @name: The name of the extended attribute + * + * This is not called directly by the VFS since we use the (common) + * scheme of making a "set with NULL data" mean a remove request. Note + * that this is different from a set with zero length data. + * + * Returns: 0, or errno on failure + */ + +static int gfs2_xattr_remove(struct inode *inode, int type, const char *name) { + struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_ea_location el; int error; if (!ip->i_eattr) return -ENODATA; - error = gfs2_ea_find(ip, er, &el); + error = gfs2_ea_find(ip, type, name, &el); if (error) return error; if (!el.el_ea) @@ -1164,8 +1118,7 @@ int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) if (GFS2_EA_IS_STUFFED(el.el_ea)) error = ea_remove_stuffed(ip, &el); else - error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev, - 0); + error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev, 0); brelse(el.el_bh); @@ -1173,31 +1126,70 @@ int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) } /** - * gfs2_ea_remove - sets (or creates or replaces) an extended attribute - * @ip: pointer to the inode of the target file - * @er: request information + * gfs2_xattr_set - Set (or remove) a GFS2 extended attribute + * @inode: The inode + * @type: The type of the extended attribute + * @name: The name of the extended attribute + * @value: The value of the extended attribute (NULL for remove) + * @size: The size of the @value argument + * @flags: Create or Replace * - * Returns: errno + * See gfs2_xattr_remove() for details of the removal of xattrs. + * + * Returns: 0 or errno on failure */ -int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er) +int gfs2_xattr_set(struct inode *inode, int type, const char *name, + const void *value, size_t size, int flags) { - struct gfs2_holder i_gh; + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_ea_location el; + unsigned int namel = strlen(name); int error; - if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN) - return -EINVAL; + if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) + return -EPERM; + if (namel > GFS2_EA_MAX_NAME_LEN) + return -ERANGE; - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); + if (value == NULL) + return gfs2_xattr_remove(inode, type, name); + + if (ea_check_size(sdp, namel, size)) + return -ERANGE; + + if (!ip->i_eattr) { + if (flags & XATTR_REPLACE) + return -ENODATA; + return ea_init(ip, type, name, value, size); + } + + error = gfs2_ea_find(ip, type, name, &el); if (error) return error; - if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode)) - error = -EPERM; - else - error = gfs2_ea_ops[er->er_type]->eo_remove(ip, er); + if (el.el_ea) { + if (ip->i_diskflags & GFS2_DIF_APPENDONLY) { + brelse(el.el_bh); + return -EPERM; + } - gfs2_glock_dq_uninit(&i_gh); + error = -EEXIST; + if (!(flags & XATTR_CREATE)) { + int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea); + error = ea_set_i(ip, type, name, value, size, &el); + if (!error && unstuffed) + ea_set_remove_unstuffed(ip, &el); + } + + brelse(el.el_bh); + return error; + } + + error = -ENODATA; + if (!(flags & XATTR_REPLACE)) + error = ea_set_i(ip, type, name, value, size, NULL); return error; } @@ -1503,3 +1495,64 @@ out_alloc: return error; } +static int gfs2_xattr_user_get(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + return gfs2_xattr_get(inode, GFS2_EATYPE_USR, name, buffer, size); +} + +static int gfs2_xattr_user_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + return gfs2_xattr_set(inode, GFS2_EATYPE_USR, name, value, size, flags); +} + +static int gfs2_xattr_system_get(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + return gfs2_xattr_get(inode, GFS2_EATYPE_SYS, name, buffer, size); +} + +static int gfs2_xattr_system_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + return gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, value, size, flags); +} + +static int gfs2_xattr_security_get(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + return gfs2_xattr_get(inode, GFS2_EATYPE_SECURITY, name, buffer, size); +} + +static int gfs2_xattr_security_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + return gfs2_xattr_set(inode, GFS2_EATYPE_SECURITY, name, value, size, flags); +} + +static struct xattr_handler gfs2_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .get = gfs2_xattr_user_get, + .set = gfs2_xattr_user_set, +}; + +static struct xattr_handler gfs2_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .get = gfs2_xattr_security_get, + .set = gfs2_xattr_security_set, +}; + +static struct xattr_handler gfs2_xattr_system_handler = { + .prefix = XATTR_SYSTEM_PREFIX, + .get = gfs2_xattr_system_get, + .set = gfs2_xattr_system_set, +}; + +struct xattr_handler *gfs2_xattr_handlers[] = { + &gfs2_xattr_user_handler, + &gfs2_xattr_security_handler, + &gfs2_xattr_system_handler, + NULL, +}; + diff --git a/fs/gfs2/eattr.h b/fs/gfs2/xattr.h index c82dbe01d713..cbdfd7743733 100644 --- a/fs/gfs2/eattr.h +++ b/fs/gfs2/xattr.h @@ -19,7 +19,7 @@ struct iattr; #define GFS2_EA_SIZE(ea) \ ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \ ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \ - (sizeof(__be64) * (ea)->ea_num_ptrs)), 8) + (sizeof(__be64) * (ea)->ea_num_ptrs)), 8) #define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs) #define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST) @@ -27,10 +27,6 @@ ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \ #define GFS2_EAREQ_SIZE_STUFFED(er) \ ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8) -#define GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er) \ -ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \ - sizeof(__be64) * DIV_ROUND_UP((er)->er_data_len, (sdp)->sd_jbsize), 8) - #define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1)) #define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len) @@ -43,16 +39,12 @@ ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \ #define GFS2_EA_BH2FIRST(bh) \ ((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header))) -#define GFS2_ERF_MODE 0x80000000 - struct gfs2_ea_request { const char *er_name; char *er_data; unsigned int er_name_len; unsigned int er_data_len; unsigned int er_type; /* GFS2_EATYPE_... */ - int er_flags; - mode_t er_mode; }; struct gfs2_ea_location { @@ -61,40 +53,20 @@ struct gfs2_ea_location { struct gfs2_ea_header *el_prev; }; -int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er); -int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er); -int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er); - -int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er); -int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er); -int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er); -int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er); - -int gfs2_ea_dealloc(struct gfs2_inode *ip); +extern int gfs2_xattr_get(struct inode *inode, int type, const char *name, + void *buffer, size_t size); +extern int gfs2_xattr_set(struct inode *inode, int type, const char *name, + const void *value, size_t size, int flags); +extern ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size); +extern int gfs2_ea_dealloc(struct gfs2_inode *ip); /* Exported to acl.c */ -int gfs2_ea_find(struct gfs2_inode *ip, - struct gfs2_ea_request *er, - struct gfs2_ea_location *el); -int gfs2_ea_get_copy(struct gfs2_inode *ip, - struct gfs2_ea_location *el, - char *data); -int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el, - struct iattr *attr, char *data); - -static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea) -{ - switch (ea->ea_type) { - case GFS2_EATYPE_USR: - return 5 + ea->ea_name_len + 1; - case GFS2_EATYPE_SYS: - return 7 + ea->ea_name_len + 1; - case GFS2_EATYPE_SECURITY: - return 9 + ea->ea_name_len + 1; - default: - return 0; - } -} +extern int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name, + struct gfs2_ea_location *el); +extern int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el, + char *data, size_t size); +extern int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el, + struct iattr *attr, char *data); #endif /* __EATTR_DOT_H__ */ diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 941c8425c10b..a93b885311d8 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -44,6 +44,7 @@ static const struct inode_operations hugetlbfs_dir_inode_operations; static const struct inode_operations hugetlbfs_inode_operations; static struct backing_dev_info hugetlbfs_backing_dev_info = { + .name = "hugetlbfs", .ra_pages = 0, /* No readahead */ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; @@ -935,26 +936,28 @@ static int can_do_hugetlb_shm(void) return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group); } -struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag) +struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, + struct user_struct **user) { int error = -ENOMEM; - int unlock_shm = 0; struct file *file; struct inode *inode; struct dentry *dentry, *root; struct qstr quick_string; - struct user_struct *user = current_user(); + *user = NULL; if (!hugetlbfs_vfsmount) return ERR_PTR(-ENOENT); if (!can_do_hugetlb_shm()) { - if (user_shm_lock(size, user)) { - unlock_shm = 1; + *user = current_user(); + if (user_shm_lock(size, *user)) { WARN_ONCE(1, "Using mlock ulimits for SHM_HUGETLB deprecated\n"); - } else + } else { + *user = NULL; return ERR_PTR(-EPERM); + } } root = hugetlbfs_vfsmount->mnt_root; @@ -996,8 +999,10 @@ out_inode: out_dentry: dput(dentry); out_shm_unlock: - if (unlock_shm) - user_shm_unlock(size, user); + if (*user) { + user_shm_unlock(size, *user); + *user = NULL; + } return ERR_PTR(error); } diff --git a/fs/inode.c b/fs/inode.c index 901bad1e5f12..ae7b67e48661 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -120,12 +120,11 @@ static void wake_up_inode(struct inode *inode) * These are initializations that need to be done on every inode * allocation as the fields are not initialised by slab allocation. */ -struct inode *inode_init_always(struct super_block *sb, struct inode *inode) +int inode_init_always(struct super_block *sb, struct inode *inode) { static const struct address_space_operations empty_aops; static struct inode_operations empty_iops; static const struct file_operations empty_fops; - struct address_space *const mapping = &inode->i_data; inode->i_sb = sb; @@ -152,7 +151,7 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode) inode->dirtied_when = 0; if (security_inode_alloc(inode)) - goto out_free_inode; + goto out; /* allocate and initialize an i_integrity */ if (ima_inode_alloc(inode)) @@ -198,16 +197,12 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode) inode->i_fsnotify_mask = 0; #endif - return inode; + return 0; out_free_security: security_inode_free(inode); -out_free_inode: - if (inode->i_sb->s_op->destroy_inode) - inode->i_sb->s_op->destroy_inode(inode); - else - kmem_cache_free(inode_cachep, (inode)); - return NULL; +out: + return -ENOMEM; } EXPORT_SYMBOL(inode_init_always); @@ -220,12 +215,21 @@ static struct inode *alloc_inode(struct super_block *sb) else inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL); - if (inode) - return inode_init_always(sb, inode); - return NULL; + if (!inode) + return NULL; + + if (unlikely(inode_init_always(sb, inode))) { + if (inode->i_sb->s_op->destroy_inode) + inode->i_sb->s_op->destroy_inode(inode); + else + kmem_cache_free(inode_cachep, inode); + return NULL; + } + + return inode; } -void destroy_inode(struct inode *inode) +void __destroy_inode(struct inode *inode) { BUG_ON(inode_has_buffers(inode)); ima_inode_free(inode); @@ -237,13 +241,17 @@ void destroy_inode(struct inode *inode) if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED) posix_acl_release(inode->i_default_acl); #endif +} +EXPORT_SYMBOL(__destroy_inode); + +void destroy_inode(struct inode *inode) +{ + __destroy_inode(inode); if (inode->i_sb->s_op->destroy_inode) inode->i_sb->s_op->destroy_inode(inode); else kmem_cache_free(inode_cachep, (inode)); } -EXPORT_SYMBOL(destroy_inode); - /* * These are initializations that only need to be done diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 737f7246a4b5..f96f85092d1c 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -287,6 +287,7 @@ int journal_write_metadata_buffer(transaction_t *transaction, struct page *new_page; unsigned int new_offset; struct buffer_head *bh_in = jh2bh(jh_in); + journal_t *journal = transaction->t_journal; /* * The buffer really shouldn't be locked: only the current committing @@ -300,6 +301,11 @@ int journal_write_metadata_buffer(transaction_t *transaction, J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); + /* keep subsequent assertions sane */ + new_bh->b_state = 0; + init_buffer(new_bh, NULL, NULL); + atomic_set(&new_bh->b_count, 1); + new_jh = journal_add_journal_head(new_bh); /* This sleeps */ /* * If a new transaction has already done a buffer copy-out, then @@ -361,14 +367,6 @@ repeat: kunmap_atomic(mapped_data, KM_USER0); } - /* keep subsequent assertions sane */ - new_bh->b_state = 0; - init_buffer(new_bh, NULL, NULL); - atomic_set(&new_bh->b_count, 1); - jbd_unlock_bh_state(bh_in); - - new_jh = journal_add_journal_head(new_bh); /* This sleeps */ - set_bh_page(new_bh, new_page, new_offset); new_jh->b_transaction = NULL; new_bh->b_size = jh2bh(jh_in)->b_size; @@ -385,7 +383,11 @@ repeat: * copying is moved to the transaction's shadow queue. */ JBUFFER_TRACE(jh_in, "file as BJ_Shadow"); - journal_file_buffer(jh_in, transaction, BJ_Shadow); + spin_lock(&journal->j_list_lock); + __journal_file_buffer(jh_in, transaction, BJ_Shadow); + spin_unlock(&journal->j_list_lock); + jbd_unlock_bh_state(bh_in); + JBUFFER_TRACE(new_jh, "file as BJ_IO"); journal_file_buffer(new_jh, transaction, BJ_IO); @@ -848,6 +850,12 @@ static int journal_reset(journal_t *journal) first = be32_to_cpu(sb->s_first); last = be32_to_cpu(sb->s_maxlen); + if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) { + printk(KERN_ERR "JBD: Journal too short (blocks %lu-%lu).\n", + first, last); + journal_fail_superblock(journal); + return -EINVAL; + } journal->j_first = first; journal->j_last = last; diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 73242ba7c7b1..c03ac11f74be 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -489,34 +489,15 @@ void journal_unlock_updates (journal_t *journal) wake_up(&journal->j_wait_transaction_locked); } -/* - * Report any unexpected dirty buffers which turn up. Normally those - * indicate an error, but they can occur if the user is running (say) - * tune2fs to modify the live filesystem, so we need the option of - * continuing as gracefully as possible. # - * - * The caller should already hold the journal lock and - * j_list_lock spinlock: most callers will need those anyway - * in order to probe the buffer's journaling state safely. - */ -static void jbd_unexpected_dirty_buffer(struct journal_head *jh) +static void warn_dirty_buffer(struct buffer_head *bh) { - int jlist; - - /* If this buffer is one which might reasonably be dirty - * --- ie. data, or not part of this journal --- then - * we're OK to leave it alone, but otherwise we need to - * move the dirty bit to the journal's own internal - * JBDDirty bit. */ - jlist = jh->b_jlist; + char b[BDEVNAME_SIZE]; - if (jlist == BJ_Metadata || jlist == BJ_Reserved || - jlist == BJ_Shadow || jlist == BJ_Forget) { - struct buffer_head *bh = jh2bh(jh); - - if (test_clear_buffer_dirty(bh)) - set_buffer_jbddirty(bh); - } + printk(KERN_WARNING + "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). " + "There's a risk of filesystem corruption in case of system " + "crash.\n", + bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); } /* @@ -583,14 +564,16 @@ repeat: if (jh->b_next_transaction) J_ASSERT_JH(jh, jh->b_next_transaction == transaction); + warn_dirty_buffer(bh); } /* * In any case we need to clean the dirty flag and we must * do it under the buffer lock to be sure we don't race * with running write-out. */ - JBUFFER_TRACE(jh, "Unexpected dirty buffer"); - jbd_unexpected_dirty_buffer(jh); + JBUFFER_TRACE(jh, "Journalling dirty buffer"); + clear_buffer_dirty(bh); + set_buffer_jbddirty(bh); } unlock_buffer(bh); @@ -826,6 +809,15 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh) J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); if (jh->b_transaction == NULL) { + /* + * Previous journal_forget() could have left the buffer + * with jbddirty bit set because it was being committed. When + * the commit finished, we've filed the buffer for + * checkpointing and marked it dirty. Now we are reallocating + * the buffer so the transaction freeing it must have + * committed and so it's safe to clear the dirty bit. + */ + clear_buffer_dirty(jh2bh(jh)); jh->b_transaction = transaction; /* first access by this transaction */ @@ -1782,8 +1774,13 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) if (jh->b_cp_transaction) { JBUFFER_TRACE(jh, "on running+cp transaction"); + /* + * We don't want to write the buffer anymore, clear the + * bit so that we don't confuse checks in + * __journal_file_buffer + */ + clear_buffer_dirty(bh); __journal_file_buffer(jh, transaction, BJ_Forget); - clear_buffer_jbddirty(bh); may_free = 0; } else { JBUFFER_TRACE(jh, "on running transaction"); @@ -2041,12 +2038,17 @@ void __journal_file_buffer(struct journal_head *jh, if (jh->b_transaction && jh->b_jlist == jlist) return; - /* The following list of buffer states needs to be consistent - * with __jbd_unexpected_dirty_buffer()'s handling of dirty - * state. */ - if (jlist == BJ_Metadata || jlist == BJ_Reserved || jlist == BJ_Shadow || jlist == BJ_Forget) { + /* + * For metadata buffers, we track dirty bit in buffer_jbddirty + * instead of buffer_dirty. We should not see a dirty bit set + * here because we clear it in do_get_write_access but e.g. + * tune2fs can modify the sb and set the dirty bit at any time + * so we try to gracefully handle that. + */ + if (buffer_dirty(bh)) + warn_dirty_buffer(bh); if (test_clear_buffer_dirty(bh) || test_clear_buffer_jbddirty(bh)) was_dirty = 1; diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 8fcb6239218e..7edb62e97419 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -258,7 +258,7 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl) return rc; } -static int jffs2_check_acl(struct inode *inode, int mask) +int jffs2_check_acl(struct inode *inode, int mask) { struct posix_acl *acl; int rc; @@ -274,11 +274,6 @@ static int jffs2_check_acl(struct inode *inode, int mask) return -EAGAIN; } -int jffs2_permission(struct inode *inode, int mask) -{ - return generic_permission(inode, mask, jffs2_check_acl); -} - int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode) { struct posix_acl *acl, *clone; diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h index fc929f2a14f6..f0ba63e3c36b 100644 --- a/fs/jffs2/acl.h +++ b/fs/jffs2/acl.h @@ -26,7 +26,7 @@ struct jffs2_acl_header { #ifdef CONFIG_JFFS2_FS_POSIX_ACL -extern int jffs2_permission(struct inode *, int); +extern int jffs2_check_acl(struct inode *, int); extern int jffs2_acl_chmod(struct inode *); extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *); extern int jffs2_init_acl_post(struct inode *); @@ -36,7 +36,7 @@ extern struct xattr_handler jffs2_acl_default_xattr_handler; #else -#define jffs2_permission (NULL) +#define jffs2_check_acl (NULL) #define jffs2_acl_chmod(inode) (0) #define jffs2_init_acl_pre(dir_i,inode,mode) (0) #define jffs2_init_acl_post(inode) (0) diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 6f60cc910f4c..7aa4417e085f 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -55,7 +55,7 @@ const struct inode_operations jffs2_dir_inode_operations = .rmdir = jffs2_rmdir, .mknod = jffs2_mknod, .rename = jffs2_rename, - .permission = jffs2_permission, + .check_acl = jffs2_check_acl, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 5edc2bf20581..b7b74e299142 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -56,7 +56,7 @@ const struct file_operations jffs2_file_operations = const struct inode_operations jffs2_file_inode_operations = { - .permission = jffs2_permission, + .check_acl = jffs2_check_acl, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, @@ -99,7 +99,7 @@ static int jffs2_do_readpage_nolock (struct inode *inode, struct page *pg) kunmap(pg); D2(printk(KERN_DEBUG "readpage finished\n")); - return 0; + return ret; } int jffs2_do_readpage_unlock(struct inode *inode, struct page *pg) diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c index b7339c3b6ad9..4ec11e8bda8c 100644 --- a/fs/jffs2/symlink.c +++ b/fs/jffs2/symlink.c @@ -21,7 +21,7 @@ const struct inode_operations jffs2_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = jffs2_follow_link, - .permission = jffs2_permission, + .check_acl = jffs2_check_acl, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index d9a721e6db70..5ef7bac265e5 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -1268,10 +1268,20 @@ int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c) { if (!c->wbuf) return -ENOMEM; +#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY + c->wbuf_verify = kmalloc(c->wbuf_pagesize, GFP_KERNEL); + if (!c->wbuf_verify) { + kfree(c->wbuf); + return -ENOMEM; + } +#endif return 0; } void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c) { +#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY + kfree(c->wbuf_verify); +#endif kfree(c->wbuf); } diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 91fa3ad6e8c2..d66477c34306 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -67,10 +67,8 @@ static struct posix_acl *jfs_get_acl(struct inode *inode, int type) acl = posix_acl_from_xattr(value, size); } kfree(value); - if (!IS_ERR(acl)) { + if (!IS_ERR(acl)) set_cached_acl(inode, type, acl); - posix_acl_release(acl); - } return acl; } @@ -116,7 +114,7 @@ out: return rc; } -static int jfs_check_acl(struct inode *inode, int mask) +int jfs_check_acl(struct inode *inode, int mask) { struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS); @@ -131,11 +129,6 @@ static int jfs_check_acl(struct inode *inode, int mask) return -EAGAIN; } -int jfs_permission(struct inode *inode, int mask) -{ - return generic_permission(inode, mask, jfs_check_acl); -} - int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir) { struct posix_acl *acl = NULL; diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 7f6063acaa3b..2b70fa78e4a7 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -96,7 +96,7 @@ const struct inode_operations jfs_file_inode_operations = { .removexattr = jfs_removexattr, #ifdef CONFIG_JFS_POSIX_ACL .setattr = jfs_setattr, - .permission = jfs_permission, + .check_acl = jfs_check_acl, #endif }; diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h index 88475f10a389..b07bd417ef85 100644 --- a/fs/jfs/jfs_acl.h +++ b/fs/jfs/jfs_acl.h @@ -20,7 +20,7 @@ #ifdef CONFIG_JFS_POSIX_ACL -int jfs_permission(struct inode *, int); +int jfs_check_acl(struct inode *, int); int jfs_init_acl(tid_t, struct inode *, struct inode *); int jfs_setattr(struct dentry *, struct iattr *); diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 514ee2edb92a..c79a4270f083 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -1543,7 +1543,7 @@ const struct inode_operations jfs_dir_inode_operations = { .removexattr = jfs_removexattr, #ifdef CONFIG_JFS_POSIX_ACL .setattr = jfs_setattr, - .permission = jfs_permission, + .check_acl = jfs_check_acl, #endif }; diff --git a/fs/libfs.c b/fs/libfs.c index ddfa89948c3f..dcec3d3ea64f 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -217,7 +217,7 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name, return PTR_ERR(s); s->s_flags = MS_NOUSER; - s->s_maxbytes = ~0ULL; + s->s_maxbytes = MAX_LFS_FILESIZE; s->s_blocksize = PAGE_SIZE; s->s_blocksize_bits = PAGE_SHIFT; s->s_magic = magic; diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 99d737bd4325..7cb076ac6b45 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -87,18 +87,6 @@ static unsigned int nlm_hash_address(const struct sockaddr *sap) return hash & (NLM_HOST_NRHASH - 1); } -static void nlm_clear_port(struct sockaddr *sap) -{ - switch (sap->sa_family) { - case AF_INET: - ((struct sockaddr_in *)sap)->sin_port = 0; - break; - case AF_INET6: - ((struct sockaddr_in6 *)sap)->sin6_port = 0; - break; - } -} - /* * Common host lookup routine for server & client */ @@ -177,7 +165,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) host->h_addrbuf = nsm->sm_addrbuf; memcpy(nlm_addr(host), ni->sap, ni->salen); host->h_addrlen = ni->salen; - nlm_clear_port(nlm_addr(host)); + rpc_set_port(nlm_addr(host), 0); memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len); host->h_version = ni->version; host->h_proto = ni->protocol; diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 7fce1b525849..30c933188dd7 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -61,43 +61,6 @@ static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm) return (struct sockaddr *)&nsm->sm_addr; } -static void nsm_display_ipv4_address(const struct sockaddr *sap, char *buf, - const size_t len) -{ - const struct sockaddr_in *sin = (struct sockaddr_in *)sap; - snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr); -} - -static void nsm_display_ipv6_address(const struct sockaddr *sap, char *buf, - const size_t len) -{ - const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; - - if (ipv6_addr_v4mapped(&sin6->sin6_addr)) - snprintf(buf, len, "%pI4", &sin6->sin6_addr.s6_addr32[3]); - else if (sin6->sin6_scope_id != 0) - snprintf(buf, len, "%pI6%%%u", &sin6->sin6_addr, - sin6->sin6_scope_id); - else - snprintf(buf, len, "%pI6", &sin6->sin6_addr); -} - -static void nsm_display_address(const struct sockaddr *sap, - char *buf, const size_t len) -{ - switch (sap->sa_family) { - case AF_INET: - nsm_display_ipv4_address(sap, buf, len); - break; - case AF_INET6: - nsm_display_ipv6_address(sap, buf, len); - break; - default: - snprintf(buf, len, "unsupported address family"); - break; - } -} - static struct rpc_clnt *nsm_create(void) { struct sockaddr_in sin = { @@ -307,8 +270,11 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap, memcpy(nsm_addr(new), sap, salen); new->sm_addrlen = salen; nsm_init_private(new); - nsm_display_address((const struct sockaddr *)&new->sm_addr, - new->sm_addrbuf, sizeof(new->sm_addrbuf)); + + if (rpc_ntop(nsm_addr(new), new->sm_addrbuf, + sizeof(new->sm_addrbuf)) == 0) + (void)snprintf(new->sm_addrbuf, sizeof(new->sm_addrbuf), + "unsupported address family"); memcpy(new->sm_name, hostname, hostname_len); new->sm_name[hostname_len] = '\0'; diff --git a/fs/locks.c b/fs/locks.c index b6440f52178f..19ee18a6829b 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -768,7 +768,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) * give it the opportunity to lock the file. */ if (found) - cond_resched_bkl(); + cond_resched(); find_conflict: for_each_lock(inode, before) { @@ -1591,7 +1591,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) if (can_sleep) lock->fl_flags |= FL_SLEEP; - error = security_file_lock(filp, cmd); + error = security_file_lock(filp, lock->fl_type); if (error) goto out_free; diff --git a/fs/namei.c b/fs/namei.c index f3c5b278895a..d11f404667e9 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -169,19 +169,10 @@ void putname(const char *name) EXPORT_SYMBOL(putname); #endif - -/** - * generic_permission - check for access rights on a Posix-like filesystem - * @inode: inode to check access rights for - * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) - * @check_acl: optional callback to check for Posix ACLs - * - * Used to check for read/write/execute permissions on a file. - * We use "fsuid" for this, letting us set arbitrary permissions - * for filesystem access without changing the "normal" uids which - * are used for other things.. +/* + * This does basic POSIX ACL permission checking */ -int generic_permission(struct inode *inode, int mask, +static int acl_permission_check(struct inode *inode, int mask, int (*check_acl)(struct inode *inode, int mask)) { umode_t mode = inode->i_mode; @@ -193,9 +184,7 @@ int generic_permission(struct inode *inode, int mask, else { if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) { int error = check_acl(inode, mask); - if (error == -EACCES) - goto check_capabilities; - else if (error != -EAGAIN) + if (error != -EAGAIN) return error; } @@ -208,8 +197,32 @@ int generic_permission(struct inode *inode, int mask, */ if ((mask & ~mode) == 0) return 0; + return -EACCES; +} + +/** + * generic_permission - check for access rights on a Posix-like filesystem + * @inode: inode to check access rights for + * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) + * @check_acl: optional callback to check for Posix ACLs + * + * Used to check for read/write/execute permissions on a file. + * We use "fsuid" for this, letting us set arbitrary permissions + * for filesystem access without changing the "normal" uids which + * are used for other things.. + */ +int generic_permission(struct inode *inode, int mask, + int (*check_acl)(struct inode *inode, int mask)) +{ + int ret; + + /* + * Do the basic POSIX ACL permission checks. + */ + ret = acl_permission_check(inode, mask, check_acl); + if (ret != -EACCES) + return ret; - check_capabilities: /* * Read/write DACs are always overridable. * Executable DACs are overridable if at least one exec bit is set. @@ -262,7 +275,7 @@ int inode_permission(struct inode *inode, int mask) if (inode->i_op->permission) retval = inode->i_op->permission(inode, mask); else - retval = generic_permission(inode, mask, NULL); + retval = generic_permission(inode, mask, inode->i_op->check_acl); if (retval) return retval; @@ -432,29 +445,22 @@ static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, */ static int exec_permission_lite(struct inode *inode) { - umode_t mode = inode->i_mode; + int ret; - if (inode->i_op->permission) - return -EAGAIN; - - if (current_fsuid() == inode->i_uid) - mode >>= 6; - else if (in_group_p(inode->i_gid)) - mode >>= 3; - - if (mode & MAY_EXEC) - goto ok; - - if ((inode->i_mode & S_IXUGO) && capable(CAP_DAC_OVERRIDE)) - goto ok; - - if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_OVERRIDE)) + if (inode->i_op->permission) { + ret = inode->i_op->permission(inode, MAY_EXEC); + if (!ret) + goto ok; + return ret; + } + ret = acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl); + if (!ret) goto ok; - if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_READ_SEARCH)) + if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH)) goto ok; - return -EACCES; + return ret; ok: return security_inode_permission(inode, MAY_EXEC); } @@ -853,12 +859,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd) nd->flags |= LOOKUP_CONTINUE; err = exec_permission_lite(inode); - if (err == -EAGAIN) - err = inode_permission(nd->path.dentry->d_inode, - MAY_EXEC); - if (!err) - err = ima_path_check(&nd->path, MAY_EXEC, - IMA_COUNT_UPDATE); if (err) break; @@ -1533,37 +1533,42 @@ int may_open(struct path *path, int acc_mode, int flag) if (error) return error; - error = ima_path_check(path, - acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC), + error = ima_path_check(path, acc_mode ? + acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC) : + ACC_MODE(flag) & (MAY_READ | MAY_WRITE), IMA_COUNT_UPDATE); + if (error) return error; /* * An append-only file must be opened in append mode for writing. */ if (IS_APPEND(inode)) { + error = -EPERM; if ((flag & FMODE_WRITE) && !(flag & O_APPEND)) - return -EPERM; + goto err_out; if (flag & O_TRUNC) - return -EPERM; + goto err_out; } /* O_NOATIME can only be set by the owner or superuser */ if (flag & O_NOATIME) - if (!is_owner_or_cap(inode)) - return -EPERM; + if (!is_owner_or_cap(inode)) { + error = -EPERM; + goto err_out; + } /* * Ensure there are no outstanding leases on the file. */ error = break_lease(inode, flag); if (error) - return error; + goto err_out; if (flag & O_TRUNC) { error = get_write_access(inode); if (error) - return error; + goto err_out; /* * Refuse to truncate files with mandatory locks held on them. @@ -1581,12 +1586,17 @@ int may_open(struct path *path, int acc_mode, int flag) } put_write_access(inode); if (error) - return error; + goto err_out; } else if (flag & FMODE_WRITE) vfs_dq_init(inode); return 0; +err_out: + ima_counts_put(path, acc_mode ? + acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC) : + ACC_MODE(flag) & (MAY_READ | MAY_WRITE)); + return error; } /* diff --git a/fs/namespace.c b/fs/namespace.c index 277c28a63ead..7230787d18b0 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -316,7 +316,8 @@ EXPORT_SYMBOL_GPL(mnt_clone_write); */ int mnt_want_write_file(struct file *file) { - if (!(file->f_mode & FMODE_WRITE)) + struct inode *inode = file->f_dentry->d_inode; + if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode)) return mnt_want_write(file->f_path.mnt); else return mnt_clone_write(file->f_path.mnt); diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 845159814de2..da7fda639eac 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -6,7 +6,8 @@ obj-$(CONFIG_NFS_FS) += nfs.o nfs-y := client.o dir.o file.o getroot.o inode.o super.o nfs2xdr.o \ direct.o pagelist.o proc.o read.o symlink.o unlink.o \ - write.o namespace.o mount_clnt.o + write.o namespace.o mount_clnt.o \ + dns_resolve.o cache_lib.o nfs-$(CONFIG_ROOT_NFS) += nfsroot.o nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o nfs-$(CONFIG_NFS_V3_ACL) += nfs3acl.o diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c new file mode 100644 index 000000000000..b4ffd0146ea6 --- /dev/null +++ b/fs/nfs/cache_lib.c @@ -0,0 +1,140 @@ +/* + * linux/fs/nfs/cache_lib.c + * + * Helper routines for the NFS client caches + * + * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com> + */ +#include <linux/kmod.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/mount.h> +#include <linux/namei.h> +#include <linux/sunrpc/cache.h> +#include <linux/sunrpc/rpc_pipe_fs.h> + +#include "cache_lib.h" + +#define NFS_CACHE_UPCALL_PATHLEN 256 +#define NFS_CACHE_UPCALL_TIMEOUT 15 + +static char nfs_cache_getent_prog[NFS_CACHE_UPCALL_PATHLEN] = + "/sbin/nfs_cache_getent"; +static unsigned long nfs_cache_getent_timeout = NFS_CACHE_UPCALL_TIMEOUT; + +module_param_string(cache_getent, nfs_cache_getent_prog, + sizeof(nfs_cache_getent_prog), 0600); +MODULE_PARM_DESC(cache_getent, "Path to the client cache upcall program"); +module_param_named(cache_getent_timeout, nfs_cache_getent_timeout, ulong, 0600); +MODULE_PARM_DESC(cache_getent_timeout, "Timeout (in seconds) after which " + "the cache upcall is assumed to have failed"); + +int nfs_cache_upcall(struct cache_detail *cd, char *entry_name) +{ + static char *envp[] = { "HOME=/", + "TERM=linux", + "PATH=/sbin:/usr/sbin:/bin:/usr/bin", + NULL + }; + char *argv[] = { + nfs_cache_getent_prog, + cd->name, + entry_name, + NULL + }; + int ret = -EACCES; + + if (nfs_cache_getent_prog[0] == '\0') + goto out; + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); + /* + * Disable the upcall mechanism if we're getting an ENOENT or + * EACCES error. The admin can re-enable it on the fly by using + * sysfs to set the 'cache_getent' parameter once the problem + * has been fixed. + */ + if (ret == -ENOENT || ret == -EACCES) + nfs_cache_getent_prog[0] = '\0'; +out: + return ret > 0 ? 0 : ret; +} + +/* + * Deferred request handling + */ +void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq) +{ + if (atomic_dec_and_test(&dreq->count)) + kfree(dreq); +} + +static void nfs_dns_cache_revisit(struct cache_deferred_req *d, int toomany) +{ + struct nfs_cache_defer_req *dreq; + + dreq = container_of(d, struct nfs_cache_defer_req, deferred_req); + + complete_all(&dreq->completion); + nfs_cache_defer_req_put(dreq); +} + +static struct cache_deferred_req *nfs_dns_cache_defer(struct cache_req *req) +{ + struct nfs_cache_defer_req *dreq; + + dreq = container_of(req, struct nfs_cache_defer_req, req); + dreq->deferred_req.revisit = nfs_dns_cache_revisit; + atomic_inc(&dreq->count); + + return &dreq->deferred_req; +} + +struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void) +{ + struct nfs_cache_defer_req *dreq; + + dreq = kzalloc(sizeof(*dreq), GFP_KERNEL); + if (dreq) { + init_completion(&dreq->completion); + atomic_set(&dreq->count, 1); + dreq->req.defer = nfs_dns_cache_defer; + } + return dreq; +} + +int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq) +{ + if (wait_for_completion_timeout(&dreq->completion, + nfs_cache_getent_timeout * HZ) == 0) + return -ETIMEDOUT; + return 0; +} + +int nfs_cache_register(struct cache_detail *cd) +{ + struct nameidata nd; + struct vfsmount *mnt; + int ret; + + mnt = rpc_get_mount(); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + ret = vfs_path_lookup(mnt->mnt_root, mnt, "/cache", 0, &nd); + if (ret) + goto err; + ret = sunrpc_cache_register_pipefs(nd.path.dentry, + cd->name, 0600, cd); + path_put(&nd.path); + if (!ret) + return ret; +err: + rpc_put_mount(); + return ret; +} + +void nfs_cache_unregister(struct cache_detail *cd) +{ + sunrpc_cache_unregister_pipefs(cd); + rpc_put_mount(); +} + diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h new file mode 100644 index 000000000000..76f856e284e4 --- /dev/null +++ b/fs/nfs/cache_lib.h @@ -0,0 +1,27 @@ +/* + * Helper routines for the NFS client caches + * + * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com> + */ + +#include <linux/completion.h> +#include <linux/sunrpc/cache.h> +#include <asm/atomic.h> + +/* + * Deferred request handling + */ +struct nfs_cache_defer_req { + struct cache_req req; + struct cache_deferred_req deferred_req; + struct completion completion; + atomic_t count; +}; + +extern int nfs_cache_upcall(struct cache_detail *cd, char *entry_name); +extern struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void); +extern void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq); +extern int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq); + +extern int nfs_cache_register(struct cache_detail *cd); +extern void nfs_cache_unregister(struct cache_detail *cd); diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 7f604c7941fb..293fa0528a6e 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -43,21 +43,29 @@ static struct svc_program nfs4_callback_program; unsigned int nfs_callback_set_tcpport; unsigned short nfs_callback_tcpport; unsigned short nfs_callback_tcpport6; -static const int nfs_set_port_min = 0; -static const int nfs_set_port_max = 65535; +#define NFS_CALLBACK_MAXPORTNR (65535U) -static int param_set_port(const char *val, struct kernel_param *kp) +static int param_set_portnr(const char *val, struct kernel_param *kp) { - char *endp; - int num = simple_strtol(val, &endp, 0); - if (endp == val || *endp || num < nfs_set_port_min || num > nfs_set_port_max) + unsigned long num; + int ret; + + if (!val) + return -EINVAL; + ret = strict_strtoul(val, 0, &num); + if (ret == -EINVAL || num > NFS_CALLBACK_MAXPORTNR) return -EINVAL; - *((int *)kp->arg) = num; + *((unsigned int *)kp->arg) = num; return 0; } -module_param_call(callback_tcpport, param_set_port, param_get_int, - &nfs_callback_set_tcpport, 0644); +static int param_get_portnr(char *buffer, struct kernel_param *kp) +{ + return param_get_uint(buffer, kp); +} +#define param_check_portnr(name, p) __param_check(name, p, unsigned int); + +module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644); /* * This is the NFSv4 callback kernel thread. diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 8d25ccb2d51d..e350bd6a2334 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -809,6 +809,9 @@ static int nfs_init_server(struct nfs_server *server, /* Initialise the client representation from the mount data */ server->flags = data->flags; server->options = data->options; + server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID| + NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP| + NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME; if (data->rsize) server->rsize = nfs_block_size(data->rsize, NULL); @@ -879,6 +882,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo * server->rsize = NFS_MAX_FILE_IO_SIZE; server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + server->backing_dev_info.name = "nfs"; server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD; if (server->wsize > max_rpc_payload) @@ -1074,10 +1078,6 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data, (unsigned long long) server->fsid.major, (unsigned long long) server->fsid.minor); - BUG_ON(!server->nfs_client); - BUG_ON(!server->nfs_client->rpc_ops); - BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); - spin_lock(&nfs_client_lock); list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); list_add_tail(&server->master_link, &nfs_volume_list); @@ -1274,7 +1274,7 @@ static int nfs4_init_server(struct nfs_server *server, /* Initialise the client representation from the mount data */ server->flags = data->flags; - server->caps |= NFS_CAP_ATOMIC_OPEN; + server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR; server->options = data->options; /* Get a client record */ @@ -1359,10 +1359,6 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data, if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) server->namelen = NFS4_MAXNAMLEN; - BUG_ON(!server->nfs_client); - BUG_ON(!server->nfs_client->rpc_ops); - BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); - spin_lock(&nfs_client_lock); list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); list_add_tail(&server->master_link, &nfs_volume_list); @@ -1400,7 +1396,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, /* Initialise the client representation from the parent server */ nfs_server_copy_userdata(server, parent_server); - server->caps |= NFS_CAP_ATOMIC_OPEN; + server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR; /* Get a client representation. * Note: NFSv4 always uses TCP, */ diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 489fc01a3204..6c3210099d51 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -255,7 +255,7 @@ static void nfs_direct_read_release(void *calldata) if (put_dreq(dreq)) nfs_direct_complete(dreq); - nfs_readdata_release(calldata); + nfs_readdata_free(data); } static const struct rpc_call_ops nfs_read_direct_ops = { @@ -314,14 +314,14 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq, data->npages, 1, 0, data->pagevec, NULL); up_read(¤t->mm->mmap_sem); if (result < 0) { - nfs_readdata_release(data); + nfs_readdata_free(data); break; } if ((unsigned)result < data->npages) { bytes = result * PAGE_SIZE; if (bytes <= pgbase) { nfs_direct_release_pages(data->pagevec, result); - nfs_readdata_release(data); + nfs_readdata_free(data); break; } bytes -= pgbase; @@ -334,7 +334,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq, data->inode = inode; data->cred = msg.rpc_cred; data->args.fh = NFS_FH(inode); - data->args.context = get_nfs_open_context(ctx); + data->args.context = ctx; data->args.offset = pos; data->args.pgbase = pgbase; data->args.pages = data->pagevec; @@ -441,7 +441,7 @@ static void nfs_direct_free_writedata(struct nfs_direct_req *dreq) struct nfs_write_data *data = list_entry(dreq->rewrite_list.next, struct nfs_write_data, pages); list_del(&data->pages); nfs_direct_release_pages(data->pagevec, data->npages); - nfs_writedata_release(data); + nfs_writedata_free(data); } } @@ -534,7 +534,7 @@ static void nfs_direct_commit_release(void *calldata) dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status); nfs_direct_write_complete(dreq, data->inode); - nfs_commitdata_release(calldata); + nfs_commit_free(data); } static const struct rpc_call_ops nfs_commit_direct_ops = { @@ -570,7 +570,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) data->args.fh = NFS_FH(data->inode); data->args.offset = 0; data->args.count = 0; - data->args.context = get_nfs_open_context(dreq->ctx); + data->args.context = dreq->ctx; data->res.count = 0; data->res.fattr = &data->fattr; data->res.verf = &data->verf; @@ -734,14 +734,14 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq, data->npages, 0, 0, data->pagevec, NULL); up_read(¤t->mm->mmap_sem); if (result < 0) { - nfs_writedata_release(data); + nfs_writedata_free(data); break; } if ((unsigned)result < data->npages) { bytes = result * PAGE_SIZE; if (bytes <= pgbase) { nfs_direct_release_pages(data->pagevec, result); - nfs_writedata_release(data); + nfs_writedata_free(data); break; } bytes -= pgbase; @@ -756,7 +756,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq, data->inode = inode; data->cred = msg.rpc_cred; data->args.fh = NFS_FH(inode); - data->args.context = get_nfs_open_context(ctx); + data->args.context = ctx; data->args.offset = pos; data->args.pgbase = pgbase; data->args.pages = data->pagevec; @@ -934,9 +934,6 @@ out: * back into its cache. We let the server do generic write * parameter checking and report problems. * - * We also avoid an unnecessary invocation of generic_osync_inode(), - * as it is fairly meaningless to sync the metadata of an NFS file. - * * We eliminate local atime updates, see direct read above. * * We avoid unnecessary page cache invalidations for normal cached diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c new file mode 100644 index 000000000000..f4d54ba97cc6 --- /dev/null +++ b/fs/nfs/dns_resolve.c @@ -0,0 +1,335 @@ +/* + * linux/fs/nfs/dns_resolve.c + * + * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com> + * + * Resolves DNS hostnames into valid ip addresses + */ + +#include <linux/hash.h> +#include <linux/string.h> +#include <linux/kmod.h> +#include <linux/module.h> +#include <linux/socket.h> +#include <linux/seq_file.h> +#include <linux/inet.h> +#include <linux/sunrpc/clnt.h> +#include <linux/sunrpc/cache.h> +#include <linux/sunrpc/svcauth.h> + +#include "dns_resolve.h" +#include "cache_lib.h" + +#define NFS_DNS_HASHBITS 4 +#define NFS_DNS_HASHTBL_SIZE (1 << NFS_DNS_HASHBITS) + +static struct cache_head *nfs_dns_table[NFS_DNS_HASHTBL_SIZE]; + +struct nfs_dns_ent { + struct cache_head h; + + char *hostname; + size_t namelen; + + struct sockaddr_storage addr; + size_t addrlen; +}; + + +static void nfs_dns_ent_init(struct cache_head *cnew, + struct cache_head *ckey) +{ + struct nfs_dns_ent *new; + struct nfs_dns_ent *key; + + new = container_of(cnew, struct nfs_dns_ent, h); + key = container_of(ckey, struct nfs_dns_ent, h); + + kfree(new->hostname); + new->hostname = kstrndup(key->hostname, key->namelen, GFP_KERNEL); + if (new->hostname) { + new->namelen = key->namelen; + memcpy(&new->addr, &key->addr, key->addrlen); + new->addrlen = key->addrlen; + } else { + new->namelen = 0; + new->addrlen = 0; + } +} + +static void nfs_dns_ent_put(struct kref *ref) +{ + struct nfs_dns_ent *item; + + item = container_of(ref, struct nfs_dns_ent, h.ref); + kfree(item->hostname); + kfree(item); +} + +static struct cache_head *nfs_dns_ent_alloc(void) +{ + struct nfs_dns_ent *item = kmalloc(sizeof(*item), GFP_KERNEL); + + if (item != NULL) { + item->hostname = NULL; + item->namelen = 0; + item->addrlen = 0; + return &item->h; + } + return NULL; +}; + +static unsigned int nfs_dns_hash(const struct nfs_dns_ent *key) +{ + return hash_str(key->hostname, NFS_DNS_HASHBITS); +} + +static void nfs_dns_request(struct cache_detail *cd, + struct cache_head *ch, + char **bpp, int *blen) +{ + struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h); + + qword_add(bpp, blen, key->hostname); + (*bpp)[-1] = '\n'; +} + +static int nfs_dns_upcall(struct cache_detail *cd, + struct cache_head *ch) +{ + struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h); + int ret; + + ret = nfs_cache_upcall(cd, key->hostname); + if (ret) + ret = sunrpc_cache_pipe_upcall(cd, ch, nfs_dns_request); + return ret; +} + +static int nfs_dns_match(struct cache_head *ca, + struct cache_head *cb) +{ + struct nfs_dns_ent *a; + struct nfs_dns_ent *b; + + a = container_of(ca, struct nfs_dns_ent, h); + b = container_of(cb, struct nfs_dns_ent, h); + + if (a->namelen == 0 || a->namelen != b->namelen) + return 0; + return memcmp(a->hostname, b->hostname, a->namelen) == 0; +} + +static int nfs_dns_show(struct seq_file *m, struct cache_detail *cd, + struct cache_head *h) +{ + struct nfs_dns_ent *item; + long ttl; + + if (h == NULL) { + seq_puts(m, "# ip address hostname ttl\n"); + return 0; + } + item = container_of(h, struct nfs_dns_ent, h); + ttl = (long)item->h.expiry_time - (long)get_seconds(); + if (ttl < 0) + ttl = 0; + + if (!test_bit(CACHE_NEGATIVE, &h->flags)) { + char buf[INET6_ADDRSTRLEN+IPV6_SCOPE_ID_LEN+1]; + + rpc_ntop((struct sockaddr *)&item->addr, buf, sizeof(buf)); + seq_printf(m, "%15s ", buf); + } else + seq_puts(m, "<none> "); + seq_printf(m, "%15s %ld\n", item->hostname, ttl); + return 0; +} + +struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd, + struct nfs_dns_ent *key) +{ + struct cache_head *ch; + + ch = sunrpc_cache_lookup(cd, + &key->h, + nfs_dns_hash(key)); + if (!ch) + return NULL; + return container_of(ch, struct nfs_dns_ent, h); +} + +struct nfs_dns_ent *nfs_dns_update(struct cache_detail *cd, + struct nfs_dns_ent *new, + struct nfs_dns_ent *key) +{ + struct cache_head *ch; + + ch = sunrpc_cache_update(cd, + &new->h, &key->h, + nfs_dns_hash(key)); + if (!ch) + return NULL; + return container_of(ch, struct nfs_dns_ent, h); +} + +static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen) +{ + char buf1[NFS_DNS_HOSTNAME_MAXLEN+1]; + struct nfs_dns_ent key, *item; + unsigned long ttl; + ssize_t len; + int ret = -EINVAL; + + if (buf[buflen-1] != '\n') + goto out; + buf[buflen-1] = '\0'; + + len = qword_get(&buf, buf1, sizeof(buf1)); + if (len <= 0) + goto out; + key.addrlen = rpc_pton(buf1, len, + (struct sockaddr *)&key.addr, + sizeof(key.addr)); + + len = qword_get(&buf, buf1, sizeof(buf1)); + if (len <= 0) + goto out; + + key.hostname = buf1; + key.namelen = len; + memset(&key.h, 0, sizeof(key.h)); + + ttl = get_expiry(&buf); + if (ttl == 0) + goto out; + key.h.expiry_time = ttl + get_seconds(); + + ret = -ENOMEM; + item = nfs_dns_lookup(cd, &key); + if (item == NULL) + goto out; + + if (key.addrlen == 0) + set_bit(CACHE_NEGATIVE, &key.h.flags); + + item = nfs_dns_update(cd, &key, item); + if (item == NULL) + goto out; + + ret = 0; + cache_put(&item->h, cd); +out: + return ret; +} + +static struct cache_detail nfs_dns_resolve = { + .owner = THIS_MODULE, + .hash_size = NFS_DNS_HASHTBL_SIZE, + .hash_table = nfs_dns_table, + .name = "dns_resolve", + .cache_put = nfs_dns_ent_put, + .cache_upcall = nfs_dns_upcall, + .cache_parse = nfs_dns_parse, + .cache_show = nfs_dns_show, + .match = nfs_dns_match, + .init = nfs_dns_ent_init, + .update = nfs_dns_ent_init, + .alloc = nfs_dns_ent_alloc, +}; + +static int do_cache_lookup(struct cache_detail *cd, + struct nfs_dns_ent *key, + struct nfs_dns_ent **item, + struct nfs_cache_defer_req *dreq) +{ + int ret = -ENOMEM; + + *item = nfs_dns_lookup(cd, key); + if (*item) { + ret = cache_check(cd, &(*item)->h, &dreq->req); + if (ret) + *item = NULL; + } + return ret; +} + +static int do_cache_lookup_nowait(struct cache_detail *cd, + struct nfs_dns_ent *key, + struct nfs_dns_ent **item) +{ + int ret = -ENOMEM; + + *item = nfs_dns_lookup(cd, key); + if (!*item) + goto out_err; + ret = -ETIMEDOUT; + if (!test_bit(CACHE_VALID, &(*item)->h.flags) + || (*item)->h.expiry_time < get_seconds() + || cd->flush_time > (*item)->h.last_refresh) + goto out_put; + ret = -ENOENT; + if (test_bit(CACHE_NEGATIVE, &(*item)->h.flags)) + goto out_put; + return 0; +out_put: + cache_put(&(*item)->h, cd); +out_err: + *item = NULL; + return ret; +} + +static int do_cache_lookup_wait(struct cache_detail *cd, + struct nfs_dns_ent *key, + struct nfs_dns_ent **item) +{ + struct nfs_cache_defer_req *dreq; + int ret = -ENOMEM; + + dreq = nfs_cache_defer_req_alloc(); + if (!dreq) + goto out; + ret = do_cache_lookup(cd, key, item, dreq); + if (ret == -EAGAIN) { + ret = nfs_cache_wait_for_upcall(dreq); + if (!ret) + ret = do_cache_lookup_nowait(cd, key, item); + } + nfs_cache_defer_req_put(dreq); +out: + return ret; +} + +ssize_t nfs_dns_resolve_name(char *name, size_t namelen, + struct sockaddr *sa, size_t salen) +{ + struct nfs_dns_ent key = { + .hostname = name, + .namelen = namelen, + }; + struct nfs_dns_ent *item = NULL; + ssize_t ret; + + ret = do_cache_lookup_wait(&nfs_dns_resolve, &key, &item); + if (ret == 0) { + if (salen >= item->addrlen) { + memcpy(sa, &item->addr, item->addrlen); + ret = item->addrlen; + } else + ret = -EOVERFLOW; + cache_put(&item->h, &nfs_dns_resolve); + } else if (ret == -ENOENT) + ret = -ESRCH; + return ret; +} + +int nfs_dns_resolver_init(void) +{ + return nfs_cache_register(&nfs_dns_resolve); +} + +void nfs_dns_resolver_destroy(void) +{ + nfs_cache_unregister(&nfs_dns_resolve); +} + diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h new file mode 100644 index 000000000000..a3f0938babf7 --- /dev/null +++ b/fs/nfs/dns_resolve.h @@ -0,0 +1,14 @@ +/* + * Resolve DNS hostnames into valid ip addresses + */ +#ifndef __LINUX_FS_NFS_DNS_RESOLVE_H +#define __LINUX_FS_NFS_DNS_RESOLVE_H + +#define NFS_DNS_HOSTNAME_MAXLEN (128) + +extern int nfs_dns_resolver_init(void); +extern void nfs_dns_resolver_destroy(void); +extern ssize_t nfs_dns_resolve_name(char *name, size_t namelen, + struct sockaddr *sa, size_t salen); + +#endif diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 05062329b678..5021b75d2d1e 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -328,6 +328,42 @@ nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync) } /* + * Decide whether a read/modify/write cycle may be more efficient + * then a modify/write/read cycle when writing to a page in the + * page cache. + * + * The modify/write/read cycle may occur if a page is read before + * being completely filled by the writer. In this situation, the + * page must be completely written to stable storage on the server + * before it can be refilled by reading in the page from the server. + * This can lead to expensive, small, FILE_SYNC mode writes being + * done. + * + * It may be more efficient to read the page first if the file is + * open for reading in addition to writing, the page is not marked + * as Uptodate, it is not dirty or waiting to be committed, + * indicating that it was previously allocated and then modified, + * that there were valid bytes of data in that range of the file, + * and that the new data won't completely replace the old data in + * that range of the file. + */ +static int nfs_want_read_modify_write(struct file *file, struct page *page, + loff_t pos, unsigned len) +{ + unsigned int pglen = nfs_page_length(page); + unsigned int offset = pos & (PAGE_CACHE_SIZE - 1); + unsigned int end = offset + len; + + if ((file->f_mode & FMODE_READ) && /* open for read? */ + !PageUptodate(page) && /* Uptodate? */ + !PagePrivate(page) && /* i/o request already? */ + pglen && /* valid bytes of file? */ + (end < pglen || offset)) /* replace all valid bytes? */ + return 1; + return 0; +} + +/* * This does the "real" work of the write. We must allocate and lock the * page to be sent back to the generic routine, which then copies the * data from user space. @@ -340,15 +376,16 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { int ret; - pgoff_t index; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; struct page *page; - index = pos >> PAGE_CACHE_SHIFT; + int once_thru = 0; dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n", file->f_path.dentry->d_parent->d_name.name, file->f_path.dentry->d_name.name, mapping->host->i_ino, len, (long long) pos); +start: /* * Prevent starvation issues if someone is doing a consistency * sync-to-disk @@ -367,6 +404,13 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, if (ret) { unlock_page(page); page_cache_release(page); + } else if (!once_thru && + nfs_want_read_modify_write(file, page, pos, len)) { + once_thru = 1; + ret = nfs_readpage(file, page); + page_cache_release(page); + if (!ret) + goto start; } return ret; } @@ -479,6 +523,7 @@ const struct address_space_operations nfs_file_aops = { .invalidatepage = nfs_invalidate_page, .releasepage = nfs_release_page, .direct_IO = nfs_direct_IO, + .migratepage = nfs_migrate_page, .launder_page = nfs_launder_page, }; diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 86147b0ab2cf..21a84d45916f 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -101,7 +101,7 @@ static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *); static unsigned int fnvhash32(const void *, size_t); -static struct rpc_pipe_ops idmap_upcall_ops = { +static const struct rpc_pipe_ops idmap_upcall_ops = { .upcall = idmap_pipe_upcall, .downcall = idmap_pipe_downcall, .destroy_msg = idmap_pipe_destroy_msg, @@ -119,8 +119,8 @@ nfs_idmap_new(struct nfs_client *clp) if (idmap == NULL) return -ENOMEM; - idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_dentry, "idmap", - idmap, &idmap_upcall_ops, 0); + idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_path.dentry, + "idmap", idmap, &idmap_upcall_ops, 0); if (IS_ERR(idmap->idmap_dentry)) { error = PTR_ERR(idmap->idmap_dentry); kfree(idmap); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index bd7938eda6a8..060022b4651c 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -46,6 +46,7 @@ #include "iostat.h" #include "internal.h" #include "fscache.h" +#include "dns_resolve.h" #define NFSDBG_FACILITY NFSDBG_VFS @@ -286,6 +287,11 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) /* We can't support update_atime(), since the server will reset it */ inode->i_flags |= S_NOATIME|S_NOCMTIME; inode->i_mode = fattr->mode; + if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0 + && nfs_server_capable(inode, NFS_CAP_MODE)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL; /* Why so? Because we want revalidate for devices/FIFOs, and * that's precisely what we have in nfs_file_inode_operations. */ @@ -330,20 +336,46 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) nfsi->attr_gencount = fattr->gencount; if (fattr->valid & NFS_ATTR_FATTR_ATIME) inode->i_atime = fattr->atime; + else if (nfs_server_capable(inode, NFS_CAP_ATIME)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR; if (fattr->valid & NFS_ATTR_FATTR_MTIME) inode->i_mtime = fattr->mtime; + else if (nfs_server_capable(inode, NFS_CAP_MTIME)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_DATA; if (fattr->valid & NFS_ATTR_FATTR_CTIME) inode->i_ctime = fattr->ctime; + else if (nfs_server_capable(inode, NFS_CAP_CTIME)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL; if (fattr->valid & NFS_ATTR_FATTR_CHANGE) nfsi->change_attr = fattr->change_attr; + else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_DATA; if (fattr->valid & NFS_ATTR_FATTR_SIZE) inode->i_size = nfs_size_to_loff_t(fattr->size); + else + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_DATA + | NFS_INO_REVAL_PAGECACHE; if (fattr->valid & NFS_ATTR_FATTR_NLINK) inode->i_nlink = fattr->nlink; + else if (nfs_server_capable(inode, NFS_CAP_NLINK)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR; if (fattr->valid & NFS_ATTR_FATTR_OWNER) inode->i_uid = fattr->uid; + else if (nfs_server_capable(inode, NFS_CAP_OWNER)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL; if (fattr->valid & NFS_ATTR_FATTR_GROUP) inode->i_gid = fattr->gid; + else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP)) + nfsi->cache_validity |= NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL; if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) inode->i_blocks = fattr->du.nfs2.blocks; if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { @@ -1145,6 +1177,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) loff_t cur_isize, new_isize; unsigned long invalid = 0; unsigned long now = jiffies; + unsigned long save_cache_validity; dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n", __func__, inode->i_sb->s_id, inode->i_ino, @@ -1171,10 +1204,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) */ nfsi->read_cache_jiffies = fattr->time_start; - if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) || (fattr->valid & (NFS_ATTR_FATTR_MTIME|NFS_ATTR_FATTR_CTIME))) - nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_ATIME - | NFS_INO_REVAL_PAGECACHE); + save_cache_validity = nfsi->cache_validity; + nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ATIME + | NFS_INO_REVAL_FORCED + | NFS_INO_REVAL_PAGECACHE); /* Do atomic weak cache consistency updates */ nfs_wcc_update_inode(inode, fattr); @@ -1189,7 +1223,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_force_lookup_revalidate(inode); nfsi->change_attr = fattr->change_attr; } - } + } else if (server->caps & NFS_CAP_CHANGE_ATTR) + invalid |= save_cache_validity; if (fattr->valid & NFS_ATTR_FATTR_MTIME) { /* NFSv2/v3: Check if the mtime agrees */ @@ -1201,7 +1236,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_force_lookup_revalidate(inode); memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); } - } + } else if (server->caps & NFS_CAP_MTIME) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_DATA + | NFS_INO_REVAL_PAGECACHE + | NFS_INO_REVAL_FORCED); + if (fattr->valid & NFS_ATTR_FATTR_CTIME) { /* If ctime has changed we should definitely clear access+acl caches */ if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) { @@ -1215,7 +1255,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) } memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); } - } + } else if (server->caps & NFS_CAP_CTIME) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_FORCED); /* Check if our cached file size is stale */ if (fattr->valid & NFS_ATTR_FATTR_SIZE) { @@ -1231,30 +1275,50 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) dprintk("NFS: isize change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); } - } + } else + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_REVAL_PAGECACHE + | NFS_INO_REVAL_FORCED); if (fattr->valid & NFS_ATTR_FATTR_ATIME) memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); + else if (server->caps & NFS_CAP_ATIME) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATIME + | NFS_INO_REVAL_FORCED); if (fattr->valid & NFS_ATTR_FATTR_MODE) { if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) { invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; inode->i_mode = fattr->mode; } - } + } else if (server->caps & NFS_CAP_MODE) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_FORCED); + if (fattr->valid & NFS_ATTR_FATTR_OWNER) { if (inode->i_uid != fattr->uid) { invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; inode->i_uid = fattr->uid; } - } + } else if (server->caps & NFS_CAP_OWNER) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_FORCED); + if (fattr->valid & NFS_ATTR_FATTR_GROUP) { if (inode->i_gid != fattr->gid) { invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; inode->i_gid = fattr->gid; } - } + } else if (server->caps & NFS_CAP_OWNER_GROUP) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_INVALID_ACCESS + | NFS_INO_INVALID_ACL + | NFS_INO_REVAL_FORCED); if (fattr->valid & NFS_ATTR_FATTR_NLINK) { if (inode->i_nlink != fattr->nlink) { @@ -1263,7 +1327,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) invalid |= NFS_INO_INVALID_DATA; inode->i_nlink = fattr->nlink; } - } + } else if (server->caps & NFS_CAP_NLINK) + invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR + | NFS_INO_REVAL_FORCED); if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { /* @@ -1293,9 +1359,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) || S_ISLNK(inode->i_mode))) invalid &= ~NFS_INO_INVALID_DATA; if (!nfs_have_delegation(inode, FMODE_READ) || - (nfsi->cache_validity & NFS_INO_REVAL_FORCED)) + (save_cache_validity & NFS_INO_REVAL_FORCED)) nfsi->cache_validity |= invalid; - nfsi->cache_validity &= ~NFS_INO_REVAL_FORCED; return 0; out_changed: @@ -1442,6 +1507,10 @@ static int __init init_nfs_fs(void) { int err; + err = nfs_dns_resolver_init(); + if (err < 0) + goto out8; + err = nfs_fscache_register(); if (err < 0) goto out7; @@ -1500,6 +1569,8 @@ out5: out6: nfs_fscache_unregister(); out7: + nfs_dns_resolver_destroy(); +out8: return err; } @@ -1511,6 +1582,7 @@ static void __exit exit_nfs_fs(void) nfs_destroy_inodecache(); nfs_destroy_nfspagecache(); nfs_fscache_unregister(); + nfs_dns_resolver_destroy(); #ifdef CONFIG_PROC_FS rpc_proc_unregister("nfs"); #endif diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 7dd90a6769d0..e21b1bb9972f 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -49,6 +49,11 @@ struct nfs_clone_mount { #define NFS_MAX_SECFLAVORS (12) /* + * Value used if the user did not specify a port value. + */ +#define NFS_UNSPEC_PORT (-1) + +/* * In-kernel mount arguments */ struct nfs_parsed_mount_data { @@ -63,6 +68,7 @@ struct nfs_parsed_mount_data { unsigned int auth_flavor_len; rpc_authflavor_t auth_flavors[1]; char *client_address; + unsigned int version; unsigned int minorversion; char *fscache_uniq; @@ -71,7 +77,7 @@ struct nfs_parsed_mount_data { size_t addrlen; char *hostname; u32 version; - unsigned short port; + int port; unsigned short protocol; } mount_server; @@ -80,7 +86,7 @@ struct nfs_parsed_mount_data { size_t addrlen; char *hostname; char *export_path; - unsigned short port; + int port; unsigned short protocol; } nfs_server; @@ -102,6 +108,7 @@ struct nfs_mount_request { }; extern int nfs_mount(struct nfs_mount_request *info); +extern void nfs_umount(const struct nfs_mount_request *info); /* client.c */ extern struct rpc_program nfs_program; @@ -213,7 +220,6 @@ void nfs_zap_acl_cache(struct inode *inode); extern int nfs_wait_bit_killable(void *word); /* super.c */ -void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *); extern struct file_system_type nfs_xdev_fs_type; #ifdef CONFIG_NFS_V4 extern struct file_system_type nfs4_xdev_fs_type; @@ -248,6 +254,12 @@ extern void nfs_read_prepare(struct rpc_task *task, void *calldata); /* write.c */ extern void nfs_write_prepare(struct rpc_task *task, void *calldata); +#ifdef CONFIG_MIGRATION +extern int nfs_migrate_page(struct address_space *, + struct page *, struct page *); +#else +#define nfs_migrate_page NULL +#endif /* nfs4proc.c */ extern int _nfs4_call_sync(struct nfs_server *server, @@ -368,24 +380,3 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len) return ((unsigned long)len + (unsigned long)base + PAGE_SIZE - 1) >> PAGE_SHIFT; } - -#define IPV6_SCOPE_DELIMITER '%' - -/* - * Set the port number in an address. Be agnostic about the address - * family. - */ -static inline void nfs_set_port(struct sockaddr *sap, unsigned short port) -{ - struct sockaddr_in *ap = (struct sockaddr_in *)sap; - struct sockaddr_in6 *ap6 = (struct sockaddr_in6 *)sap; - - switch (sap->sa_family) { - case AF_INET: - ap->sin_port = htons(port); - break; - case AF_INET6: - ap6->sin6_port = htons(port); - break; - } -} diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 38ef9eaec407..0adefc40cc89 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -209,6 +209,71 @@ out_mnt_err: goto out; } +/** + * nfs_umount - Notify a server that we have unmounted this export + * @info: pointer to umount request arguments + * + * MOUNTPROC_UMNT is advisory, so we set a short timeout, and always + * use UDP. + */ +void nfs_umount(const struct nfs_mount_request *info) +{ + static const struct rpc_timeout nfs_umnt_timeout = { + .to_initval = 1 * HZ, + .to_maxval = 3 * HZ, + .to_retries = 2, + }; + struct rpc_create_args args = { + .protocol = IPPROTO_UDP, + .address = info->sap, + .addrsize = info->salen, + .timeout = &nfs_umnt_timeout, + .servername = info->hostname, + .program = &mnt_program, + .version = info->version, + .authflavor = RPC_AUTH_UNIX, + .flags = RPC_CLNT_CREATE_NOPING, + }; + struct mountres result; + struct rpc_message msg = { + .rpc_argp = info->dirpath, + .rpc_resp = &result, + }; + struct rpc_clnt *clnt; + int status; + + if (info->noresvport) + args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; + + clnt = rpc_create(&args); + if (unlikely(IS_ERR(clnt))) + goto out_clnt_err; + + dprintk("NFS: sending UMNT request for %s:%s\n", + (info->hostname ? info->hostname : "server"), info->dirpath); + + if (info->version == NFS_MNT3_VERSION) + msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC3_UMNT]; + else + msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC_UMNT]; + + status = rpc_call_sync(clnt, &msg, 0); + rpc_shutdown_client(clnt); + + if (unlikely(status < 0)) + goto out_call_err; + + return; + +out_clnt_err: + dprintk("NFS: failed to create UMNT RPC client, status=%ld\n", + PTR_ERR(clnt)); + return; + +out_call_err: + dprintk("NFS: UMNT request failed, status=%d\n", status); +} + /* * XDR encode/decode functions for MOUNT */ @@ -258,7 +323,7 @@ static int decode_status(struct xdr_stream *xdr, struct mountres *res) return -EIO; status = ntohl(*p); - for (i = 0; i <= ARRAY_SIZE(mnt_errtbl); i++) { + for (i = 0; i < ARRAY_SIZE(mnt_errtbl); i++) { if (mnt_errtbl[i].status == status) { res->errno = mnt_errtbl[i].errno; return 0; @@ -309,7 +374,7 @@ static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res) return -EIO; status = ntohl(*p); - for (i = 0; i <= ARRAY_SIZE(mnt3_errtbl); i++) { + for (i = 0; i < ARRAY_SIZE(mnt3_errtbl); i++) { if (mnt3_errtbl[i].status == status) { res->errno = mnt3_errtbl[i].errno; return 0; @@ -407,6 +472,13 @@ static struct rpc_procinfo mnt_procedures[] = { .p_statidx = MOUNTPROC_MNT, .p_name = "MOUNT", }, + [MOUNTPROC_UMNT] = { + .p_proc = MOUNTPROC_UMNT, + .p_encode = (kxdrproc_t)mnt_enc_dirpath, + .p_arglen = MNT_enc_dirpath_sz, + .p_statidx = MOUNTPROC_UMNT, + .p_name = "UMOUNT", + }, }; static struct rpc_procinfo mnt3_procedures[] = { @@ -419,6 +491,13 @@ static struct rpc_procinfo mnt3_procedures[] = { .p_statidx = MOUNTPROC3_MNT, .p_name = "MOUNT", }, + [MOUNTPROC3_UMNT] = { + .p_proc = MOUNTPROC3_UMNT, + .p_encode = (kxdrproc_t)mnt_enc_dirpath, + .p_arglen = MNT_enc_dirpath_sz, + .p_statidx = MOUNTPROC3_UMNT, + .p_name = "UMOUNT", + }, }; diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index d0cc5ce0edfe..ee6a13f05443 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -299,7 +299,6 @@ static void nfs3_free_createdata(struct nfs3_createdata *data) /* * Create a regular file. - * For now, we don't implement O_EXCL. */ static int nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 2a2a0a7143ad..2636c26d56fa 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -17,6 +17,7 @@ #include <linux/inet.h> #include "internal.h" #include "nfs4_fs.h" +#include "dns_resolve.h" #define NFSDBG_FACILITY NFSDBG_VFS @@ -95,6 +96,20 @@ static int nfs4_validate_fspath(const struct vfsmount *mnt_parent, return 0; } +static size_t nfs_parse_server_name(char *string, size_t len, + struct sockaddr *sa, size_t salen) +{ + ssize_t ret; + + ret = rpc_pton(string, len, sa, salen); + if (ret == 0) { + ret = nfs_dns_resolve_name(string, len, sa, salen); + if (ret < 0) + ret = 0; + } + return ret; +} + static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, char *page, char *page2, const struct nfs4_fs_location *location) @@ -121,11 +136,12 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len)) continue; - nfs_parse_ip_address(buf->data, buf->len, - mountdata->addr, &mountdata->addrlen); - if (mountdata->addr->sa_family == AF_UNSPEC) + mountdata->addrlen = nfs_parse_server_name(buf->data, + buf->len, + mountdata->addr, mountdata->addrlen); + if (mountdata->addrlen == 0) continue; - nfs_set_port(mountdata->addr, NFS_PORT); + rpc_set_port(mountdata->addr, NFS_PORT); memcpy(page2, buf->data, buf->len); page2[buf->len] = '\0'; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6917311f201c..be6544aef41f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -61,6 +61,8 @@ #define NFS4_POLL_RETRY_MIN (HZ/10) #define NFS4_POLL_RETRY_MAX (15*HZ) +#define NFS4_MAX_LOOP_ON_RECOVER (10) + struct nfs4_opendata; static int _nfs4_proc_open(struct nfs4_opendata *data); static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); @@ -426,17 +428,19 @@ out: static int nfs4_recover_session(struct nfs4_session *session) { struct nfs_client *clp = session->clp; + unsigned int loop; int ret; - for (;;) { + for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) { ret = nfs4_wait_clnt_recover(clp); if (ret != 0) - return ret; + break; if (!test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)) break; nfs4_schedule_state_manager(clp); + ret = -EIO; } - return 0; + return ret; } static int nfs41_setup_sequence(struct nfs4_session *session, @@ -1444,18 +1448,20 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) static int nfs4_recover_expired_lease(struct nfs_server *server) { struct nfs_client *clp = server->nfs_client; + unsigned int loop; int ret; - for (;;) { + for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) { ret = nfs4_wait_clnt_recover(clp); if (ret != 0) - return ret; + break; if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) && !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state)) break; nfs4_schedule_state_recovery(clp); + ret = -EIO; } - return 0; + return ret; } /* @@ -1997,12 +2003,34 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f status = nfs4_call_sync(server, &msg, &args, &res, 0); if (status == 0) { memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask)); + server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS| + NFS_CAP_SYMLINKS|NFS_CAP_FILEID| + NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER| + NFS_CAP_OWNER_GROUP|NFS_CAP_ATIME| + NFS_CAP_CTIME|NFS_CAP_MTIME); if (res.attr_bitmask[0] & FATTR4_WORD0_ACL) server->caps |= NFS_CAP_ACLS; if (res.has_links != 0) server->caps |= NFS_CAP_HARDLINKS; if (res.has_symlinks != 0) server->caps |= NFS_CAP_SYMLINKS; + if (res.attr_bitmask[0] & FATTR4_WORD0_FILEID) + server->caps |= NFS_CAP_FILEID; + if (res.attr_bitmask[1] & FATTR4_WORD1_MODE) + server->caps |= NFS_CAP_MODE; + if (res.attr_bitmask[1] & FATTR4_WORD1_NUMLINKS) + server->caps |= NFS_CAP_NLINK; + if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER) + server->caps |= NFS_CAP_OWNER; + if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER_GROUP) + server->caps |= NFS_CAP_OWNER_GROUP; + if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_ACCESS) + server->caps |= NFS_CAP_ATIME; + if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_METADATA) + server->caps |= NFS_CAP_CTIME; + if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY) + server->caps |= NFS_CAP_MTIME; + memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask)); server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE; server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 65ca8c18476f..1434080aefeb 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1250,8 +1250,8 @@ static void nfs4_state_manager(struct nfs_client *clp) continue; } /* Initialize or reset the session */ - if (nfs4_has_session(clp) && - test_and_clear_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)) { + if (test_and_clear_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state) + && nfs4_has_session(clp)) { if (clp->cl_cons_state == NFS_CS_SESSION_INITING) status = nfs4_initialize_session(clp); else diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 617273e7d47f..cfc30d362f94 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -702,29 +702,12 @@ struct compound_hdr { u32 minorversion; }; -/* - * START OF "GENERIC" ENCODE ROUTINES. - * These may look a little ugly since they are imported from a "generic" - * set of XDR encode/decode routines which are intended to be shared by - * all of our NFSv4 implementations (OpenBSD, MacOS X...). - * - * If the pain of reading these is too great, it should be a straightforward - * task to translate them into Linux-specific versions which are more - * consistent with the style used in NFSv2/v3... - */ -#define WRITE32(n) *p++ = htonl(n) -#define WRITE64(n) do { \ - *p++ = htonl((uint32_t)((n) >> 32)); \ - *p++ = htonl((uint32_t)(n)); \ -} while (0) -#define WRITEMEM(ptr,nbytes) do { \ - p = xdr_encode_opaque_fixed(p, ptr, nbytes); \ -} while (0) - -#define RESERVE_SPACE(nbytes) do { \ - p = xdr_reserve_space(xdr, nbytes); \ - BUG_ON(!p); \ -} while (0) +static __be32 *reserve_space(struct xdr_stream *xdr, size_t nbytes) +{ + __be32 *p = xdr_reserve_space(xdr, nbytes); + BUG_ON(!p); + return p; +} static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) { @@ -749,12 +732,11 @@ static void encode_compound_hdr(struct xdr_stream *xdr, dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag); BUG_ON(hdr->taglen > NFS4_MAXTAGLEN); - RESERVE_SPACE(12+(XDR_QUADLEN(hdr->taglen)<<2)); - WRITE32(hdr->taglen); - WRITEMEM(hdr->tag, hdr->taglen); - WRITE32(hdr->minorversion); + p = reserve_space(xdr, 4 + hdr->taglen + 8); + p = xdr_encode_opaque(p, hdr->tag, hdr->taglen); + *p++ = cpu_to_be32(hdr->minorversion); hdr->nops_p = p; - WRITE32(hdr->nops); + *p = cpu_to_be32(hdr->nops); } static void encode_nops(struct compound_hdr *hdr) @@ -829,55 +811,53 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const len += 16; else if (iap->ia_valid & ATTR_MTIME) len += 4; - RESERVE_SPACE(len); + p = reserve_space(xdr, len); /* * We write the bitmap length now, but leave the bitmap and the attribute * buffer length to be backfilled at the end of this routine. */ - WRITE32(2); + *p++ = cpu_to_be32(2); q = p; p += 3; if (iap->ia_valid & ATTR_SIZE) { bmval0 |= FATTR4_WORD0_SIZE; - WRITE64(iap->ia_size); + p = xdr_encode_hyper(p, iap->ia_size); } if (iap->ia_valid & ATTR_MODE) { bmval1 |= FATTR4_WORD1_MODE; - WRITE32(iap->ia_mode & S_IALLUGO); + *p++ = cpu_to_be32(iap->ia_mode & S_IALLUGO); } if (iap->ia_valid & ATTR_UID) { bmval1 |= FATTR4_WORD1_OWNER; - WRITE32(owner_namelen); - WRITEMEM(owner_name, owner_namelen); + p = xdr_encode_opaque(p, owner_name, owner_namelen); } if (iap->ia_valid & ATTR_GID) { bmval1 |= FATTR4_WORD1_OWNER_GROUP; - WRITE32(owner_grouplen); - WRITEMEM(owner_group, owner_grouplen); + p = xdr_encode_opaque(p, owner_group, owner_grouplen); } if (iap->ia_valid & ATTR_ATIME_SET) { bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; - WRITE32(NFS4_SET_TO_CLIENT_TIME); - WRITE32(0); - WRITE32(iap->ia_mtime.tv_sec); - WRITE32(iap->ia_mtime.tv_nsec); + *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(iap->ia_mtime.tv_sec); + *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec); } else if (iap->ia_valid & ATTR_ATIME) { bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; - WRITE32(NFS4_SET_TO_SERVER_TIME); + *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); } if (iap->ia_valid & ATTR_MTIME_SET) { bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; - WRITE32(NFS4_SET_TO_CLIENT_TIME); - WRITE32(0); - WRITE32(iap->ia_mtime.tv_sec); - WRITE32(iap->ia_mtime.tv_nsec); + *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); + *p++ = cpu_to_be32(0); + *p++ = cpu_to_be32(iap->ia_mtime.tv_sec); + *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec); } else if (iap->ia_valid & ATTR_MTIME) { bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; - WRITE32(NFS4_SET_TO_SERVER_TIME); + *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); } /* @@ -891,7 +871,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const len = (char *)p - (char *)q - 12; *q++ = htonl(bmval0); *q++ = htonl(bmval1); - *q++ = htonl(len); + *q = htonl(len); /* out: */ } @@ -900,9 +880,9 @@ static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hd { __be32 *p; - RESERVE_SPACE(8); - WRITE32(OP_ACCESS); - WRITE32(access); + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(OP_ACCESS); + *p = cpu_to_be32(access); hdr->nops++; hdr->replen += decode_access_maxsz; } @@ -911,10 +891,10 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg { __be32 *p; - RESERVE_SPACE(8+NFS4_STATEID_SIZE); - WRITE32(OP_CLOSE); - WRITE32(arg->seqid->sequence->counter); - WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); + p = reserve_space(xdr, 8+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_CLOSE); + *p++ = cpu_to_be32(arg->seqid->sequence->counter); + xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); hdr->nops++; hdr->replen += decode_close_maxsz; } @@ -923,10 +903,10 @@ static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *ar { __be32 *p; - RESERVE_SPACE(16); - WRITE32(OP_COMMIT); - WRITE64(args->offset); - WRITE32(args->count); + p = reserve_space(xdr, 16); + *p++ = cpu_to_be32(OP_COMMIT); + p = xdr_encode_hyper(p, args->offset); + *p = cpu_to_be32(args->count); hdr->nops++; hdr->replen += decode_commit_maxsz; } @@ -935,30 +915,28 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg * { __be32 *p; - RESERVE_SPACE(8); - WRITE32(OP_CREATE); - WRITE32(create->ftype); + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(OP_CREATE); + *p = cpu_to_be32(create->ftype); switch (create->ftype) { case NF4LNK: - RESERVE_SPACE(4); - WRITE32(create->u.symlink.len); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(create->u.symlink.len); xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len); break; case NF4BLK: case NF4CHR: - RESERVE_SPACE(8); - WRITE32(create->u.device.specdata1); - WRITE32(create->u.device.specdata2); + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(create->u.device.specdata1); + *p = cpu_to_be32(create->u.device.specdata2); break; default: break; } - RESERVE_SPACE(4 + create->name->len); - WRITE32(create->name->len); - WRITEMEM(create->name->name, create->name->len); + encode_string(xdr, create->name->len, create->name->name); hdr->nops++; hdr->replen += decode_create_maxsz; @@ -969,10 +947,10 @@ static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct c { __be32 *p; - RESERVE_SPACE(12); - WRITE32(OP_GETATTR); - WRITE32(1); - WRITE32(bitmap); + p = reserve_space(xdr, 12); + *p++ = cpu_to_be32(OP_GETATTR); + *p++ = cpu_to_be32(1); + *p = cpu_to_be32(bitmap); hdr->nops++; hdr->replen += decode_getattr_maxsz; } @@ -981,11 +959,11 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm { __be32 *p; - RESERVE_SPACE(16); - WRITE32(OP_GETATTR); - WRITE32(2); - WRITE32(bm0); - WRITE32(bm1); + p = reserve_space(xdr, 16); + *p++ = cpu_to_be32(OP_GETATTR); + *p++ = cpu_to_be32(2); + *p++ = cpu_to_be32(bm0); + *p = cpu_to_be32(bm1); hdr->nops++; hdr->replen += decode_getattr_maxsz; } @@ -1012,8 +990,8 @@ static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - RESERVE_SPACE(4); - WRITE32(OP_GETFH); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_GETFH); hdr->nops++; hdr->replen += decode_getfh_maxsz; } @@ -1022,10 +1000,9 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct { __be32 *p; - RESERVE_SPACE(8 + name->len); - WRITE32(OP_LINK); - WRITE32(name->len); - WRITEMEM(name->name, name->len); + p = reserve_space(xdr, 8 + name->len); + *p++ = cpu_to_be32(OP_LINK); + xdr_encode_opaque(p, name->name, name->len); hdr->nops++; hdr->replen += decode_link_maxsz; } @@ -1052,27 +1029,27 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args { __be32 *p; - RESERVE_SPACE(32); - WRITE32(OP_LOCK); - WRITE32(nfs4_lock_type(args->fl, args->block)); - WRITE32(args->reclaim); - WRITE64(args->fl->fl_start); - WRITE64(nfs4_lock_length(args->fl)); - WRITE32(args->new_lock_owner); + p = reserve_space(xdr, 32); + *p++ = cpu_to_be32(OP_LOCK); + *p++ = cpu_to_be32(nfs4_lock_type(args->fl, args->block)); + *p++ = cpu_to_be32(args->reclaim); + p = xdr_encode_hyper(p, args->fl->fl_start); + p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); + *p = cpu_to_be32(args->new_lock_owner); if (args->new_lock_owner){ - RESERVE_SPACE(4+NFS4_STATEID_SIZE+32); - WRITE32(args->open_seqid->sequence->counter); - WRITEMEM(args->open_stateid->data, NFS4_STATEID_SIZE); - WRITE32(args->lock_seqid->sequence->counter); - WRITE64(args->lock_owner.clientid); - WRITE32(16); - WRITEMEM("lock id:", 8); - WRITE64(args->lock_owner.id); + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+32); + *p++ = cpu_to_be32(args->open_seqid->sequence->counter); + p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(args->lock_seqid->sequence->counter); + p = xdr_encode_hyper(p, args->lock_owner.clientid); + *p++ = cpu_to_be32(16); + p = xdr_encode_opaque_fixed(p, "lock id:", 8); + xdr_encode_hyper(p, args->lock_owner.id); } else { - RESERVE_SPACE(NFS4_STATEID_SIZE+4); - WRITEMEM(args->lock_stateid->data, NFS4_STATEID_SIZE); - WRITE32(args->lock_seqid->sequence->counter); + p = reserve_space(xdr, NFS4_STATEID_SIZE+4); + p = xdr_encode_opaque_fixed(p, args->lock_stateid->data, NFS4_STATEID_SIZE); + *p = cpu_to_be32(args->lock_seqid->sequence->counter); } hdr->nops++; hdr->replen += decode_lock_maxsz; @@ -1082,15 +1059,15 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar { __be32 *p; - RESERVE_SPACE(52); - WRITE32(OP_LOCKT); - WRITE32(nfs4_lock_type(args->fl, 0)); - WRITE64(args->fl->fl_start); - WRITE64(nfs4_lock_length(args->fl)); - WRITE64(args->lock_owner.clientid); - WRITE32(16); - WRITEMEM("lock id:", 8); - WRITE64(args->lock_owner.id); + p = reserve_space(xdr, 52); + *p++ = cpu_to_be32(OP_LOCKT); + *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); + p = xdr_encode_hyper(p, args->fl->fl_start); + p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); + p = xdr_encode_hyper(p, args->lock_owner.clientid); + *p++ = cpu_to_be32(16); + p = xdr_encode_opaque_fixed(p, "lock id:", 8); + xdr_encode_hyper(p, args->lock_owner.id); hdr->nops++; hdr->replen += decode_lockt_maxsz; } @@ -1099,13 +1076,13 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar { __be32 *p; - RESERVE_SPACE(12+NFS4_STATEID_SIZE+16); - WRITE32(OP_LOCKU); - WRITE32(nfs4_lock_type(args->fl, 0)); - WRITE32(args->seqid->sequence->counter); - WRITEMEM(args->stateid->data, NFS4_STATEID_SIZE); - WRITE64(args->fl->fl_start); - WRITE64(nfs4_lock_length(args->fl)); + p = reserve_space(xdr, 12+NFS4_STATEID_SIZE+16); + *p++ = cpu_to_be32(OP_LOCKU); + *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); + *p++ = cpu_to_be32(args->seqid->sequence->counter); + p = xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE); + p = xdr_encode_hyper(p, args->fl->fl_start); + xdr_encode_hyper(p, nfs4_lock_length(args->fl)); hdr->nops++; hdr->replen += decode_locku_maxsz; } @@ -1115,10 +1092,9 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc int len = name->len; __be32 *p; - RESERVE_SPACE(8 + len); - WRITE32(OP_LOOKUP); - WRITE32(len); - WRITEMEM(name->name, len); + p = reserve_space(xdr, 8 + len); + *p++ = cpu_to_be32(OP_LOOKUP); + xdr_encode_opaque(p, name->name, len); hdr->nops++; hdr->replen += decode_lookup_maxsz; } @@ -1127,21 +1103,21 @@ static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode) { __be32 *p; - RESERVE_SPACE(8); + p = reserve_space(xdr, 8); switch (fmode & (FMODE_READ|FMODE_WRITE)) { case FMODE_READ: - WRITE32(NFS4_SHARE_ACCESS_READ); + *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_READ); break; case FMODE_WRITE: - WRITE32(NFS4_SHARE_ACCESS_WRITE); + *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_WRITE); break; case FMODE_READ|FMODE_WRITE: - WRITE32(NFS4_SHARE_ACCESS_BOTH); + *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_BOTH); break; default: - WRITE32(0); + *p++ = cpu_to_be32(0); } - WRITE32(0); /* for linux, share_deny = 0 always */ + *p = cpu_to_be32(0); /* for linux, share_deny = 0 always */ } static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_openargs *arg) @@ -1151,29 +1127,29 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4, * owner 4 = 32 */ - RESERVE_SPACE(8); - WRITE32(OP_OPEN); - WRITE32(arg->seqid->sequence->counter); + p = reserve_space(xdr, 8); + *p++ = cpu_to_be32(OP_OPEN); + *p = cpu_to_be32(arg->seqid->sequence->counter); encode_share_access(xdr, arg->fmode); - RESERVE_SPACE(28); - WRITE64(arg->clientid); - WRITE32(16); - WRITEMEM("open id:", 8); - WRITE64(arg->id); + p = reserve_space(xdr, 28); + p = xdr_encode_hyper(p, arg->clientid); + *p++ = cpu_to_be32(16); + p = xdr_encode_opaque_fixed(p, "open id:", 8); + xdr_encode_hyper(p, arg->id); } static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg) { __be32 *p; - RESERVE_SPACE(4); + p = reserve_space(xdr, 4); switch(arg->open_flags & O_EXCL) { case 0: - WRITE32(NFS4_CREATE_UNCHECKED); + *p = cpu_to_be32(NFS4_CREATE_UNCHECKED); encode_attrs(xdr, arg->u.attrs, arg->server); break; default: - WRITE32(NFS4_CREATE_EXCLUSIVE); + *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); encode_nfs4_verifier(xdr, &arg->u.verifier); } } @@ -1182,14 +1158,14 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a { __be32 *p; - RESERVE_SPACE(4); + p = reserve_space(xdr, 4); switch (arg->open_flags & O_CREAT) { case 0: - WRITE32(NFS4_OPEN_NOCREATE); + *p = cpu_to_be32(NFS4_OPEN_NOCREATE); break; default: BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL); - WRITE32(NFS4_OPEN_CREATE); + *p = cpu_to_be32(NFS4_OPEN_CREATE); encode_createmode(xdr, arg); } } @@ -1198,16 +1174,16 @@ static inline void encode_delegation_type(struct xdr_stream *xdr, fmode_t delega { __be32 *p; - RESERVE_SPACE(4); + p = reserve_space(xdr, 4); switch (delegation_type) { case 0: - WRITE32(NFS4_OPEN_DELEGATE_NONE); + *p = cpu_to_be32(NFS4_OPEN_DELEGATE_NONE); break; case FMODE_READ: - WRITE32(NFS4_OPEN_DELEGATE_READ); + *p = cpu_to_be32(NFS4_OPEN_DELEGATE_READ); break; case FMODE_WRITE|FMODE_READ: - WRITE32(NFS4_OPEN_DELEGATE_WRITE); + *p = cpu_to_be32(NFS4_OPEN_DELEGATE_WRITE); break; default: BUG(); @@ -1218,8 +1194,8 @@ static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr * { __be32 *p; - RESERVE_SPACE(4); - WRITE32(NFS4_OPEN_CLAIM_NULL); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(NFS4_OPEN_CLAIM_NULL); encode_string(xdr, name->len, name->name); } @@ -1227,8 +1203,8 @@ static inline void encode_claim_previous(struct xdr_stream *xdr, fmode_t type) { __be32 *p; - RESERVE_SPACE(4); - WRITE32(NFS4_OPEN_CLAIM_PREVIOUS); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(NFS4_OPEN_CLAIM_PREVIOUS); encode_delegation_type(xdr, type); } @@ -1236,9 +1212,9 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc { __be32 *p; - RESERVE_SPACE(4+NFS4_STATEID_SIZE); - WRITE32(NFS4_OPEN_CLAIM_DELEGATE_CUR); - WRITEMEM(stateid->data, NFS4_STATEID_SIZE); + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR); + xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); encode_string(xdr, name->len, name->name); } @@ -1267,10 +1243,10 @@ static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_co { __be32 *p; - RESERVE_SPACE(4+NFS4_STATEID_SIZE+4); - WRITE32(OP_OPEN_CONFIRM); - WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); - WRITE32(arg->seqid->sequence->counter); + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); + *p++ = cpu_to_be32(OP_OPEN_CONFIRM); + p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); + *p = cpu_to_be32(arg->seqid->sequence->counter); hdr->nops++; hdr->replen += decode_open_confirm_maxsz; } @@ -1279,10 +1255,10 @@ static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_close { __be32 *p; - RESERVE_SPACE(4+NFS4_STATEID_SIZE+4); - WRITE32(OP_OPEN_DOWNGRADE); - WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); - WRITE32(arg->seqid->sequence->counter); + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); + *p++ = cpu_to_be32(OP_OPEN_DOWNGRADE); + p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); + *p = cpu_to_be32(arg->seqid->sequence->counter); encode_share_access(xdr, arg->fmode); hdr->nops++; hdr->replen += decode_open_downgrade_maxsz; @@ -1294,10 +1270,9 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hd int len = fh->size; __be32 *p; - RESERVE_SPACE(8 + len); - WRITE32(OP_PUTFH); - WRITE32(len); - WRITEMEM(fh->data, len); + p = reserve_space(xdr, 8 + len); + *p++ = cpu_to_be32(OP_PUTFH); + xdr_encode_opaque(p, fh->data, len); hdr->nops++; hdr->replen += decode_putfh_maxsz; } @@ -1306,8 +1281,8 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - RESERVE_SPACE(4); - WRITE32(OP_PUTROOTFH); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_PUTROOTFH); hdr->nops++; hdr->replen += decode_putrootfh_maxsz; } @@ -1317,26 +1292,26 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context nfs4_stateid stateid; __be32 *p; - RESERVE_SPACE(NFS4_STATEID_SIZE); + p = reserve_space(xdr, NFS4_STATEID_SIZE); if (ctx->state != NULL) { nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner); - WRITEMEM(stateid.data, NFS4_STATEID_SIZE); + xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE); } else - WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE); + xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); } static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr) { __be32 *p; - RESERVE_SPACE(4); - WRITE32(OP_READ); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_READ); encode_stateid(xdr, args->context); - RESERVE_SPACE(12); - WRITE64(args->offset); - WRITE32(args->count); + p = reserve_space(xdr, 12); + p = xdr_encode_hyper(p, args->offset); + *p = cpu_to_be32(args->count); hdr->nops++; hdr->replen += decode_read_maxsz; } @@ -1349,20 +1324,20 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg }; __be32 *p; - RESERVE_SPACE(12+NFS4_VERIFIER_SIZE+20); - WRITE32(OP_READDIR); - WRITE64(readdir->cookie); - WRITEMEM(readdir->verifier.data, NFS4_VERIFIER_SIZE); - WRITE32(readdir->count >> 1); /* We're not doing readdirplus */ - WRITE32(readdir->count); - WRITE32(2); + p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20); + *p++ = cpu_to_be32(OP_READDIR); + p = xdr_encode_hyper(p, readdir->cookie); + p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE); + *p++ = cpu_to_be32(readdir->count >> 1); /* We're not doing readdirplus */ + *p++ = cpu_to_be32(readdir->count); + *p++ = cpu_to_be32(2); /* Switch to mounted_on_fileid if the server supports it */ if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID) attrs[0] &= ~FATTR4_WORD0_FILEID; else attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; - WRITE32(attrs[0] & readdir->bitmask[0]); - WRITE32(attrs[1] & readdir->bitmask[1]); + *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]); + *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]); hdr->nops++; hdr->replen += decode_readdir_maxsz; dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n", @@ -1378,8 +1353,8 @@ static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink * { __be32 *p; - RESERVE_SPACE(4); - WRITE32(OP_READLINK); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_READLINK); hdr->nops++; hdr->replen += decode_readlink_maxsz; } @@ -1388,10 +1363,9 @@ static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struc { __be32 *p; - RESERVE_SPACE(8 + name->len); - WRITE32(OP_REMOVE); - WRITE32(name->len); - WRITEMEM(name->name, name->len); + p = reserve_space(xdr, 8 + name->len); + *p++ = cpu_to_be32(OP_REMOVE); + xdr_encode_opaque(p, name->name, name->len); hdr->nops++; hdr->replen += decode_remove_maxsz; } @@ -1400,14 +1374,10 @@ static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, co { __be32 *p; - RESERVE_SPACE(8 + oldname->len); - WRITE32(OP_RENAME); - WRITE32(oldname->len); - WRITEMEM(oldname->name, oldname->len); - - RESERVE_SPACE(4 + newname->len); - WRITE32(newname->len); - WRITEMEM(newname->name, newname->len); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_RENAME); + encode_string(xdr, oldname->len, oldname->name); + encode_string(xdr, newname->len, newname->name); hdr->nops++; hdr->replen += decode_rename_maxsz; } @@ -1416,9 +1386,9 @@ static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client { __be32 *p; - RESERVE_SPACE(12); - WRITE32(OP_RENEW); - WRITE64(client_stateid->cl_clientid); + p = reserve_space(xdr, 12); + *p++ = cpu_to_be32(OP_RENEW); + xdr_encode_hyper(p, client_stateid->cl_clientid); hdr->nops++; hdr->replen += decode_renew_maxsz; } @@ -1428,8 +1398,8 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - RESERVE_SPACE(4); - WRITE32(OP_RESTOREFH); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_RESTOREFH); hdr->nops++; hdr->replen += decode_restorefh_maxsz; } @@ -1439,16 +1409,16 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun { __be32 *p; - RESERVE_SPACE(4+NFS4_STATEID_SIZE); - WRITE32(OP_SETATTR); - WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE); - RESERVE_SPACE(2*4); - WRITE32(1); - WRITE32(FATTR4_WORD0_ACL); + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_SETATTR); + xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); + p = reserve_space(xdr, 2*4); + *p++ = cpu_to_be32(1); + *p = cpu_to_be32(FATTR4_WORD0_ACL); if (arg->acl_len % 4) return -EINVAL; - RESERVE_SPACE(4); - WRITE32(arg->acl_len); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(arg->acl_len); xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); hdr->nops++; hdr->replen += decode_setacl_maxsz; @@ -1460,8 +1430,8 @@ encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - RESERVE_SPACE(4); - WRITE32(OP_SAVEFH); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_SAVEFH); hdr->nops++; hdr->replen += decode_savefh_maxsz; } @@ -1470,9 +1440,9 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs { __be32 *p; - RESERVE_SPACE(4+NFS4_STATEID_SIZE); - WRITE32(OP_SETATTR); - WRITEMEM(arg->stateid.data, NFS4_STATEID_SIZE); + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_SETATTR); + xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE); hdr->nops++; hdr->replen += decode_setattr_maxsz; encode_attrs(xdr, arg->iap, server); @@ -1482,17 +1452,17 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie { __be32 *p; - RESERVE_SPACE(4 + NFS4_VERIFIER_SIZE); - WRITE32(OP_SETCLIENTID); - WRITEMEM(setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE); + p = reserve_space(xdr, 4 + NFS4_VERIFIER_SIZE); + *p++ = cpu_to_be32(OP_SETCLIENTID); + xdr_encode_opaque_fixed(p, setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE); encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name); - RESERVE_SPACE(4); - WRITE32(setclientid->sc_prog); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(setclientid->sc_prog); encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid); encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr); - RESERVE_SPACE(4); - WRITE32(setclientid->sc_cb_ident); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(setclientid->sc_cb_ident); hdr->nops++; hdr->replen += decode_setclientid_maxsz; } @@ -1501,10 +1471,10 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_ { __be32 *p; - RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE); - WRITE32(OP_SETCLIENTID_CONFIRM); - WRITE64(client_state->cl_clientid); - WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE); + p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE); + *p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM); + p = xdr_encode_hyper(p, client_state->cl_clientid); + xdr_encode_opaque_fixed(p, client_state->cl_confirm.data, NFS4_VERIFIER_SIZE); hdr->nops++; hdr->replen += decode_setclientid_confirm_maxsz; } @@ -1513,15 +1483,15 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg { __be32 *p; - RESERVE_SPACE(4); - WRITE32(OP_WRITE); + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_WRITE); encode_stateid(xdr, args->context); - RESERVE_SPACE(16); - WRITE64(args->offset); - WRITE32(args->stable); - WRITE32(args->count); + p = reserve_space(xdr, 16); + p = xdr_encode_hyper(p, args->offset); + *p++ = cpu_to_be32(args->stable); + *p = cpu_to_be32(args->count); xdr_write_pages(xdr, args->pages, args->pgbase, args->count); hdr->nops++; @@ -1532,10 +1502,10 @@ static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *state { __be32 *p; - RESERVE_SPACE(4+NFS4_STATEID_SIZE); + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); - WRITE32(OP_DELEGRETURN); - WRITEMEM(stateid->data, NFS4_STATEID_SIZE); + *p++ = cpu_to_be32(OP_DELEGRETURN); + xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); hdr->nops++; hdr->replen += decode_delegreturn_maxsz; } @@ -1548,16 +1518,16 @@ static void encode_exchange_id(struct xdr_stream *xdr, { __be32 *p; - RESERVE_SPACE(4 + sizeof(args->verifier->data)); - WRITE32(OP_EXCHANGE_ID); - WRITEMEM(args->verifier->data, sizeof(args->verifier->data)); + p = reserve_space(xdr, 4 + sizeof(args->verifier->data)); + *p++ = cpu_to_be32(OP_EXCHANGE_ID); + xdr_encode_opaque_fixed(p, args->verifier->data, sizeof(args->verifier->data)); encode_string(xdr, args->id_len, args->id); - RESERVE_SPACE(12); - WRITE32(args->flags); - WRITE32(0); /* zero length state_protect4_a */ - WRITE32(0); /* zero length implementation id array */ + p = reserve_space(xdr, 12); + *p++ = cpu_to_be32(args->flags); + *p++ = cpu_to_be32(0); /* zero length state_protect4_a */ + *p = cpu_to_be32(0); /* zero length implementation id array */ hdr->nops++; hdr->replen += decode_exchange_id_maxsz; } @@ -1571,55 +1541,43 @@ static void encode_create_session(struct xdr_stream *xdr, uint32_t len; struct nfs_client *clp = args->client; - RESERVE_SPACE(4); - WRITE32(OP_CREATE_SESSION); - - RESERVE_SPACE(8); - WRITE64(clp->cl_ex_clid); + len = scnprintf(machine_name, sizeof(machine_name), "%s", + clp->cl_ipaddr); - RESERVE_SPACE(8); - WRITE32(clp->cl_seqid); /*Sequence id */ - WRITE32(args->flags); /*flags */ + p = reserve_space(xdr, 20 + 2*28 + 20 + len + 12); + *p++ = cpu_to_be32(OP_CREATE_SESSION); + p = xdr_encode_hyper(p, clp->cl_ex_clid); + *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */ + *p++ = cpu_to_be32(args->flags); /*flags */ - RESERVE_SPACE(2*28); /* 2 channel_attrs */ /* Fore Channel */ - WRITE32(args->fc_attrs.headerpadsz); /* header padding size */ - WRITE32(args->fc_attrs.max_rqst_sz); /* max req size */ - WRITE32(args->fc_attrs.max_resp_sz); /* max resp size */ - WRITE32(args->fc_attrs.max_resp_sz_cached); /* Max resp sz cached */ - WRITE32(args->fc_attrs.max_ops); /* max operations */ - WRITE32(args->fc_attrs.max_reqs); /* max requests */ - WRITE32(0); /* rdmachannel_attrs */ + *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */ + *p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz); /* max req size */ + *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz); /* max resp size */ + *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz_cached); /* Max resp sz cached */ + *p++ = cpu_to_be32(args->fc_attrs.max_ops); /* max operations */ + *p++ = cpu_to_be32(args->fc_attrs.max_reqs); /* max requests */ + *p++ = cpu_to_be32(0); /* rdmachannel_attrs */ /* Back Channel */ - WRITE32(args->fc_attrs.headerpadsz); /* header padding size */ - WRITE32(args->bc_attrs.max_rqst_sz); /* max req size */ - WRITE32(args->bc_attrs.max_resp_sz); /* max resp size */ - WRITE32(args->bc_attrs.max_resp_sz_cached); /* Max resp sz cached */ - WRITE32(args->bc_attrs.max_ops); /* max operations */ - WRITE32(args->bc_attrs.max_reqs); /* max requests */ - WRITE32(0); /* rdmachannel_attrs */ - - RESERVE_SPACE(4); - WRITE32(args->cb_program); /* cb_program */ - - RESERVE_SPACE(4); /* # of security flavors */ - WRITE32(1); - - RESERVE_SPACE(4); - WRITE32(RPC_AUTH_UNIX); /* auth_sys */ + *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */ + *p++ = cpu_to_be32(args->bc_attrs.max_rqst_sz); /* max req size */ + *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz); /* max resp size */ + *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz_cached); /* Max resp sz cached */ + *p++ = cpu_to_be32(args->bc_attrs.max_ops); /* max operations */ + *p++ = cpu_to_be32(args->bc_attrs.max_reqs); /* max requests */ + *p++ = cpu_to_be32(0); /* rdmachannel_attrs */ + + *p++ = cpu_to_be32(args->cb_program); /* cb_program */ + *p++ = cpu_to_be32(1); + *p++ = cpu_to_be32(RPC_AUTH_UNIX); /* auth_sys */ /* authsys_parms rfc1831 */ - RESERVE_SPACE(4); - WRITE32((u32)clp->cl_boot_time.tv_nsec); /* stamp */ - len = scnprintf(machine_name, sizeof(machine_name), "%s", - clp->cl_ipaddr); - RESERVE_SPACE(16 + len); - WRITE32(len); - WRITEMEM(machine_name, len); - WRITE32(0); /* UID */ - WRITE32(0); /* GID */ - WRITE32(0); /* No more gids */ + *p++ = cpu_to_be32((u32)clp->cl_boot_time.tv_nsec); /* stamp */ + p = xdr_encode_opaque(p, machine_name, len); + *p++ = cpu_to_be32(0); /* UID */ + *p++ = cpu_to_be32(0); /* GID */ + *p = cpu_to_be32(0); /* No more gids */ hdr->nops++; hdr->replen += decode_create_session_maxsz; } @@ -1629,9 +1587,9 @@ static void encode_destroy_session(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - RESERVE_SPACE(4 + NFS4_MAX_SESSIONID_LEN); - WRITE32(OP_DESTROY_SESSION); - WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN); + p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN); + *p++ = cpu_to_be32(OP_DESTROY_SESSION); + xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); hdr->nops++; hdr->replen += decode_destroy_session_maxsz; } @@ -1655,8 +1613,8 @@ static void encode_sequence(struct xdr_stream *xdr, WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE); slot = tp->slots + args->sa_slotid; - RESERVE_SPACE(4); - WRITE32(OP_SEQUENCE); + p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN + 16); + *p++ = cpu_to_be32(OP_SEQUENCE); /* * Sessionid + seqid + slotid + max slotid + cache_this @@ -1670,12 +1628,11 @@ static void encode_sequence(struct xdr_stream *xdr, ((u32 *)session->sess_id.data)[3], slot->seq_nr, args->sa_slotid, tp->highest_used_slotid, args->sa_cache_this); - RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 16); - WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN); - WRITE32(slot->seq_nr); - WRITE32(args->sa_slotid); - WRITE32(tp->highest_used_slotid); - WRITE32(args->sa_cache_this); + p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); + *p++ = cpu_to_be32(slot->seq_nr); + *p++ = cpu_to_be32(args->sa_slotid); + *p++ = cpu_to_be32(tp->highest_used_slotid); + *p = cpu_to_be32(args->sa_cache_this); hdr->nops++; hdr->replen += decode_sequence_maxsz; #endif /* CONFIG_NFS_V4_1 */ @@ -2466,68 +2423,53 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p, } #endif /* CONFIG_NFS_V4_1 */ -/* - * START OF "GENERIC" DECODE ROUTINES. - * These may look a little ugly since they are imported from a "generic" - * set of XDR encode/decode routines which are intended to be shared by - * all of our NFSv4 implementations (OpenBSD, MacOS X...). - * - * If the pain of reading these is too great, it should be a straightforward - * task to translate them into Linux-specific versions which are more - * consistent with the style used in NFSv2/v3... - */ -#define READ32(x) (x) = ntohl(*p++) -#define READ64(x) do { \ - (x) = (u64)ntohl(*p++) << 32; \ - (x) |= ntohl(*p++); \ -} while (0) -#define READTIME(x) do { \ - p++; \ - (x.tv_sec) = ntohl(*p++); \ - (x.tv_nsec) = ntohl(*p++); \ -} while (0) -#define COPYMEM(x,nbytes) do { \ - memcpy((x), p, nbytes); \ - p += XDR_QUADLEN(nbytes); \ -} while (0) - -#define READ_BUF(nbytes) do { \ - p = xdr_inline_decode(xdr, nbytes); \ - if (unlikely(!p)) { \ - dprintk("nfs: %s: prematurely hit end of receive" \ - " buffer\n", __func__); \ - dprintk("nfs: %s: xdr->p=%p, bytes=%u, xdr->end=%p\n", \ - __func__, xdr->p, nbytes, xdr->end); \ - return -EIO; \ - } \ -} while (0) +static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) +{ + dprintk("nfs: %s: prematurely hit end of receive buffer. " + "Remaining buffer length is %tu words.\n", + func, xdr->end - xdr->p); +} static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string) { __be32 *p; - READ_BUF(4); - READ32(*len); - READ_BUF(*len); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, *len); + if (unlikely(!p)) + goto out_overflow; *string = (char *)p; return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - READ_BUF(8); - READ32(hdr->status); - READ32(hdr->taglen); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + hdr->status = be32_to_cpup(p++); + hdr->taglen = be32_to_cpup(p); - READ_BUF(hdr->taglen + 4); + p = xdr_inline_decode(xdr, hdr->taglen + 4); + if (unlikely(!p)) + goto out_overflow; hdr->tag = (char *)p; p += XDR_QUADLEN(hdr->taglen); - READ32(hdr->nops); + hdr->nops = be32_to_cpup(p); if (unlikely(hdr->nops < 1)) return nfs4_stat_to_errno(hdr->status); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) @@ -2536,18 +2478,23 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) uint32_t opnum; int32_t nfserr; - READ_BUF(8); - READ32(opnum); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + opnum = be32_to_cpup(p++); if (opnum != expected) { dprintk("nfs: Server returned operation" " %d but we issued a request for %d\n", opnum, expected); return -EIO; } - READ32(nfserr); + nfserr = be32_to_cpup(p); if (nfserr != NFS_OK) return nfs4_stat_to_errno(nfserr); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } /* Dummy routine */ @@ -2557,8 +2504,11 @@ static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs_client *clp) unsigned int strlen; char *str; - READ_BUF(12); - return decode_opaque_inline(xdr, &strlen, &str); + p = xdr_inline_decode(xdr, 12); + if (likely(p)) + return decode_opaque_inline(xdr, &strlen, &str); + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) @@ -2566,27 +2516,39 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) uint32_t bmlen; __be32 *p; - READ_BUF(4); - READ32(bmlen); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + bmlen = be32_to_cpup(p); bitmap[0] = bitmap[1] = 0; - READ_BUF((bmlen << 2)); + p = xdr_inline_decode(xdr, (bmlen << 2)); + if (unlikely(!p)) + goto out_overflow; if (bmlen > 0) { - READ32(bitmap[0]); + bitmap[0] = be32_to_cpup(p++); if (bmlen > 1) - READ32(bitmap[1]); + bitmap[1] = be32_to_cpup(p); } return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, __be32 **savep) { __be32 *p; - READ_BUF(4); - READ32(*attrlen); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *attrlen = be32_to_cpup(p); *savep = xdr->p; return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask) @@ -2609,8 +2571,10 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t * if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_TYPE)) { - READ_BUF(4); - READ32(*type); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *type = be32_to_cpup(p); if (*type < NF4REG || *type > NF4NAMEDATTR) { dprintk("%s: bad type %d\n", __func__, *type); return -EIO; @@ -2620,6 +2584,9 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t * } dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change) @@ -2631,14 +2598,19 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_CHANGE)) { - READ_BUF(8); - READ64(*change); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, change); bitmap[0] &= ~FATTR4_WORD0_CHANGE; ret = NFS_ATTR_FATTR_CHANGE; } dprintk("%s: change attribute=%Lu\n", __func__, (unsigned long long)*change); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size) @@ -2650,13 +2622,18 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t * if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_SIZE)) { - READ_BUF(8); - READ64(*size); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, size); bitmap[0] &= ~FATTR4_WORD0_SIZE; ret = NFS_ATTR_FATTR_SIZE; } dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -2667,12 +2644,17 @@ static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, ui if (unlikely(bitmap[0] & (FATTR4_WORD0_LINK_SUPPORT - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_LINK_SUPPORT)) { - READ_BUF(4); - READ32(*res); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT; } dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true"); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -2683,12 +2665,17 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, if (unlikely(bitmap[0] & (FATTR4_WORD0_SYMLINK_SUPPORT - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_SYMLINK_SUPPORT)) { - READ_BUF(4); - READ32(*res); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT; } dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true"); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid) @@ -2701,9 +2688,11 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs if (unlikely(bitmap[0] & (FATTR4_WORD0_FSID - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_FSID)) { - READ_BUF(16); - READ64(fsid->major); - READ64(fsid->minor); + p = xdr_inline_decode(xdr, 16); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &fsid->major); + xdr_decode_hyper(p, &fsid->minor); bitmap[0] &= ~FATTR4_WORD0_FSID; ret = NFS_ATTR_FATTR_FSID; } @@ -2711,6 +2700,9 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs (unsigned long long)fsid->major, (unsigned long long)fsid->minor); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -2721,12 +2713,17 @@ static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint if (unlikely(bitmap[0] & (FATTR4_WORD0_LEASE_TIME - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_LEASE_TIME)) { - READ_BUF(4); - READ32(*res); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME; } dprintk("%s: file size=%u\n", __func__, (unsigned int)*res); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -2737,12 +2734,17 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint if (unlikely(bitmap[0] & (FATTR4_WORD0_ACLSUPPORT - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) { - READ_BUF(4); - READ32(*res); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *res = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT; } dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) @@ -2754,13 +2756,18 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_FILEID)) { - READ_BUF(8); - READ64(*fileid); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, fileid); bitmap[0] &= ~FATTR4_WORD0_FILEID; ret = NFS_ATTR_FATTR_FILEID; } dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) @@ -2772,13 +2779,18 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) { - READ_BUF(8); - READ64(*fileid); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, fileid); bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; ret = NFS_ATTR_FATTR_FILEID; } dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -2790,12 +2802,17 @@ static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_AVAIL - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_FILES_AVAIL)) { - READ_BUF(8); - READ64(*res); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL; } dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -2807,12 +2824,17 @@ static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_FREE - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_FILES_FREE)) { - READ_BUF(8); - READ64(*res); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_FILES_FREE; } dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -2824,12 +2846,17 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_TOTAL - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_FILES_TOTAL)) { - READ_BUF(8); - READ64(*res); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL; } dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) @@ -2838,8 +2865,10 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) __be32 *p; int status = 0; - READ_BUF(4); - READ32(n); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + n = be32_to_cpup(p); if (n == 0) goto root_path; dprintk("path "); @@ -2873,6 +2902,9 @@ out_eio: dprintk(" status %d", status); status = -EIO; goto out; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res) @@ -2890,8 +2922,10 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st status = decode_pathname(xdr, &res->fs_path); if (unlikely(status != 0)) goto out; - READ_BUF(4); - READ32(n); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + n = be32_to_cpup(p); if (n <= 0) goto out_eio; res->nlocations = 0; @@ -2899,8 +2933,10 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st u32 m; struct nfs4_fs_location *loc = &res->locations[res->nlocations]; - READ_BUF(4); - READ32(m); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + m = be32_to_cpup(p); loc->nservers = 0; dprintk("%s: servers ", __func__); @@ -2939,6 +2975,8 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st out: dprintk("%s: fs_locations done, error = %d\n", __func__, status); return status; +out_overflow: + print_overflow_msg(__func__, xdr); out_eio: status = -EIO; goto out; @@ -2953,12 +2991,17 @@ static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uin if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXFILESIZE - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_MAXFILESIZE)) { - READ_BUF(8); - READ64(*res); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE; } dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxlink) @@ -2970,12 +3013,17 @@ static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXLINK - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_MAXLINK)) { - READ_BUF(4); - READ32(*maxlink); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *maxlink = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_MAXLINK; } dprintk("%s: maxlink=%u\n", __func__, *maxlink); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxname) @@ -2987,12 +3035,17 @@ static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXNAME - 1U))) return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_MAXNAME)) { - READ_BUF(4); - READ32(*maxname); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *maxname = be32_to_cpup(p); bitmap[0] &= ~FATTR4_WORD0_MAXNAME; } dprintk("%s: maxname=%u\n", __func__, *maxname); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -3005,8 +3058,10 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_MAXREAD)) { uint64_t maxread; - READ_BUF(8); - READ64(maxread); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, &maxread); if (maxread > 0x7FFFFFFF) maxread = 0x7FFFFFFF; *res = (uint32_t)maxread; @@ -3014,6 +3069,9 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ } dprintk("%s: maxread=%lu\n", __func__, (unsigned long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -3026,8 +3084,10 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32 return -EIO; if (likely(bitmap[0] & FATTR4_WORD0_MAXWRITE)) { uint64_t maxwrite; - READ_BUF(8); - READ64(maxwrite); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, &maxwrite); if (maxwrite > 0x7FFFFFFF) maxwrite = 0x7FFFFFFF; *res = (uint32_t)maxwrite; @@ -3035,6 +3095,9 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32 } dprintk("%s: maxwrite=%lu\n", __func__, (unsigned long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode) @@ -3047,14 +3110,19 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_MODE)) { - READ_BUF(4); - READ32(tmp); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + tmp = be32_to_cpup(p); *mode = tmp & ~S_IFMT; bitmap[1] &= ~FATTR4_WORD1_MODE; ret = NFS_ATTR_FATTR_MODE; } dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink) @@ -3066,16 +3134,22 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_NUMLINKS)) { - READ_BUF(4); - READ32(*nlink); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + *nlink = be32_to_cpup(p); bitmap[1] &= ~FATTR4_WORD1_NUMLINKS; ret = NFS_ATTR_FATTR_NLINK; } dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } -static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *uid) +static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, + struct nfs_client *clp, uint32_t *uid, int may_sleep) { uint32_t len; __be32 *p; @@ -3085,10 +3159,16 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_OWNER)) { - READ_BUF(4); - READ32(len); - READ_BUF(len); - if (len < XDR_MAX_NETOBJ) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; + if (!may_sleep) { + /* do nothing */ + } else if (len < XDR_MAX_NETOBJ) { if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0) ret = NFS_ATTR_FATTR_OWNER; else @@ -3101,9 +3181,13 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf } dprintk("%s: uid=%d\n", __func__, (int)*uid); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } -static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *gid) +static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, + struct nfs_client *clp, uint32_t *gid, int may_sleep) { uint32_t len; __be32 *p; @@ -3113,10 +3197,16 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_OWNER_GROUP)) { - READ_BUF(4); - READ32(len); - READ_BUF(len); - if (len < XDR_MAX_NETOBJ) { + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; + if (!may_sleep) { + /* do nothing */ + } else if (len < XDR_MAX_NETOBJ) { if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0) ret = NFS_ATTR_FATTR_GROUP; else @@ -3129,6 +3219,9 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf } dprintk("%s: gid=%d\n", __func__, (int)*gid); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev) @@ -3143,9 +3236,11 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde if (likely(bitmap[1] & FATTR4_WORD1_RAWDEV)) { dev_t tmp; - READ_BUF(8); - READ32(major); - READ32(minor); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + major = be32_to_cpup(p++); + minor = be32_to_cpup(p); tmp = MKDEV(major, minor); if (MAJOR(tmp) == major && MINOR(tmp) == minor) *rdev = tmp; @@ -3154,6 +3249,9 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde } dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -3165,12 +3263,17 @@ static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_AVAIL - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_SPACE_AVAIL)) { - READ_BUF(8); - READ64(*res); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL; } dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -3182,12 +3285,17 @@ static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_FREE - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_SPACE_FREE)) { - READ_BUF(8); - READ64(*res); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE; } dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) @@ -3199,12 +3307,17 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_TOTAL - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_SPACE_TOTAL)) { - READ_BUF(8); - READ64(*res); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, res); bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL; } dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used) @@ -3216,14 +3329,19 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U))) return -EIO; if (likely(bitmap[1] & FATTR4_WORD1_SPACE_USED)) { - READ_BUF(8); - READ64(*used); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, used); bitmap[1] &= ~FATTR4_WORD1_SPACE_USED; ret = NFS_ATTR_FATTR_SPACE_USED; } dprintk("%s: space used=%Lu\n", __func__, (unsigned long long)*used); return ret; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) @@ -3232,12 +3350,17 @@ static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) uint64_t sec; uint32_t nsec; - READ_BUF(12); - READ64(sec); - READ32(nsec); + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &sec); + nsec = be32_to_cpup(p); time->tv_sec = (time_t)sec; time->tv_nsec = (long)nsec; return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) @@ -3315,11 +3438,16 @@ static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *c { __be32 *p; - READ_BUF(20); - READ32(cinfo->atomic); - READ64(cinfo->before); - READ64(cinfo->after); + p = xdr_inline_decode(xdr, 20); + if (unlikely(!p)) + goto out_overflow; + cinfo->atomic = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &cinfo->before); + xdr_decode_hyper(p, &cinfo->after); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access) @@ -3331,40 +3459,62 @@ static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access) status = decode_op_hdr(xdr, OP_ACCESS); if (status) return status; - READ_BUF(8); - READ32(supp); - READ32(acc); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + supp = be32_to_cpup(p++); + acc = be32_to_cpup(p); access->supported = supp; access->access = acc; return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } -static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) +static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len) { __be32 *p; + + p = xdr_inline_decode(xdr, len); + if (likely(p)) { + memcpy(buf, p, len); + return 0; + } + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) +{ + return decode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE); +} + +static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) +{ int status; status = decode_op_hdr(xdr, OP_CLOSE); if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); - if (status) - return status; - READ_BUF(NFS4_STATEID_SIZE); - COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); - return 0; + if (!status) + status = decode_stateid(xdr, &res->stateid); + return status; +} + +static int decode_verifier(struct xdr_stream *xdr, void *verifier) +{ + return decode_opaque_fixed(xdr, verifier, 8); } static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res) { - __be32 *p; int status; status = decode_op_hdr(xdr, OP_COMMIT); - if (status) - return status; - READ_BUF(8); - COPYMEM(res->verf->verifier, 8); - return 0; + if (!status) + status = decode_verifier(xdr, res->verf->verifier); + return status; } static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) @@ -3378,10 +3528,16 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) return status; if ((status = decode_change_info(xdr, cinfo))) return status; - READ_BUF(4); - READ32(bmlen); - READ_BUF(bmlen << 2); - return 0; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + bmlen = be32_to_cpup(p); + p = xdr_inline_decode(xdr, bmlen << 2); + if (likely(p)) + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res) @@ -3466,7 +3622,8 @@ xdr_error: return status; } -static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, const struct nfs_server *server) +static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, + const struct nfs_server *server, int may_sleep) { __be32 *savep; uint32_t attrlen, @@ -3538,12 +3695,14 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons goto xdr_error; fattr->valid |= status; - status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid); + status = decode_attr_owner(xdr, bitmap, server->nfs_client, + &fattr->uid, may_sleep); if (status < 0) goto xdr_error; fattr->valid |= status; - status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid); + status = decode_attr_group(xdr, bitmap, server->nfs_client, + &fattr->gid, may_sleep); if (status < 0) goto xdr_error; fattr->valid |= status; @@ -3633,14 +3792,21 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh) if (status) return status; - READ_BUF(4); - READ32(len); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); if (len > NFS4_FHSIZE) return -EIO; fh->size = len; - READ_BUF(len); - COPYMEM(fh->data, len); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; + memcpy(fh->data, p, len); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) @@ -3662,10 +3828,12 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) __be32 *p; uint32_t namelen, type; - READ_BUF(32); - READ64(offset); - READ64(length); - READ32(type); + p = xdr_inline_decode(xdr, 32); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &offset); + p = xdr_decode_hyper(p, &length); + type = be32_to_cpup(p++); if (fl != NULL) { fl->fl_start = (loff_t)offset; fl->fl_end = fl->fl_start + (loff_t)length - 1; @@ -3676,23 +3844,27 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) fl->fl_type = F_RDLCK; fl->fl_pid = 0; } - READ64(clientid); - READ32(namelen); - READ_BUF(namelen); - return -NFS4ERR_DENIED; + p = xdr_decode_hyper(p, &clientid); + namelen = be32_to_cpup(p); + p = xdr_inline_decode(xdr, namelen); + if (likely(p)) + return -NFS4ERR_DENIED; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res) { - __be32 *p; int status; status = decode_op_hdr(xdr, OP_LOCK); if (status == -EIO) goto out; if (status == 0) { - READ_BUF(NFS4_STATEID_SIZE); - COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); + status = decode_stateid(xdr, &res->stateid); + if (unlikely(status)) + goto out; } else if (status == -NFS4ERR_DENIED) status = decode_lock_denied(xdr, NULL); if (res->open_seqid != NULL) @@ -3713,16 +3885,13 @@ static int decode_lockt(struct xdr_stream *xdr, struct nfs_lockt_res *res) static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res) { - __be32 *p; int status; status = decode_op_hdr(xdr, OP_LOCKU); if (status != -EIO) nfs_increment_lock_seqid(status, res->seqid); - if (status == 0) { - READ_BUF(NFS4_STATEID_SIZE); - COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); - } + if (status == 0) + status = decode_stateid(xdr, &res->stateid); return status; } @@ -3737,34 +3906,46 @@ static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize) __be32 *p; uint32_t limit_type, nblocks, blocksize; - READ_BUF(12); - READ32(limit_type); + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + goto out_overflow; + limit_type = be32_to_cpup(p++); switch (limit_type) { case 1: - READ64(*maxsize); + xdr_decode_hyper(p, maxsize); break; case 2: - READ32(nblocks); - READ32(blocksize); + nblocks = be32_to_cpup(p++); + blocksize = be32_to_cpup(p); *maxsize = (uint64_t)nblocks * (uint64_t)blocksize; } return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) { __be32 *p; uint32_t delegation_type; + int status; - READ_BUF(4); - READ32(delegation_type); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + delegation_type = be32_to_cpup(p); if (delegation_type == NFS4_OPEN_DELEGATE_NONE) { res->delegation_type = 0; return 0; } - READ_BUF(NFS4_STATEID_SIZE+4); - COPYMEM(res->delegation.data, NFS4_STATEID_SIZE); - READ32(res->do_recall); + status = decode_stateid(xdr, &res->delegation); + if (unlikely(status)) + return status; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + res->do_recall = be32_to_cpup(p); switch (delegation_type) { case NFS4_OPEN_DELEGATE_READ: @@ -3776,6 +3957,9 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) return -EIO; } return decode_ace(xdr, NULL, res->server->nfs_client); +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) @@ -3787,23 +3971,27 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) status = decode_op_hdr(xdr, OP_OPEN); if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); - if (status) + if (!status) + status = decode_stateid(xdr, &res->stateid); + if (unlikely(status)) return status; - READ_BUF(NFS4_STATEID_SIZE); - COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); decode_change_info(xdr, &res->cinfo); - READ_BUF(8); - READ32(res->rflags); - READ32(bmlen); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + res->rflags = be32_to_cpup(p++); + bmlen = be32_to_cpup(p); if (bmlen > 10) goto xdr_error; - READ_BUF(bmlen << 2); + p = xdr_inline_decode(xdr, bmlen << 2); + if (unlikely(!p)) + goto out_overflow; savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE); for (i = 0; i < savewords; ++i) - READ32(res->attrset[i]); + res->attrset[i] = be32_to_cpup(p++); for (; i < NFS4_BITMAP_SIZE; i++) res->attrset[i] = 0; @@ -3811,36 +3999,33 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) xdr_error: dprintk("%s: Bitmap too large! Length = %u\n", __func__, bmlen); return -EIO; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res) { - __be32 *p; int status; status = decode_op_hdr(xdr, OP_OPEN_CONFIRM); if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); - if (status) - return status; - READ_BUF(NFS4_STATEID_SIZE); - COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); - return 0; + if (!status) + status = decode_stateid(xdr, &res->stateid); + return status; } static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res) { - __be32 *p; int status; status = decode_op_hdr(xdr, OP_OPEN_DOWNGRADE); if (status != -EIO) nfs_increment_open_seqid(status, res->seqid); - if (status) - return status; - READ_BUF(NFS4_STATEID_SIZE); - COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); - return 0; + if (!status) + status = decode_stateid(xdr, &res->stateid); + return status; } static int decode_putfh(struct xdr_stream *xdr) @@ -3863,9 +4048,11 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_ status = decode_op_hdr(xdr, OP_READ); if (status) return status; - READ_BUF(8); - READ32(eof); - READ32(count); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + eof = be32_to_cpup(p++); + count = be32_to_cpup(p); hdrlen = (u8 *) p - (u8 *) iov->iov_base; recvd = req->rq_rcv_buf.len - hdrlen; if (count > recvd) { @@ -3878,6 +4065,9 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_ res->eof = eof; res->count = count; return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir) @@ -3892,17 +4082,17 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n int status; status = decode_op_hdr(xdr, OP_READDIR); - if (status) + if (!status) + status = decode_verifier(xdr, readdir->verifier.data); + if (unlikely(status)) return status; - READ_BUF(8); - COPYMEM(readdir->verifier.data, 8); dprintk("%s: verifier = %08x:%08x\n", __func__, ((u32 *)readdir->verifier.data)[0], ((u32 *)readdir->verifier.data)[1]); - hdrlen = (char *) p - (char *) iov->iov_base; + hdrlen = (char *) xdr->p - (char *) iov->iov_base; recvd = rcvbuf->len - hdrlen; if (pglen > recvd) pglen = recvd; @@ -3990,8 +4180,10 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) return status; /* Convert length of symlink */ - READ_BUF(4); - READ32(len); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); if (len >= rcvbuf->page_len || len <= 0) { dprintk("nfs: server returned giant symlink!\n"); return -ENAMETOOLONG; @@ -4015,6 +4207,9 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) kaddr[len+rcvbuf->page_base] = '\0'; kunmap_atomic(kaddr, KM_USER0); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_remove(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) @@ -4112,10 +4307,16 @@ static int decode_setattr(struct xdr_stream *xdr) status = decode_op_hdr(xdr, OP_SETATTR); if (status) return status; - READ_BUF(4); - READ32(bmlen); - READ_BUF(bmlen << 2); - return 0; + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + bmlen = be32_to_cpup(p); + p = xdr_inline_decode(xdr, bmlen << 2); + if (likely(p)) + return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp) @@ -4124,35 +4325,50 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp) uint32_t opnum; int32_t nfserr; - READ_BUF(8); - READ32(opnum); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + opnum = be32_to_cpup(p++); if (opnum != OP_SETCLIENTID) { dprintk("nfs: decode_setclientid: Server returned operation" " %d\n", opnum); return -EIO; } - READ32(nfserr); + nfserr = be32_to_cpup(p); if (nfserr == NFS_OK) { - READ_BUF(8 + NFS4_VERIFIER_SIZE); - READ64(clp->cl_clientid); - COPYMEM(clp->cl_confirm.data, NFS4_VERIFIER_SIZE); + p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE); + if (unlikely(!p)) + goto out_overflow; + p = xdr_decode_hyper(p, &clp->cl_clientid); + memcpy(clp->cl_confirm.data, p, NFS4_VERIFIER_SIZE); } else if (nfserr == NFSERR_CLID_INUSE) { uint32_t len; /* skip netid string */ - READ_BUF(4); - READ32(len); - READ_BUF(len); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; /* skip uaddr string */ - READ_BUF(4); - READ32(len); - READ_BUF(len); + p = xdr_inline_decode(xdr, 4); + if (unlikely(!p)) + goto out_overflow; + len = be32_to_cpup(p); + p = xdr_inline_decode(xdr, len); + if (unlikely(!p)) + goto out_overflow; return -NFSERR_CLID_INUSE; } else return nfs4_stat_to_errno(nfserr); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_setclientid_confirm(struct xdr_stream *xdr) @@ -4169,11 +4385,16 @@ static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res) if (status) return status; - READ_BUF(16); - READ32(res->count); - READ32(res->verf->committed); - COPYMEM(res->verf->verifier, 8); + p = xdr_inline_decode(xdr, 16); + if (unlikely(!p)) + goto out_overflow; + res->count = be32_to_cpup(p++); + res->verf->committed = be32_to_cpup(p++); + memcpy(res->verf->verifier, p, 8); return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_delegreturn(struct xdr_stream *xdr) @@ -4187,6 +4408,7 @@ static int decode_exchange_id(struct xdr_stream *xdr, { __be32 *p; uint32_t dummy; + char *dummy_str; int status; struct nfs_client *clp = res->client; @@ -4194,36 +4416,45 @@ static int decode_exchange_id(struct xdr_stream *xdr, if (status) return status; - READ_BUF(8); - READ64(clp->cl_ex_clid); - READ_BUF(12); - READ32(clp->cl_seqid); - READ32(clp->cl_exchange_flags); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + xdr_decode_hyper(p, &clp->cl_ex_clid); + p = xdr_inline_decode(xdr, 12); + if (unlikely(!p)) + goto out_overflow; + clp->cl_seqid = be32_to_cpup(p++); + clp->cl_exchange_flags = be32_to_cpup(p++); /* We ask for SP4_NONE */ - READ32(dummy); + dummy = be32_to_cpup(p); if (dummy != SP4_NONE) return -EIO; /* Throw away minor_id */ - READ_BUF(8); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; /* Throw away Major id */ - READ_BUF(4); - READ32(dummy); - READ_BUF(dummy); + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if (unlikely(status)) + return status; /* Throw away server_scope */ - READ_BUF(4); - READ32(dummy); - READ_BUF(dummy); + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if (unlikely(status)) + return status; /* Throw away Implementation id array */ - READ_BUF(4); - READ32(dummy); - READ_BUF(dummy); + status = decode_opaque_inline(xdr, &dummy, &dummy_str); + if (unlikely(status)) + return status; return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_chan_attrs(struct xdr_stream *xdr, @@ -4232,22 +4463,35 @@ static int decode_chan_attrs(struct xdr_stream *xdr, __be32 *p; u32 nr_attrs; - READ_BUF(28); - READ32(attrs->headerpadsz); - READ32(attrs->max_rqst_sz); - READ32(attrs->max_resp_sz); - READ32(attrs->max_resp_sz_cached); - READ32(attrs->max_ops); - READ32(attrs->max_reqs); - READ32(nr_attrs); + p = xdr_inline_decode(xdr, 28); + if (unlikely(!p)) + goto out_overflow; + attrs->headerpadsz = be32_to_cpup(p++); + attrs->max_rqst_sz = be32_to_cpup(p++); + attrs->max_resp_sz = be32_to_cpup(p++); + attrs->max_resp_sz_cached = be32_to_cpup(p++); + attrs->max_ops = be32_to_cpup(p++); + attrs->max_reqs = be32_to_cpup(p++); + nr_attrs = be32_to_cpup(p); if (unlikely(nr_attrs > 1)) { printk(KERN_WARNING "%s: Invalid rdma channel attrs count %u\n", __func__, nr_attrs); return -EINVAL; } - if (nr_attrs == 1) - READ_BUF(4); /* skip rdma_attrs */ + if (nr_attrs == 1) { + p = xdr_inline_decode(xdr, 4); /* skip rdma_attrs */ + if (unlikely(!p)) + goto out_overflow; + } return 0; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +static int decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid) +{ + return decode_opaque_fixed(xdr, sid->data, NFS4_MAX_SESSIONID_LEN); } static int decode_create_session(struct xdr_stream *xdr, @@ -4259,24 +4503,26 @@ static int decode_create_session(struct xdr_stream *xdr, struct nfs4_session *session = clp->cl_session; status = decode_op_hdr(xdr, OP_CREATE_SESSION); - - if (status) + if (!status) + status = decode_sessionid(xdr, &session->sess_id); + if (unlikely(status)) return status; - /* sessionid */ - READ_BUF(NFS4_MAX_SESSIONID_LEN); - COPYMEM(&session->sess_id, NFS4_MAX_SESSIONID_LEN); - /* seqid, flags */ - READ_BUF(8); - READ32(clp->cl_seqid); - READ32(session->flags); + p = xdr_inline_decode(xdr, 8); + if (unlikely(!p)) + goto out_overflow; + clp->cl_seqid = be32_to_cpup(p++); + session->flags = be32_to_cpup(p); /* Channel attributes */ status = decode_chan_attrs(xdr, &session->fc_attrs); if (!status) status = decode_chan_attrs(xdr, &session->bc_attrs); return status; +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; } static int decode_destroy_session(struct xdr_stream *xdr, void *dummy) @@ -4300,7 +4546,9 @@ static int decode_sequence(struct xdr_stream *xdr, return 0; status = decode_op_hdr(xdr, OP_SEQUENCE); - if (status) + if (!status) + status = decode_sessionid(xdr, &id); + if (unlikely(status)) goto out_err; /* @@ -4309,36 +4557,43 @@ static int decode_sequence(struct xdr_stream *xdr, */ status = -ESERVERFAULT; - slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid]; - READ_BUF(NFS4_MAX_SESSIONID_LEN + 20); - COPYMEM(id.data, NFS4_MAX_SESSIONID_LEN); if (memcmp(id.data, res->sr_session->sess_id.data, NFS4_MAX_SESSIONID_LEN)) { dprintk("%s Invalid session id\n", __func__); goto out_err; } + + p = xdr_inline_decode(xdr, 20); + if (unlikely(!p)) + goto out_overflow; + /* seqid */ - READ32(dummy); + slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid]; + dummy = be32_to_cpup(p++); if (dummy != slot->seq_nr) { dprintk("%s Invalid sequence number\n", __func__); goto out_err; } /* slot id */ - READ32(dummy); + dummy = be32_to_cpup(p++); if (dummy != res->sr_slotid) { dprintk("%s Invalid slot id\n", __func__); goto out_err; } /* highest slot id - currently not processed */ - READ32(dummy); + dummy = be32_to_cpup(p++); /* target highest slot id - currently not processed */ - READ32(dummy); + dummy = be32_to_cpup(p++); /* result flags - currently not processed */ - READ32(dummy); + dummy = be32_to_cpup(p); status = 0; out_err: res->sr_status = status; return status; +out_overflow: + print_overflow_msg(__func__, xdr); + status = -EIO; + goto out_err; #else /* CONFIG_NFS_V4_1 */ return 0; #endif /* CONFIG_NFS_V4_1 */ @@ -4370,7 +4625,8 @@ static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct status = decode_open_downgrade(&xdr, res); if (status != 0) goto out; - decode_getfattr(&xdr, res->fattr, res->server); + decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4397,7 +4653,8 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_ac status = decode_access(&xdr, res); if (status != 0) goto out; - decode_getfattr(&xdr, res->fattr, res->server); + decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4424,7 +4681,8 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lo goto out; if ((status = decode_getfh(&xdr, res->fh)) != 0) goto out; - status = decode_getfattr(&xdr, res->fattr, res->server); + status = decode_getfattr(&xdr, res->fattr, res->server + ,!RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4448,7 +4706,8 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, __be32 *p, struct nf if ((status = decode_putrootfh(&xdr)) != 0) goto out; if ((status = decode_getfh(&xdr, res->fh)) == 0) - status = decode_getfattr(&xdr, res->fattr, res->server); + status = decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4473,7 +4732,8 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_rem goto out; if ((status = decode_remove(&xdr, &res->cinfo)) != 0) goto out; - decode_getfattr(&xdr, &res->dir_attr, res->server); + decode_getfattr(&xdr, &res->dir_attr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4503,11 +4763,13 @@ static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_re if ((status = decode_rename(&xdr, &res->old_cinfo, &res->new_cinfo)) != 0) goto out; /* Current FH is target directory */ - if (decode_getfattr(&xdr, res->new_fattr, res->server) != 0) + if (decode_getfattr(&xdr, res->new_fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)) != 0) goto out; if ((status = decode_restorefh(&xdr)) != 0) goto out; - decode_getfattr(&xdr, res->old_fattr, res->server); + decode_getfattr(&xdr, res->old_fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4540,11 +4802,13 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_link * Note order: OP_LINK leaves the directory as the current * filehandle. */ - if (decode_getfattr(&xdr, res->dir_attr, res->server) != 0) + if (decode_getfattr(&xdr, res->dir_attr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)) != 0) goto out; if ((status = decode_restorefh(&xdr)) != 0) goto out; - decode_getfattr(&xdr, res->fattr, res->server); + decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4573,11 +4837,13 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_cr goto out; if ((status = decode_getfh(&xdr, res->fh)) != 0) goto out; - if (decode_getfattr(&xdr, res->fattr, res->server) != 0) + if (decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)) != 0) goto out; if ((status = decode_restorefh(&xdr)) != 0) goto out; - decode_getfattr(&xdr, res->dir_fattr, res->server); + decode_getfattr(&xdr, res->dir_fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4609,7 +4875,8 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_g status = decode_putfh(&xdr); if (status) goto out; - status = decode_getfattr(&xdr, res->fattr, res->server); + status = decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4716,7 +4983,8 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos * an ESTALE error. Shouldn't be a problem, * though, since fattr->valid will remain unset. */ - decode_getfattr(&xdr, res->fattr, res->server); + decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4748,11 +5016,13 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openr goto out; if (decode_getfh(&xdr, &res->fh) != 0) goto out; - if (decode_getfattr(&xdr, res->f_attr, res->server) != 0) + if (decode_getfattr(&xdr, res->f_attr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)) != 0) goto out; if (decode_restorefh(&xdr) != 0) goto out; - decode_getfattr(&xdr, res->dir_attr, res->server); + decode_getfattr(&xdr, res->dir_attr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4800,7 +5070,8 @@ static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nf status = decode_open(&xdr, res); if (status) goto out; - decode_getfattr(&xdr, res->f_attr, res->server); + decode_getfattr(&xdr, res->f_attr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -4827,7 +5098,8 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_se status = decode_setattr(&xdr); if (status) goto out; - decode_getfattr(&xdr, res->fattr, res->server); + decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -5001,7 +5273,8 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writ status = decode_write(&xdr, res); if (status) goto out; - decode_getfattr(&xdr, res->fattr, res->server); + decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); if (!status) status = res->count; out: @@ -5030,7 +5303,8 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, __be32 *p, struct nfs_wri status = decode_commit(&xdr, res); if (status) goto out; - decode_getfattr(&xdr, res->fattr, res->server); + decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -5194,7 +5468,8 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nf if (status != 0) goto out; status = decode_delegreturn(&xdr); - decode_getfattr(&xdr, res->fattr, res->server); + decode_getfattr(&xdr, res->fattr, res->server, + !RPC_IS_ASYNC(rqstp->rq_task)); out: return status; } @@ -5222,7 +5497,8 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p, goto out; xdr_enter_page(&xdr, PAGE_SIZE); status = decode_getfattr(&xdr, &res->fs_locations->fattr, - res->fs_locations->server); + res->fs_locations->server, + !RPC_IS_ASYNC(req->rq_task)); out: return status; } diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 73ea5e8d66ce..12c9e66d3f1d 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -60,17 +60,15 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) return p; } -static void nfs_readdata_free(struct nfs_read_data *p) +void nfs_readdata_free(struct nfs_read_data *p) { if (p && (p->pagevec != &p->page_array[0])) kfree(p->pagevec); mempool_free(p, nfs_rdata_mempool); } -void nfs_readdata_release(void *data) +static void nfs_readdata_release(struct nfs_read_data *rdata) { - struct nfs_read_data *rdata = data; - put_nfs_open_context(rdata->args.context); nfs_readdata_free(rdata); } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 0b4cbdc60abd..867f70504531 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -73,7 +73,7 @@ enum { Opt_cto, Opt_nocto, Opt_ac, Opt_noac, Opt_lock, Opt_nolock, - Opt_v2, Opt_v3, + Opt_v2, Opt_v3, Opt_v4, Opt_udp, Opt_tcp, Opt_rdma, Opt_acl, Opt_noacl, Opt_rdirplus, Opt_nordirplus, @@ -127,6 +127,7 @@ static const match_table_t nfs_mount_option_tokens = { { Opt_nolock, "nolock" }, { Opt_v2, "v2" }, { Opt_v3, "v3" }, + { Opt_v4, "v4" }, { Opt_udp, "udp" }, { Opt_tcp, "tcp" }, { Opt_rdma, "rdma" }, @@ -158,7 +159,7 @@ static const match_table_t nfs_mount_option_tokens = { { Opt_mountvers, "mountvers=%s" }, { Opt_nfsvers, "nfsvers=%s" }, { Opt_nfsvers, "vers=%s" }, - { Opt_minorversion, "minorversion=%u" }, + { Opt_minorversion, "minorversion=%s" }, { Opt_sec, "sec=%s" }, { Opt_proto, "proto=%s" }, @@ -272,6 +273,10 @@ static const struct super_operations nfs_sops = { }; #ifdef CONFIG_NFS_V4 +static int nfs4_validate_text_mount_data(void *options, + struct nfs_parsed_mount_data *args, const char *dev_name); +static int nfs4_try_mount(int flags, const char *dev_name, + struct nfs_parsed_mount_data *data, struct vfsmount *mnt); static int nfs4_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); static int nfs4_remote_get_sb(struct file_system_type *fs_type, @@ -742,127 +747,23 @@ static int nfs_verify_server_address(struct sockaddr *addr) } } + dfprintk(MOUNT, "NFS: Invalid IP address specified\n"); return 0; } -static void nfs_parse_ipv4_address(char *string, size_t str_len, - struct sockaddr *sap, size_t *addr_len) -{ - struct sockaddr_in *sin = (struct sockaddr_in *)sap; - u8 *addr = (u8 *)&sin->sin_addr.s_addr; - - if (str_len <= INET_ADDRSTRLEN) { - dfprintk(MOUNT, "NFS: parsing IPv4 address %*s\n", - (int)str_len, string); - - sin->sin_family = AF_INET; - *addr_len = sizeof(*sin); - if (in4_pton(string, str_len, addr, '\0', NULL)) - return; - } - - sap->sa_family = AF_UNSPEC; - *addr_len = 0; -} - -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static int nfs_parse_ipv6_scope_id(const char *string, const size_t str_len, - const char *delim, - struct sockaddr_in6 *sin6) -{ - char *p; - size_t len; - - if ((string + str_len) == delim) - return 1; - - if (*delim != IPV6_SCOPE_DELIMITER) - return 0; - - if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)) - return 0; - - len = (string + str_len) - delim - 1; - p = kstrndup(delim + 1, len, GFP_KERNEL); - if (p) { - unsigned long scope_id = 0; - struct net_device *dev; - - dev = dev_get_by_name(&init_net, p); - if (dev != NULL) { - scope_id = dev->ifindex; - dev_put(dev); - } else { - if (strict_strtoul(p, 10, &scope_id) == 0) { - kfree(p); - return 0; - } - } - - kfree(p); - - sin6->sin6_scope_id = scope_id; - dfprintk(MOUNT, "NFS: IPv6 scope ID = %lu\n", scope_id); - return 1; - } - - return 0; -} - -static void nfs_parse_ipv6_address(char *string, size_t str_len, - struct sockaddr *sap, size_t *addr_len) -{ - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; - u8 *addr = (u8 *)&sin6->sin6_addr.in6_u; - const char *delim; - - if (str_len <= INET6_ADDRSTRLEN) { - dfprintk(MOUNT, "NFS: parsing IPv6 address %*s\n", - (int)str_len, string); - - sin6->sin6_family = AF_INET6; - *addr_len = sizeof(*sin6); - if (in6_pton(string, str_len, addr, - IPV6_SCOPE_DELIMITER, &delim) != 0) { - if (nfs_parse_ipv6_scope_id(string, str_len, - delim, sin6) != 0) - return; - } - } - - sap->sa_family = AF_UNSPEC; - *addr_len = 0; -} -#else -static void nfs_parse_ipv6_address(char *string, size_t str_len, - struct sockaddr *sap, size_t *addr_len) -{ - sap->sa_family = AF_UNSPEC; - *addr_len = 0; -} -#endif - /* - * Construct a sockaddr based on the contents of a string that contains - * an IP address in presentation format. - * - * If there is a problem constructing the new sockaddr, set the address - * family to AF_UNSPEC. + * Select between a default port value and a user-specified port value. + * If a zero value is set, then autobind will be used. */ -void nfs_parse_ip_address(char *string, size_t str_len, - struct sockaddr *sap, size_t *addr_len) +static void nfs_set_default_port(struct sockaddr *sap, const int parsed_port, + const unsigned short default_port) { - unsigned int i, colons; + unsigned short port = default_port; - colons = 0; - for (i = 0; i < str_len; i++) - if (string[i] == ':') - colons++; + if (parsed_port != NFS_UNSPEC_PORT) + port = parsed_port; - if (colons >= 2) - nfs_parse_ipv6_address(string, str_len, sap, addr_len); - else - nfs_parse_ipv4_address(string, str_len, sap, addr_len); + rpc_set_port(sap, port); } /* @@ -904,8 +805,6 @@ static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt) /* * Parse the value of the 'sec=' option. - * - * The flavor_len setting is for v4 mounts. */ static int nfs_parse_security_flavors(char *value, struct nfs_parsed_mount_data *mnt) @@ -916,53 +815,43 @@ static int nfs_parse_security_flavors(char *value, switch (match_token(value, nfs_secflavor_tokens, args)) { case Opt_sec_none: - mnt->auth_flavor_len = 0; mnt->auth_flavors[0] = RPC_AUTH_NULL; break; case Opt_sec_sys: - mnt->auth_flavor_len = 0; mnt->auth_flavors[0] = RPC_AUTH_UNIX; break; case Opt_sec_krb5: - mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5; break; case Opt_sec_krb5i: - mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I; break; case Opt_sec_krb5p: - mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P; break; case Opt_sec_lkey: - mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY; break; case Opt_sec_lkeyi: - mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI; break; case Opt_sec_lkeyp: - mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP; break; case Opt_sec_spkm: - mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM; break; case Opt_sec_spkmi: - mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI; break; case Opt_sec_spkmp: - mnt->auth_flavor_len = 1; mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP; break; default: return 0; } + mnt->auth_flavor_len = 1; return 1; } @@ -1001,7 +890,6 @@ static int nfs_parse_mount_options(char *raw, while ((p = strsep(&raw, ",")) != NULL) { substring_t args[MAX_OPT_ARGS]; unsigned long option; - int int_option; int token; if (!*p) @@ -1047,10 +935,18 @@ static int nfs_parse_mount_options(char *raw, break; case Opt_v2: mnt->flags &= ~NFS_MOUNT_VER3; + mnt->version = 2; break; case Opt_v3: mnt->flags |= NFS_MOUNT_VER3; + mnt->version = 3; break; +#ifdef CONFIG_NFS_V4 + case Opt_v4: + mnt->flags &= ~NFS_MOUNT_VER3; + mnt->version = 4; + break; +#endif case Opt_udp: mnt->flags &= ~NFS_MOUNT_TCP; mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; @@ -1264,20 +1160,33 @@ static int nfs_parse_mount_options(char *raw, switch (option) { case NFS2_VERSION: mnt->flags &= ~NFS_MOUNT_VER3; + mnt->version = 2; break; case NFS3_VERSION: mnt->flags |= NFS_MOUNT_VER3; + mnt->version = 3; break; +#ifdef CONFIG_NFS_V4 + case NFS4_VERSION: + mnt->flags &= ~NFS_MOUNT_VER3; + mnt->version = 4; + break; +#endif default: goto out_invalid_value; } break; case Opt_minorversion: - if (match_int(args, &int_option)) - return 0; - if (int_option < 0 || int_option > NFS4_MAX_MINOR_VERSION) - return 0; - mnt->minorversion = int_option; + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + rc = strict_strtoul(string, 10, &option); + kfree(string); + if (rc != 0) + goto out_invalid_value; + if (option > NFS4_MAX_MINOR_VERSION) + goto out_invalid_value; + mnt->minorversion = option; break; /* @@ -1352,11 +1261,14 @@ static int nfs_parse_mount_options(char *raw, string = match_strdup(args); if (string == NULL) goto out_nomem; - nfs_parse_ip_address(string, strlen(string), - (struct sockaddr *) - &mnt->nfs_server.address, - &mnt->nfs_server.addrlen); + mnt->nfs_server.addrlen = + rpc_pton(string, strlen(string), + (struct sockaddr *) + &mnt->nfs_server.address, + sizeof(mnt->nfs_server.address)); kfree(string); + if (mnt->nfs_server.addrlen == 0) + goto out_invalid_address; break; case Opt_clientaddr: string = match_strdup(args); @@ -1376,11 +1288,14 @@ static int nfs_parse_mount_options(char *raw, string = match_strdup(args); if (string == NULL) goto out_nomem; - nfs_parse_ip_address(string, strlen(string), - (struct sockaddr *) - &mnt->mount_server.address, - &mnt->mount_server.addrlen); + mnt->mount_server.addrlen = + rpc_pton(string, strlen(string), + (struct sockaddr *) + &mnt->mount_server.address, + sizeof(mnt->mount_server.address)); kfree(string); + if (mnt->mount_server.addrlen == 0) + goto out_invalid_address; break; case Opt_lookupcache: string = match_strdup(args); @@ -1432,8 +1347,11 @@ static int nfs_parse_mount_options(char *raw, return 1; +out_invalid_address: + printk(KERN_INFO "NFS: bad IP address specified: %s\n", p); + return 0; out_invalid_value: - printk(KERN_INFO "NFS: bad mount option value specified: %s \n", p); + printk(KERN_INFO "NFS: bad mount option value specified: %s\n", p); return 0; out_nomem: printk(KERN_INFO "NFS: not enough memory to parse option\n"); @@ -1445,13 +1363,60 @@ out_security_failure: } /* + * Match the requested auth flavors with the list returned by + * the server. Returns zero and sets the mount's authentication + * flavor on success; returns -EACCES if server does not support + * the requested flavor. + */ +static int nfs_walk_authlist(struct nfs_parsed_mount_data *args, + struct nfs_mount_request *request) +{ + unsigned int i, j, server_authlist_len = *(request->auth_flav_len); + + /* + * Certain releases of Linux's mountd return an empty + * flavor list. To prevent behavioral regression with + * these servers (ie. rejecting mounts that used to + * succeed), revert to pre-2.6.32 behavior (no checking) + * if the returned flavor list is empty. + */ + if (server_authlist_len == 0) + return 0; + + /* + * We avoid sophisticated negotiating here, as there are + * plenty of cases where we can get it wrong, providing + * either too little or too much security. + * + * RFC 2623, section 2.7 suggests we SHOULD prefer the + * flavor listed first. However, some servers list + * AUTH_NULL first. Our caller plants AUTH_SYS, the + * preferred default, in args->auth_flavors[0] if user + * didn't specify sec= mount option. + */ + for (i = 0; i < args->auth_flavor_len; i++) + for (j = 0; j < server_authlist_len; j++) + if (args->auth_flavors[i] == request->auth_flavs[j]) { + dfprintk(MOUNT, "NFS: using auth flavor %d\n", + request->auth_flavs[j]); + args->auth_flavors[0] = request->auth_flavs[j]; + return 0; + } + + dfprintk(MOUNT, "NFS: server does not support requested auth flavor\n"); + nfs_umount(request); + return -EACCES; +} + +/* * Use the remote server's MOUNT service to request the NFS file handle * corresponding to the provided path. */ static int nfs_try_mount(struct nfs_parsed_mount_data *args, struct nfs_fh *root_fh) { - unsigned int auth_flavor_len = 0; + rpc_authflavor_t server_authlist[NFS_MAX_SECFLAVORS]; + unsigned int server_authlist_len = ARRAY_SIZE(server_authlist); struct nfs_mount_request request = { .sap = (struct sockaddr *) &args->mount_server.address, @@ -1459,7 +1424,8 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args, .protocol = args->mount_server.protocol, .fh = root_fh, .noresvport = args->flags & NFS_MOUNT_NORESVPORT, - .auth_flav_len = &auth_flavor_len, + .auth_flav_len = &server_authlist_len, + .auth_flavs = server_authlist, }; int status; @@ -1485,23 +1451,25 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args, args->mount_server.addrlen = args->nfs_server.addrlen; } request.salen = args->mount_server.addrlen; - - /* - * autobind will be used if mount_server.port == 0 - */ - nfs_set_port(request.sap, args->mount_server.port); + nfs_set_default_port(request.sap, args->mount_server.port, 0); /* * Now ask the mount server to map our export path * to a file handle. */ status = nfs_mount(&request); - if (status == 0) - return 0; + if (status != 0) { + dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n", + request.hostname, status); + return status; + } - dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n", - request.hostname, status); - return status; + /* + * MNTv1 (NFSv2) does not support auth flavor negotiation. + */ + if (args->mount_server.version != NFS_MNT3_VERSION) + return 0; + return nfs_walk_authlist(args, &request); } static int nfs_parse_simple_hostname(const char *dev_name, @@ -1661,6 +1629,7 @@ static int nfs_validate_mount_data(void *options, const char *dev_name) { struct nfs_mount_data *data = (struct nfs_mount_data *)options; + struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; if (data == NULL) goto out_no_data; @@ -1672,10 +1641,12 @@ static int nfs_validate_mount_data(void *options, args->acregmax = NFS_DEF_ACREGMAX; args->acdirmin = NFS_DEF_ACDIRMIN; args->acdirmax = NFS_DEF_ACDIRMAX; - args->mount_server.port = 0; /* autobind unless user sets port */ - args->nfs_server.port = 0; /* autobind unless user sets port */ + args->mount_server.port = NFS_UNSPEC_PORT; + args->nfs_server.port = NFS_UNSPEC_PORT; args->nfs_server.protocol = XPRT_TRANSPORT_TCP; args->auth_flavors[0] = RPC_AUTH_UNIX; + args->auth_flavor_len = 1; + args->minorversion = 0; switch (data->version) { case 1: @@ -1697,8 +1668,11 @@ static int nfs_validate_mount_data(void *options, if (data->root.size > NFS3_FHSIZE || data->root.size == 0) goto out_invalid_fh; mntfh->size = data->root.size; - } else + args->version = 3; + } else { mntfh->size = NFS2_FHSIZE; + args->version = 2; + } memcpy(mntfh->data, data->root.data, mntfh->size); @@ -1720,11 +1694,9 @@ static int nfs_validate_mount_data(void *options, args->acdirmin = data->acdirmin; args->acdirmax = data->acdirmax; - memcpy(&args->nfs_server.address, &data->addr, - sizeof(data->addr)); + memcpy(sap, &data->addr, sizeof(data->addr)); args->nfs_server.addrlen = sizeof(data->addr); - if (!nfs_verify_server_address((struct sockaddr *) - &args->nfs_server.address)) + if (!nfs_verify_server_address(sap)) goto out_no_address; if (!(data->flags & NFS_MOUNT_TCP)) @@ -1772,12 +1744,18 @@ static int nfs_validate_mount_data(void *options, if (nfs_parse_mount_options((char *)options, args) == 0) return -EINVAL; - if (!nfs_verify_server_address((struct sockaddr *) - &args->nfs_server.address)) + if (!nfs_verify_server_address(sap)) goto out_no_address; - nfs_set_port((struct sockaddr *)&args->nfs_server.address, - args->nfs_server.port); + if (args->version == 4) +#ifdef CONFIG_NFS_V4 + return nfs4_validate_text_mount_data(options, + args, dev_name); +#else + goto out_v4_not_compiled; +#endif + + nfs_set_default_port(sap, args->nfs_server.port, 0); nfs_set_mount_transport_protocol(args); @@ -1825,6 +1803,12 @@ out_v3_not_compiled: return -EPROTONOSUPPORT; #endif /* !CONFIG_NFS_V3 */ +#ifndef CONFIG_NFS_V4 +out_v4_not_compiled: + dfprintk(MOUNT, "NFS: NFSv4 is not compiled into kernel\n"); + return -EPROTONOSUPPORT; +#endif /* !CONFIG_NFS_V4 */ + out_nomem: dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n"); return -ENOMEM; @@ -2120,6 +2104,14 @@ static int nfs_get_sb(struct file_system_type *fs_type, if (error < 0) goto out; +#ifdef CONFIG_NFS_V4 + if (data->version == 4) { + error = nfs4_try_mount(flags, dev_name, data, mnt); + kfree(data->client_address); + goto out; + } +#endif /* CONFIG_NFS_V4 */ + /* Get a volume representation */ server = nfs_create_server(data, mntfh); if (IS_ERR(server)) { @@ -2317,6 +2309,43 @@ static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args) args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3); } +static int nfs4_validate_text_mount_data(void *options, + struct nfs_parsed_mount_data *args, + const char *dev_name) +{ + struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; + + nfs_set_default_port(sap, args->nfs_server.port, NFS_PORT); + + nfs_validate_transport_protocol(args); + + nfs4_validate_mount_flags(args); + + if (args->version != 4) { + dfprintk(MOUNT, + "NFS4: Illegal mount version\n"); + return -EINVAL; + } + + if (args->auth_flavor_len > 1) { + dfprintk(MOUNT, + "NFS4: Too many RPC auth flavours specified\n"); + return -EINVAL; + } + + if (args->client_address == NULL) { + dfprintk(MOUNT, + "NFS4: mount program didn't pass callback address\n"); + return -EINVAL; + } + + return nfs_parse_devname(dev_name, + &args->nfs_server.hostname, + NFS4_MAXNAMLEN, + &args->nfs_server.export_path, + NFS4_MAXPATHLEN); +} + /* * Validate NFSv4 mount options */ @@ -2324,7 +2353,7 @@ static int nfs4_validate_mount_data(void *options, struct nfs_parsed_mount_data *args, const char *dev_name) { - struct sockaddr_in *ap; + struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; struct nfs4_mount_data *data = (struct nfs4_mount_data *)options; char *c; @@ -2337,23 +2366,22 @@ static int nfs4_validate_mount_data(void *options, args->acregmax = NFS_DEF_ACREGMAX; args->acdirmin = NFS_DEF_ACDIRMIN; args->acdirmax = NFS_DEF_ACDIRMAX; - args->nfs_server.port = NFS_PORT; /* 2049 unless user set port= */ + args->nfs_server.port = NFS_UNSPEC_PORT; args->auth_flavors[0] = RPC_AUTH_UNIX; - args->auth_flavor_len = 0; + args->auth_flavor_len = 1; + args->version = 4; args->minorversion = 0; switch (data->version) { case 1: - ap = (struct sockaddr_in *)&args->nfs_server.address; if (data->host_addrlen > sizeof(args->nfs_server.address)) goto out_no_address; if (data->host_addrlen == 0) goto out_no_address; args->nfs_server.addrlen = data->host_addrlen; - if (copy_from_user(ap, data->host_addr, data->host_addrlen)) + if (copy_from_user(sap, data->host_addr, data->host_addrlen)) return -EFAULT; - if (!nfs_verify_server_address((struct sockaddr *) - &args->nfs_server.address)) + if (!nfs_verify_server_address(sap)) goto out_no_address; if (data->auth_flavourlen) { @@ -2399,39 +2427,14 @@ static int nfs4_validate_mount_data(void *options, nfs_validate_transport_protocol(args); break; - default: { - int status; - + default: if (nfs_parse_mount_options((char *)options, args) == 0) return -EINVAL; - if (!nfs_verify_server_address((struct sockaddr *) - &args->nfs_server.address)) + if (!nfs_verify_server_address(sap)) return -EINVAL; - nfs_set_port((struct sockaddr *)&args->nfs_server.address, - args->nfs_server.port); - - nfs_validate_transport_protocol(args); - - nfs4_validate_mount_flags(args); - - if (args->auth_flavor_len > 1) - goto out_inval_auth; - - if (args->client_address == NULL) - goto out_no_client_address; - - status = nfs_parse_devname(dev_name, - &args->nfs_server.hostname, - NFS4_MAXNAMLEN, - &args->nfs_server.export_path, - NFS4_MAXPATHLEN); - if (status < 0) - return status; - - break; - } + return nfs4_validate_text_mount_data(options, args, dev_name); } return 0; @@ -2448,10 +2451,6 @@ out_inval_auth: out_no_address: dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n"); return -EINVAL; - -out_no_client_address: - dfprintk(MOUNT, "NFS4: mount program didn't pass callback address\n"); - return -EINVAL; } /* @@ -2618,6 +2617,34 @@ out_err: return ret; } +static int nfs4_try_mount(int flags, const char *dev_name, + struct nfs_parsed_mount_data *data, + struct vfsmount *mnt) +{ + char *export_path; + struct vfsmount *root_mnt; + int error; + + dfprintk(MOUNT, "--> nfs4_try_mount()\n"); + + export_path = data->nfs_server.export_path; + data->nfs_server.export_path = "/"; + root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, data, + data->nfs_server.hostname); + data->nfs_server.export_path = export_path; + + error = PTR_ERR(root_mnt); + if (IS_ERR(root_mnt)) + goto out; + + error = nfs_follow_remote_path(root_mnt, export_path, mnt); + +out: + dfprintk(MOUNT, "<-- nfs4_try_mount() = %d%s\n", error, + error != 0 ? " [error]" : ""); + return error; +} + /* * Get the superblock for an NFS4 mountpoint */ @@ -2625,8 +2652,6 @@ static int nfs4_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) { struct nfs_parsed_mount_data *data; - char *export_path; - struct vfsmount *root_mnt; int error = -ENOMEM; data = kzalloc(sizeof(*data), GFP_KERNEL); @@ -2638,17 +2663,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type, if (error < 0) goto out; - export_path = data->nfs_server.export_path; - data->nfs_server.export_path = "/"; - root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, data, - data->nfs_server.hostname); - data->nfs_server.export_path = export_path; - - error = PTR_ERR(root_mnt); - if (IS_ERR(root_mnt)) - goto out; - - error = nfs_follow_remote_path(root_mnt, export_path, mnt); + error = nfs4_try_mount(flags, dev_name, data, mnt); out: kfree(data->client_address); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 0a0a2ff767c3..120acadc6a84 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -13,6 +13,7 @@ #include <linux/file.h> #include <linux/writeback.h> #include <linux/swap.h> +#include <linux/migrate.h> #include <linux/sunrpc/clnt.h> #include <linux/nfs_fs.h> @@ -26,6 +27,7 @@ #include "internal.h" #include "iostat.h" #include "nfs4_fs.h" +#include "fscache.h" #define NFSDBG_FACILITY NFSDBG_PAGECACHE @@ -87,17 +89,15 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) return p; } -static void nfs_writedata_free(struct nfs_write_data *p) +void nfs_writedata_free(struct nfs_write_data *p) { if (p && (p->pagevec != &p->page_array[0])) kfree(p->pagevec); mempool_free(p, nfs_wdata_mempool); } -void nfs_writedata_release(void *data) +static void nfs_writedata_release(struct nfs_write_data *wdata) { - struct nfs_write_data *wdata = data; - put_nfs_open_context(wdata->args.context); nfs_writedata_free(wdata); } @@ -220,24 +220,17 @@ static void nfs_end_page_writeback(struct page *page) clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); } -/* - * Find an associated nfs write request, and prepare to flush it out - * May return an error if the user signalled nfs_wait_on_request(). - */ -static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, - struct page *page) +static struct nfs_page *nfs_find_and_lock_request(struct page *page) { struct inode *inode = page->mapping->host; struct nfs_page *req; int ret; spin_lock(&inode->i_lock); - for(;;) { + for (;;) { req = nfs_page_find_request_locked(page); - if (req == NULL) { - spin_unlock(&inode->i_lock); - return 0; - } + if (req == NULL) + break; if (nfs_set_page_tag_locked(req)) break; /* Note: If we hold the page lock, as is the case in nfs_writepage, @@ -249,23 +242,40 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, ret = nfs_wait_on_request(req); nfs_release_request(req); if (ret != 0) - return ret; + return ERR_PTR(ret); spin_lock(&inode->i_lock); } - if (test_bit(PG_CLEAN, &req->wb_flags)) { - spin_unlock(&inode->i_lock); - BUG(); - } - if (nfs_set_page_writeback(page) != 0) { - spin_unlock(&inode->i_lock); - BUG(); - } spin_unlock(&inode->i_lock); + return req; +} + +/* + * Find an associated nfs write request, and prepare to flush it out + * May return an error if the user signalled nfs_wait_on_request(). + */ +static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, + struct page *page) +{ + struct nfs_page *req; + int ret = 0; + + req = nfs_find_and_lock_request(page); + if (!req) + goto out; + ret = PTR_ERR(req); + if (IS_ERR(req)) + goto out; + + ret = nfs_set_page_writeback(page); + BUG_ON(ret != 0); + BUG_ON(test_bit(PG_CLEAN, &req->wb_flags)); + if (!nfs_pageio_add_request(pgio, req)) { nfs_redirty_request(req); - return pgio->pg_error; + ret = pgio->pg_error; } - return 0; +out: + return ret; } static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) @@ -1582,6 +1592,41 @@ int nfs_wb_page(struct inode *inode, struct page* page) return nfs_wb_page_priority(inode, page, FLUSH_STABLE); } +#ifdef CONFIG_MIGRATION +int nfs_migrate_page(struct address_space *mapping, struct page *newpage, + struct page *page) +{ + struct nfs_page *req; + int ret; + + if (PageFsCache(page)) + nfs_fscache_release_page(page, GFP_KERNEL); + + req = nfs_find_and_lock_request(page); + ret = PTR_ERR(req); + if (IS_ERR(req)) + goto out; + + ret = migrate_page(mapping, newpage, page); + if (!req) + goto out; + if (ret) + goto out_unlock; + page_cache_get(newpage); + req->wb_page = newpage; + SetPagePrivate(newpage); + set_page_private(newpage, page_private(page)); + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); +out_unlock: + nfs_clear_page_tag_locked(req); + nfs_release_request(req); +out: + return ret; +} +#endif + int __init nfs_init_writepagecache(void) { nfs_wdata_cachep = kmem_cache_create("nfs_write_data", diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 5573508f707f..36fcabbf5186 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -34,6 +34,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) int flags = nfsexp_flags(rqstp, exp); int ret; + validate_process_creds(); + /* discard any old override before preparing the new set */ revert_creds(get_cred(current->real_cred)); new = prepare_creds(); @@ -86,8 +88,10 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) else new->cap_effective = cap_raise_nfsd_set(new->cap_effective, new->cap_permitted); + validate_process_creds(); put_cred(override_creds(new)); put_cred(new); + validate_process_creds(); return 0; oom: diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index b92a27629fb7..d9462643155c 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -85,6 +85,11 @@ static void expkey_request(struct cache_detail *cd, (*bpp)[-1] = '\n'; } +static int expkey_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall(cd, h, expkey_request); +} + static struct svc_expkey *svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old); static struct svc_expkey *svc_expkey_lookup(struct svc_expkey *); static struct cache_detail svc_expkey_cache; @@ -259,7 +264,7 @@ static struct cache_detail svc_expkey_cache = { .hash_table = expkey_table, .name = "nfsd.fh", .cache_put = expkey_put, - .cache_request = expkey_request, + .cache_upcall = expkey_upcall, .cache_parse = expkey_parse, .cache_show = expkey_show, .match = expkey_match, @@ -355,6 +360,11 @@ static void svc_export_request(struct cache_detail *cd, (*bpp)[-1] = '\n'; } +static int svc_export_upcall(struct cache_detail *cd, struct cache_head *h) +{ + return sunrpc_cache_pipe_upcall(cd, h, svc_export_request); +} + static struct svc_export *svc_export_update(struct svc_export *new, struct svc_export *old); static struct svc_export *svc_export_lookup(struct svc_export *); @@ -724,7 +734,7 @@ struct cache_detail svc_export_cache = { .hash_table = export_table, .name = "nfsd.export", .cache_put = svc_export_put, - .cache_request = svc_export_request, + .cache_upcall = svc_export_upcall, .cache_parse = svc_export_parse, .cache_show = svc_export_show, .match = svc_export_match, diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index 5b398421b051..cdfa86fa1471 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -146,6 +146,12 @@ idtoname_request(struct cache_detail *cd, struct cache_head *ch, char **bpp, } static int +idtoname_upcall(struct cache_detail *cd, struct cache_head *ch) +{ + return sunrpc_cache_pipe_upcall(cd, ch, idtoname_request); +} + +static int idtoname_match(struct cache_head *ca, struct cache_head *cb) { struct ent *a = container_of(ca, struct ent, h); @@ -175,10 +181,10 @@ idtoname_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h) } static void -warn_no_idmapd(struct cache_detail *detail) +warn_no_idmapd(struct cache_detail *detail, int has_died) { printk("nfsd: nfsv4 idmapping failing: has idmapd %s?\n", - detail->last_close? "died" : "not been started"); + has_died ? "died" : "not been started"); } @@ -192,7 +198,7 @@ static struct cache_detail idtoname_cache = { .hash_table = idtoname_table, .name = "nfs4.idtoname", .cache_put = ent_put, - .cache_request = idtoname_request, + .cache_upcall = idtoname_upcall, .cache_parse = idtoname_parse, .cache_show = idtoname_show, .warn_no_listener = warn_no_idmapd, @@ -325,6 +331,12 @@ nametoid_request(struct cache_detail *cd, struct cache_head *ch, char **bpp, } static int +nametoid_upcall(struct cache_detail *cd, struct cache_head *ch) +{ + return sunrpc_cache_pipe_upcall(cd, ch, nametoid_request); +} + +static int nametoid_match(struct cache_head *ca, struct cache_head *cb) { struct ent *a = container_of(ca, struct ent, h); @@ -363,7 +375,7 @@ static struct cache_detail nametoid_cache = { .hash_table = nametoid_table, .name = "nfs4.nametoid", .cache_put = ent_put, - .cache_request = nametoid_request, + .cache_upcall = nametoid_upcall, .cache_parse = nametoid_parse, .cache_show = nametoid_show, .warn_no_listener = warn_no_idmapd, diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 6d0847562d87..7e906c5b7671 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -37,6 +37,7 @@ #include <linux/nfsd/xdr.h> #include <linux/nfsd/syscall.h> #include <linux/lockd/lockd.h> +#include <linux/sunrpc/clnt.h> #include <asm/uaccess.h> #include <net/ipv6.h> @@ -490,22 +491,18 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size) * * Input: * buf: '\n'-terminated C string containing a - * presentation format IPv4 address + * presentation format IP address * size: length of C string in @buf * Output: * On success: returns zero if all specified locks were released; * returns one if one or more locks were not released * On error: return code is negative errno value - * - * Note: Only AF_INET client addresses are passed in */ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size) { - struct sockaddr_in sin = { - .sin_family = AF_INET, - }; - int b1, b2, b3, b4; - char c; + struct sockaddr_storage address; + struct sockaddr *sap = (struct sockaddr *)&address; + size_t salen = sizeof(address); char *fo_path; /* sanity check */ @@ -519,14 +516,10 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size) if (qword_get(&buf, fo_path, size) < 0) return -EINVAL; - /* get ipv4 address */ - if (sscanf(fo_path, "%u.%u.%u.%u%c", &b1, &b2, &b3, &b4, &c) != 4) - return -EINVAL; - if (b1 > 255 || b2 > 255 || b3 > 255 || b4 > 255) + if (rpc_pton(fo_path, size, sap, salen) == 0) return -EINVAL; - sin.sin_addr.s_addr = htonl((b1 << 24) | (b2 << 16) | (b3 << 8) | b4); - return nlmsvc_unlock_all_by_ip((struct sockaddr *)&sin); + return nlmsvc_unlock_all_by_ip(sap); } /** diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 492c79b7800b..24d58adfe5fd 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -496,7 +496,9 @@ nfsd(void *vrqstp) /* Lock the export hash tables for reading. */ exp_readlock(); + validate_process_creds(); svc_process(rqstp); + validate_process_creds(); /* Unlock export hash tables */ exp_readunlock(); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 23341c1063bc..8fa09bfbcba7 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -684,6 +684,8 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, __be32 err; int host_err; + validate_process_creds(); + /* * If we get here, then the client has already done an "open", * and (hopefully) checked permission - so allow OWNER_OVERRIDE @@ -740,6 +742,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, out_nfserr: err = nfserrno(host_err); out: + validate_process_creds(); return err; } diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig index 72da095d4009..251da07b2a1d 100644 --- a/fs/nilfs2/Kconfig +++ b/fs/nilfs2/Kconfig @@ -1,6 +1,6 @@ config NILFS2_FS tristate "NILFS2 file system support (EXPERIMENTAL)" - depends on BLOCK && EXPERIMENTAL + depends on EXPERIMENTAL select CRC32 help NILFS2 is a log-structured file system (LFS) supporting continuous diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index 99d58a028b94..08834df6ec68 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -36,6 +36,26 @@ struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap) return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode)); } +/** + * nilfs_bmap_lookup_at_level - find a data block or node block + * @bmap: bmap + * @key: key + * @level: level + * @ptrp: place to store the value associated to @key + * + * Description: nilfs_bmap_lookup_at_level() finds a record whose key + * matches @key in the block at @level of the bmap. + * + * Return Value: On success, 0 is returned and the record associated with @key + * is stored in the place pointed by @ptrp. On error, one of the following + * negative error codes is returned. + * + * %-EIO - I/O error. + * + * %-ENOMEM - Insufficient amount of memory available. + * + * %-ENOENT - A record associated with @key does not exist. + */ int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level, __u64 *ptrp) { @@ -69,39 +89,6 @@ int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp, return ret; } -/** - * nilfs_bmap_lookup - find a record - * @bmap: bmap - * @key: key - * @recp: pointer to record - * - * Description: nilfs_bmap_lookup() finds a record whose key matches @key in - * @bmap. - * - * Return Value: On success, 0 is returned and the record associated with @key - * is stored in the place pointed by @recp. On error, one of the following - * negative error codes is returned. - * - * %-EIO - I/O error. - * - * %-ENOMEM - Insufficient amount of memory available. - * - * %-ENOENT - A record associated with @key does not exist. - */ -int nilfs_bmap_lookup(struct nilfs_bmap *bmap, - unsigned long key, - unsigned long *recp) -{ - __u64 ptr; - int ret; - - /* XXX: use macro for level 1 */ - ret = nilfs_bmap_lookup_at_level(bmap, key, 1, &ptr); - if (recp != NULL) - *recp = ptr; - return ret; -} - static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) { __u64 keys[NILFS_BMAP_SMALL_HIGH + 1]; @@ -469,104 +456,6 @@ __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap) (entries_per_group / NILFS_BMAP_GROUP_DIV); } -int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) -{ - return nilfs_dat_prepare_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req); -} - -void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) -{ - nilfs_dat_commit_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req); -} - -void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) -{ - nilfs_dat_abort_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req); -} - -int nilfs_bmap_start_v(struct nilfs_bmap *bmap, union nilfs_bmap_ptr_req *req, - sector_t blocknr) -{ - struct inode *dat = nilfs_bmap_get_dat(bmap); - int ret; - - ret = nilfs_dat_prepare_start(dat, &req->bpr_req); - if (likely(!ret)) - nilfs_dat_commit_start(dat, &req->bpr_req, blocknr); - return ret; -} - -int nilfs_bmap_prepare_end_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) -{ - return nilfs_dat_prepare_end(nilfs_bmap_get_dat(bmap), &req->bpr_req); -} - -void nilfs_bmap_commit_end_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) -{ - nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, - bmap->b_ptr_type == NILFS_BMAP_PTR_VS); -} - -void nilfs_bmap_abort_end_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) -{ - nilfs_dat_abort_end(nilfs_bmap_get_dat(bmap), &req->bpr_req); -} - -int nilfs_bmap_move_v(const struct nilfs_bmap *bmap, __u64 vblocknr, - sector_t blocknr) -{ - return nilfs_dat_move(nilfs_bmap_get_dat(bmap), vblocknr, blocknr); -} - -int nilfs_bmap_mark_dirty(const struct nilfs_bmap *bmap, __u64 vblocknr) -{ - return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), vblocknr); -} - -int nilfs_bmap_prepare_update_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *oldreq, - union nilfs_bmap_ptr_req *newreq) -{ - struct inode *dat = nilfs_bmap_get_dat(bmap); - int ret; - - ret = nilfs_dat_prepare_end(dat, &oldreq->bpr_req); - if (ret < 0) - return ret; - ret = nilfs_dat_prepare_alloc(dat, &newreq->bpr_req); - if (ret < 0) - nilfs_dat_abort_end(dat, &oldreq->bpr_req); - - return ret; -} - -void nilfs_bmap_commit_update_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *oldreq, - union nilfs_bmap_ptr_req *newreq) -{ - struct inode *dat = nilfs_bmap_get_dat(bmap); - - nilfs_dat_commit_end(dat, &oldreq->bpr_req, - bmap->b_ptr_type == NILFS_BMAP_PTR_VS); - nilfs_dat_commit_alloc(dat, &newreq->bpr_req); -} - -void nilfs_bmap_abort_update_v(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *oldreq, - union nilfs_bmap_ptr_req *newreq) -{ - struct inode *dat = nilfs_bmap_get_dat(bmap); - - nilfs_dat_abort_end(dat, &oldreq->bpr_req); - nilfs_dat_abort_alloc(dat, &newreq->bpr_req); -} - static struct lock_class_key nilfs_bmap_dat_lock_key; static struct lock_class_key nilfs_bmap_mdt_lock_key; diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h index b2890cdcef12..9980d7dbab91 100644 --- a/fs/nilfs2/bmap.h +++ b/fs/nilfs2/bmap.h @@ -28,6 +28,7 @@ #include <linux/buffer_head.h> #include <linux/nilfs2_fs.h> #include "alloc.h" +#include "dat.h" #define NILFS_BMAP_INVALID_PTR 0 @@ -141,7 +142,6 @@ struct nilfs_bmap { int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *); int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *); void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *); -int nilfs_bmap_lookup(struct nilfs_bmap *, unsigned long, unsigned long *); int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned); int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long); int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long); @@ -160,90 +160,76 @@ void nilfs_bmap_init_gcdat(struct nilfs_bmap *, struct nilfs_bmap *); void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *); +static inline int nilfs_bmap_lookup(struct nilfs_bmap *bmap, __u64 key, + __u64 *ptr) +{ + return nilfs_bmap_lookup_at_level(bmap, key, 1, ptr); +} + /* * Internal use only */ struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *); -int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *, - union nilfs_bmap_ptr_req *); -void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *, - union nilfs_bmap_ptr_req *); -void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *, - union nilfs_bmap_ptr_req *); static inline int nilfs_bmap_prepare_alloc_ptr(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) + union nilfs_bmap_ptr_req *req, + struct inode *dat) { - if (NILFS_BMAP_USE_VBN(bmap)) - return nilfs_bmap_prepare_alloc_v(bmap, req); + if (dat) + return nilfs_dat_prepare_alloc(dat, &req->bpr_req); /* ignore target ptr */ req->bpr_ptr = bmap->b_last_allocated_ptr++; return 0; } static inline void nilfs_bmap_commit_alloc_ptr(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) + union nilfs_bmap_ptr_req *req, + struct inode *dat) { - if (NILFS_BMAP_USE_VBN(bmap)) - nilfs_bmap_commit_alloc_v(bmap, req); + if (dat) + nilfs_dat_commit_alloc(dat, &req->bpr_req); } static inline void nilfs_bmap_abort_alloc_ptr(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) + union nilfs_bmap_ptr_req *req, + struct inode *dat) { - if (NILFS_BMAP_USE_VBN(bmap)) - nilfs_bmap_abort_alloc_v(bmap, req); + if (dat) + nilfs_dat_abort_alloc(dat, &req->bpr_req); else bmap->b_last_allocated_ptr--; } -int nilfs_bmap_prepare_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *); -void nilfs_bmap_commit_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *); -void nilfs_bmap_abort_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *); - static inline int nilfs_bmap_prepare_end_ptr(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) + union nilfs_bmap_ptr_req *req, + struct inode *dat) { - return NILFS_BMAP_USE_VBN(bmap) ? - nilfs_bmap_prepare_end_v(bmap, req) : 0; + return dat ? nilfs_dat_prepare_end(dat, &req->bpr_req) : 0; } static inline void nilfs_bmap_commit_end_ptr(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) + union nilfs_bmap_ptr_req *req, + struct inode *dat) { - if (NILFS_BMAP_USE_VBN(bmap)) - nilfs_bmap_commit_end_v(bmap, req); + if (dat) + nilfs_dat_commit_end(dat, &req->bpr_req, + bmap->b_ptr_type == NILFS_BMAP_PTR_VS); } static inline void nilfs_bmap_abort_end_ptr(struct nilfs_bmap *bmap, - union nilfs_bmap_ptr_req *req) + union nilfs_bmap_ptr_req *req, + struct inode *dat) { - if (NILFS_BMAP_USE_VBN(bmap)) - nilfs_bmap_abort_end_v(bmap, req); + if (dat) + nilfs_dat_abort_end(dat, &req->bpr_req); } -int nilfs_bmap_start_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *, - sector_t); -int nilfs_bmap_move_v(const struct nilfs_bmap *, __u64, sector_t); -int nilfs_bmap_mark_dirty(const struct nilfs_bmap *, __u64); - - __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *, const struct buffer_head *); __u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64); __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *); -int nilfs_bmap_prepare_update_v(struct nilfs_bmap *, - union nilfs_bmap_ptr_req *, - union nilfs_bmap_ptr_req *); -void nilfs_bmap_commit_update_v(struct nilfs_bmap *, - union nilfs_bmap_ptr_req *, - union nilfs_bmap_ptr_req *); -void nilfs_bmap_abort_update_v(struct nilfs_bmap *, - union nilfs_bmap_ptr_req *, - union nilfs_bmap_ptr_req *); - void nilfs_bmap_add_blocks(const struct nilfs_bmap *, int); void nilfs_bmap_sub_blocks(const struct nilfs_bmap *, int); diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 7e0b61be212e..c668bca579c1 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -209,6 +209,7 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc, * We cannot call radix_tree_preload for the kernels older * than 2.6.23, because it is not exported for modules. */ +retry: err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); if (err) goto failed_unlock; @@ -219,7 +220,6 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc, (unsigned long long)oldkey, (unsigned long long)newkey); -retry: spin_lock_irq(&btnc->tree_lock); err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page); spin_unlock_irq(&btnc->tree_lock); diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index aa412724b64e..e25b507a474f 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -71,21 +71,17 @@ void nilfs_btree_path_cache_destroy(void) kmem_cache_destroy(nilfs_btree_path_cache); } -static inline struct nilfs_btree_path * -nilfs_btree_alloc_path(const struct nilfs_btree *btree) +static inline struct nilfs_btree_path *nilfs_btree_alloc_path(void) { - return (struct nilfs_btree_path *) - kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS); + return kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS); } -static inline void nilfs_btree_free_path(const struct nilfs_btree *btree, - struct nilfs_btree_path *path) +static inline void nilfs_btree_free_path(struct nilfs_btree_path *path) { kmem_cache_free(nilfs_btree_path_cache, path); } -static void nilfs_btree_init_path(const struct nilfs_btree *btree, - struct nilfs_btree_path *path) +static void nilfs_btree_init_path(struct nilfs_btree_path *path) { int level; @@ -101,26 +97,13 @@ static void nilfs_btree_init_path(const struct nilfs_btree *btree, } } -static void nilfs_btree_clear_path(const struct nilfs_btree *btree, - struct nilfs_btree_path *path) +static void nilfs_btree_release_path(struct nilfs_btree_path *path) { int level; - for (level = NILFS_BTREE_LEVEL_DATA; - level < NILFS_BTREE_LEVEL_MAX; - level++) { - if (path[level].bp_bh != NULL) { - brelse(path[level].bp_bh); - path[level].bp_bh = NULL; - } - /* sib_bh is released or deleted by prepare or commit - * operations. */ - path[level].bp_sib_bh = NULL; - path[level].bp_index = 0; - path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; - path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; - path[level].bp_op = NULL; - } + for (level = NILFS_BTREE_LEVEL_DATA; level < NILFS_BTREE_LEVEL_MAX; + level++) + brelse(path[level].bp_bh); } /* @@ -148,129 +131,110 @@ static int nilfs_btree_get_new_block(const struct nilfs_btree *btree, } static inline int -nilfs_btree_node_get_flags(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node) +nilfs_btree_node_get_flags(const struct nilfs_btree_node *node) { return node->bn_flags; } static inline void -nilfs_btree_node_set_flags(struct nilfs_btree *btree, - struct nilfs_btree_node *node, - int flags) +nilfs_btree_node_set_flags(struct nilfs_btree_node *node, int flags) { node->bn_flags = flags; } -static inline int nilfs_btree_node_root(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node) +static inline int nilfs_btree_node_root(const struct nilfs_btree_node *node) { - return nilfs_btree_node_get_flags(btree, node) & NILFS_BTREE_NODE_ROOT; + return nilfs_btree_node_get_flags(node) & NILFS_BTREE_NODE_ROOT; } static inline int -nilfs_btree_node_get_level(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node) +nilfs_btree_node_get_level(const struct nilfs_btree_node *node) { return node->bn_level; } static inline void -nilfs_btree_node_set_level(struct nilfs_btree *btree, - struct nilfs_btree_node *node, - int level) +nilfs_btree_node_set_level(struct nilfs_btree_node *node, int level) { node->bn_level = level; } static inline int -nilfs_btree_node_get_nchildren(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node) +nilfs_btree_node_get_nchildren(const struct nilfs_btree_node *node) { return le16_to_cpu(node->bn_nchildren); } static inline void -nilfs_btree_node_set_nchildren(struct nilfs_btree *btree, - struct nilfs_btree_node *node, - int nchildren) +nilfs_btree_node_set_nchildren(struct nilfs_btree_node *node, int nchildren) { node->bn_nchildren = cpu_to_le16(nchildren); } -static inline int -nilfs_btree_node_size(const struct nilfs_btree *btree) +static inline int nilfs_btree_node_size(const struct nilfs_btree *btree) { return 1 << btree->bt_bmap.b_inode->i_blkbits; } static inline int -nilfs_btree_node_nchildren_min(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node) +nilfs_btree_node_nchildren_min(const struct nilfs_btree_node *node, + const struct nilfs_btree *btree) { - return nilfs_btree_node_root(btree, node) ? + return nilfs_btree_node_root(node) ? NILFS_BTREE_ROOT_NCHILDREN_MIN : NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree)); } static inline int -nilfs_btree_node_nchildren_max(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node) +nilfs_btree_node_nchildren_max(const struct nilfs_btree_node *node, + const struct nilfs_btree *btree) { - return nilfs_btree_node_root(btree, node) ? + return nilfs_btree_node_root(node) ? NILFS_BTREE_ROOT_NCHILDREN_MAX : NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(btree)); } static inline __le64 * -nilfs_btree_node_dkeys(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node) +nilfs_btree_node_dkeys(const struct nilfs_btree_node *node) { return (__le64 *)((char *)(node + 1) + - (nilfs_btree_node_root(btree, node) ? + (nilfs_btree_node_root(node) ? 0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE)); } static inline __le64 * -nilfs_btree_node_dptrs(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node) +nilfs_btree_node_dptrs(const struct nilfs_btree_node *node, + const struct nilfs_btree *btree) { - return (__le64 *)(nilfs_btree_node_dkeys(btree, node) + - nilfs_btree_node_nchildren_max(btree, node)); + return (__le64 *)(nilfs_btree_node_dkeys(node) + + nilfs_btree_node_nchildren_max(node, btree)); } static inline __u64 -nilfs_btree_node_get_key(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node, int index) +nilfs_btree_node_get_key(const struct nilfs_btree_node *node, int index) { - return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(btree, node) + - index)); + return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(node) + index)); } static inline void -nilfs_btree_node_set_key(struct nilfs_btree *btree, - struct nilfs_btree_node *node, int index, __u64 key) +nilfs_btree_node_set_key(struct nilfs_btree_node *node, int index, __u64 key) { - *(nilfs_btree_node_dkeys(btree, node) + index) = - nilfs_bmap_key_to_dkey(key); + *(nilfs_btree_node_dkeys(node) + index) = nilfs_bmap_key_to_dkey(key); } static inline __u64 nilfs_btree_node_get_ptr(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node, - int index) + const struct nilfs_btree_node *node, int index) { - return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(btree, node) + + return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(node, btree) + index)); } static inline void nilfs_btree_node_set_ptr(struct nilfs_btree *btree, - struct nilfs_btree_node *node, - int index, - __u64 ptr) + struct nilfs_btree_node *node, int index, __u64 ptr) { - *(nilfs_btree_node_dptrs(btree, node) + index) = + *(nilfs_btree_node_dptrs(node, btree) + index) = nilfs_bmap_ptr_to_dptr(ptr); } @@ -283,12 +247,12 @@ static void nilfs_btree_node_init(struct nilfs_btree *btree, __le64 *dptrs; int i; - nilfs_btree_node_set_flags(btree, node, flags); - nilfs_btree_node_set_level(btree, node, level); - nilfs_btree_node_set_nchildren(btree, node, nchildren); + nilfs_btree_node_set_flags(node, flags); + nilfs_btree_node_set_level(node, level); + nilfs_btree_node_set_nchildren(node, nchildren); - dkeys = nilfs_btree_node_dkeys(btree, node); - dptrs = nilfs_btree_node_dptrs(btree, node); + dkeys = nilfs_btree_node_dkeys(node); + dptrs = nilfs_btree_node_dptrs(node, btree); for (i = 0; i < nchildren; i++) { dkeys[i] = nilfs_bmap_key_to_dkey(keys[i]); dptrs[i] = nilfs_bmap_ptr_to_dptr(ptrs[i]); @@ -305,13 +269,13 @@ static void nilfs_btree_node_move_left(struct nilfs_btree *btree, __le64 *ldptrs, *rdptrs; int lnchildren, rnchildren; - ldkeys = nilfs_btree_node_dkeys(btree, left); - ldptrs = nilfs_btree_node_dptrs(btree, left); - lnchildren = nilfs_btree_node_get_nchildren(btree, left); + ldkeys = nilfs_btree_node_dkeys(left); + ldptrs = nilfs_btree_node_dptrs(left, btree); + lnchildren = nilfs_btree_node_get_nchildren(left); - rdkeys = nilfs_btree_node_dkeys(btree, right); - rdptrs = nilfs_btree_node_dptrs(btree, right); - rnchildren = nilfs_btree_node_get_nchildren(btree, right); + rdkeys = nilfs_btree_node_dkeys(right); + rdptrs = nilfs_btree_node_dptrs(right, btree); + rnchildren = nilfs_btree_node_get_nchildren(right); memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys)); memcpy(ldptrs + lnchildren, rdptrs, n * sizeof(*rdptrs)); @@ -320,8 +284,8 @@ static void nilfs_btree_node_move_left(struct nilfs_btree *btree, lnchildren += n; rnchildren -= n; - nilfs_btree_node_set_nchildren(btree, left, lnchildren); - nilfs_btree_node_set_nchildren(btree, right, rnchildren); + nilfs_btree_node_set_nchildren(left, lnchildren); + nilfs_btree_node_set_nchildren(right, rnchildren); } /* Assume that the buffer heads corresponding to left and right are locked. */ @@ -334,13 +298,13 @@ static void nilfs_btree_node_move_right(struct nilfs_btree *btree, __le64 *ldptrs, *rdptrs; int lnchildren, rnchildren; - ldkeys = nilfs_btree_node_dkeys(btree, left); - ldptrs = nilfs_btree_node_dptrs(btree, left); - lnchildren = nilfs_btree_node_get_nchildren(btree, left); + ldkeys = nilfs_btree_node_dkeys(left); + ldptrs = nilfs_btree_node_dptrs(left, btree); + lnchildren = nilfs_btree_node_get_nchildren(left); - rdkeys = nilfs_btree_node_dkeys(btree, right); - rdptrs = nilfs_btree_node_dptrs(btree, right); - rnchildren = nilfs_btree_node_get_nchildren(btree, right); + rdkeys = nilfs_btree_node_dkeys(right); + rdptrs = nilfs_btree_node_dptrs(right, btree); + rnchildren = nilfs_btree_node_get_nchildren(right); memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys)); memmove(rdptrs + n, rdptrs, rnchildren * sizeof(*rdptrs)); @@ -349,8 +313,8 @@ static void nilfs_btree_node_move_right(struct nilfs_btree *btree, lnchildren -= n; rnchildren += n; - nilfs_btree_node_set_nchildren(btree, left, lnchildren); - nilfs_btree_node_set_nchildren(btree, right, rnchildren); + nilfs_btree_node_set_nchildren(left, lnchildren); + nilfs_btree_node_set_nchildren(right, rnchildren); } /* Assume that the buffer head corresponding to node is locked. */ @@ -362,9 +326,9 @@ static void nilfs_btree_node_insert(struct nilfs_btree *btree, __le64 *dptrs; int nchildren; - dkeys = nilfs_btree_node_dkeys(btree, node); - dptrs = nilfs_btree_node_dptrs(btree, node); - nchildren = nilfs_btree_node_get_nchildren(btree, node); + dkeys = nilfs_btree_node_dkeys(node); + dptrs = nilfs_btree_node_dptrs(node, btree); + nchildren = nilfs_btree_node_get_nchildren(node); if (index < nchildren) { memmove(dkeys + index + 1, dkeys + index, (nchildren - index) * sizeof(*dkeys)); @@ -374,7 +338,7 @@ static void nilfs_btree_node_insert(struct nilfs_btree *btree, dkeys[index] = nilfs_bmap_key_to_dkey(key); dptrs[index] = nilfs_bmap_ptr_to_dptr(ptr); nchildren++; - nilfs_btree_node_set_nchildren(btree, node, nchildren); + nilfs_btree_node_set_nchildren(node, nchildren); } /* Assume that the buffer head corresponding to node is locked. */ @@ -388,11 +352,11 @@ static void nilfs_btree_node_delete(struct nilfs_btree *btree, __le64 *dptrs; int nchildren; - dkeys = nilfs_btree_node_dkeys(btree, node); - dptrs = nilfs_btree_node_dptrs(btree, node); + dkeys = nilfs_btree_node_dkeys(node); + dptrs = nilfs_btree_node_dptrs(node, btree); key = nilfs_bmap_dkey_to_key(dkeys[index]); ptr = nilfs_bmap_dptr_to_ptr(dptrs[index]); - nchildren = nilfs_btree_node_get_nchildren(btree, node); + nchildren = nilfs_btree_node_get_nchildren(node); if (keyp != NULL) *keyp = key; if (ptrp != NULL) @@ -405,11 +369,10 @@ static void nilfs_btree_node_delete(struct nilfs_btree *btree, (nchildren - index - 1) * sizeof(*dptrs)); } nchildren--; - nilfs_btree_node_set_nchildren(btree, node, nchildren); + nilfs_btree_node_set_nchildren(node, nchildren); } -static int nilfs_btree_node_lookup(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node, +static int nilfs_btree_node_lookup(const struct nilfs_btree_node *node, __u64 key, int *indexp) { __u64 nkey; @@ -417,12 +380,12 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree *btree, /* binary search */ low = 0; - high = nilfs_btree_node_get_nchildren(btree, node) - 1; + high = nilfs_btree_node_get_nchildren(node) - 1; index = 0; s = 0; while (low <= high) { index = (low + high) / 2; - nkey = nilfs_btree_node_get_key(btree, node, index); + nkey = nilfs_btree_node_get_key(node, index); if (nkey == key) { s = 0; goto out; @@ -436,9 +399,8 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree *btree, } /* adjust index */ - if (nilfs_btree_node_get_level(btree, node) > - NILFS_BTREE_LEVEL_NODE_MIN) { - if ((s > 0) && (index > 0)) + if (nilfs_btree_node_get_level(node) > NILFS_BTREE_LEVEL_NODE_MIN) { + if (s > 0 && index > 0) index--; } else if (s < 0) index++; @@ -456,25 +418,20 @@ nilfs_btree_get_root(const struct nilfs_btree *btree) } static inline struct nilfs_btree_node * -nilfs_btree_get_nonroot_node(const struct nilfs_btree *btree, - const struct nilfs_btree_path *path, - int level) +nilfs_btree_get_nonroot_node(const struct nilfs_btree_path *path, int level) { return (struct nilfs_btree_node *)path[level].bp_bh->b_data; } static inline struct nilfs_btree_node * -nilfs_btree_get_sib_node(const struct nilfs_btree *btree, - const struct nilfs_btree_path *path, - int level) +nilfs_btree_get_sib_node(const struct nilfs_btree_path *path, int level) { return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data; } static inline int nilfs_btree_height(const struct nilfs_btree *btree) { - return nilfs_btree_node_get_level(btree, nilfs_btree_get_root(btree)) - + 1; + return nilfs_btree_node_get_level(nilfs_btree_get_root(btree)) + 1; } static inline struct nilfs_btree_node * @@ -484,7 +441,7 @@ nilfs_btree_get_node(const struct nilfs_btree *btree, { return (level == nilfs_btree_height(btree) - 1) ? nilfs_btree_get_root(btree) : - nilfs_btree_get_nonroot_node(btree, path, level); + nilfs_btree_get_nonroot_node(path, level); } static int nilfs_btree_do_lookup(const struct nilfs_btree *btree, @@ -496,12 +453,11 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree, int level, index, found, ret; node = nilfs_btree_get_root(btree); - level = nilfs_btree_node_get_level(btree, node); - if ((level < minlevel) || - (nilfs_btree_node_get_nchildren(btree, node) <= 0)) + level = nilfs_btree_node_get_level(node); + if (level < minlevel || nilfs_btree_node_get_nchildren(node) <= 0) return -ENOENT; - found = nilfs_btree_node_lookup(btree, node, key, &index); + found = nilfs_btree_node_lookup(node, key, &index); ptr = nilfs_btree_node_get_ptr(btree, node, index); path[level].bp_bh = NULL; path[level].bp_index = index; @@ -510,14 +466,13 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree, ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh); if (ret < 0) return ret; - node = nilfs_btree_get_nonroot_node(btree, path, level); - BUG_ON(level != nilfs_btree_node_get_level(btree, node)); + node = nilfs_btree_get_nonroot_node(path, level); + BUG_ON(level != nilfs_btree_node_get_level(node)); if (!found) - found = nilfs_btree_node_lookup(btree, node, key, - &index); + found = nilfs_btree_node_lookup(node, key, &index); else index = 0; - if (index < nilfs_btree_node_nchildren_max(btree, node)) + if (index < nilfs_btree_node_nchildren_max(node, btree)) ptr = nilfs_btree_node_get_ptr(btree, node, index); else { WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN); @@ -544,10 +499,10 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree, int index, level, ret; node = nilfs_btree_get_root(btree); - index = nilfs_btree_node_get_nchildren(btree, node) - 1; + index = nilfs_btree_node_get_nchildren(node) - 1; if (index < 0) return -ENOENT; - level = nilfs_btree_node_get_level(btree, node); + level = nilfs_btree_node_get_level(node); ptr = nilfs_btree_node_get_ptr(btree, node, index); path[level].bp_bh = NULL; path[level].bp_index = index; @@ -556,15 +511,15 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree, ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh); if (ret < 0) return ret; - node = nilfs_btree_get_nonroot_node(btree, path, level); - BUG_ON(level != nilfs_btree_node_get_level(btree, node)); - index = nilfs_btree_node_get_nchildren(btree, node) - 1; + node = nilfs_btree_get_nonroot_node(path, level); + BUG_ON(level != nilfs_btree_node_get_level(node)); + index = nilfs_btree_node_get_nchildren(node) - 1; ptr = nilfs_btree_node_get_ptr(btree, node, index); path[level].bp_index = index; } if (keyp != NULL) - *keyp = nilfs_btree_node_get_key(btree, node, index); + *keyp = nilfs_btree_node_get_key(node, index); if (ptrp != NULL) *ptrp = ptr; @@ -580,18 +535,18 @@ static int nilfs_btree_lookup(const struct nilfs_bmap *bmap, int ret; btree = (struct nilfs_btree *)bmap; - path = nilfs_btree_alloc_path(btree); + path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(btree, path); + nilfs_btree_init_path(path); ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); if (ptrp != NULL) *ptrp = ptr; - nilfs_btree_clear_path(btree, path); - nilfs_btree_free_path(btree, path); + nilfs_btree_release_path(path); + nilfs_btree_free_path(path); return ret; } @@ -608,10 +563,10 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, int level = NILFS_BTREE_LEVEL_NODE_MIN; int ret, cnt, index, maxlevel; - path = nilfs_btree_alloc_path(btree); + path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(btree, path); + nilfs_btree_init_path(path); ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); if (ret < 0) goto out; @@ -631,8 +586,8 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, node = nilfs_btree_get_node(btree, path, level); index = path[level].bp_index + 1; for (;;) { - while (index < nilfs_btree_node_get_nchildren(btree, node)) { - if (nilfs_btree_node_get_key(btree, node, index) != + while (index < nilfs_btree_node_get_nchildren(node)) { + if (nilfs_btree_node_get_key(node, index) != key + cnt) goto end; ptr2 = nilfs_btree_node_get_ptr(btree, node, index); @@ -653,8 +608,8 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, /* look-up right sibling node */ node = nilfs_btree_get_node(btree, path, level + 1); index = path[level + 1].bp_index + 1; - if (index >= nilfs_btree_node_get_nchildren(btree, node) || - nilfs_btree_node_get_key(btree, node, index) != key + cnt) + if (index >= nilfs_btree_node_get_nchildren(node) || + nilfs_btree_node_get_key(node, index) != key + cnt) break; ptr2 = nilfs_btree_node_get_ptr(btree, node, index); path[level + 1].bp_index = index; @@ -664,7 +619,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, ret = nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh); if (ret < 0) goto out; - node = nilfs_btree_get_nonroot_node(btree, path, level); + node = nilfs_btree_get_nonroot_node(path, level); index = 0; path[level].bp_index = index; } @@ -672,8 +627,8 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, *ptrp = ptr; ret = cnt; out: - nilfs_btree_clear_path(btree, path); - nilfs_btree_free_path(btree, path); + nilfs_btree_release_path(path); + nilfs_btree_free_path(path); return ret; } @@ -685,9 +640,7 @@ static void nilfs_btree_promote_key(struct nilfs_btree *btree, do { lock_buffer(path[level].bp_bh); nilfs_btree_node_set_key( - btree, - nilfs_btree_get_nonroot_node( - btree, path, level), + nilfs_btree_get_nonroot_node(path, level), path[level].bp_index, key); if (!buffer_dirty(path[level].bp_bh)) nilfs_btnode_mark_dirty(path[level].bp_bh); @@ -698,8 +651,7 @@ static void nilfs_btree_promote_key(struct nilfs_btree *btree, /* root */ if (level == nilfs_btree_height(btree) - 1) { - nilfs_btree_node_set_key(btree, - nilfs_btree_get_root(btree), + nilfs_btree_node_set_key(nilfs_btree_get_root(btree), path[level].bp_index, key); } } @@ -712,7 +664,7 @@ static void nilfs_btree_do_insert(struct nilfs_btree *btree, if (level < nilfs_btree_height(btree) - 1) { lock_buffer(path[level].bp_bh); - node = nilfs_btree_get_nonroot_node(btree, path, level); + node = nilfs_btree_get_nonroot_node(path, level); nilfs_btree_node_insert(btree, node, *keyp, *ptrp, path[level].bp_index); if (!buffer_dirty(path[level].bp_bh)) @@ -721,8 +673,8 @@ static void nilfs_btree_do_insert(struct nilfs_btree *btree, if (path[level].bp_index == 0) nilfs_btree_promote_key(btree, path, level + 1, - nilfs_btree_node_get_key( - btree, node, 0)); + nilfs_btree_node_get_key(node, + 0)); } else { node = nilfs_btree_get_root(btree); nilfs_btree_node_insert(btree, node, *keyp, *ptrp, @@ -740,10 +692,10 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree, lock_buffer(path[level].bp_bh); lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(btree, path, level); - left = nilfs_btree_get_sib_node(btree, path, level); - nchildren = nilfs_btree_node_get_nchildren(btree, node); - lnchildren = nilfs_btree_node_get_nchildren(btree, left); + node = nilfs_btree_get_nonroot_node(path, level); + left = nilfs_btree_get_sib_node(path, level); + nchildren = nilfs_btree_node_get_nchildren(node); + lnchildren = nilfs_btree_node_get_nchildren(left); move = 0; n = (nchildren + lnchildren + 1) / 2 - lnchildren; @@ -764,7 +716,7 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree, unlock_buffer(path[level].bp_sib_bh); nilfs_btree_promote_key(btree, path, level + 1, - nilfs_btree_node_get_key(btree, node, 0)); + nilfs_btree_node_get_key(node, 0)); if (move) { brelse(path[level].bp_bh); @@ -791,10 +743,10 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree, lock_buffer(path[level].bp_bh); lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(btree, path, level); - right = nilfs_btree_get_sib_node(btree, path, level); - nchildren = nilfs_btree_node_get_nchildren(btree, node); - rnchildren = nilfs_btree_node_get_nchildren(btree, right); + node = nilfs_btree_get_nonroot_node(path, level); + right = nilfs_btree_get_sib_node(path, level); + nchildren = nilfs_btree_node_get_nchildren(node); + rnchildren = nilfs_btree_node_get_nchildren(right); move = 0; n = (nchildren + rnchildren + 1) / 2 - rnchildren; @@ -816,15 +768,14 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree, path[level + 1].bp_index++; nilfs_btree_promote_key(btree, path, level + 1, - nilfs_btree_node_get_key(btree, right, 0)); + nilfs_btree_node_get_key(right, 0)); path[level + 1].bp_index--; if (move) { brelse(path[level].bp_bh); path[level].bp_bh = path[level].bp_sib_bh; path[level].bp_sib_bh = NULL; - path[level].bp_index -= - nilfs_btree_node_get_nchildren(btree, node); + path[level].bp_index -= nilfs_btree_node_get_nchildren(node); path[level + 1].bp_index++; } else { brelse(path[level].bp_sib_bh); @@ -846,9 +797,9 @@ static void nilfs_btree_split(struct nilfs_btree *btree, lock_buffer(path[level].bp_bh); lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(btree, path, level); - right = nilfs_btree_get_sib_node(btree, path, level); - nchildren = nilfs_btree_node_get_nchildren(btree, node); + node = nilfs_btree_get_nonroot_node(path, level); + right = nilfs_btree_get_sib_node(path, level); + nchildren = nilfs_btree_node_get_nchildren(node); move = 0; n = (nchildren + 1) / 2; @@ -867,16 +818,15 @@ static void nilfs_btree_split(struct nilfs_btree *btree, unlock_buffer(path[level].bp_bh); unlock_buffer(path[level].bp_sib_bh); - newkey = nilfs_btree_node_get_key(btree, right, 0); + newkey = nilfs_btree_node_get_key(right, 0); newptr = path[level].bp_newreq.bpr_ptr; if (move) { - path[level].bp_index -= - nilfs_btree_node_get_nchildren(btree, node); + path[level].bp_index -= nilfs_btree_node_get_nchildren(node); nilfs_btree_node_insert(btree, right, *keyp, *ptrp, path[level].bp_index); - *keyp = nilfs_btree_node_get_key(btree, right, 0); + *keyp = nilfs_btree_node_get_key(right, 0); *ptrp = path[level].bp_newreq.bpr_ptr; brelse(path[level].bp_bh); @@ -885,7 +835,7 @@ static void nilfs_btree_split(struct nilfs_btree *btree, } else { nilfs_btree_do_insert(btree, path, level, keyp, ptrp); - *keyp = nilfs_btree_node_get_key(btree, right, 0); + *keyp = nilfs_btree_node_get_key(right, 0); *ptrp = path[level].bp_newreq.bpr_ptr; brelse(path[level].bp_sib_bh); @@ -905,12 +855,12 @@ static void nilfs_btree_grow(struct nilfs_btree *btree, lock_buffer(path[level].bp_sib_bh); root = nilfs_btree_get_root(btree); - child = nilfs_btree_get_sib_node(btree, path, level); + child = nilfs_btree_get_sib_node(path, level); - n = nilfs_btree_node_get_nchildren(btree, root); + n = nilfs_btree_node_get_nchildren(root); nilfs_btree_node_move_right(btree, root, child, n); - nilfs_btree_node_set_level(btree, root, level + 1); + nilfs_btree_node_set_level(root, level + 1); if (!buffer_dirty(path[level].bp_sib_bh)) nilfs_btnode_mark_dirty(path[level].bp_sib_bh); @@ -922,7 +872,7 @@ static void nilfs_btree_grow(struct nilfs_btree *btree, nilfs_btree_do_insert(btree, path, level, keyp, ptrp); - *keyp = nilfs_btree_node_get_key(btree, child, 0); + *keyp = nilfs_btree_node_get_key(child, 0); *ptrp = path[level].bp_newreq.bpr_ptr; } @@ -990,26 +940,29 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, struct nilfs_btree_node *node, *parent, *sib; __u64 sibptr; int pindex, level, ret; + struct inode *dat = NULL; stats->bs_nblocks = 0; level = NILFS_BTREE_LEVEL_DATA; /* allocate a new ptr for data block */ - if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) + if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) { path[level].bp_newreq.bpr_ptr = nilfs_btree_find_target_v(btree, path, key); + dat = nilfs_bmap_get_dat(&btree->bt_bmap); + } ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, - &path[level].bp_newreq); + &path[level].bp_newreq, dat); if (ret < 0) goto err_out_data; for (level = NILFS_BTREE_LEVEL_NODE_MIN; level < nilfs_btree_height(btree) - 1; level++) { - node = nilfs_btree_get_nonroot_node(btree, path, level); - if (nilfs_btree_node_get_nchildren(btree, node) < - nilfs_btree_node_nchildren_max(btree, node)) { + node = nilfs_btree_get_nonroot_node(path, level); + if (nilfs_btree_node_get_nchildren(node) < + nilfs_btree_node_nchildren_max(node, btree)) { path[level].bp_op = nilfs_btree_do_insert; stats->bs_nblocks++; goto out; @@ -1026,8 +979,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, if (ret < 0) goto err_out_child_node; sib = (struct nilfs_btree_node *)bh->b_data; - if (nilfs_btree_node_get_nchildren(btree, sib) < - nilfs_btree_node_nchildren_max(btree, sib)) { + if (nilfs_btree_node_get_nchildren(sib) < + nilfs_btree_node_nchildren_max(sib, btree)) { path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_carry_left; stats->bs_nblocks++; @@ -1038,15 +991,15 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, /* right sibling */ if (pindex < - nilfs_btree_node_get_nchildren(btree, parent) - 1) { + nilfs_btree_node_get_nchildren(parent) - 1) { sibptr = nilfs_btree_node_get_ptr(btree, parent, pindex + 1); ret = nilfs_btree_get_block(btree, sibptr, &bh); if (ret < 0) goto err_out_child_node; sib = (struct nilfs_btree_node *)bh->b_data; - if (nilfs_btree_node_get_nchildren(btree, sib) < - nilfs_btree_node_nchildren_max(btree, sib)) { + if (nilfs_btree_node_get_nchildren(sib) < + nilfs_btree_node_nchildren_max(sib, btree)) { path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_carry_right; stats->bs_nblocks++; @@ -1059,7 +1012,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1; ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, - &path[level].bp_newreq); + &path[level].bp_newreq, dat); if (ret < 0) goto err_out_child_node; ret = nilfs_btree_get_new_block(btree, @@ -1081,8 +1034,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, /* root */ node = nilfs_btree_get_root(btree); - if (nilfs_btree_node_get_nchildren(btree, node) < - nilfs_btree_node_nchildren_max(btree, node)) { + if (nilfs_btree_node_get_nchildren(node) < + nilfs_btree_node_nchildren_max(node, btree)) { path[level].bp_op = nilfs_btree_do_insert; stats->bs_nblocks++; goto out; @@ -1091,7 +1044,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, /* grow */ path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1; ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, - &path[level].bp_newreq); + &path[level].bp_newreq, dat); if (ret < 0) goto err_out_child_node; ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr, @@ -1119,16 +1072,18 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, /* error */ err_out_curr_node: - nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq); + nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq, + dat); err_out_child_node: for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) { nilfs_btnode_delete(path[level].bp_sib_bh); nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, - &path[level].bp_newreq); + &path[level].bp_newreq, dat); } - nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq); + nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq, + dat); err_out_data: *levelp = level; stats->bs_nblocks = 0; @@ -1139,16 +1094,19 @@ static void nilfs_btree_commit_insert(struct nilfs_btree *btree, struct nilfs_btree_path *path, int maxlevel, __u64 key, __u64 ptr) { + struct inode *dat = NULL; int level; set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr; - if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) + if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) { nilfs_btree_set_target_v(btree, key, ptr); + dat = nilfs_bmap_get_dat(&btree->bt_bmap); + } for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { nilfs_bmap_commit_alloc_ptr(&btree->bt_bmap, - &path[level - 1].bp_newreq); + &path[level - 1].bp_newreq, dat); path[level].bp_op(btree, path, level, &key, &ptr); } @@ -1164,10 +1122,10 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) int level, ret; btree = (struct nilfs_btree *)bmap; - path = nilfs_btree_alloc_path(btree); + path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(btree, path); + nilfs_btree_init_path(path); ret = nilfs_btree_do_lookup(btree, path, key, NULL, NILFS_BTREE_LEVEL_NODE_MIN); @@ -1184,8 +1142,8 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); out: - nilfs_btree_clear_path(btree, path); - nilfs_btree_free_path(btree, path); + nilfs_btree_release_path(path); + nilfs_btree_free_path(path); return ret; } @@ -1197,7 +1155,7 @@ static void nilfs_btree_do_delete(struct nilfs_btree *btree, if (level < nilfs_btree_height(btree) - 1) { lock_buffer(path[level].bp_bh); - node = nilfs_btree_get_nonroot_node(btree, path, level); + node = nilfs_btree_get_nonroot_node(path, level); nilfs_btree_node_delete(btree, node, keyp, ptrp, path[level].bp_index); if (!buffer_dirty(path[level].bp_bh)) @@ -1205,7 +1163,7 @@ static void nilfs_btree_do_delete(struct nilfs_btree *btree, unlock_buffer(path[level].bp_bh); if (path[level].bp_index == 0) nilfs_btree_promote_key(btree, path, level + 1, - nilfs_btree_node_get_key(btree, node, 0)); + nilfs_btree_node_get_key(node, 0)); } else { node = nilfs_btree_get_root(btree); nilfs_btree_node_delete(btree, node, keyp, ptrp, @@ -1225,10 +1183,10 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree, lock_buffer(path[level].bp_bh); lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(btree, path, level); - left = nilfs_btree_get_sib_node(btree, path, level); - nchildren = nilfs_btree_node_get_nchildren(btree, node); - lnchildren = nilfs_btree_node_get_nchildren(btree, left); + node = nilfs_btree_get_nonroot_node(path, level); + left = nilfs_btree_get_sib_node(path, level); + nchildren = nilfs_btree_node_get_nchildren(node); + lnchildren = nilfs_btree_node_get_nchildren(left); n = (nchildren + lnchildren) / 2 - nchildren; @@ -1243,7 +1201,7 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree, unlock_buffer(path[level].bp_sib_bh); nilfs_btree_promote_key(btree, path, level + 1, - nilfs_btree_node_get_key(btree, node, 0)); + nilfs_btree_node_get_key(node, 0)); brelse(path[level].bp_sib_bh); path[level].bp_sib_bh = NULL; @@ -1262,10 +1220,10 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree, lock_buffer(path[level].bp_bh); lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(btree, path, level); - right = nilfs_btree_get_sib_node(btree, path, level); - nchildren = nilfs_btree_node_get_nchildren(btree, node); - rnchildren = nilfs_btree_node_get_nchildren(btree, right); + node = nilfs_btree_get_nonroot_node(path, level); + right = nilfs_btree_get_sib_node(path, level); + nchildren = nilfs_btree_node_get_nchildren(node); + rnchildren = nilfs_btree_node_get_nchildren(right); n = (nchildren + rnchildren) / 2 - nchildren; @@ -1281,7 +1239,7 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree, path[level + 1].bp_index++; nilfs_btree_promote_key(btree, path, level + 1, - nilfs_btree_node_get_key(btree, right, 0)); + nilfs_btree_node_get_key(right, 0)); path[level + 1].bp_index--; brelse(path[level].bp_sib_bh); @@ -1300,10 +1258,10 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree, lock_buffer(path[level].bp_bh); lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(btree, path, level); - left = nilfs_btree_get_sib_node(btree, path, level); + node = nilfs_btree_get_nonroot_node(path, level); + left = nilfs_btree_get_sib_node(path, level); - n = nilfs_btree_node_get_nchildren(btree, node); + n = nilfs_btree_node_get_nchildren(node); nilfs_btree_node_move_left(btree, left, node, n); @@ -1316,7 +1274,7 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree, nilfs_btnode_delete(path[level].bp_bh); path[level].bp_bh = path[level].bp_sib_bh; path[level].bp_sib_bh = NULL; - path[level].bp_index += nilfs_btree_node_get_nchildren(btree, left); + path[level].bp_index += nilfs_btree_node_get_nchildren(left); } static void nilfs_btree_concat_right(struct nilfs_btree *btree, @@ -1331,10 +1289,10 @@ static void nilfs_btree_concat_right(struct nilfs_btree *btree, lock_buffer(path[level].bp_bh); lock_buffer(path[level].bp_sib_bh); - node = nilfs_btree_get_nonroot_node(btree, path, level); - right = nilfs_btree_get_sib_node(btree, path, level); + node = nilfs_btree_get_nonroot_node(path, level); + right = nilfs_btree_get_sib_node(path, level); - n = nilfs_btree_node_get_nchildren(btree, right); + n = nilfs_btree_node_get_nchildren(right); nilfs_btree_node_move_left(btree, node, right, n); @@ -1360,11 +1318,11 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree, lock_buffer(path[level].bp_bh); root = nilfs_btree_get_root(btree); - child = nilfs_btree_get_nonroot_node(btree, path, level); + child = nilfs_btree_get_nonroot_node(path, level); nilfs_btree_node_delete(btree, root, NULL, NULL, 0); - nilfs_btree_node_set_level(btree, root, level); - n = nilfs_btree_node_get_nchildren(btree, child); + nilfs_btree_node_set_level(root, level); + n = nilfs_btree_node_get_nchildren(child); nilfs_btree_node_move_left(btree, root, child, n); unlock_buffer(path[level].bp_bh); @@ -1376,7 +1334,8 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree, static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, struct nilfs_btree_path *path, int *levelp, - struct nilfs_bmap_stats *stats) + struct nilfs_bmap_stats *stats, + struct inode *dat) { struct buffer_head *bh; struct nilfs_btree_node *node, *parent, *sib; @@ -1388,17 +1347,17 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, for (level = NILFS_BTREE_LEVEL_NODE_MIN; level < nilfs_btree_height(btree) - 1; level++) { - node = nilfs_btree_get_nonroot_node(btree, path, level); + node = nilfs_btree_get_nonroot_node(path, level); path[level].bp_oldreq.bpr_ptr = nilfs_btree_node_get_ptr(btree, node, path[level].bp_index); ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap, - &path[level].bp_oldreq); + &path[level].bp_oldreq, dat); if (ret < 0) goto err_out_child_node; - if (nilfs_btree_node_get_nchildren(btree, node) > - nilfs_btree_node_nchildren_min(btree, node)) { + if (nilfs_btree_node_get_nchildren(node) > + nilfs_btree_node_nchildren_min(node, btree)) { path[level].bp_op = nilfs_btree_do_delete; stats->bs_nblocks++; goto out; @@ -1415,8 +1374,8 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, if (ret < 0) goto err_out_curr_node; sib = (struct nilfs_btree_node *)bh->b_data; - if (nilfs_btree_node_get_nchildren(btree, sib) > - nilfs_btree_node_nchildren_min(btree, sib)) { + if (nilfs_btree_node_get_nchildren(sib) > + nilfs_btree_node_nchildren_min(sib, btree)) { path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_borrow_left; stats->bs_nblocks++; @@ -1428,7 +1387,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, /* continue; */ } } else if (pindex < - nilfs_btree_node_get_nchildren(btree, parent) - 1) { + nilfs_btree_node_get_nchildren(parent) - 1) { /* right sibling */ sibptr = nilfs_btree_node_get_ptr(btree, parent, pindex + 1); @@ -1436,8 +1395,8 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, if (ret < 0) goto err_out_curr_node; sib = (struct nilfs_btree_node *)bh->b_data; - if (nilfs_btree_node_get_nchildren(btree, sib) > - nilfs_btree_node_nchildren_min(btree, sib)) { + if (nilfs_btree_node_get_nchildren(sib) > + nilfs_btree_node_nchildren_min(sib, btree)) { path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_borrow_right; stats->bs_nblocks++; @@ -1452,7 +1411,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, /* no siblings */ /* the only child of the root node */ WARN_ON(level != nilfs_btree_height(btree) - 2); - if (nilfs_btree_node_get_nchildren(btree, node) - 1 <= + if (nilfs_btree_node_get_nchildren(node) - 1 <= NILFS_BTREE_ROOT_NCHILDREN_MAX) { path[level].bp_op = nilfs_btree_shrink; stats->bs_nblocks += 2; @@ -1471,7 +1430,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, nilfs_btree_node_get_ptr(btree, node, path[level].bp_index); ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap, - &path[level].bp_oldreq); + &path[level].bp_oldreq, dat); if (ret < 0) goto err_out_child_node; @@ -1486,12 +1445,12 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, /* error */ err_out_curr_node: - nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq); + nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq, dat); err_out_child_node: for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) { brelse(path[level].bp_sib_bh); nilfs_bmap_abort_end_ptr(&btree->bt_bmap, - &path[level].bp_oldreq); + &path[level].bp_oldreq, dat); } *levelp = level; stats->bs_nblocks = 0; @@ -1500,13 +1459,13 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, static void nilfs_btree_commit_delete(struct nilfs_btree *btree, struct nilfs_btree_path *path, - int maxlevel) + int maxlevel, struct inode *dat) { int level; for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { nilfs_bmap_commit_end_ptr(&btree->bt_bmap, - &path[level].bp_oldreq); + &path[level].bp_oldreq, dat); path[level].bp_op(btree, path, level, NULL, NULL); } @@ -1520,27 +1479,32 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key) struct nilfs_btree *btree; struct nilfs_btree_path *path; struct nilfs_bmap_stats stats; + struct inode *dat; int level, ret; btree = (struct nilfs_btree *)bmap; - path = nilfs_btree_alloc_path(btree); + path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(btree, path); + nilfs_btree_init_path(path); ret = nilfs_btree_do_lookup(btree, path, key, NULL, NILFS_BTREE_LEVEL_NODE_MIN); if (ret < 0) goto out; - ret = nilfs_btree_prepare_delete(btree, path, &level, &stats); + + dat = NILFS_BMAP_USE_VBN(&btree->bt_bmap) ? + nilfs_bmap_get_dat(&btree->bt_bmap) : NULL; + + ret = nilfs_btree_prepare_delete(btree, path, &level, &stats, dat); if (ret < 0) goto out; - nilfs_btree_commit_delete(btree, path, level); + nilfs_btree_commit_delete(btree, path, level, dat); nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks); out: - nilfs_btree_clear_path(btree, path); - nilfs_btree_free_path(btree, path); + nilfs_btree_release_path(path); + nilfs_btree_free_path(path); return ret; } @@ -1551,15 +1515,15 @@ static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp) int ret; btree = (struct nilfs_btree *)bmap; - path = nilfs_btree_alloc_path(btree); + path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(btree, path); + nilfs_btree_init_path(path); ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL); - nilfs_btree_clear_path(btree, path); - nilfs_btree_free_path(btree, path); + nilfs_btree_release_path(path); + nilfs_btree_free_path(path); return ret; } @@ -1581,7 +1545,7 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key) node = root; break; case 3: - nchildren = nilfs_btree_node_get_nchildren(btree, root); + nchildren = nilfs_btree_node_get_nchildren(root); if (nchildren > 1) return 0; ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); @@ -1594,10 +1558,10 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key) return 0; } - nchildren = nilfs_btree_node_get_nchildren(btree, node); - maxkey = nilfs_btree_node_get_key(btree, node, nchildren - 1); + nchildren = nilfs_btree_node_get_nchildren(node); + maxkey = nilfs_btree_node_get_key(node, nchildren - 1); nextmaxkey = (nchildren > 1) ? - nilfs_btree_node_get_key(btree, node, nchildren - 2) : 0; + nilfs_btree_node_get_key(node, nchildren - 2) : 0; if (bh != NULL) brelse(bh); @@ -1623,7 +1587,7 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap, node = root; break; case 3: - nchildren = nilfs_btree_node_get_nchildren(btree, root); + nchildren = nilfs_btree_node_get_nchildren(root); WARN_ON(nchildren > 1); ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); ret = nilfs_btree_get_block(btree, ptr, &bh); @@ -1636,11 +1600,11 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap, return -EINVAL; } - nchildren = nilfs_btree_node_get_nchildren(btree, node); + nchildren = nilfs_btree_node_get_nchildren(node); if (nchildren < nitems) nitems = nchildren; - dkeys = nilfs_btree_node_dkeys(btree, node); - dptrs = nilfs_btree_node_dptrs(btree, node); + dkeys = nilfs_btree_node_dkeys(node); + dptrs = nilfs_btree_node_dptrs(node, btree); for (i = 0; i < nitems; i++) { keys[i] = nilfs_bmap_dkey_to_key(dkeys[i]); ptrs[i] = nilfs_bmap_dptr_to_ptr(dptrs[i]); @@ -1660,18 +1624,20 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key, struct nilfs_bmap_stats *stats) { struct buffer_head *bh; - struct nilfs_btree *btree; + struct nilfs_btree *btree = (struct nilfs_btree *)bmap; + struct inode *dat = NULL; int ret; - btree = (struct nilfs_btree *)bmap; stats->bs_nblocks = 0; /* for data */ /* cannot find near ptr */ - if (NILFS_BMAP_USE_VBN(bmap)) + if (NILFS_BMAP_USE_VBN(bmap)) { dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key); + dat = nilfs_bmap_get_dat(bmap); + } - ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq); + ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq, dat); if (ret < 0) return ret; @@ -1679,7 +1645,7 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key, stats->bs_nblocks++; if (nreq != NULL) { nreq->bpr_ptr = dreq->bpr_ptr + 1; - ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq); + ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq, dat); if (ret < 0) goto err_out_dreq; @@ -1696,9 +1662,9 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key, /* error */ err_out_nreq: - nilfs_bmap_abort_alloc_ptr(bmap, nreq); + nilfs_bmap_abort_alloc_ptr(bmap, nreq, dat); err_out_dreq: - nilfs_bmap_abort_alloc_ptr(bmap, dreq); + nilfs_bmap_abort_alloc_ptr(bmap, dreq, dat); stats->bs_nblocks = 0; return ret; @@ -1713,8 +1679,9 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap, union nilfs_bmap_ptr_req *nreq, struct buffer_head *bh) { - struct nilfs_btree *btree; + struct nilfs_btree *btree = (struct nilfs_btree *)bmap; struct nilfs_btree_node *node; + struct inode *dat; __u64 tmpptr; /* free resources */ @@ -1725,11 +1692,11 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap, set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); /* convert and insert */ - btree = (struct nilfs_btree *)bmap; + dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL; nilfs_btree_init(bmap); if (nreq != NULL) { - nilfs_bmap_commit_alloc_ptr(bmap, dreq); - nilfs_bmap_commit_alloc_ptr(bmap, nreq); + nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat); + nilfs_bmap_commit_alloc_ptr(bmap, nreq, dat); /* create child node at level 1 */ lock_buffer(bh); @@ -1751,7 +1718,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap, nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT, 2, 1, &keys[0], &tmpptr); } else { - nilfs_bmap_commit_alloc_ptr(bmap, dreq); + nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat); /* create root node at level 1 */ node = nilfs_btree_get_root(btree); @@ -1822,7 +1789,7 @@ static int nilfs_btree_propagate_p(struct nilfs_btree *btree, static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, struct nilfs_btree_path *path, - int level) + int level, struct inode *dat) { struct nilfs_btree_node *parent; int ret; @@ -1832,9 +1799,8 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, nilfs_btree_node_get_ptr(btree, parent, path[level + 1].bp_index); path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1; - ret = nilfs_bmap_prepare_update_v(&btree->bt_bmap, - &path[level].bp_oldreq, - &path[level].bp_newreq); + ret = nilfs_dat_prepare_update(dat, &path[level].bp_oldreq.bpr_req, + &path[level].bp_newreq.bpr_req); if (ret < 0) return ret; @@ -1846,9 +1812,9 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, &path[level].bp_ctxt); if (ret < 0) { - nilfs_bmap_abort_update_v(&btree->bt_bmap, - &path[level].bp_oldreq, - &path[level].bp_newreq); + nilfs_dat_abort_update(dat, + &path[level].bp_oldreq.bpr_req, + &path[level].bp_newreq.bpr_req); return ret; } } @@ -1858,13 +1824,13 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, static void nilfs_btree_commit_update_v(struct nilfs_btree *btree, struct nilfs_btree_path *path, - int level) + int level, struct inode *dat) { struct nilfs_btree_node *parent; - nilfs_bmap_commit_update_v(&btree->bt_bmap, - &path[level].bp_oldreq, - &path[level].bp_newreq); + nilfs_dat_commit_update(dat, &path[level].bp_oldreq.bpr_req, + &path[level].bp_newreq.bpr_req, + btree->bt_bmap.b_ptr_type == NILFS_BMAP_PTR_VS); if (buffer_nilfs_node(path[level].bp_bh)) { nilfs_btnode_commit_change_key( @@ -1881,11 +1847,10 @@ static void nilfs_btree_commit_update_v(struct nilfs_btree *btree, static void nilfs_btree_abort_update_v(struct nilfs_btree *btree, struct nilfs_btree_path *path, - int level) + int level, struct inode *dat) { - nilfs_bmap_abort_update_v(&btree->bt_bmap, - &path[level].bp_oldreq, - &path[level].bp_newreq); + nilfs_dat_abort_update(dat, &path[level].bp_oldreq.bpr_req, + &path[level].bp_newreq.bpr_req); if (buffer_nilfs_node(path[level].bp_bh)) nilfs_btnode_abort_change_key( &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, @@ -1894,14 +1859,14 @@ static void nilfs_btree_abort_update_v(struct nilfs_btree *btree, static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree, struct nilfs_btree_path *path, - int minlevel, - int *maxlevelp) + int minlevel, int *maxlevelp, + struct inode *dat) { int level, ret; level = minlevel; if (!buffer_nilfs_volatile(path[level].bp_bh)) { - ret = nilfs_btree_prepare_update_v(btree, path, level); + ret = nilfs_btree_prepare_update_v(btree, path, level, dat); if (ret < 0) return ret; } @@ -1909,7 +1874,7 @@ static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree, !buffer_dirty(path[level].bp_bh)) { WARN_ON(buffer_nilfs_volatile(path[level].bp_bh)); - ret = nilfs_btree_prepare_update_v(btree, path, level); + ret = nilfs_btree_prepare_update_v(btree, path, level, dat); if (ret < 0) goto out; } @@ -1921,39 +1886,40 @@ static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree, /* error */ out: while (--level > minlevel) - nilfs_btree_abort_update_v(btree, path, level); + nilfs_btree_abort_update_v(btree, path, level, dat); if (!buffer_nilfs_volatile(path[level].bp_bh)) - nilfs_btree_abort_update_v(btree, path, level); + nilfs_btree_abort_update_v(btree, path, level, dat); return ret; } static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree, struct nilfs_btree_path *path, - int minlevel, - int maxlevel, - struct buffer_head *bh) + int minlevel, int maxlevel, + struct buffer_head *bh, + struct inode *dat) { int level; if (!buffer_nilfs_volatile(path[minlevel].bp_bh)) - nilfs_btree_commit_update_v(btree, path, minlevel); + nilfs_btree_commit_update_v(btree, path, minlevel, dat); for (level = minlevel + 1; level <= maxlevel; level++) - nilfs_btree_commit_update_v(btree, path, level); + nilfs_btree_commit_update_v(btree, path, level, dat); } static int nilfs_btree_propagate_v(struct nilfs_btree *btree, struct nilfs_btree_path *path, - int level, - struct buffer_head *bh) + int level, struct buffer_head *bh) { int maxlevel, ret; struct nilfs_btree_node *parent; + struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap); __u64 ptr; get_bh(bh); path[level].bp_bh = bh; - ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel); + ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel, + dat); if (ret < 0) goto out; @@ -1961,12 +1927,12 @@ static int nilfs_btree_propagate_v(struct nilfs_btree *btree, parent = nilfs_btree_get_node(btree, path, level + 1); ptr = nilfs_btree_node_get_ptr(btree, parent, path[level + 1].bp_index); - ret = nilfs_bmap_mark_dirty(&btree->bt_bmap, ptr); + ret = nilfs_dat_mark_dirty(dat, ptr); if (ret < 0) goto out; } - nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh); + nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh, dat); out: brelse(path[level].bp_bh); @@ -1986,15 +1952,15 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap, WARN_ON(!buffer_dirty(bh)); btree = (struct nilfs_btree *)bmap; - path = nilfs_btree_alloc_path(btree); + path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(btree, path); + nilfs_btree_init_path(path); if (buffer_nilfs_node(bh)) { node = (struct nilfs_btree_node *)bh->b_data; - key = nilfs_btree_node_get_key(btree, node, 0); - level = nilfs_btree_node_get_level(btree, node); + key = nilfs_btree_node_get_key(node, 0); + level = nilfs_btree_node_get_level(node); } else { key = nilfs_bmap_data_get_key(bmap, bh); level = NILFS_BTREE_LEVEL_DATA; @@ -2013,8 +1979,8 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap, nilfs_btree_propagate_p(btree, path, level, bh); out: - nilfs_btree_clear_path(btree, path); - nilfs_btree_free_path(btree, path); + nilfs_btree_release_path(path); + nilfs_btree_free_path(path); return ret; } @@ -2022,7 +1988,7 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap, static int nilfs_btree_propagate_gc(const struct nilfs_bmap *bmap, struct buffer_head *bh) { - return nilfs_bmap_mark_dirty(bmap, bh->b_blocknr); + return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), bh->b_blocknr); } static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree, @@ -2037,12 +2003,12 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree, get_bh(bh); node = (struct nilfs_btree_node *)bh->b_data; - key = nilfs_btree_node_get_key(btree, node, 0); - level = nilfs_btree_node_get_level(btree, node); + key = nilfs_btree_node_get_key(node, 0); + level = nilfs_btree_node_get_level(node); list_for_each(head, &lists[level]) { cbh = list_entry(head, struct buffer_head, b_assoc_buffers); cnode = (struct nilfs_btree_node *)cbh->b_data; - ckey = nilfs_btree_node_get_key(btree, cnode, 0); + ckey = nilfs_btree_node_get_key(cnode, 0); if (key < ckey) break; } @@ -2120,8 +2086,7 @@ static int nilfs_btree_assign_p(struct nilfs_btree *btree, nilfs_btree_node_set_ptr(btree, parent, path[level + 1].bp_index, blocknr); - key = nilfs_btree_node_get_key(btree, parent, - path[level + 1].bp_index); + key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index); /* on-disk format */ binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key); binfo->bi_dat.bi_level = level; @@ -2137,6 +2102,7 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree, union nilfs_binfo *binfo) { struct nilfs_btree_node *parent; + struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap); __u64 key; __u64 ptr; union nilfs_bmap_ptr_req req; @@ -2146,12 +2112,12 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree, ptr = nilfs_btree_node_get_ptr(btree, parent, path[level + 1].bp_index); req.bpr_ptr = ptr; - ret = nilfs_bmap_start_v(&btree->bt_bmap, &req, blocknr); - if (unlikely(ret < 0)) + ret = nilfs_dat_prepare_start(dat, &req.bpr_req); + if (ret < 0) return ret; + nilfs_dat_commit_start(dat, &req.bpr_req, blocknr); - key = nilfs_btree_node_get_key(btree, parent, - path[level + 1].bp_index); + key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index); /* on-disk format */ binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); @@ -2171,15 +2137,15 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap, int level, ret; btree = (struct nilfs_btree *)bmap; - path = nilfs_btree_alloc_path(btree); + path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(btree, path); + nilfs_btree_init_path(path); if (buffer_nilfs_node(*bh)) { node = (struct nilfs_btree_node *)(*bh)->b_data; - key = nilfs_btree_node_get_key(btree, node, 0); - level = nilfs_btree_node_get_level(btree, node); + key = nilfs_btree_node_get_key(node, 0); + level = nilfs_btree_node_get_level(node); } else { key = nilfs_bmap_data_get_key(bmap, *bh); level = NILFS_BTREE_LEVEL_DATA; @@ -2196,8 +2162,8 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap, nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo); out: - nilfs_btree_clear_path(btree, path); - nilfs_btree_free_path(btree, path); + nilfs_btree_release_path(path); + nilfs_btree_free_path(path); return ret; } @@ -2207,19 +2173,18 @@ static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap, sector_t blocknr, union nilfs_binfo *binfo) { - struct nilfs_btree *btree; struct nilfs_btree_node *node; __u64 key; int ret; - btree = (struct nilfs_btree *)bmap; - ret = nilfs_bmap_move_v(bmap, (*bh)->b_blocknr, blocknr); + ret = nilfs_dat_move(nilfs_bmap_get_dat(bmap), (*bh)->b_blocknr, + blocknr); if (ret < 0) return ret; if (buffer_nilfs_node(*bh)) { node = (struct nilfs_btree_node *)(*bh)->b_data; - key = nilfs_btree_node_get_key(btree, node, 0); + key = nilfs_btree_node_get_key(node, 0); } else key = nilfs_bmap_data_get_key(bmap, *bh); @@ -2239,10 +2204,10 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level) int ret; btree = (struct nilfs_btree *)bmap; - path = nilfs_btree_alloc_path(btree); + path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - nilfs_btree_init_path(btree, path); + nilfs_btree_init_path(path); ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1); if (ret < 0) { @@ -2262,8 +2227,8 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level) nilfs_bmap_set_dirty(&btree->bt_bmap); out: - nilfs_btree_clear_path(btree, path); - nilfs_btree_free_path(btree, path); + nilfs_btree_release_path(path); + nilfs_btree_free_path(path); return ret; } diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index aec942cf79e3..1c6cfb59128d 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -815,8 +815,10 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno) void *kaddr; int ret; - if (cno == 0) - return -ENOENT; /* checkpoint number 0 is invalid */ + /* CP number is invalid if it's zero or larger than the + largest exist one.*/ + if (cno == 0 || cno >= nilfs_mdt_cno(cpfile)) + return -ENOENT; down_read(&NILFS_MDT(cpfile)->mi_sem); ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh); @@ -824,7 +826,10 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno) goto out; kaddr = kmap_atomic(bh->b_page, KM_USER0); cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); - ret = nilfs_checkpoint_snapshot(cp); + if (nilfs_checkpoint_invalid(cp)) + ret = -ENOENT; + else + ret = nilfs_checkpoint_snapshot(cp); kunmap_atomic(kaddr, KM_USER0); brelse(bh); diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h index 788a45950197..debea896e701 100644 --- a/fs/nilfs2/cpfile.h +++ b/fs/nilfs2/cpfile.h @@ -27,8 +27,6 @@ #include <linux/buffer_head.h> #include <linux/nilfs2_fs.h> -#define NILFS_CPFILE_GFP NILFS_MDT_GFP - int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int, struct nilfs_checkpoint **, diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index 8927ca27e6f7..1ff8e15bd36b 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c @@ -109,12 +109,6 @@ void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req) nilfs_palloc_commit_free_entry(dat, req); } -void nilfs_dat_abort_free(struct inode *dat, struct nilfs_palloc_req *req) -{ - nilfs_dat_abort_entry(dat, req); - nilfs_palloc_abort_free_entry(dat, req); -} - int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req) { int ret; @@ -140,11 +134,6 @@ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req, nilfs_dat_commit_entry(dat, req); } -void nilfs_dat_abort_start(struct inode *dat, struct nilfs_palloc_req *req) -{ - nilfs_dat_abort_entry(dat, req); -} - int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req) { struct nilfs_dat_entry *entry; @@ -222,6 +211,37 @@ void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req) nilfs_dat_abort_entry(dat, req); } +int nilfs_dat_prepare_update(struct inode *dat, + struct nilfs_palloc_req *oldreq, + struct nilfs_palloc_req *newreq) +{ + int ret; + + ret = nilfs_dat_prepare_end(dat, oldreq); + if (!ret) { + ret = nilfs_dat_prepare_alloc(dat, newreq); + if (ret < 0) + nilfs_dat_abort_end(dat, oldreq); + } + return ret; +} + +void nilfs_dat_commit_update(struct inode *dat, + struct nilfs_palloc_req *oldreq, + struct nilfs_palloc_req *newreq, int dead) +{ + nilfs_dat_commit_end(dat, oldreq, dead); + nilfs_dat_commit_alloc(dat, newreq); +} + +void nilfs_dat_abort_update(struct inode *dat, + struct nilfs_palloc_req *oldreq, + struct nilfs_palloc_req *newreq) +{ + nilfs_dat_abort_end(dat, oldreq); + nilfs_dat_abort_alloc(dat, newreq); +} + /** * nilfs_dat_mark_dirty - * @dat: DAT file inode diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h index d328b81eead4..406070d3ff49 100644 --- a/fs/nilfs2/dat.h +++ b/fs/nilfs2/dat.h @@ -27,7 +27,6 @@ #include <linux/buffer_head.h> #include <linux/fs.h> -#define NILFS_DAT_GFP NILFS_MDT_GFP struct nilfs_palloc_req; @@ -39,10 +38,15 @@ void nilfs_dat_abort_alloc(struct inode *, struct nilfs_palloc_req *); int nilfs_dat_prepare_start(struct inode *, struct nilfs_palloc_req *); void nilfs_dat_commit_start(struct inode *, struct nilfs_palloc_req *, sector_t); -void nilfs_dat_abort_start(struct inode *, struct nilfs_palloc_req *); int nilfs_dat_prepare_end(struct inode *, struct nilfs_palloc_req *); void nilfs_dat_commit_end(struct inode *, struct nilfs_palloc_req *, int); void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *); +int nilfs_dat_prepare_update(struct inode *, struct nilfs_palloc_req *, + struct nilfs_palloc_req *); +void nilfs_dat_commit_update(struct inode *, struct nilfs_palloc_req *, + struct nilfs_palloc_req *, int); +void nilfs_dat_abort_update(struct inode *, struct nilfs_palloc_req *, + struct nilfs_palloc_req *); int nilfs_dat_mark_dirty(struct inode *, __u64); int nilfs_dat_freev(struct inode *, __u64 *, size_t); diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c index 342d9765df8d..d369ac718277 100644 --- a/fs/nilfs2/direct.c +++ b/fs/nilfs2/direct.c @@ -125,106 +125,64 @@ static void nilfs_direct_set_target_v(struct nilfs_direct *direct, direct->d_bmap.b_last_allocated_ptr = ptr; } -static int nilfs_direct_prepare_insert(struct nilfs_direct *direct, - __u64 key, - union nilfs_bmap_ptr_req *req, - struct nilfs_bmap_stats *stats) -{ - int ret; - - if (NILFS_BMAP_USE_VBN(&direct->d_bmap)) - req->bpr_ptr = nilfs_direct_find_target_v(direct, key); - ret = nilfs_bmap_prepare_alloc_ptr(&direct->d_bmap, req); - if (ret < 0) - return ret; - - stats->bs_nblocks = 1; - return 0; -} - -static void nilfs_direct_commit_insert(struct nilfs_direct *direct, - union nilfs_bmap_ptr_req *req, - __u64 key, __u64 ptr) -{ - struct buffer_head *bh; - - /* ptr must be a pointer to a buffer head. */ - bh = (struct buffer_head *)((unsigned long)ptr); - set_buffer_nilfs_volatile(bh); - - nilfs_bmap_commit_alloc_ptr(&direct->d_bmap, req); - nilfs_direct_set_ptr(direct, key, req->bpr_ptr); - - if (!nilfs_bmap_dirty(&direct->d_bmap)) - nilfs_bmap_set_dirty(&direct->d_bmap); - - if (NILFS_BMAP_USE_VBN(&direct->d_bmap)) - nilfs_direct_set_target_v(direct, key, req->bpr_ptr); -} - static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) { - struct nilfs_direct *direct; + struct nilfs_direct *direct = (struct nilfs_direct *)bmap; union nilfs_bmap_ptr_req req; - struct nilfs_bmap_stats stats; + struct inode *dat = NULL; + struct buffer_head *bh; int ret; - direct = (struct nilfs_direct *)bmap; if (key > NILFS_DIRECT_KEY_MAX) return -ENOENT; if (nilfs_direct_get_ptr(direct, key) != NILFS_BMAP_INVALID_PTR) return -EEXIST; - ret = nilfs_direct_prepare_insert(direct, key, &req, &stats); - if (ret < 0) - return ret; - nilfs_direct_commit_insert(direct, &req, key, ptr); - nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); + if (NILFS_BMAP_USE_VBN(bmap)) { + req.bpr_ptr = nilfs_direct_find_target_v(direct, key); + dat = nilfs_bmap_get_dat(bmap); + } + ret = nilfs_bmap_prepare_alloc_ptr(bmap, &req, dat); + if (!ret) { + /* ptr must be a pointer to a buffer head. */ + bh = (struct buffer_head *)((unsigned long)ptr); + set_buffer_nilfs_volatile(bh); - return 0; -} + nilfs_bmap_commit_alloc_ptr(bmap, &req, dat); + nilfs_direct_set_ptr(direct, key, req.bpr_ptr); -static int nilfs_direct_prepare_delete(struct nilfs_direct *direct, - union nilfs_bmap_ptr_req *req, - __u64 key, - struct nilfs_bmap_stats *stats) -{ - int ret; + if (!nilfs_bmap_dirty(bmap)) + nilfs_bmap_set_dirty(bmap); - req->bpr_ptr = nilfs_direct_get_ptr(direct, key); - ret = nilfs_bmap_prepare_end_ptr(&direct->d_bmap, req); - if (!ret) - stats->bs_nblocks = 1; - return ret; -} + if (NILFS_BMAP_USE_VBN(bmap)) + nilfs_direct_set_target_v(direct, key, req.bpr_ptr); -static void nilfs_direct_commit_delete(struct nilfs_direct *direct, - union nilfs_bmap_ptr_req *req, - __u64 key) -{ - nilfs_bmap_commit_end_ptr(&direct->d_bmap, req); - nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR); + nilfs_bmap_add_blocks(bmap, 1); + } + return ret; } static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key) { - struct nilfs_direct *direct; + struct nilfs_direct *direct = (struct nilfs_direct *)bmap; union nilfs_bmap_ptr_req req; - struct nilfs_bmap_stats stats; + struct inode *dat; int ret; - direct = (struct nilfs_direct *)bmap; - if ((key > NILFS_DIRECT_KEY_MAX) || + if (key > NILFS_DIRECT_KEY_MAX || nilfs_direct_get_ptr(direct, key) == NILFS_BMAP_INVALID_PTR) return -ENOENT; - ret = nilfs_direct_prepare_delete(direct, &req, key, &stats); - if (ret < 0) - return ret; - nilfs_direct_commit_delete(direct, &req, key); - nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks); + dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL; + req.bpr_ptr = nilfs_direct_get_ptr(direct, key); - return 0; + ret = nilfs_bmap_prepare_end_ptr(bmap, &req, dat); + if (!ret) { + nilfs_bmap_commit_end_ptr(bmap, &req, dat); + nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR); + nilfs_bmap_sub_blocks(bmap, 1); + } + return ret; } static int nilfs_direct_last_key(const struct nilfs_bmap *bmap, __u64 *keyp) @@ -310,59 +268,56 @@ int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap, return 0; } -static int nilfs_direct_propagate_v(struct nilfs_direct *direct, - struct buffer_head *bh) +static int nilfs_direct_propagate(const struct nilfs_bmap *bmap, + struct buffer_head *bh) { - union nilfs_bmap_ptr_req oldreq, newreq; + struct nilfs_direct *direct = (struct nilfs_direct *)bmap; + struct nilfs_palloc_req oldreq, newreq; + struct inode *dat; __u64 key; __u64 ptr; int ret; - key = nilfs_bmap_data_get_key(&direct->d_bmap, bh); + if (!NILFS_BMAP_USE_VBN(bmap)) + return 0; + + dat = nilfs_bmap_get_dat(bmap); + key = nilfs_bmap_data_get_key(bmap, bh); ptr = nilfs_direct_get_ptr(direct, key); if (!buffer_nilfs_volatile(bh)) { - oldreq.bpr_ptr = ptr; - newreq.bpr_ptr = ptr; - ret = nilfs_bmap_prepare_update_v(&direct->d_bmap, &oldreq, - &newreq); + oldreq.pr_entry_nr = ptr; + newreq.pr_entry_nr = ptr; + ret = nilfs_dat_prepare_update(dat, &oldreq, &newreq); if (ret < 0) return ret; - nilfs_bmap_commit_update_v(&direct->d_bmap, &oldreq, &newreq); + nilfs_dat_commit_update(dat, &oldreq, &newreq, + bmap->b_ptr_type == NILFS_BMAP_PTR_VS); set_buffer_nilfs_volatile(bh); - nilfs_direct_set_ptr(direct, key, newreq.bpr_ptr); + nilfs_direct_set_ptr(direct, key, newreq.pr_entry_nr); } else - ret = nilfs_bmap_mark_dirty(&direct->d_bmap, ptr); + ret = nilfs_dat_mark_dirty(dat, ptr); return ret; } -static int nilfs_direct_propagate(const struct nilfs_bmap *bmap, - struct buffer_head *bh) -{ - struct nilfs_direct *direct = (struct nilfs_direct *)bmap; - - return NILFS_BMAP_USE_VBN(bmap) ? - nilfs_direct_propagate_v(direct, bh) : 0; -} - static int nilfs_direct_assign_v(struct nilfs_direct *direct, __u64 key, __u64 ptr, struct buffer_head **bh, sector_t blocknr, union nilfs_binfo *binfo) { + struct inode *dat = nilfs_bmap_get_dat(&direct->d_bmap); union nilfs_bmap_ptr_req req; int ret; req.bpr_ptr = ptr; - ret = nilfs_bmap_start_v(&direct->d_bmap, &req, blocknr); - if (unlikely(ret < 0)) - return ret; - - binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); - binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); - - return 0; + ret = nilfs_dat_prepare_start(dat, &req.bpr_req); + if (!ret) { + nilfs_dat_commit_start(dat, &req.bpr_req, blocknr); + binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); + binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); + } + return ret; } static int nilfs_direct_assign_p(struct nilfs_direct *direct, diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h index 5d30a35679b5..ecc3ba76db47 100644 --- a/fs/nilfs2/ifile.h +++ b/fs/nilfs2/ifile.h @@ -31,7 +31,6 @@ #include "mdt.h" #include "alloc.h" -#define NILFS_IFILE_GFP NILFS_MDT_GFP static inline struct nilfs_inode * nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh) diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index fe9d8f2a13f8..807e584b163d 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -430,7 +430,8 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino, raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh); - if (nilfs_read_inode_common(inode, raw_inode)) + err = nilfs_read_inode_common(inode, raw_inode); + if (err) goto failed_unmap; if (S_ISREG(inode->i_mode)) { diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 6ea5f872e2de..6572ea4bc4df 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -442,12 +442,6 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs, const char *msg; int ret; - ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], kbufs[0]); - if (ret < 0) { - msg = "cannot read source blocks"; - goto failed; - } - ret = nilfs_ioctl_delete_checkpoints(nilfs, &argv[1], kbufs[1]); if (ret < 0) { /* @@ -548,7 +542,25 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, } } - ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); + /* + * nilfs_ioctl_move_blocks() will call nilfs_gc_iget(), + * which will operates an inode list without blocking. + * To protect the list from concurrent operations, + * nilfs_ioctl_move_blocks should be atomic operation. + */ + if (test_and_set_bit(THE_NILFS_GC_RUNNING, &nilfs->ns_flags)) { + ret = -EBUSY; + goto out_free; + } + + ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], kbufs[0]); + if (ret < 0) + printk(KERN_ERR "NILFS: GC failed during preparation: " + "cannot read source blocks: err=%d\n", ret); + else + ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); + + clear_nilfs_gc_running(nilfs); out_free: while (--n >= 0) diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 3d3ddb3f5177..156bf6091a96 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -103,15 +103,12 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block, goto failed_unlock; err = -EEXIST; - if (buffer_uptodate(bh) || buffer_mapped(bh)) + if (buffer_uptodate(bh)) goto failed_bh; -#if 0 - /* The uptodate flag is not protected by the page lock, but - the mapped flag is. Thus, we don't have to wait the buffer. */ + wait_on_buffer(bh); if (buffer_uptodate(bh)) goto failed_bh; -#endif bh->b_bdev = nilfs->ns_bdev; err = nilfs_mdt_insert_new_block(inode, block, bh, init_block); @@ -139,7 +136,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, int mode, struct buffer_head **out_bh) { struct buffer_head *bh; - unsigned long blknum = 0; + __u64 blknum = 0; int ret = -ENOMEM; bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0); @@ -162,17 +159,15 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, unlock_buffer(bh); goto out; } - if (!buffer_mapped(bh)) { /* unused buffer */ - ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff, - &blknum); - if (unlikely(ret)) { - unlock_buffer(bh); - goto failed_bh; - } - bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev; - bh->b_blocknr = blknum; - set_buffer_mapped(bh); + + ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff, &blknum); + if (unlikely(ret)) { + unlock_buffer(bh); + goto failed_bh; } + bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev; + bh->b_blocknr = (sector_t)blknum; + set_buffer_mapped(bh); bh->b_end_io = end_buffer_read_sync; get_bh(bh); @@ -402,6 +397,7 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) struct inode *inode = container_of(page->mapping, struct inode, i_data); struct super_block *sb = inode->i_sb; + struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs; struct nilfs_sb_info *writer = NULL; int err = 0; @@ -411,9 +407,12 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) if (page->mapping->assoc_mapping) return 0; /* Do not request flush for shadow page cache */ if (!sb) { - writer = nilfs_get_writer(NILFS_MDT(inode)->mi_nilfs); - if (!writer) + down_read(&nilfs->ns_writer_sem); + writer = nilfs->ns_writer; + if (!writer) { + up_read(&nilfs->ns_writer_sem); return -EROFS; + } sb = writer->s_super; } @@ -423,7 +422,7 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) nilfs_flush_segment(sb, inode->i_ino); if (writer) - nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs); + up_read(&nilfs->ns_writer_sem); return err; } @@ -514,9 +513,10 @@ nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb, } struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb, - ino_t ino, gfp_t gfp_mask) + ino_t ino) { - struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino, gfp_mask); + struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino, + NILFS_MDT_GFP); if (!inode) return NULL; diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h index df683e0bca6a..431599733c9b 100644 --- a/fs/nilfs2/mdt.h +++ b/fs/nilfs2/mdt.h @@ -74,8 +74,7 @@ int nilfs_mdt_forget_block(struct inode *, unsigned long); int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long); int nilfs_mdt_fetch_dirty(struct inode *); -struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t, - gfp_t); +struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t); struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *, ino_t, gfp_t); void nilfs_mdt_destroy(struct inode *); diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index d80cc71be749..6dc83591d118 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -552,7 +552,8 @@ static int recover_dsync_blocks(struct nilfs_sb_info *sbi, printk(KERN_WARNING "NILFS warning: error recovering data block " "(err=%d, ino=%lu, block-offset=%llu)\n", - err, rb->ino, (unsigned long long)rb->blkoff); + err, (unsigned long)rb->ino, + (unsigned long long)rb->blkoff); if (!err2) err2 = err; next: diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 9e3fe17bb96b..e6d9e37fa241 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -316,10 +316,10 @@ static struct bio *nilfs_alloc_seg_bio(struct super_block *sb, sector_t start, { struct bio *bio; - bio = bio_alloc(GFP_NOWAIT, nr_vecs); + bio = bio_alloc(GFP_NOIO, nr_vecs); if (bio == NULL) { while (!bio && (nr_vecs >>= 1)) - bio = bio_alloc(GFP_NOWAIT, nr_vecs); + bio = bio_alloc(GFP_NOIO, nr_vecs); } if (likely(bio)) { bio->bi_bdev = sb->s_bdev; diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 8b5e4778cf28..683df89dbae5 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1859,12 +1859,26 @@ static void nilfs_end_page_io(struct page *page, int err) if (!page) return; - if (buffer_nilfs_node(page_buffers(page)) && !PageWriteback(page)) + if (buffer_nilfs_node(page_buffers(page)) && !PageWriteback(page)) { /* * For b-tree node pages, this function may be called twice * or more because they might be split in a segment. */ + if (PageDirty(page)) { + /* + * For pages holding split b-tree node buffers, dirty + * flag on the buffers may be cleared discretely. + * In that case, the page is once redirtied for + * remaining buffers, and it must be cancelled if + * all the buffers get cleaned later. + */ + lock_page(page); + if (nilfs_page_buffers_clean(page)) + __nilfs_clear_page_dirty(page); + unlock_page(page); + } return; + } __nilfs_end_page_io(page, err); } @@ -2487,7 +2501,8 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) && nilfs_discontinued(nilfs)) { down_write(&nilfs->ns_sem); - req->sb_err = nilfs_commit_super(sbi, 0); + req->sb_err = nilfs_commit_super(sbi, + nilfs_altsb_need_update(nilfs)); up_write(&nilfs->ns_sem); } } @@ -2675,6 +2690,7 @@ static int nilfs_segctor_thread(void *arg) } else { DEFINE_WAIT(wait); int should_sleep = 1; + struct the_nilfs *nilfs; prepare_to_wait(&sci->sc_wait_daemon, &wait, TASK_INTERRUPTIBLE); @@ -2695,6 +2711,9 @@ static int nilfs_segctor_thread(void *arg) finish_wait(&sci->sc_wait_daemon, &wait); timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && time_after_eq(jiffies, sci->sc_timer->expires)); + nilfs = sci->sc_sbi->s_nilfs; + if (sci->sc_super->s_dirt && nilfs_sb_need_update(nilfs)) + set_nilfs_discontinued(nilfs); } goto loop; diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h index a2c4d76c3366..0e99e5c0bd0f 100644 --- a/fs/nilfs2/sufile.h +++ b/fs/nilfs2/sufile.h @@ -28,7 +28,6 @@ #include <linux/nilfs2_fs.h> #include "mdt.h" -#define NILFS_SUFILE_GFP NILFS_MDT_GFP static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile) { diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 8e2ec43b18f4..55f3d6b60732 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -50,6 +50,8 @@ #include <linux/writeback.h> #include <linux/kobject.h> #include <linux/exportfs.h> +#include <linux/seq_file.h> +#include <linux/mount.h> #include "nilfs.h" #include "mdt.h" #include "alloc.h" @@ -65,7 +67,6 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem " "(NILFS)"); MODULE_LICENSE("GPL"); -static void nilfs_write_super(struct super_block *sb); static int nilfs_remount(struct super_block *sb, int *flags, char *data); /** @@ -311,9 +312,6 @@ static void nilfs_put_super(struct super_block *sb) lock_kernel(); - if (sb->s_dirt) - nilfs_write_super(sb); - nilfs_detach_segment_constructor(sbi); if (!(sb->s_flags & MS_RDONLY)) { @@ -336,63 +334,21 @@ static void nilfs_put_super(struct super_block *sb) unlock_kernel(); } -/** - * nilfs_write_super - write super block(s) of NILFS - * @sb: super_block - * - * nilfs_write_super() gets a fs-dependent lock, writes super block(s), and - * clears s_dirt. This function is called in the section protected by - * lock_super(). - * - * The s_dirt flag is managed by each filesystem and we protect it by ns_sem - * of the struct the_nilfs. Lock order must be as follows: - * - * 1. lock_super() - * 2. down_write(&nilfs->ns_sem) - * - * Inside NILFS, locking ns_sem is enough to protect s_dirt and the buffer - * of the super block (nilfs->ns_sbp[]). - * - * In most cases, VFS functions call lock_super() before calling these - * methods. So we must be careful not to bring on deadlocks when using - * lock_super(); see generic_shutdown_super(), write_super(), and so on. - * - * Note that order of lock_kernel() and lock_super() depends on contexts - * of VFS. We should also note that lock_kernel() can be used in its - * protective section and only the outermost one has an effect. - */ -static void nilfs_write_super(struct super_block *sb) +static int nilfs_sync_fs(struct super_block *sb, int wait) { struct nilfs_sb_info *sbi = NILFS_SB(sb); struct the_nilfs *nilfs = sbi->s_nilfs; - - down_write(&nilfs->ns_sem); - if (!(sb->s_flags & MS_RDONLY)) { - struct nilfs_super_block **sbp = nilfs->ns_sbp; - u64 t = get_seconds(); - int dupsb; - - if (!nilfs_discontinued(nilfs) && t >= nilfs->ns_sbwtime[0] && - t < nilfs->ns_sbwtime[0] + NILFS_SB_FREQ) { - up_write(&nilfs->ns_sem); - return; - } - dupsb = sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ; - nilfs_commit_super(sbi, dupsb); - } - sb->s_dirt = 0; - up_write(&nilfs->ns_sem); -} - -static int nilfs_sync_fs(struct super_block *sb, int wait) -{ int err = 0; - nilfs_write_super(sb); - /* This function is called when super block should be written back */ if (wait) err = nilfs_construct_segment(sb); + + down_write(&nilfs->ns_sem); + if (sb->s_dirt) + nilfs_commit_super(sbi, 1); + up_write(&nilfs->ns_sem); + return err; } @@ -407,8 +363,7 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno) list_add(&sbi->s_list, &nilfs->ns_supers); up_write(&nilfs->ns_super_sem); - sbi->s_ifile = nilfs_mdt_new( - nilfs, sbi->s_super, NILFS_IFILE_INO, NILFS_IFILE_GFP); + sbi->s_ifile = nilfs_mdt_new(nilfs, sbi->s_super, NILFS_IFILE_INO); if (!sbi->s_ifile) return -ENOMEM; @@ -416,8 +371,10 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno) if (unlikely(err)) goto failed; + down_read(&nilfs->ns_segctor_sem); err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp, &bh_cp); + up_read(&nilfs->ns_segctor_sem); if (unlikely(err)) { if (err == -ENOENT || err == -EINVAL) { printk(KERN_ERR @@ -527,6 +484,26 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } +static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs) +{ + struct super_block *sb = vfs->mnt_sb; + struct nilfs_sb_info *sbi = NILFS_SB(sb); + + if (!nilfs_test_opt(sbi, BARRIER)) + seq_printf(seq, ",barrier=off"); + if (nilfs_test_opt(sbi, SNAPSHOT)) + seq_printf(seq, ",cp=%llu", + (unsigned long long int)sbi->s_snapshot_cno); + if (nilfs_test_opt(sbi, ERRORS_RO)) + seq_printf(seq, ",errors=remount-ro"); + if (nilfs_test_opt(sbi, ERRORS_PANIC)) + seq_printf(seq, ",errors=panic"); + if (nilfs_test_opt(sbi, STRICT_ORDER)) + seq_printf(seq, ",order=strict"); + + return 0; +} + static struct super_operations nilfs_sops = { .alloc_inode = nilfs_alloc_inode, .destroy_inode = nilfs_destroy_inode, @@ -536,7 +513,7 @@ static struct super_operations nilfs_sops = { /* .drop_inode = nilfs_drop_inode, */ .delete_inode = nilfs_delete_inode, .put_super = nilfs_put_super, - .write_super = nilfs_write_super, + /* .write_super = nilfs_write_super, */ .sync_fs = nilfs_sync_fs, /* .write_super_lockfs */ /* .unlockfs */ @@ -544,7 +521,7 @@ static struct super_operations nilfs_sops = { .remount_fs = nilfs_remount, .clear_inode = nilfs_clear_inode, /* .umount_begin */ - /* .show_options */ + .show_options = nilfs_show_options }; static struct inode * @@ -814,10 +791,15 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent, if (sb->s_flags & MS_RDONLY) { if (nilfs_test_opt(sbi, SNAPSHOT)) { + down_read(&nilfs->ns_segctor_sem); err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile, sbi->s_snapshot_cno); - if (err < 0) + up_read(&nilfs->ns_segctor_sem); + if (err < 0) { + if (err == -ENOENT) + err = -EINVAL; goto failed_sbi; + } if (!err) { printk(KERN_ERR "NILFS: The specified checkpoint is " @@ -1125,10 +1107,6 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, */ sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno); - if (!sd.cno) - /* trying to get the latest checkpoint. */ - sd.cno = nilfs_last_cno(nilfs); - /* * Get super block instance holding the nilfs_sb_info struct. * A new instance is allocated if no existing mount is present or diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 8b8889825716..d4168e269c5d 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -68,12 +68,11 @@ static struct the_nilfs *alloc_nilfs(struct block_device *bdev) nilfs->ns_bdev = bdev; atomic_set(&nilfs->ns_count, 1); - atomic_set(&nilfs->ns_writer_refcount, -1); atomic_set(&nilfs->ns_ndirtyblks, 0); init_rwsem(&nilfs->ns_sem); init_rwsem(&nilfs->ns_super_sem); mutex_init(&nilfs->ns_mount_mutex); - mutex_init(&nilfs->ns_writer_mutex); + init_rwsem(&nilfs->ns_writer_sem); INIT_LIST_HEAD(&nilfs->ns_list); INIT_LIST_HEAD(&nilfs->ns_supers); spin_lock_init(&nilfs->ns_last_segment_lock); @@ -188,23 +187,19 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs, inode_size = nilfs->ns_inode_size; err = -ENOMEM; - nilfs->ns_dat = nilfs_mdt_new( - nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP); + nilfs->ns_dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO); if (unlikely(!nilfs->ns_dat)) goto failed; - nilfs->ns_gc_dat = nilfs_mdt_new( - nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP); + nilfs->ns_gc_dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO); if (unlikely(!nilfs->ns_gc_dat)) goto failed_dat; - nilfs->ns_cpfile = nilfs_mdt_new( - nilfs, NULL, NILFS_CPFILE_INO, NILFS_CPFILE_GFP); + nilfs->ns_cpfile = nilfs_mdt_new(nilfs, NULL, NILFS_CPFILE_INO); if (unlikely(!nilfs->ns_cpfile)) goto failed_gc_dat; - nilfs->ns_sufile = nilfs_mdt_new( - nilfs, NULL, NILFS_SUFILE_INO, NILFS_SUFILE_GFP); + nilfs->ns_sufile = nilfs_mdt_new(nilfs, NULL, NILFS_SUFILE_INO); if (unlikely(!nilfs->ns_sufile)) goto failed_cpfile; diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index e8adbffc626f..20abd55881e0 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -37,6 +37,7 @@ enum { THE_NILFS_LOADED, /* Roll-back/roll-forward has done and the latest checkpoint was loaded */ THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */ + THE_NILFS_GC_RUNNING, /* gc process is running */ }; /** @@ -50,8 +51,7 @@ enum { * @ns_sem: semaphore for shared states * @ns_super_sem: semaphore for global operations across super block instances * @ns_mount_mutex: mutex protecting mount process of nilfs - * @ns_writer_mutex: mutex protecting ns_writer attach/detach - * @ns_writer_refcount: number of referrers on ns_writer + * @ns_writer_sem: semaphore protecting ns_writer attach/detach * @ns_current: back pointer to current mount * @ns_sbh: buffer heads of on-disk super blocks * @ns_sbp: pointers to super block data @@ -100,8 +100,7 @@ struct the_nilfs { struct rw_semaphore ns_sem; struct rw_semaphore ns_super_sem; struct mutex ns_mount_mutex; - struct mutex ns_writer_mutex; - atomic_t ns_writer_refcount; + struct rw_semaphore ns_writer_sem; /* * components protected by ns_super_sem @@ -197,11 +196,26 @@ static inline int nilfs_##name(struct the_nilfs *nilfs) \ THE_NILFS_FNS(INIT, init) THE_NILFS_FNS(LOADED, loaded) THE_NILFS_FNS(DISCONTINUED, discontinued) +THE_NILFS_FNS(GC_RUNNING, gc_running) /* Minimum interval of periodical update of superblocks (in seconds) */ #define NILFS_SB_FREQ 10 #define NILFS_ALTSB_FREQ 60 /* spare superblock */ +static inline int nilfs_sb_need_update(struct the_nilfs *nilfs) +{ + u64 t = get_seconds(); + return t < nilfs->ns_sbwtime[0] || + t > nilfs->ns_sbwtime[0] + NILFS_SB_FREQ; +} + +static inline int nilfs_altsb_need_update(struct the_nilfs *nilfs) +{ + u64 t = get_seconds(); + struct nilfs_super_block **sbp = nilfs->ns_sbp; + return sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ; +} + void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); struct the_nilfs *find_or_create_nilfs(struct block_device *); void put_nilfs(struct the_nilfs *); @@ -221,39 +235,26 @@ static inline void get_nilfs(struct the_nilfs *nilfs) atomic_inc(&nilfs->ns_count); } -static inline struct nilfs_sb_info *nilfs_get_writer(struct the_nilfs *nilfs) -{ - if (atomic_inc_and_test(&nilfs->ns_writer_refcount)) - mutex_lock(&nilfs->ns_writer_mutex); - return nilfs->ns_writer; -} - -static inline void nilfs_put_writer(struct the_nilfs *nilfs) -{ - if (atomic_add_negative(-1, &nilfs->ns_writer_refcount)) - mutex_unlock(&nilfs->ns_writer_mutex); -} - static inline void nilfs_attach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) { - mutex_lock(&nilfs->ns_writer_mutex); + down_write(&nilfs->ns_writer_sem); nilfs->ns_writer = sbi; - mutex_unlock(&nilfs->ns_writer_mutex); + up_write(&nilfs->ns_writer_sem); } static inline void nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) { - mutex_lock(&nilfs->ns_writer_mutex); + down_write(&nilfs->ns_writer_sem); if (sbi == nilfs->ns_writer) nilfs->ns_writer = NULL; - mutex_unlock(&nilfs->ns_writer_mutex); + up_write(&nilfs->ns_writer_sem); } static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi) { - if (!atomic_dec_and_test(&sbi->s_count)) + if (atomic_dec_and_test(&sbi->s_count)) kfree(sbi); } diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig index 31dac7e3b0f1..dffbb0911d02 100644 --- a/fs/notify/Kconfig +++ b/fs/notify/Kconfig @@ -1,15 +1,5 @@ config FSNOTIFY - bool "Filesystem notification backend" - default y - ---help--- - fsnotify is a backend for filesystem notification. fsnotify does - not provide any userspace interface but does provide the basis - needed for other notification schemes such as dnotify, inotify, - and fanotify. - - Say Y here to enable fsnotify suport. - - If unsure, say Y. + def_bool n source "fs/notify/dnotify/Kconfig" source "fs/notify/inotify/Kconfig" diff --git a/fs/notify/dnotify/Kconfig b/fs/notify/dnotify/Kconfig index 904ff8d5405a..f9c1ca139d8f 100644 --- a/fs/notify/dnotify/Kconfig +++ b/fs/notify/dnotify/Kconfig @@ -1,6 +1,6 @@ config DNOTIFY bool "Dnotify support" - depends on FSNOTIFY + select FSNOTIFY default y help Dnotify is a directory-based per-fd file change notification system diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index ec2f7bd76818..037e878e03fc 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -159,7 +159,9 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const if (!group->ops->should_send_event(group, to_tell, mask)) continue; if (!event) { - event = fsnotify_create_event(to_tell, mask, data, data_is, file_name, cookie); + event = fsnotify_create_event(to_tell, mask, data, + data_is, file_name, cookie, + GFP_KERNEL); /* shit, we OOM'd and now we can't tell, maybe * someday someone else will want to do something * here */ diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig index 5356884289a1..3e56dbffe729 100644 --- a/fs/notify/inotify/Kconfig +++ b/fs/notify/inotify/Kconfig @@ -15,7 +15,7 @@ config INOTIFY config INOTIFY_USER bool "Inotify support for userspace" - depends on FSNOTIFY + select FSNOTIFY default y ---help--- Say Y here to enable inotify support for userspace, including the diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 47cd258fd24d..c9ee67b442e1 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -62,13 +62,14 @@ static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_ev event_priv->wd = wd; ret = fsnotify_add_notify_event(group, event, fsn_event_priv); - /* EEXIST is not an error */ - if (ret == -EEXIST) - ret = 0; - - /* did event_priv get attached? */ - if (list_empty(&fsn_event_priv->event_list)) + if (ret) { inotify_free_event_priv(fsn_event_priv); + /* EEXIST says we tail matched, EOVERFLOW isn't something + * to report up the stack. */ + if ((ret == -EEXIST) || + (ret == -EOVERFLOW)) + ret = 0; + } /* * If we hold the entry until after the event is on the queue @@ -104,16 +105,45 @@ static bool inotify_should_send_event(struct fsnotify_group *group, struct inode return send; } +/* + * This is NEVER supposed to be called. Inotify marks should either have been + * removed from the idr when the watch was removed or in the + * fsnotify_destroy_mark_by_group() call when the inotify instance was being + * torn down. This is only called if the idr is about to be freed but there + * are still marks in it. + */ static int idr_callback(int id, void *p, void *data) { - BUG(); + struct fsnotify_mark_entry *entry; + struct inotify_inode_mark_entry *ientry; + static bool warned = false; + + if (warned) + return 0; + + warned = false; + entry = p; + ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); + + WARN(1, "inotify closing but id=%d for entry=%p in group=%p still in " + "idr. Probably leaking memory\n", id, p, data); + + /* + * I'm taking the liberty of assuming that the mark in question is a + * valid address and I'm dereferencing it. This might help to figure + * out why we got here and the panic is no worse than the original + * BUG() that was here. + */ + if (entry) + printk(KERN_WARNING "entry->group=%p inode=%p wd=%d\n", + entry->group, entry->inode, ientry->wd); return 0; } static void inotify_free_group_priv(struct fsnotify_group *group) { /* ideally the idr is empty and we won't hit the BUG in teh callback */ - idr_for_each(&group->inotify_data.idr, idr_callback, NULL); + idr_for_each(&group->inotify_data.idr, idr_callback, group); idr_remove_all(&group->inotify_data.idr); idr_destroy(&group->inotify_data.idr); } diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index ff27a2965844..dcd2040d330c 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -47,9 +47,6 @@ static struct vfsmount *inotify_mnt __read_mostly; -/* this just sits here and wastes global memory. used to just pad userspace messages with zeros */ -static struct inotify_event nul_inotify_event; - /* these are configurable via /proc/sys/fs/inotify/ */ static int inotify_max_user_instances __read_mostly; static int inotify_max_queued_events __read_mostly; @@ -57,7 +54,6 @@ int inotify_max_user_watches __read_mostly; static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; struct kmem_cache *event_priv_cachep __read_mostly; -static struct fsnotify_event *inotify_ignored_event; /* * When inotify registers a new group it increments this and uses that @@ -158,7 +154,8 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, event = fsnotify_peek_notify_event(group); - event_size += roundup(event->name_len, event_size); + if (event->name_len) + event_size += roundup(event->name_len + 1, event_size); if (event_size > count) return ERR_PTR(-EINVAL); @@ -184,7 +181,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, struct fsnotify_event_private_data *fsn_priv; struct inotify_event_private_data *priv; size_t event_size = sizeof(struct inotify_event); - size_t name_len; + size_t name_len = 0; /* we get the inotify watch descriptor from the event private data */ spin_lock(&event->lock); @@ -200,8 +197,12 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, inotify_free_event_priv(fsn_priv); } - /* round up event->name_len so it is a multiple of event_size */ - name_len = roundup(event->name_len, event_size); + /* + * round up event->name_len so it is a multiple of event_size + * plus an extra byte for the terminating '\0'. + */ + if (event->name_len) + name_len = roundup(event->name_len + 1, event_size); inotify_event.len = name_len; inotify_event.mask = inotify_mask_to_arg(event->mask); @@ -225,8 +226,8 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, return -EFAULT; buf += event->name_len; - /* fill userspace with 0's from nul_inotify_event */ - if (copy_to_user(buf, &nul_inotify_event, len_to_zero)) + /* fill userspace with 0's */ + if (clear_user(buf, len_to_zero)) return -EFAULT; buf += len_to_zero; event_size += name_len; @@ -327,8 +328,9 @@ static long inotify_ioctl(struct file *file, unsigned int cmd, list_for_each_entry(holder, &group->notification_list, event_list) { event = holder->event; send_len += sizeof(struct inotify_event); - send_len += roundup(event->name_len, - sizeof(struct inotify_event)); + if (event->name_len) + send_len += roundup(event->name_len + 1, + sizeof(struct inotify_event)); } mutex_unlock(&group->notification_mutex); ret = put_user(send_len, (int __user *) p); @@ -366,20 +368,71 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns } /* - * Send IN_IGNORED for this wd, remove this wd from the idr, and drop the - * internal reference help on the mark because it is in the idr. + * Remove the mark from the idr (if present) and drop the reference + * on the mark because it was in the idr. + */ +static void inotify_remove_from_idr(struct fsnotify_group *group, + struct inotify_inode_mark_entry *ientry) +{ + struct idr *idr; + struct fsnotify_mark_entry *entry; + struct inotify_inode_mark_entry *found_ientry; + int wd; + + spin_lock(&group->inotify_data.idr_lock); + idr = &group->inotify_data.idr; + wd = ientry->wd; + + if (wd == -1) + goto out; + + entry = idr_find(&group->inotify_data.idr, wd); + if (unlikely(!entry)) + goto out; + + found_ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); + if (unlikely(found_ientry != ientry)) { + /* We found an entry in the idr with the right wd, but it's + * not the entry we were told to remove. eparis seriously + * fucked up somewhere. */ + WARN_ON(1); + ientry->wd = -1; + goto out; + } + + /* One ref for being in the idr, one ref held by the caller */ + BUG_ON(atomic_read(&entry->refcnt) < 2); + + idr_remove(idr, wd); + ientry->wd = -1; + + /* removed from the idr, drop that ref */ + fsnotify_put_mark(entry); +out: + spin_unlock(&group->inotify_data.idr_lock); +} + +/* + * Send IN_IGNORED for this wd, remove this wd from the idr. */ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, struct fsnotify_group *group) { struct inotify_inode_mark_entry *ientry; + struct fsnotify_event *ignored_event; struct inotify_event_private_data *event_priv; struct fsnotify_event_private_data *fsn_event_priv; - struct idr *idr; + int ret; + + ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, + FSNOTIFY_EVENT_NONE, NULL, 0, + GFP_NOFS); + if (!ignored_event) + return; ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); - event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); + event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS); if (unlikely(!event_priv)) goto skip_send_ignore; @@ -388,22 +441,19 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, fsn_event_priv->group = group; event_priv->wd = ientry->wd; - fsnotify_add_notify_event(group, inotify_ignored_event, fsn_event_priv); - - /* did the private data get added? */ - if (list_empty(&fsn_event_priv->event_list)) + ret = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv); + if (ret) inotify_free_event_priv(fsn_event_priv); skip_send_ignore: + /* matches the reference taken when the event was created */ + fsnotify_put_event(ignored_event); + /* remove this entry from the idr */ - spin_lock(&group->inotify_data.idr_lock); - idr = &group->inotify_data.idr; - idr_remove(idr, ientry->wd); - spin_unlock(&group->inotify_data.idr_lock); + inotify_remove_from_idr(group, ientry); - /* removed from idr, drop that reference */ - fsnotify_put_mark(entry); + atomic_dec(&group->inotify_data.user->inotify_watches); } /* ding dong the mark is dead */ @@ -414,67 +464,29 @@ static void inotify_free_mark(struct fsnotify_mark_entry *entry) kmem_cache_free(inotify_inode_mark_cachep, ientry); } -static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg) +static int inotify_update_existing_watch(struct fsnotify_group *group, + struct inode *inode, + u32 arg) { - struct fsnotify_mark_entry *entry = NULL; + struct fsnotify_mark_entry *entry; struct inotify_inode_mark_entry *ientry; - int ret = 0; - int add = (arg & IN_MASK_ADD); - __u32 mask; __u32 old_mask, new_mask; + __u32 mask; + int add = (arg & IN_MASK_ADD); + int ret; /* don't allow invalid bits: we don't want flags set */ mask = inotify_arg_to_mask(arg); if (unlikely(!mask)) return -EINVAL; - ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); - if (unlikely(!ientry)) - return -ENOMEM; - /* we set the mask at the end after attaching it */ - fsnotify_init_mark(&ientry->fsn_entry, inotify_free_mark); - ientry->wd = 0; - -find_entry: spin_lock(&inode->i_lock); entry = fsnotify_find_mark_entry(group, inode); spin_unlock(&inode->i_lock); - if (entry) { - kmem_cache_free(inotify_inode_mark_cachep, ientry); - ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); - } else { - if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) { - ret = -ENOSPC; - goto out_err; - } - - ret = fsnotify_add_mark(&ientry->fsn_entry, group, inode); - if (ret == -EEXIST) - goto find_entry; - else if (ret) - goto out_err; + if (!entry) + return -ENOENT; - entry = &ientry->fsn_entry; -retry: - ret = -ENOMEM; - if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL))) - goto out_err; - - spin_lock(&group->inotify_data.idr_lock); - /* if entry is added to the idr we keep the reference obtained - * through fsnotify_mark_add. remember to drop this reference - * when entry is removed from idr */ - ret = idr_get_new_above(&group->inotify_data.idr, entry, - ++group->inotify_data.last_wd, - &ientry->wd); - spin_unlock(&group->inotify_data.idr_lock); - if (ret) { - if (ret == -EAGAIN) - goto retry; - goto out_err; - } - atomic_inc(&group->inotify_data.user->inotify_watches); - } + ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); spin_lock(&entry->lock); @@ -506,14 +518,108 @@ retry: fsnotify_recalc_group_mask(group); } - return ientry->wd; + /* return the wd */ + ret = ientry->wd; -out_err: - /* see this isn't supposed to happen, just kill the watch */ - if (entry) { - fsnotify_destroy_mark_by_entry(entry); - fsnotify_put_mark(entry); + /* match the get from fsnotify_find_mark_entry() */ + fsnotify_put_mark(entry); + + return ret; +} + +static int inotify_new_watch(struct fsnotify_group *group, + struct inode *inode, + u32 arg) +{ + struct inotify_inode_mark_entry *tmp_ientry; + __u32 mask; + int ret; + + /* don't allow invalid bits: we don't want flags set */ + mask = inotify_arg_to_mask(arg); + if (unlikely(!mask)) + return -EINVAL; + + tmp_ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); + if (unlikely(!tmp_ientry)) + return -ENOMEM; + + fsnotify_init_mark(&tmp_ientry->fsn_entry, inotify_free_mark); + tmp_ientry->fsn_entry.mask = mask; + tmp_ientry->wd = -1; + + ret = -ENOSPC; + if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) + goto out_err; +retry: + ret = -ENOMEM; + if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL))) + goto out_err; + + spin_lock(&group->inotify_data.idr_lock); + ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry, + group->inotify_data.last_wd, + &tmp_ientry->wd); + spin_unlock(&group->inotify_data.idr_lock); + if (ret) { + /* idr was out of memory allocate and try again */ + if (ret == -EAGAIN) + goto retry; + goto out_err; + } + + /* we put the mark on the idr, take a reference */ + fsnotify_get_mark(&tmp_ientry->fsn_entry); + + /* we are on the idr, now get on the inode */ + ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode); + if (ret) { + /* we failed to get on the inode, get off the idr */ + inotify_remove_from_idr(group, tmp_ientry); + goto out_err; } + + /* update the idr hint, who cares about races, it's just a hint */ + group->inotify_data.last_wd = tmp_ientry->wd; + + /* increment the number of watches the user has */ + atomic_inc(&group->inotify_data.user->inotify_watches); + + /* return the watch descriptor for this new entry */ + ret = tmp_ientry->wd; + + /* match the ref from fsnotify_init_markentry() */ + fsnotify_put_mark(&tmp_ientry->fsn_entry); + + /* if this mark added a new event update the group mask */ + if (mask & ~group->mask) + fsnotify_recalc_group_mask(group); + +out_err: + if (ret < 0) + kmem_cache_free(inotify_inode_mark_cachep, tmp_ientry); + + return ret; +} + +static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg) +{ + int ret = 0; + +retry: + /* try to update and existing watch with the new arg */ + ret = inotify_update_existing_watch(group, inode, arg); + /* no mark present, try to add a new one */ + if (ret == -ENOENT) + ret = inotify_new_watch(group, inode, arg); + /* + * inotify_new_watch could race with another thread which did an + * inotify_new_watch between the update_existing and the add watch + * here, go back and try to update an existing mark again. + */ + if (ret == -EEXIST) + goto retry; + return ret; } @@ -532,7 +638,7 @@ static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsign spin_lock_init(&group->inotify_data.idr_lock); idr_init(&group->inotify_data.idr); - group->inotify_data.last_wd = 0; + group->inotify_data.last_wd = 1; group->inotify_data.user = user; group->inotify_data.fa = NULL; @@ -721,9 +827,6 @@ static int __init inotify_user_setup(void) inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC); event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC); - inotify_ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, FSNOTIFY_EVENT_NONE, NULL, 0); - if (!inotify_ignored_event) - panic("unable to allocate the inotify ignored event\n"); inotify_max_queued_events = 16384; inotify_max_user_instances = 128; diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 959b73e756fd..3816d5750dd5 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -136,18 +136,28 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new { if ((old->mask == new->mask) && (old->to_tell == new->to_tell) && - (old->data_type == new->data_type)) { + (old->data_type == new->data_type) && + (old->name_len == new->name_len)) { switch (old->data_type) { case (FSNOTIFY_EVENT_INODE): - if (old->inode == new->inode) + /* remember, after old was put on the wait_q we aren't + * allowed to look at the inode any more, only thing + * left to check was if the file_name is the same */ + if (old->name_len && + !strcmp(old->file_name, new->file_name)) return true; break; case (FSNOTIFY_EVENT_PATH): if ((old->path.mnt == new->path.mnt) && (old->path.dentry == new->path.dentry)) return true; + break; case (FSNOTIFY_EVENT_NONE): - return true; + if (old->mask & FS_Q_OVERFLOW) + return true; + else if (old->mask & FS_IN_IGNORED) + return false; + return false; }; } return false; @@ -165,9 +175,7 @@ int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_even struct list_head *list = &group->notification_list; struct fsnotify_event_holder *last_holder; struct fsnotify_event *last_event; - - /* easy to tell if priv was attached to the event */ - INIT_LIST_HEAD(&priv->event_list); + int ret = 0; /* * There is one fsnotify_event_holder embedded inside each fsnotify_event. @@ -188,6 +196,7 @@ alloc_holder: if (group->q_len >= group->max_events) { event = &q_overflow_event; + ret = -EOVERFLOW; /* sorry, no private data on the overflow event */ priv = NULL; } @@ -229,7 +238,7 @@ alloc_holder: mutex_unlock(&group->notification_mutex); wake_up(&group->notification_waitq); - return 0; + return ret; } /* @@ -339,18 +348,19 @@ static void initialize_event(struct fsnotify_event *event) * @name the filename, if available */ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data, - int data_type, const char *name, u32 cookie) + int data_type, const char *name, u32 cookie, + gfp_t gfp) { struct fsnotify_event *event; - event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL); + event = kmem_cache_alloc(fsnotify_event_cachep, gfp); if (!event) return NULL; initialize_event(event); if (name) { - event->file_name = kstrdup(name, GFP_KERNEL); + event->file_name = kstrdup(name, gfp); if (!event->file_name) { kmem_cache_free(fsnotify_event_cachep, event); return NULL; diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 3140a4429af1..4350d4993b18 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -2076,14 +2076,6 @@ err_out: *ppos = pos; if (cached_page) page_cache_release(cached_page); - /* For now, when the user asks for O_SYNC, we actually give O_DSYNC. */ - if (likely(!status)) { - if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(vi))) { - if (!mapping->a_ops->writepage || !is_sync_kiocb(iocb)) - status = generic_osync_inode(vi, mapping, - OSYNC_METADATA|OSYNC_DATA); - } - } pagevec_lru_add_file(&lru_pvec); ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", written ? "written" : "status", (unsigned long)written, @@ -2145,8 +2137,8 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, mutex_lock(&inode->i_mutex); ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex); - if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - int err = sync_page_range(inode, mapping, pos, ret); + if (ret > 0) { + int err = generic_write_sync(file, pos, ret); if (err < 0) ret = err; } @@ -2173,8 +2165,8 @@ static ssize_t ntfs_file_writev(struct file *file, const struct iovec *iov, if (ret == -EIOCBQUEUED) ret = wait_on_sync_kiocb(&kiocb); mutex_unlock(&inode->i_mutex); - if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - int err = sync_page_range(inode, mapping, *ppos - ret, ret); + if (ret > 0) { + int err = generic_write_sync(file, *ppos - ret, ret); if (err < 0) ret = err; } diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 23bf68453d7d..1caa0ef0b2bb 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -384,13 +384,12 @@ unm_err_out: * it is dirty in the inode meta data rather than the data page cache of the * inode, and thus there are no data pages that need writing out. Therefore, a * full mark_inode_dirty() is overkill. A mark_inode_dirty_sync(), on the - * other hand, is not sufficient, because I_DIRTY_DATASYNC needs to be set to - * ensure ->write_inode is called from generic_osync_inode() and this needs to - * happen or the file data would not necessarily hit the device synchronously, - * even though the vfs inode has the O_SYNC flag set. Also, I_DIRTY_DATASYNC - * simply "feels" better than just I_DIRTY_SYNC, since the file data has not - * actually hit the block device yet, which is not what I_DIRTY_SYNC on its own - * would suggest. + * other hand, is not sufficient, because ->write_inode needs to be called even + * in case of fdatasync. This needs to happen or the file data would not + * necessarily hit the device synchronously, even though the vfs inode has the + * O_SYNC flag set. Also, I_DIRTY_DATASYNC simply "feels" better than just + * I_DIRTY_SYNC, since the file data has not actually hit the block device yet, + * which is not what I_DIRTY_SYNC on its own would suggest. */ void __mark_mft_record_dirty(ntfs_inode *ni) { diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 9edcde4974aa..ab513ddaeff2 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -1914,7 +1914,8 @@ static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec, * immediately to their right. */ left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos); - if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) { + if (!ocfs2_rec_clusters(right_child_el, &right_child_el->l_recs[0])) { + BUG_ON(right_child_el->l_tree_depth); BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1); left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos); } @@ -2476,15 +2477,37 @@ out_ret_path: return ret; } -static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle, - struct ocfs2_path *path) +static int ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle, + int subtree_index, struct ocfs2_path *path) { - int i, idx; + int i, idx, ret; struct ocfs2_extent_rec *rec; struct ocfs2_extent_list *el; struct ocfs2_extent_block *eb; u32 range; + /* + * In normal tree rotation process, we will never touch the + * tree branch above subtree_index and ocfs2_extend_rotate_transaction + * doesn't reserve the credits for them either. + * + * But we do have a special case here which will update the rightmost + * records for all the bh in the path. + * So we have to allocate extra credits and access them. + */ + ret = ocfs2_extend_trans(handle, + handle->h_buffer_credits + subtree_index); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_path(inode, handle, path); + if (ret) { + mlog_errno(ret); + goto out; + } + /* Path should always be rightmost. */ eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; BUG_ON(eb->h_next_leaf_blk != 0ULL); @@ -2505,6 +2528,8 @@ static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle, ocfs2_journal_dirty(handle, path->p_node[i].bh); } +out: + return ret; } static void ocfs2_unlink_path(struct inode *inode, handle_t *handle, @@ -2717,7 +2742,12 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, if (del_right_subtree) { ocfs2_unlink_subtree(inode, handle, left_path, right_path, subtree_index, dealloc); - ocfs2_update_edge_lengths(inode, handle, left_path); + ret = ocfs2_update_edge_lengths(inode, handle, subtree_index, + left_path); + if (ret) { + mlog_errno(ret); + goto out; + } eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); @@ -3034,7 +3064,12 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, ocfs2_unlink_subtree(inode, handle, left_path, path, subtree_index, dealloc); - ocfs2_update_edge_lengths(inode, handle, left_path); + ret = ocfs2_update_edge_lengths(inode, handle, subtree_index, + left_path); + if (ret) { + mlog_errno(ret); + goto out; + } eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); @@ -6816,7 +6851,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, } status = 0; bail: - + brelse(last_eb_bh); mlog_exit(status); return status; } diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index b2c52b3a1484..8a1e61545f41 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -193,6 +193,7 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, (unsigned long long)OCFS2_I(inode)->ip_blkno); mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters); dump_stack(); + goto bail; } past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); @@ -894,18 +895,17 @@ struct ocfs2_write_cluster_desc { */ unsigned c_new; unsigned c_unwritten; + unsigned c_needs_zero; }; -static inline int ocfs2_should_zero_cluster(struct ocfs2_write_cluster_desc *d) -{ - return d->c_new || d->c_unwritten; -} - struct ocfs2_write_ctxt { /* Logical cluster position / len of write */ u32 w_cpos; u32 w_clen; + /* First cluster allocated in a nonsparse extend */ + u32 w_first_new_cpos; + struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE]; /* @@ -983,6 +983,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, return -ENOMEM; wc->w_cpos = pos >> osb->s_clustersize_bits; + wc->w_first_new_cpos = UINT_MAX; cend = (pos + len - 1) >> osb->s_clustersize_bits; wc->w_clen = cend - wc->w_cpos + 1; get_bh(di_bh); @@ -1217,20 +1218,18 @@ out: */ static int ocfs2_write_cluster(struct address_space *mapping, u32 phys, unsigned int unwritten, + unsigned int should_zero, struct ocfs2_alloc_context *data_ac, struct ocfs2_alloc_context *meta_ac, struct ocfs2_write_ctxt *wc, u32 cpos, loff_t user_pos, unsigned user_len) { - int ret, i, new, should_zero = 0; + int ret, i, new; u64 v_blkno, p_blkno; struct inode *inode = mapping->host; struct ocfs2_extent_tree et; new = phys == 0 ? 1 : 0; - if (new || unwritten) - should_zero = 1; - if (new) { u32 tmp_pos; @@ -1301,7 +1300,7 @@ static int ocfs2_write_cluster(struct address_space *mapping, if (tmpret) { mlog_errno(tmpret); if (ret == 0) - tmpret = ret; + ret = tmpret; } } @@ -1341,7 +1340,9 @@ static int ocfs2_write_cluster_by_desc(struct address_space *mapping, local_len = osb->s_clustersize - cluster_off; ret = ocfs2_write_cluster(mapping, desc->c_phys, - desc->c_unwritten, data_ac, meta_ac, + desc->c_unwritten, + desc->c_needs_zero, + data_ac, meta_ac, wc, desc->c_cpos, pos, local_len); if (ret) { mlog_errno(ret); @@ -1391,14 +1392,14 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, * newly allocated cluster. */ desc = &wc->w_desc[0]; - if (ocfs2_should_zero_cluster(desc)) + if (desc->c_needs_zero) ocfs2_figure_cluster_boundaries(osb, desc->c_cpos, &wc->w_target_from, NULL); desc = &wc->w_desc[wc->w_clen - 1]; - if (ocfs2_should_zero_cluster(desc)) + if (desc->c_needs_zero) ocfs2_figure_cluster_boundaries(osb, desc->c_cpos, NULL, @@ -1466,13 +1467,28 @@ static int ocfs2_populate_write_desc(struct inode *inode, phys++; } + /* + * If w_first_new_cpos is < UINT_MAX, we have a non-sparse + * file that got extended. w_first_new_cpos tells us + * where the newly allocated clusters are so we can + * zero them. + */ + if (desc->c_cpos >= wc->w_first_new_cpos) { + BUG_ON(phys == 0); + desc->c_needs_zero = 1; + } + desc->c_phys = phys; if (phys == 0) { desc->c_new = 1; + desc->c_needs_zero = 1; *clusters_to_alloc = *clusters_to_alloc + 1; } - if (ext_flags & OCFS2_EXT_UNWRITTEN) + + if (ext_flags & OCFS2_EXT_UNWRITTEN) { desc->c_unwritten = 1; + desc->c_needs_zero = 1; + } num_clusters--; } @@ -1632,10 +1648,13 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos, if (newsize <= i_size_read(inode)) return 0; - ret = ocfs2_extend_no_holes(inode, newsize, newsize - len); + ret = ocfs2_extend_no_holes(inode, newsize, pos); if (ret) mlog_errno(ret); + wc->w_first_new_cpos = + ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)); + return ret; } @@ -1644,7 +1663,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, struct page **pagep, void **fsdata, struct buffer_head *di_bh, struct page *mmap_page) { - int ret, credits = OCFS2_INODE_UPDATE_CREDITS; + int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS; unsigned int clusters_to_alloc, extents_to_split; struct ocfs2_write_ctxt *wc; struct inode *inode = mapping->host; @@ -1722,8 +1741,19 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, } - ocfs2_set_target_boundaries(osb, wc, pos, len, - clusters_to_alloc + extents_to_split); + /* + * We have to zero sparse allocated clusters, unwritten extent clusters, + * and non-sparse clusters we just extended. For non-sparse writes, + * we know zeros will only be needed in the first and/or last cluster. + */ + if (clusters_to_alloc || extents_to_split || + (wc->w_clen && (wc->w_desc[0].c_needs_zero || + wc->w_desc[wc->w_clen - 1].c_needs_zero))) + cluster_of_pages = 1; + else + cluster_of_pages = 0; + + ocfs2_set_target_boundaries(osb, wc, pos, len, cluster_of_pages); handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { @@ -1756,8 +1786,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, * extent. */ ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, - clusters_to_alloc + extents_to_split, - mmap_page); + cluster_of_pages, mmap_page); if (ret) { mlog_errno(ret); goto out_quota; diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index b574431a031d..b4957c7d9fe2 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -85,6 +85,17 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, goto bail; } + /* + * If the last lookup failed to create dentry lock, let us + * redo it. + */ + if (!dentry->d_fsdata) { + mlog(0, "Inode %llu doesn't have dentry lock, " + "returning false\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + goto bail; + } + ret = 1; bail: @@ -310,22 +321,19 @@ out_attach: return ret; } -static DEFINE_SPINLOCK(dentry_list_lock); +DEFINE_SPINLOCK(dentry_list_lock); /* We limit the number of dentry locks to drop in one go. We have * this limit so that we don't starve other users of ocfs2_wq. */ #define DL_INODE_DROP_COUNT 64 /* Drop inode references from dentry locks */ -void ocfs2_drop_dl_inodes(struct work_struct *work) +static void __ocfs2_drop_dl_inodes(struct ocfs2_super *osb, int drop_count) { - struct ocfs2_super *osb = container_of(work, struct ocfs2_super, - dentry_lock_work); struct ocfs2_dentry_lock *dl; - int drop_count = DL_INODE_DROP_COUNT; spin_lock(&dentry_list_lock); - while (osb->dentry_lock_list && drop_count--) { + while (osb->dentry_lock_list && (drop_count < 0 || drop_count--)) { dl = osb->dentry_lock_list; osb->dentry_lock_list = dl->dl_next; spin_unlock(&dentry_list_lock); @@ -333,11 +341,32 @@ void ocfs2_drop_dl_inodes(struct work_struct *work) kfree(dl); spin_lock(&dentry_list_lock); } - if (osb->dentry_lock_list) + spin_unlock(&dentry_list_lock); +} + +void ocfs2_drop_dl_inodes(struct work_struct *work) +{ + struct ocfs2_super *osb = container_of(work, struct ocfs2_super, + dentry_lock_work); + + __ocfs2_drop_dl_inodes(osb, DL_INODE_DROP_COUNT); + /* + * Don't queue dropping if umount is in progress. We flush the + * list in ocfs2_dismount_volume + */ + spin_lock(&dentry_list_lock); + if (osb->dentry_lock_list && + !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED)) queue_work(ocfs2_wq, &osb->dentry_lock_work); spin_unlock(&dentry_list_lock); } +/* Flush the whole work queue */ +void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb) +{ + __ocfs2_drop_dl_inodes(osb, -1); +} + /* * ocfs2_dentry_iput() and friends. * @@ -368,7 +397,8 @@ static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb, /* We leave dropping of inode reference to ocfs2_wq as that can * possibly lead to inode deletion which gets tricky */ spin_lock(&dentry_list_lock); - if (!osb->dentry_lock_list) + if (!osb->dentry_lock_list && + !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED)) queue_work(ocfs2_wq, &osb->dentry_lock_work); dl->dl_next = osb->dentry_lock_list; osb->dentry_lock_list = dl; diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h index faa12e75f98d..f5dd1789acf1 100644 --- a/fs/ocfs2/dcache.h +++ b/fs/ocfs2/dcache.h @@ -49,10 +49,13 @@ struct ocfs2_dentry_lock { int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode, u64 parent_blkno); +extern spinlock_t dentry_list_lock; + void ocfs2_dentry_lock_put(struct ocfs2_super *osb, struct ocfs2_dentry_lock *dl); void ocfs2_drop_dl_inodes(struct work_struct *work); +void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb); struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno, int skip_unhashed); diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index d07ddbe4b283..81eff8e58322 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -103,7 +103,6 @@ static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) lock->ast_pending, lock->ml.type); BUG(); } - BUG_ON(!list_empty(&lock->ast_list)); if (lock->ast_pending) mlog(0, "lock has an ast getting flushed right now\n"); diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c index 1c9efb406a96..02bf17808bdc 100644 --- a/fs/ocfs2/dlm/dlmfs.c +++ b/fs/ocfs2/dlm/dlmfs.c @@ -325,6 +325,7 @@ clear_fields: } static struct backing_dev_info dlmfs_backing_dev_info = { + .name = "ocfs2-dlmfs", .ra_pages = 0, /* No readahead */ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index bcb9260c3735..43e6e3280569 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1118,7 +1118,7 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm, mlog(0, "%s:%.*s: sending mig lockres (%s) to %u\n", dlm->name, res->lockname.len, res->lockname.name, - orig_flags & DLM_MRES_MIGRATION ? "migrate" : "recovery", + orig_flags & DLM_MRES_MIGRATION ? "migration" : "recovery", send_to); /* send it */ diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index fcf879ed6930..756f5b0998e0 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -122,7 +122,7 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, * that still has AST's pending... */ in_use = !list_empty(&lock->ast_list); spin_unlock(&dlm->ast_lock); - if (in_use) { + if (in_use && !(flags & LKM_CANCEL)) { mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock " "while waiting for an ast!", res->lockname.len, res->lockname.name); @@ -131,7 +131,7 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, spin_lock(&res->spinlock); if (res->state & DLM_LOCK_RES_IN_PROGRESS) { - if (master_node) { + if (master_node && !(flags & LKM_CANCEL)) { mlog(ML_ERROR, "lockres in progress!\n"); spin_unlock(&res->spinlock); return DLM_FORWARD; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 62442e413a00..221c5e98957b 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1851,6 +1851,7 @@ relock: if (ret) goto out_dio; + count = ocount; ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode)); if (ret) @@ -1870,8 +1871,7 @@ relock: goto out_dio; } } else { - written = generic_file_aio_write_nolock(iocb, iov, nr_segs, - *ppos); + written = __generic_file_aio_write(iocb, iov, nr_segs, ppos); } out_dio: @@ -1879,18 +1879,21 @@ out_dio: BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); if ((file->f_flags & O_SYNC && !direct_io) || IS_SYNC(inode)) { - /* - * The generic write paths have handled getting data - * to disk, but since we don't make use of the dirty - * inode list, a manual journal commit is necessary - * here. - */ - if (old_size != i_size_read(inode) || - old_clusters != OCFS2_I(inode)->ip_clusters) { + ret = filemap_fdatawrite_range(file->f_mapping, pos, + pos + count - 1); + if (ret < 0) + written = ret; + + if (!ret && (old_size != i_size_read(inode) || + old_clusters != OCFS2_I(inode)->ip_clusters)) { ret = jbd2_journal_force_commit(osb->journal->j_journal); if (ret < 0) written = ret; } + + if (!ret) + ret = filemap_fdatawait_range(file->f_mapping, pos, + pos + count - 1); } /* @@ -1918,8 +1921,10 @@ out_sems: mutex_unlock(&inode->i_mutex); + if (written) + ret = written; mlog_exit(ret); - return written ? written : ret; + return ret; } static int ocfs2_splice_to_file(struct pipe_inode_info *pipe, @@ -1988,31 +1993,16 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, if (ret > 0) { unsigned long nr_pages; + int err; - *ppos += ret; nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - /* - * If file or inode is SYNC and we actually wrote some data, - * sync it. - */ - if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { - int err; - - mutex_lock(&inode->i_mutex); - err = ocfs2_rw_lock(inode, 1); - if (err < 0) { - mlog_errno(err); - } else { - err = generic_osync_inode(inode, mapping, - OSYNC_METADATA|OSYNC_DATA); - ocfs2_rw_unlock(inode, 1); - } - mutex_unlock(&inode->i_mutex); + err = generic_write_sync(out, *ppos, ret); + if (err) + ret = err; + else + *ppos += ret; - if (err) - ret = err; - } balance_dirty_pages_ratelimited_nr(mapping, nr_pages); } diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index f033760ecbea..c48b93ac6b65 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1954,10 +1954,16 @@ void ocfs2_orphan_scan_init(struct ocfs2_super *osb) os->os_osb = osb; os->os_count = 0; os->os_seqno = 0; - os->os_scantime = CURRENT_TIME; mutex_init(&os->os_lock); INIT_DELAYED_WORK(&os->os_orphan_scan_work, ocfs2_orphan_scan_work); +} +void ocfs2_orphan_scan_start(struct ocfs2_super *osb) +{ + struct ocfs2_orphan_scan *os; + + os = &osb->osb_orphan_scan; + os->os_scantime = CURRENT_TIME; if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb)) atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE); else { diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 5432c7f79cc6..2c3222aec622 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -145,6 +145,7 @@ static inline void ocfs2_inode_set_new(struct ocfs2_super *osb, /* Exported only for the journal struct init code in super.c. Do not call. */ void ocfs2_orphan_scan_init(struct ocfs2_super *osb); +void ocfs2_orphan_scan_start(struct ocfs2_super *osb); void ocfs2_orphan_scan_stop(struct ocfs2_super *osb); void ocfs2_orphan_scan_exit(struct ocfs2_super *osb); @@ -329,20 +330,27 @@ int ocfs2_journal_dirty(handle_t *handle, /* extended attribute block update */ #define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1 +/* Update of a single quota block */ +#define OCFS2_QUOTA_BLOCK_UPDATE_CREDITS 1 + /* global quotafile inode update, data block */ -#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) +#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + \ + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS) +#define OCFS2_LOCAL_QINFO_WRITE_CREDITS OCFS2_QUOTA_BLOCK_UPDATE_CREDITS /* * The two writes below can accidentally see global info dirty due * to set_info() quotactl so make them prepared for the writes. */ /* quota data block, global info */ /* Write to local quota file */ -#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + 1) +#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \ + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS) /* global quota data block, local quota data block, global quota inode, * global quota info */ -#define OCFS2_QSYNC_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 3) +#define OCFS2_QSYNC_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \ + 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS) static inline int ocfs2_quota_trans_credits(struct super_block *sb) { @@ -355,11 +363,6 @@ static inline int ocfs2_quota_trans_credits(struct super_block *sb) return credits; } -/* Number of credits needed for removing quota structure from file */ -int ocfs2_calc_qdel_credits(struct super_block *sb, int type); -/* Number of credits needed for initialization of new quota structure */ -int ocfs2_calc_qinit_credits(struct super_block *sb, int type); - /* group extend. inode update and last group update. */ #define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index c9345ebb8493..39e1d5a39505 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -224,10 +224,12 @@ enum ocfs2_mount_options OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */ }; -#define OCFS2_OSB_SOFT_RO 0x0001 -#define OCFS2_OSB_HARD_RO 0x0002 -#define OCFS2_OSB_ERROR_FS 0x0004 -#define OCFS2_DEFAULT_ATIME_QUANTUM 60 +#define OCFS2_OSB_SOFT_RO 0x0001 +#define OCFS2_OSB_HARD_RO 0x0002 +#define OCFS2_OSB_ERROR_FS 0x0004 +#define OCFS2_OSB_DROP_DENTRY_LOCK_IMMED 0x0008 + +#define OCFS2_DEFAULT_ATIME_QUANTUM 60 struct ocfs2_journal; struct ocfs2_slot_info; @@ -490,6 +492,18 @@ static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb, spin_unlock(&osb->osb_lock); } + +static inline unsigned long ocfs2_test_osb_flag(struct ocfs2_super *osb, + unsigned long flag) +{ + unsigned long ret; + + spin_lock(&osb->osb_lock); + ret = osb->osb_flags & flag; + spin_unlock(&osb->osb_lock); + return ret; +} + static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb, int hard) { diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h index fcdba091af3d..c212cf5a2bdf 100644 --- a/fs/ocfs2/ocfs2_lockid.h +++ b/fs/ocfs2/ocfs2_lockid.h @@ -108,6 +108,7 @@ static char *ocfs2_lock_type_strings[] = { [OCFS2_LOCK_TYPE_OPEN] = "Open", [OCFS2_LOCK_TYPE_FLOCK] = "Flock", [OCFS2_LOCK_TYPE_QINFO] = "Quota", + [OCFS2_LOCK_TYPE_NFS_SYNC] = "NFSSync", [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan", }; diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h index 7365e2e08706..3fb96fcd4c81 100644 --- a/fs/ocfs2/quota.h +++ b/fs/ocfs2/quota.h @@ -50,7 +50,6 @@ struct ocfs2_mem_dqinfo { unsigned int dqi_chunks; /* Number of chunks in local quota file */ unsigned int dqi_blocks; /* Number of blocks allocated for local quota file */ unsigned int dqi_syncms; /* How often should we sync with other nodes */ - unsigned int dqi_syncjiff; /* Precomputed dqi_syncms in jiffies */ struct list_head dqi_chunk; /* List of chunks */ struct inode *dqi_gqinode; /* Global quota file inode */ struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */ diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index edfa60cd155c..44f2a5e1d042 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -23,6 +23,7 @@ #include "sysfile.h" #include "dlmglue.h" #include "uptodate.h" +#include "super.h" #include "quota.h" static struct workqueue_struct *ocfs2_quota_wq = NULL; @@ -69,6 +70,7 @@ static void ocfs2_global_mem2diskdqb(void *dp, struct dquot *dquot) d->dqb_curspace = cpu_to_le64(m->dqb_curspace); d->dqb_btime = cpu_to_le64(m->dqb_btime); d->dqb_itime = cpu_to_le64(m->dqb_itime); + d->dqb_pad1 = d->dqb_pad2 = 0; } static int ocfs2_global_is_id(void *dp, struct dquot *dquot) @@ -113,6 +115,15 @@ int ocfs2_read_quota_block(struct inode *inode, u64 v_block, int rc = 0; struct buffer_head *tmp = *bh; + if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) { + ocfs2_error(inode->i_sb, + "Quota file %llu is probably corrupted! Requested " + "to read block %Lu but file has size only %Lu\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)v_block, + (unsigned long long)i_size_read(inode)); + return -EIO; + } rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0, ocfs2_validate_quota_block); if (rc) @@ -211,14 +222,13 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type, mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA); if (gqinode->i_size < off + len) { - down_write(&OCFS2_I(gqinode)->ip_alloc_sem); - err = ocfs2_extend_no_holes(gqinode, off + len, off); - up_write(&OCFS2_I(gqinode)->ip_alloc_sem); - if (err < 0) - goto out; + loff_t rounded_end = + ocfs2_align_bytes_to_blocks(sb, off + len); + + /* Space is already allocated in ocfs2_global_read_dquot() */ err = ocfs2_simple_size_update(gqinode, oinfo->dqi_gqi_bh, - off + len); + rounded_end); if (err < 0) goto out; new = 1; @@ -234,7 +244,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type, } if (err) { mlog_errno(err); - return err; + goto out; } lock_buffer(bh); if (new) @@ -342,7 +352,6 @@ int ocfs2_global_read_info(struct super_block *sb, int type) info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); oinfo->dqi_syncms = le32_to_cpu(dinfo.dqi_syncms); - oinfo->dqi_syncjiff = msecs_to_jiffies(oinfo->dqi_syncms); oinfo->dqi_gi.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks); oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk); oinfo->dqi_gi.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry); @@ -352,7 +361,7 @@ int ocfs2_global_read_info(struct super_block *sb, int type) oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi); INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn); queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work, - oinfo->dqi_syncjiff); + msecs_to_jiffies(oinfo->dqi_syncms)); out_err: mlog_exit(status); @@ -402,13 +411,36 @@ int ocfs2_global_write_info(struct super_block *sb, int type) return err; } +static int ocfs2_global_qinit_alloc(struct super_block *sb, int type) +{ + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; + + /* + * We may need to allocate tree blocks and a leaf block but not the + * root block + */ + return oinfo->dqi_gi.dqi_qtree_depth; +} + +static int ocfs2_calc_global_qinit_credits(struct super_block *sb, int type) +{ + /* We modify all the allocated blocks, tree root, and info block */ + return (ocfs2_global_qinit_alloc(sb, type) + 2) * + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS; +} + /* Read in information from global quota file and acquire a reference to it. * dquot_acquire() has already started the transaction and locked quota file */ int ocfs2_global_read_dquot(struct dquot *dquot) { int err, err2, ex = 0; - struct ocfs2_mem_dqinfo *info = - sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; + struct super_block *sb = dquot->dq_sb; + int type = dquot->dq_type; + struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv; + struct ocfs2_super *osb = OCFS2_SB(sb); + struct inode *gqinode = info->dqi_gqinode; + int need_alloc = ocfs2_global_qinit_alloc(sb, type); + handle_t *handle = NULL; err = ocfs2_qinfo_lock(info, 0); if (err < 0) @@ -419,14 +451,33 @@ int ocfs2_global_read_dquot(struct dquot *dquot) OCFS2_DQUOT(dquot)->dq_use_count++; OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace; OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes; + ocfs2_qinfo_unlock(info, 0); + if (!dquot->dq_off) { /* No real quota entry? */ - /* Upgrade to exclusive lock for allocation */ - ocfs2_qinfo_unlock(info, 0); - err = ocfs2_qinfo_lock(info, 1); - if (err < 0) - goto out_qlock; ex = 1; + /* + * Add blocks to quota file before we start a transaction since + * locking allocators ranks above a transaction start + */ + WARN_ON(journal_current_handle()); + down_write(&OCFS2_I(gqinode)->ip_alloc_sem); + err = ocfs2_extend_no_holes(gqinode, + gqinode->i_size + (need_alloc << sb->s_blocksize_bits), + gqinode->i_size); + up_write(&OCFS2_I(gqinode)->ip_alloc_sem); + if (err < 0) + goto out; } + + handle = ocfs2_start_trans(osb, + ocfs2_calc_global_qinit_credits(sb, type)); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto out; + } + err = ocfs2_qinfo_lock(info, ex); + if (err < 0) + goto out_trans; err = qtree_write_dquot(&info->dqi_gi, dquot); if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) { err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type); @@ -438,6 +489,9 @@ out_qlock: ocfs2_qinfo_unlock(info, 1); else ocfs2_qinfo_unlock(info, 0); +out_trans: + if (handle) + ocfs2_commit_trans(osb, handle); out: if (err < 0) mlog_errno(err); @@ -607,7 +661,7 @@ static void qsync_work_fn(struct work_struct *work) dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type); queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work, - oinfo->dqi_syncjiff); + msecs_to_jiffies(oinfo->dqi_syncms)); } /* @@ -635,20 +689,18 @@ out: return status; } -int ocfs2_calc_qdel_credits(struct super_block *sb, int type) +static int ocfs2_calc_qdel_credits(struct super_block *sb, int type) { - struct ocfs2_mem_dqinfo *oinfo; - int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, - OCFS2_FEATURE_RO_COMPAT_GRPQUOTA }; - - if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type])) - return 0; - - oinfo = sb_dqinfo(sb, type)->dqi_priv; - /* We modify tree, leaf block, global info, local chunk header, - * global and local inode */ - return oinfo->dqi_gi.dqi_qtree_depth + 2 + 1 + - 2 * OCFS2_INODE_UPDATE_CREDITS; + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; + /* + * We modify tree, leaf block, global info, local chunk header, + * global and local inode; OCFS2_QINFO_WRITE_CREDITS already + * accounts for inode update + */ + return (oinfo->dqi_gi.dqi_qtree_depth + 2) * + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS + + OCFS2_QINFO_WRITE_CREDITS + + OCFS2_INODE_UPDATE_CREDITS; } static int ocfs2_release_dquot(struct dquot *dquot) @@ -680,33 +732,10 @@ out: return status; } -int ocfs2_calc_qinit_credits(struct super_block *sb, int type) -{ - struct ocfs2_mem_dqinfo *oinfo; - int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, - OCFS2_FEATURE_RO_COMPAT_GRPQUOTA }; - struct ocfs2_dinode *lfe, *gfe; - - if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type])) - return 0; - - oinfo = sb_dqinfo(sb, type)->dqi_priv; - gfe = (struct ocfs2_dinode *)oinfo->dqi_gqi_bh->b_data; - lfe = (struct ocfs2_dinode *)oinfo->dqi_lqi_bh->b_data; - /* We can extend local file + global file. In local file we - * can modify info, chunk header block and dquot block. In - * global file we can modify info, tree and leaf block */ - return ocfs2_calc_extend_credits(sb, &lfe->id2.i_list, 0) + - ocfs2_calc_extend_credits(sb, &gfe->id2.i_list, 0) + - 3 + oinfo->dqi_gi.dqi_qtree_depth + 2; -} - static int ocfs2_acquire_dquot(struct dquot *dquot) { - handle_t *handle; struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; - struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb); int status = 0; mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); @@ -715,16 +744,7 @@ static int ocfs2_acquire_dquot(struct dquot *dquot) status = ocfs2_lock_global_qf(oinfo, 1); if (status < 0) goto out; - handle = ocfs2_start_trans(osb, - ocfs2_calc_qinit_credits(dquot->dq_sb, dquot->dq_type)); - if (IS_ERR(handle)) { - status = PTR_ERR(handle); - mlog_errno(status); - goto out_ilock; - } status = dquot_acquire(dquot); - ocfs2_commit_trans(osb, handle); -out_ilock: ocfs2_unlock_global_qf(oinfo, 1); out: mlog_exit(status); diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 5a460fa82553..bdb09cb6e1fe 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -20,6 +20,7 @@ #include "sysfile.h" #include "dlmglue.h" #include "quota.h" +#include "uptodate.h" /* Number of local quota structures per block */ static inline unsigned int ol_quota_entries_per_block(struct super_block *sb) @@ -100,7 +101,8 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh, handle_t *handle; int status; - handle = ocfs2_start_trans(OCFS2_SB(sb), 1); + handle = ocfs2_start_trans(OCFS2_SB(sb), + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); @@ -610,7 +612,8 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, goto out_bh; /* Mark quota file as clean if we are recovering quota file of * some other node. */ - handle = ocfs2_start_trans(osb, 1); + handle = ocfs2_start_trans(osb, + OCFS2_LOCAL_QINFO_WRITE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); @@ -940,7 +943,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( struct ocfs2_local_disk_chunk *dchunk; int status; handle_t *handle; - struct buffer_head *bh = NULL; + struct buffer_head *bh = NULL, *dbh = NULL; u64 p_blkno; /* We are protected by dqio_sem so no locking needed */ @@ -964,32 +967,35 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( mlog_errno(status); goto out; } + /* Local quota info and two new blocks we initialize */ + handle = ocfs2_start_trans(OCFS2_SB(sb), + OCFS2_LOCAL_QINFO_WRITE_CREDITS + + 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out; + } + /* Initialize chunk header */ down_read(&OCFS2_I(lqinode)->ip_alloc_sem); status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, &p_blkno, NULL, NULL); up_read(&OCFS2_I(lqinode)->ip_alloc_sem); if (status < 0) { mlog_errno(status); - goto out; + goto out_trans; } bh = sb_getblk(sb, p_blkno); if (!bh) { status = -ENOMEM; mlog_errno(status); - goto out; + goto out_trans; } dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; - - handle = ocfs2_start_trans(OCFS2_SB(sb), 2); - if (IS_ERR(handle)) { - status = PTR_ERR(handle); - mlog_errno(status); - goto out; - } - + ocfs2_set_new_buffer_uptodate(lqinode, bh); status = ocfs2_journal_access_dq(handle, lqinode, bh, - OCFS2_JOURNAL_ACCESS_WRITE); + OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); goto out_trans; @@ -999,7 +1005,6 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( memset(dchunk->dqc_bitmap, 0, sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) - OCFS2_QBLK_RESERVED_SPACE); - set_buffer_uptodate(bh); unlock_buffer(bh); status = ocfs2_journal_dirty(handle, bh); if (status < 0) { @@ -1007,6 +1012,38 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( goto out_trans; } + /* Initialize new block with structures */ + down_read(&OCFS2_I(lqinode)->ip_alloc_sem); + status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks + 1, + &p_blkno, NULL, NULL); + up_read(&OCFS2_I(lqinode)->ip_alloc_sem); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + dbh = sb_getblk(sb, p_blkno); + if (!dbh) { + status = -ENOMEM; + mlog_errno(status); + goto out_trans; + } + ocfs2_set_new_buffer_uptodate(lqinode, dbh); + status = ocfs2_journal_access_dq(handle, lqinode, dbh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + lock_buffer(dbh); + memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE); + unlock_buffer(dbh); + status = ocfs2_journal_dirty(handle, dbh); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + + /* Update local quotafile info */ oinfo->dqi_blocks += 2; oinfo->dqi_chunks++; status = ocfs2_local_write_info(sb, type); @@ -1031,6 +1068,7 @@ out_trans: ocfs2_commit_trans(OCFS2_SB(sb), handle); out: brelse(bh); + brelse(dbh); kmem_cache_free(ocfs2_qf_chunk_cachep, chunk); return ERR_PTR(status); } @@ -1048,6 +1086,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( struct ocfs2_local_disk_chunk *dchunk; int epb = ol_quota_entries_per_block(sb); unsigned int chunk_blocks; + struct buffer_head *bh; + u64 p_blkno; int status; handle_t *handle; @@ -1075,12 +1115,49 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( mlog_errno(status); goto out; } - handle = ocfs2_start_trans(OCFS2_SB(sb), 2); + + /* Get buffer from the just added block */ + down_read(&OCFS2_I(lqinode)->ip_alloc_sem); + status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, + &p_blkno, NULL, NULL); + up_read(&OCFS2_I(lqinode)->ip_alloc_sem); + if (status < 0) { + mlog_errno(status); + goto out; + } + bh = sb_getblk(sb, p_blkno); + if (!bh) { + status = -ENOMEM; + mlog_errno(status); + goto out; + } + ocfs2_set_new_buffer_uptodate(lqinode, bh); + + /* Local quota info, chunk header and the new block we initialize */ + handle = ocfs2_start_trans(OCFS2_SB(sb), + OCFS2_LOCAL_QINFO_WRITE_CREDITS + + 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); goto out; } + /* Zero created block */ + status = ocfs2_journal_access_dq(handle, lqinode, bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + lock_buffer(bh); + memset(bh->b_data, 0, sb->s_blocksize); + unlock_buffer(bh); + status = ocfs2_journal_dirty(handle, bh); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + /* Update chunk header */ status = ocfs2_journal_access_dq(handle, lqinode, chunk->qc_headerbh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { @@ -1097,6 +1174,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( mlog_errno(status); goto out_trans; } + /* Update file header */ oinfo->dqi_blocks++; status = ocfs2_local_write_info(sb, type); if (status < 0) { diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index 3f661376a2de..e49c41050264 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -17,6 +17,7 @@ * General Public License for more details. */ +#include <linux/kernel.h> #include <linux/crc32.h> #include <linux/module.h> @@ -153,7 +154,7 @@ static int status_map[] = { static int dlm_status_to_errno(enum dlm_status status) { - BUG_ON(status > (sizeof(status_map) / sizeof(status_map[0]))); + BUG_ON(status < 0 || status >= ARRAY_SIZE(status_map)); return status_map[status]; } diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 7efb349fb9bd..a3f8871d21fd 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -777,6 +777,7 @@ static int ocfs2_sb_probe(struct super_block *sb, } di = (struct ocfs2_dinode *) (*bh)->b_data; memset(stats, 0, sizeof(struct ocfs2_blockcheck_stats)); + spin_lock_init(&stats->b_lock); status = ocfs2_verify_volume(di, *bh, blksize, stats); if (status >= 0) goto bail; @@ -1182,7 +1183,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) wake_up(&osb->osb_mount_event); /* Start this when the mount is almost sure of being successful */ - ocfs2_orphan_scan_init(osb); + ocfs2_orphan_scan_start(osb); mlog_exit(status); return status; @@ -1213,14 +1214,31 @@ static int ocfs2_get_sb(struct file_system_type *fs_type, mnt); } +static void ocfs2_kill_sb(struct super_block *sb) +{ + struct ocfs2_super *osb = OCFS2_SB(sb); + + /* Failed mount? */ + if (!osb || atomic_read(&osb->vol_state) == VOLUME_DISABLED) + goto out; + + /* Prevent further queueing of inode drop events */ + spin_lock(&dentry_list_lock); + ocfs2_set_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED); + spin_unlock(&dentry_list_lock); + /* Wait for work to finish and/or remove it */ + cancel_work_sync(&osb->dentry_lock_work); +out: + kill_block_super(sb); +} + static struct file_system_type ocfs2_fs_type = { .owner = THIS_MODULE, .name = "ocfs2", .get_sb = ocfs2_get_sb, /* is this called when we mount * the fs? */ - .kill_sb = kill_block_super, /* set to the generic one - * right now, but do we - * need to change that? */ + .kill_sb = ocfs2_kill_sb, + .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, .next = NULL }; @@ -1819,6 +1837,12 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) debugfs_remove(osb->osb_ctxt); + /* + * Flush inode dropping work queue so that deletes are + * performed while the filesystem is still working + */ + ocfs2_drop_all_dl_inodes(osb); + /* Orphan scan should be stopped as early as possible */ ocfs2_orphan_scan_stop(osb); @@ -1981,6 +2005,8 @@ static int ocfs2_initialize_super(struct super_block *sb, snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); + ocfs2_orphan_scan_init(osb); + status = ocfs2_recovery_init(osb); if (status) { mlog(ML_ERROR, "Unable to initialize recovery state\n"); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index ba320e250747..d1a27cda984f 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -1052,7 +1052,8 @@ static int ocfs2_xattr_block_get(struct inode *inode, struct ocfs2_xattr_block *xb; struct ocfs2_xattr_value_root *xv; size_t size; - int ret = -ENODATA, name_offset, name_len, block_off, i; + int ret = -ENODATA, name_offset, name_len, i; + int uninitialized_var(block_off); xs->bucket = ocfs2_xattr_bucket_new(inode); if (!xs->bucket) { diff --git a/fs/open.c b/fs/open.c index dd98e8076024..31191bf513e4 100644 --- a/fs/open.c +++ b/fs/open.c @@ -199,7 +199,7 @@ out: int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, struct file *filp) { - int err; + int ret; struct iattr newattrs; /* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */ @@ -214,12 +214,14 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, } /* Remove suid/sgid on truncate too */ - newattrs.ia_valid |= should_remove_suid(dentry); + ret = should_remove_suid(dentry); + if (ret) + newattrs.ia_valid |= ret | ATTR_FORCE; mutex_lock(&dentry->d_inode->i_mutex); - err = notify_change(dentry, &newattrs); + ret = notify_change(dentry, &newattrs); mutex_unlock(&dentry->d_inode->i_mutex); - return err; + return ret; } static long do_sys_truncate(const char __user *pathname, loff_t length) @@ -957,6 +959,8 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags, int error; struct file *f; + validate_creds(cred); + /* * We must always pass in a valid mount pointer. Historically * callers got away with not passing it, but we must enforce this at diff --git a/fs/partitions/check.c b/fs/partitions/check.c index ea4e6cb29e13..619ba99dfe39 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -248,11 +248,19 @@ ssize_t part_stat_show(struct device *dev, part_stat_read(p, merges[WRITE]), (unsigned long long)part_stat_read(p, sectors[WRITE]), jiffies_to_msecs(part_stat_read(p, ticks[WRITE])), - p->in_flight, + part_in_flight(p), jiffies_to_msecs(part_stat_read(p, io_ticks)), jiffies_to_msecs(part_stat_read(p, time_in_queue))); } +ssize_t part_inflight_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%8u %8u\n", p->in_flight[0], p->in_flight[1]); +} + #ifdef CONFIG_FAIL_MAKE_REQUEST ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -281,6 +289,7 @@ static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); +static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); @@ -292,6 +301,7 @@ static struct attribute *part_attrs[] = { &dev_attr_size.attr, &dev_attr_alignment_offset.attr, &dev_attr_stat.attr, + &dev_attr_inflight.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif diff --git a/fs/pipe.c b/fs/pipe.c index f7dd21ad85a6..52c415114838 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -68,8 +68,8 @@ void pipe_double_lock(struct pipe_inode_info *pipe1, pipe_lock_nested(pipe1, I_MUTEX_PARENT); pipe_lock_nested(pipe2, I_MUTEX_CHILD); } else { - pipe_lock_nested(pipe2, I_MUTEX_CHILD); - pipe_lock_nested(pipe1, I_MUTEX_PARENT); + pipe_lock_nested(pipe2, I_MUTEX_PARENT); + pipe_lock_nested(pipe1, I_MUTEX_CHILD); } } diff --git a/fs/proc/base.c b/fs/proc/base.c index 3ce5ae9e3d2d..6f742f6658a9 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -234,23 +234,20 @@ static int check_mem_permission(struct task_struct *task) struct mm_struct *mm_for_maps(struct task_struct *task) { - struct mm_struct *mm = get_task_mm(task); - if (!mm) + struct mm_struct *mm; + + if (mutex_lock_killable(&task->cred_guard_mutex)) return NULL; - down_read(&mm->mmap_sem); - task_lock(task); - if (task->mm != mm) - goto out; - if (task->mm != current->mm && - __ptrace_may_access(task, PTRACE_MODE_READ) < 0) - goto out; - task_unlock(task); + + mm = get_task_mm(task); + if (mm && mm != current->mm && + !ptrace_may_access(task, PTRACE_MODE_READ)) { + mmput(mm); + mm = NULL; + } + mutex_unlock(&task->cred_guard_mutex); + return mm; -out: - task_unlock(task); - up_read(&mm->mmap_sem); - mmput(mm); - return NULL; } static int proc_pid_cmdline(struct task_struct *task, char * buffer) @@ -1006,12 +1003,7 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf, if (!task) return -ESRCH; - task_lock(task); - if (task->mm) - oom_adjust = task->mm->oom_adj; - else - oom_adjust = OOM_DISABLE; - task_unlock(task); + oom_adjust = task->oomkilladj; put_task_struct(task); len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); @@ -1040,19 +1032,11 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, task = get_proc_task(file->f_path.dentry->d_inode); if (!task) return -ESRCH; - task_lock(task); - if (!task->mm) { - task_unlock(task); - put_task_struct(task); - return -EINVAL; - } - if (oom_adjust < task->mm->oom_adj && !capable(CAP_SYS_RESOURCE)) { - task_unlock(task); + if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) { put_task_struct(task); return -EACCES; } - task->mm->oom_adj = oom_adjust; - task_unlock(task); + task->oomkilladj = oom_adjust; put_task_struct(task); if (end - buffer == 0) return -EIO; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 6f61b7cc32e0..9bd8be1d235c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -119,6 +119,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) mm = mm_for_maps(priv->task); if (!mm) return NULL; + down_read(&mm->mmap_sem); tail_vma = get_gate_vma(priv->task); priv->tail_vma = tail_vma; diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 64a72e2e7650..8f5c05d3dbd3 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -189,6 +189,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) priv->task = NULL; return NULL; } + down_read(&mm->mmap_sem); /* start from the Nth VMA */ for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 70f36c043d62..38f7bd559f35 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2043,7 +2043,6 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, invalidate_bdev(sb->s_bdev); } mutex_lock(&dqopt->dqonoff_mutex); - mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); if (sb_has_quota_loaded(sb, type)) { error = -EBUSY; goto out_lock; @@ -2054,9 +2053,11 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, * possible) Also nobody should write to the file - we use * special IO operations which ignore the immutable bit. */ down_write(&dqopt->dqptr_sem); + mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA); inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; + mutex_unlock(&inode->i_mutex); up_write(&dqopt->dqptr_sem); sb->dq_op->drop(inode); } @@ -2080,7 +2081,6 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, goto out_file_init; } mutex_unlock(&dqopt->dqio_mutex); - mutex_unlock(&inode->i_mutex); spin_lock(&dq_state_lock); dqopt->flags |= dquot_state_flag(flags, type); spin_unlock(&dq_state_lock); @@ -2096,13 +2096,14 @@ out_file_init: out_lock: if (oldflags != -1) { down_write(&dqopt->dqptr_sem); + mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); /* Set the flags back (in the case of accidental quotaon() * on a wrong file we don't want to mess up the flags) */ inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE); inode->i_flags |= oldflags; + mutex_unlock(&inode->i_mutex); up_write(&dqopt->dqptr_sem); } - mutex_unlock(&inode->i_mutex); mutex_unlock(&dqopt->dqonoff_mutex); out_fmt: put_quota_format(fmt); diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index ebb2c417912c..11f0c06316de 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -20,6 +20,7 @@ #include <linux/ramfs.h> #include <linux/pagevec.h> #include <linux/mman.h> +#include <linux/sched.h> #include <asm/uaccess.h> #include "internal.h" diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 0ff7566c767c..a7f0110fca4c 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -46,6 +46,7 @@ static const struct super_operations ramfs_ops; static const struct inode_operations ramfs_dir_inode_operations; static struct backing_dev_info ramfs_backing_dev_info = { + .name = "ramfs", .ra_pages = 0, /* No readahead */ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY | diff --git a/fs/select.c b/fs/select.c index d870237e42c7..8084834e123e 100644 --- a/fs/select.c +++ b/fs/select.c @@ -110,6 +110,7 @@ void poll_initwait(struct poll_wqueues *pwq) { init_poll_funcptr(&pwq->pt, __pollwait); pwq->polling_task = current; + pwq->triggered = 0; pwq->error = 0; pwq->table = NULL; pwq->inline_index = 0; diff --git a/fs/splice.c b/fs/splice.c index 73766d24f97b..7394e9e17534 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -502,8 +502,10 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, len = left; ret = __generic_file_splice_read(in, ppos, pipe, len, flags); - if (ret > 0) + if (ret > 0) { *ppos += ret; + file_accessed(in); + } return ret; } @@ -963,8 +965,10 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); ret = file_remove_suid(out); - if (!ret) + if (!ret) { + file_update_time(out); ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file); + } mutex_unlock(&inode->i_mutex); } while (ret > 0); splice_from_pipe_end(pipe, &sd); @@ -976,25 +980,15 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, if (ret > 0) { unsigned long nr_pages; + int err; - *ppos += ret; nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - /* - * If file or inode is SYNC and we actually wrote some data, - * sync it. - */ - if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { - int err; - - mutex_lock(&inode->i_mutex); - err = generic_osync_inode(inode, mapping, - OSYNC_METADATA|OSYNC_DATA); - mutex_unlock(&inode->i_mutex); - - if (err) - ret = err; - } + err = generic_write_sync(out, *ppos, ret); + if (err) + ret = err; + else + *ppos += ret; balance_dirty_pages_ratelimited_nr(mapping, nr_pages); } diff --git a/fs/super.c b/fs/super.c index 2761d3e22ed9..9cda337ddae2 100644 --- a/fs/super.c +++ b/fs/super.c @@ -62,9 +62,6 @@ static struct super_block *alloc_super(struct file_system_type *type) s = NULL; goto out; } - INIT_LIST_HEAD(&s->s_dirty); - INIT_LIST_HEAD(&s->s_io); - INIT_LIST_HEAD(&s->s_more_io); INIT_LIST_HEAD(&s->s_files); INIT_LIST_HEAD(&s->s_instances); INIT_HLIST_HEAD(&s->s_anon); @@ -171,7 +168,7 @@ int __put_super_and_need_restart(struct super_block *sb) * Drops a temporary reference, frees superblock if there's no * references left. */ -static void put_super(struct super_block *sb) +void put_super(struct super_block *sb) { spin_lock(&sb_lock); __put_super(sb); diff --git a/fs/sync.c b/fs/sync.c index 3422ba61d86d..192340930bb4 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -19,20 +19,22 @@ SYNC_FILE_RANGE_WAIT_AFTER) /* - * Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0) - * just dirties buffers with inodes so we have to submit IO for these buffers - * via __sync_blockdev(). This also speeds up the wait == 1 case since in that - * case write_inode() functions do sync_dirty_buffer() and thus effectively - * write one block at a time. + * Do the filesystem syncing work. For simple filesystems + * writeback_inodes_sb(sb) just dirties buffers with inodes so we have to + * submit IO for these buffers via __sync_blockdev(). This also speeds up the + * wait == 1 case since in that case write_inode() functions do + * sync_dirty_buffer() and thus effectively write one block at a time. */ static int __sync_filesystem(struct super_block *sb, int wait) { /* Avoid doing twice syncing and cache pruning for quota sync */ - if (!wait) + if (!wait) { writeout_quota_sb(sb, -1); - else + writeback_inodes_sb(sb); + } else { sync_quota_sb(sb, -1); - sync_inodes_sb(sb, wait); + sync_inodes_sb(sb); + } if (sb->s_op->sync_fs) sb->s_op->sync_fs(sb, wait); return __sync_blockdev(sb->s_bdev, wait); @@ -118,7 +120,7 @@ restart: */ SYSCALL_DEFINE0(sync) { - wakeup_pdflush(0); + wakeup_flusher_threads(0); sync_filesystems(0); sync_filesystems(1); if (unlikely(laptop_mode)) @@ -176,19 +178,23 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync) } /** - * vfs_fsync - perform a fsync or fdatasync on a file + * vfs_fsync_range - helper to sync a range of data & metadata to disk * @file: file to sync * @dentry: dentry of @file - * @data: only perform a fdatasync operation + * @start: offset in bytes of the beginning of data range to sync + * @end: offset in bytes of the end of data range (inclusive) + * @datasync: perform only datasync * - * Write back data and metadata for @file to disk. If @datasync is - * set only metadata needed to access modified file data is written. + * Write back data in range @start..@end and metadata for @file to disk. If + * @datasync is set only metadata needed to access modified file data is + * written. * * In case this function is called from nfsd @file may be %NULL and * only @dentry is set. This can only happen when the filesystem * implements the export_operations API. */ -int vfs_fsync(struct file *file, struct dentry *dentry, int datasync) +int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start, + loff_t end, int datasync) { const struct file_operations *fop; struct address_space *mapping; @@ -212,7 +218,7 @@ int vfs_fsync(struct file *file, struct dentry *dentry, int datasync) goto out; } - ret = filemap_fdatawrite(mapping); + ret = filemap_write_and_wait_range(mapping, start, end); /* * We need to protect against concurrent writers, which could cause @@ -223,12 +229,29 @@ int vfs_fsync(struct file *file, struct dentry *dentry, int datasync) if (!ret) ret = err; mutex_unlock(&mapping->host->i_mutex); - err = filemap_fdatawait(mapping); - if (!ret) - ret = err; + out: return ret; } +EXPORT_SYMBOL(vfs_fsync_range); + +/** + * vfs_fsync - perform a fsync or fdatasync on a file + * @file: file to sync + * @dentry: dentry of @file + * @datasync: only perform a fdatasync operation + * + * Write back data and metadata for @file to disk. If @datasync is + * set only metadata needed to access modified file data is written. + * + * In case this function is called from nfsd @file may be %NULL and + * only @dentry is set. This can only happen when the filesystem + * implements the export_operations API. + */ +int vfs_fsync(struct file *file, struct dentry *dentry, int datasync) +{ + return vfs_fsync_range(file, dentry, 0, LLONG_MAX, datasync); +} EXPORT_SYMBOL(vfs_fsync); static int do_fsync(unsigned int fd, int datasync) @@ -254,6 +277,23 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd) return do_fsync(fd, 1); } +/** + * generic_write_sync - perform syncing after a write if file / inode is sync + * @file: file to which the write happened + * @pos: offset where the write started + * @count: length of the write + * + * This is just a simple wrapper about our general syncing function. + */ +int generic_write_sync(struct file *file, loff_t pos, loff_t count) +{ + if (!(file->f_flags & O_SYNC) && !IS_SYNC(file->f_mapping->host)) + return 0; + return vfs_fsync_range(file, file->f_path.dentry, pos, + pos + count - 1, 1); +} +EXPORT_SYMBOL(generic_write_sync); + /* * sys_sync_file_range() permits finely controlled syncing over a segment of * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index d88d0fac9fa5..0050fc40e8c9 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -760,6 +760,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry, const struct inode_operations sysfs_dir_inode_operations = { .lookup = sysfs_lookup, .setattr = sysfs_setattr, + .setxattr = sysfs_setxattr, }; static void remove_dir(struct sysfs_dirent *sd) @@ -939,8 +940,10 @@ again: /* Remove from old parent's list and insert into new parent's list. */ sysfs_unlink_sibling(sd); sysfs_get(new_parent_sd); + drop_nlink(old_parent->d_inode); sysfs_put(sd->s_parent); sd->s_parent = new_parent_sd; + inc_nlink(new_parent->d_inode); sysfs_link_sibling(sd); out_unlock: diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 555f0ff988df..e28cecf179f5 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -18,6 +18,8 @@ #include <linux/capability.h> #include <linux/errno.h> #include <linux/sched.h> +#include <linux/xattr.h> +#include <linux/security.h> #include "sysfs.h" extern struct super_block * sysfs_sb; @@ -29,12 +31,14 @@ static const struct address_space_operations sysfs_aops = { }; static struct backing_dev_info sysfs_backing_dev_info = { + .name = "sysfs", .ra_pages = 0, /* No readahead */ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; static const struct inode_operations sysfs_inode_operations ={ .setattr = sysfs_setattr, + .setxattr = sysfs_setxattr, }; int __init sysfs_inode_init(void) @@ -42,18 +46,37 @@ int __init sysfs_inode_init(void) return bdi_init(&sysfs_backing_dev_info); } +struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd) +{ + struct sysfs_inode_attrs *attrs; + struct iattr *iattrs; + + attrs = kzalloc(sizeof(struct sysfs_inode_attrs), GFP_KERNEL); + if (!attrs) + return NULL; + iattrs = &attrs->ia_iattr; + + /* assign default attributes */ + iattrs->ia_mode = sd->s_mode; + iattrs->ia_uid = 0; + iattrs->ia_gid = 0; + iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME; + + return attrs; +} int sysfs_setattr(struct dentry * dentry, struct iattr * iattr) { struct inode * inode = dentry->d_inode; struct sysfs_dirent * sd = dentry->d_fsdata; - struct iattr * sd_iattr; + struct sysfs_inode_attrs *sd_attrs; + struct iattr *iattrs; unsigned int ia_valid = iattr->ia_valid; int error; if (!sd) return -EINVAL; - sd_iattr = sd->s_iattr; + sd_attrs = sd->s_iattr; error = inode_change_ok(inode, iattr); if (error) @@ -65,42 +88,77 @@ int sysfs_setattr(struct dentry * dentry, struct iattr * iattr) if (error) return error; - if (!sd_iattr) { + if (!sd_attrs) { /* setting attributes for the first time, allocate now */ - sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL); - if (!sd_iattr) + sd_attrs = sysfs_init_inode_attrs(sd); + if (!sd_attrs) return -ENOMEM; - /* assign default attributes */ - sd_iattr->ia_mode = sd->s_mode; - sd_iattr->ia_uid = 0; - sd_iattr->ia_gid = 0; - sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME; - sd->s_iattr = sd_iattr; + sd->s_iattr = sd_attrs; + } else { + /* attributes were changed at least once in past */ + iattrs = &sd_attrs->ia_iattr; + + if (ia_valid & ATTR_UID) + iattrs->ia_uid = iattr->ia_uid; + if (ia_valid & ATTR_GID) + iattrs->ia_gid = iattr->ia_gid; + if (ia_valid & ATTR_ATIME) + iattrs->ia_atime = timespec_trunc(iattr->ia_atime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MTIME) + iattrs->ia_mtime = timespec_trunc(iattr->ia_mtime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_CTIME) + iattrs->ia_ctime = timespec_trunc(iattr->ia_ctime, + inode->i_sb->s_time_gran); + if (ia_valid & ATTR_MODE) { + umode_t mode = iattr->ia_mode; + + if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) + mode &= ~S_ISGID; + iattrs->ia_mode = sd->s_mode = mode; + } } + return error; +} - /* attributes were changed atleast once in past */ - - if (ia_valid & ATTR_UID) - sd_iattr->ia_uid = iattr->ia_uid; - if (ia_valid & ATTR_GID) - sd_iattr->ia_gid = iattr->ia_gid; - if (ia_valid & ATTR_ATIME) - sd_iattr->ia_atime = timespec_trunc(iattr->ia_atime, - inode->i_sb->s_time_gran); - if (ia_valid & ATTR_MTIME) - sd_iattr->ia_mtime = timespec_trunc(iattr->ia_mtime, - inode->i_sb->s_time_gran); - if (ia_valid & ATTR_CTIME) - sd_iattr->ia_ctime = timespec_trunc(iattr->ia_ctime, - inode->i_sb->s_time_gran); - if (ia_valid & ATTR_MODE) { - umode_t mode = iattr->ia_mode; - - if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) - mode &= ~S_ISGID; - sd_iattr->ia_mode = sd->s_mode = mode; - } +int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) +{ + struct sysfs_dirent *sd = dentry->d_fsdata; + struct sysfs_inode_attrs *iattrs; + void *secdata; + int error; + u32 secdata_len = 0; + + if (!sd) + return -EINVAL; + if (!sd->s_iattr) + sd->s_iattr = sysfs_init_inode_attrs(sd); + if (!sd->s_iattr) + return -ENOMEM; + + iattrs = sd->s_iattr; + + if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { + const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; + error = security_inode_setsecurity(dentry->d_inode, suffix, + value, size, flags); + if (error) + goto out; + error = security_inode_getsecctx(dentry->d_inode, + &secdata, &secdata_len); + if (error) + goto out; + if (iattrs->ia_secdata) + security_release_secctx(iattrs->ia_secdata, + iattrs->ia_secdata_len); + iattrs->ia_secdata = secdata; + iattrs->ia_secdata_len = secdata_len; + } else + return -EINVAL; +out: return error; } @@ -146,6 +204,7 @@ static int sysfs_count_nlink(struct sysfs_dirent *sd) static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) { struct bin_attribute *bin_attr; + struct sysfs_inode_attrs *iattrs; inode->i_private = sysfs_get(sd); inode->i_mapping->a_ops = &sysfs_aops; @@ -154,16 +213,20 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) inode->i_ino = sd->s_ino; lockdep_set_class(&inode->i_mutex, &sysfs_inode_imutex_key); - if (sd->s_iattr) { + iattrs = sd->s_iattr; + if (iattrs) { /* sysfs_dirent has non-default attributes * get them for the new inode from persistent copy * in sysfs_dirent */ - set_inode_attr(inode, sd->s_iattr); + set_inode_attr(inode, &iattrs->ia_iattr); + if (iattrs->ia_secdata) + security_inode_notifysecctx(inode, + iattrs->ia_secdata, + iattrs->ia_secdata_len); } else set_default_inode_attr(inode, sd->s_mode); - /* initialize inode according to type */ switch (sysfs_type(sd)) { case SYSFS_DIR: diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 1d897ad808e0..c5081ad77026 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -16,6 +16,7 @@ #include <linux/kobject.h> #include <linux/namei.h> #include <linux/mutex.h> +#include <linux/security.h> #include "sysfs.h" @@ -209,6 +210,7 @@ static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, void *co } const struct inode_operations sysfs_symlink_inode_operations = { + .setxattr = sysfs_setxattr, .readlink = generic_readlink, .follow_link = sysfs_follow_link, .put_link = sysfs_put_link, diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 3fa0d98481e2..af4c4e7482ac 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -8,6 +8,8 @@ * This file is released under the GPLv2. */ +#include <linux/fs.h> + struct sysfs_open_dirent; /* type-specific structures for sysfs_dirent->s_* union members */ @@ -31,6 +33,12 @@ struct sysfs_elem_bin_attr { struct hlist_head buffers; }; +struct sysfs_inode_attrs { + struct iattr ia_iattr; + void *ia_secdata; + u32 ia_secdata_len; +}; + /* * sysfs_dirent - the building block of sysfs hierarchy. Each and * every sysfs node is represented by single sysfs_dirent. @@ -56,7 +64,7 @@ struct sysfs_dirent { unsigned int s_flags; ino_t s_ino; umode_t s_mode; - struct iattr *s_iattr; + struct sysfs_inode_attrs *s_iattr; }; #define SD_DEACTIVATED_BIAS INT_MIN @@ -148,6 +156,8 @@ static inline void __sysfs_put(struct sysfs_dirent *sd) struct inode *sysfs_get_inode(struct sysfs_dirent *sd); void sysfs_delete_inode(struct inode *inode); int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); +int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags); int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name); int sysfs_inode_init(void); diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index eaf6d891d46f..1c8991b0db13 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -65,26 +65,14 @@ static int shrink_liability(struct ubifs_info *c, int nr_to_write) { int nr_written; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE, - .range_end = LLONG_MAX, - .nr_to_write = nr_to_write, - }; - - generic_sync_sb_inodes(c->vfs_sb, &wbc); - nr_written = nr_to_write - wbc.nr_to_write; + nr_written = writeback_inodes_sb(c->vfs_sb); if (!nr_written) { /* * Re-try again but wait on pages/inodes which are being * written-back concurrently (e.g., by pdflush). */ - memset(&wbc, 0, sizeof(struct writeback_control)); - wbc.sync_mode = WB_SYNC_ALL; - wbc.range_end = LLONG_MAX; - wbc.nr_to_write = nr_to_write; - generic_sync_sb_inodes(c->vfs_sb, &wbc); - nr_written = nr_to_write - wbc.nr_to_write; + nr_written = sync_inodes_sb(c->vfs_sb); } dbg_budg("%d pages were written back", nr_written); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 26d2e0d80465..51763aa8f4de 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -438,12 +438,6 @@ static int ubifs_sync_fs(struct super_block *sb, int wait) { int i, err; struct ubifs_info *c = sb->s_fs_info; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .range_start = 0, - .range_end = LLONG_MAX, - .nr_to_write = LONG_MAX, - }; /* * Zero @wait is just an advisory thing to help the file system shove @@ -462,7 +456,7 @@ static int ubifs_sync_fs(struct super_block *sb, int wait) * the user be able to get more accurate results of 'statfs()' after * they synchronize the file system. */ - generic_sync_sb_inodes(sb, &wbc); + sync_inodes_sb(sb); /* * Synchronize write buffers, because 'ubifs_run_commit()' does not @@ -1971,6 +1965,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) * * Read-ahead will be disabled because @c->bdi.ra_pages is 0. */ + c->bdi.name = "ubifs", c->bdi.capabilities = BDI_CAP_MAP_COPY; c->bdi.unplug_io_fn = default_unplug_io_fn; err = bdi_init(&c->bdi); diff --git a/fs/udf/directory.c b/fs/udf/directory.c index 1d2c570704c8..2ffdb6733af1 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -18,59 +18,6 @@ #include <linux/string.h> #include <linux/buffer_head.h> -#if 0 -static uint8_t *udf_filead_read(struct inode *dir, uint8_t *tmpad, - uint8_t ad_size, struct kernel_lb_addr fe_loc, - int *pos, int *offset, struct buffer_head **bh, - int *error) -{ - int loffset = *offset; - int block; - uint8_t *ad; - int remainder; - - *error = 0; - - ad = (uint8_t *)(*bh)->b_data + *offset; - *offset += ad_size; - - if (!ad) { - brelse(*bh); - *error = 1; - return NULL; - } - - if (*offset == dir->i_sb->s_blocksize) { - brelse(*bh); - block = udf_get_lb_pblock(dir->i_sb, fe_loc, ++*pos); - if (!block) - return NULL; - *bh = udf_tread(dir->i_sb, block); - if (!*bh) - return NULL; - } else if (*offset > dir->i_sb->s_blocksize) { - ad = tmpad; - - remainder = dir->i_sb->s_blocksize - loffset; - memcpy((uint8_t *)ad, (*bh)->b_data + loffset, remainder); - - brelse(*bh); - block = udf_get_lb_pblock(dir->i_sb, fe_loc, ++*pos); - if (!block) - return NULL; - (*bh) = udf_tread(dir->i_sb, block); - if (!*bh) - return NULL; - - memcpy((uint8_t *)ad + remainder, (*bh)->b_data, - ad_size - remainder); - *offset = ad_size - remainder; - } - - return ad; -} -#endif - struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, struct udf_fileident_bh *fibh, struct fileIdentDesc *cfi, @@ -248,39 +195,6 @@ struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset) return fi; } -#if 0 -static struct extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset) -{ - struct extent_ad *ext; - struct fileEntry *fe; - uint8_t *ptr; - - if ((!buffer) || (!offset)) { - printk(KERN_ERR "udf: udf_get_fileextent() invalidparms\n"); - return NULL; - } - - fe = (struct fileEntry *)buffer; - - if (fe->descTag.tagIdent != cpu_to_le16(TAG_IDENT_FE)) { - udf_debug("0x%x != TAG_IDENT_FE\n", - le16_to_cpu(fe->descTag.tagIdent)); - return NULL; - } - - ptr = (uint8_t *)(fe->extendedAttr) + - le32_to_cpu(fe->lengthExtendedAttr); - - if ((*offset > 0) && (*offset < le32_to_cpu(fe->lengthAllocDescs))) - ptr += *offset; - - ext = (struct extent_ad *)ptr; - - *offset = *offset + sizeof(struct extent_ad); - return ext; -} -#endif - struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset, int inc) { diff --git a/fs/udf/file.c b/fs/udf/file.c index 7464305382b5..b80cbd78833c 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -193,9 +193,11 @@ int udf_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, static int udf_release_file(struct inode *inode, struct file *filp) { if (filp->f_mode & FMODE_WRITE) { + mutex_lock(&inode->i_mutex); lock_kernel(); udf_discard_prealloc(inode); unlock_kernel(); + mutex_unlock(&inode->i_mutex); } return 0; } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index e7533f785636..6d24c2c63f93 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -90,19 +90,16 @@ no_delete: } /* - * If we are going to release inode from memory, we discard preallocation and - * truncate last inode extent to proper length. We could use drop_inode() but - * it's called under inode_lock and thus we cannot mark inode dirty there. We - * use clear_inode() but we have to make sure to write inode as it's not written - * automatically. + * If we are going to release inode from memory, we truncate last inode extent + * to proper length. We could use drop_inode() but it's called under inode_lock + * and thus we cannot mark inode dirty there. We use clear_inode() but we have + * to make sure to write inode as it's not written automatically. */ void udf_clear_inode(struct inode *inode) { struct udf_inode_info *iinfo; if (!(inode->i_sb->s_flags & MS_RDONLY)) { lock_kernel(); - /* Discard preallocation for directories, symlinks, etc. */ - udf_discard_prealloc(inode); udf_truncate_tail_extent(inode); unlock_kernel(); write_inode_now(inode, 0); @@ -664,8 +661,12 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, udf_split_extents(inode, &c, offset, newblocknum, laarr, &endnum); #ifdef UDF_PREALLOCATE - /* preallocate blocks */ - udf_prealloc_extents(inode, c, lastblock, laarr, &endnum); + /* We preallocate blocks only for regular files. It also makes sense + * for directories but there's a problem when to drop the + * preallocation. We might use some delayed work for that but I feel + * it's overengineering for a filesystem like UDF. */ + if (S_ISREG(inode->i_mode)) + udf_prealloc_extents(inode, c, lastblock, laarr, &endnum); #endif /* merge any continuous blocks in laarr */ diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c index 1b88fd5df05d..43e24a3b8e10 100644 --- a/fs/udf/lowlevel.c +++ b/fs/udf/lowlevel.c @@ -36,14 +36,10 @@ unsigned int udf_get_last_session(struct super_block *sb) ms_info.addr_format = CDROM_LBA; i = ioctl_by_bdev(bdev, CDROMMULTISESSION, (unsigned long)&ms_info); -#define WE_OBEY_THE_WRITTEN_STANDARDS 1 - if (i == 0) { udf_debug("XA disk: %s, vol_desc_start=%d\n", (ms_info.xa_flag ? "yes" : "no"), ms_info.addr.lba); -#if WE_OBEY_THE_WRITTEN_STANDARDS if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */ -#endif vol_desc_start = ms_info.addr.lba; } else { udf_debug("CDROMMULTISESSION not supported: rc=%d\n", i); diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 6a29fa34c478..21dad8c608f9 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -943,7 +943,6 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, pc->componentType = 1; pc->lengthComponentIdent = 0; pc->componentFileVersionNum = 0; - pc += sizeof(struct pathComponent); elen += sizeof(struct pathComponent); } diff --git a/fs/udf/super.c b/fs/udf/super.c index 6832135159b6..9d1b8c2e6c45 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -1087,11 +1087,23 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) struct udf_inode_info *vati; uint32_t pos; struct virtualAllocationTable20 *vat20; + sector_t blocks = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; /* VAT file entry is in the last recorded block */ ino.partitionReferenceNum = type1_index; ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root; sbi->s_vat_inode = udf_iget(sb, &ino); + if (!sbi->s_vat_inode && + sbi->s_last_block != blocks - 1) { + printk(KERN_NOTICE "UDF-fs: Failed to read VAT inode from the" + " last recorded block (%lu), retrying with the last " + "block of the device (%lu).\n", + (unsigned long)sbi->s_last_block, + (unsigned long)blocks - 1); + ino.partitionReferenceNum = type1_index; + ino.logicalBlockNum = blocks - 1 - map->s_partition_root; + sbi->s_vat_inode = udf_iget(sb, &ino); + } if (!sbi->s_vat_inode) return 1; diff --git a/fs/xattr.c b/fs/xattr.c index 1c3d0af59ddf..6d4f6d3449fb 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -66,22 +66,28 @@ xattr_permission(struct inode *inode, const char *name, int mask) return inode_permission(inode, mask); } -int -vfs_setxattr(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags) +/** + * __vfs_setxattr_noperm - perform setxattr operation without performing + * permission checks. + * + * @dentry - object to perform setxattr on + * @name - xattr name to set + * @value - value to set @name to + * @size - size of @value + * @flags - flags to pass into filesystem operations + * + * returns the result of the internal setxattr or setsecurity operations. + * + * This function requires the caller to lock the inode's i_mutex before it + * is executed. It also assumes that the caller will make the appropriate + * permission checks. + */ +int __vfs_setxattr_noperm(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; - int error; - - error = xattr_permission(inode, name, MAY_WRITE); - if (error) - return error; + int error = -EOPNOTSUPP; - mutex_lock(&inode->i_mutex); - error = security_inode_setxattr(dentry, name, value, size, flags); - if (error) - goto out; - error = -EOPNOTSUPP; if (inode->i_op->setxattr) { error = inode->i_op->setxattr(dentry, name, value, size, flags); if (!error) { @@ -97,6 +103,29 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value, if (!error) fsnotify_xattr(dentry); } + + return error; +} + + +int +vfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) +{ + struct inode *inode = dentry->d_inode; + int error; + + error = xattr_permission(inode, name, MAY_WRITE); + if (error) + return error; + + mutex_lock(&inode->i_mutex); + error = security_inode_setxattr(dentry, name, value, size, flags); + if (error) + goto out; + + error = __vfs_setxattr_noperm(dentry, name, value, size, flags); + out: mutex_unlock(&inode->i_mutex); return error; diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 7ec89fc05b2b..aecf2519db76 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -1268,6 +1268,14 @@ xfs_vm_writepage( if (!page_has_buffers(page)) create_empty_buffers(page, 1 << inode->i_blkbits, 0); + + /* + * VM calculation for nr_to_write seems off. Bump it way + * up, this gets simple streaming writes zippy again. + * To be reviewed again after Jens' writeback changes. + */ + wbc->nr_to_write *= 4; + /* * Convert delayed allocate, unwritten or unmapped space * to real space and flush out to disk. diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 0c93c7ef3d18..965df1227d64 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -770,7 +770,7 @@ xfs_buf_associate_memory( bp->b_pages = NULL; bp->b_addr = mem; - rval = _xfs_buf_get_pages(bp, page_count, 0); + rval = _xfs_buf_get_pages(bp, page_count, XBF_DONT_BLOCK); if (rval) return rval; diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c index 0882d166239a..eafcc7c18706 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c @@ -619,7 +619,7 @@ xfs_file_compat_ioctl( case XFS_IOC_GETVERSION_32: cmd = _NATIVE_IOC(cmd, long); return xfs_file_ioctl(filp, cmd, p); - case XFS_IOC_SWAPEXT: { + case XFS_IOC_SWAPEXT_32: { struct xfs_swapext sxp; struct compat_xfs_swapext __user *sxu = arg; diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 58973bb46038..6c32f1d63d8c 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -485,14 +485,6 @@ xfs_vn_put_link( } STATIC int -xfs_vn_permission( - struct inode *inode, - int mask) -{ - return generic_permission(inode, mask, xfs_check_acl); -} - -STATIC int xfs_vn_getattr( struct vfsmount *mnt, struct dentry *dentry, @@ -680,8 +672,8 @@ xfs_vn_fiemap( else bm.bmv_length = BTOBB(length); - /* our formatter will tell xfs_getbmap when to stop. */ - bm.bmv_count = MAXEXTNUM; + /* We add one because in getbmap world count includes the header */ + bm.bmv_count = fieinfo->fi_extents_max + 1; bm.bmv_iflags = BMV_IF_PREALLOC; if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) bm.bmv_iflags |= BMV_IF_ATTRFORK; @@ -696,7 +688,7 @@ xfs_vn_fiemap( } static const struct inode_operations xfs_inode_operations = { - .permission = xfs_vn_permission, + .check_acl = xfs_check_acl, .truncate = xfs_vn_truncate, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, @@ -724,7 +716,7 @@ static const struct inode_operations xfs_dir_inode_operations = { .rmdir = xfs_vn_unlink, .mknod = xfs_vn_mknod, .rename = xfs_vn_rename, - .permission = xfs_vn_permission, + .check_acl = xfs_check_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, @@ -749,7 +741,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = { .rmdir = xfs_vn_unlink, .mknod = xfs_vn_mknod, .rename = xfs_vn_rename, - .permission = xfs_vn_permission, + .check_acl = xfs_check_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, @@ -762,7 +754,7 @@ static const struct inode_operations xfs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = xfs_vn_follow_link, .put_link = xfs_vn_put_link, - .permission = xfs_vn_permission, + .check_acl = xfs_check_acl, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 7078974a6eee..fde63a3c4ecc 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -817,7 +817,8 @@ write_retry: xfs_iunlock(xip, iolock); if (need_i_mutex) mutex_unlock(&inode->i_mutex); - error2 = sync_page_range(inode, mapping, pos, ret); + error2 = filemap_write_and_wait_range(mapping, pos, + pos + ret - 1); if (!error) error = error2; if (need_i_mutex) diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index b619d6b8ca43..98ef624d9baf 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -708,6 +708,16 @@ xfs_reclaim_inode( return 0; } +void +__xfs_inode_set_reclaim_tag( + struct xfs_perag *pag, + struct xfs_inode *ip) +{ + radix_tree_tag_set(&pag->pag_ici_root, + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), + XFS_ICI_RECLAIM_TAG); +} + /* * We set the inode flag atomically with the radix tree tag. * Once we get tag lookups on the radix tree, this inode flag @@ -722,8 +732,7 @@ xfs_inode_set_reclaim_tag( read_lock(&pag->pag_ici_lock); spin_lock(&ip->i_flags_lock); - radix_tree_tag_set(&pag->pag_ici_root, - XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); + __xfs_inode_set_reclaim_tag(pag, ip); __xfs_iflags_set(ip, XFS_IRECLAIMABLE); spin_unlock(&ip->i_flags_lock); read_unlock(&pag->pag_ici_lock); diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h index 2a10301c99c7..59120602588a 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/linux-2.6/xfs_sync.h @@ -48,6 +48,7 @@ int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode); int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); +void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip); void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip); void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, struct xfs_inode *ip); diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index db15feb906ff..4ece1906bd41 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -2010,7 +2010,9 @@ xfs_attr_rmtval_get(xfs_da_args_t *args) dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno, - blkcnt, XFS_BUF_LOCK, &bp); + blkcnt, + XFS_BUF_LOCK | XBF_DONT_BLOCK, + &bp); if (error) return(error); @@ -2141,8 +2143,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); - bp = xfs_buf_get_flags(mp->m_ddev_targp, dblkno, - blkcnt, XFS_BUF_LOCK); + bp = xfs_buf_get_flags(mp->m_ddev_targp, dblkno, blkcnt, + XFS_BUF_LOCK | XBF_DONT_BLOCK); ASSERT(bp); ASSERT(!XFS_BUF_GETERROR(bp)); diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 7928b9983c1d..8ee5b5a76a2a 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -6009,7 +6009,7 @@ xfs_getbmap( */ error = ENOMEM; subnex = 16; - map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL); + map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS); if (!map) goto out_unlock_ilock; diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index e9df99574829..26717388acf5 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -120,8 +120,8 @@ xfs_btree_check_sblock( XFS_RANDOM_BTREE_CHECK_SBLOCK))) { if (bp) xfs_buftrace("SBTREE ERROR", bp); - XFS_ERROR_REPORT("xfs_btree_check_sblock", XFS_ERRLEVEL_LOW, - cur->bc_mp); + XFS_CORRUPTION_ERROR("xfs_btree_check_sblock", + XFS_ERRLEVEL_LOW, cur->bc_mp, block); return XFS_ERROR(EFSCORRUPTED); } return 0; diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 9ff6e57a5075..2847bbc1c534 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -2201,7 +2201,7 @@ kmem_zone_t *xfs_dabuf_zone; /* dabuf zone */ xfs_da_state_t * xfs_da_state_alloc(void) { - return kmem_zone_zalloc(xfs_da_state_zone, KM_SLEEP); + return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS); } /* @@ -2261,9 +2261,9 @@ xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra) int off; if (nbuf == 1) - dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_SLEEP); + dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_NOFS); else - dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_SLEEP); + dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_NOFS); dabuf->dirty = 0; #ifdef XFS_DABUF_DEBUG dabuf->ra = ra; diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index c657bec6d951..bb1d58eb3982 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -256,7 +256,7 @@ xfs_dir_cilookup_result( !(args->op_flags & XFS_DA_OP_CILOOKUP)) return EEXIST; - args->value = kmem_alloc(len, KM_MAYFAIL); + args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL); if (!args->value) return ENOMEM; diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index cbd451bb4848..2d0b3e1da9e6 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -167,17 +167,25 @@ xfs_growfs_data_private( new = nb - mp->m_sb.sb_dblocks; oagcount = mp->m_sb.sb_agcount; if (nagcount > oagcount) { + void *new_perag, *old_perag; + xfs_filestream_flush(mp); + + new_perag = kmem_zalloc(sizeof(xfs_perag_t) * nagcount, + KM_MAYFAIL); + if (!new_perag) + return XFS_ERROR(ENOMEM); + down_write(&mp->m_peraglock); - mp->m_perag = kmem_realloc(mp->m_perag, - sizeof(xfs_perag_t) * nagcount, - sizeof(xfs_perag_t) * oagcount, - KM_SLEEP); - memset(&mp->m_perag[oagcount], 0, - (nagcount - oagcount) * sizeof(xfs_perag_t)); + memcpy(new_perag, mp->m_perag, sizeof(xfs_perag_t) * oagcount); + old_perag = mp->m_perag; + mp->m_perag = new_perag; + mp->m_flags |= XFS_MOUNT_32BITINODES; nagimax = xfs_initialize_perag(mp, nagcount); up_write(&mp->m_peraglock); + + kmem_free(old_perag); } tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS); tp->t_flags |= XFS_TRANS_RESERVE; diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 5fcec6f020a7..ecbf8b4d2e2e 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -64,6 +64,10 @@ xfs_inode_alloc( ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP); if (!ip) return NULL; + if (inode_init_always(mp->m_super, VFS_I(ip))) { + kmem_zone_free(xfs_inode_zone, ip); + return NULL; + } ASSERT(atomic_read(&ip->i_iocount) == 0); ASSERT(atomic_read(&ip->i_pincount) == 0); @@ -105,17 +109,6 @@ xfs_inode_alloc( #ifdef XFS_DIR2_TRACE ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS); #endif - /* - * Now initialise the VFS inode. We do this after the xfs_inode - * initialisation as internal failures will result in ->destroy_inode - * being called and that will pass down through the reclaim path and - * free the XFS inode. This path requires the XFS inode to already be - * initialised. Hence if this call fails, the xfs_inode has already - * been freed and we should not reference it at all in the error - * handling. - */ - if (!inode_init_always(mp->m_super, VFS_I(ip))) - return NULL; /* prevent anyone from using this yet */ VFS_I(ip)->i_state = I_NEW|I_LOCK; @@ -123,6 +116,71 @@ xfs_inode_alloc( return ip; } +STATIC void +xfs_inode_free( + struct xfs_inode *ip) +{ + switch (ip->i_d.di_mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: + xfs_idestroy_fork(ip, XFS_DATA_FORK); + break; + } + + if (ip->i_afp) + xfs_idestroy_fork(ip, XFS_ATTR_FORK); + +#ifdef XFS_INODE_TRACE + ktrace_free(ip->i_trace); +#endif +#ifdef XFS_BMAP_TRACE + ktrace_free(ip->i_xtrace); +#endif +#ifdef XFS_BTREE_TRACE + ktrace_free(ip->i_btrace); +#endif +#ifdef XFS_RW_TRACE + ktrace_free(ip->i_rwtrace); +#endif +#ifdef XFS_ILOCK_TRACE + ktrace_free(ip->i_lock_trace); +#endif +#ifdef XFS_DIR2_TRACE + ktrace_free(ip->i_dir_trace); +#endif + + if (ip->i_itemp) { + /* + * Only if we are shutting down the fs will we see an + * inode still in the AIL. If it is there, we should remove + * it to prevent a use-after-free from occurring. + */ + xfs_log_item_t *lip = &ip->i_itemp->ili_item; + struct xfs_ail *ailp = lip->li_ailp; + + ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) || + XFS_FORCED_SHUTDOWN(ip->i_mount)); + if (lip->li_flags & XFS_LI_IN_AIL) { + spin_lock(&ailp->xa_lock); + if (lip->li_flags & XFS_LI_IN_AIL) + xfs_trans_ail_delete(ailp, lip); + else + spin_unlock(&ailp->xa_lock); + } + xfs_inode_item_destroy(ip); + ip->i_itemp = NULL; + } + + /* asserts to verify all state is correct here */ + ASSERT(atomic_read(&ip->i_iocount) == 0); + ASSERT(atomic_read(&ip->i_pincount) == 0); + ASSERT(!spin_is_locked(&ip->i_flags_lock)); + ASSERT(completion_done(&ip->i_flush)); + + kmem_zone_free(xfs_inode_zone, ip); +} + /* * Check the validity of the inode we just found it the cache */ @@ -133,80 +191,82 @@ xfs_iget_cache_hit( int flags, int lock_flags) __releases(pag->pag_ici_lock) { + struct inode *inode = VFS_I(ip); struct xfs_mount *mp = ip->i_mount; - int error = EAGAIN; + int error; + + spin_lock(&ip->i_flags_lock); /* - * If INEW is set this inode is being set up - * If IRECLAIM is set this inode is being torn down - * Pause and try again. + * If we are racing with another cache hit that is currently + * instantiating this inode or currently recycling it out of + * reclaimabe state, wait for the initialisation to complete + * before continuing. + * + * XXX(hch): eventually we should do something equivalent to + * wait_on_inode to wait for these flags to be cleared + * instead of polling for it. */ - if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) { + if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { XFS_STATS_INC(xs_ig_frecycle); + error = EAGAIN; goto out_error; } - /* If IRECLAIMABLE is set, we've torn down the vfs inode part */ - if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) { - - /* - * If lookup is racing with unlink, then we should return an - * error immediately so we don't remove it from the reclaim - * list and potentially leak the inode. - */ - if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { - error = ENOENT; - goto out_error; - } + /* + * If lookup is racing with unlink return an error immediately. + */ + if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { + error = ENOENT; + goto out_error; + } + /* + * If IRECLAIMABLE is set, we've torn down the VFS inode already. + * Need to carefully get it back into useable state. + */ + if (ip->i_flags & XFS_IRECLAIMABLE) { xfs_itrace_exit_tag(ip, "xfs_iget.alloc"); /* - * We need to re-initialise the VFS inode as it has been - * 'freed' by the VFS. Do this here so we can deal with - * errors cleanly, then tag it so it can be set up correctly - * later. + * We need to set XFS_INEW atomically with clearing the + * reclaimable tag so that we do have an indicator of the + * inode still being initialized. */ - if (!inode_init_always(mp->m_super, VFS_I(ip))) { - error = ENOMEM; - goto out_error; - } + ip->i_flags |= XFS_INEW; + ip->i_flags &= ~XFS_IRECLAIMABLE; + __xfs_inode_clear_reclaim_tag(mp, pag, ip); - /* - * We must set the XFS_INEW flag before clearing the - * XFS_IRECLAIMABLE flag so that if a racing lookup does - * not find the XFS_IRECLAIMABLE above but has the igrab() - * below succeed we can safely check XFS_INEW to detect - * that this inode is still being initialised. - */ - xfs_iflags_set(ip, XFS_INEW); - xfs_iflags_clear(ip, XFS_IRECLAIMABLE); + spin_unlock(&ip->i_flags_lock); + read_unlock(&pag->pag_ici_lock); - /* clear the radix tree reclaim flag as well. */ - __xfs_inode_clear_reclaim_tag(mp, pag, ip); - } else if (!igrab(VFS_I(ip))) { + error = -inode_init_always(mp->m_super, inode); + if (error) { + /* + * Re-initializing the inode failed, and we are in deep + * trouble. Try to re-add it to the reclaim list. + */ + read_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + + ip->i_flags &= ~XFS_INEW; + ip->i_flags |= XFS_IRECLAIMABLE; + __xfs_inode_set_reclaim_tag(pag, ip); + goto out_error; + } + inode->i_state = I_LOCK|I_NEW; + } else { /* If the VFS inode is being torn down, pause and try again. */ - XFS_STATS_INC(xs_ig_frecycle); - goto out_error; - } else if (xfs_iflags_test(ip, XFS_INEW)) { - /* - * We are racing with another cache hit that is - * currently recycling this inode out of the XFS_IRECLAIMABLE - * state. Wait for the initialisation to complete before - * continuing. - */ - wait_on_inode(VFS_I(ip)); - } + if (!igrab(inode)) { + error = EAGAIN; + goto out_error; + } - if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { - error = ENOENT; - iput(VFS_I(ip)); - goto out_error; + /* We've got a live one. */ + spin_unlock(&ip->i_flags_lock); + read_unlock(&pag->pag_ici_lock); } - /* We've got a live one. */ - read_unlock(&pag->pag_ici_lock); - if (lock_flags != 0) xfs_ilock(ip, lock_flags); @@ -216,6 +276,7 @@ xfs_iget_cache_hit( return 0; out_error: + spin_unlock(&ip->i_flags_lock); read_unlock(&pag->pag_ici_lock); return error; } @@ -299,7 +360,8 @@ out_preload_end: if (lock_flags) xfs_iunlock(ip, lock_flags); out_destroy: - xfs_destroy_inode(ip); + __destroy_inode(VFS_I(ip)); + xfs_inode_free(ip); return error; } @@ -504,62 +566,7 @@ xfs_ireclaim( xfs_qm_dqdetach(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - switch (ip->i_d.di_mode & S_IFMT) { - case S_IFREG: - case S_IFDIR: - case S_IFLNK: - xfs_idestroy_fork(ip, XFS_DATA_FORK); - break; - } - - if (ip->i_afp) - xfs_idestroy_fork(ip, XFS_ATTR_FORK); - -#ifdef XFS_INODE_TRACE - ktrace_free(ip->i_trace); -#endif -#ifdef XFS_BMAP_TRACE - ktrace_free(ip->i_xtrace); -#endif -#ifdef XFS_BTREE_TRACE - ktrace_free(ip->i_btrace); -#endif -#ifdef XFS_RW_TRACE - ktrace_free(ip->i_rwtrace); -#endif -#ifdef XFS_ILOCK_TRACE - ktrace_free(ip->i_lock_trace); -#endif -#ifdef XFS_DIR2_TRACE - ktrace_free(ip->i_dir_trace); -#endif - if (ip->i_itemp) { - /* - * Only if we are shutting down the fs will we see an - * inode still in the AIL. If it is there, we should remove - * it to prevent a use-after-free from occurring. - */ - xfs_log_item_t *lip = &ip->i_itemp->ili_item; - struct xfs_ail *ailp = lip->li_ailp; - - ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) || - XFS_FORCED_SHUTDOWN(ip->i_mount)); - if (lip->li_flags & XFS_LI_IN_AIL) { - spin_lock(&ailp->xa_lock); - if (lip->li_flags & XFS_LI_IN_AIL) - xfs_trans_ail_delete(ailp, lip); - else - spin_unlock(&ailp->xa_lock); - } - xfs_inode_item_destroy(ip); - ip->i_itemp = NULL; - } - /* asserts to verify all state is correct here */ - ASSERT(atomic_read(&ip->i_iocount) == 0); - ASSERT(atomic_read(&ip->i_pincount) == 0); - ASSERT(!spin_is_locked(&ip->i_flags_lock)); - ASSERT(completion_done(&ip->i_flush)); - kmem_zone_free(xfs_inode_zone, ip); + xfs_inode_free(ip); } /* diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 1f22d65fed0a..da428b3fe0f5 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -343,6 +343,16 @@ xfs_iformat( return XFS_ERROR(EFSCORRUPTED); } + if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) && + !ip->i_mount->m_rtdev_targp)) { + xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, + "corrupt dinode %Lu, has realtime flag set.", + ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_iformat(realtime)", + XFS_ERRLEVEL_LOW, ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + switch (ip->i_d.di_mode & S_IFMT) { case S_IFIFO: case S_IFCHR: diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 1804f866a71d..65f24a3cc992 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -310,23 +310,6 @@ static inline struct inode *VFS_I(struct xfs_inode *ip) } /* - * Get rid of a partially initialized inode. - * - * We have to go through destroy_inode to make sure allocations - * from init_inode_always like the security data are undone. - * - * We mark the inode bad so that it takes the short cut in - * the reclaim path instead of going through the flush path - * which doesn't make sense for an inode that has never seen the - * light of day. - */ -static inline void xfs_destroy_inode(struct xfs_inode *ip) -{ - make_bad_inode(VFS_I(ip)); - return destroy_inode(VFS_I(ip)); -} - -/* * i_flags helper functions */ static inline void diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 3750f04ede0b..9dbdff3ea484 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3180,7 +3180,7 @@ try_again: STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog) { - ASSERT(spin_is_locked(&log->l_icloglock)); + assert_spin_locked(&log->l_icloglock); if (iclog->ic_state == XLOG_STATE_ACTIVE) { xlog_state_switch_iclogs(log, iclog, 0); diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index c4eca5ed5dab..492d75bae2bf 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -538,7 +538,9 @@ xfs_readlink_bmap( d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); - bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0); + bp = xfs_buf_read_flags(mp->m_ddev_targp, d, BTOBB(byte_cnt), + XBF_LOCK | XBF_MAPPED | + XBF_DONT_BLOCK); error = XFS_BUF_GETERROR(bp); if (error) { xfs_ioerror_alert("xfs_readlink", |