diff options
Diffstat (limited to 'fs')
304 files changed, 13830 insertions, 6423 deletions
diff --git a/fs/Makefile b/fs/Makefile index 1148c555c4d3..98be354fdb61 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -37,7 +37,7 @@ obj-$(CONFIG_FS_DAX) += dax.o obj-$(CONFIG_FS_ENCRYPTION) += crypto/ obj-$(CONFIG_FS_VERITY) += verity/ obj-$(CONFIG_FILE_LOCKING) += locks.o -obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o +obj-$(CONFIG_COMPAT) += compat.o obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o obj-$(CONFIG_BINFMT_EM86) += binfmt_em86.o obj-$(CONFIG_BINFMT_MISC) += binfmt_misc.o diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h index b7e844d2f321..699c4fa8b78b 100644 --- a/fs/adfs/adfs.h +++ b/fs/adfs/adfs.h @@ -26,14 +26,13 @@ static inline u16 adfs_filetype(u32 loadaddr) #define ADFS_NDA_PUBLIC_READ (1 << 5) #define ADFS_NDA_PUBLIC_WRITE (1 << 6) -#include "dir_f.h" - /* * adfs file system inode data in memory */ struct adfs_inode_info { loff_t mmu_private; __u32 parent_id; /* parent indirect disc address */ + __u32 indaddr; /* object indirect disc address */ __u32 loadaddr; /* RISC OS load address */ __u32 execaddr; /* RISC OS exec address */ unsigned int attr; /* RISC OS permissions */ @@ -93,15 +92,19 @@ struct adfs_dir { int nr_buffers; struct buffer_head *bh[4]; - - /* big directories need allocated buffers */ - struct buffer_head **bh_fplus; + struct buffer_head **bhs; unsigned int pos; __u32 parent_id; - struct adfs_dirheader dirhead; - union adfs_dirtail dirtail; + union { + struct adfs_dirheader *dirhead; + struct adfs_bigdirheader *bighead; + }; + union { + struct adfs_newdirtail *newtail; + struct adfs_bigdirtail *bigtail; + }; }; /* @@ -122,13 +125,13 @@ struct object_info { struct adfs_dir_ops { int (*read)(struct super_block *sb, unsigned int indaddr, unsigned int size, struct adfs_dir *dir); + int (*iterate)(struct adfs_dir *dir, struct dir_context *ctx); int (*setpos)(struct adfs_dir *dir, unsigned int fpos); int (*getnext)(struct adfs_dir *dir, struct object_info *obj); int (*update)(struct adfs_dir *dir, struct object_info *obj); int (*create)(struct adfs_dir *dir, struct object_info *obj); int (*remove)(struct adfs_dir *dir, struct object_info *obj); - int (*sync)(struct adfs_dir *dir); - void (*free)(struct adfs_dir *dir); + int (*commit)(struct adfs_dir *dir); }; struct adfs_discmap { @@ -145,7 +148,9 @@ int adfs_notify_change(struct dentry *dentry, struct iattr *attr); /* map.c */ int adfs_map_lookup(struct super_block *sb, u32 frag_id, unsigned int offset); -extern unsigned int adfs_map_free(struct super_block *sb); +void adfs_map_statfs(struct super_block *sb, struct kstatfs *buf); +struct adfs_discmap *adfs_read_map(struct super_block *sb, struct adfs_discrecord *dr); +void adfs_free_map(struct super_block *sb); /* Misc */ __printf(3, 4) @@ -167,6 +172,13 @@ extern const struct dentry_operations adfs_dentry_operations; extern const struct adfs_dir_ops adfs_f_dir_ops; extern const struct adfs_dir_ops adfs_fplus_dir_ops; +int adfs_dir_copyfrom(void *dst, struct adfs_dir *dir, unsigned int offset, + size_t len); +int adfs_dir_copyto(struct adfs_dir *dir, unsigned int offset, const void *src, + size_t len); +void adfs_dir_relse(struct adfs_dir *dir); +int adfs_dir_read_buffers(struct super_block *sb, u32 indaddr, + unsigned int size, struct adfs_dir *dir); void adfs_object_fixup(struct adfs_dir *dir, struct object_info *obj); extern int adfs_dir_update(struct super_block *sb, struct object_info *obj, int wait); diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index a54c53244992..77fbd196008f 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -6,12 +6,196 @@ * * Common directory handling for ADFS */ +#include <linux/slab.h> #include "adfs.h" /* * For future. This should probably be per-directory. */ -static DEFINE_RWLOCK(adfs_dir_lock); +static DECLARE_RWSEM(adfs_dir_rwsem); + +int adfs_dir_copyfrom(void *dst, struct adfs_dir *dir, unsigned int offset, + size_t len) +{ + struct super_block *sb = dir->sb; + unsigned int index, remain; + + index = offset >> sb->s_blocksize_bits; + offset &= sb->s_blocksize - 1; + remain = sb->s_blocksize - offset; + if (index + (remain < len) >= dir->nr_buffers) + return -EINVAL; + + if (remain < len) { + memcpy(dst, dir->bhs[index]->b_data + offset, remain); + dst += remain; + len -= remain; + index += 1; + offset = 0; + } + + memcpy(dst, dir->bhs[index]->b_data + offset, len); + + return 0; +} + +int adfs_dir_copyto(struct adfs_dir *dir, unsigned int offset, const void *src, + size_t len) +{ + struct super_block *sb = dir->sb; + unsigned int index, remain; + + index = offset >> sb->s_blocksize_bits; + offset &= sb->s_blocksize - 1; + remain = sb->s_blocksize - offset; + if (index + (remain < len) >= dir->nr_buffers) + return -EINVAL; + + if (remain < len) { + memcpy(dir->bhs[index]->b_data + offset, src, remain); + src += remain; + len -= remain; + index += 1; + offset = 0; + } + + memcpy(dir->bhs[index]->b_data + offset, src, len); + + return 0; +} + +static void __adfs_dir_cleanup(struct adfs_dir *dir) +{ + dir->nr_buffers = 0; + + if (dir->bhs != dir->bh) + kfree(dir->bhs); + dir->bhs = NULL; + dir->sb = NULL; +} + +void adfs_dir_relse(struct adfs_dir *dir) +{ + unsigned int i; + + for (i = 0; i < dir->nr_buffers; i++) + brelse(dir->bhs[i]); + + __adfs_dir_cleanup(dir); +} + +static void adfs_dir_forget(struct adfs_dir *dir) +{ + unsigned int i; + + for (i = 0; i < dir->nr_buffers; i++) + bforget(dir->bhs[i]); + + __adfs_dir_cleanup(dir); +} + +int adfs_dir_read_buffers(struct super_block *sb, u32 indaddr, + unsigned int size, struct adfs_dir *dir) +{ + struct buffer_head **bhs; + unsigned int i, num; + int block; + + num = ALIGN(size, sb->s_blocksize) >> sb->s_blocksize_bits; + if (num > ARRAY_SIZE(dir->bh)) { + /* We only allow one extension */ + if (dir->bhs != dir->bh) + return -EINVAL; + + bhs = kcalloc(num, sizeof(*bhs), GFP_KERNEL); + if (!bhs) + return -ENOMEM; + + if (dir->nr_buffers) + memcpy(bhs, dir->bhs, dir->nr_buffers * sizeof(*bhs)); + + dir->bhs = bhs; + } + + for (i = dir->nr_buffers; i < num; i++) { + block = __adfs_block_map(sb, indaddr, i); + if (!block) { + adfs_error(sb, "dir %06x has a hole at offset %u", + indaddr, i); + goto error; + } + + dir->bhs[i] = sb_bread(sb, block); + if (!dir->bhs[i]) { + adfs_error(sb, + "dir %06x failed read at offset %u, mapped block 0x%08x", + indaddr, i, block); + goto error; + } + + dir->nr_buffers++; + } + return 0; + +error: + adfs_dir_relse(dir); + + return -EIO; +} + +static int adfs_dir_read(struct super_block *sb, u32 indaddr, + unsigned int size, struct adfs_dir *dir) +{ + dir->sb = sb; + dir->bhs = dir->bh; + dir->nr_buffers = 0; + + return ADFS_SB(sb)->s_dir->read(sb, indaddr, size, dir); +} + +static int adfs_dir_read_inode(struct super_block *sb, struct inode *inode, + struct adfs_dir *dir) +{ + int ret; + + ret = adfs_dir_read(sb, ADFS_I(inode)->indaddr, inode->i_size, dir); + if (ret) + return ret; + + if (ADFS_I(inode)->parent_id != dir->parent_id) { + adfs_error(sb, + "parent directory id changed under me! (%06x but got %06x)\n", + ADFS_I(inode)->parent_id, dir->parent_id); + adfs_dir_relse(dir); + ret = -EIO; + } + + return ret; +} + +static void adfs_dir_mark_dirty(struct adfs_dir *dir) +{ + unsigned int i; + + /* Mark the buffers dirty */ + for (i = 0; i < dir->nr_buffers; i++) + mark_buffer_dirty(dir->bhs[i]); +} + +static int adfs_dir_sync(struct adfs_dir *dir) +{ + int err = 0; + int i; + + for (i = dir->nr_buffers - 1; i >= 0; i--) { + struct buffer_head *bh = dir->bhs[i]; + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) + err = -EIO; + } + + return err; +} void adfs_object_fixup(struct adfs_dir *dir, struct object_info *obj) { @@ -51,87 +235,90 @@ void adfs_object_fixup(struct adfs_dir *dir, struct object_info *obj) } } -static int -adfs_readdir(struct file *file, struct dir_context *ctx) +static int adfs_iterate(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; const struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir; - struct object_info obj; struct adfs_dir dir; - int ret = 0; - - if (ctx->pos >> 32) - return 0; + int ret; - ret = ops->read(sb, inode->i_ino, inode->i_size, &dir); + down_read(&adfs_dir_rwsem); + ret = adfs_dir_read_inode(sb, inode, &dir); if (ret) - return ret; + goto unlock; if (ctx->pos == 0) { if (!dir_emit_dot(file, ctx)) - goto free_out; + goto unlock_relse; ctx->pos = 1; } if (ctx->pos == 1) { if (!dir_emit(ctx, "..", 2, dir.parent_id, DT_DIR)) - goto free_out; + goto unlock_relse; ctx->pos = 2; } - read_lock(&adfs_dir_lock); + ret = ops->iterate(&dir, ctx); - ret = ops->setpos(&dir, ctx->pos - 2); - if (ret) - goto unlock_out; - while (ops->getnext(&dir, &obj) == 0) { - if (!dir_emit(ctx, obj.name, obj.name_len, - obj.indaddr, DT_UNKNOWN)) - break; - ctx->pos++; - } - -unlock_out: - read_unlock(&adfs_dir_lock); +unlock_relse: + up_read(&adfs_dir_rwsem); + adfs_dir_relse(&dir); + return ret; -free_out: - ops->free(&dir); +unlock: + up_read(&adfs_dir_rwsem); return ret; } int adfs_dir_update(struct super_block *sb, struct object_info *obj, int wait) { - int ret = -EINVAL; -#ifdef CONFIG_ADFS_FS_RW const struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir; struct adfs_dir dir; + int ret; - printk(KERN_INFO "adfs_dir_update: object %06x in dir %06x\n", - obj->indaddr, obj->parent_id); + if (!IS_ENABLED(CONFIG_ADFS_FS_RW)) + return -EINVAL; - if (!ops->update) { - ret = -EINVAL; - goto out; - } + if (!ops->update) + return -EINVAL; - ret = ops->read(sb, obj->parent_id, 0, &dir); + down_write(&adfs_dir_rwsem); + ret = adfs_dir_read(sb, obj->parent_id, 0, &dir); if (ret) - goto out; + goto unlock; - write_lock(&adfs_dir_lock); ret = ops->update(&dir, obj); - write_unlock(&adfs_dir_lock); + if (ret) + goto forget; - if (wait) { - int err = ops->sync(&dir); - if (!ret) - ret = err; - } + ret = ops->commit(&dir); + if (ret) + goto forget; + up_write(&adfs_dir_rwsem); + + adfs_dir_mark_dirty(&dir); + + if (wait) + ret = adfs_dir_sync(&dir); + + adfs_dir_relse(&dir); + return ret; + + /* + * If the updated failed because the entry wasn't found, we can + * just release the buffers. If it was any other error, forget + * the dirtied buffers so they aren't written back to the media. + */ +forget: + if (ret == -ENOENT) + adfs_dir_relse(&dir); + else + adfs_dir_forget(&dir); +unlock: + up_write(&adfs_dir_rwsem); - ops->free(&dir); -out: -#endif return ret; } @@ -167,25 +354,14 @@ static int adfs_dir_lookup_byname(struct inode *inode, const struct qstr *qstr, u32 name_len; int ret; - ret = ops->read(sb, inode->i_ino, inode->i_size, &dir); + down_read(&adfs_dir_rwsem); + ret = adfs_dir_read_inode(sb, inode, &dir); if (ret) - goto out; - - if (ADFS_I(inode)->parent_id != dir.parent_id) { - adfs_error(sb, - "parent directory changed under me! (%06x but got %06x)\n", - ADFS_I(inode)->parent_id, dir.parent_id); - ret = -EIO; - goto free_out; - } - - obj->parent_id = inode->i_ino; - - read_lock(&adfs_dir_lock); + goto unlock; ret = ops->setpos(&dir, 0); if (ret) - goto unlock_out; + goto unlock_relse; ret = -ENOENT; name = qstr->name; @@ -196,20 +372,22 @@ static int adfs_dir_lookup_byname(struct inode *inode, const struct qstr *qstr, break; } } + obj->parent_id = ADFS_I(inode)->indaddr; -unlock_out: - read_unlock(&adfs_dir_lock); +unlock_relse: + up_read(&adfs_dir_rwsem); + adfs_dir_relse(&dir); + return ret; -free_out: - ops->free(&dir); -out: +unlock: + up_read(&adfs_dir_rwsem); return ret; } const struct file_operations adfs_dir_operations = { .read = generic_read_dir, .llseek = generic_file_llseek, - .iterate = adfs_readdir, + .iterate_shared = adfs_iterate, .fsync = generic_file_fsync, }; diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c index c1a950c7400a..30d526fecc3f 100644 --- a/fs/adfs/dir_f.c +++ b/fs/adfs/dir_f.c @@ -9,8 +9,6 @@ #include "adfs.h" #include "dir_f.h" -static void adfs_f_free(struct adfs_dir *dir); - /* * Read an (unaligned) value of length 1..4 bytes */ @@ -60,7 +58,7 @@ static inline void adfs_writeval(unsigned char *p, int len, unsigned int val) #define bufoff(_bh,_idx) \ ({ int _buf = _idx >> blocksize_bits; \ int _off = _idx - (_buf << blocksize_bits);\ - (u8 *)(_bh[_buf]->b_data + _off); \ + (void *)(_bh[_buf]->b_data + _off); \ }) /* @@ -123,65 +121,49 @@ adfs_dir_checkbyte(const struct adfs_dir *dir) return (dircheck ^ (dircheck >> 8) ^ (dircheck >> 16) ^ (dircheck >> 24)) & 0xff; } -/* Read and check that a directory is valid */ -static int adfs_dir_read(struct super_block *sb, u32 indaddr, - unsigned int size, struct adfs_dir *dir) +static int adfs_f_validate(struct adfs_dir *dir) { - const unsigned int blocksize_bits = sb->s_blocksize_bits; - int blk = 0; - - /* - * Directories which are not a multiple of 2048 bytes - * are considered bad v2 [3.6] - */ - if (size & 2047) - goto bad_dir; - - size >>= blocksize_bits; - - dir->nr_buffers = 0; - dir->sb = sb; - - for (blk = 0; blk < size; blk++) { - int phys; + struct adfs_dirheader *head = dir->dirhead; + struct adfs_newdirtail *tail = dir->newtail; + + if (head->startmasseq != tail->endmasseq || + tail->dirlastmask || tail->reserved[0] || tail->reserved[1] || + (memcmp(&head->startname, "Nick", 4) && + memcmp(&head->startname, "Hugo", 4)) || + memcmp(&head->startname, &tail->endname, 4) || + adfs_dir_checkbyte(dir) != tail->dircheckbyte) + return -EIO; - phys = __adfs_block_map(sb, indaddr, blk); - if (!phys) { - adfs_error(sb, "dir %06x has a hole at offset %d", - indaddr, blk); - goto release_buffers; - } + return 0; +} - dir->bh[blk] = sb_bread(sb, phys); - if (!dir->bh[blk]) - goto release_buffers; - } +/* Read and check that a directory is valid */ +static int adfs_f_read(struct super_block *sb, u32 indaddr, unsigned int size, + struct adfs_dir *dir) +{ + const unsigned int blocksize_bits = sb->s_blocksize_bits; + int ret; - memcpy(&dir->dirhead, bufoff(dir->bh, 0), sizeof(dir->dirhead)); - memcpy(&dir->dirtail, bufoff(dir->bh, 2007), sizeof(dir->dirtail)); + if (size && size != ADFS_NEWDIR_SIZE) + return -EIO; - if (dir->dirhead.startmasseq != dir->dirtail.new.endmasseq || - memcmp(&dir->dirhead.startname, &dir->dirtail.new.endname, 4)) - goto bad_dir; + ret = adfs_dir_read_buffers(sb, indaddr, ADFS_NEWDIR_SIZE, dir); + if (ret) + return ret; - if (memcmp(&dir->dirhead.startname, "Nick", 4) && - memcmp(&dir->dirhead.startname, "Hugo", 4)) - goto bad_dir; + dir->dirhead = bufoff(dir->bh, 0); + dir->newtail = bufoff(dir->bh, 2007); - if (adfs_dir_checkbyte(dir) != dir->dirtail.new.dircheckbyte) + if (adfs_f_validate(dir)) goto bad_dir; - dir->nr_buffers = blk; + dir->parent_id = adfs_readval(dir->newtail->dirparent, 3); return 0; bad_dir: adfs_error(sb, "dir %06x is corrupted", indaddr); -release_buffers: - for (blk -= 1; blk >= 0; blk -= 1) - brelse(dir->bh[blk]); - - dir->sb = NULL; + adfs_dir_relse(dir); return -EIO; } @@ -232,24 +214,12 @@ adfs_obj2dir(struct adfs_direntry *de, struct object_info *obj) static int __adfs_dir_get(struct adfs_dir *dir, int pos, struct object_info *obj) { - struct super_block *sb = dir->sb; struct adfs_direntry de; - int thissize, buffer, offset; - - buffer = pos >> sb->s_blocksize_bits; - - if (buffer > dir->nr_buffers) - return -EINVAL; - - offset = pos & (sb->s_blocksize - 1); - thissize = sb->s_blocksize - offset; - if (thissize > 26) - thissize = 26; + int ret; - memcpy(&de, dir->bh[buffer]->b_data + offset, thissize); - if (thissize != 26) - memcpy(((char *)&de) + thissize, dir->bh[buffer + 1]->b_data, - 26 - thissize); + ret = adfs_dir_copyfrom(&de, dir, pos, 26); + if (ret) + return ret; if (!de.dirobname[0]) return -ENOENT; @@ -260,89 +230,6 @@ __adfs_dir_get(struct adfs_dir *dir, int pos, struct object_info *obj) } static int -__adfs_dir_put(struct adfs_dir *dir, int pos, struct object_info *obj) -{ - struct super_block *sb = dir->sb; - struct adfs_direntry de; - int thissize, buffer, offset; - - buffer = pos >> sb->s_blocksize_bits; - - if (buffer > dir->nr_buffers) - return -EINVAL; - - offset = pos & (sb->s_blocksize - 1); - thissize = sb->s_blocksize - offset; - if (thissize > 26) - thissize = 26; - - /* - * Get the entry in total - */ - memcpy(&de, dir->bh[buffer]->b_data + offset, thissize); - if (thissize != 26) - memcpy(((char *)&de) + thissize, dir->bh[buffer + 1]->b_data, - 26 - thissize); - - /* - * update it - */ - adfs_obj2dir(&de, obj); - - /* - * Put the new entry back - */ - memcpy(dir->bh[buffer]->b_data + offset, &de, thissize); - if (thissize != 26) - memcpy(dir->bh[buffer + 1]->b_data, ((char *)&de) + thissize, - 26 - thissize); - - return 0; -} - -/* - * the caller is responsible for holding the necessary - * locks. - */ -static int adfs_dir_find_entry(struct adfs_dir *dir, u32 indaddr) -{ - int pos, ret; - - ret = -ENOENT; - - for (pos = 5; pos < ADFS_NUM_DIR_ENTRIES * 26 + 5; pos += 26) { - struct object_info obj; - - if (!__adfs_dir_get(dir, pos, &obj)) - break; - - if (obj.indaddr == indaddr) { - ret = pos; - break; - } - } - - return ret; -} - -static int adfs_f_read(struct super_block *sb, u32 indaddr, unsigned int size, - struct adfs_dir *dir) -{ - int ret; - - if (size != ADFS_NEWDIR_SIZE) - return -EIO; - - ret = adfs_dir_read(sb, indaddr, size, dir); - if (ret) - adfs_error(sb, "unable to read directory"); - else - dir->parent_id = adfs_readval(dir->dirtail.new.dirparent, 3); - - return ret; -} - -static int adfs_f_setpos(struct adfs_dir *dir, unsigned int fpos) { if (fpos >= ADFS_NUM_DIR_ENTRIES) @@ -364,99 +251,74 @@ adfs_f_getnext(struct adfs_dir *dir, struct object_info *obj) return ret; } -static int -adfs_f_update(struct adfs_dir *dir, struct object_info *obj) +static int adfs_f_iterate(struct adfs_dir *dir, struct dir_context *ctx) { - struct super_block *sb = dir->sb; - int ret, i; + struct object_info obj; + int pos = 5 + (ctx->pos - 2) * 26; - ret = adfs_dir_find_entry(dir, obj->indaddr); - if (ret < 0) { - adfs_error(dir->sb, "unable to locate entry to update"); - goto out; + while (ctx->pos < 2 + ADFS_NUM_DIR_ENTRIES) { + if (__adfs_dir_get(dir, pos, &obj)) + break; + if (!dir_emit(ctx, obj.name, obj.name_len, + obj.indaddr, DT_UNKNOWN)) + break; + pos += 26; + ctx->pos++; } + return 0; +} - __adfs_dir_put(dir, ret, obj); - - /* - * Increment directory sequence number - */ - dir->bh[0]->b_data[0] += 1; - dir->bh[dir->nr_buffers - 1]->b_data[sb->s_blocksize - 6] += 1; - - ret = adfs_dir_checkbyte(dir); - /* - * Update directory check byte - */ - dir->bh[dir->nr_buffers - 1]->b_data[sb->s_blocksize - 1] = ret; - -#if 1 - { - const unsigned int blocksize_bits = sb->s_blocksize_bits; - - memcpy(&dir->dirhead, bufoff(dir->bh, 0), sizeof(dir->dirhead)); - memcpy(&dir->dirtail, bufoff(dir->bh, 2007), sizeof(dir->dirtail)); +static int adfs_f_update(struct adfs_dir *dir, struct object_info *obj) +{ + struct adfs_direntry de; + int offset, ret; - if (dir->dirhead.startmasseq != dir->dirtail.new.endmasseq || - memcmp(&dir->dirhead.startname, &dir->dirtail.new.endname, 4)) - goto bad_dir; + offset = 5 - (int)sizeof(de); - if (memcmp(&dir->dirhead.startname, "Nick", 4) && - memcmp(&dir->dirhead.startname, "Hugo", 4)) - goto bad_dir; + do { + offset += sizeof(de); + ret = adfs_dir_copyfrom(&de, dir, offset, sizeof(de)); + if (ret) { + adfs_error(dir->sb, "error reading directory entry"); + return -ENOENT; + } + if (!de.dirobname[0]) { + adfs_error(dir->sb, "unable to locate entry to update"); + return -ENOENT; + } + } while (adfs_readval(de.dirinddiscadd, 3) != obj->indaddr); - if (adfs_dir_checkbyte(dir) != dir->dirtail.new.dircheckbyte) - goto bad_dir; - } -#endif - for (i = dir->nr_buffers - 1; i >= 0; i--) - mark_buffer_dirty(dir->bh[i]); + /* Update the directory entry with the new object state */ + adfs_obj2dir(&de, obj); - ret = 0; -out: - return ret; -#if 1 -bad_dir: - adfs_error(dir->sb, "whoops! I broke a directory!"); - return -EIO; -#endif + /* Write the directory entry back to the directory */ + return adfs_dir_copyto(dir, offset, &de, 26); } -static int -adfs_f_sync(struct adfs_dir *dir) +static int adfs_f_commit(struct adfs_dir *dir) { - int err = 0; - int i; - - for (i = dir->nr_buffers - 1; i >= 0; i--) { - struct buffer_head *bh = dir->bh[i]; - sync_dirty_buffer(bh); - if (buffer_req(bh) && !buffer_uptodate(bh)) - err = -EIO; - } + int ret; - return err; -} + /* Increment directory sequence number */ + dir->dirhead->startmasseq += 1; + dir->newtail->endmasseq += 1; -static void -adfs_f_free(struct adfs_dir *dir) -{ - int i; + /* Update directory check byte */ + dir->newtail->dircheckbyte = adfs_dir_checkbyte(dir); - for (i = dir->nr_buffers - 1; i >= 0; i--) { - brelse(dir->bh[i]); - dir->bh[i] = NULL; - } + /* Make sure the directory still validates correctly */ + ret = adfs_f_validate(dir); + if (ret) + adfs_msg(dir->sb, KERN_ERR, "error: update broke directory"); - dir->nr_buffers = 0; - dir->sb = NULL; + return ret; } const struct adfs_dir_ops adfs_f_dir_ops = { .read = adfs_f_read, + .iterate = adfs_f_iterate, .setpos = adfs_f_setpos, .getnext = adfs_f_getnext, .update = adfs_f_update, - .sync = adfs_f_sync, - .free = adfs_f_free + .commit = adfs_f_commit, }; diff --git a/fs/adfs/dir_f.h b/fs/adfs/dir_f.h index 5aec332b90f5..a5393e6cf9f4 100644 --- a/fs/adfs/dir_f.h +++ b/fs/adfs/dir_f.h @@ -13,9 +13,9 @@ * Directory header */ struct adfs_dirheader { - unsigned char startmasseq; - unsigned char startname[4]; -}; + __u8 startmasseq; + __u8 startname[4]; +} __attribute__((packed)); #define ADFS_NEWDIR_SIZE 2048 #define ADFS_NUM_DIR_ENTRIES 77 @@ -31,32 +31,36 @@ struct adfs_direntry { __u8 dirlen[4]; __u8 dirinddiscadd[3]; __u8 newdiratts; -}; +} __attribute__((packed)); /* * Directory tail */ +struct adfs_olddirtail { + __u8 dirlastmask; + char dirname[10]; + __u8 dirparent[3]; + char dirtitle[19]; + __u8 reserved[14]; + __u8 endmasseq; + __u8 endname[4]; + __u8 dircheckbyte; +} __attribute__((packed)); + +struct adfs_newdirtail { + __u8 dirlastmask; + __u8 reserved[2]; + __u8 dirparent[3]; + char dirtitle[19]; + char dirname[10]; + __u8 endmasseq; + __u8 endname[4]; + __u8 dircheckbyte; +} __attribute__((packed)); + union adfs_dirtail { - struct { - unsigned char dirlastmask; - char dirname[10]; - unsigned char dirparent[3]; - char dirtitle[19]; - unsigned char reserved[14]; - unsigned char endmasseq; - unsigned char endname[4]; - unsigned char dircheckbyte; - } old; - struct { - unsigned char dirlastmask; - unsigned char reserved[2]; - unsigned char dirparent[3]; - char dirtitle[19]; - char dirname[10]; - unsigned char endmasseq; - unsigned char endname[4]; - unsigned char dircheckbyte; - } new; + struct adfs_olddirtail old; + struct adfs_newdirtail new; }; #endif diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c index d56924c11b17..4a15924014da 100644 --- a/fs/adfs/dir_fplus.c +++ b/fs/adfs/dir_fplus.c @@ -4,123 +4,163 @@ * * Copyright (C) 1997-1999 Russell King */ -#include <linux/slab.h> #include "adfs.h" #include "dir_fplus.h" -static int -adfs_fplus_read(struct super_block *sb, unsigned int id, unsigned int sz, struct adfs_dir *dir) +/* Return the byte offset to directory entry pos */ +static unsigned int adfs_fplus_offset(const struct adfs_bigdirheader *h, + unsigned int pos) { - struct adfs_bigdirheader *h; - struct adfs_bigdirtail *t; - unsigned long block; - unsigned int blk, size; - int i, ret = -EIO; + return offsetof(struct adfs_bigdirheader, bigdirname) + + ALIGN(le32_to_cpu(h->bigdirnamelen), 4) + + pos * sizeof(struct adfs_bigdirentry); +} - dir->nr_buffers = 0; +static int adfs_fplus_validate_header(const struct adfs_bigdirheader *h) +{ + unsigned int size = le32_to_cpu(h->bigdirsize); + unsigned int len; - /* start off using fixed bh set - only alloc for big dirs */ - dir->bh_fplus = &dir->bh[0]; + if (h->bigdirversion[0] != 0 || h->bigdirversion[1] != 0 || + h->bigdirversion[2] != 0 || + h->bigdirstartname != cpu_to_le32(BIGDIRSTARTNAME) || + !size || size & 2047 || size > SZ_4M) + return -EIO; - block = __adfs_block_map(sb, id, 0); - if (!block) { - adfs_error(sb, "dir object %X has a hole at offset 0", id); - goto out; - } + size -= sizeof(struct adfs_bigdirtail) + + offsetof(struct adfs_bigdirheader, bigdirname); - dir->bh_fplus[0] = sb_bread(sb, block); - if (!dir->bh_fplus[0]) - goto out; - dir->nr_buffers += 1; + /* Check that bigdirnamelen fits within the directory */ + len = ALIGN(le32_to_cpu(h->bigdirnamelen), 4); + if (len > size) + return -EIO; - h = (struct adfs_bigdirheader *)dir->bh_fplus[0]->b_data; - size = le32_to_cpu(h->bigdirsize); - if (size != sz) { - adfs_msg(sb, KERN_WARNING, - "directory header size %X does not match directory size %X", - size, sz); + size -= len; + + /* Check that bigdirnamesize fits within the directory */ + len = le32_to_cpu(h->bigdirnamesize); + if (len > size) + return -EIO; + + size -= len; + + /* + * Avoid division, we know that absolute maximum number of entries + * can not be so large to cause overflow of the multiplication below. + */ + len = le32_to_cpu(h->bigdirentries); + if (len > SZ_4M / sizeof(struct adfs_bigdirentry) || + len * sizeof(struct adfs_bigdirentry) > size) + return -EIO; + + return 0; +} + +static int adfs_fplus_validate_tail(const struct adfs_bigdirheader *h, + const struct adfs_bigdirtail *t) +{ + if (t->bigdirendname != cpu_to_le32(BIGDIRENDNAME) || + t->bigdirendmasseq != h->startmasseq || + t->reserved[0] != 0 || t->reserved[1] != 0) + return -EIO; + + return 0; +} + +static u8 adfs_fplus_checkbyte(struct adfs_dir *dir) +{ + struct adfs_bigdirheader *h = dir->bighead; + struct adfs_bigdirtail *t = dir->bigtail; + unsigned int end, bs, bi, i; + __le32 *bp; + u32 dircheck; + + end = adfs_fplus_offset(h, le32_to_cpu(h->bigdirentries)) + + le32_to_cpu(h->bigdirnamesize); + + /* Accumulate the contents of the header, entries and names */ + for (dircheck = 0, bi = 0; end; bi++) { + bp = (void *)dir->bhs[bi]->b_data; + bs = dir->bhs[bi]->b_size; + if (bs > end) + bs = end; + + for (i = 0; i < bs; i += sizeof(u32)) + dircheck = ror32(dircheck, 13) ^ le32_to_cpup(bp++); + + end -= bs; } - if (h->bigdirversion[0] != 0 || h->bigdirversion[1] != 0 || - h->bigdirversion[2] != 0 || size & 2047 || - h->bigdirstartname != cpu_to_le32(BIGDIRSTARTNAME)) { - adfs_error(sb, "dir %06x has malformed header", id); + /* Accumulate the contents of the tail except for the check byte */ + dircheck = ror32(dircheck, 13) ^ le32_to_cpu(t->bigdirendname); + dircheck = ror32(dircheck, 13) ^ t->bigdirendmasseq; + dircheck = ror32(dircheck, 13) ^ t->reserved[0]; + dircheck = ror32(dircheck, 13) ^ t->reserved[1]; + + return dircheck ^ dircheck >> 8 ^ dircheck >> 16 ^ dircheck >> 24; +} + +static int adfs_fplus_read(struct super_block *sb, u32 indaddr, + unsigned int size, struct adfs_dir *dir) +{ + struct adfs_bigdirheader *h; + struct adfs_bigdirtail *t; + unsigned int dirsize; + int ret; + + /* Read first buffer */ + ret = adfs_dir_read_buffers(sb, indaddr, sb->s_blocksize, dir); + if (ret) + return ret; + + dir->bighead = h = (void *)dir->bhs[0]->b_data; + ret = adfs_fplus_validate_header(h); + if (ret) { + adfs_error(sb, "dir %06x has malformed header", indaddr); goto out; } - size >>= sb->s_blocksize_bits; - if (size > ARRAY_SIZE(dir->bh)) { - /* this directory is too big for fixed bh set, must allocate */ - struct buffer_head **bh_fplus = - kcalloc(size, sizeof(struct buffer_head *), - GFP_KERNEL); - if (!bh_fplus) { - adfs_msg(sb, KERN_ERR, - "not enough memory for dir object %X (%d blocks)", - id, size); - ret = -ENOMEM; - goto out; - } - dir->bh_fplus = bh_fplus; - /* copy over the pointer to the block that we've already read */ - dir->bh_fplus[0] = dir->bh[0]; + dirsize = le32_to_cpu(h->bigdirsize); + if (size && dirsize != size) { + adfs_msg(sb, KERN_WARNING, + "dir %06x header size %X does not match directory size %X", + indaddr, dirsize, size); } - for (blk = 1; blk < size; blk++) { - block = __adfs_block_map(sb, id, blk); - if (!block) { - adfs_error(sb, "dir object %X has a hole at offset %d", id, blk); - goto out; - } + /* Read remaining buffers */ + ret = adfs_dir_read_buffers(sb, indaddr, dirsize, dir); + if (ret) + return ret; - dir->bh_fplus[blk] = sb_bread(sb, block); - if (!dir->bh_fplus[blk]) { - adfs_error(sb, "dir object %x failed read for offset %d, mapped block %lX", - id, blk, block); - goto out; - } + dir->bigtail = t = (struct adfs_bigdirtail *) + (dir->bhs[dir->nr_buffers - 1]->b_data + (sb->s_blocksize - 8)); - dir->nr_buffers += 1; + ret = adfs_fplus_validate_tail(h, t); + if (ret) { + adfs_error(sb, "dir %06x has malformed tail", indaddr); + goto out; } - t = (struct adfs_bigdirtail *) - (dir->bh_fplus[size - 1]->b_data + (sb->s_blocksize - 8)); - - if (t->bigdirendname != cpu_to_le32(BIGDIRENDNAME) || - t->bigdirendmasseq != h->startmasseq || - t->reserved[0] != 0 || t->reserved[1] != 0) { - adfs_error(sb, "dir %06x has malformed tail", id); + if (adfs_fplus_checkbyte(dir) != t->bigdircheckbyte) { + adfs_error(sb, "dir %06x checkbyte mismatch\n", indaddr); goto out; } dir->parent_id = le32_to_cpu(h->bigdirparent); - dir->sb = sb; return 0; out: - if (dir->bh_fplus) { - for (i = 0; i < dir->nr_buffers; i++) - brelse(dir->bh_fplus[i]); - - if (&dir->bh[0] != dir->bh_fplus) - kfree(dir->bh_fplus); + adfs_dir_relse(dir); - dir->bh_fplus = NULL; - } - - dir->nr_buffers = 0; - dir->sb = NULL; return ret; } static int adfs_fplus_setpos(struct adfs_dir *dir, unsigned int fpos) { - struct adfs_bigdirheader *h = - (struct adfs_bigdirheader *) dir->bh_fplus[0]->b_data; int ret = -ENOENT; - if (fpos <= le32_to_cpu(h->bigdirentries)) { + if (fpos <= le32_to_cpu(dir->bighead->bigdirentries)) { dir->pos = fpos; ret = 0; } @@ -128,51 +168,23 @@ adfs_fplus_setpos(struct adfs_dir *dir, unsigned int fpos) return ret; } -static void -dir_memcpy(struct adfs_dir *dir, unsigned int offset, void *to, int len) -{ - struct super_block *sb = dir->sb; - unsigned int buffer, partial, remainder; - - buffer = offset >> sb->s_blocksize_bits; - offset &= sb->s_blocksize - 1; - - partial = sb->s_blocksize - offset; - - if (partial >= len) - memcpy(to, dir->bh_fplus[buffer]->b_data + offset, len); - else { - char *c = (char *)to; - - remainder = len - partial; - - memcpy(c, - dir->bh_fplus[buffer]->b_data + offset, - partial); - - memcpy(c + partial, - dir->bh_fplus[buffer + 1]->b_data, - remainder); - } -} - static int adfs_fplus_getnext(struct adfs_dir *dir, struct object_info *obj) { - struct adfs_bigdirheader *h = - (struct adfs_bigdirheader *) dir->bh_fplus[0]->b_data; + struct adfs_bigdirheader *h = dir->bighead; struct adfs_bigdirentry bde; unsigned int offset; - int ret = -ENOENT; + int ret; if (dir->pos >= le32_to_cpu(h->bigdirentries)) - goto out; + return -ENOENT; - offset = offsetof(struct adfs_bigdirheader, bigdirname); - offset += ((le32_to_cpu(h->bigdirnamelen) + 4) & ~3); - offset += dir->pos * sizeof(struct adfs_bigdirentry); + offset = adfs_fplus_offset(h, dir->pos); - dir_memcpy(dir, offset, &bde, sizeof(struct adfs_bigdirentry)); + ret = adfs_dir_copyfrom(&bde, dir, offset, + sizeof(struct adfs_bigdirentry)); + if (ret) + return ret; obj->loadaddr = le32_to_cpu(bde.bigdirload); obj->execaddr = le32_to_cpu(bde.bigdirexec); @@ -181,59 +193,95 @@ adfs_fplus_getnext(struct adfs_dir *dir, struct object_info *obj) obj->attr = le32_to_cpu(bde.bigdirattr); obj->name_len = le32_to_cpu(bde.bigdirobnamelen); - offset = offsetof(struct adfs_bigdirheader, bigdirname); - offset += ((le32_to_cpu(h->bigdirnamelen) + 4) & ~3); - offset += le32_to_cpu(h->bigdirentries) * sizeof(struct adfs_bigdirentry); + offset = adfs_fplus_offset(h, le32_to_cpu(h->bigdirentries)); offset += le32_to_cpu(bde.bigdirobnameptr); - dir_memcpy(dir, offset, obj->name, obj->name_len); + ret = adfs_dir_copyfrom(obj->name, dir, offset, obj->name_len); + if (ret) + return ret; + adfs_object_fixup(dir, obj); dir->pos += 1; - ret = 0; -out: - return ret; + + return 0; } -static int -adfs_fplus_sync(struct adfs_dir *dir) +static int adfs_fplus_iterate(struct adfs_dir *dir, struct dir_context *ctx) { - int err = 0; - int i; - - for (i = dir->nr_buffers - 1; i >= 0; i--) { - struct buffer_head *bh = dir->bh_fplus[i]; - sync_dirty_buffer(bh); - if (buffer_req(bh) && !buffer_uptodate(bh)) - err = -EIO; + struct object_info obj; + + if ((ctx->pos - 2) >> 32) + return 0; + + if (adfs_fplus_setpos(dir, ctx->pos - 2)) + return 0; + + while (!adfs_fplus_getnext(dir, &obj)) { + if (!dir_emit(ctx, obj.name, obj.name_len, + obj.indaddr, DT_UNKNOWN)) + break; + ctx->pos++; } - return err; + return 0; } -static void -adfs_fplus_free(struct adfs_dir *dir) +static int adfs_fplus_update(struct adfs_dir *dir, struct object_info *obj) { - int i; + struct adfs_bigdirheader *h = dir->bighead; + struct adfs_bigdirentry bde; + int offset, end, ret; - if (dir->bh_fplus) { - for (i = 0; i < dir->nr_buffers; i++) - brelse(dir->bh_fplus[i]); + offset = adfs_fplus_offset(h, 0) - sizeof(bde); + end = adfs_fplus_offset(h, le32_to_cpu(h->bigdirentries)); - if (&dir->bh[0] != dir->bh_fplus) - kfree(dir->bh_fplus); + do { + offset += sizeof(bde); + if (offset >= end) { + adfs_error(dir->sb, "unable to locate entry to update"); + return -ENOENT; + } + ret = adfs_dir_copyfrom(&bde, dir, offset, sizeof(bde)); + if (ret) { + adfs_error(dir->sb, "error reading directory entry"); + return -ENOENT; + } + } while (le32_to_cpu(bde.bigdirindaddr) != obj->indaddr); - dir->bh_fplus = NULL; - } + bde.bigdirload = cpu_to_le32(obj->loadaddr); + bde.bigdirexec = cpu_to_le32(obj->execaddr); + bde.bigdirlen = cpu_to_le32(obj->size); + bde.bigdirindaddr = cpu_to_le32(obj->indaddr); + bde.bigdirattr = cpu_to_le32(obj->attr); + + return adfs_dir_copyto(dir, offset, &bde, sizeof(bde)); +} + +static int adfs_fplus_commit(struct adfs_dir *dir) +{ + int ret; - dir->nr_buffers = 0; - dir->sb = NULL; + /* Increment directory sequence number */ + dir->bighead->startmasseq += 1; + dir->bigtail->bigdirendmasseq += 1; + + /* Update directory check byte */ + dir->bigtail->bigdircheckbyte = adfs_fplus_checkbyte(dir); + + /* Make sure the directory still validates correctly */ + ret = adfs_fplus_validate_header(dir->bighead); + if (ret == 0) + ret = adfs_fplus_validate_tail(dir->bighead, dir->bigtail); + + return ret; } const struct adfs_dir_ops adfs_fplus_dir_ops = { .read = adfs_fplus_read, + .iterate = adfs_fplus_iterate, .setpos = adfs_fplus_setpos, .getnext = adfs_fplus_getnext, - .sync = adfs_fplus_sync, - .free = adfs_fplus_free + .update = adfs_fplus_update, + .commit = adfs_fplus_commit, }; diff --git a/fs/adfs/dir_fplus.h b/fs/adfs/dir_fplus.h index 4ec0931e36ad..d729b1591e5e 100644 --- a/fs/adfs/dir_fplus.h +++ b/fs/adfs/dir_fplus.h @@ -22,7 +22,7 @@ struct adfs_bigdirheader { __le32 bigdirnamesize; __le32 bigdirparent; char bigdirname[1]; -}; +} __attribute__((packed, aligned(4))); struct adfs_bigdirentry { __le32 bigdirload; @@ -32,11 +32,11 @@ struct adfs_bigdirentry { __le32 bigdirattr; __le32 bigdirobnamelen; __le32 bigdirobnameptr; -}; +} __attribute__((packed, aligned(4))); struct adfs_bigdirtail { __le32 bigdirendname; __u8 bigdirendmasseq; __u8 reserved[2]; __u8 bigdircheckbyte; -}; +} __attribute__((packed, aligned(4))); diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index 124de75413a5..32620f4a7623 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -20,7 +20,8 @@ adfs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh, if (block >= inode->i_blocks) goto abort_toobig; - block = __adfs_block_map(inode->i_sb, inode->i_ino, block); + block = __adfs_block_map(inode->i_sb, ADFS_I(inode)->indaddr, + block); if (block) map_bh(bh, inode->i_sb, block); return 0; @@ -126,29 +127,29 @@ adfs_atts2mode(struct super_block *sb, struct inode *inode) * Convert Linux permission to ADFS attribute. We try to do the reverse * of atts2mode, but there is not a 1:1 translation. */ -static int -adfs_mode2atts(struct super_block *sb, struct inode *inode) +static int adfs_mode2atts(struct super_block *sb, struct inode *inode, + umode_t ia_mode) { + struct adfs_sb_info *asb = ADFS_SB(sb); umode_t mode; int attr; - struct adfs_sb_info *asb = ADFS_SB(sb); /* FIXME: should we be able to alter a link? */ if (S_ISLNK(inode->i_mode)) return ADFS_I(inode)->attr; + /* Directories do not have read/write permissions on the media */ if (S_ISDIR(inode->i_mode)) - attr = ADFS_NDA_DIRECTORY; - else - attr = 0; + return ADFS_NDA_DIRECTORY; - mode = inode->i_mode & asb->s_owner_mask; + attr = 0; + mode = ia_mode & asb->s_owner_mask; if (mode & S_IRUGO) attr |= ADFS_NDA_OWNER_READ; if (mode & S_IWUGO) attr |= ADFS_NDA_OWNER_WRITE; - mode = inode->i_mode & asb->s_other_mask; + mode = ia_mode & asb->s_other_mask; mode &= ~asb->s_owner_mask; if (mode & S_IRUGO) attr |= ADFS_NDA_PUBLIC_READ; @@ -158,6 +159,8 @@ adfs_mode2atts(struct super_block *sb, struct inode *inode) return attr; } +static const s64 nsec_unix_epoch_diff_risc_os_epoch = 2208988800000000000LL; + /* * Convert an ADFS time to Unix time. ADFS has a 40-bit centi-second time * referenced to 1 Jan 1900 (til 2248) so we need to discard 2208988800 seconds @@ -170,8 +173,6 @@ adfs_adfs2unix_time(struct timespec64 *tv, struct inode *inode) /* 01 Jan 1970 00:00:00 (Unix epoch) as nanoseconds since * 01 Jan 1900 00:00:00 (RISC OS epoch) */ - static const s64 nsec_unix_epoch_diff_risc_os_epoch = - 2208988800000000000LL; s64 nsec; if (!adfs_inode_is_stamped(inode)) @@ -204,24 +205,23 @@ adfs_adfs2unix_time(struct timespec64 *tv, struct inode *inode) return; } -/* - * Convert an Unix time to ADFS time. We only do this if the entry has a - * time/date stamp already. - */ -static void -adfs_unix2adfs_time(struct inode *inode, unsigned int secs) +/* Convert an Unix time to ADFS time for an entry that is already stamped. */ +static void adfs_unix2adfs_time(struct inode *inode, + const struct timespec64 *ts) { - unsigned int high, low; + s64 cs, nsec = timespec64_to_ns(ts); - if (adfs_inode_is_stamped(inode)) { - /* convert 32-bit seconds to 40-bit centi-seconds */ - low = (secs & 255) * 100; - high = (secs / 256) * 100 + (low >> 8) + 0x336e996a; + /* convert from Unix to RISC OS epoch */ + nsec += nsec_unix_epoch_diff_risc_os_epoch; - ADFS_I(inode)->loadaddr = (high >> 24) | - (ADFS_I(inode)->loadaddr & ~0xff); - ADFS_I(inode)->execaddr = (low & 255) | (high << 8); - } + /* convert from nanoseconds to centiseconds */ + cs = div_s64(nsec, 10000000); + + cs = clamp_t(s64, cs, 0, 0xffffffffff); + + ADFS_I(inode)->loadaddr &= ~0xff; + ADFS_I(inode)->loadaddr |= (cs >> 32) & 0xff; + ADFS_I(inode)->execaddr = cs; } /* @@ -260,6 +260,7 @@ adfs_iget(struct super_block *sb, struct object_info *obj) * for cross-directory renames. */ ADFS_I(inode)->parent_id = obj->parent_id; + ADFS_I(inode)->indaddr = obj->indaddr; ADFS_I(inode)->loadaddr = obj->loadaddr; ADFS_I(inode)->execaddr = obj->execaddr; ADFS_I(inode)->attr = obj->attr; @@ -315,10 +316,11 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr) if (ia_valid & ATTR_SIZE) truncate_setsize(inode, attr->ia_size); - if (ia_valid & ATTR_MTIME) { - inode->i_mtime = attr->ia_mtime; - adfs_unix2adfs_time(inode, attr->ia_mtime.tv_sec); + if (ia_valid & ATTR_MTIME && adfs_inode_is_stamped(inode)) { + adfs_unix2adfs_time(inode, &attr->ia_mtime); + adfs_adfs2unix_time(&inode->i_mtime, inode); } + /* * FIXME: should we make these == to i_mtime since we don't * have the ability to represent them in our filesystem? @@ -328,7 +330,7 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr) if (ia_valid & ATTR_CTIME) inode->i_ctime = attr->ia_ctime; if (ia_valid & ATTR_MODE) { - ADFS_I(inode)->attr = adfs_mode2atts(sb, inode); + ADFS_I(inode)->attr = adfs_mode2atts(sb, inode, attr->ia_mode); inode->i_mode = adfs_atts2mode(sb, inode); } @@ -353,7 +355,7 @@ int adfs_write_inode(struct inode *inode, struct writeback_control *wbc) struct object_info obj; int ret; - obj.indaddr = inode->i_ino; + obj.indaddr = ADFS_I(inode)->indaddr; obj.name_len = 0; obj.parent_id = ADFS_I(inode)->parent_id; obj.loadaddr = ADFS_I(inode)->loadaddr; diff --git a/fs/adfs/map.c b/fs/adfs/map.c index f44d12cef5be..a81de80c45c1 100644 --- a/fs/adfs/map.c +++ b/fs/adfs/map.c @@ -4,6 +4,8 @@ * * Copyright (C) 1997-2002 Russell King */ +#include <linux/slab.h> +#include <linux/statfs.h> #include <asm/unaligned.h> #include "adfs.h" @@ -66,54 +68,41 @@ static DEFINE_RWLOCK(adfs_map_lock); static int lookup_zone(const struct adfs_discmap *dm, const unsigned int idlen, const u32 frag_id, unsigned int *offset) { - const unsigned int mapsize = dm->dm_endbit; + const unsigned int endbit = dm->dm_endbit; const u32 idmask = (1 << idlen) - 1; - unsigned char *map = dm->dm_bh->b_data + 4; + unsigned char *map = dm->dm_bh->b_data; unsigned int start = dm->dm_startbit; - unsigned int mapptr; + unsigned int freelink, fragend; u32 frag; + frag = GET_FRAG_ID(map, 8, idmask & 0x7fff); + freelink = frag ? 8 + frag : 0; + do { frag = GET_FRAG_ID(map, start, idmask); - mapptr = start + idlen; - - /* - * find end of fragment - */ - { - __le32 *_map = (__le32 *)map; - u32 v = le32_to_cpu(_map[mapptr >> 5]) >> (mapptr & 31); - while (v == 0) { - mapptr = (mapptr & ~31) + 32; - if (mapptr >= mapsize) - goto error; - v = le32_to_cpu(_map[mapptr >> 5]); - } - - mapptr += 1 + ffz(~v); + + fragend = find_next_bit_le(map, endbit, start + idlen); + if (fragend >= endbit) + goto error; + + if (start == freelink) { + freelink += frag & 0x7fff; + } else if (frag == frag_id) { + unsigned int length = fragend + 1 - start; + + if (*offset < length) + return start + *offset; + *offset -= length; } - if (frag == frag_id) - goto found; -again: - start = mapptr; - } while (mapptr < mapsize); + start = fragend + 1; + } while (start < endbit); return -1; error: printk(KERN_ERR "adfs: oversized fragment 0x%x at 0x%x-0x%x\n", - frag, start, mapptr); + frag, start, fragend); return -1; - -found: - { - int length = mapptr - start; - if (*offset >= length) { - *offset -= length; - goto again; - } - } - return start + *offset; } /* @@ -125,12 +114,12 @@ found: static unsigned int scan_free_map(struct adfs_sb_info *asb, struct adfs_discmap *dm) { - const unsigned int mapsize = dm->dm_endbit + 32; + const unsigned int endbit = dm->dm_endbit; const unsigned int idlen = asb->s_idlen; const unsigned int frag_idlen = idlen <= 15 ? idlen : 15; const u32 idmask = (1 << frag_idlen) - 1; unsigned char *map = dm->dm_bh->b_data; - unsigned int start = 8, mapptr; + unsigned int start = 8, fragend; u32 frag; unsigned long total = 0; @@ -149,29 +138,13 @@ scan_free_map(struct adfs_sb_info *asb, struct adfs_discmap *dm) do { start += frag; - /* - * get fragment id - */ frag = GET_FRAG_ID(map, start, idmask); - mapptr = start + idlen; - - /* - * find end of fragment - */ - { - __le32 *_map = (__le32 *)map; - u32 v = le32_to_cpu(_map[mapptr >> 5]) >> (mapptr & 31); - while (v == 0) { - mapptr = (mapptr & ~31) + 32; - if (mapptr >= mapsize) - goto error; - v = le32_to_cpu(_map[mapptr >> 5]); - } - - mapptr += 1 + ffz(~v); - } - total += mapptr - start; + fragend = find_next_bit_le(map, endbit, start + idlen); + if (fragend >= endbit) + goto error; + + total += fragend + 1 - start; } while (frag >= idlen + 1); if (frag != 0) @@ -220,10 +193,10 @@ found: * total_free = E(free_in_zone_n) * nzones */ -unsigned int -adfs_map_free(struct super_block *sb) +void adfs_map_statfs(struct super_block *sb, struct kstatfs *buf) { struct adfs_sb_info *asb = ADFS_SB(sb); + struct adfs_discrecord *dr = adfs_map_discrecord(asb->s_map); struct adfs_discmap *dm; unsigned int total = 0; unsigned int zone; @@ -235,7 +208,10 @@ adfs_map_free(struct super_block *sb) total += scan_free_map(asb, dm++); } while (--zone > 0); - return signed_asl(total, asb->s_map2blk); + buf->f_blocks = adfs_disc_size(dr) >> sb->s_blocksize_bits; + buf->f_files = asb->s_ids_per_zone * asb->s_map_size; + buf->f_bavail = + buf->f_bfree = signed_asl(total, asb->s_map2blk); } int adfs_map_lookup(struct super_block *sb, u32 frag_id, unsigned int offset) @@ -280,3 +256,152 @@ bad_fragment: frag_id, zone, asb->s_map_size); return 0; } + +static unsigned char adfs_calczonecheck(struct super_block *sb, unsigned char *map) +{ + unsigned int v0, v1, v2, v3; + int i; + + v0 = v1 = v2 = v3 = 0; + for (i = sb->s_blocksize - 4; i; i -= 4) { + v0 += map[i] + (v3 >> 8); + v3 &= 0xff; + v1 += map[i + 1] + (v0 >> 8); + v0 &= 0xff; + v2 += map[i + 2] + (v1 >> 8); + v1 &= 0xff; + v3 += map[i + 3] + (v2 >> 8); + v2 &= 0xff; + } + v0 += v3 >> 8; + v1 += map[1] + (v0 >> 8); + v2 += map[2] + (v1 >> 8); + v3 += map[3] + (v2 >> 8); + + return v0 ^ v1 ^ v2 ^ v3; +} + +static int adfs_checkmap(struct super_block *sb, struct adfs_discmap *dm) +{ + unsigned char crosscheck = 0, zonecheck = 1; + int i; + + for (i = 0; i < ADFS_SB(sb)->s_map_size; i++) { + unsigned char *map; + + map = dm[i].dm_bh->b_data; + + if (adfs_calczonecheck(sb, map) != map[0]) { + adfs_error(sb, "zone %d fails zonecheck", i); + zonecheck = 0; + } + crosscheck ^= map[3]; + } + if (crosscheck != 0xff) + adfs_error(sb, "crosscheck != 0xff"); + return crosscheck == 0xff && zonecheck; +} + +/* + * Layout the map - the first zone contains a copy of the disc record, + * and the last zone must be limited to the size of the filesystem. + */ +static void adfs_map_layout(struct adfs_discmap *dm, unsigned int nzones, + struct adfs_discrecord *dr) +{ + unsigned int zone, zone_size; + u64 size; + + zone_size = (8 << dr->log2secsize) - le16_to_cpu(dr->zone_spare); + + dm[0].dm_bh = NULL; + dm[0].dm_startblk = 0; + dm[0].dm_startbit = 32 + ADFS_DR_SIZE_BITS; + dm[0].dm_endbit = 32 + zone_size; + + for (zone = 1; zone < nzones; zone++) { + dm[zone].dm_bh = NULL; + dm[zone].dm_startblk = zone * zone_size - ADFS_DR_SIZE_BITS; + dm[zone].dm_startbit = 32; + dm[zone].dm_endbit = 32 + zone_size; + } + + size = adfs_disc_size(dr) >> dr->log2bpmb; + size -= (nzones - 1) * zone_size - ADFS_DR_SIZE_BITS; + dm[nzones - 1].dm_endbit = 32 + size; +} + +static int adfs_map_read(struct adfs_discmap *dm, struct super_block *sb, + unsigned int map_addr, unsigned int nzones) +{ + unsigned int zone; + + for (zone = 0; zone < nzones; zone++) { + dm[zone].dm_bh = sb_bread(sb, map_addr + zone); + if (!dm[zone].dm_bh) + return -EIO; + } + + return 0; +} + +static void adfs_map_relse(struct adfs_discmap *dm, unsigned int nzones) +{ + unsigned int zone; + + for (zone = 0; zone < nzones; zone++) + brelse(dm[zone].dm_bh); +} + +struct adfs_discmap *adfs_read_map(struct super_block *sb, struct adfs_discrecord *dr) +{ + struct adfs_sb_info *asb = ADFS_SB(sb); + struct adfs_discmap *dm; + unsigned int map_addr, zone_size, nzones; + int ret; + + nzones = dr->nzones | dr->nzones_high << 8; + zone_size = (8 << dr->log2secsize) - le16_to_cpu(dr->zone_spare); + + asb->s_idlen = dr->idlen; + asb->s_map_size = nzones; + asb->s_map2blk = dr->log2bpmb - dr->log2secsize; + asb->s_log2sharesize = dr->log2sharesize; + asb->s_ids_per_zone = zone_size / (asb->s_idlen + 1); + + map_addr = (nzones >> 1) * zone_size - + ((nzones > 1) ? ADFS_DR_SIZE_BITS : 0); + map_addr = signed_asl(map_addr, asb->s_map2blk); + + dm = kmalloc_array(nzones, sizeof(*dm), GFP_KERNEL); + if (dm == NULL) { + adfs_error(sb, "not enough memory"); + return ERR_PTR(-ENOMEM); + } + + adfs_map_layout(dm, nzones, dr); + + ret = adfs_map_read(dm, sb, map_addr, nzones); + if (ret) { + adfs_error(sb, "unable to read map"); + goto error_free; + } + + if (adfs_checkmap(sb, dm)) + return dm; + + adfs_error(sb, "map corrupted"); + +error_free: + adfs_map_relse(dm, nzones); + kfree(dm); + return ERR_PTR(-EIO); +} + +void adfs_free_map(struct super_block *sb) +{ + struct adfs_sb_info *asb = ADFS_SB(sb); + + adfs_map_relse(asb->s_map, asb->s_map_size); + kfree(asb->s_map); +} diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 65b04ebb51c3..a3cc8ecb50da 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -88,59 +88,11 @@ static int adfs_checkdiscrecord(struct adfs_discrecord *dr) return 0; } -static unsigned char adfs_calczonecheck(struct super_block *sb, unsigned char *map) -{ - unsigned int v0, v1, v2, v3; - int i; - - v0 = v1 = v2 = v3 = 0; - for (i = sb->s_blocksize - 4; i; i -= 4) { - v0 += map[i] + (v3 >> 8); - v3 &= 0xff; - v1 += map[i + 1] + (v0 >> 8); - v0 &= 0xff; - v2 += map[i + 2] + (v1 >> 8); - v1 &= 0xff; - v3 += map[i + 3] + (v2 >> 8); - v2 &= 0xff; - } - v0 += v3 >> 8; - v1 += map[1] + (v0 >> 8); - v2 += map[2] + (v1 >> 8); - v3 += map[3] + (v2 >> 8); - - return v0 ^ v1 ^ v2 ^ v3; -} - -static int adfs_checkmap(struct super_block *sb, struct adfs_discmap *dm) -{ - unsigned char crosscheck = 0, zonecheck = 1; - int i; - - for (i = 0; i < ADFS_SB(sb)->s_map_size; i++) { - unsigned char *map; - - map = dm[i].dm_bh->b_data; - - if (adfs_calczonecheck(sb, map) != map[0]) { - adfs_error(sb, "zone %d fails zonecheck", i); - zonecheck = 0; - } - crosscheck ^= map[3]; - } - if (crosscheck != 0xff) - adfs_error(sb, "crosscheck != 0xff"); - return crosscheck == 0xff && zonecheck; -} - static void adfs_put_super(struct super_block *sb) { - int i; struct adfs_sb_info *asb = ADFS_SB(sb); - for (i = 0; i < asb->s_map_size; i++) - brelse(asb->s_map[i].dm_bh); - kfree(asb->s_map); + adfs_free_map(sb); kfree_rcu(asb, rcu); } @@ -249,16 +201,13 @@ static int adfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct adfs_sb_info *sbi = ADFS_SB(sb); - struct adfs_discrecord *dr = adfs_map_discrecord(sbi->s_map); u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + adfs_map_statfs(sb, buf); + buf->f_type = ADFS_SUPER_MAGIC; buf->f_namelen = sbi->s_namelen; buf->f_bsize = sb->s_blocksize; - buf->f_blocks = adfs_disc_size(dr) >> sb->s_blocksize_bits; - buf->f_files = sbi->s_ids_per_zone * sbi->s_map_size; - buf->f_bavail = - buf->f_bfree = adfs_map_free(sb); buf->f_ffree = (long)(buf->f_bfree * buf->f_files) / (long)buf->f_blocks; buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); @@ -282,6 +231,12 @@ static void adfs_free_inode(struct inode *inode) kmem_cache_free(adfs_inode_cachep, ADFS_I(inode)); } +static int adfs_drop_inode(struct inode *inode) +{ + /* always drop inodes if we are read-only */ + return !IS_ENABLED(CONFIG_ADFS_FS_RW) || IS_RDONLY(inode); +} + static void init_once(void *foo) { struct adfs_inode_info *ei = (struct adfs_inode_info *) foo; @@ -314,7 +269,7 @@ static void destroy_inodecache(void) static const struct super_operations adfs_sops = { .alloc_inode = adfs_alloc_inode, .free_inode = adfs_free_inode, - .drop_inode = generic_delete_inode, + .drop_inode = adfs_drop_inode, .write_inode = adfs_write_inode, .put_super = adfs_put_super, .statfs = adfs_statfs, @@ -322,66 +277,94 @@ static const struct super_operations adfs_sops = { .show_options = adfs_show_options, }; -static struct adfs_discmap *adfs_read_map(struct super_block *sb, struct adfs_discrecord *dr) +static int adfs_probe(struct super_block *sb, unsigned int offset, int silent, + int (*validate)(struct super_block *sb, + struct buffer_head *bh, + struct adfs_discrecord **bhp)) { - struct adfs_discmap *dm; - unsigned int map_addr, zone_size, nzones; - int i, zone; struct adfs_sb_info *asb = ADFS_SB(sb); + struct adfs_discrecord *dr; + struct buffer_head *bh; + unsigned int blocksize = BLOCK_SIZE; + int ret, try; + + for (try = 0; try < 2; try++) { + /* try to set the requested block size */ + if (sb->s_blocksize != blocksize && + !sb_set_blocksize(sb, blocksize)) { + if (!silent) + adfs_msg(sb, KERN_ERR, + "error: unsupported blocksize"); + return -EINVAL; + } - nzones = asb->s_map_size; - zone_size = (8 << dr->log2secsize) - le16_to_cpu(dr->zone_spare); - map_addr = (nzones >> 1) * zone_size - - ((nzones > 1) ? ADFS_DR_SIZE_BITS : 0); - map_addr = signed_asl(map_addr, asb->s_map2blk); + /* read the buffer */ + bh = sb_bread(sb, offset >> sb->s_blocksize_bits); + if (!bh) { + adfs_msg(sb, KERN_ERR, + "error: unable to read block %u, try %d", + offset >> sb->s_blocksize_bits, try); + return -EIO; + } + + /* validate it */ + ret = validate(sb, bh, &dr); + if (ret) { + brelse(bh); + return ret; + } - asb->s_ids_per_zone = zone_size / (asb->s_idlen + 1); + /* does the block size match the filesystem block size? */ + blocksize = 1 << dr->log2secsize; + if (sb->s_blocksize == blocksize) { + asb->s_map = adfs_read_map(sb, dr); + brelse(bh); + return PTR_ERR_OR_ZERO(asb->s_map); + } - dm = kmalloc_array(nzones, sizeof(*dm), GFP_KERNEL); - if (dm == NULL) { - adfs_error(sb, "not enough memory"); - return ERR_PTR(-ENOMEM); + brelse(bh); } - for (zone = 0; zone < nzones; zone++, map_addr++) { - dm[zone].dm_startbit = 0; - dm[zone].dm_endbit = zone_size; - dm[zone].dm_startblk = zone * zone_size - ADFS_DR_SIZE_BITS; - dm[zone].dm_bh = sb_bread(sb, map_addr); + return -EIO; +} - if (!dm[zone].dm_bh) { - adfs_error(sb, "unable to read map"); - goto error_free; - } - } +static int adfs_validate_bblk(struct super_block *sb, struct buffer_head *bh, + struct adfs_discrecord **drp) +{ + struct adfs_discrecord *dr; + unsigned char *b_data; - /* adjust the limits for the first and last map zones */ - i = zone - 1; - dm[0].dm_startblk = 0; - dm[0].dm_startbit = ADFS_DR_SIZE_BITS; - dm[i].dm_endbit = (adfs_disc_size(dr) >> dr->log2bpmb) + - (ADFS_DR_SIZE_BITS - i * zone_size); + b_data = bh->b_data + (ADFS_DISCRECORD % sb->s_blocksize); + if (adfs_checkbblk(b_data)) + return -EILSEQ; - if (adfs_checkmap(sb, dm)) - return dm; + /* Do some sanity checks on the ADFS disc record */ + dr = (struct adfs_discrecord *)(b_data + ADFS_DR_OFFSET); + if (adfs_checkdiscrecord(dr)) + return -EILSEQ; + + *drp = dr; + return 0; +} - adfs_error(sb, "map corrupted"); +static int adfs_validate_dr0(struct super_block *sb, struct buffer_head *bh, + struct adfs_discrecord **drp) +{ + struct adfs_discrecord *dr; -error_free: - while (--zone >= 0) - brelse(dm[zone].dm_bh); + /* Do some sanity checks on the ADFS disc record */ + dr = (struct adfs_discrecord *)(bh->b_data + 4); + if (adfs_checkdiscrecord(dr) || dr->nzones_high || dr->nzones != 1) + return -EILSEQ; - kfree(dm); - return ERR_PTR(-EIO); + *drp = dr; + return 0; } static int adfs_fill_super(struct super_block *sb, void *data, int silent) { struct adfs_discrecord *dr; - struct buffer_head *bh; struct object_info root_obj; - unsigned char *b_data; - unsigned int blocksize; struct adfs_sb_info *asb; struct inode *root; int ret = -EINVAL; @@ -391,7 +374,10 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent) asb = kzalloc(sizeof(*asb), GFP_KERNEL); if (!asb) return -ENOMEM; + sb->s_fs_info = asb; + sb->s_magic = ADFS_SUPER_MAGIC; + sb->s_time_gran = 10000000; /* set default options */ asb->s_uid = GLOBAL_ROOT_UID; @@ -403,78 +389,21 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent) if (parse_options(sb, asb, data)) goto error; - sb_set_blocksize(sb, BLOCK_SIZE); - if (!(bh = sb_bread(sb, ADFS_DISCRECORD / BLOCK_SIZE))) { - adfs_msg(sb, KERN_ERR, "error: unable to read superblock"); - ret = -EIO; - goto error; - } - - b_data = bh->b_data + (ADFS_DISCRECORD % BLOCK_SIZE); - - if (adfs_checkbblk(b_data)) { - ret = -EINVAL; - goto error_badfs; - } - - dr = (struct adfs_discrecord *)(b_data + ADFS_DR_OFFSET); - - /* - * Do some sanity checks on the ADFS disc record - */ - if (adfs_checkdiscrecord(dr)) { - ret = -EINVAL; - goto error_badfs; - } - - blocksize = 1 << dr->log2secsize; - brelse(bh); - - if (sb_set_blocksize(sb, blocksize)) { - bh = sb_bread(sb, ADFS_DISCRECORD / sb->s_blocksize); - if (!bh) { - adfs_msg(sb, KERN_ERR, - "error: couldn't read superblock on 2nd try."); - ret = -EIO; - goto error; - } - b_data = bh->b_data + (ADFS_DISCRECORD % sb->s_blocksize); - if (adfs_checkbblk(b_data)) { - adfs_msg(sb, KERN_ERR, - "error: disc record mismatch, very weird!"); - ret = -EINVAL; - goto error_free_bh; - } - dr = (struct adfs_discrecord *)(b_data + ADFS_DR_OFFSET); - } else { + /* Try to probe the filesystem boot block */ + ret = adfs_probe(sb, ADFS_DISCRECORD, 1, adfs_validate_bblk); + if (ret == -EILSEQ) + ret = adfs_probe(sb, 0, silent, adfs_validate_dr0); + if (ret == -EILSEQ) { if (!silent) adfs_msg(sb, KERN_ERR, - "error: unsupported blocksize"); + "error: can't find an ADFS filesystem on dev %s.", + sb->s_id); ret = -EINVAL; - goto error; } + if (ret) + goto error; - /* - * blocksize on this device should now be set to the ADFS log2secsize - */ - - sb->s_magic = ADFS_SUPER_MAGIC; - asb->s_idlen = dr->idlen; - asb->s_map_size = dr->nzones | (dr->nzones_high << 8); - asb->s_map2blk = dr->log2bpmb - dr->log2secsize; - asb->s_log2sharesize = dr->log2sharesize; - - asb->s_map = adfs_read_map(sb, dr); - if (IS_ERR(asb->s_map)) { - ret = PTR_ERR(asb->s_map); - goto error_free_bh; - } - - brelse(bh); - - /* - * set up enough so that we can read an inode - */ + /* set up enough so that we can read an inode */ sb->s_op = &adfs_sops; dr = adfs_map_discrecord(asb->s_map); @@ -511,23 +440,13 @@ static int adfs_fill_super(struct super_block *sb, void *data, int silent) root = adfs_iget(sb, &root_obj); sb->s_root = d_make_root(root); if (!sb->s_root) { - int i; - for (i = 0; i < asb->s_map_size; i++) - brelse(asb->s_map[i].dm_bh); - kfree(asb->s_map); + adfs_free_map(sb); adfs_error(sb, "get root inode failed\n"); ret = -EIO; goto error; } return 0; -error_badfs: - if (!silent) - adfs_msg(sb, KERN_ERR, - "error: can't find an ADFS filesystem on dev %s.", - sb->s_id); -error_free_bh: - brelse(bh); error: sb->s_fs_info = NULL; kfree(asb); diff --git a/fs/afs/cell.c b/fs/afs/cell.c index fd5133e26a38..78ba5f932287 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -134,8 +134,17 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net, _leave(" = -ENAMETOOLONG"); return ERR_PTR(-ENAMETOOLONG); } - if (namelen == 5 && memcmp(name, "@cell", 5) == 0) + + /* Prohibit cell names that contain unprintable chars, '/' and '@' or + * that begin with a dot. This also precludes "@cell". + */ + if (name[0] == '.') return ERR_PTR(-EINVAL); + for (i = 0; i < namelen; i++) { + char ch = name[i]; + if (!isprint(ch) || ch == '/' || ch == '@') + return ERR_PTR(-EINVAL); + } _enter("%*.*s,%s", namelen, namelen, name, addresses); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index ecd8d2698515..f4713ea76e82 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -97,7 +97,7 @@ static struct linux_binfmt elf_format = { .min_coredump = ELF_EXEC_PAGESIZE, }; -#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE) +#define BAD_ADDR(x) (unlikely((unsigned long)(x) >= TASK_SIZE)) static int set_brk(unsigned long start, unsigned long end, int prot) { @@ -161,9 +161,11 @@ static int padzero(unsigned long elf_bss) #endif static int -create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, - unsigned long load_addr, unsigned long interp_load_addr) +create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, + unsigned long load_addr, unsigned long interp_load_addr, + unsigned long e_entry) { + struct mm_struct *mm = current->mm; unsigned long p = bprm->p; int argc = bprm->argc; int envc = bprm->envc; @@ -176,7 +178,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, unsigned char k_rand_bytes[16]; int items; elf_addr_t *elf_info; - int ei_index = 0; + int ei_index; const struct cred *cred = current_cred(); struct vm_area_struct *vma; @@ -226,12 +228,12 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, return -EFAULT; /* Create the ELF interpreter info */ - elf_info = (elf_addr_t *)current->mm->saved_auxv; + elf_info = (elf_addr_t *)mm->saved_auxv; /* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */ #define NEW_AUX_ENT(id, val) \ do { \ - elf_info[ei_index++] = id; \ - elf_info[ei_index++] = val; \ + *elf_info++ = id; \ + *elf_info++ = val; \ } while (0) #ifdef ARCH_DLINFO @@ -251,7 +253,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, NEW_AUX_ENT(AT_PHNUM, exec->e_phnum); NEW_AUX_ENT(AT_BASE, interp_load_addr); NEW_AUX_ENT(AT_FLAGS, 0); - NEW_AUX_ENT(AT_ENTRY, exec->e_entry); + NEW_AUX_ENT(AT_ENTRY, e_entry); NEW_AUX_ENT(AT_UID, from_kuid_munged(cred->user_ns, cred->uid)); NEW_AUX_ENT(AT_EUID, from_kuid_munged(cred->user_ns, cred->euid)); NEW_AUX_ENT(AT_GID, from_kgid_munged(cred->user_ns, cred->gid)); @@ -275,12 +277,13 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, } #undef NEW_AUX_ENT /* AT_NULL is zero; clear the rest too */ - memset(&elf_info[ei_index], 0, - sizeof current->mm->saved_auxv - ei_index * sizeof elf_info[0]); + memset(elf_info, 0, (char *)mm->saved_auxv + + sizeof(mm->saved_auxv) - (char *)elf_info); /* And advance past the AT_NULL entry. */ - ei_index += 2; + elf_info += 2; + ei_index = elf_info - (elf_addr_t *)mm->saved_auxv; sp = STACK_ADD(p, ei_index); items = (argc + 1) + (envc + 1) + 1; @@ -299,7 +302,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, * Grow the stack manually; some architectures have a limit on how * far ahead a user-space access may be in order to grow the stack. */ - vma = find_extend_vma(current->mm, bprm->p); + vma = find_extend_vma(mm, bprm->p); if (!vma) return -EFAULT; @@ -308,7 +311,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, return -EFAULT; /* Populate list of argv pointers back to argv strings. */ - p = current->mm->arg_end = current->mm->arg_start; + p = mm->arg_end = mm->arg_start; while (argc-- > 0) { size_t len; if (__put_user((elf_addr_t)p, sp++)) @@ -320,10 +323,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, } if (__put_user(0, sp++)) return -EFAULT; - current->mm->arg_end = p; + mm->arg_end = p; /* Populate list of envp pointers back to envp strings. */ - current->mm->env_end = current->mm->env_start = p; + mm->env_end = mm->env_start = p; while (envc-- > 0) { size_t len; if (__put_user((elf_addr_t)p, sp++)) @@ -335,10 +338,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, } if (__put_user(0, sp++)) return -EFAULT; - current->mm->env_end = p; + mm->env_end = p; /* Put the elf_info on the stack in the right place. */ - if (copy_to_user(sp, elf_info, ei_index * sizeof(elf_addr_t))) + if (copy_to_user(sp, mm->saved_auxv, ei_index * sizeof(elf_addr_t))) return -EFAULT; return 0; } @@ -689,15 +692,17 @@ static int load_elf_binary(struct linux_binprm *bprm) int bss_prot = 0; int retval, i; unsigned long elf_entry; + unsigned long e_entry; unsigned long interp_load_addr = 0; unsigned long start_code, end_code, start_data, end_data; unsigned long reloc_func_desc __maybe_unused = 0; int executable_stack = EXSTACK_DEFAULT; + struct elfhdr *elf_ex = (struct elfhdr *)bprm->buf; struct { - struct elfhdr elf_ex; struct elfhdr interp_elf_ex; } *loc; struct arch_elf_state arch_state = INIT_ARCH_ELF_STATE; + struct mm_struct *mm; struct pt_regs *regs; loc = kmalloc(sizeof(*loc), GFP_KERNEL); @@ -705,30 +710,27 @@ static int load_elf_binary(struct linux_binprm *bprm) retval = -ENOMEM; goto out_ret; } - - /* Get the exec-header */ - loc->elf_ex = *((struct elfhdr *)bprm->buf); retval = -ENOEXEC; /* First of all, some simple consistency checks */ - if (memcmp(loc->elf_ex.e_ident, ELFMAG, SELFMAG) != 0) + if (memcmp(elf_ex->e_ident, ELFMAG, SELFMAG) != 0) goto out; - if (loc->elf_ex.e_type != ET_EXEC && loc->elf_ex.e_type != ET_DYN) + if (elf_ex->e_type != ET_EXEC && elf_ex->e_type != ET_DYN) goto out; - if (!elf_check_arch(&loc->elf_ex)) + if (!elf_check_arch(elf_ex)) goto out; - if (elf_check_fdpic(&loc->elf_ex)) + if (elf_check_fdpic(elf_ex)) goto out; if (!bprm->file->f_op->mmap) goto out; - elf_phdata = load_elf_phdrs(&loc->elf_ex, bprm->file); + elf_phdata = load_elf_phdrs(elf_ex, bprm->file); if (!elf_phdata) goto out; elf_ppnt = elf_phdata; - for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { + for (i = 0; i < elf_ex->e_phnum; i++, elf_ppnt++) { char *elf_interpreter; if (elf_ppnt->p_type != PT_INTERP) @@ -782,7 +784,7 @@ out_free_interp: } elf_ppnt = elf_phdata; - for (i = 0; i < loc->elf_ex.e_phnum; i++, elf_ppnt++) + for (i = 0; i < elf_ex->e_phnum; i++, elf_ppnt++) switch (elf_ppnt->p_type) { case PT_GNU_STACK: if (elf_ppnt->p_flags & PF_X) @@ -792,7 +794,7 @@ out_free_interp: break; case PT_LOPROC ... PT_HIPROC: - retval = arch_elf_pt_proc(&loc->elf_ex, elf_ppnt, + retval = arch_elf_pt_proc(elf_ex, elf_ppnt, bprm->file, false, &arch_state); if (retval) @@ -836,7 +838,7 @@ out_free_interp: * still possible to return an error to the code that invoked * the exec syscall. */ - retval = arch_check_elf(&loc->elf_ex, + retval = arch_check_elf(elf_ex, !!interpreter, &loc->interp_elf_ex, &arch_state); if (retval) @@ -849,8 +851,8 @@ out_free_interp: /* Do this immediately, since STACK_TOP as used in setup_arg_pages may depend on the personality. */ - SET_PERSONALITY2(loc->elf_ex, &arch_state); - if (elf_read_implies_exec(loc->elf_ex, executable_stack)) + SET_PERSONALITY2(*elf_ex, &arch_state); + if (elf_read_implies_exec(*elf_ex, executable_stack)) current->personality |= READ_IMPLIES_EXEC; if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) @@ -877,7 +879,7 @@ out_free_interp: /* Now we do a little grungy work by mmapping the ELF image into the correct location in memory. */ for(i = 0, elf_ppnt = elf_phdata; - i < loc->elf_ex.e_phnum; i++, elf_ppnt++) { + i < elf_ex->e_phnum; i++, elf_ppnt++) { int elf_prot, elf_flags; unsigned long k, vaddr; unsigned long total_size = 0; @@ -921,9 +923,9 @@ out_free_interp: * If we are loading ET_EXEC or we have already performed * the ET_DYN load_addr calculations, proceed normally. */ - if (loc->elf_ex.e_type == ET_EXEC || load_addr_set) { + if (elf_ex->e_type == ET_EXEC || load_addr_set) { elf_flags |= MAP_FIXED; - } else if (loc->elf_ex.e_type == ET_DYN) { + } else if (elf_ex->e_type == ET_DYN) { /* * This logic is run once for the first LOAD Program * Header for ET_DYN binaries to calculate the @@ -972,7 +974,7 @@ out_free_interp: load_bias = ELF_PAGESTART(load_bias - vaddr); total_size = total_mapping_size(elf_phdata, - loc->elf_ex.e_phnum); + elf_ex->e_phnum); if (!total_size) { retval = -EINVAL; goto out_free_dentry; @@ -990,7 +992,7 @@ out_free_interp: if (!load_addr_set) { load_addr_set = 1; load_addr = (elf_ppnt->p_vaddr - elf_ppnt->p_offset); - if (loc->elf_ex.e_type == ET_DYN) { + if (elf_ex->e_type == ET_DYN) { load_bias += error - ELF_PAGESTART(load_bias + vaddr); load_addr += load_bias; @@ -998,7 +1000,7 @@ out_free_interp: } } k = elf_ppnt->p_vaddr; - if (k < start_code) + if ((elf_ppnt->p_flags & PF_X) && k < start_code) start_code = k; if (start_data < k) start_data = k; @@ -1031,7 +1033,7 @@ out_free_interp: } } - loc->elf_ex.e_entry += load_bias; + e_entry = elf_ex->e_entry + load_bias; elf_bss += load_bias; elf_brk += load_bias; start_code += load_bias; @@ -1074,7 +1076,7 @@ out_free_interp: allow_write_access(interpreter); fput(interpreter); } else { - elf_entry = loc->elf_ex.e_entry; + elf_entry = e_entry; if (BAD_ADDR(elf_entry)) { retval = -EINVAL; goto out_free_dentry; @@ -1092,15 +1094,17 @@ out_free_interp: goto out; #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */ - retval = create_elf_tables(bprm, &loc->elf_ex, - load_addr, interp_load_addr); + retval = create_elf_tables(bprm, elf_ex, + load_addr, interp_load_addr, e_entry); if (retval < 0) goto out; - current->mm->end_code = end_code; - current->mm->start_code = start_code; - current->mm->start_data = start_data; - current->mm->end_data = end_data; - current->mm->start_stack = bprm->p; + + mm = current->mm; + mm->end_code = end_code; + mm->start_code = start_code; + mm->start_data = start_data; + mm->end_data = end_data; + mm->start_stack = bprm->p; if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) { /* @@ -1111,12 +1115,11 @@ out_free_interp: * growing down), and into the unused ELF_ET_DYN_BASE region. */ if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && - loc->elf_ex.e_type == ET_DYN && !interpreter) - current->mm->brk = current->mm->start_brk = - ELF_ET_DYN_BASE; + elf_ex->e_type == ET_DYN && !interpreter) { + mm->brk = mm->start_brk = ELF_ET_DYN_BASE; + } - current->mm->brk = current->mm->start_brk = - arch_randomize_brk(current->mm); + mm->brk = mm->start_brk = arch_randomize_brk(mm); #ifdef compat_brk_randomized current->brk_randomized = 1; #endif @@ -1574,6 +1577,7 @@ static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata, */ static int fill_files_note(struct memelfnote *note) { + struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned count, size, names_ofs, remaining, n; user_long_t *data; @@ -1581,7 +1585,7 @@ static int fill_files_note(struct memelfnote *note) char *name_base, *name_curpos; /* *Estimated* file count and total data size needed */ - count = current->mm->map_count; + count = mm->map_count; if (count > UINT_MAX / 64) return -EINVAL; size = count * 64; @@ -1591,6 +1595,10 @@ static int fill_files_note(struct memelfnote *note) if (size >= MAX_FILE_NOTE_SIZE) /* paranoia check */ return -EINVAL; size = round_up(size, PAGE_SIZE); + /* + * "size" can be 0 here legitimately. + * Let it ENOMEM and omit NT_FILE section which will be empty anyway. + */ data = kvmalloc(size, GFP_KERNEL); if (ZERO_OR_NULL_PTR(data)) return -ENOMEM; @@ -1599,7 +1607,7 @@ static int fill_files_note(struct memelfnote *note) name_base = name_curpos = ((char *)data) + names_ofs; remaining = size - names_ofs; count = 0; - for (vma = current->mm->mmap; vma != NULL; vma = vma->vm_next) { + for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) { struct file *file; const char *filename; @@ -1633,10 +1641,10 @@ static int fill_files_note(struct memelfnote *note) data[0] = count; data[1] = PAGE_SIZE; /* - * Count usually is less than current->mm->map_count, + * Count usually is less than mm->map_count, * we need to move filenames down. */ - n = current->mm->map_count - count; + n = mm->map_count - count; if (n != 0) { unsigned shift_bytes = n * 3 * sizeof(data[0]); memmove(name_base - shift_bytes, name_base, @@ -2182,7 +2190,7 @@ static int elf_core_dump(struct coredump_params *cprm) int segs, i; size_t vma_data_size = 0; struct vm_area_struct *vma, *gate_vma; - struct elfhdr *elf = NULL; + struct elfhdr elf; loff_t offset = 0, dataoff; struct elf_note_info info = { }; struct elf_phdr *phdr4note = NULL; @@ -2203,10 +2211,6 @@ static int elf_core_dump(struct coredump_params *cprm) * exists while dumping the mm->vm_next areas to the core file. */ - /* alloc memory for large data structures: too large to be on stack */ - elf = kmalloc(sizeof(*elf), GFP_KERNEL); - if (!elf) - goto out; /* * The number of segs are recored into ELF header as 16bit value. * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here. @@ -2230,7 +2234,7 @@ static int elf_core_dump(struct coredump_params *cprm) * Collect all the non-memory information about the process for the * notes. This also sets up the file header. */ - if (!fill_note_info(elf, e_phnum, &info, cprm->siginfo, cprm->regs)) + if (!fill_note_info(&elf, e_phnum, &info, cprm->siginfo, cprm->regs)) goto cleanup; has_dumped = 1; @@ -2238,7 +2242,7 @@ static int elf_core_dump(struct coredump_params *cprm) fs = get_fs(); set_fs(KERNEL_DS); - offset += sizeof(*elf); /* Elf header */ + offset += sizeof(elf); /* Elf header */ offset += segs * sizeof(struct elf_phdr); /* Program headers */ /* Write notes phdr entry */ @@ -2257,11 +2261,13 @@ static int elf_core_dump(struct coredump_params *cprm) dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); - if (segs - 1 > ULONG_MAX / sizeof(*vma_filesz)) - goto end_coredump; + /* + * Zero vma process will get ZERO_SIZE_PTR here. + * Let coredump continue for register state at least. + */ vma_filesz = kvmalloc(array_size(sizeof(*vma_filesz), (segs - 1)), GFP_KERNEL); - if (ZERO_OR_NULL_PTR(vma_filesz)) + if (!vma_filesz) goto end_coredump; for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; @@ -2281,12 +2287,12 @@ static int elf_core_dump(struct coredump_params *cprm) shdr4extnum = kmalloc(sizeof(*shdr4extnum), GFP_KERNEL); if (!shdr4extnum) goto end_coredump; - fill_extnum_info(elf, shdr4extnum, e_shoff, segs); + fill_extnum_info(&elf, shdr4extnum, e_shoff, segs); } offset = dataoff; - if (!dump_emit(cprm, elf, sizeof(*elf))) + if (!dump_emit(cprm, &elf, sizeof(elf))) goto end_coredump; if (!dump_emit(cprm, phdr4note, sizeof(*phdr4note))) @@ -2370,8 +2376,6 @@ cleanup: kfree(shdr4extnum); kvfree(vma_filesz); kfree(phdr4note); - kfree(elf); -out: return has_dumped; } diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 82200dbca5ac..9a0ff3384381 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -11,7 +11,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ - block-rsv.o delalloc-space.o block-group.o + block-rsv.o delalloc-space.o block-group.o discard.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 6934a5b8708f..404e050ce8ee 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -14,6 +14,8 @@ #include "sysfs.h" #include "tree-log.h" #include "delalloc-space.h" +#include "discard.h" +#include "raid56.h" /* * Return target flags in extended format or 0 if restripe for this chunk_type @@ -95,7 +97,7 @@ static u64 btrfs_reduce_alloc_profile(struct btrfs_fs_info *fs_info, u64 flags) return extended_to_chunk(flags | allowed); } -static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags) +u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags) { unsigned seq; u64 flags; @@ -115,11 +117,6 @@ static u64 get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags) return btrfs_reduce_alloc_profile(fs_info, flags); } -u64 btrfs_get_alloc_profile(struct btrfs_fs_info *fs_info, u64 orig_flags) -{ - return get_alloc_profile(fs_info, orig_flags); -} - void btrfs_get_block_group(struct btrfs_block_group *cache) { atomic_inc(&cache->count); @@ -132,6 +129,15 @@ void btrfs_put_block_group(struct btrfs_block_group *cache) WARN_ON(cache->reserved > 0); /* + * A block_group shouldn't be on the discard_list anymore. + * Remove the block_group from the discard_list to prevent us + * from causing a panic due to NULL pointer dereference. + */ + if (WARN_ON(!list_empty(&cache->discard_list))) + btrfs_discard_cancel_work(&cache->fs_info->discard_ctl, + cache); + + /* * If not empty, someone is still holding mutex of * full_stripe_lock, which can only be released by caller. * And it will definitely cause use-after-free when caller @@ -466,8 +472,8 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end } else if (extent_start > start && extent_start < end) { size = extent_start - start; total_added += size; - ret = btrfs_add_free_space(block_group, start, - size); + ret = btrfs_add_free_space_async_trimmed(block_group, + start, size); BUG_ON(ret); /* -ENOMEM or logic error */ start = extent_end + 1; } else { @@ -478,7 +484,8 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end if (start < end) { size = end - start; total_added += size; - ret = btrfs_add_free_space(block_group, start, size); + ret = btrfs_add_free_space_async_trimmed(block_group, start, + size); BUG_ON(ret); /* -ENOMEM or logic error */ } @@ -1184,22 +1191,8 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force) { struct btrfs_space_info *sinfo = cache->space_info; u64 num_bytes; - u64 sinfo_used; - u64 min_allocable_bytes; int ret = -ENOSPC; - /* - * We need some metadata space and system metadata space for - * allocating chunks in some corner cases until we force to set - * it to be readonly. - */ - if ((sinfo->flags & - (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && - !force) - min_allocable_bytes = SZ_1M; - else - min_allocable_bytes = 0; - spin_lock(&sinfo->lock); spin_lock(&cache->lock); @@ -1211,20 +1204,38 @@ static int inc_block_group_ro(struct btrfs_block_group *cache, int force) num_bytes = cache->length - cache->reserved - cache->pinned - cache->bytes_super - cache->used; - sinfo_used = btrfs_space_info_used(sinfo, true); /* - * sinfo_used + num_bytes should always <= sinfo->total_bytes. - * - * Here we make sure if we mark this bg RO, we still have enough - * free space as buffer (if min_allocable_bytes is not 0). + * Data never overcommits, even in mixed mode, so do just the straight + * check of left over space in how much we have allocated. */ - if (sinfo_used + num_bytes + min_allocable_bytes <= - sinfo->total_bytes) { + if (force) { + ret = 0; + } else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) { + u64 sinfo_used = btrfs_space_info_used(sinfo, true); + + /* + * Here we make sure if we mark this bg RO, we still have enough + * free space as buffer. + */ + if (sinfo_used + num_bytes <= sinfo->total_bytes) + ret = 0; + } else { + /* + * We overcommit metadata, so we need to do the + * btrfs_can_overcommit check here, and we need to pass in + * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of + * leeway to allow us to mark this block group as read only. + */ + if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes, + BTRFS_RESERVE_NO_FLUSH)) + ret = 0; + } + + if (!ret) { sinfo->bytes_readonly += num_bytes; cache->ro++; list_add_tail(&cache->ro_list, &sinfo->ro_bgs); - ret = 0; } out: spin_unlock(&cache->lock); @@ -1232,9 +1243,6 @@ out: if (ret == -ENOSPC && btrfs_test_opt(cache->fs_info, ENOSPC_DEBUG)) { btrfs_info(cache->fs_info, "unable to make block group %llu ro", cache->start); - btrfs_info(cache->fs_info, - "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu", - sinfo_used, num_bytes, min_allocable_bytes); btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0); } return ret; @@ -1249,6 +1257,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) struct btrfs_block_group *block_group; struct btrfs_space_info *space_info; struct btrfs_trans_handle *trans; + const bool async_trim_enabled = btrfs_test_opt(fs_info, DISCARD_ASYNC); int ret = 0; if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) @@ -1272,10 +1281,28 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) } spin_unlock(&fs_info->unused_bgs_lock); + btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); + mutex_lock(&fs_info->delete_unused_bgs_mutex); /* Don't want to race with allocators so take the groups_sem */ down_write(&space_info->groups_sem); + + /* + * Async discard moves the final block group discard to be prior + * to the unused_bgs code path. Therefore, if it's not fully + * trimmed, punt it back to the async discard lists. + */ + if (btrfs_test_opt(fs_info, DISCARD_ASYNC) && + !btrfs_is_free_space_trimmed(block_group)) { + trace_btrfs_skip_unused_block_group(block_group); + up_write(&space_info->groups_sem); + /* Requeue if we failed because of async discard */ + btrfs_discard_queue_work(&fs_info->discard_ctl, + block_group); + goto next; + } + spin_lock(&block_group->lock); if (block_group->reserved || block_group->pinned || block_group->used || block_group->ro || @@ -1347,6 +1374,23 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) } mutex_unlock(&fs_info->unused_bg_unpin_mutex); + /* + * At this point, the block_group is read only and should fail + * new allocations. However, btrfs_finish_extent_commit() can + * cause this block_group to be placed back on the discard + * lists because now the block_group isn't fully discarded. + * Bail here and try again later after discarding everything. + */ + spin_lock(&fs_info->discard_ctl.lock); + if (!list_empty(&block_group->discard_list)) { + spin_unlock(&fs_info->discard_ctl.lock); + btrfs_dec_block_group_ro(block_group); + btrfs_discard_queue_work(&fs_info->discard_ctl, + block_group); + goto end_trans; + } + spin_unlock(&fs_info->discard_ctl.lock); + /* Reset pinned so btrfs_put_block_group doesn't complain */ spin_lock(&space_info->lock); spin_lock(&block_group->lock); @@ -1362,8 +1406,18 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_unlock(&block_group->lock); spin_unlock(&space_info->lock); + /* + * The normal path here is an unused block group is passed here, + * then trimming is handled in the transaction commit path. + * Async discard interposes before this to do the trimming + * before coming down the unused block group path as trimming + * will no longer be done later in the transaction commit path. + */ + if (!async_trim_enabled && btrfs_test_opt(fs_info, DISCARD_ASYNC)) + goto flip_async; + /* DISCARD can flip during remount */ - trimming = btrfs_test_opt(fs_info, DISCARD); + trimming = btrfs_test_opt(fs_info, DISCARD_SYNC); /* Implicit trim during transaction commit. */ if (trimming) @@ -1406,6 +1460,13 @@ next: spin_lock(&fs_info->unused_bgs_lock); } spin_unlock(&fs_info->unused_bgs_lock); + return; + +flip_async: + btrfs_end_transaction(trans); + mutex_unlock(&fs_info->delete_unused_bgs_mutex); + btrfs_put_block_group(block_group); + btrfs_discard_punt_unused_bgs_list(fs_info); } void btrfs_mark_bg_unused(struct btrfs_block_group *bg) @@ -1516,6 +1577,102 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) write_sequnlock(&fs_info->profiles_lock); } +/** + * btrfs_rmap_block - Map a physical disk address to a list of logical addresses + * @chunk_start: logical address of block group + * @physical: physical address to map to logical addresses + * @logical: return array of logical addresses which map to @physical + * @naddrs: length of @logical + * @stripe_len: size of IO stripe for the given block group + * + * Maps a particular @physical disk address to a list of @logical addresses. + * Used primarily to exclude those portions of a block group that contain super + * block copies. + */ +EXPORT_FOR_TESTS +int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, + u64 physical, u64 **logical, int *naddrs, int *stripe_len) +{ + struct extent_map *em; + struct map_lookup *map; + u64 *buf; + u64 bytenr; + u64 data_stripe_length; + u64 io_stripe_size; + int i, nr = 0; + int ret = 0; + + em = btrfs_get_chunk_map(fs_info, chunk_start, 1); + if (IS_ERR(em)) + return -EIO; + + map = em->map_lookup; + data_stripe_length = em->len; + io_stripe_size = map->stripe_len; + + if (map->type & BTRFS_BLOCK_GROUP_RAID10) + data_stripe_length = div_u64(data_stripe_length, + map->num_stripes / map->sub_stripes); + else if (map->type & BTRFS_BLOCK_GROUP_RAID0) + data_stripe_length = div_u64(data_stripe_length, map->num_stripes); + else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + data_stripe_length = div_u64(data_stripe_length, + nr_data_stripes(map)); + io_stripe_size = map->stripe_len * nr_data_stripes(map); + } + + buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS); + if (!buf) { + ret = -ENOMEM; + goto out; + } + + for (i = 0; i < map->num_stripes; i++) { + bool already_inserted = false; + u64 stripe_nr; + int j; + + if (!in_range(physical, map->stripes[i].physical, + data_stripe_length)) + continue; + + stripe_nr = physical - map->stripes[i].physical; + stripe_nr = div64_u64(stripe_nr, map->stripe_len); + + if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + stripe_nr = stripe_nr * map->num_stripes + i; + stripe_nr = div_u64(stripe_nr, map->sub_stripes); + } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { + stripe_nr = stripe_nr * map->num_stripes + i; + } + /* + * The remaining case would be for RAID56, multiply by + * nr_data_stripes(). Alternatively, just use rmap_len below + * instead of map->stripe_len + */ + + bytenr = chunk_start + stripe_nr * io_stripe_size; + + /* Ensure we don't add duplicate addresses */ + for (j = 0; j < nr; j++) { + if (buf[j] == bytenr) { + already_inserted = true; + break; + } + } + + if (!already_inserted) + buf[nr++] = bytenr; + } + + *logical = buf; + *naddrs = nr; + *stripe_len = io_stripe_size; +out: + free_extent_map(em); + return ret; +} + static int exclude_super_stripes(struct btrfs_block_group *cache) { struct btrfs_fs_info *fs_info = cache->fs_info; @@ -1610,6 +1767,8 @@ static struct btrfs_block_group *btrfs_create_block_group_cache( cache->full_stripe_len = btrfs_full_stripe_len(fs_info, start); set_free_space_tree_thresholds(cache); + cache->discard_index = BTRFS_DISCARD_INDEX_UNUSED; + atomic_set(&cache->count, 1); spin_lock_init(&cache->lock); init_rwsem(&cache->data_rwsem); @@ -1617,6 +1776,7 @@ static struct btrfs_block_group *btrfs_create_block_group_cache( INIT_LIST_HEAD(&cache->cluster_list); INIT_LIST_HEAD(&cache->bg_list); INIT_LIST_HEAD(&cache->ro_list); + INIT_LIST_HEAD(&cache->discard_list); INIT_LIST_HEAD(&cache->dirty_list); INIT_LIST_HEAD(&cache->io_list); btrfs_init_free_space_ctl(cache); @@ -1775,7 +1935,10 @@ static int read_one_block_group(struct btrfs_fs_info *info, inc_block_group_ro(cache, 1); } else if (cache->used == 0) { ASSERT(list_empty(&cache->bg_list)); - btrfs_mark_bg_unused(cache); + if (btrfs_test_opt(info, DISCARD_ASYNC)) + btrfs_discard_queue_work(&info->discard_ctl, cache); + else + btrfs_mark_bg_unused(cache); } return 0; error: @@ -2077,7 +2240,7 @@ again: } } - ret = inc_block_group_ro(cache, !do_chunk_alloc); + ret = inc_block_group_ro(cache, 0); if (!do_chunk_alloc) goto unlock_out; if (!ret) @@ -2738,8 +2901,10 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, * dirty list to avoid races between cleaner kthread and space * cache writeout. */ - if (!alloc && old_val == 0) - btrfs_mark_bg_unused(cache); + if (!alloc && old_val == 0) { + if (!btrfs_test_opt(info, DISCARD_ASYNC)) + btrfs_mark_bg_unused(cache); + } btrfs_put_block_group(cache); total -= num_bytes; diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 9b409676c4b2..107bb557ca8d 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -13,6 +13,19 @@ enum btrfs_disk_cache_state { }; /* + * This describes the state of the block_group for async discard. This is due + * to the two pass nature of it where extent discarding is prioritized over + * bitmap discarding. BTRFS_DISCARD_RESET_CURSOR is set when we are resetting + * between lists to prevent contention for discard state variables + * (eg. discard_cursor). + */ +enum btrfs_discard_state { + BTRFS_DISCARD_EXTENTS, + BTRFS_DISCARD_BITMAPS, + BTRFS_DISCARD_RESET_CURSOR, +}; + +/* * Control flags for do_chunk_alloc's force field CHUNK_ALLOC_NO_FORCE means to * only allocate a chunk if we really need one. * @@ -116,7 +129,13 @@ struct btrfs_block_group { /* For read-only block groups */ struct list_head ro_list; + /* For discard operations */ atomic_t trimming; + struct list_head discard_list; + int discard_index; + u64 discard_eligible_time; + u64 discard_cursor; + enum btrfs_discard_state discard_state; /* For dirty block groups */ struct list_head dirty_list; @@ -158,6 +177,22 @@ struct btrfs_block_group { struct btrfs_full_stripe_locks_tree full_stripe_locks_root; }; +static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) +{ + return (block_group->start + block_group->length); +} + +static inline bool btrfs_is_block_group_data_only( + struct btrfs_block_group *block_group) +{ + /* + * In mixed mode the fragmentation is expected to be high, lowering the + * efficiency, so only proper data block groups are considered. + */ + return (block_group->flags & BTRFS_BLOCK_GROUP_DATA) && + !(block_group->flags & BTRFS_BLOCK_GROUP_METADATA); +} + #ifdef CONFIG_BTRFS_DEBUG static inline int btrfs_should_fragment_free_space( struct btrfs_block_group *block_group) @@ -248,4 +283,9 @@ static inline int btrfs_block_group_done(struct btrfs_block_group *cache) cache->cached == BTRFS_CACHE_ERROR; } +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, + u64 physical, u64 **logical, int *naddrs, int *stripe_len); +#endif + #endif /* BTRFS_BLOCK_GROUP_H */ diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 0b52ab4cb964..a0ce69f2d27c 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -629,7 +629,6 @@ static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(dev_t dev, static int btrfsic_process_superblock(struct btrfsic_state *state, struct btrfs_fs_devices *fs_devices) { - struct btrfs_fs_info *fs_info = state->fs_info; struct btrfs_super_block *selected_super; struct list_head *dev_head = &fs_devices->devices; struct btrfs_device *device; @@ -637,7 +636,6 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, int ret = 0; int pass; - BUG_ON(NULL == state); selected_super = kzalloc(sizeof(*selected_super), GFP_NOFS); if (NULL == selected_super) { pr_info("btrfsic: error, kmalloc failed!\n"); @@ -700,7 +698,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state, break; } - num_copies = btrfs_num_copies(fs_info, next_bytenr, + num_copies = btrfs_num_copies(state->fs_info, next_bytenr, state->metablock_size); if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES) pr_info("num_copies(log_bytenr=%llu) = %d\n", diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 43e1660f450f..9ab610cc9114 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -763,7 +763,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { ret = btrfs_lookup_bio_sums(inode, comp_bio, - sums); + (u64)-1, sums); BUG_ON(ret); /* -ENOMEM */ } @@ -791,7 +791,7 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, BUG_ON(ret); /* -ENOMEM */ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { - ret = btrfs_lookup_bio_sums(inode, comp_bio, sums); + ret = btrfs_lookup_bio_sums(inode, comp_bio, (u64)-1, sums); BUG_ON(ret); /* -ENOMEM */ } @@ -1290,7 +1290,7 @@ int btrfs_decompress_buf2page(const char *buf, unsigned long buf_start, /* copy bytes from the working buffer into the pages */ while (working_bytes > 0) { bytes = min_t(unsigned long, bvec.bv_len, - PAGE_SIZE - buf_offset); + PAGE_SIZE - (buf_offset % PAGE_SIZE)); bytes = min(bytes, working_bytes); kaddr = kmap_atomic(bvec.bv_page); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 24658b5a5787..f2ec1a9bae28 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -326,12 +326,10 @@ u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, struct seq_list *elem) { write_lock(&fs_info->tree_mod_log_lock); - spin_lock(&fs_info->tree_mod_seq_lock); if (!elem->seq) { elem->seq = btrfs_inc_tree_mod_seq(fs_info); list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); } - spin_unlock(&fs_info->tree_mod_seq_lock); write_unlock(&fs_info->tree_mod_log_lock); return elem->seq; @@ -351,7 +349,7 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, if (!seq_putting) return; - spin_lock(&fs_info->tree_mod_seq_lock); + write_lock(&fs_info->tree_mod_log_lock); list_del(&elem->list); elem->seq = 0; @@ -362,19 +360,17 @@ void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, * blocker with lower sequence number exists, we * cannot remove anything from the log */ - spin_unlock(&fs_info->tree_mod_seq_lock); + write_unlock(&fs_info->tree_mod_log_lock); return; } min_seq = cur_elem->seq; } } - spin_unlock(&fs_info->tree_mod_seq_lock); /* * anything that's lower than the lowest existing (read: blocked) * sequence number can be removed from the tree. */ - write_lock(&fs_info->tree_mod_log_lock); tm_root = &fs_info->tree_mod_log; for (node = rb_first(tm_root); node; node = next) { next = rb_next(node); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 54efb21c2727..36df977b64d9 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -101,6 +101,14 @@ struct btrfs_ref; #define BTRFS_MAX_EXTENT_SIZE SZ_128M +/* + * Deltas are an effective way to populate global statistics. Give macro names + * to make it clear what we're doing. An example is discard_extents in + * btrfs_free_space_ctl. + */ +#define BTRFS_STAT_NR_ENTRIES 2 +#define BTRFS_STAT_CURR 0 +#define BTRFS_STAT_PREV 1 /* * Count how many BTRFS_MAX_EXTENT_SIZE cover the @size @@ -440,6 +448,36 @@ struct btrfs_full_stripe_locks_tree { struct mutex lock; }; +/* Discard control. */ +/* + * Async discard uses multiple lists to differentiate the discard filter + * parameters. Index 0 is for completely free block groups where we need to + * ensure the entire block group is trimmed without being lossy. Indices + * afterwards represent monotonically decreasing discard filter sizes to + * prioritize what should be discarded next. + */ +#define BTRFS_NR_DISCARD_LISTS 3 +#define BTRFS_DISCARD_INDEX_UNUSED 0 +#define BTRFS_DISCARD_INDEX_START 1 + +struct btrfs_discard_ctl { + struct workqueue_struct *discard_workers; + struct delayed_work work; + spinlock_t lock; + struct btrfs_block_group *block_group; + struct list_head discard_list[BTRFS_NR_DISCARD_LISTS]; + u64 prev_discard; + atomic_t discardable_extents; + atomic64_t discardable_bytes; + u64 max_discard_size; + unsigned long delay; + u32 iops_limit; + u32 kbps_limit; + u64 discard_extent_bytes; + u64 discard_bitmap_bytes; + atomic64_t discard_bytes_saved; +}; + /* delayed seq elem */ struct seq_list { struct list_head list; @@ -526,6 +564,9 @@ enum { * so we don't need to offload checksums to workqueues. */ BTRFS_FS_CSUM_IMPL_FAST, + + /* Indicate that the discard workqueue can service discards. */ + BTRFS_FS_DISCARD_RUNNING, }; struct btrfs_fs_info { @@ -673,14 +714,12 @@ struct btrfs_fs_info { atomic_t nr_delayed_iputs; wait_queue_head_t delayed_iputs_wait; - /* this protects tree_mod_seq_list */ - spinlock_t tree_mod_seq_lock; atomic64_t tree_mod_seq; - struct list_head tree_mod_seq_list; - /* this protects tree_mod_log */ + /* this protects tree_mod_log and tree_mod_seq_list */ rwlock_t tree_mod_log_lock; struct rb_root tree_mod_log; + struct list_head tree_mod_seq_list; atomic_t async_delalloc_pages; @@ -816,6 +855,8 @@ struct btrfs_fs_info { struct btrfs_workqueue *scrub_wr_completion_workers; struct btrfs_workqueue *scrub_parity_workers; + struct btrfs_discard_ctl discard_ctl; + #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY u32 check_integrity_print_mask; #endif @@ -902,6 +943,11 @@ struct btrfs_fs_info { spinlock_t ref_verify_lock; struct rb_root block_tree; #endif + +#ifdef CONFIG_BTRFS_DEBUG + struct kobject *debug_kobj; + struct kobject *discard_debug_kobj; +#endif }; static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) @@ -1170,7 +1216,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) #define BTRFS_MOUNT_FLUSHONCOMMIT (1 << 7) #define BTRFS_MOUNT_SSD_SPREAD (1 << 8) #define BTRFS_MOUNT_NOSSD (1 << 9) -#define BTRFS_MOUNT_DISCARD (1 << 10) +#define BTRFS_MOUNT_DISCARD_SYNC (1 << 10) #define BTRFS_MOUNT_FORCE_COMPRESS (1 << 11) #define BTRFS_MOUNT_SPACE_CACHE (1 << 12) #define BTRFS_MOUNT_CLEAR_CACHE (1 << 13) @@ -1189,6 +1235,7 @@ static inline u32 BTRFS_MAX_XATTR_SIZE(const struct btrfs_fs_info *info) #define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26) #define BTRFS_MOUNT_NOLOGREPLAY (1 << 27) #define BTRFS_MOUNT_REF_VERIFY (1 << 28) +#define BTRFS_MOUNT_DISCARD_ASYNC (1 << 29) #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) #define BTRFS_DEFAULT_MAX_INLINE (2048) @@ -2449,8 +2496,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len, int delalloc); -int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 len); +int btrfs_pin_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, + u64 len); void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, @@ -2789,9 +2836,7 @@ struct btrfs_dio_private; int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len); blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, - u8 *dst); -blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, - u64 logical_offset); + u64 offset, u8 *dst); int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 pos, @@ -2877,7 +2922,7 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, struct btrfs_root *root); struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct page *page, size_t pg_offset, - u64 start, u64 end, int create); + u64 start, u64 end); int btrfs_update_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); @@ -3110,17 +3155,21 @@ do { \ rcu_read_unlock(); \ } while (0) -__cold -static inline void assfail(const char *expr, const char *file, int line) +#ifdef CONFIG_BTRFS_ASSERT +__cold __noreturn +static inline void assertfail(const char *expr, const char *file, int line) { - if (IS_ENABLED(CONFIG_BTRFS_ASSERT)) { - pr_err("assertion failed: %s, in %s:%d\n", expr, file, line); - BUG(); - } + pr_err("assertion failed: %s, in %s:%d\n", expr, file, line); + BUG(); } -#define ASSERT(expr) \ - (likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) +#define ASSERT(expr) \ + (likely(expr) ? (void)0 : assertfail(#expr, __FILE__, __LINE__)) + +#else +static inline void assertfail(const char *expr, const char* file, int line) { } +#define ASSERT(expr) (void)(expr) +#endif /* * Use that for functions that are conditionally exported for sanity tests but diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index df3bd880061d..dfdb7d4f8406 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -492,7 +492,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, if (head->is_data) return; - spin_lock(&fs_info->tree_mod_seq_lock); + read_lock(&fs_info->tree_mod_log_lock); if (!list_empty(&fs_info->tree_mod_seq_list)) { struct seq_list *elem; @@ -500,7 +500,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, struct seq_list, list); seq = elem->seq; } - spin_unlock(&fs_info->tree_mod_seq_lock); + read_unlock(&fs_info->tree_mod_log_lock); again: for (node = rb_first_cached(&head->ref_tree); node; @@ -518,7 +518,7 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq) struct seq_list *elem; int ret = 0; - spin_lock(&fs_info->tree_mod_seq_lock); + read_lock(&fs_info->tree_mod_log_lock); if (!list_empty(&fs_info->tree_mod_seq_list)) { elem = list_first_entry(&fs_info->tree_mod_seq_list, struct seq_list, list); @@ -531,7 +531,7 @@ int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq) } } - spin_unlock(&fs_info->tree_mod_seq_lock); + read_unlock(&fs_info->tree_mod_log_lock); return ret; } diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index f639dde2a679..2ca2a09d0e23 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -500,11 +500,8 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, &dev_replace->scrub_progress, 0, 1); ret = btrfs_dev_replace_finishing(fs_info, ret); - if (ret == -EINPROGRESS) { + if (ret == -EINPROGRESS) ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_SCRUB_INPROGRESS; - } else if (ret != -ECANCELED) { - WARN_ON(ret); - } return ret; @@ -707,6 +704,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, /* replace the sysfs entry */ btrfs_sysfs_rm_device_link(fs_info->fs_devices, src_device); + btrfs_sysfs_update_devid(tgt_device); btrfs_rm_dev_replace_free_srcdev(src_device); /* write back the superblocks */ diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c new file mode 100644 index 000000000000..5615320fa659 --- /dev/null +++ b/fs/btrfs/discard.c @@ -0,0 +1,702 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/jiffies.h> +#include <linux/kernel.h> +#include <linux/ktime.h> +#include <linux/list.h> +#include <linux/math64.h> +#include <linux/sizes.h> +#include <linux/workqueue.h> +#include "ctree.h" +#include "block-group.h" +#include "discard.h" +#include "free-space-cache.h" + +/* + * This contains the logic to handle async discard. + * + * Async discard manages trimming of free space outside of transaction commit. + * Discarding is done by managing the block_groups on a LRU list based on free + * space recency. Two passes are used to first prioritize discarding extents + * and then allow for trimming in the bitmap the best opportunity to coalesce. + * The block_groups are maintained on multiple lists to allow for multiple + * passes with different discard filter requirements. A delayed work item is + * used to manage discarding with timeout determined by a max of the delay + * incurred by the iops rate limit, the byte rate limit, and the max delay of + * BTRFS_DISCARD_MAX_DELAY. + * + * Note, this only keeps track of block_groups that are explicitly for data. + * Mixed block_groups are not supported. + * + * The first list is special to manage discarding of fully free block groups. + * This is necessary because we issue a final trim for a full free block group + * after forgetting it. When a block group becomes unused, instead of directly + * being added to the unused_bgs list, we add it to this first list. Then + * from there, if it becomes fully discarded, we place it onto the unused_bgs + * list. + * + * The in-memory free space cache serves as the backing state for discard. + * Consequently this means there is no persistence. We opt to load all the + * block groups in as not discarded, so the mount case degenerates to the + * crashing case. + * + * As the free space cache uses bitmaps, there exists a tradeoff between + * ease/efficiency for find_free_extent() and the accuracy of discard state. + * Here we opt to let untrimmed regions merge with everything while only letting + * trimmed regions merge with other trimmed regions. This can cause + * overtrimming, but the coalescing benefit seems to be worth it. Additionally, + * bitmap state is tracked as a whole. If we're able to fully trim a bitmap, + * the trimmed flag is set on the bitmap. Otherwise, if an allocation comes in, + * this resets the state and we will retry trimming the whole bitmap. This is a + * tradeoff between discard state accuracy and the cost of accounting. + */ + +/* This is an initial delay to give some chance for block reuse */ +#define BTRFS_DISCARD_DELAY (120ULL * NSEC_PER_SEC) +#define BTRFS_DISCARD_UNUSED_DELAY (10ULL * NSEC_PER_SEC) + +/* Target completion latency of discarding all discardable extents */ +#define BTRFS_DISCARD_TARGET_MSEC (6 * 60 * 60UL * MSEC_PER_SEC) +#define BTRFS_DISCARD_MIN_DELAY_MSEC (1UL) +#define BTRFS_DISCARD_MAX_DELAY_MSEC (1000UL) +#define BTRFS_DISCARD_MAX_IOPS (10U) + +/* Montonically decreasing minimum length filters after index 0 */ +static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = { + 0, + BTRFS_ASYNC_DISCARD_MAX_FILTER, + BTRFS_ASYNC_DISCARD_MIN_FILTER +}; + +static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group) +{ + return &discard_ctl->discard_list[block_group->discard_index]; +} + +static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group) +{ + if (!btrfs_run_discard_work(discard_ctl)) + return; + + if (list_empty(&block_group->discard_list) || + block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) { + if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) + block_group->discard_index = BTRFS_DISCARD_INDEX_START; + block_group->discard_eligible_time = (ktime_get_ns() + + BTRFS_DISCARD_DELAY); + block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR; + } + + list_move_tail(&block_group->discard_list, + get_discard_list(discard_ctl, block_group)); +} + +static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group) +{ + if (!btrfs_is_block_group_data_only(block_group)) + return; + + spin_lock(&discard_ctl->lock); + __add_to_discard_list(discard_ctl, block_group); + spin_unlock(&discard_ctl->lock); +} + +static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group) +{ + spin_lock(&discard_ctl->lock); + + if (!btrfs_run_discard_work(discard_ctl)) { + spin_unlock(&discard_ctl->lock); + return; + } + + list_del_init(&block_group->discard_list); + + block_group->discard_index = BTRFS_DISCARD_INDEX_UNUSED; + block_group->discard_eligible_time = (ktime_get_ns() + + BTRFS_DISCARD_UNUSED_DELAY); + block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR; + list_add_tail(&block_group->discard_list, + &discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]); + + spin_unlock(&discard_ctl->lock); +} + +static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group) +{ + bool running = false; + + spin_lock(&discard_ctl->lock); + + if (block_group == discard_ctl->block_group) { + running = true; + discard_ctl->block_group = NULL; + } + + block_group->discard_eligible_time = 0; + list_del_init(&block_group->discard_list); + + spin_unlock(&discard_ctl->lock); + + return running; +} + +/** + * find_next_block_group - find block_group that's up next for discarding + * @discard_ctl: discard control + * @now: current time + * + * Iterate over the discard lists to find the next block_group up for + * discarding checking the discard_eligible_time of block_group. + */ +static struct btrfs_block_group *find_next_block_group( + struct btrfs_discard_ctl *discard_ctl, + u64 now) +{ + struct btrfs_block_group *ret_block_group = NULL, *block_group; + int i; + + for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) { + struct list_head *discard_list = &discard_ctl->discard_list[i]; + + if (!list_empty(discard_list)) { + block_group = list_first_entry(discard_list, + struct btrfs_block_group, + discard_list); + + if (!ret_block_group) + ret_block_group = block_group; + + if (ret_block_group->discard_eligible_time < now) + break; + + if (ret_block_group->discard_eligible_time > + block_group->discard_eligible_time) + ret_block_group = block_group; + } + } + + return ret_block_group; +} + +/** + * peek_discard_list - wrap find_next_block_group() + * @discard_ctl: discard control + * @discard_state: the discard_state of the block_group after state management + * @discard_index: the discard_index of the block_group after state management + * + * This wraps find_next_block_group() and sets the block_group to be in use. + * discard_state's control flow is managed here. Variables related to + * discard_state are reset here as needed (eg discard_cursor). @discard_state + * and @discard_index are remembered as it may change while we're discarding, + * but we want the discard to execute in the context determined here. + */ +static struct btrfs_block_group *peek_discard_list( + struct btrfs_discard_ctl *discard_ctl, + enum btrfs_discard_state *discard_state, + int *discard_index) +{ + struct btrfs_block_group *block_group; + const u64 now = ktime_get_ns(); + + spin_lock(&discard_ctl->lock); +again: + block_group = find_next_block_group(discard_ctl, now); + + if (block_group && now > block_group->discard_eligible_time) { + if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED && + block_group->used != 0) { + if (btrfs_is_block_group_data_only(block_group)) + __add_to_discard_list(discard_ctl, block_group); + else + list_del_init(&block_group->discard_list); + goto again; + } + if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) { + block_group->discard_cursor = block_group->start; + block_group->discard_state = BTRFS_DISCARD_EXTENTS; + } + discard_ctl->block_group = block_group; + *discard_state = block_group->discard_state; + *discard_index = block_group->discard_index; + } else { + block_group = NULL; + } + + spin_unlock(&discard_ctl->lock); + + return block_group; +} + +/** + * btrfs_discard_check_filter - updates a block groups filters + * @block_group: block group of interest + * @bytes: recently freed region size after coalescing + * + * Async discard maintains multiple lists with progressively smaller filters + * to prioritize discarding based on size. Should a free space that matches + * a larger filter be returned to the free_space_cache, prioritize that discard + * by moving @block_group to the proper filter. + */ +void btrfs_discard_check_filter(struct btrfs_block_group *block_group, + u64 bytes) +{ + struct btrfs_discard_ctl *discard_ctl; + + if (!block_group || + !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) + return; + + discard_ctl = &block_group->fs_info->discard_ctl; + + if (block_group->discard_index > BTRFS_DISCARD_INDEX_START && + bytes >= discard_minlen[block_group->discard_index - 1]) { + int i; + + remove_from_discard_list(discard_ctl, block_group); + + for (i = BTRFS_DISCARD_INDEX_START; i < BTRFS_NR_DISCARD_LISTS; + i++) { + if (bytes >= discard_minlen[i]) { + block_group->discard_index = i; + add_to_discard_list(discard_ctl, block_group); + break; + } + } + } +} + +/** + * btrfs_update_discard_index - moves a block group along the discard lists + * @discard_ctl: discard control + * @block_group: block_group of interest + * + * Increment @block_group's discard_index. If it falls of the list, let it be. + * Otherwise add it back to the appropriate list. + */ +static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group) +{ + block_group->discard_index++; + if (block_group->discard_index == BTRFS_NR_DISCARD_LISTS) { + block_group->discard_index = 1; + return; + } + + add_to_discard_list(discard_ctl, block_group); +} + +/** + * btrfs_discard_cancel_work - remove a block_group from the discard lists + * @discard_ctl: discard control + * @block_group: block_group of interest + * + * This removes @block_group from the discard lists. If necessary, it waits on + * the current work and then reschedules the delayed work. + */ +void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group) +{ + if (remove_from_discard_list(discard_ctl, block_group)) { + cancel_delayed_work_sync(&discard_ctl->work); + btrfs_discard_schedule_work(discard_ctl, true); + } +} + +/** + * btrfs_discard_queue_work - handles queuing the block_groups + * @discard_ctl: discard control + * @block_group: block_group of interest + * + * This maintains the LRU order of the discard lists. + */ +void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group) +{ + if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) + return; + + if (block_group->used == 0) + add_to_discard_unused_list(discard_ctl, block_group); + else + add_to_discard_list(discard_ctl, block_group); + + if (!delayed_work_pending(&discard_ctl->work)) + btrfs_discard_schedule_work(discard_ctl, false); +} + +/** + * btrfs_discard_schedule_work - responsible for scheduling the discard work + * @discard_ctl: discard control + * @override: override the current timer + * + * Discards are issued by a delayed workqueue item. @override is used to + * update the current delay as the baseline delay interval is reevaluated on + * transaction commit. This is also maxed with any other rate limit. + */ +void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, + bool override) +{ + struct btrfs_block_group *block_group; + const u64 now = ktime_get_ns(); + + spin_lock(&discard_ctl->lock); + + if (!btrfs_run_discard_work(discard_ctl)) + goto out; + + if (!override && delayed_work_pending(&discard_ctl->work)) + goto out; + + block_group = find_next_block_group(discard_ctl, now); + if (block_group) { + unsigned long delay = discard_ctl->delay; + u32 kbps_limit = READ_ONCE(discard_ctl->kbps_limit); + + /* + * A single delayed workqueue item is responsible for + * discarding, so we can manage the bytes rate limit by keeping + * track of the previous discard. + */ + if (kbps_limit && discard_ctl->prev_discard) { + u64 bps_limit = ((u64)kbps_limit) * SZ_1K; + u64 bps_delay = div64_u64(discard_ctl->prev_discard * + MSEC_PER_SEC, bps_limit); + + delay = max(delay, msecs_to_jiffies(bps_delay)); + } + + /* + * This timeout is to hopefully prevent immediate discarding + * in a recently allocated block group. + */ + if (now < block_group->discard_eligible_time) { + u64 bg_timeout = block_group->discard_eligible_time - now; + + delay = max(delay, nsecs_to_jiffies(bg_timeout)); + } + + mod_delayed_work(discard_ctl->discard_workers, + &discard_ctl->work, delay); + } +out: + spin_unlock(&discard_ctl->lock); +} + +/** + * btrfs_finish_discard_pass - determine next step of a block_group + * @discard_ctl: discard control + * @block_group: block_group of interest + * + * This determines the next step for a block group after it's finished going + * through a pass on a discard list. If it is unused and fully trimmed, we can + * mark it unused and send it to the unused_bgs path. Otherwise, pass it onto + * the appropriate filter list or let it fall off. + */ +static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group) +{ + remove_from_discard_list(discard_ctl, block_group); + + if (block_group->used == 0) { + if (btrfs_is_free_space_trimmed(block_group)) + btrfs_mark_bg_unused(block_group); + else + add_to_discard_unused_list(discard_ctl, block_group); + } else { + btrfs_update_discard_index(discard_ctl, block_group); + } +} + +/** + * btrfs_discard_workfn - discard work function + * @work: work + * + * This finds the next block_group to start discarding and then discards a + * single region. It does this in a two-pass fashion: first extents and second + * bitmaps. Completely discarded block groups are sent to the unused_bgs path. + */ +static void btrfs_discard_workfn(struct work_struct *work) +{ + struct btrfs_discard_ctl *discard_ctl; + struct btrfs_block_group *block_group; + enum btrfs_discard_state discard_state; + int discard_index = 0; + u64 trimmed = 0; + u64 minlen = 0; + + discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work); + + block_group = peek_discard_list(discard_ctl, &discard_state, + &discard_index); + if (!block_group || !btrfs_run_discard_work(discard_ctl)) + return; + + /* Perform discarding */ + minlen = discard_minlen[discard_index]; + + if (discard_state == BTRFS_DISCARD_BITMAPS) { + u64 maxlen = 0; + + /* + * Use the previous levels minimum discard length as the max + * length filter. In the case something is added to make a + * region go beyond the max filter, the entire bitmap is set + * back to BTRFS_TRIM_STATE_UNTRIMMED. + */ + if (discard_index != BTRFS_DISCARD_INDEX_UNUSED) + maxlen = discard_minlen[discard_index - 1]; + + btrfs_trim_block_group_bitmaps(block_group, &trimmed, + block_group->discard_cursor, + btrfs_block_group_end(block_group), + minlen, maxlen, true); + discard_ctl->discard_bitmap_bytes += trimmed; + } else { + btrfs_trim_block_group_extents(block_group, &trimmed, + block_group->discard_cursor, + btrfs_block_group_end(block_group), + minlen, true); + discard_ctl->discard_extent_bytes += trimmed; + } + + discard_ctl->prev_discard = trimmed; + + /* Determine next steps for a block_group */ + if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) { + if (discard_state == BTRFS_DISCARD_BITMAPS) { + btrfs_finish_discard_pass(discard_ctl, block_group); + } else { + block_group->discard_cursor = block_group->start; + spin_lock(&discard_ctl->lock); + if (block_group->discard_state != + BTRFS_DISCARD_RESET_CURSOR) + block_group->discard_state = + BTRFS_DISCARD_BITMAPS; + spin_unlock(&discard_ctl->lock); + } + } + + spin_lock(&discard_ctl->lock); + discard_ctl->block_group = NULL; + spin_unlock(&discard_ctl->lock); + + btrfs_discard_schedule_work(discard_ctl, false); +} + +/** + * btrfs_run_discard_work - determines if async discard should be running + * @discard_ctl: discard control + * + * Checks if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set. + */ +bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl) +{ + struct btrfs_fs_info *fs_info = container_of(discard_ctl, + struct btrfs_fs_info, + discard_ctl); + + return (!(fs_info->sb->s_flags & SB_RDONLY) && + test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags)); +} + +/** + * btrfs_discard_calc_delay - recalculate the base delay + * @discard_ctl: discard control + * + * Recalculate the base delay which is based off the total number of + * discardable_extents. Clamp this between the lower_limit (iops_limit or 1ms) + * and the upper_limit (BTRFS_DISCARD_MAX_DELAY_MSEC). + */ +void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl) +{ + s32 discardable_extents; + s64 discardable_bytes; + u32 iops_limit; + unsigned long delay; + unsigned long lower_limit = BTRFS_DISCARD_MIN_DELAY_MSEC; + + discardable_extents = atomic_read(&discard_ctl->discardable_extents); + if (!discardable_extents) + return; + + spin_lock(&discard_ctl->lock); + + /* + * The following is to fix a potential -1 discrepenancy that we're not + * sure how to reproduce. But given that this is the only place that + * utilizes these numbers and this is only called by from + * btrfs_finish_extent_commit() which is synchronized, we can correct + * here. + */ + if (discardable_extents < 0) + atomic_add(-discardable_extents, + &discard_ctl->discardable_extents); + + discardable_bytes = atomic64_read(&discard_ctl->discardable_bytes); + if (discardable_bytes < 0) + atomic64_add(-discardable_bytes, + &discard_ctl->discardable_bytes); + + if (discardable_extents <= 0) { + spin_unlock(&discard_ctl->lock); + return; + } + + iops_limit = READ_ONCE(discard_ctl->iops_limit); + if (iops_limit) + lower_limit = max_t(unsigned long, lower_limit, + MSEC_PER_SEC / iops_limit); + + delay = BTRFS_DISCARD_TARGET_MSEC / discardable_extents; + delay = clamp(delay, lower_limit, BTRFS_DISCARD_MAX_DELAY_MSEC); + discard_ctl->delay = msecs_to_jiffies(delay); + + spin_unlock(&discard_ctl->lock); +} + +/** + * btrfs_discard_update_discardable - propagate discard counters + * @block_group: block_group of interest + * @ctl: free_space_ctl of @block_group + * + * This propagates deltas of counters up to the discard_ctl. It maintains a + * current counter and a previous counter passing the delta up to the global + * stat. Then the current counter value becomes the previous counter value. + */ +void btrfs_discard_update_discardable(struct btrfs_block_group *block_group, + struct btrfs_free_space_ctl *ctl) +{ + struct btrfs_discard_ctl *discard_ctl; + s32 extents_delta; + s64 bytes_delta; + + if (!block_group || + !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC) || + !btrfs_is_block_group_data_only(block_group)) + return; + + discard_ctl = &block_group->fs_info->discard_ctl; + + extents_delta = ctl->discardable_extents[BTRFS_STAT_CURR] - + ctl->discardable_extents[BTRFS_STAT_PREV]; + if (extents_delta) { + atomic_add(extents_delta, &discard_ctl->discardable_extents); + ctl->discardable_extents[BTRFS_STAT_PREV] = + ctl->discardable_extents[BTRFS_STAT_CURR]; + } + + bytes_delta = ctl->discardable_bytes[BTRFS_STAT_CURR] - + ctl->discardable_bytes[BTRFS_STAT_PREV]; + if (bytes_delta) { + atomic64_add(bytes_delta, &discard_ctl->discardable_bytes); + ctl->discardable_bytes[BTRFS_STAT_PREV] = + ctl->discardable_bytes[BTRFS_STAT_CURR]; + } +} + +/** + * btrfs_discard_punt_unused_bgs_list - punt unused_bgs list to discard lists + * @fs_info: fs_info of interest + * + * The unused_bgs list needs to be punted to the discard lists because the + * order of operations is changed. In the normal sychronous discard path, the + * block groups are trimmed via a single large trim in transaction commit. This + * is ultimately what we are trying to avoid with asynchronous discard. Thus, + * it must be done before going down the unused_bgs path. + */ +void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_group *block_group, *next; + + spin_lock(&fs_info->unused_bgs_lock); + /* We enabled async discard, so punt all to the queue */ + list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs, + bg_list) { + list_del_init(&block_group->bg_list); + btrfs_discard_queue_work(&fs_info->discard_ctl, block_group); + } + spin_unlock(&fs_info->unused_bgs_lock); +} + +/** + * btrfs_discard_purge_list - purge discard lists + * @discard_ctl: discard control + * + * If we are disabling async discard, we may have intercepted block groups that + * are completely free and ready for the unused_bgs path. As discarding will + * now happen in transaction commit or not at all, we can safely mark the + * corresponding block groups as unused and they will be sent on their merry + * way to the unused_bgs list. + */ +static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl) +{ + struct btrfs_block_group *block_group, *next; + int i; + + spin_lock(&discard_ctl->lock); + for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) { + list_for_each_entry_safe(block_group, next, + &discard_ctl->discard_list[i], + discard_list) { + list_del_init(&block_group->discard_list); + spin_unlock(&discard_ctl->lock); + if (block_group->used == 0) + btrfs_mark_bg_unused(block_group); + spin_lock(&discard_ctl->lock); + } + } + spin_unlock(&discard_ctl->lock); +} + +void btrfs_discard_resume(struct btrfs_fs_info *fs_info) +{ + if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) { + btrfs_discard_cleanup(fs_info); + return; + } + + btrfs_discard_punt_unused_bgs_list(fs_info); + + set_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags); +} + +void btrfs_discard_stop(struct btrfs_fs_info *fs_info) +{ + clear_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags); +} + +void btrfs_discard_init(struct btrfs_fs_info *fs_info) +{ + struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl; + int i; + + spin_lock_init(&discard_ctl->lock); + INIT_DELAYED_WORK(&discard_ctl->work, btrfs_discard_workfn); + + for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) + INIT_LIST_HEAD(&discard_ctl->discard_list[i]); + + discard_ctl->prev_discard = 0; + atomic_set(&discard_ctl->discardable_extents, 0); + atomic64_set(&discard_ctl->discardable_bytes, 0); + discard_ctl->max_discard_size = BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE; + discard_ctl->delay = BTRFS_DISCARD_MAX_DELAY_MSEC; + discard_ctl->iops_limit = BTRFS_DISCARD_MAX_IOPS; + discard_ctl->kbps_limit = 0; + discard_ctl->discard_extent_bytes = 0; + discard_ctl->discard_bitmap_bytes = 0; + atomic64_set(&discard_ctl->discard_bytes_saved, 0); +} + +void btrfs_discard_cleanup(struct btrfs_fs_info *fs_info) +{ + btrfs_discard_stop(fs_info); + cancel_delayed_work_sync(&fs_info->discard_ctl.work); + btrfs_discard_purge_list(&fs_info->discard_ctl); +} diff --git a/fs/btrfs/discard.h b/fs/btrfs/discard.h new file mode 100644 index 000000000000..21a15776dac4 --- /dev/null +++ b/fs/btrfs/discard.h @@ -0,0 +1,41 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifndef BTRFS_DISCARD_H +#define BTRFS_DISCARD_H + +#include <linux/sizes.h> + +struct btrfs_fs_info; +struct btrfs_discard_ctl; +struct btrfs_block_group; + +/* Discard size limits */ +#define BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE (SZ_64M) +#define BTRFS_ASYNC_DISCARD_MAX_FILTER (SZ_1M) +#define BTRFS_ASYNC_DISCARD_MIN_FILTER (SZ_32K) + +/* List operations */ +void btrfs_discard_check_filter(struct btrfs_block_group *block_group, u64 bytes); + +/* Work operations */ +void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group); +void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl, + struct btrfs_block_group *block_group); +void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl, + bool override); +bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl); + +/* Update operations */ +void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl); +void btrfs_discard_update_discardable(struct btrfs_block_group *block_group, + struct btrfs_free_space_ctl *ctl); + +/* Setup/cleanup operations */ +void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info); +void btrfs_discard_resume(struct btrfs_fs_info *fs_info); +void btrfs_discard_stop(struct btrfs_fs_info *fs_info); +void btrfs_discard_init(struct btrfs_fs_info *fs_info); +void btrfs_discard_cleanup(struct btrfs_fs_info *fs_info); + +#endif diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e0edfdc9c82b..7fa9bb79ad08 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -41,6 +41,7 @@ #include "tree-checker.h" #include "ref-verify.h" #include "block-group.h" +#include "discard.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ @@ -202,8 +203,8 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, * that covers the entire device */ struct extent_map *btree_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, u64 start, u64 len, - int create) + struct page *page, size_t pg_offset, + u64 start, u64 len) { struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; @@ -1953,6 +1954,8 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->readahead_workers); btrfs_destroy_workqueue(fs_info->flush_workers); btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers); + if (fs_info->discard_ctl.discard_workers) + destroy_workqueue(fs_info->discard_ctl.discard_workers); /* * Now that all other work queues are destroyed, we can safely destroy * the queues used for metadata I/O, since tasks from those other work @@ -2148,6 +2151,8 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, max_active, 2); fs_info->qgroup_rescan_workers = btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0); + fs_info->discard_ctl.discard_workers = + alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1); if (!(fs_info->workers && fs_info->delalloc_workers && fs_info->flush_workers && @@ -2158,7 +2163,8 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info, fs_info->endio_freespace_worker && fs_info->rmw_workers && fs_info->caching_workers && fs_info->readahead_workers && fs_info->fixup_workers && fs_info->delayed_workers && - fs_info->qgroup_rescan_workers)) { + fs_info->qgroup_rescan_workers && + fs_info->discard_ctl.discard_workers)) { return -ENOMEM; } @@ -2691,7 +2697,6 @@ int __cold open_ctree(struct super_block *sb, spin_lock_init(&fs_info->fs_roots_radix_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); - spin_lock_init(&fs_info->tree_mod_seq_lock); spin_lock_init(&fs_info->super_lock); spin_lock_init(&fs_info->buffer_lock); spin_lock_init(&fs_info->unused_bgs_lock); @@ -2792,6 +2797,7 @@ int __cold open_ctree(struct super_block *sb, btrfs_init_dev_replace_locks(fs_info); btrfs_init_qgroup(fs_info); + btrfs_discard_init(fs_info); btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); btrfs_init_free_cluster(&fs_info->data_alloc_cluster); @@ -3082,20 +3088,13 @@ int __cold open_ctree(struct super_block *sb, btrfs_free_extra_devids(fs_devices, 1); - ret = btrfs_sysfs_add_fsid(fs_devices, NULL); + ret = btrfs_sysfs_add_fsid(fs_devices); if (ret) { btrfs_err(fs_info, "failed to init sysfs fsid interface: %d", ret); goto fail_block_groups; } - ret = btrfs_sysfs_add_device(fs_devices); - if (ret) { - btrfs_err(fs_info, "failed to init sysfs device interface: %d", - ret); - goto fail_fsdev_sysfs; - } - ret = btrfs_sysfs_add_mounted(fs_info); if (ret) { btrfs_err(fs_info, "failed to init sysfs interface: %d", ret); @@ -3262,6 +3261,7 @@ int __cold open_ctree(struct super_block *sb, } btrfs_qgroup_rescan_resume(fs_info); + btrfs_discard_resume(fs_info); if (!fs_info->uuid_root) { btrfs_info(fs_info, "creating UUID tree"); @@ -3978,6 +3978,9 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) cancel_work_sync(&fs_info->async_reclaim_work); + /* Cancel or finish ongoing discard work */ + btrfs_discard_cleanup(fs_info); + if (!sb_rdonly(fs_info->sb)) { /* * The cleaner kthread is stopped, so do one final pass over @@ -4026,11 +4029,18 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) invalidate_inode_pages2(fs_info->btree_inode->i_mapping); btrfs_stop_all_workers(fs_info); - btrfs_free_block_groups(fs_info); - clear_bit(BTRFS_FS_OPEN, &fs_info->flags); free_root_pointers(fs_info, true); + /* + * We must free the block groups after dropping the fs_roots as we could + * have had an IO error and have left over tree log blocks that aren't + * cleaned up until the fs roots are freed. This makes the block group + * accounting appear to be wrong because there's pending reserved bytes, + * so make sure we do the block group cleanup afterwards. + */ + btrfs_free_block_groups(fs_info); + iput(fs_info->btree_inode); #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 76f123ebb292..8c2d6cf1ce59 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -134,8 +134,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, int btree_lock_page_hook(struct page *page, void *data, void (*flush_fn)(void *)); struct extent_map *btree_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, u64 start, u64 len, - int create); + struct page *page, size_t pg_offset, + u64 start, u64 len); int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); int __init btrfs_end_io_wq_init(void); void __cold btrfs_end_io_wq_exit(void); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 274318e9114e..0163fdd59f8f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -32,6 +32,7 @@ #include "block-rsv.h" #include "delalloc-space.h" #include "block-group.h" +#include "discard.h" #undef SCRAMBLE_DELAYED_REFS @@ -2923,7 +2924,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) break; } - if (btrfs_test_opt(fs_info, DISCARD)) + if (btrfs_test_opt(fs_info, DISCARD_SYNC)) ret = btrfs_discard_extent(fs_info, start, end + 1 - start, NULL); @@ -2934,6 +2935,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) cond_resched(); } + if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) { + btrfs_discard_calc_delay(&fs_info->discard_ctl); + btrfs_discard_schedule_work(&fs_info->discard_ctl, true); + } + /* * Transaction is finished. We don't need the lock anymore. We * do need to clean up the block groups in case of a transaction @@ -3438,7 +3444,6 @@ btrfs_release_block_group(struct btrfs_block_group *cache, */ struct find_free_extent_ctl { /* Basic allocation info */ - u64 ram_bytes; u64 num_bytes; u64 empty_size; u64 flags; @@ -3810,7 +3815,6 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, WARN_ON(num_bytes < fs_info->sectorsize); - ffe_ctl.ram_bytes = ram_bytes; ffe_ctl.num_bytes = num_bytes; ffe_ctl.empty_size = empty_size; ffe_ctl.flags = flags; @@ -4165,12 +4169,10 @@ again: return ret; } -static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 len, - int pin, int delalloc) +int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, + u64 start, u64 len, int delalloc) { struct btrfs_block_group *cache; - int ret = 0; cache = btrfs_lookup_block_group(fs_info, start); if (!cache) { @@ -4179,30 +4181,28 @@ static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, return -ENOSPC; } - if (pin) - pin_down_extent(cache, start, len, 1); - else { - if (btrfs_test_opt(fs_info, DISCARD)) - ret = btrfs_discard_extent(fs_info, start, len, NULL); - btrfs_add_free_space(cache, start, len); - btrfs_free_reserved_bytes(cache, len, delalloc); - trace_btrfs_reserved_extent_free(fs_info, start, len); - } + btrfs_add_free_space(cache, start, len); + btrfs_free_reserved_bytes(cache, len, delalloc); + trace_btrfs_reserved_extent_free(fs_info, start, len); btrfs_put_block_group(cache); - return ret; + return 0; } -int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 len, int delalloc) +int btrfs_pin_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) { - return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc); -} + struct btrfs_block_group *cache; + int ret = 0; -int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 len) -{ - return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0); + cache = btrfs_lookup_block_group(fs_info, start); + if (!cache) { + btrfs_err(fs_info, "unable to find block group for %llu", start); + return -ENOSPC; + } + + ret = pin_down_extent(cache, start, len, 1); + btrfs_put_block_group(cache); + return ret; } static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 2f4802f405a2..c0f202741e09 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1593,21 +1593,25 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, /* Find first extent with bits cleared */ while (1) { node = __etree_search(tree, start, &next, &prev, NULL, NULL); - if (!node) { + if (!node && !next && !prev) { + /* + * Tree is completely empty, send full range and let + * caller deal with it + */ + *start_ret = 0; + *end_ret = -1; + goto out; + } else if (!node && !next) { + /* + * We are past the last allocated chunk, set start at + * the end of the last extent. + */ + state = rb_entry(prev, struct extent_state, rb_node); + *start_ret = state->end + 1; + *end_ret = -1; + goto out; + } else if (!node) { node = next; - if (!node) { - /* - * We are past the last allocated chunk, - * set start at the end of the last extent. The - * device alloc tree should never be empty so - * prev is always set. - */ - ASSERT(prev); - state = rb_entry(prev, struct extent_state, rb_node); - *start_ret = state->end + 1; - *end_ret = -1; - goto out; - } } /* * At this point 'node' either contains 'start' or start is @@ -3043,7 +3047,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, *em_cached = NULL; } - em = get_extent(BTRFS_I(inode), page, pg_offset, start, len, 0); + em = get_extent(BTRFS_I(inode), page, pg_offset, start, len); if (em_cached && !IS_ERR_OR_NULL(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); @@ -3438,11 +3442,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, ret = btrfs_writepage_cow_fixup(page, start, page_end); if (ret) { /* Fixup worker will requeue */ - if (ret == -EBUSY) - wbc->pages_skipped++; - else - redirty_page_for_writepage(wbc, page); - + redirty_page_for_writepage(wbc, page); update_nr_written(wbc, nr_written); unlock_page(page); return 1; @@ -3455,11 +3455,6 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, update_nr_written(wbc, nr_written + 1); end = page_end; - if (i_size <= start) { - btrfs_writepage_endio_finish_ordered(page, start, page_end, 1); - goto done; - } - blocksize = inode->i_sb->s_blocksize; while (cur <= end) { @@ -3471,8 +3466,8 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, page_end, 1); break; } - em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, cur, - end - cur + 1, 1); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur, + end - cur + 1); if (IS_ERR_OR_NULL(em)) { SetPageError(page); ret = PTR_ERR_OR_ZERO(em); @@ -3497,22 +3492,11 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, */ if (compressed || block_start == EXTENT_MAP_HOLE || block_start == EXTENT_MAP_INLINE) { - /* - * end_io notification does not happen here for - * compressed extents - */ - if (!compressed) - btrfs_writepage_endio_finish_ordered(page, cur, - cur + iosize - 1, - 1); - else if (compressed) { - /* we don't want to end_page_writeback on - * a compressed extent. this happens - * elsewhere - */ + if (compressed) nr++; - } - + else + btrfs_writepage_endio_finish_ordered(page, cur, + cur + iosize - 1, 1); cur += iosize; pg_offset += iosize; continue; @@ -3540,7 +3524,6 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode, pg_offset += iosize; nr++; } -done: *nr_ret = nr; return ret; } @@ -3562,7 +3545,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, u64 page_end = start + PAGE_SIZE - 1; int ret; int nr = 0; - size_t pg_offset = 0; + size_t pg_offset; loff_t i_size = i_size_read(inode); unsigned long end_index = i_size >> PAGE_SHIFT; unsigned long nr_written = 0; @@ -3591,14 +3574,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, flush_dcache_page(page); } - pg_offset = 0; - set_page_extent_mapped(page); if (!epd->extent_locked) { ret = writepage_delalloc(inode, page, wbc, start, &nr_written); if (ret == 1) - goto done_unlocked; + return 0; if (ret) goto done; } @@ -3606,7 +3587,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, ret = __extent_writepage_io(inode, page, wbc, epd, i_size, nr_written, &nr); if (ret == 1) - goto done_unlocked; + return 0; done: if (nr == 0) { @@ -3621,9 +3602,6 @@ done: unlock_page(page); ASSERT(ret <= 0); return ret; - -done_unlocked: - return 0; } void wait_on_extent_buffer_writeback(struct extent_buffer *eb) @@ -3941,6 +3919,11 @@ int btree_write_cache_pages(struct address_space *mapping, if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ end = -1; + /* + * Start from the beginning does not need to cycle over the + * range, mark it as scanned. + */ + scanned = (index == 0); } else { index = wbc->range_start >> PAGE_SHIFT; end = wbc->range_end >> PAGE_SHIFT; @@ -3958,7 +3941,6 @@ retry: tag))) { unsigned i; - scanned = 1; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -4087,6 +4069,11 @@ static int extent_write_cache_pages(struct address_space *mapping, if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ end = -1; + /* + * Start from the beginning does not need to cycle over the + * range, mark it as scanned. + */ + scanned = (index == 0); } else { index = wbc->range_start >> PAGE_SHIFT; end = wbc->range_end >> PAGE_SHIFT; @@ -4120,7 +4107,6 @@ retry: &index, end, tag))) { unsigned i; - scanned = 1; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -4180,7 +4166,16 @@ retry: */ scanned = 1; index = 0; - goto retry; + + /* + * If we're looping we could run into a page that is locked by a + * writer and that writer could be waiting on writeback for a + * page in our current bio, and thus deadlock, so flush the + * write bio here. + */ + ret = flush_write_bio(epd); + if (!ret) + goto retry; } if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index a8551a1f56e2..5d205bbaafdc 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -183,10 +183,8 @@ static inline int extent_compress_type(unsigned long bio_flags) struct extent_map_tree; typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode, - struct page *page, - size_t pg_offset, - u64 start, u64 len, - int create); + struct page *page, size_t pg_offset, + u64 start, u64 len); int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index b1bfdc5c1387..c2f365662d55 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -148,8 +148,19 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, return ret; } -static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, - u64 logical_offset, u8 *dst, int dio) +/** + * btrfs_lookup_bio_sums - Look up checksums for a bio. + * @inode: inode that the bio is for. + * @bio: bio embedded in btrfs_io_bio. + * @offset: Unless (u64)-1, look up checksums for this offset in the file. + * If (u64)-1, use the page offsets from the bio instead. + * @dst: Buffer of size btrfs_super_csum_size() used to return checksum. If + * NULL, the checksum is returned in btrfs_io_bio(bio)->csum instead. + * + * Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise. + */ +blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, + u64 offset, u8 *dst) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct bio_vec bvec; @@ -158,8 +169,8 @@ static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio struct btrfs_csum_item *item = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_path *path; + const bool page_offsets = (offset == (u64)-1); u8 *csum; - u64 offset = 0; u64 item_start_offset = 0; u64 item_last_offset = 0; u64 disk_bytenr; @@ -205,15 +216,13 @@ static blk_status_t __btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio } disk_bytenr = (u64)bio->bi_iter.bi_sector << 9; - if (dio) - offset = logical_offset; bio_for_each_segment(bvec, bio, iter) { page_bytes_left = bvec.bv_len; if (count) goto next; - if (!dio) + if (page_offsets) offset = page_offset(bvec.bv_page) + bvec.bv_offset; count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, csum, nblocks); @@ -274,7 +283,8 @@ found: csum += count * csum_size; nblocks -= count; next: - while (count--) { + while (count > 0) { + count--; disk_bytenr += fs_info->sectorsize; offset += fs_info->sectorsize; page_bytes_left -= fs_info->sectorsize; @@ -285,18 +295,7 @@ next: WARN_ON_ONCE(count); btrfs_free_path(path); - return 0; -} - -blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, - u8 *dst) -{ - return __btrfs_lookup_bio_sums(inode, bio, 0, dst, 0); -} - -blk_status_t btrfs_lookup_bio_sums_dio(struct inode *inode, struct bio *bio, u64 offset) -{ - return __btrfs_lookup_bio_sums(inode, bio, offset, NULL, 1); + return BLK_STS_OK; } int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, @@ -483,8 +482,8 @@ blk_status_t btrfs_csum_one_bio(struct inode *inode, struct bio *bio, - 1); for (i = 0; i < nr_sectors; i++) { - if (offset >= ordered->file_offset + ordered->len || - offset < ordered->file_offset) { + if (offset >= ordered->file_offset + ordered->num_bytes || + offset < ordered->file_offset) { unsigned long bytes_left; sums->len = this_sum_bytes; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 8d47c76b7bd1..a16da274c9aa 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -477,8 +477,7 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, u64 em_len; int ret = 0; - em = btrfs_get_extent(inode, NULL, 0, search_start, - search_len, 0); + em = btrfs_get_extent(inode, NULL, 0, search_start, search_len); if (IS_ERR(em)) return PTR_ERR(em); @@ -1501,7 +1500,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, ordered = btrfs_lookup_ordered_range(inode, start_pos, last_pos - start_pos + 1); if (ordered && - ordered->file_offset + ordered->len > start_pos && + ordered->file_offset + ordered->num_bytes > start_pos && ordered->file_offset <= last_pos) { unlock_extent_cached(&inode->io_tree, start_pos, last_pos, cached_state); @@ -2390,7 +2389,7 @@ static int find_first_non_hole(struct inode *inode, u64 *start, u64 *len) em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, round_down(*start, fs_info->sectorsize), - round_up(*len, fs_info->sectorsize), 0); + round_up(*len, fs_info->sectorsize)); if (IS_ERR(em)) return PTR_ERR(em); @@ -2426,7 +2425,7 @@ static int btrfs_punch_hole_lock_range(struct inode *inode, * we need to try again. */ if ((!ordered || - (ordered->file_offset + ordered->len <= lockstart || + (ordered->file_offset + ordered->num_bytes <= lockstart || ordered->file_offset > lockend)) && !filemap_range_has_page(inode->i_mapping, lockstart, lockend)) { @@ -2957,7 +2956,7 @@ static int btrfs_zero_range_check_range_boundary(struct inode *inode, int ret; offset = round_down(offset, sectorsize); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) return PTR_ERR(em); @@ -2990,8 +2989,8 @@ static int btrfs_zero_range(struct inode *inode, inode_dio_wait(inode); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, - alloc_start, alloc_end - alloc_start, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start, + alloc_end - alloc_start); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; @@ -3034,8 +3033,8 @@ static int btrfs_zero_range(struct inode *inode, if (BTRFS_BYTES_TO_BLKS(fs_info, offset) == BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) { - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, - alloc_start, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start, + sectorsize); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; @@ -3248,7 +3247,7 @@ static long btrfs_fallocate(struct file *file, int mode, ordered = btrfs_lookup_first_ordered_extent(inode, locked_end); if (ordered && - ordered->file_offset + ordered->len > alloc_start && + ordered->file_offset + ordered->num_bytes > alloc_start && ordered->file_offset < alloc_end) { btrfs_put_ordered_extent(ordered); unlock_extent_cached(&BTRFS_I(inode)->io_tree, @@ -3273,7 +3272,7 @@ static long btrfs_fallocate(struct file *file, int mode, INIT_LIST_HEAD(&reserve_list); while (cur_offset < alloc_end) { em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, - alloc_end - cur_offset, 0); + alloc_end - cur_offset); if (IS_ERR(em)) { ret = PTR_ERR(em); break; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 3283da419200..0598fd3c6e3f 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -21,9 +21,11 @@ #include "space-info.h" #include "delalloc-space.h" #include "block-group.h" +#include "discard.h" #define BITS_PER_BITMAP (PAGE_SIZE * 8UL) -#define MAX_CACHE_BYTES_PER_GIG SZ_32K +#define MAX_CACHE_BYTES_PER_GIG SZ_64K +#define FORCE_EXTENT_THRESHOLD SZ_1M struct btrfs_trim_range { u64 start; @@ -31,6 +33,8 @@ struct btrfs_trim_range { struct list_head list; }; +static int count_bitmap_extents(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *bitmap_info); static int link_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info); static void unlink_free_space(struct btrfs_free_space_ctl *ctl, @@ -752,6 +756,16 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, goto free_cache; } + /* + * Sync discard ensures that the free space cache is always + * trimmed. So when reading this in, the state should reflect + * that. We also do this for async as a stop gap for lack of + * persistence. + */ + if (btrfs_test_opt(fs_info, DISCARD_SYNC) || + btrfs_test_opt(fs_info, DISCARD_ASYNC)) + e->trim_state = BTRFS_TRIM_STATE_TRIMMED; + if (!e->bytes) { kmem_cache_free(btrfs_free_space_cachep, e); goto free_cache; @@ -805,12 +819,19 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, ret = io_ctl_read_bitmap(&io_ctl, e); if (ret) goto free_cache; + e->bitmap_extents = count_bitmap_extents(ctl, e); + if (!btrfs_free_space_trimmed(e)) { + ctl->discardable_extents[BTRFS_STAT_CURR] += + e->bitmap_extents; + ctl->discardable_bytes[BTRFS_STAT_CURR] += e->bytes; + } } io_ctl_drop_pages(&io_ctl); merge_space_tree(ctl); ret = 1; out: + btrfs_discard_update_discardable(ctl->private, ctl); io_ctl_free(&io_ctl); return ret; free_cache: @@ -1624,6 +1645,11 @@ __unlink_free_space(struct btrfs_free_space_ctl *ctl, { rb_erase(&info->offset_index, &ctl->free_space_offset); ctl->free_extents--; + + if (!info->bitmap && !btrfs_free_space_trimmed(info)) { + ctl->discardable_extents[BTRFS_STAT_CURR]--; + ctl->discardable_bytes[BTRFS_STAT_CURR] -= info->bytes; + } } static void unlink_free_space(struct btrfs_free_space_ctl *ctl, @@ -1644,6 +1670,11 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl, if (ret) return ret; + if (!info->bitmap && !btrfs_free_space_trimmed(info)) { + ctl->discardable_extents[BTRFS_STAT_CURR]++; + ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; + } + ctl->free_space += info->bytes; ctl->free_extents++; return ret; @@ -1664,26 +1695,17 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) ASSERT(ctl->total_bitmaps <= max_bitmaps); /* - * The goal is to keep the total amount of memory used per 1gb of space - * at or below 32k, so we need to adjust how much memory we allow to be - * used by extent based free space tracking + * We are trying to keep the total amount of memory used per 1GiB of + * space to be MAX_CACHE_BYTES_PER_GIG. However, with a reclamation + * mechanism of pulling extents >= FORCE_EXTENT_THRESHOLD out of + * bitmaps, we may end up using more memory than this. */ if (size < SZ_1G) max_bytes = MAX_CACHE_BYTES_PER_GIG; else max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G); - /* - * we want to account for 1 more bitmap than what we have so we can make - * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as - * we add more bitmaps. - */ - bitmap_bytes = (ctl->total_bitmaps + 1) * ctl->unit; - - if (bitmap_bytes >= max_bytes) { - ctl->extents_thresh = 0; - return; - } + bitmap_bytes = ctl->total_bitmaps * ctl->unit; /* * we want the extent entry threshold to always be at most 1/2 the max @@ -1700,17 +1722,31 @@ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset, u64 bytes) { - unsigned long start, count; + unsigned long start, count, end; + int extent_delta = -1; start = offset_to_bit(info->offset, ctl->unit, offset); count = bytes_to_bits(bytes, ctl->unit); - ASSERT(start + count <= BITS_PER_BITMAP); + end = start + count; + ASSERT(end <= BITS_PER_BITMAP); bitmap_clear(info->bitmap, start, count); info->bytes -= bytes; if (info->max_extent_size > ctl->unit) info->max_extent_size = 0; + + if (start && test_bit(start - 1, info->bitmap)) + extent_delta++; + + if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap)) + extent_delta++; + + info->bitmap_extents += extent_delta; + if (!btrfs_free_space_trimmed(info)) { + ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta; + ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; + } } static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl, @@ -1725,16 +1761,30 @@ static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset, u64 bytes) { - unsigned long start, count; + unsigned long start, count, end; + int extent_delta = 1; start = offset_to_bit(info->offset, ctl->unit, offset); count = bytes_to_bits(bytes, ctl->unit); - ASSERT(start + count <= BITS_PER_BITMAP); + end = start + count; + ASSERT(end <= BITS_PER_BITMAP); bitmap_set(info->bitmap, start, count); info->bytes += bytes; ctl->free_space += bytes; + + if (start && test_bit(start - 1, info->bitmap)) + extent_delta--; + + if (end < BITS_PER_BITMAP && test_bit(end, info->bitmap)) + extent_delta--; + + info->bitmap_extents += extent_delta; + if (!btrfs_free_space_trimmed(info)) { + ctl->discardable_extents[BTRFS_STAT_CURR] += extent_delta; + ctl->discardable_bytes[BTRFS_STAT_CURR] += bytes; + } } /* @@ -1870,11 +1920,35 @@ out: return NULL; } +static int count_bitmap_extents(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *bitmap_info) +{ + struct btrfs_block_group *block_group = ctl->private; + u64 bytes = bitmap_info->bytes; + unsigned int rs, re; + int count = 0; + + if (!block_group || !bytes) + return count; + + bitmap_for_each_set_region(bitmap_info->bitmap, rs, re, 0, + BITS_PER_BITMAP) { + bytes -= (rs - re) * ctl->unit; + count++; + + if (!bytes) + break; + } + + return count; +} + static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset) { info->offset = offset_to_bitmap(ctl, offset); info->bytes = 0; + info->bitmap_extents = 0; INIT_LIST_HEAD(&info->list); link_free_space(ctl, info); ctl->total_bitmaps++; @@ -1885,6 +1959,18 @@ static void add_new_bitmap(struct btrfs_free_space_ctl *ctl, static void free_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *bitmap_info) { + /* + * Normally when this is called, the bitmap is completely empty. However, + * if we are blowing up the free space cache for one reason or another + * via __btrfs_remove_free_space_cache(), then it may not be freed and + * we may leave stats on the table. + */ + if (bitmap_info->bytes && !btrfs_free_space_trimmed(bitmap_info)) { + ctl->discardable_extents[BTRFS_STAT_CURR] -= + bitmap_info->bitmap_extents; + ctl->discardable_bytes[BTRFS_STAT_CURR] -= bitmap_info->bytes; + + } unlink_free_space(ctl, bitmap_info); kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap); kmem_cache_free(btrfs_free_space_cachep, bitmap_info); @@ -1971,11 +2057,24 @@ again: static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, u64 offset, - u64 bytes) + u64 bytes, enum btrfs_trim_state trim_state) { u64 bytes_to_set = 0; u64 end; + /* + * This is a tradeoff to make bitmap trim state minimal. We mark the + * whole bitmap untrimmed if at any point we add untrimmed regions. + */ + if (trim_state == BTRFS_TRIM_STATE_UNTRIMMED) { + if (btrfs_free_space_trimmed(info)) { + ctl->discardable_extents[BTRFS_STAT_CURR] += + info->bitmap_extents; + ctl->discardable_bytes[BTRFS_STAT_CURR] += info->bytes; + } + info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + } + end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit); bytes_to_set = min(end - offset, bytes); @@ -2004,6 +2103,10 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, forced = true; #endif + /* This is a way to reclaim large regions from the bitmaps. */ + if (!forced && info->bytes >= FORCE_EXTENT_THRESHOLD) + return false; + /* * If we are below the extents threshold then we can add this as an * extent, and don't have to deal with the bitmap @@ -2016,8 +2119,8 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, * of cache left then go ahead an dadd them, no sense in adding * the overhead of a bitmap if we don't have to. */ - if (info->bytes <= fs_info->sectorsize * 4) { - if (ctl->free_extents * 2 <= ctl->extents_thresh) + if (info->bytes <= fs_info->sectorsize * 8) { + if (ctl->free_extents * 3 <= ctl->extents_thresh) return false; } else { return false; @@ -2050,10 +2153,12 @@ static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl, struct btrfs_block_group *block_group = NULL; int added = 0; u64 bytes, offset, bytes_added; + enum btrfs_trim_state trim_state; int ret; bytes = info->bytes; offset = info->offset; + trim_state = info->trim_state; if (!ctl->op->use_bitmap(ctl, info)) return 0; @@ -2088,8 +2193,8 @@ again: } if (entry->offset == offset_to_bitmap(ctl, offset)) { - bytes_added = add_bytes_to_bitmap(ctl, entry, - offset, bytes); + bytes_added = add_bytes_to_bitmap(ctl, entry, offset, + bytes, trim_state); bytes -= bytes_added; offset += bytes_added; } @@ -2108,7 +2213,8 @@ no_cluster_bitmap: goto new_bitmap; } - bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); + bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes, + trim_state); bytes -= bytes_added; offset += bytes_added; added = 0; @@ -2142,6 +2248,7 @@ new_bitmap: /* allocate the bitmap */ info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS); + info->trim_state = BTRFS_TRIM_STATE_TRIMMED; spin_lock(&ctl->tree_lock); if (!info->bitmap) { ret = -ENOMEM; @@ -2161,6 +2268,22 @@ out: return ret; } +/* + * Free space merging rules: + * 1) Merge trimmed areas together + * 2) Let untrimmed areas coalesce with trimmed areas + * 3) Always pull neighboring regions from bitmaps + * + * The above rules are for when we merge free space based on btrfs_trim_state. + * Rules 2 and 3 are subtle because they are suboptimal, but are done for the + * same reason: to promote larger extent regions which makes life easier for + * find_free_extent(). Rule 2 enables coalescing based on the common path + * being returning free space from btrfs_finish_extent_commit(). So when free + * space is trimmed, it will prevent aggregating trimmed new region and + * untrimmed regions in the rb_tree. Rule 3 is purely to obtain larger extents + * and provide find_free_extent() with the largest extents possible hoping for + * the reuse path. + */ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, struct btrfs_free_space *info, bool update_stat) { @@ -2169,6 +2292,7 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, bool merged = false; u64 offset = info->offset; u64 bytes = info->bytes; + const bool is_trimmed = btrfs_free_space_trimmed(info); /* * first we want to see if there is free space adjacent to the range we @@ -2182,7 +2306,9 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, else left_info = tree_search_offset(ctl, offset - 1, 0, 0); - if (right_info && !right_info->bitmap) { + /* See try_merge_free_space() comment. */ + if (right_info && !right_info->bitmap && + (!is_trimmed || btrfs_free_space_trimmed(right_info))) { if (update_stat) unlink_free_space(ctl, right_info); else @@ -2192,8 +2318,10 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, merged = true; } + /* See try_merge_free_space() comment. */ if (left_info && !left_info->bitmap && - left_info->offset + left_info->bytes == offset) { + left_info->offset + left_info->bytes == offset && + (!is_trimmed || btrfs_free_space_trimmed(left_info))) { if (update_stat) unlink_free_space(ctl, left_info); else @@ -2229,6 +2357,10 @@ static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl, bytes = (j - i) * ctl->unit; info->bytes += bytes; + /* See try_merge_free_space() comment. */ + if (!btrfs_free_space_trimmed(bitmap)) + info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + if (update_stat) bitmap_clear_bits(ctl, bitmap, end, bytes); else @@ -2282,6 +2414,10 @@ static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl, info->offset -= bytes; info->bytes += bytes; + /* See try_merge_free_space() comment. */ + if (!btrfs_free_space_trimmed(bitmap)) + info->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + if (update_stat) bitmap_clear_bits(ctl, bitmap, info->offset, bytes); else @@ -2331,10 +2467,13 @@ static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl, int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, struct btrfs_free_space_ctl *ctl, - u64 offset, u64 bytes) + u64 offset, u64 bytes, + enum btrfs_trim_state trim_state) { + struct btrfs_block_group *block_group = ctl->private; struct btrfs_free_space *info; int ret = 0; + u64 filter_bytes = bytes; info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS); if (!info) @@ -2342,6 +2481,7 @@ int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, info->offset = offset; info->bytes = bytes; + info->trim_state = trim_state; RB_CLEAR_NODE(&info->offset_index); spin_lock(&ctl->tree_lock); @@ -2370,10 +2510,13 @@ link: */ steal_from_bitmap(ctl, info, true); + filter_bytes = max(filter_bytes, info->bytes); + ret = link_free_space(ctl, info); if (ret) kmem_cache_free(btrfs_free_space_cachep, info); out: + btrfs_discard_update_discardable(block_group, ctl); spin_unlock(&ctl->tree_lock); if (ret) { @@ -2381,15 +2524,44 @@ out: ASSERT(ret != -EEXIST); } + if (trim_state != BTRFS_TRIM_STATE_TRIMMED) { + btrfs_discard_check_filter(block_group, filter_bytes); + btrfs_discard_queue_work(&fs_info->discard_ctl, block_group); + } + return ret; } int btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr, u64 size) { + enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + + if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC)) + trim_state = BTRFS_TRIM_STATE_TRIMMED; + + return __btrfs_add_free_space(block_group->fs_info, + block_group->free_space_ctl, + bytenr, size, trim_state); +} + +/* + * This is a subtle distinction because when adding free space back in general, + * we want it to be added as untrimmed for async. But in the case where we add + * it on loading of a block group, we want to consider it trimmed. + */ +int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, + u64 bytenr, u64 size) +{ + enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + + if (btrfs_test_opt(block_group->fs_info, DISCARD_SYNC) || + btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC)) + trim_state = BTRFS_TRIM_STATE_TRIMMED; + return __btrfs_add_free_space(block_group->fs_info, block_group->free_space_ctl, - bytenr, size); + bytenr, size, trim_state); } int btrfs_remove_free_space(struct btrfs_block_group *block_group, @@ -2464,8 +2636,10 @@ again: } spin_unlock(&ctl->tree_lock); - ret = btrfs_add_free_space(block_group, offset + bytes, - old_end - (offset + bytes)); + ret = __btrfs_add_free_space(block_group->fs_info, ctl, + offset + bytes, + old_end - (offset + bytes), + info->trim_state); WARN_ON(ret); goto out; } @@ -2477,6 +2651,7 @@ again: goto again; } out_lock: + btrfs_discard_update_discardable(block_group, ctl); spin_unlock(&ctl->tree_lock); out: return ret; @@ -2562,8 +2737,22 @@ __btrfs_return_cluster_to_free_space( bitmap = (entry->bitmap != NULL); if (!bitmap) { + /* Merging treats extents as if they were new */ + if (!btrfs_free_space_trimmed(entry)) { + ctl->discardable_extents[BTRFS_STAT_CURR]--; + ctl->discardable_bytes[BTRFS_STAT_CURR] -= + entry->bytes; + } + try_merge_free_space(ctl, entry, false); steal_from_bitmap(ctl, entry, false); + + /* As we insert directly, update these statistics */ + if (!btrfs_free_space_trimmed(entry)) { + ctl->discardable_extents[BTRFS_STAT_CURR]++; + ctl->discardable_bytes[BTRFS_STAT_CURR] += + entry->bytes; + } } tree_insert_offset(&ctl->free_space_offset, entry->offset, &entry->offset_index, bitmap); @@ -2599,6 +2788,8 @@ void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl) { spin_lock(&ctl->tree_lock); __btrfs_remove_free_space_cache_locked(ctl); + if (ctl->private) + btrfs_discard_update_discardable(ctl->private, ctl); spin_unlock(&ctl->tree_lock); } @@ -2620,20 +2811,55 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group) cond_resched_lock(&ctl->tree_lock); } __btrfs_remove_free_space_cache_locked(ctl); + btrfs_discard_update_discardable(block_group, ctl); spin_unlock(&ctl->tree_lock); } +/** + * btrfs_is_free_space_trimmed - see if everything is trimmed + * @block_group: block_group of interest + * + * Walk @block_group's free space rb_tree to determine if everything is trimmed. + */ +bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group) +{ + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_free_space *info; + struct rb_node *node; + bool ret = true; + + spin_lock(&ctl->tree_lock); + node = rb_first(&ctl->free_space_offset); + + while (node) { + info = rb_entry(node, struct btrfs_free_space, offset_index); + + if (!btrfs_free_space_trimmed(info)) { + ret = false; + break; + } + + node = rb_next(node); + } + + spin_unlock(&ctl->tree_lock); + return ret; +} + u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, u64 offset, u64 bytes, u64 empty_size, u64 *max_extent_size) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_discard_ctl *discard_ctl = + &block_group->fs_info->discard_ctl; struct btrfs_free_space *entry = NULL; u64 bytes_search = bytes + empty_size; u64 ret = 0; u64 align_gap = 0; u64 align_gap_len = 0; + enum btrfs_trim_state align_gap_trim_state = BTRFS_TRIM_STATE_UNTRIMMED; spin_lock(&ctl->tree_lock); entry = find_free_space(ctl, &offset, &bytes_search, @@ -2644,12 +2870,20 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, ret = offset; if (entry->bitmap) { bitmap_clear_bits(ctl, entry, offset, bytes); + + if (!btrfs_free_space_trimmed(entry)) + atomic64_add(bytes, &discard_ctl->discard_bytes_saved); + if (!entry->bytes) free_bitmap(ctl, entry); } else { unlink_free_space(ctl, entry); align_gap_len = offset - entry->offset; align_gap = entry->offset; + align_gap_trim_state = entry->trim_state; + + if (!btrfs_free_space_trimmed(entry)) + atomic64_add(bytes, &discard_ctl->discard_bytes_saved); entry->offset = offset + bytes; WARN_ON(entry->bytes < bytes + align_gap_len); @@ -2661,11 +2895,13 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, link_free_space(ctl, entry); } out: + btrfs_discard_update_discardable(block_group, ctl); spin_unlock(&ctl->tree_lock); if (align_gap_len) __btrfs_add_free_space(block_group->fs_info, ctl, - align_gap, align_gap_len); + align_gap, align_gap_len, + align_gap_trim_state); return ret; } @@ -2707,6 +2943,8 @@ int btrfs_return_cluster_to_free_space( ret = __btrfs_return_cluster_to_free_space(block_group, cluster); spin_unlock(&ctl->tree_lock); + btrfs_discard_queue_work(&block_group->fs_info->discard_ctl, block_group); + /* finally drop our ref */ btrfs_put_block_group(block_group); return ret; @@ -2750,6 +2988,8 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group *block_group, u64 min_start, u64 *max_extent_size) { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; + struct btrfs_discard_ctl *discard_ctl = + &block_group->fs_info->discard_ctl; struct btrfs_free_space *entry = NULL; struct rb_node *node; u64 ret = 0; @@ -2814,7 +3054,12 @@ out: spin_lock(&ctl->tree_lock); + if (!btrfs_free_space_trimmed(entry)) + atomic64_add(bytes, &discard_ctl->discard_bytes_saved); + ctl->free_space -= bytes; + if (!entry->bitmap && !btrfs_free_space_trimmed(entry)) + ctl->discardable_bytes[BTRFS_STAT_CURR] -= bytes; if (entry->bytes == 0) { ctl->free_extents--; if (entry->bitmap) { @@ -2822,6 +3067,8 @@ out: entry->bitmap); ctl->total_bitmaps--; ctl->op->recalc_thresholds(ctl); + } else if (!btrfs_free_space_trimmed(entry)) { + ctl->discardable_extents[BTRFS_STAT_CURR]--; } kmem_cache_free(btrfs_free_space_cachep, entry); } @@ -3148,6 +3395,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) static int do_trimming(struct btrfs_block_group *block_group, u64 *total_trimmed, u64 start, u64 bytes, u64 reserved_start, u64 reserved_bytes, + enum btrfs_trim_state reserved_trim_state, struct btrfs_trim_range *trim_entry) { struct btrfs_space_info *space_info = block_group->space_info; @@ -3155,6 +3403,9 @@ static int do_trimming(struct btrfs_block_group *block_group, struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; int ret; int update = 0; + const u64 end = start + bytes; + const u64 reserved_end = reserved_start + reserved_bytes; + enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_UNTRIMMED; u64 trimmed = 0; spin_lock(&space_info->lock); @@ -3168,11 +3419,20 @@ static int do_trimming(struct btrfs_block_group *block_group, spin_unlock(&space_info->lock); ret = btrfs_discard_extent(fs_info, start, bytes, &trimmed); - if (!ret) + if (!ret) { *total_trimmed += trimmed; + trim_state = BTRFS_TRIM_STATE_TRIMMED; + } mutex_lock(&ctl->cache_writeout_mutex); - btrfs_add_free_space(block_group, reserved_start, reserved_bytes); + if (reserved_start < start) + __btrfs_add_free_space(fs_info, ctl, reserved_start, + start - reserved_start, + reserved_trim_state); + if (start + bytes < reserved_start + reserved_bytes) + __btrfs_add_free_space(fs_info, ctl, end, reserved_end - end, + reserved_trim_state); + __btrfs_add_free_space(fs_info, ctl, start, bytes, trim_state); list_del(&trim_entry->list); mutex_unlock(&ctl->cache_writeout_mutex); @@ -3190,16 +3450,24 @@ static int do_trimming(struct btrfs_block_group *block_group, return ret; } +/* + * If @async is set, then we will trim 1 region and return. + */ static int trim_no_bitmap(struct btrfs_block_group *block_group, - u64 *total_trimmed, u64 start, u64 end, u64 minlen) + u64 *total_trimmed, u64 start, u64 end, u64 minlen, + bool async) { + struct btrfs_discard_ctl *discard_ctl = + &block_group->fs_info->discard_ctl; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry; struct rb_node *node; int ret = 0; u64 extent_start; u64 extent_bytes; + enum btrfs_trim_state extent_trim_state; u64 bytes; + const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); while (start < end) { struct btrfs_trim_range trim_entry; @@ -3207,49 +3475,66 @@ static int trim_no_bitmap(struct btrfs_block_group *block_group, mutex_lock(&ctl->cache_writeout_mutex); spin_lock(&ctl->tree_lock); - if (ctl->free_space < minlen) { - spin_unlock(&ctl->tree_lock); - mutex_unlock(&ctl->cache_writeout_mutex); - break; - } + if (ctl->free_space < minlen) + goto out_unlock; entry = tree_search_offset(ctl, start, 0, 1); - if (!entry) { - spin_unlock(&ctl->tree_lock); - mutex_unlock(&ctl->cache_writeout_mutex); - break; - } + if (!entry) + goto out_unlock; - /* skip bitmaps */ - while (entry->bitmap) { + /* Skip bitmaps and if async, already trimmed entries */ + while (entry->bitmap || + (async && btrfs_free_space_trimmed(entry))) { node = rb_next(&entry->offset_index); - if (!node) { - spin_unlock(&ctl->tree_lock); - mutex_unlock(&ctl->cache_writeout_mutex); - goto out; - } + if (!node) + goto out_unlock; entry = rb_entry(node, struct btrfs_free_space, offset_index); } - if (entry->offset >= end) { - spin_unlock(&ctl->tree_lock); - mutex_unlock(&ctl->cache_writeout_mutex); - break; - } + if (entry->offset >= end) + goto out_unlock; extent_start = entry->offset; extent_bytes = entry->bytes; - start = max(start, extent_start); - bytes = min(extent_start + extent_bytes, end) - start; - if (bytes < minlen) { - spin_unlock(&ctl->tree_lock); - mutex_unlock(&ctl->cache_writeout_mutex); - goto next; - } + extent_trim_state = entry->trim_state; + if (async) { + start = entry->offset; + bytes = entry->bytes; + if (bytes < minlen) { + spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); + goto next; + } + unlink_free_space(ctl, entry); + /* + * Let bytes = BTRFS_MAX_DISCARD_SIZE + X. + * If X < BTRFS_ASYNC_DISCARD_MIN_FILTER, we won't trim + * X when we come back around. So trim it now. + */ + if (max_discard_size && + bytes >= (max_discard_size + + BTRFS_ASYNC_DISCARD_MIN_FILTER)) { + bytes = max_discard_size; + extent_bytes = max_discard_size; + entry->offset += max_discard_size; + entry->bytes -= max_discard_size; + link_free_space(ctl, entry); + } else { + kmem_cache_free(btrfs_free_space_cachep, entry); + } + } else { + start = max(start, extent_start); + bytes = min(extent_start + extent_bytes, end) - start; + if (bytes < minlen) { + spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); + goto next; + } - unlink_free_space(ctl, entry); - kmem_cache_free(btrfs_free_space_cachep, entry); + unlink_free_space(ctl, entry); + kmem_cache_free(btrfs_free_space_cachep, entry); + } spin_unlock(&ctl->tree_lock); trim_entry.start = extent_start; @@ -3258,11 +3543,17 @@ static int trim_no_bitmap(struct btrfs_block_group *block_group, mutex_unlock(&ctl->cache_writeout_mutex); ret = do_trimming(block_group, total_trimmed, start, bytes, - extent_start, extent_bytes, &trim_entry); - if (ret) + extent_start, extent_bytes, extent_trim_state, + &trim_entry); + if (ret) { + block_group->discard_cursor = start + bytes; break; + } next: start += bytes; + block_group->discard_cursor = start; + if (async && *total_trimmed) + break; if (fatal_signal_pending(current)) { ret = -ERESTARTSYS; @@ -3271,19 +3562,76 @@ next: cond_resched(); } -out: + + return ret; + +out_unlock: + block_group->discard_cursor = btrfs_block_group_end(block_group); + spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); + return ret; } +/* + * If we break out of trimming a bitmap prematurely, we should reset the + * trimming bit. In a rather contrieved case, it's possible to race here so + * reset the state to BTRFS_TRIM_STATE_UNTRIMMED. + * + * start = start of bitmap + * end = near end of bitmap + * + * Thread 1: Thread 2: + * trim_bitmaps(start) + * trim_bitmaps(end) + * end_trimming_bitmap() + * reset_trimming_bitmap() + */ +static void reset_trimming_bitmap(struct btrfs_free_space_ctl *ctl, u64 offset) +{ + struct btrfs_free_space *entry; + + spin_lock(&ctl->tree_lock); + entry = tree_search_offset(ctl, offset, 1, 0); + if (entry) { + if (btrfs_free_space_trimmed(entry)) { + ctl->discardable_extents[BTRFS_STAT_CURR] += + entry->bitmap_extents; + ctl->discardable_bytes[BTRFS_STAT_CURR] += entry->bytes; + } + entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; + } + + spin_unlock(&ctl->tree_lock); +} + +static void end_trimming_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *entry) +{ + if (btrfs_free_space_trimming_bitmap(entry)) { + entry->trim_state = BTRFS_TRIM_STATE_TRIMMED; + ctl->discardable_extents[BTRFS_STAT_CURR] -= + entry->bitmap_extents; + ctl->discardable_bytes[BTRFS_STAT_CURR] -= entry->bytes; + } +} + +/* + * If @async is set, then we will trim 1 region and return. + */ static int trim_bitmaps(struct btrfs_block_group *block_group, - u64 *total_trimmed, u64 start, u64 end, u64 minlen) + u64 *total_trimmed, u64 start, u64 end, u64 minlen, + u64 maxlen, bool async) { + struct btrfs_discard_ctl *discard_ctl = + &block_group->fs_info->discard_ctl; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *entry; int ret = 0; int ret2; u64 bytes; u64 offset = offset_to_bitmap(ctl, start); + const u64 max_discard_size = READ_ONCE(discard_ctl->max_discard_size); while (offset < end) { bool next_bitmap = false; @@ -3293,35 +3641,84 @@ static int trim_bitmaps(struct btrfs_block_group *block_group, spin_lock(&ctl->tree_lock); if (ctl->free_space < minlen) { + block_group->discard_cursor = + btrfs_block_group_end(block_group); spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); break; } entry = tree_search_offset(ctl, offset, 1, 0); - if (!entry) { + /* + * Bitmaps are marked trimmed lossily now to prevent constant + * discarding of the same bitmap (the reason why we are bound + * by the filters). So, retrim the block group bitmaps when we + * are preparing to punt to the unused_bgs list. This uses + * @minlen to determine if we are in BTRFS_DISCARD_INDEX_UNUSED + * which is the only discard index which sets minlen to 0. + */ + if (!entry || (async && minlen && start == offset && + btrfs_free_space_trimmed(entry))) { spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); next_bitmap = true; goto next; } + /* + * Async discard bitmap trimming begins at by setting the start + * to be key.objectid and the offset_to_bitmap() aligns to the + * start of the bitmap. This lets us know we are fully + * scanning the bitmap rather than only some portion of it. + */ + if (start == offset) + entry->trim_state = BTRFS_TRIM_STATE_TRIMMING; + bytes = minlen; ret2 = search_bitmap(ctl, entry, &start, &bytes, false); if (ret2 || start >= end) { + /* + * We lossily consider a bitmap trimmed if we only skip + * over regions <= BTRFS_ASYNC_DISCARD_MIN_FILTER. + */ + if (ret2 && minlen <= BTRFS_ASYNC_DISCARD_MIN_FILTER) + end_trimming_bitmap(ctl, entry); + else + entry->trim_state = BTRFS_TRIM_STATE_UNTRIMMED; spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); next_bitmap = true; goto next; } + /* + * We already trimmed a region, but are using the locking above + * to reset the trim_state. + */ + if (async && *total_trimmed) { + spin_unlock(&ctl->tree_lock); + mutex_unlock(&ctl->cache_writeout_mutex); + goto out; + } + bytes = min(bytes, end - start); - if (bytes < minlen) { + if (bytes < minlen || (async && maxlen && bytes > maxlen)) { spin_unlock(&ctl->tree_lock); mutex_unlock(&ctl->cache_writeout_mutex); goto next; } + /* + * Let bytes = BTRFS_MAX_DISCARD_SIZE + X. + * If X < @minlen, we won't trim X when we come back around. + * So trim it now. We differ here from trimming extents as we + * don't keep individual state per bit. + */ + if (async && + max_discard_size && + bytes > (max_discard_size + minlen)) + bytes = max_discard_size; + bitmap_clear_bits(ctl, entry, start, bytes); if (entry->bytes == 0) free_bitmap(ctl, entry); @@ -3333,19 +3730,25 @@ static int trim_bitmaps(struct btrfs_block_group *block_group, mutex_unlock(&ctl->cache_writeout_mutex); ret = do_trimming(block_group, total_trimmed, start, bytes, - start, bytes, &trim_entry); - if (ret) + start, bytes, 0, &trim_entry); + if (ret) { + reset_trimming_bitmap(ctl, offset); + block_group->discard_cursor = + btrfs_block_group_end(block_group); break; + } next: if (next_bitmap) { offset += BITS_PER_BITMAP * ctl->unit; + start = offset; } else { start += bytes; - if (start >= offset + BITS_PER_BITMAP * ctl->unit) - offset += BITS_PER_BITMAP * ctl->unit; } + block_group->discard_cursor = start; if (fatal_signal_pending(current)) { + if (start != offset) + reset_trimming_bitmap(ctl, offset); ret = -ERESTARTSYS; break; } @@ -3353,6 +3756,10 @@ next: cond_resched(); } + if (offset >= end) + block_group->discard_cursor = end; + +out: return ret; } @@ -3399,7 +3806,9 @@ void btrfs_put_block_group_trimming(struct btrfs_block_group *block_group) int btrfs_trim_block_group(struct btrfs_block_group *block_group, u64 *trimmed, u64 start, u64 end, u64 minlen) { + struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; int ret; + u64 rem = 0; *trimmed = 0; @@ -3411,16 +3820,66 @@ int btrfs_trim_block_group(struct btrfs_block_group *block_group, btrfs_get_block_group_trimming(block_group); spin_unlock(&block_group->lock); - ret = trim_no_bitmap(block_group, trimmed, start, end, minlen); + ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, false); if (ret) goto out; - ret = trim_bitmaps(block_group, trimmed, start, end, minlen); + ret = trim_bitmaps(block_group, trimmed, start, end, minlen, 0, false); + div64_u64_rem(end, BITS_PER_BITMAP * ctl->unit, &rem); + /* If we ended in the middle of a bitmap, reset the trimming flag */ + if (rem) + reset_trimming_bitmap(ctl, offset_to_bitmap(ctl, end)); out: btrfs_put_block_group_trimming(block_group); return ret; } +int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group, + u64 *trimmed, u64 start, u64 end, u64 minlen, + bool async) +{ + int ret; + + *trimmed = 0; + + spin_lock(&block_group->lock); + if (block_group->removed) { + spin_unlock(&block_group->lock); + return 0; + } + btrfs_get_block_group_trimming(block_group); + spin_unlock(&block_group->lock); + + ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async); + btrfs_put_block_group_trimming(block_group); + + return ret; +} + +int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group, + u64 *trimmed, u64 start, u64 end, u64 minlen, + u64 maxlen, bool async) +{ + int ret; + + *trimmed = 0; + + spin_lock(&block_group->lock); + if (block_group->removed) { + spin_unlock(&block_group->lock); + return 0; + } + btrfs_get_block_group_trimming(block_group); + spin_unlock(&block_group->lock); + + ret = trim_bitmaps(block_group, trimmed, start, end, minlen, maxlen, + async); + + btrfs_put_block_group_trimming(block_group); + + return ret; +} + /* * Find the left-most item in the cache tree, and then return the * smallest inode number in the item. @@ -3600,6 +4059,7 @@ int test_add_free_space_entry(struct btrfs_block_group *cache, struct btrfs_free_space_ctl *ctl = cache->free_space_ctl; struct btrfs_free_space *info = NULL, *bitmap_info; void *map = NULL; + enum btrfs_trim_state trim_state = BTRFS_TRIM_STATE_TRIMMED; u64 bytes_added; int ret; @@ -3641,7 +4101,8 @@ again: info = NULL; } - bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); + bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes, + trim_state); bytes -= bytes_added; offset += bytes_added; diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index ba9a23241101..2e0a8077aa74 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -6,6 +6,20 @@ #ifndef BTRFS_FREE_SPACE_CACHE_H #define BTRFS_FREE_SPACE_CACHE_H +/* + * This is the trim state of an extent or bitmap. + * + * BTRFS_TRIM_STATE_TRIMMING is special and used to maintain the state of a + * bitmap as we may need several trims to fully trim a single bitmap entry. + * This is reset should any free space other than trimmed space be added to the + * bitmap. + */ +enum btrfs_trim_state { + BTRFS_TRIM_STATE_UNTRIMMED, + BTRFS_TRIM_STATE_TRIMMED, + BTRFS_TRIM_STATE_TRIMMING, +}; + struct btrfs_free_space { struct rb_node offset_index; u64 offset; @@ -13,8 +27,21 @@ struct btrfs_free_space { u64 max_extent_size; unsigned long *bitmap; struct list_head list; + enum btrfs_trim_state trim_state; + s32 bitmap_extents; }; +static inline bool btrfs_free_space_trimmed(struct btrfs_free_space *info) +{ + return (info->trim_state == BTRFS_TRIM_STATE_TRIMMED); +} + +static inline bool btrfs_free_space_trimming_bitmap( + struct btrfs_free_space *info) +{ + return (info->trim_state == BTRFS_TRIM_STATE_TRIMMING); +} + struct btrfs_free_space_ctl { spinlock_t tree_lock; struct rb_root free_space_offset; @@ -24,6 +51,8 @@ struct btrfs_free_space_ctl { int total_bitmaps; int unit; u64 start; + s32 discardable_extents[BTRFS_STAT_NR_ENTRIES]; + s64 discardable_bytes[BTRFS_STAT_NR_ENTRIES]; const struct btrfs_free_space_op *op; void *private; struct mutex cache_writeout_mutex; @@ -83,13 +112,17 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root, void btrfs_init_free_space_ctl(struct btrfs_block_group *block_group); int __btrfs_add_free_space(struct btrfs_fs_info *fs_info, struct btrfs_free_space_ctl *ctl, - u64 bytenr, u64 size); + u64 bytenr, u64 size, + enum btrfs_trim_state trim_state); int btrfs_add_free_space(struct btrfs_block_group *block_group, u64 bytenr, u64 size); +int btrfs_add_free_space_async_trimmed(struct btrfs_block_group *block_group, + u64 bytenr, u64 size); int btrfs_remove_free_space(struct btrfs_block_group *block_group, u64 bytenr, u64 size); void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl); void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group); +bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group); u64 btrfs_find_space_for_alloc(struct btrfs_block_group *block_group, u64 offset, u64 bytes, u64 empty_size, u64 *max_extent_size); @@ -108,6 +141,12 @@ int btrfs_return_cluster_to_free_space( struct btrfs_free_cluster *cluster); int btrfs_trim_block_group(struct btrfs_block_group *block_group, u64 *trimmed, u64 start, u64 end, u64 minlen); +int btrfs_trim_block_group_extents(struct btrfs_block_group *block_group, + u64 *trimmed, u64 start, u64 end, u64 minlen, + bool async); +int btrfs_trim_block_group_bitmaps(struct btrfs_block_group *block_group, + u64 *trimmed, u64 start, u64 end, u64 minlen, + u64 maxlen, bool async); /* Support functions for running our sanity tests */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 37345fb6191d..d5c9c69d8263 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -107,7 +107,7 @@ again: if (last != (u64)-1 && last + 1 != key.objectid) { __btrfs_add_free_space(fs_info, ctl, last + 1, - key.objectid - last - 1); + key.objectid - last - 1, 0); wake_up(&root->ino_cache_wait); } @@ -118,7 +118,7 @@ next: if (last < root->highest_objectid - 1) { __btrfs_add_free_space(fs_info, ctl, last + 1, - root->highest_objectid - last - 1); + root->highest_objectid - last - 1, 0); } spin_lock(&root->ino_cache_lock); @@ -175,7 +175,8 @@ static void start_caching(struct btrfs_root *root) ret = btrfs_find_free_objectid(root, &objectid); if (!ret && objectid <= BTRFS_LAST_FREE_OBJECTID) { __btrfs_add_free_space(fs_info, ctl, objectid, - BTRFS_LAST_FREE_OBJECTID - objectid + 1); + BTRFS_LAST_FREE_OBJECTID - objectid + 1, + 0); wake_up(&root->ino_cache_wait); } @@ -221,7 +222,7 @@ void btrfs_return_ino(struct btrfs_root *root, u64 objectid) return; again: if (root->ino_cache_state == BTRFS_CACHE_FINISHED) { - __btrfs_add_free_space(fs_info, pinned, objectid, 1); + __btrfs_add_free_space(fs_info, pinned, objectid, 1, 0); } else { down_write(&fs_info->commit_root_sem); spin_lock(&root->ino_cache_lock); @@ -234,7 +235,7 @@ again: start_caching(root); - __btrfs_add_free_space(fs_info, pinned, objectid, 1); + __btrfs_add_free_space(fs_info, pinned, objectid, 1, 0); up_write(&fs_info->commit_root_sem); } @@ -281,7 +282,7 @@ void btrfs_unpin_free_ino(struct btrfs_root *root) spin_unlock(rbroot_lock); if (count) __btrfs_add_free_space(root->fs_info, ctl, - info->offset, count); + info->offset, count, 0); kmem_cache_free(btrfs_free_space_cachep, info); } } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c70baafb2a39..5b3ec93ff911 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -44,7 +44,6 @@ #include "locking.h" #include "free-space-cache.h" #include "inode-map.h" -#include "backref.h" #include "props.h" #include "qgroup.h" #include "delalloc-space.h" @@ -64,7 +63,6 @@ struct btrfs_dio_data { static const struct inode_operations btrfs_dir_inode_operations; static const struct inode_operations btrfs_symlink_inode_operations; -static const struct inode_operations btrfs_dir_ro_inode_operations; static const struct inode_operations btrfs_special_inode_operations; static const struct inode_operations btrfs_file_inode_operations; static const struct address_space_operations btrfs_aops; @@ -2128,7 +2126,7 @@ static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio, bio_flags); goto out; } else if (!skip_sum) { - ret = btrfs_lookup_bio_sums(inode, bio, NULL); + ret = btrfs_lookup_bio_sums(inode, bio, (u64)-1, NULL); if (ret) goto out; } @@ -2191,6 +2189,7 @@ int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, /* see btrfs_writepage_start_hook for details on why this is required */ struct btrfs_writepage_fixup { struct page *page; + struct inode *inode; struct btrfs_work work; }; @@ -2204,27 +2203,71 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) struct inode *inode; u64 page_start; u64 page_end; - int ret; + int ret = 0; + bool free_delalloc_space = true; fixup = container_of(work, struct btrfs_writepage_fixup, work); page = fixup->page; + inode = fixup->inode; + page_start = page_offset(page); + page_end = page_offset(page) + PAGE_SIZE - 1; + + /* + * This is similar to page_mkwrite, we need to reserve the space before + * we take the page lock. + */ + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, + PAGE_SIZE); again: lock_page(page); + + /* + * Before we queued this fixup, we took a reference on the page. + * page->mapping may go NULL, but it shouldn't be moved to a different + * address space. + */ if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { - ClearPageChecked(page); + /* + * Unfortunately this is a little tricky, either + * + * 1) We got here and our page had already been dealt with and + * we reserved our space, thus ret == 0, so we need to just + * drop our space reservation and bail. This can happen the + * first time we come into the fixup worker, or could happen + * while waiting for the ordered extent. + * 2) Our page was already dealt with, but we happened to get an + * ENOSPC above from the btrfs_delalloc_reserve_space. In + * this case we obviously don't have anything to release, but + * because the page was already dealt with we don't want to + * mark the page with an error, so make sure we're resetting + * ret to 0. This is why we have this check _before_ the ret + * check, because we do not want to have a surprise ENOSPC + * when the page was already properly dealt with. + */ + if (!ret) { + btrfs_delalloc_release_extents(BTRFS_I(inode), + PAGE_SIZE); + btrfs_delalloc_release_space(inode, data_reserved, + page_start, PAGE_SIZE, + true); + } + ret = 0; goto out_page; } - inode = page->mapping->host; - page_start = page_offset(page); - page_end = page_offset(page) + PAGE_SIZE - 1; + /* + * We can't mess with the page state unless it is locked, so now that + * it is locked bail if we failed to make our space reservation. + */ + if (ret) + goto out_page; lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, &cached_state); /* already ordered? We're done */ if (PagePrivate2(page)) - goto out; + goto out_reserved; ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, PAGE_SIZE); @@ -2237,39 +2280,49 @@ again: goto again; } - ret = btrfs_delalloc_reserve_space(inode, &data_reserved, page_start, - PAGE_SIZE); - if (ret) { - mapping_set_error(page->mapping, ret); - end_extent_writepage(page, ret, page_start, page_end); - ClearPageChecked(page); - goto out; - } - ret = btrfs_set_extent_delalloc(inode, page_start, page_end, 0, &cached_state); - if (ret) { - mapping_set_error(page->mapping, ret); - end_extent_writepage(page, ret, page_start, page_end); - ClearPageChecked(page); + if (ret) goto out_reserved; - } - ClearPageChecked(page); - set_page_dirty(page); + /* + * Everything went as planned, we're now the owner of a dirty page with + * delayed allocation bits set and space reserved for our COW + * destination. + * + * The page was dirty when we started, nothing should have cleaned it. + */ + BUG_ON(!PageDirty(page)); + free_delalloc_space = false; out_reserved: btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); - if (ret) + if (free_delalloc_space) btrfs_delalloc_release_space(inode, data_reserved, page_start, PAGE_SIZE, true); -out: unlock_extent_cached(&BTRFS_I(inode)->io_tree, page_start, page_end, &cached_state); out_page: + if (ret) { + /* + * We hit ENOSPC or other errors. Update the mapping and page + * to reflect the errors and clean the page. + */ + mapping_set_error(page->mapping, ret); + end_extent_writepage(page, ret, page_start, page_end); + clear_page_dirty_for_io(page); + SetPageError(page); + } + ClearPageChecked(page); unlock_page(page); put_page(page); kfree(fixup); extent_changeset_free(data_reserved); + /* + * As a precaution, do a delayed iput in case it would be the last iput + * that could need flushing space. Recursing back to fixup worker would + * deadlock. + */ + btrfs_add_delayed_iput(inode); } /* @@ -2293,6 +2346,13 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end) if (TestClearPagePrivate2(page)) return 0; + /* + * PageChecked is set below when we create a fixup worker for this page, + * don't try to create another one if we're already PageChecked() + * + * The extent_io writepage code will redirty the page if we send back + * EAGAIN. + */ if (PageChecked(page)) return -EAGAIN; @@ -2300,12 +2360,21 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end) if (!fixup) return -EAGAIN; + /* + * We are already holding a reference to this inode from + * write_cache_pages. We need to hold it because the space reservation + * takes place outside of the page lock, and we can't trust + * page->mapping outside of the page lock. + */ + ihold(inode); SetPageChecked(page); get_page(page); btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL); fixup->page = page; + fixup->inode = inode; btrfs_queue_work(fs_info->fixup_workers, &fixup->work); - return -EBUSY; + + return -EAGAIN; } static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, @@ -2394,649 +2463,6 @@ out: return ret; } -/* snapshot-aware defrag */ -struct sa_defrag_extent_backref { - struct rb_node node; - struct old_sa_defrag_extent *old; - u64 root_id; - u64 inum; - u64 file_pos; - u64 extent_offset; - u64 num_bytes; - u64 generation; -}; - -struct old_sa_defrag_extent { - struct list_head list; - struct new_sa_defrag_extent *new; - - u64 extent_offset; - u64 bytenr; - u64 offset; - u64 len; - int count; -}; - -struct new_sa_defrag_extent { - struct rb_root root; - struct list_head head; - struct btrfs_path *path; - struct inode *inode; - u64 file_pos; - u64 len; - u64 bytenr; - u64 disk_len; - u8 compress_type; -}; - -static int backref_comp(struct sa_defrag_extent_backref *b1, - struct sa_defrag_extent_backref *b2) -{ - if (b1->root_id < b2->root_id) - return -1; - else if (b1->root_id > b2->root_id) - return 1; - - if (b1->inum < b2->inum) - return -1; - else if (b1->inum > b2->inum) - return 1; - - if (b1->file_pos < b2->file_pos) - return -1; - else if (b1->file_pos > b2->file_pos) - return 1; - - /* - * [------------------------------] ===> (a range of space) - * |<--->| |<---->| =============> (fs/file tree A) - * |<---------------------------->| ===> (fs/file tree B) - * - * A range of space can refer to two file extents in one tree while - * refer to only one file extent in another tree. - * - * So we may process a disk offset more than one time(two extents in A) - * and locate at the same extent(one extent in B), then insert two same - * backrefs(both refer to the extent in B). - */ - return 0; -} - -static void backref_insert(struct rb_root *root, - struct sa_defrag_extent_backref *backref) -{ - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - struct sa_defrag_extent_backref *entry; - int ret; - - while (*p) { - parent = *p; - entry = rb_entry(parent, struct sa_defrag_extent_backref, node); - - ret = backref_comp(backref, entry); - if (ret < 0) - p = &(*p)->rb_left; - else - p = &(*p)->rb_right; - } - - rb_link_node(&backref->node, parent, p); - rb_insert_color(&backref->node, root); -} - -/* - * Note the backref might has changed, and in this case we just return 0. - */ -static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id, - void *ctx) -{ - struct btrfs_file_extent_item *extent; - struct old_sa_defrag_extent *old = ctx; - struct new_sa_defrag_extent *new = old->new; - struct btrfs_path *path = new->path; - struct btrfs_key key; - struct btrfs_root *root; - struct sa_defrag_extent_backref *backref; - struct extent_buffer *leaf; - struct inode *inode = new->inode; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int slot; - int ret; - u64 extent_offset; - u64 num_bytes; - - if (BTRFS_I(inode)->root->root_key.objectid == root_id && - inum == btrfs_ino(BTRFS_I(inode))) - return 0; - - key.objectid = root_id; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - - root = btrfs_read_fs_root_no_name(fs_info, &key); - if (IS_ERR(root)) { - if (PTR_ERR(root) == -ENOENT) - return 0; - WARN_ON(1); - btrfs_debug(fs_info, "inum=%llu, offset=%llu, root_id=%llu", - inum, offset, root_id); - return PTR_ERR(root); - } - - key.objectid = inum; - key.type = BTRFS_EXTENT_DATA_KEY; - if (offset > (u64)-1 << 32) - key.offset = 0; - else - key.offset = offset; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (WARN_ON(ret < 0)) - return ret; - ret = 0; - - while (1) { - cond_resched(); - - leaf = path->nodes[0]; - slot = path->slots[0]; - - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) { - goto out; - } else if (ret > 0) { - ret = 0; - goto out; - } - continue; - } - - path->slots[0]++; - - btrfs_item_key_to_cpu(leaf, &key, slot); - - if (key.objectid > inum) - goto out; - - if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY) - continue; - - extent = btrfs_item_ptr(leaf, slot, - struct btrfs_file_extent_item); - - if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr) - continue; - - /* - * 'offset' refers to the exact key.offset, - * NOT the 'offset' field in btrfs_extent_data_ref, ie. - * (key.offset - extent_offset). - */ - if (key.offset != offset) - continue; - - extent_offset = btrfs_file_extent_offset(leaf, extent); - num_bytes = btrfs_file_extent_num_bytes(leaf, extent); - - if (extent_offset >= old->extent_offset + old->offset + - old->len || extent_offset + num_bytes <= - old->extent_offset + old->offset) - continue; - break; - } - - backref = kmalloc(sizeof(*backref), GFP_NOFS); - if (!backref) { - ret = -ENOENT; - goto out; - } - - backref->root_id = root_id; - backref->inum = inum; - backref->file_pos = offset; - backref->num_bytes = num_bytes; - backref->extent_offset = extent_offset; - backref->generation = btrfs_file_extent_generation(leaf, extent); - backref->old = old; - backref_insert(&new->root, backref); - old->count++; -out: - btrfs_release_path(path); - WARN_ON(ret); - return ret; -} - -static noinline bool record_extent_backrefs(struct btrfs_path *path, - struct new_sa_defrag_extent *new) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb); - struct old_sa_defrag_extent *old, *tmp; - int ret; - - new->path = path; - - list_for_each_entry_safe(old, tmp, &new->head, list) { - ret = iterate_inodes_from_logical(old->bytenr + - old->extent_offset, fs_info, - path, record_one_backref, - old, false); - if (ret < 0 && ret != -ENOENT) - return false; - - /* no backref to be processed for this extent */ - if (!old->count) { - list_del(&old->list); - kfree(old); - } - } - - if (list_empty(&new->head)) - return false; - - return true; -} - -static int relink_is_mergable(struct extent_buffer *leaf, - struct btrfs_file_extent_item *fi, - struct new_sa_defrag_extent *new) -{ - if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr) - return 0; - - if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) - return 0; - - if (btrfs_file_extent_compression(leaf, fi) != new->compress_type) - return 0; - - if (btrfs_file_extent_encryption(leaf, fi) || - btrfs_file_extent_other_encoding(leaf, fi)) - return 0; - - return 1; -} - -/* - * Note the backref might has changed, and in this case we just return 0. - */ -static noinline int relink_extent_backref(struct btrfs_path *path, - struct sa_defrag_extent_backref *prev, - struct sa_defrag_extent_backref *backref) -{ - struct btrfs_file_extent_item *extent; - struct btrfs_file_extent_item *item; - struct btrfs_ordered_extent *ordered; - struct btrfs_trans_handle *trans; - struct btrfs_ref ref = { 0 }; - struct btrfs_root *root; - struct btrfs_key key; - struct extent_buffer *leaf; - struct old_sa_defrag_extent *old = backref->old; - struct new_sa_defrag_extent *new = old->new; - struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb); - struct inode *inode; - struct extent_state *cached = NULL; - int ret = 0; - u64 start; - u64 len; - u64 lock_start; - u64 lock_end; - bool merge = false; - int index; - - if (prev && prev->root_id == backref->root_id && - prev->inum == backref->inum && - prev->file_pos + prev->num_bytes == backref->file_pos) - merge = true; - - /* step 1: get root */ - key.objectid = backref->root_id; - key.type = BTRFS_ROOT_ITEM_KEY; - key.offset = (u64)-1; - - index = srcu_read_lock(&fs_info->subvol_srcu); - - root = btrfs_read_fs_root_no_name(fs_info, &key); - if (IS_ERR(root)) { - srcu_read_unlock(&fs_info->subvol_srcu, index); - if (PTR_ERR(root) == -ENOENT) - return 0; - return PTR_ERR(root); - } - - if (btrfs_root_readonly(root)) { - srcu_read_unlock(&fs_info->subvol_srcu, index); - return 0; - } - - /* step 2: get inode */ - key.objectid = backref->inum; - key.type = BTRFS_INODE_ITEM_KEY; - key.offset = 0; - - inode = btrfs_iget(fs_info->sb, &key, root); - if (IS_ERR(inode)) { - srcu_read_unlock(&fs_info->subvol_srcu, index); - return 0; - } - - srcu_read_unlock(&fs_info->subvol_srcu, index); - - /* step 3: relink backref */ - lock_start = backref->file_pos; - lock_end = backref->file_pos + backref->num_bytes - 1; - lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end, - &cached); - - ordered = btrfs_lookup_first_ordered_extent(inode, lock_end); - if (ordered) { - btrfs_put_ordered_extent(ordered); - goto out_unlock; - } - - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out_unlock; - } - - key.objectid = backref->inum; - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = backref->file_pos; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) { - goto out_free_path; - } else if (ret > 0) { - ret = 0; - goto out_free_path; - } - - extent = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); - - if (btrfs_file_extent_generation(path->nodes[0], extent) != - backref->generation) - goto out_free_path; - - btrfs_release_path(path); - - start = backref->file_pos; - if (backref->extent_offset < old->extent_offset + old->offset) - start += old->extent_offset + old->offset - - backref->extent_offset; - - len = min(backref->extent_offset + backref->num_bytes, - old->extent_offset + old->offset + old->len); - len -= max(backref->extent_offset, old->extent_offset + old->offset); - - ret = btrfs_drop_extents(trans, root, inode, start, - start + len, 1); - if (ret) - goto out_free_path; -again: - key.objectid = btrfs_ino(BTRFS_I(inode)); - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = start; - - path->leave_spinning = 1; - if (merge) { - struct btrfs_file_extent_item *fi; - u64 extent_len; - struct btrfs_key found_key; - - ret = btrfs_search_slot(trans, root, &key, path, 0, 1); - if (ret < 0) - goto out_free_path; - - path->slots[0]--; - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - extent_len = btrfs_file_extent_num_bytes(leaf, fi); - - if (extent_len + found_key.offset == start && - relink_is_mergable(leaf, fi, new)) { - btrfs_set_file_extent_num_bytes(leaf, fi, - extent_len + len); - btrfs_mark_buffer_dirty(leaf); - inode_add_bytes(inode, len); - - ret = 1; - goto out_free_path; - } else { - merge = false; - btrfs_release_path(path); - goto again; - } - } - - ret = btrfs_insert_empty_item(trans, root, path, &key, - sizeof(*extent)); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_free_path; - } - - leaf = path->nodes[0]; - item = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr); - btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len); - btrfs_set_file_extent_offset(leaf, item, start - new->file_pos); - btrfs_set_file_extent_num_bytes(leaf, item, len); - btrfs_set_file_extent_ram_bytes(leaf, item, new->len); - btrfs_set_file_extent_generation(leaf, item, trans->transid); - btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); - btrfs_set_file_extent_compression(leaf, item, new->compress_type); - btrfs_set_file_extent_encryption(leaf, item, 0); - btrfs_set_file_extent_other_encoding(leaf, item, 0); - - btrfs_mark_buffer_dirty(leaf); - inode_add_bytes(inode, len); - btrfs_release_path(path); - - btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new->bytenr, - new->disk_len, 0); - btrfs_init_data_ref(&ref, backref->root_id, backref->inum, - new->file_pos); /* start - extent_offset */ - ret = btrfs_inc_extent_ref(trans, &ref); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out_free_path; - } - - ret = 1; -out_free_path: - btrfs_release_path(path); - path->leave_spinning = 0; - btrfs_end_transaction(trans); -out_unlock: - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end, - &cached); - iput(inode); - return ret; -} - -static void free_sa_defrag_extent(struct new_sa_defrag_extent *new) -{ - struct old_sa_defrag_extent *old, *tmp; - - if (!new) - return; - - list_for_each_entry_safe(old, tmp, &new->head, list) { - kfree(old); - } - kfree(new); -} - -static void relink_file_extents(struct new_sa_defrag_extent *new) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(new->inode->i_sb); - struct btrfs_path *path; - struct sa_defrag_extent_backref *backref; - struct sa_defrag_extent_backref *prev = NULL; - struct rb_node *node; - int ret; - - path = btrfs_alloc_path(); - if (!path) - return; - - if (!record_extent_backrefs(path, new)) { - btrfs_free_path(path); - goto out; - } - btrfs_release_path(path); - - while (1) { - node = rb_first(&new->root); - if (!node) - break; - rb_erase(node, &new->root); - - backref = rb_entry(node, struct sa_defrag_extent_backref, node); - - ret = relink_extent_backref(path, prev, backref); - WARN_ON(ret < 0); - - kfree(prev); - - if (ret == 1) - prev = backref; - else - prev = NULL; - cond_resched(); - } - kfree(prev); - - btrfs_free_path(path); -out: - free_sa_defrag_extent(new); - - atomic_dec(&fs_info->defrag_running); - wake_up(&fs_info->transaction_wait); -} - -static struct new_sa_defrag_extent * -record_old_file_extents(struct inode *inode, - struct btrfs_ordered_extent *ordered) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_path *path; - struct btrfs_key key; - struct old_sa_defrag_extent *old; - struct new_sa_defrag_extent *new; - int ret; - - new = kmalloc(sizeof(*new), GFP_NOFS); - if (!new) - return NULL; - - new->inode = inode; - new->file_pos = ordered->file_offset; - new->len = ordered->len; - new->bytenr = ordered->start; - new->disk_len = ordered->disk_len; - new->compress_type = ordered->compress_type; - new->root = RB_ROOT; - INIT_LIST_HEAD(&new->head); - - path = btrfs_alloc_path(); - if (!path) - goto out_kfree; - - key.objectid = btrfs_ino(BTRFS_I(inode)); - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = new->file_pos; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out_free_path; - if (ret > 0 && path->slots[0] > 0) - path->slots[0]--; - - /* find out all the old extents for the file range */ - while (1) { - struct btrfs_file_extent_item *extent; - struct extent_buffer *l; - int slot; - u64 num_bytes; - u64 offset; - u64 end; - u64 disk_bytenr; - u64 extent_offset; - - l = path->nodes[0]; - slot = path->slots[0]; - - if (slot >= btrfs_header_nritems(l)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto out_free_path; - else if (ret > 0) - break; - continue; - } - - btrfs_item_key_to_cpu(l, &key, slot); - - if (key.objectid != btrfs_ino(BTRFS_I(inode))) - break; - if (key.type != BTRFS_EXTENT_DATA_KEY) - break; - if (key.offset >= new->file_pos + new->len) - break; - - extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item); - - num_bytes = btrfs_file_extent_num_bytes(l, extent); - if (key.offset + num_bytes < new->file_pos) - goto next; - - disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent); - if (!disk_bytenr) - goto next; - - extent_offset = btrfs_file_extent_offset(l, extent); - - old = kmalloc(sizeof(*old), GFP_NOFS); - if (!old) - goto out_free_path; - - offset = max(new->file_pos, key.offset); - end = min(new->file_pos + new->len, key.offset + num_bytes); - - old->bytenr = disk_bytenr; - old->extent_offset = extent_offset; - old->offset = offset - key.offset; - old->len = end - offset; - old->new = new; - old->count = 0; - list_add_tail(&old->list, &new->head); -next: - path->slots[0]++; - cond_resched(); - } - - btrfs_free_path(path); - atomic_inc(&fs_info->defrag_running); - - return new; - -out_free_path: - btrfs_free_path(path); -out_kfree: - free_sa_defrag_extent(new); - return NULL; -} - static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info, u64 start, u64 len) { @@ -3064,15 +2490,19 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) struct btrfs_trans_handle *trans = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; - struct new_sa_defrag_extent *new = NULL; + u64 start, end; int compress_type = 0; int ret = 0; - u64 logical_len = ordered_extent->len; + u64 logical_len = ordered_extent->num_bytes; bool freespace_inode; bool truncated = false; bool range_locked = false; bool clear_new_delalloc_bytes = false; bool clear_reserved_extent = true; + unsigned int clear_bits; + + start = ordered_extent->file_offset; + end = start + ordered_extent->num_bytes - 1; if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) && @@ -3086,10 +2516,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } - btrfs_free_io_failure_record(BTRFS_I(inode), - ordered_extent->file_offset, - ordered_extent->file_offset + - ordered_extent->len - 1); + btrfs_free_io_failure_record(BTRFS_I(inode), start, end); if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { truncated = true; @@ -3107,8 +2534,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) * space for NOCOW range. * As NOCOW won't cause a new delayed ref, just free the space */ - btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset, - ordered_extent->len); + btrfs_qgroup_free_data(inode, NULL, start, + ordered_extent->num_bytes); btrfs_ordered_update_i_size(inode, 0, ordered_extent); if (freespace_inode) trans = btrfs_join_transaction_spacecache(root); @@ -3127,23 +2554,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) } range_locked = true; - lock_extent_bits(io_tree, ordered_extent->file_offset, - ordered_extent->file_offset + ordered_extent->len - 1, - &cached_state); - - ret = test_range_bit(io_tree, ordered_extent->file_offset, - ordered_extent->file_offset + ordered_extent->len - 1, - EXTENT_DEFRAG, 0, cached_state); - if (ret) { - u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item); - if (0 && last_snapshot >= BTRFS_I(inode)->generation) - /* the inode is shared */ - new = record_old_file_extents(inode, ordered_extent); - - clear_extent_bit(io_tree, ordered_extent->file_offset, - ordered_extent->file_offset + ordered_extent->len - 1, - EXTENT_DEFRAG, 0, 0, &cached_state); - } + lock_extent_bits(io_tree, start, end, &cached_state); if (freespace_inode) trans = btrfs_join_transaction_spacecache(root); @@ -3161,31 +2572,30 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) compress_type = ordered_extent->compress_type; if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { BUG_ON(compress_type); - btrfs_qgroup_free_data(inode, NULL, ordered_extent->file_offset, - ordered_extent->len); + btrfs_qgroup_free_data(inode, NULL, start, + ordered_extent->num_bytes); ret = btrfs_mark_extent_written(trans, BTRFS_I(inode), ordered_extent->file_offset, ordered_extent->file_offset + logical_len); } else { BUG_ON(root == fs_info->tree_root); - ret = insert_reserved_file_extent(trans, inode, - ordered_extent->file_offset, - ordered_extent->start, - ordered_extent->disk_len, + ret = insert_reserved_file_extent(trans, inode, start, + ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes, logical_len, logical_len, compress_type, 0, 0, BTRFS_FILE_EXTENT_REG); if (!ret) { clear_reserved_extent = false; btrfs_release_delalloc_bytes(fs_info, - ordered_extent->start, - ordered_extent->disk_len); + ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes); } } unpin_extent_cache(&BTRFS_I(inode)->extent_tree, - ordered_extent->file_offset, ordered_extent->len, - trans->transid); + ordered_extent->file_offset, + ordered_extent->num_bytes, trans->transid); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out; @@ -3205,37 +2615,27 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) } ret = 0; out: - if (range_locked || clear_new_delalloc_bytes) { - unsigned int clear_bits = 0; - - if (range_locked) - clear_bits |= EXTENT_LOCKED; - if (clear_new_delalloc_bytes) - clear_bits |= EXTENT_DELALLOC_NEW; - clear_extent_bit(&BTRFS_I(inode)->io_tree, - ordered_extent->file_offset, - ordered_extent->file_offset + - ordered_extent->len - 1, - clear_bits, - (clear_bits & EXTENT_LOCKED) ? 1 : 0, - 0, &cached_state); - } + clear_bits = EXTENT_DEFRAG; + if (range_locked) + clear_bits |= EXTENT_LOCKED; + if (clear_new_delalloc_bytes) + clear_bits |= EXTENT_DELALLOC_NEW; + clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, clear_bits, + (clear_bits & EXTENT_LOCKED) ? 1 : 0, 0, + &cached_state); if (trans) btrfs_end_transaction(trans); if (ret || truncated) { - u64 start, end; + u64 unwritten_start = start; if (truncated) - start = ordered_extent->file_offset + logical_len; - else - start = ordered_extent->file_offset; - end = ordered_extent->file_offset + ordered_extent->len - 1; - clear_extent_uptodate(io_tree, start, end, NULL); + unwritten_start += logical_len; + clear_extent_uptodate(io_tree, unwritten_start, end, NULL); /* Drop the cache for the part of the extent we didn't write. */ - btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 0); + btrfs_drop_extent_cache(BTRFS_I(inode), unwritten_start, end, 0); /* * If the ordered extent had an IOERR or something else went @@ -3250,29 +2650,28 @@ out: if ((ret || !logical_len) && clear_reserved_extent && !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && - !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) + !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { + /* + * Discard the range before returning it back to the + * free space pool + */ + if (ret && btrfs_test_opt(fs_info, DISCARD_SYNC)) + btrfs_discard_extent(fs_info, + ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes, + NULL); btrfs_free_reserved_extent(fs_info, - ordered_extent->start, - ordered_extent->disk_len, 1); + ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes, 1); + } } - /* * This needs to be done to make sure anybody waiting knows we are done * updating everything for this ordered extent. */ btrfs_remove_ordered_extent(inode, ordered_extent); - /* for snapshot-aware defrag */ - if (new) { - if (ret) { - free_sa_defrag_extent(new); - atomic_dec(&fs_info->defrag_running); - } else { - relink_file_extents(new); - } - } - /* once for us */ btrfs_put_ordered_extent(ordered_extent); /* once for the tree */ @@ -5176,7 +4575,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) cur_offset = hole_start; while (1) { em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset, - block_end - cur_offset, 0); + block_end - cur_offset); if (IS_ERR(em)) { err = PTR_ERR(em); em = NULL; @@ -5860,7 +5259,11 @@ static struct inode *new_simple_dir(struct super_block *s, set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); inode->i_ino = BTRFS_EMPTY_SUBVOL_DIR_OBJECTID; - inode->i_op = &btrfs_dir_ro_inode_operations; + /* + * We only need lookup, the rest is read-only and there's no inode + * associated with the dentry + */ + inode->i_op = &simple_dir_inode_operations; inode->i_opflags &= ~IOP_XATTR; inode->i_fop = &simple_dir_operations; inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; @@ -6951,18 +6354,27 @@ static noinline int uncompress_inline(struct btrfs_path *path, return ret; } -/* - * a bit scary, this does extent mapping from logical file offset to the disk. - * the ugly parts come from merging extents from the disk with the in-ram - * representation. This gets more complex because of the data=ordered code, - * where the in-ram extents might be locked pending data=ordered completion. +/** + * btrfs_get_extent - Lookup the first extent overlapping a range in a file. + * @inode: file to search in + * @page: page to read extent data into if the extent is inline + * @pg_offset: offset into @page to copy to + * @start: file offset + * @len: length of range starting at @start + * + * This returns the first &struct extent_map which overlaps with the given + * range, reading it from the B-tree and caching it if necessary. Note that + * there may be more extents which overlap the given range after the returned + * extent_map. * - * This also copies inline extents directly into the page. + * If @page is not NULL and the extent is inline, this also reads the extent + * data directly into the page and marks the extent up to date in the io_tree. + * + * Return: ERR_PTR on error, non-NULL extent_map on success. */ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, - struct page *page, - size_t pg_offset, u64 start, u64 len, - int create) + struct page *page, size_t pg_offset, + u64 start, u64 len) { struct btrfs_fs_info *fs_info = inode->root->fs_info; int ret; @@ -6979,7 +6391,6 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct extent_map *em = NULL; struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_io_tree *io_tree = &inode->io_tree; - const bool new_inline = !page || create; read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, len); @@ -7102,8 +6513,7 @@ next: goto insert; } - btrfs_extent_item_to_extent_map(inode, path, item, - new_inline, em); + btrfs_extent_item_to_extent_map(inode, path, item, !page, em); if (extent_type == BTRFS_FILE_EXTENT_REG || extent_type == BTRFS_FILE_EXTENT_PREALLOC) { @@ -7115,7 +6525,7 @@ next: size_t extent_offset; size_t copy_size; - if (new_inline) + if (!page) goto out; size = btrfs_file_extent_ram_bytes(leaf, item); @@ -7198,7 +6608,7 @@ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, u64 delalloc_end; int err = 0; - em = btrfs_get_extent(inode, NULL, 0, start, len, 0); + em = btrfs_get_extent(inode, NULL, 0, start, len); if (IS_ERR(em)) return em; /* @@ -7823,7 +7233,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, goto err; } - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); goto unlock_err; @@ -8375,8 +7785,8 @@ static inline blk_status_t btrfs_lookup_and_bind_dio_csum(struct inode *inode, * contention. */ if (dip->logical_offset == file_offset) { - ret = btrfs_lookup_bio_sums_dio(inode, dip->orig_bio, - file_offset); + ret = btrfs_lookup_bio_sums(inode, dip->orig_bio, file_offset, + NULL); if (ret) return ret; } @@ -8889,7 +8299,8 @@ again: ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start, page_end - start + 1); if (ordered) { - end = min(page_end, ordered->file_offset + ordered->len - 1); + end = min(page_end, + ordered->file_offset + ordered->num_bytes - 1); /* * IO on this page will never be started, so we need * to account for any ordered extents now @@ -9090,7 +8501,6 @@ again: ret = VM_FAULT_SIGBUS; goto out_unlock; } - ret2 = 0; /* page is wholly or partially inside EOF */ if (page_start + PAGE_SIZE > size) @@ -9114,12 +8524,10 @@ again: unlock_extent_cached(io_tree, page_start, page_end, &cached_state); - if (!ret2) { - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); - sb_end_pagefault(inode->i_sb); - extent_changeset_free(data_reserved); - return VM_FAULT_LOCKED; - } + btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); + sb_end_pagefault(inode->i_sb); + extent_changeset_free(data_reserved); + return VM_FAULT_LOCKED; out_unlock: unlock_page(page); @@ -9417,7 +8825,7 @@ void btrfs_destroy_inode(struct inode *inode) else { btrfs_err(fs_info, "found ordered extent %llu %llu on inode cleanup", - ordered->file_offset, ordered->len); + ordered->file_offset, ordered->num_bytes); btrfs_remove_ordered_extent(inode, ordered); btrfs_put_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); @@ -10836,7 +10244,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, struct btrfs_block_group *bg; u64 len = isize - start; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; @@ -11004,11 +10412,6 @@ static const struct inode_operations btrfs_dir_inode_operations = { .update_time = btrfs_update_time, .tmpfile = btrfs_tmpfile, }; -static const struct inode_operations btrfs_dir_ro_inode_operations = { - .lookup = btrfs_lookup, - .permission = btrfs_permission, - .update_time = btrfs_update_time, -}; static const struct file_operations btrfs_dir_file_operations = { .llseek = generic_file_llseek, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 12ae31e1813e..4f4b13830b25 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1128,7 +1128,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start) /* get the big lock and read metadata off disk */ lock_extent_bits(io_tree, start, end, &cached); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); unlock_extent_cached(io_tree, start, end, &cached); if (IS_ERR(em)) @@ -3243,6 +3243,7 @@ static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, struct inode *dst, u64 dst_loff) { + const u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; int ret; /* @@ -3250,7 +3251,7 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, * source range to serialize with relocation. */ btrfs_double_extent_lock(src, loff, dst, dst_loff, len); - ret = btrfs_clone(src, dst, loff, len, len, dst_loff, 1); + ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1); btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); return ret; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index fb09bc2f8e4d..ecb9fb6a6fe0 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -20,9 +20,9 @@ static struct kmem_cache *btrfs_ordered_extent_cache; static u64 entry_end(struct btrfs_ordered_extent *entry) { - if (entry->file_offset + entry->len < entry->file_offset) + if (entry->file_offset + entry->num_bytes < entry->file_offset) return (u64)-1; - return entry->file_offset + entry->len; + return entry->file_offset + entry->num_bytes; } /* returns NULL if the insertion worked, or it returns the node it did find @@ -52,14 +52,6 @@ static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset, return NULL; } -static void ordered_data_tree_panic(struct inode *inode, int errno, - u64 offset) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - btrfs_panic(fs_info, errno, - "Inconsistency in ordered tree at offset %llu", offset); -} - /* * look for a given offset in the tree, and if it can't be found return the * first lesser offset @@ -120,7 +112,7 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset, static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset) { if (file_offset < entry->file_offset || - entry->file_offset + entry->len <= file_offset) + entry->file_offset + entry->num_bytes <= file_offset) return 0; return 1; } @@ -129,7 +121,7 @@ static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset, u64 len) { if (file_offset + len <= entry->file_offset || - entry->file_offset + entry->len <= file_offset) + entry->file_offset + entry->num_bytes <= file_offset) return 0; return 1; } @@ -161,19 +153,14 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, } /* allocate and add a new ordered_extent into the per-inode tree. - * file_offset is the logical offset in the file - * - * start is the disk block number of an extent already reserved in the - * extent allocation tree - * - * len is the length of the extent * * The tree is given a single reference on the ordered extent that was * inserted. */ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, - u64 start, u64 len, u64 disk_len, - int type, int dio, int compress_type) + u64 disk_bytenr, u64 num_bytes, + u64 disk_num_bytes, int type, int dio, + int compress_type) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; @@ -187,10 +174,10 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, return -ENOMEM; entry->file_offset = file_offset; - entry->start = start; - entry->len = len; - entry->disk_len = disk_len; - entry->bytes_left = len; + entry->disk_bytenr = disk_bytenr; + entry->num_bytes = num_bytes; + entry->disk_num_bytes = disk_num_bytes; + entry->bytes_left = num_bytes; entry->inode = igrab(inode); entry->compress_type = compress_type; entry->truncated_len = (u64)-1; @@ -198,7 +185,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, set_bit(type, &entry->flags); if (dio) { - percpu_counter_add_batch(&fs_info->dio_bytes, len, + percpu_counter_add_batch(&fs_info->dio_bytes, num_bytes, fs_info->delalloc_batch); set_bit(BTRFS_ORDERED_DIRECT, &entry->flags); } @@ -219,7 +206,9 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, node = tree_insert(&tree->tree, file_offset, &entry->rb_node); if (node) - ordered_data_tree_panic(inode, -EEXIST, file_offset); + btrfs_panic(fs_info, -EEXIST, + "inconsistency in ordered tree at offset %llu", + file_offset); spin_unlock_irq(&tree->lock); spin_lock(&root->ordered_extent_lock); @@ -247,27 +236,30 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, } int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, - u64 start, u64 len, u64 disk_len, int type) + u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, + int type) { - return __btrfs_add_ordered_extent(inode, file_offset, start, len, - disk_len, type, 0, + return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, + num_bytes, disk_num_bytes, type, 0, BTRFS_COMPRESS_NONE); } int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, - u64 start, u64 len, u64 disk_len, int type) + u64 disk_bytenr, u64 num_bytes, + u64 disk_num_bytes, int type) { - return __btrfs_add_ordered_extent(inode, file_offset, start, len, - disk_len, type, 1, + return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, + num_bytes, disk_num_bytes, type, 1, BTRFS_COMPRESS_NONE); } int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, - u64 start, u64 len, u64 disk_len, - int type, int compress_type) + u64 disk_bytenr, u64 num_bytes, + u64 disk_num_bytes, int type, + int compress_type) { - return __btrfs_add_ordered_extent(inode, file_offset, start, len, - disk_len, type, 0, + return __btrfs_add_ordered_extent(inode, file_offset, disk_bytenr, + num_bytes, disk_num_bytes, type, 0, compress_type); } @@ -328,8 +320,8 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode, } dec_start = max(*file_offset, entry->file_offset); - dec_end = min(*file_offset + io_size, entry->file_offset + - entry->len); + dec_end = min(*file_offset + io_size, + entry->file_offset + entry->num_bytes); *file_offset = dec_end; if (dec_start > dec_end) { btrfs_crit(fs_info, "bad ordering dec_start %llu end %llu", @@ -471,10 +463,11 @@ void btrfs_remove_ordered_extent(struct inode *inode, btrfs_mod_outstanding_extents(btrfs_inode, -1); spin_unlock(&btrfs_inode->lock); if (root != fs_info->tree_root) - btrfs_delalloc_release_metadata(btrfs_inode, entry->len, false); + btrfs_delalloc_release_metadata(btrfs_inode, entry->num_bytes, + false); if (test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) - percpu_counter_add_batch(&fs_info->dio_bytes, -entry->len, + percpu_counter_add_batch(&fs_info->dio_bytes, -entry->num_bytes, fs_info->delalloc_batch); tree = &btrfs_inode->ordered_tree; @@ -534,8 +527,8 @@ u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, ordered = list_first_entry(&splice, struct btrfs_ordered_extent, root_extent_list); - if (range_end <= ordered->start || - ordered->start + ordered->disk_len <= range_start) { + if (range_end <= ordered->disk_bytenr || + ordered->disk_bytenr + ordered->disk_num_bytes <= range_start) { list_move_tail(&ordered->root_extent_list, &skipped); cond_resched_lock(&root->ordered_extent_lock); continue; @@ -619,7 +612,7 @@ void btrfs_start_ordered_extent(struct inode *inode, int wait) { u64 start = entry->file_offset; - u64 end = start + entry->len - 1; + u64 end = start + entry->num_bytes - 1; trace_btrfs_ordered_extent_start(inode, entry); @@ -680,7 +673,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) btrfs_put_ordered_extent(ordered); break; } - if (ordered->file_offset + ordered->len <= start) { + if (ordered->file_offset + ordered->num_bytes <= start) { btrfs_put_ordered_extent(ordered); break; } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 4eb0319a86d7..3beb4da4ab41 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -67,14 +67,13 @@ struct btrfs_ordered_extent { /* logical offset in the file */ u64 file_offset; - /* disk byte number */ - u64 start; - - /* ram length of the extent in bytes */ - u64 len; - - /* extent length on disk */ - u64 disk_len; + /* + * These fields directly correspond to the same fields in + * btrfs_file_extent_item. + */ + u64 disk_bytenr; + u64 num_bytes; + u64 disk_num_bytes; /* number of bytes that still need writing */ u64 bytes_left; @@ -161,12 +160,15 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode, u64 *file_offset, u64 io_size, int uptodate); int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, - u64 start, u64 len, u64 disk_len, int type); + u64 disk_bytenr, u64 num_bytes, u64 disk_num_bytes, + int type); int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, - u64 start, u64 len, u64 disk_len, int type); + u64 disk_bytenr, u64 num_bytes, + u64 disk_num_bytes, int type); int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset, - u64 start, u64 len, u64 disk_len, - int type, int compress_type); + u64 disk_bytenr, u64 num_bytes, + u64 disk_num_bytes, int type, + int compress_type); void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 873b6b694107..61f44e78e3c9 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -317,7 +317,7 @@ void btrfs_print_leaf(struct extent_buffer *l) print_uuid_item(l, btrfs_item_ptr_offset(l, i), btrfs_item_size_nr(l, i)); break; - }; + } } } diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 39fc8c3d3a75..98d9a50352d6 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1243,7 +1243,6 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *quota_root; struct btrfs_qgroup *parent; struct btrfs_qgroup *member; struct btrfs_qgroup_list *list; @@ -1259,9 +1258,8 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, return -ENOMEM; mutex_lock(&fs_info->qgroup_ioctl_lock); - quota_root = fs_info->quota_root; - if (!quota_root) { - ret = -EINVAL; + if (!fs_info->quota_root) { + ret = -ENOTCONN; goto out; } member = find_qgroup_rb(fs_info, src); @@ -1307,7 +1305,6 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, u64 dst) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *quota_root; struct btrfs_qgroup *parent; struct btrfs_qgroup *member; struct btrfs_qgroup_list *list; @@ -1320,9 +1317,8 @@ static int __del_qgroup_relation(struct btrfs_trans_handle *trans, u64 src, if (!tmp) return -ENOMEM; - quota_root = fs_info->quota_root; - if (!quota_root) { - ret = -EINVAL; + if (!fs_info->quota_root) { + ret = -ENOTCONN; goto out; } @@ -1387,11 +1383,11 @@ int btrfs_create_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) int ret = 0; mutex_lock(&fs_info->qgroup_ioctl_lock); - quota_root = fs_info->quota_root; - if (!quota_root) { - ret = -EINVAL; + if (!fs_info->quota_root) { + ret = -ENOTCONN; goto out; } + quota_root = fs_info->quota_root; qgroup = find_qgroup_rb(fs_info, qgroupid); if (qgroup) { ret = -EEXIST; @@ -1416,15 +1412,13 @@ out: int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; struct btrfs_qgroup_list *list; int ret = 0; mutex_lock(&fs_info->qgroup_ioctl_lock); - quota_root = fs_info->quota_root; - if (!quota_root) { - ret = -EINVAL; + if (!fs_info->quota_root) { + ret = -ENOTCONN; goto out; } @@ -1465,7 +1459,6 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, struct btrfs_qgroup_limit *limit) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; int ret = 0; /* Sometimes we would want to clear the limit on this qgroup. @@ -1475,9 +1468,8 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid, const u64 CLEAR_VALUE = -1; mutex_lock(&fs_info->qgroup_ioctl_lock); - quota_root = fs_info->quota_root; - if (!quota_root) { - ret = -EINVAL; + if (!fs_info->quota_root) { + ret = -ENOTCONN; goto out; } @@ -2582,10 +2574,9 @@ cleanup: int btrfs_run_qgroups(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_root *quota_root = fs_info->quota_root; int ret = 0; - if (!quota_root) + if (!fs_info->quota_root) return ret; spin_lock(&fs_info->qgroup_lock); @@ -2879,7 +2870,6 @@ static bool qgroup_check_limits(struct btrfs_fs_info *fs_info, static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, enum btrfs_qgroup_rsv_type type) { - struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; struct btrfs_fs_info *fs_info = root->fs_info; u64 ref_root = root->root_key.objectid; @@ -2898,8 +2888,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce, enforce = false; spin_lock(&fs_info->qgroup_lock); - quota_root = fs_info->quota_root; - if (!quota_root) + if (!fs_info->quota_root) goto out; qgroup = find_qgroup_rb(fs_info, ref_root); @@ -2966,7 +2955,6 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, u64 ref_root, u64 num_bytes, enum btrfs_qgroup_rsv_type type) { - struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; struct ulist_node *unode; struct ulist_iterator uiter; @@ -2984,8 +2972,7 @@ void btrfs_qgroup_free_refroot(struct btrfs_fs_info *fs_info, } spin_lock(&fs_info->qgroup_lock); - quota_root = fs_info->quota_root; - if (!quota_root) + if (!fs_info->quota_root) goto out; qgroup = find_qgroup_rb(fs_info, ref_root); @@ -3685,7 +3672,6 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root, int num_bytes) { - struct btrfs_root *quota_root = fs_info->quota_root; struct btrfs_qgroup *qgroup; struct ulist_node *unode; struct ulist_iterator uiter; @@ -3693,7 +3679,7 @@ static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root, if (num_bytes == 0) return; - if (!quota_root) + if (!fs_info->quota_root) return; spin_lock(&fs_info->qgroup_lock); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index da5abd62db22..995d4b8b1cfd 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -4332,6 +4332,15 @@ static void describe_relocation(struct btrfs_fs_info *fs_info, block_group->start, buf); } +static const char *stage_to_string(int stage) +{ + if (stage == MOVE_DATA_EXTENTS) + return "move data extents"; + if (stage == UPDATE_DATA_PTRS) + return "update data pointers"; + return "unknown"; +} + /* * function to relocate all extents in a block group. */ @@ -4406,12 +4415,15 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) rc->block_group->length); while (1) { + int finishes_stage; + mutex_lock(&fs_info->cleaner_mutex); ret = relocate_block_group(rc); mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) err = ret; + finishes_stage = rc->stage; /* * We may have gotten ENOSPC after we already dirtied some * extents. If writeout happens while we're relocating a @@ -4437,8 +4449,8 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) if (rc->extents_found == 0) break; - btrfs_info(fs_info, "found %llu extents", rc->extents_found); - + btrfs_info(fs_info, "found %llu extents, stage: %s", + rc->extents_found, stage_to_string(finishes_stage)); } WARN_ON(rc->block_group->pinned > 0); @@ -4656,7 +4668,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) LIST_HEAD(list); ordered = btrfs_lookup_ordered_extent(inode, file_pos); - BUG_ON(ordered->file_offset != file_pos || ordered->len != len); + BUG_ON(ordered->file_offset != file_pos || ordered->num_bytes != len); disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; ret = btrfs_lookup_csums_range(fs_info->csum_root, disk_bytenr, @@ -4680,7 +4692,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) * disk_len vs real len like with real inodes since it's all * disk length. */ - new_bytenr = ordered->start + (sums->bytenr - disk_bytenr); + new_bytenr = ordered->disk_bytenr + sums->bytenr - disk_bytenr; sums->bytenr = new_bytenr; btrfs_add_ordered_sum(ordered, sums); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 21de630b0730..61b37c56a7fb 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -8,6 +8,7 @@ #include <linux/sched/mm.h> #include <crypto/hash.h> #include "ctree.h" +#include "discard.h" #include "volumes.h" #include "disk-io.h" #include "ordered-data.h" @@ -3577,17 +3578,27 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, * This can easily boost the amount of SYSTEM chunks if cleaner * thread can't be triggered fast enough, and use up all space * of btrfs_super_block::sys_chunk_array + * + * While for dev replace, we need to try our best to mark block + * group RO, to prevent race between: + * - Write duplication + * Contains latest data + * - Scrub copy + * Contains data from commit tree + * + * If target block group is not marked RO, nocow writes can + * be overwritten by scrub copy, causing data corruption. + * So for dev-replace, it's not allowed to continue if a block + * group is not RO. */ - ret = btrfs_inc_block_group_ro(cache, false); - scrub_pause_off(fs_info); - + ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace); if (ret == 0) { ro_set = 1; - } else if (ret == -ENOSPC) { + } else if (ret == -ENOSPC && !sctx->is_dev_replace) { /* * btrfs_inc_block_group_ro return -ENOSPC when it * failed in creating new chunk for metadata. - * It is not a problem for scrub/replace, because + * It is not a problem for scrub, because * metadata are always cowed, and our scrub paused * commit_transactions. */ @@ -3596,9 +3607,22 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, btrfs_warn(fs_info, "failed setting block group ro: %d", ret); btrfs_put_block_group(cache); + scrub_pause_off(fs_info); break; } + /* + * Now the target block is marked RO, wait for nocow writes to + * finish before dev-replace. + * COW is fine, as COW never overwrites extents in commit tree. + */ + if (sctx->is_dev_replace) { + btrfs_wait_nocow_writers(cache); + btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, + cache->length); + } + + scrub_pause_off(fs_info); down_write(&dev_replace->rwsem); dev_replace->cursor_right = found_key.offset + length; dev_replace->cursor_left = found_key.offset; @@ -3659,7 +3683,11 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, if (!cache->removed && !cache->ro && cache->reserved == 0 && cache->used == 0) { spin_unlock(&cache->lock); - btrfs_mark_bg_unused(cache); + if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) + btrfs_discard_queue_work(&fs_info->discard_ctl, + cache); + else + btrfs_mark_bg_unused(cache); } else { spin_unlock(&cache->lock); } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 091e5bc8c7ea..a055b657cb85 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1269,7 +1269,8 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_) * destination of the stream. */ if (ino == bctx->cur_objectid && - offset >= bctx->sctx->cur_inode_next_write_offset) + offset + bctx->extent_len > + bctx->sctx->cur_inode_next_write_offset) return 0; } diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index f09aa6ee9113..01297c5b2666 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -159,10 +159,9 @@ static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) return (global->size << 1); } -static int can_overcommit(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, u64 bytes, - enum btrfs_reserve_flush_enum flush, - bool system_chunk) +int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, u64 bytes, + enum btrfs_reserve_flush_enum flush) { u64 profile; u64 avail; @@ -173,7 +172,7 @@ static int can_overcommit(struct btrfs_fs_info *fs_info, if (space_info->flags & BTRFS_BLOCK_GROUP_DATA) return 0; - if (system_chunk) + if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM) profile = btrfs_system_alloc_profile(fs_info); else profile = btrfs_metadata_alloc_profile(fs_info); @@ -227,8 +226,8 @@ again: /* Check and see if our ticket can be satisified now. */ if ((used + ticket->bytes <= space_info->total_bytes) || - can_overcommit(fs_info, space_info, ticket->bytes, flush, - false)) { + btrfs_can_overcommit(fs_info, space_info, ticket->bytes, + flush)) { btrfs_space_info_update_bytes_may_use(fs_info, space_info, ticket->bytes); @@ -626,8 +625,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, static inline u64 btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - bool system_chunk) + struct btrfs_space_info *space_info) { struct reserve_ticket *ticket; u64 used; @@ -642,14 +640,14 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, return to_reclaim; to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); - if (can_overcommit(fs_info, space_info, to_reclaim, - BTRFS_RESERVE_FLUSH_ALL, system_chunk)) + if (btrfs_can_overcommit(fs_info, space_info, to_reclaim, + BTRFS_RESERVE_FLUSH_ALL)) return 0; used = btrfs_space_info_used(space_info, true); - if (can_overcommit(fs_info, space_info, SZ_1M, - BTRFS_RESERVE_FLUSH_ALL, system_chunk)) + if (btrfs_can_overcommit(fs_info, space_info, SZ_1M, + BTRFS_RESERVE_FLUSH_ALL)) expected = div_factor_fine(space_info->total_bytes, 95); else expected = div_factor_fine(space_info->total_bytes, 90); @@ -665,7 +663,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, - u64 used, bool system_chunk) + u64 used) { u64 thresh = div_factor_fine(space_info->total_bytes, 98); @@ -673,8 +671,7 @@ static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info, if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh) return 0; - if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info, - system_chunk)) + if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info)) return 0; return (used >= thresh && !btrfs_fs_closing(fs_info) && @@ -765,8 +762,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); spin_lock(&space_info->lock); - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, - false); + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info); if (!to_reclaim) { space_info->flush = 0; spin_unlock(&space_info->lock); @@ -785,8 +781,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) return; } to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, - space_info, - false); + space_info); if (last_tickets_id == space_info->tickets_id) { flush_state++; } else { @@ -858,8 +853,7 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, int flush_state; spin_lock(&space_info->lock); - to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info, - false); + to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info); if (!to_reclaim) { spin_unlock(&space_info->lock); return; @@ -990,8 +984,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 orig_bytes, - enum btrfs_reserve_flush_enum flush, - bool system_chunk) + enum btrfs_reserve_flush_enum flush) { struct reserve_ticket ticket; u64 used; @@ -1013,8 +1006,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, */ if (!pending_tickets && ((used + orig_bytes <= space_info->total_bytes) || - can_overcommit(fs_info, space_info, orig_bytes, flush, - system_chunk))) { + btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) { btrfs_space_info_update_bytes_may_use(fs_info, space_info, orig_bytes); ret = 0; @@ -1054,8 +1046,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, * the async reclaim as we will panic. */ if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) && - need_do_async_reclaim(fs_info, space_info, - used, system_chunk) && + need_do_async_reclaim(fs_info, space_info, used) && !work_busy(&fs_info->async_reclaim_work)) { trace_btrfs_trigger_flush(fs_info, space_info->flags, orig_bytes, flush, "preempt"); @@ -1092,10 +1083,9 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; int ret; - bool system_chunk = (root == fs_info->chunk_root); ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info, - orig_bytes, flush, system_chunk); + orig_bytes, flush); if (ret == -ENOSPC && unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { if (block_rsv != global_rsv && diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 1a349e3f9cc1..24514cd2c6c1 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -127,6 +127,9 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root, enum btrfs_reserve_flush_enum flush); void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info); +int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, u64 bytes, + enum btrfs_reserve_flush_enum flush); static inline void btrfs_space_info_free_bytes_may_use( struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f452a94abdc3..0616a5434793 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -46,6 +46,7 @@ #include "sysfs.h" #include "tests/btrfs-tests.h" #include "block-group.h" +#include "discard.h" #include "qgroup.h" #define CREATE_TRACE_POINTS @@ -146,6 +147,8 @@ void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function if (sb_rdonly(sb)) return; + btrfs_discard_stop(fs_info); + /* btrfs handle error by forcing the filesystem readonly */ sb->s_flags |= SB_RDONLY; btrfs_info(fs_info, "forced readonly"); @@ -313,6 +316,7 @@ enum { Opt_datasum, Opt_nodatasum, Opt_defrag, Opt_nodefrag, Opt_discard, Opt_nodiscard, + Opt_discard_mode, Opt_nologreplay, Opt_norecovery, Opt_ratio, @@ -375,6 +379,7 @@ static const match_table_t tokens = { {Opt_defrag, "autodefrag"}, {Opt_nodefrag, "noautodefrag"}, {Opt_discard, "discard"}, + {Opt_discard_mode, "discard=%s"}, {Opt_nodiscard, "nodiscard"}, {Opt_nologreplay, "nologreplay"}, {Opt_norecovery, "norecovery"}, @@ -695,12 +700,26 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, info->metadata_ratio); break; case Opt_discard: - btrfs_set_and_info(info, DISCARD, - "turning on discard"); + case Opt_discard_mode: + if (token == Opt_discard || + strcmp(args[0].from, "sync") == 0) { + btrfs_clear_opt(info->mount_opt, DISCARD_ASYNC); + btrfs_set_and_info(info, DISCARD_SYNC, + "turning on sync discard"); + } else if (strcmp(args[0].from, "async") == 0) { + btrfs_clear_opt(info->mount_opt, DISCARD_SYNC); + btrfs_set_and_info(info, DISCARD_ASYNC, + "turning on async discard"); + } else { + ret = -EINVAL; + goto out; + } break; case Opt_nodiscard: - btrfs_clear_and_info(info, DISCARD, + btrfs_clear_and_info(info, DISCARD_SYNC, "turning off discard"); + btrfs_clear_and_info(info, DISCARD_ASYNC, + "turning off async discard"); break; case Opt_space_cache: case Opt_space_cache_version: @@ -1322,8 +1341,10 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",nologreplay"); if (btrfs_test_opt(info, FLUSHONCOMMIT)) seq_puts(seq, ",flushoncommit"); - if (btrfs_test_opt(info, DISCARD)) + if (btrfs_test_opt(info, DISCARD_SYNC)) seq_puts(seq, ",discard"); + if (btrfs_test_opt(info, DISCARD_ASYNC)) + seq_puts(seq, ",discard=async"); if (!(info->sb->s_flags & SB_POSIXACL)) seq_puts(seq, ",noacl"); if (btrfs_test_opt(info, SPACE_CACHE)) @@ -1713,6 +1734,14 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, btrfs_cleanup_defrag_inodes(fs_info); } + /* If we toggled discard async */ + if (!btrfs_raw_test_opt(old_opts, DISCARD_ASYNC) && + btrfs_test_opt(fs_info, DISCARD_ASYNC)) + btrfs_discard_resume(fs_info); + else if (btrfs_raw_test_opt(old_opts, DISCARD_ASYNC) && + !btrfs_test_opt(fs_info, DISCARD_ASYNC)) + btrfs_discard_cleanup(fs_info); + clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); } @@ -1760,6 +1789,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) */ cancel_work_sync(&fs_info->async_reclaim_work); + btrfs_discard_cleanup(fs_info); + /* wait for the uuid_scan task to finish */ down(&fs_info->uuid_tree_rescan_sem); /* avoid complains from lockdep et al. */ @@ -2104,7 +2135,15 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) */ thresh = SZ_4M; - if (!mixed && total_free_meta - thresh < block_rsv->size) + /* + * We only want to claim there's no available space if we can no longer + * allocate chunks for our metadata profile and our global reserve will + * not fit in the free metadata space. If we aren't ->full then we + * still can allocate chunks and thus are fine using the currently + * calculated f_bavail. + */ + if (!mixed && block_rsv->space_info->full && + total_free_meta - thresh < block_rsv->size) buf->f_bavail = 0; buf->f_type = BTRFS_SUPER_MAGIC; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 5ebbe8a5ee76..7436422194da 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -12,6 +12,7 @@ #include <crypto/hash.h> #include "ctree.h" +#include "discard.h" #include "disk-io.h" #include "transaction.h" #include "sysfs.h" @@ -339,11 +340,177 @@ static const struct attribute_group btrfs_static_feature_attr_group = { #ifdef CONFIG_BTRFS_DEBUG /* + * Discard statistics and tunables + */ +#define discard_to_fs_info(_kobj) to_fs_info((_kobj)->parent->parent) + +static ssize_t btrfs_discardable_bytes_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); + + return snprintf(buf, PAGE_SIZE, "%lld\n", + atomic64_read(&fs_info->discard_ctl.discardable_bytes)); +} +BTRFS_ATTR(discard, discardable_bytes, btrfs_discardable_bytes_show); + +static ssize_t btrfs_discardable_extents_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); + + return snprintf(buf, PAGE_SIZE, "%d\n", + atomic_read(&fs_info->discard_ctl.discardable_extents)); +} +BTRFS_ATTR(discard, discardable_extents, btrfs_discardable_extents_show); + +static ssize_t btrfs_discard_bitmap_bytes_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); + + return snprintf(buf, PAGE_SIZE, "%lld\n", + fs_info->discard_ctl.discard_bitmap_bytes); +} +BTRFS_ATTR(discard, discard_bitmap_bytes, btrfs_discard_bitmap_bytes_show); + +static ssize_t btrfs_discard_bytes_saved_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); + + return snprintf(buf, PAGE_SIZE, "%lld\n", + atomic64_read(&fs_info->discard_ctl.discard_bytes_saved)); +} +BTRFS_ATTR(discard, discard_bytes_saved, btrfs_discard_bytes_saved_show); + +static ssize_t btrfs_discard_extent_bytes_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); + + return snprintf(buf, PAGE_SIZE, "%lld\n", + fs_info->discard_ctl.discard_extent_bytes); +} +BTRFS_ATTR(discard, discard_extent_bytes, btrfs_discard_extent_bytes_show); + +static ssize_t btrfs_discard_iops_limit_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); + + return snprintf(buf, PAGE_SIZE, "%u\n", + READ_ONCE(fs_info->discard_ctl.iops_limit)); +} + +static ssize_t btrfs_discard_iops_limit_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); + struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl; + u32 iops_limit; + int ret; + + ret = kstrtou32(buf, 10, &iops_limit); + if (ret) + return -EINVAL; + + WRITE_ONCE(discard_ctl->iops_limit, iops_limit); + + return len; +} +BTRFS_ATTR_RW(discard, iops_limit, btrfs_discard_iops_limit_show, + btrfs_discard_iops_limit_store); + +static ssize_t btrfs_discard_kbps_limit_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); + + return snprintf(buf, PAGE_SIZE, "%u\n", + READ_ONCE(fs_info->discard_ctl.kbps_limit)); +} + +static ssize_t btrfs_discard_kbps_limit_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); + struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl; + u32 kbps_limit; + int ret; + + ret = kstrtou32(buf, 10, &kbps_limit); + if (ret) + return -EINVAL; + + WRITE_ONCE(discard_ctl->kbps_limit, kbps_limit); + + return len; +} +BTRFS_ATTR_RW(discard, kbps_limit, btrfs_discard_kbps_limit_show, + btrfs_discard_kbps_limit_store); + +static ssize_t btrfs_discard_max_discard_size_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); + + return snprintf(buf, PAGE_SIZE, "%llu\n", + READ_ONCE(fs_info->discard_ctl.max_discard_size)); +} + +static ssize_t btrfs_discard_max_discard_size_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs_info = discard_to_fs_info(kobj); + struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl; + u64 max_discard_size; + int ret; + + ret = kstrtou64(buf, 10, &max_discard_size); + if (ret) + return -EINVAL; + + WRITE_ONCE(discard_ctl->max_discard_size, max_discard_size); + + return len; +} +BTRFS_ATTR_RW(discard, max_discard_size, btrfs_discard_max_discard_size_show, + btrfs_discard_max_discard_size_store); + +static const struct attribute *discard_debug_attrs[] = { + BTRFS_ATTR_PTR(discard, discardable_bytes), + BTRFS_ATTR_PTR(discard, discardable_extents), + BTRFS_ATTR_PTR(discard, discard_bitmap_bytes), + BTRFS_ATTR_PTR(discard, discard_bytes_saved), + BTRFS_ATTR_PTR(discard, discard_extent_bytes), + BTRFS_ATTR_PTR(discard, iops_limit), + BTRFS_ATTR_PTR(discard, kbps_limit), + BTRFS_ATTR_PTR(discard, max_discard_size), + NULL, +}; + +/* * Runtime debugging exported via sysfs * * /sys/fs/btrfs/debug - applies to module or all filesystems * /sys/fs/btrfs/UUID - applies only to the given filesystem */ +static const struct attribute *btrfs_debug_mount_attrs[] = { + NULL, +}; + static struct attribute *btrfs_debug_feature_attrs[] = { NULL }; @@ -734,10 +901,10 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add) static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) { - if (fs_devs->device_dir_kobj) { - kobject_del(fs_devs->device_dir_kobj); - kobject_put(fs_devs->device_dir_kobj); - fs_devs->device_dir_kobj = NULL; + if (fs_devs->devices_kobj) { + kobject_del(fs_devs->devices_kobj); + kobject_put(fs_devs->devices_kobj); + fs_devs->devices_kobj = NULL; } if (fs_devs->fsid_kobj.state_initialized) { @@ -771,6 +938,19 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) kobject_del(fs_info->space_info_kobj); kobject_put(fs_info->space_info_kobj); } +#ifdef CONFIG_BTRFS_DEBUG + if (fs_info->discard_debug_kobj) { + sysfs_remove_files(fs_info->discard_debug_kobj, + discard_debug_attrs); + kobject_del(fs_info->discard_debug_kobj); + kobject_put(fs_info->discard_debug_kobj); + } + if (fs_info->debug_kobj) { + sysfs_remove_files(fs_info->debug_kobj, btrfs_debug_mount_attrs); + kobject_del(fs_info->debug_kobj); + kobject_put(fs_info->debug_kobj); + } +#endif addrm_unknown_feature_attrs(fs_info, false); sysfs_remove_group(&fs_info->fs_devices->fsid_kobj, &btrfs_feature_attr_group); sysfs_remove_files(&fs_info->fs_devices->fsid_kobj, btrfs_attrs); @@ -969,46 +1149,120 @@ int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices, struct hd_struct *disk; struct kobject *disk_kobj; - if (!fs_devices->device_dir_kobj) + if (!fs_devices->devices_kobj) return -EINVAL; - if (one_device && one_device->bdev) { - disk = one_device->bdev->bd_part; - disk_kobj = &part_to_dev(disk)->kobj; + if (one_device) { + if (one_device->bdev) { + disk = one_device->bdev->bd_part; + disk_kobj = &part_to_dev(disk)->kobj; + sysfs_remove_link(fs_devices->devices_kobj, + disk_kobj->name); + } - sysfs_remove_link(fs_devices->device_dir_kobj, - disk_kobj->name); - } + kobject_del(&one_device->devid_kobj); + kobject_put(&one_device->devid_kobj); + + wait_for_completion(&one_device->kobj_unregister); - if (one_device) return 0; + } - list_for_each_entry(one_device, - &fs_devices->devices, dev_list) { - if (!one_device->bdev) - continue; - disk = one_device->bdev->bd_part; - disk_kobj = &part_to_dev(disk)->kobj; + list_for_each_entry(one_device, &fs_devices->devices, dev_list) { + + if (one_device->bdev) { + disk = one_device->bdev->bd_part; + disk_kobj = &part_to_dev(disk)->kobj; + sysfs_remove_link(fs_devices->devices_kobj, + disk_kobj->name); + } + kobject_del(&one_device->devid_kobj); + kobject_put(&one_device->devid_kobj); - sysfs_remove_link(fs_devices->device_dir_kobj, - disk_kobj->name); + wait_for_completion(&one_device->kobj_unregister); } return 0; } -int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs) +static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) { - if (!fs_devs->device_dir_kobj) - fs_devs->device_dir_kobj = kobject_create_and_add("devices", - &fs_devs->fsid_kobj); + int val; + struct btrfs_device *device = container_of(kobj, struct btrfs_device, + devid_kobj); - if (!fs_devs->device_dir_kobj) - return -ENOMEM; + val = !!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state); - return 0; + return snprintf(buf, PAGE_SIZE, "%d\n", val); +} +BTRFS_ATTR(devid, in_fs_metadata, btrfs_devinfo_in_fs_metadata_show); + +static ssize_t btrfs_sysfs_missing_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + int val; + struct btrfs_device *device = container_of(kobj, struct btrfs_device, + devid_kobj); + + val = !!test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state); + + return snprintf(buf, PAGE_SIZE, "%d\n", val); +} +BTRFS_ATTR(devid, missing, btrfs_sysfs_missing_show); + +static ssize_t btrfs_devinfo_replace_target_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + int val; + struct btrfs_device *device = container_of(kobj, struct btrfs_device, + devid_kobj); + + val = !!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state); + + return snprintf(buf, PAGE_SIZE, "%d\n", val); +} +BTRFS_ATTR(devid, replace_target, btrfs_devinfo_replace_target_show); + +static ssize_t btrfs_devinfo_writeable_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + int val; + struct btrfs_device *device = container_of(kobj, struct btrfs_device, + devid_kobj); + + val = !!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); + + return snprintf(buf, PAGE_SIZE, "%d\n", val); +} +BTRFS_ATTR(devid, writeable, btrfs_devinfo_writeable_show); + +static struct attribute *devid_attrs[] = { + BTRFS_ATTR_PTR(devid, in_fs_metadata), + BTRFS_ATTR_PTR(devid, missing), + BTRFS_ATTR_PTR(devid, replace_target), + BTRFS_ATTR_PTR(devid, writeable), + NULL +}; +ATTRIBUTE_GROUPS(devid); + +static void btrfs_release_devid_kobj(struct kobject *kobj) +{ + struct btrfs_device *device = container_of(kobj, struct btrfs_device, + devid_kobj); + + memset(&device->devid_kobj, 0, sizeof(struct kobject)); + complete(&device->kobj_unregister); } +static struct kobj_type devid_ktype = { + .sysfs_ops = &kobj_sysfs_ops, + .default_groups = devid_groups, + .release = btrfs_release_devid_kobj, +}; + int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device) { @@ -1016,22 +1270,31 @@ int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, struct btrfs_device *dev; list_for_each_entry(dev, &fs_devices->devices, dev_list) { - struct hd_struct *disk; - struct kobject *disk_kobj; - - if (!dev->bdev) - continue; if (one_device && one_device != dev) continue; - disk = dev->bdev->bd_part; - disk_kobj = &part_to_dev(disk)->kobj; + if (dev->bdev) { + struct hd_struct *disk; + struct kobject *disk_kobj; + + disk = dev->bdev->bd_part; + disk_kobj = &part_to_dev(disk)->kobj; - error = sysfs_create_link(fs_devices->device_dir_kobj, - disk_kobj, disk_kobj->name); - if (error) + error = sysfs_create_link(fs_devices->devices_kobj, + disk_kobj, disk_kobj->name); + if (error) + break; + } + + init_completion(&dev->kobj_unregister); + error = kobject_init_and_add(&dev->devid_kobj, &devid_ktype, + fs_devices->devices_kobj, "%llu", + dev->devid); + if (error) { + kobject_put(&dev->devid_kobj); break; + } } return error; @@ -1063,27 +1326,49 @@ void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices, "sysfs: failed to create fsid for sprout"); } +void btrfs_sysfs_update_devid(struct btrfs_device *device) +{ + char tmp[24]; + + snprintf(tmp, sizeof(tmp), "%llu", device->devid); + + if (kobject_rename(&device->devid_kobj, tmp)) + btrfs_warn(device->fs_devices->fs_info, + "sysfs: failed to update devid for %llu", + device->devid); +} + /* /sys/fs/btrfs/ entry */ static struct kset *btrfs_kset; /* + * Creates: + * /sys/fs/btrfs/UUID + * * Can be called by the device discovery thread. - * And parent can be specified for seed device */ -int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, - struct kobject *parent) +int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs) { int error; init_completion(&fs_devs->kobj_unregister); fs_devs->fsid_kobj.kset = btrfs_kset; - error = kobject_init_and_add(&fs_devs->fsid_kobj, - &btrfs_ktype, parent, "%pU", fs_devs->fsid); + error = kobject_init_and_add(&fs_devs->fsid_kobj, &btrfs_ktype, NULL, + "%pU", fs_devs->fsid); if (error) { kobject_put(&fs_devs->fsid_kobj); return error; } + fs_devs->devices_kobj = kobject_create_and_add("devices", + &fs_devs->fsid_kobj); + if (!fs_devs->devices_kobj) { + btrfs_err(fs_devs->fs_info, + "failed to init sysfs device interface"); + kobject_put(&fs_devs->fsid_kobj); + return -ENOMEM; + } + return 0; } @@ -1111,8 +1396,26 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) goto failure; #ifdef CONFIG_BTRFS_DEBUG - error = sysfs_create_group(fsid_kobj, - &btrfs_debug_feature_attr_group); + fs_info->debug_kobj = kobject_create_and_add("debug", fsid_kobj); + if (!fs_info->debug_kobj) { + error = -ENOMEM; + goto failure; + } + + error = sysfs_create_files(fs_info->debug_kobj, btrfs_debug_mount_attrs); + if (error) + goto failure; + + /* Discard directory */ + fs_info->discard_debug_kobj = kobject_create_and_add("discard", + fs_info->debug_kobj); + if (!fs_info->discard_debug_kobj) { + error = -ENOMEM; + goto failure; + } + + error = sysfs_create_files(fs_info->discard_debug_kobj, + discard_debug_attrs); if (error) goto failure; #endif @@ -1209,6 +1512,9 @@ void __cold btrfs_exit_sysfs(void) sysfs_unmerge_group(&btrfs_kset->kobj, &btrfs_static_feature_attr_group); sysfs_remove_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); +#ifdef CONFIG_BTRFS_DEBUG + sysfs_remove_group(&btrfs_kset->kobj, &btrfs_debug_feature_attr_group); +#endif kset_unregister(btrfs_kset); } diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index e10c3adfc30f..c68582add92e 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -18,9 +18,7 @@ int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); int btrfs_sysfs_rm_device_link(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); -int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, - struct kobject *parent); -int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs); +int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices, const u8 *fsid); @@ -36,5 +34,6 @@ void btrfs_sysfs_add_block_group_type(struct btrfs_block_group *cache); int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info); void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info); +void btrfs_sysfs_update_devid(struct btrfs_device *device); #endif diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index a7aca4141788..84fb3fa940a6 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -86,6 +86,27 @@ static void btrfs_destroy_test_fs(void) unregister_filesystem(&test_type); } +struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info) +{ + struct btrfs_device *dev; + + dev = kzalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return ERR_PTR(-ENOMEM); + + extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL); + INIT_LIST_HEAD(&dev->dev_list); + list_add(&dev->dev_list, &fs_info->fs_devices->devices); + + return dev; +} + +static void btrfs_free_dummy_device(struct btrfs_device *dev) +{ + extent_io_tree_release(&dev->alloc_state); + kfree(dev); +} + struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) { struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info), @@ -121,7 +142,6 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) spin_lock_init(&fs_info->qgroup_lock); spin_lock_init(&fs_info->super_lock); spin_lock_init(&fs_info->fs_roots_radix_lock); - spin_lock_init(&fs_info->tree_mod_seq_lock); mutex_init(&fs_info->qgroup_ioctl_lock); mutex_init(&fs_info->qgroup_rescan_lock); rwlock_init(&fs_info->tree_mod_log_lock); @@ -132,12 +152,14 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) INIT_LIST_HEAD(&fs_info->dirty_qgroups); INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); + INIT_LIST_HEAD(&fs_info->fs_devices->devices); INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); extent_io_tree_init(fs_info, &fs_info->freed_extents[0], IO_TREE_FS_INFO_FREED_EXTENTS0, NULL); extent_io_tree_init(fs_info, &fs_info->freed_extents[1], IO_TREE_FS_INFO_FREED_EXTENTS1, NULL); + extent_map_tree_init(&fs_info->mapping_tree); fs_info->pinned_extents = &fs_info->freed_extents[0]; set_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); @@ -150,6 +172,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) { struct radix_tree_iter iter; void **slot; + struct btrfs_device *dev, *tmp; if (!fs_info) return; @@ -180,6 +203,11 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) } spin_unlock(&fs_info->buffer_lock); + btrfs_mapping_tree_free(&fs_info->mapping_tree); + list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices, + dev_list) { + btrfs_free_dummy_device(dev); + } btrfs_free_qgroup_config(fs_info); btrfs_free_fs_roots(fs_info); cleanup_srcu_struct(&fs_info->subvol_srcu); diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index 9e52527357d8..7a2d7ffbe30e 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -46,6 +46,7 @@ btrfs_alloc_dummy_block_group(struct btrfs_fs_info *fs_info, unsigned long lengt void btrfs_free_dummy_block_group(struct btrfs_block_group *cache); void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); +struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info); #else static inline int btrfs_run_sanity_tests(void) { diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 123d9a614357..df7ce874a74b 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -441,8 +441,17 @@ static int test_find_first_clear_extent_bit(void) int ret = -EINVAL; test_msg("running find_first_clear_extent_bit test"); + extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST, NULL); + /* Test correct handling of empty tree */ + find_first_clear_extent_bit(&tree, 0, &start, &end, CHUNK_TRIMMED); + if (start != 0 || end != -1) { + test_err( + "error getting a range from completely empty tree: start %llu end %llu", + start, end); + goto out; + } /* * Set 1M-4M alloc/discard and 32M-64M thus leaving a hole between * 4M-32M diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 4a7f796c9900..57379e96ccc9 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -6,6 +6,9 @@ #include <linux/types.h> #include "btrfs-tests.h" #include "../ctree.h" +#include "../volumes.h" +#include "../disk-io.h" +#include "../block-group.h" static void free_extent_map_tree(struct extent_map_tree *em_tree) { @@ -437,11 +440,153 @@ static int test_case_4(struct btrfs_fs_info *fs_info, return ret; } +struct rmap_test_vector { + u64 raid_type; + u64 physical_start; + u64 data_stripe_size; + u64 num_data_stripes; + u64 num_stripes; + /* Assume we won't have more than 5 physical stripes */ + u64 data_stripe_phys_start[5]; + bool expected_mapped_addr; + /* Physical to logical addresses */ + u64 mapped_logical[5]; +}; + +static int test_rmap_block(struct btrfs_fs_info *fs_info, + struct rmap_test_vector *test) +{ + struct extent_map *em; + struct map_lookup *map = NULL; + u64 *logical = NULL; + int i, out_ndaddrs, out_stripe_len; + int ret; + + em = alloc_extent_map(); + if (!em) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + return -ENOMEM; + } + + map = kmalloc(map_lookup_size(test->num_stripes), GFP_KERNEL); + if (!map) { + kfree(em); + test_std_err(TEST_ALLOC_EXTENT_MAP); + return -ENOMEM; + } + + set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); + /* Start at 4GiB logical address */ + em->start = SZ_4G; + em->len = test->data_stripe_size * test->num_data_stripes; + em->block_len = em->len; + em->orig_block_len = test->data_stripe_size; + em->map_lookup = map; + + map->num_stripes = test->num_stripes; + map->stripe_len = BTRFS_STRIPE_LEN; + map->type = test->raid_type; + + for (i = 0; i < map->num_stripes; i++) { + struct btrfs_device *dev = btrfs_alloc_dummy_device(fs_info); + + if (IS_ERR(dev)) { + test_err("cannot allocate device"); + ret = PTR_ERR(dev); + goto out; + } + map->stripes[i].dev = dev; + map->stripes[i].physical = test->data_stripe_phys_start[i]; + } + + write_lock(&fs_info->mapping_tree.lock); + ret = add_extent_mapping(&fs_info->mapping_tree, em, 0); + write_unlock(&fs_info->mapping_tree.lock); + if (ret) { + test_err("error adding block group mapping to mapping tree"); + goto out_free; + } + + ret = btrfs_rmap_block(fs_info, em->start, btrfs_sb_offset(1), + &logical, &out_ndaddrs, &out_stripe_len); + if (ret || (out_ndaddrs == 0 && test->expected_mapped_addr)) { + test_err("didn't rmap anything but expected %d", + test->expected_mapped_addr); + goto out; + } + + if (out_stripe_len != BTRFS_STRIPE_LEN) { + test_err("calculated stripe length doesn't match"); + goto out; + } + + if (out_ndaddrs != test->expected_mapped_addr) { + for (i = 0; i < out_ndaddrs; i++) + test_msg("mapped %llu", logical[i]); + test_err("unexpected number of mapped addresses: %d", out_ndaddrs); + goto out; + } + + for (i = 0; i < out_ndaddrs; i++) { + if (logical[i] != test->mapped_logical[i]) { + test_err("unexpected logical address mapped"); + goto out; + } + } + + ret = 0; +out: + write_lock(&fs_info->mapping_tree.lock); + remove_extent_mapping(&fs_info->mapping_tree, em); + write_unlock(&fs_info->mapping_tree.lock); + /* For us */ + free_extent_map(em); +out_free: + /* For the tree */ + free_extent_map(em); + kfree(logical); + return ret; +} + int btrfs_test_extent_map(void) { struct btrfs_fs_info *fs_info = NULL; struct extent_map_tree *em_tree; - int ret = 0; + int ret = 0, i; + struct rmap_test_vector rmap_tests[] = { + { + /* + * Test a chunk with 2 data stripes one of which + * interesects the physical address of the super block + * is correctly recognised. + */ + .raid_type = BTRFS_BLOCK_GROUP_RAID1, + .physical_start = SZ_64M - SZ_4M, + .data_stripe_size = SZ_256M, + .num_data_stripes = 2, + .num_stripes = 2, + .data_stripe_phys_start = + {SZ_64M - SZ_4M, SZ_64M - SZ_4M + SZ_256M}, + .expected_mapped_addr = true, + .mapped_logical= {SZ_4G + SZ_4M} + }, + { + /* + * Test that out-of-range physical addresses are + * ignored + */ + + /* SINGLE chunk type */ + .raid_type = 0, + .physical_start = SZ_4G, + .data_stripe_size = SZ_256M, + .num_data_stripes = 1, + .num_stripes = 1, + .data_stripe_phys_start = {SZ_256M}, + .expected_mapped_addr = false, + .mapped_logical = {0} + } + }; test_msg("running extent_map tests"); @@ -474,6 +619,13 @@ int btrfs_test_extent_map(void) goto out; ret = test_case_4(fs_info, em_tree); + test_msg("running rmap tests"); + for (i = 0; i < ARRAY_SIZE(rmap_tests); i++) { + ret = test_rmap_block(fs_info, &rmap_tests[i]); + if (ret) + goto out; + } + out: kfree(em_tree); btrfs_free_dummy_fs_info(fs_info); diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 09ecf7dc7b08..24a8c714f56c 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -263,7 +263,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) /* First with no extents */ BTRFS_I(inode)->root = root; - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, sectorsize); if (IS_ERR(em)) { em = NULL; test_err("got an error when we shouldn't have"); @@ -283,7 +283,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) */ setup_file_extents(root, sectorsize); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, (u64)-1, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, (u64)-1); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -305,7 +305,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -333,7 +333,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -356,7 +356,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Regular extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -384,7 +384,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* The next 3 are split extents */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -413,7 +413,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -435,7 +435,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -469,7 +469,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Prealloc extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -498,7 +498,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* The next 3 are a half written prealloc extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -528,7 +528,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -561,7 +561,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -596,7 +596,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Now for the compressed extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -630,7 +630,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* Split compressed extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -665,7 +665,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -692,7 +692,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -727,8 +727,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) free_extent_map(em); /* A hole between regular extents but no hole extent */ - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset + 6, - sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset + 6, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -755,7 +754,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, SZ_4M, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, SZ_4M); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -788,7 +787,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) offset = em->start + em->len; free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -872,7 +871,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) insert_inode_item_key(root); insert_extent(root, sectorsize, sectorsize, sectorsize, 0, sectorsize, sectorsize, BTRFS_FILE_EXTENT_REG, 0, 1); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, 2 * sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, 0, 2 * sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; @@ -894,8 +893,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) } free_extent_map(em); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize, - 2 * sectorsize, 0); + em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, sectorsize, 2 * sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index cfc08ef9b876..33dcc88b428a 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -147,13 +147,14 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) } } -static noinline void switch_commit_roots(struct btrfs_transaction *trans) +static noinline void switch_commit_roots(struct btrfs_trans_handle *trans) { + struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root, *tmp; down_write(&fs_info->commit_root_sem); - list_for_each_entry_safe(root, tmp, &trans->switch_commits, + list_for_each_entry_safe(root, tmp, &cur_trans->switch_commits, dirty_list) { list_del_init(&root->dirty_list); free_extent_buffer(root->commit_root); @@ -165,16 +166,17 @@ static noinline void switch_commit_roots(struct btrfs_transaction *trans) } /* We can free old roots now. */ - spin_lock(&trans->dropped_roots_lock); - while (!list_empty(&trans->dropped_roots)) { - root = list_first_entry(&trans->dropped_roots, + spin_lock(&cur_trans->dropped_roots_lock); + while (!list_empty(&cur_trans->dropped_roots)) { + root = list_first_entry(&cur_trans->dropped_roots, struct btrfs_root, root_list); list_del_init(&root->root_list); - spin_unlock(&trans->dropped_roots_lock); + spin_unlock(&cur_trans->dropped_roots_lock); + btrfs_free_log(trans, root); btrfs_drop_and_free_fs_root(fs_info, root); - spin_lock(&trans->dropped_roots_lock); + spin_lock(&cur_trans->dropped_roots_lock); } - spin_unlock(&trans->dropped_roots_lock); + spin_unlock(&cur_trans->dropped_roots_lock); up_write(&fs_info->commit_root_sem); } @@ -1421,7 +1423,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, ret = commit_cowonly_roots(trans); if (ret) goto out; - switch_commit_roots(trans->transaction); + switch_commit_roots(trans); ret = btrfs_write_and_wait_transaction(trans); if (ret) btrfs_handle_fs_error(fs_info, ret, @@ -2013,6 +2015,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) ASSERT(refcount_read(&trans->use_count) == 1); + /* + * Some places just start a transaction to commit it. We need to make + * sure that if this commit fails that the abort code actually marks the + * transaction as failed, so set trans->dirty to make the abort code do + * the right thing. + */ + trans->dirty = true; + /* Stop the commit early if ->aborted is set */ if (unlikely(READ_ONCE(cur_trans->aborted))) { ret = cur_trans->aborted; @@ -2301,7 +2311,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) list_add_tail(&fs_info->chunk_root->dirty_list, &cur_trans->switch_commits); - switch_commit_roots(cur_trans); + switch_commit_roots(trans); ASSERT(list_empty(&cur_trans->dirty_bgs)); ASSERT(list_empty(&cur_trans->io_bgs)); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 97f3520b8d98..a92f8a6dd192 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -373,6 +373,104 @@ static int check_csum_item(struct extent_buffer *leaf, struct btrfs_key *key, return 0; } +/* Inode item error output has the same format as dir_item_err() */ +#define inode_item_err(eb, slot, fmt, ...) \ + dir_item_err(eb, slot, fmt, __VA_ARGS__) + +static int check_inode_key(struct extent_buffer *leaf, struct btrfs_key *key, + int slot) +{ + struct btrfs_key item_key; + bool is_inode_item; + + btrfs_item_key_to_cpu(leaf, &item_key, slot); + is_inode_item = (item_key.type == BTRFS_INODE_ITEM_KEY); + + /* For XATTR_ITEM, location key should be all 0 */ + if (item_key.type == BTRFS_XATTR_ITEM_KEY) { + if (key->type != 0 || key->objectid != 0 || key->offset != 0) + return -EUCLEAN; + return 0; + } + + if ((key->objectid < BTRFS_FIRST_FREE_OBJECTID || + key->objectid > BTRFS_LAST_FREE_OBJECTID) && + key->objectid != BTRFS_ROOT_TREE_DIR_OBJECTID && + key->objectid != BTRFS_FREE_INO_OBJECTID) { + if (is_inode_item) { + generic_err(leaf, slot, + "invalid key objectid: has %llu expect %llu or [%llu, %llu] or %llu", + key->objectid, BTRFS_ROOT_TREE_DIR_OBJECTID, + BTRFS_FIRST_FREE_OBJECTID, + BTRFS_LAST_FREE_OBJECTID, + BTRFS_FREE_INO_OBJECTID); + } else { + dir_item_err(leaf, slot, +"invalid location key objectid: has %llu expect %llu or [%llu, %llu] or %llu", + key->objectid, BTRFS_ROOT_TREE_DIR_OBJECTID, + BTRFS_FIRST_FREE_OBJECTID, + BTRFS_LAST_FREE_OBJECTID, + BTRFS_FREE_INO_OBJECTID); + } + return -EUCLEAN; + } + if (key->offset != 0) { + if (is_inode_item) + inode_item_err(leaf, slot, + "invalid key offset: has %llu expect 0", + key->offset); + else + dir_item_err(leaf, slot, + "invalid location key offset:has %llu expect 0", + key->offset); + return -EUCLEAN; + } + return 0; +} + +static int check_root_key(struct extent_buffer *leaf, struct btrfs_key *key, + int slot) +{ + struct btrfs_key item_key; + bool is_root_item; + + btrfs_item_key_to_cpu(leaf, &item_key, slot); + is_root_item = (item_key.type == BTRFS_ROOT_ITEM_KEY); + + /* No such tree id */ + if (key->objectid == 0) { + if (is_root_item) + generic_err(leaf, slot, "invalid root id 0"); + else + dir_item_err(leaf, slot, + "invalid location key root id 0"); + return -EUCLEAN; + } + + /* DIR_ITEM/INDEX/INODE_REF is not allowed to point to non-fs trees */ + if (!is_fstree(key->objectid) && !is_root_item) { + dir_item_err(leaf, slot, + "invalid location key objectid, have %llu expect [%llu, %llu]", + key->objectid, BTRFS_FIRST_FREE_OBJECTID, + BTRFS_LAST_FREE_OBJECTID); + return -EUCLEAN; + } + + /* + * ROOT_ITEM with non-zero offset means this is a snapshot, created at + * @offset transid. + * Furthermore, for location key in DIR_ITEM, its offset is always -1. + * + * So here we only check offset for reloc tree whose key->offset must + * be a valid tree. + */ + if (key->objectid == BTRFS_TREE_RELOC_OBJECTID && key->offset == 0) { + generic_err(leaf, slot, "invalid root id 0 for reloc tree"); + return -EUCLEAN; + } + return 0; +} + static int check_dir_item(struct extent_buffer *leaf, struct btrfs_key *key, struct btrfs_key *prev_key, int slot) @@ -386,12 +484,14 @@ static int check_dir_item(struct extent_buffer *leaf, return -EUCLEAN; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); while (cur < item_size) { + struct btrfs_key location_key; u32 name_len; u32 data_len; u32 max_name_len; u32 total_size; u32 name_hash; u8 dir_type; + int ret; /* header itself should not cross item boundary */ if (cur + sizeof(*di) > item_size) { @@ -401,6 +501,25 @@ static int check_dir_item(struct extent_buffer *leaf, return -EUCLEAN; } + /* Location key check */ + btrfs_dir_item_key_to_cpu(leaf, di, &location_key); + if (location_key.type == BTRFS_ROOT_ITEM_KEY) { + ret = check_root_key(leaf, &location_key, slot); + if (ret < 0) + return ret; + } else if (location_key.type == BTRFS_INODE_ITEM_KEY || + location_key.type == 0) { + ret = check_inode_key(leaf, &location_key, slot); + if (ret < 0) + return ret; + } else { + dir_item_err(leaf, slot, + "invalid location key type, have %u, expect %u or %u", + location_key.type, BTRFS_ROOT_ITEM_KEY, + BTRFS_INODE_ITEM_KEY); + return -EUCLEAN; + } + /* dir type check */ dir_type = btrfs_dir_type(leaf, di); if (dir_type >= BTRFS_FT_MAX) { @@ -738,6 +857,44 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf, return 0; } +/* + * Enhanced version of chunk item checker. + * + * The common btrfs_check_chunk_valid() doesn't check item size since it needs + * to work on super block sys_chunk_array which doesn't have full item ptr. + */ +static int check_leaf_chunk_item(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, + struct btrfs_key *key, int slot) +{ + int num_stripes; + + if (btrfs_item_size_nr(leaf, slot) < sizeof(struct btrfs_chunk)) { + chunk_err(leaf, chunk, key->offset, + "invalid chunk item size: have %u expect [%zu, %u)", + btrfs_item_size_nr(leaf, slot), + sizeof(struct btrfs_chunk), + BTRFS_LEAF_DATA_SIZE(leaf->fs_info)); + return -EUCLEAN; + } + + num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + /* Let btrfs_check_chunk_valid() handle this error type */ + if (num_stripes == 0) + goto out; + + if (btrfs_chunk_item_size(num_stripes) != + btrfs_item_size_nr(leaf, slot)) { + chunk_err(leaf, chunk, key->offset, + "invalid chunk item size: have %u expect %lu", + btrfs_item_size_nr(leaf, slot), + btrfs_chunk_item_size(num_stripes)); + return -EUCLEAN; + } +out: + return btrfs_check_chunk_valid(leaf, chunk, key->offset); +} + __printf(3, 4) __cold static void dev_item_err(const struct extent_buffer *eb, int slot, @@ -801,7 +958,7 @@ static int check_dev_item(struct extent_buffer *leaf, } /* Inode item error output has the same format as dir_item_err() */ -#define inode_item_err(fs_info, eb, slot, fmt, ...) \ +#define inode_item_err(eb, slot, fmt, ...) \ dir_item_err(eb, slot, fmt, __VA_ARGS__) static int check_inode_item(struct extent_buffer *leaf, @@ -812,30 +969,17 @@ static int check_inode_item(struct extent_buffer *leaf, u64 super_gen = btrfs_super_generation(fs_info->super_copy); u32 valid_mask = (S_IFMT | S_ISUID | S_ISGID | S_ISVTX | 0777); u32 mode; + int ret; + + ret = check_inode_key(leaf, key, slot); + if (ret < 0) + return ret; - if ((key->objectid < BTRFS_FIRST_FREE_OBJECTID || - key->objectid > BTRFS_LAST_FREE_OBJECTID) && - key->objectid != BTRFS_ROOT_TREE_DIR_OBJECTID && - key->objectid != BTRFS_FREE_INO_OBJECTID) { - generic_err(leaf, slot, - "invalid key objectid: has %llu expect %llu or [%llu, %llu] or %llu", - key->objectid, BTRFS_ROOT_TREE_DIR_OBJECTID, - BTRFS_FIRST_FREE_OBJECTID, - BTRFS_LAST_FREE_OBJECTID, - BTRFS_FREE_INO_OBJECTID); - return -EUCLEAN; - } - if (key->offset != 0) { - inode_item_err(fs_info, leaf, slot, - "invalid key offset: has %llu expect 0", - key->offset); - return -EUCLEAN; - } iitem = btrfs_item_ptr(leaf, slot, struct btrfs_inode_item); /* Here we use super block generation + 1 to handle log tree */ if (btrfs_inode_generation(leaf, iitem) > super_gen + 1) { - inode_item_err(fs_info, leaf, slot, + inode_item_err(leaf, slot, "invalid inode generation: has %llu expect (0, %llu]", btrfs_inode_generation(leaf, iitem), super_gen + 1); @@ -843,7 +987,7 @@ static int check_inode_item(struct extent_buffer *leaf, } /* Note for ROOT_TREE_DIR_ITEM, mkfs could set its transid 0 */ if (btrfs_inode_transid(leaf, iitem) > super_gen + 1) { - inode_item_err(fs_info, leaf, slot, + inode_item_err(leaf, slot, "invalid inode generation: has %llu expect [0, %llu]", btrfs_inode_transid(leaf, iitem), super_gen + 1); return -EUCLEAN; @@ -856,7 +1000,7 @@ static int check_inode_item(struct extent_buffer *leaf, */ mode = btrfs_inode_mode(leaf, iitem); if (mode & ~valid_mask) { - inode_item_err(fs_info, leaf, slot, + inode_item_err(leaf, slot, "unknown mode bit detected: 0x%x", mode & ~valid_mask); return -EUCLEAN; @@ -869,20 +1013,20 @@ static int check_inode_item(struct extent_buffer *leaf, */ if (!has_single_bit_set(mode & S_IFMT)) { if (!S_ISLNK(mode) && !S_ISBLK(mode) && !S_ISSOCK(mode)) { - inode_item_err(fs_info, leaf, slot, + inode_item_err(leaf, slot, "invalid mode: has 0%o expect valid S_IF* bit(s)", mode & S_IFMT); return -EUCLEAN; } } if (S_ISDIR(mode) && btrfs_inode_nlink(leaf, iitem) > 1) { - inode_item_err(fs_info, leaf, slot, + inode_item_err(leaf, slot, "invalid nlink: has %u expect no more than 1 for dir", btrfs_inode_nlink(leaf, iitem)); return -EUCLEAN; } if (btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK) { - inode_item_err(fs_info, leaf, slot, + inode_item_err(leaf, slot, "unknown flags detected: 0x%llx", btrfs_inode_flags(leaf, iitem) & ~BTRFS_INODE_FLAG_MASK); @@ -898,22 +1042,11 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, struct btrfs_root_item ri; const u64 valid_root_flags = BTRFS_ROOT_SUBVOL_RDONLY | BTRFS_ROOT_SUBVOL_DEAD; + int ret; - /* No such tree id */ - if (key->objectid == 0) { - generic_err(leaf, slot, "invalid root id 0"); - return -EUCLEAN; - } - - /* - * Some older kernel may create ROOT_ITEM with non-zero offset, so here - * we only check offset for reloc tree whose key->offset must be a - * valid tree. - */ - if (key->objectid == BTRFS_TREE_RELOC_OBJECTID && key->offset == 0) { - generic_err(leaf, slot, "invalid root id 0 for reloc tree"); - return -EUCLEAN; - } + ret = check_root_key(leaf, key, slot); + if (ret < 0) + return ret; if (btrfs_item_size_nr(leaf, slot) != sizeof(ri)) { generic_err(leaf, slot, @@ -1302,8 +1435,8 @@ static int check_extent_data_ref(struct extent_buffer *leaf, return 0; } -#define inode_ref_err(fs_info, eb, slot, fmt, args...) \ - inode_item_err(fs_info, eb, slot, fmt, ##args) +#define inode_ref_err(eb, slot, fmt, args...) \ + inode_item_err(eb, slot, fmt, ##args) static int check_inode_ref(struct extent_buffer *leaf, struct btrfs_key *key, struct btrfs_key *prev_key, int slot) @@ -1316,7 +1449,7 @@ static int check_inode_ref(struct extent_buffer *leaf, return -EUCLEAN; /* namelen can't be 0, so item_size == sizeof() is also invalid */ if (btrfs_item_size_nr(leaf, slot) <= sizeof(*iref)) { - inode_ref_err(fs_info, leaf, slot, + inode_ref_err(leaf, slot, "invalid item size, have %u expect (%zu, %u)", btrfs_item_size_nr(leaf, slot), sizeof(*iref), BTRFS_LEAF_DATA_SIZE(leaf->fs_info)); @@ -1329,7 +1462,7 @@ static int check_inode_ref(struct extent_buffer *leaf, u16 namelen; if (ptr + sizeof(iref) > end) { - inode_ref_err(fs_info, leaf, slot, + inode_ref_err(leaf, slot, "inode ref overflow, ptr %lu end %lu inode_ref_size %zu", ptr, end, sizeof(iref)); return -EUCLEAN; @@ -1338,7 +1471,7 @@ static int check_inode_ref(struct extent_buffer *leaf, iref = (struct btrfs_inode_ref *)ptr; namelen = btrfs_inode_ref_name_len(leaf, iref); if (ptr + sizeof(*iref) + namelen > end) { - inode_ref_err(fs_info, leaf, slot, + inode_ref_err(leaf, slot, "inode ref overflow, ptr %lu end %lu namelen %u", ptr, end, namelen); return -EUCLEAN; @@ -1384,7 +1517,7 @@ static int check_leaf_item(struct extent_buffer *leaf, break; case BTRFS_CHUNK_ITEM_KEY: chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); - ret = btrfs_check_chunk_valid(leaf, chunk, key->offset); + ret = check_leaf_chunk_item(leaf, chunk, key, slot); break; case BTRFS_DEV_ITEM_KEY: ret = check_dev_item(leaf, key, slot); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d3f115909ff0..7dd7552f53a4 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2674,14 +2674,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, u32 blocksize; int ret = 0; - WARN_ON(*level < 0); - WARN_ON(*level >= BTRFS_MAX_LEVEL); - while (*level > 0) { struct btrfs_key first_key; - WARN_ON(*level < 0); - WARN_ON(*level >= BTRFS_MAX_LEVEL); cur = path->nodes[*level]; WARN_ON(btrfs_header_level(cur) != *level); @@ -2732,9 +2727,8 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_and_pin_reserved_extent( - fs_info, bytenr, - blocksize); + ret = btrfs_pin_reserved_extent(fs_info, + bytenr, blocksize); if (ret) { free_extent_buffer(next); return ret; @@ -2749,7 +2743,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, return ret; } - WARN_ON(*level <= 0); if (path->nodes[*level-1]) free_extent_buffer(path->nodes[*level-1]); path->nodes[*level-1] = next; @@ -2757,9 +2750,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, path->slots[*level] = 0; cond_resched(); } - WARN_ON(*level < 0); - WARN_ON(*level >= BTRFS_MAX_LEVEL); - path->slots[*level] = btrfs_header_nritems(path->nodes[*level]); cond_resched(); @@ -2815,8 +2805,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, } WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_and_pin_reserved_extent( - fs_info, + ret = btrfs_pin_reserved_extent(fs_info, path->nodes[*level]->start, path->nodes[*level]->len); if (ret) @@ -2896,10 +2885,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, clear_extent_buffer_dirty(next); } - WARN_ON(log->root_key.objectid != - BTRFS_TREE_LOG_OBJECTID); - ret = btrfs_free_and_pin_reserved_extent(fs_info, - next->start, next->len); + ret = btrfs_pin_reserved_extent(fs_info, next->start, + next->len); if (ret) goto out; } @@ -3935,7 +3922,7 @@ static int log_csums(struct btrfs_trans_handle *trans, static noinline int copy_items(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_path *dst_path, - struct btrfs_path *src_path, u64 *last_extent, + struct btrfs_path *src_path, int start_slot, int nr, int inode_only, u64 logged_isize) { @@ -3946,7 +3933,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item *extent; struct btrfs_inode_item *inode_item; struct extent_buffer *src = src_path->nodes[0]; - struct btrfs_key first_key, last_key, key; int ret; struct btrfs_key *ins_keys; u32 *ins_sizes; @@ -3954,9 +3940,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, int i; struct list_head ordered_sums; int skip_csum = inode->flags & BTRFS_INODE_NODATASUM; - bool has_extents = false; - bool need_find_last_extent = true; - bool done = false; INIT_LIST_HEAD(&ordered_sums); @@ -3965,8 +3948,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, if (!ins_data) return -ENOMEM; - first_key.objectid = (u64)-1; - ins_sizes = (u32 *)ins_data; ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); @@ -3987,9 +3968,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, src_offset = btrfs_item_ptr_offset(src, start_slot + i); - if (i == nr - 1) - last_key = ins_keys[i]; - if (ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_path->slots[0], @@ -4003,20 +3981,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, src_offset, ins_sizes[i]); } - /* - * We set need_find_last_extent here in case we know we were - * processing other items and then walk into the first extent in - * the inode. If we don't hit an extent then nothing changes, - * we'll do the last search the next time around. - */ - if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY) { - has_extents = true; - if (first_key.objectid == (u64)-1) - first_key = ins_keys[i]; - } else { - need_find_last_extent = false; - } - /* take a reference on file data extents so that truncates * or deletes of this inode don't have to relog the inode * again @@ -4082,167 +4046,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, kfree(sums); } - if (!has_extents) - return ret; - - if (need_find_last_extent && *last_extent == first_key.offset) { - /* - * We don't have any leafs between our current one and the one - * we processed before that can have file extent items for our - * inode (and have a generation number smaller than our current - * transaction id). - */ - need_find_last_extent = false; - } - - /* - * Because we use btrfs_search_forward we could skip leaves that were - * not modified and then assume *last_extent is valid when it really - * isn't. So back up to the previous leaf and read the end of the last - * extent before we go and fill in holes. - */ - if (need_find_last_extent) { - u64 len; - - ret = btrfs_prev_leaf(inode->root, src_path); - if (ret < 0) - return ret; - if (ret) - goto fill_holes; - if (src_path->slots[0]) - src_path->slots[0]--; - src = src_path->nodes[0]; - btrfs_item_key_to_cpu(src, &key, src_path->slots[0]); - if (key.objectid != btrfs_ino(inode) || - key.type != BTRFS_EXTENT_DATA_KEY) - goto fill_holes; - extent = btrfs_item_ptr(src, src_path->slots[0], - struct btrfs_file_extent_item); - if (btrfs_file_extent_type(src, extent) == - BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_ram_bytes(src, extent); - *last_extent = ALIGN(key.offset + len, - fs_info->sectorsize); - } else { - len = btrfs_file_extent_num_bytes(src, extent); - *last_extent = key.offset + len; - } - } -fill_holes: - /* So we did prev_leaf, now we need to move to the next leaf, but a few - * things could have happened - * - * 1) A merge could have happened, so we could currently be on a leaf - * that holds what we were copying in the first place. - * 2) A split could have happened, and now not all of the items we want - * are on the same leaf. - * - * So we need to adjust how we search for holes, we need to drop the - * path and re-search for the first extent key we found, and then walk - * forward until we hit the last one we copied. - */ - if (need_find_last_extent) { - /* btrfs_prev_leaf could return 1 without releasing the path */ - btrfs_release_path(src_path); - ret = btrfs_search_slot(NULL, inode->root, &first_key, - src_path, 0, 0); - if (ret < 0) - return ret; - ASSERT(ret == 0); - src = src_path->nodes[0]; - i = src_path->slots[0]; - } else { - i = start_slot; - } - - /* - * Ok so here we need to go through and fill in any holes we may have - * to make sure that holes are punched for those areas in case they had - * extents previously. - */ - while (!done) { - u64 offset, len; - u64 extent_end; - - if (i >= btrfs_header_nritems(src_path->nodes[0])) { - ret = btrfs_next_leaf(inode->root, src_path); - if (ret < 0) - return ret; - ASSERT(ret == 0); - src = src_path->nodes[0]; - i = 0; - need_find_last_extent = true; - } - - btrfs_item_key_to_cpu(src, &key, i); - if (!btrfs_comp_cpu_keys(&key, &last_key)) - done = true; - if (key.objectid != btrfs_ino(inode) || - key.type != BTRFS_EXTENT_DATA_KEY) { - i++; - continue; - } - extent = btrfs_item_ptr(src, i, struct btrfs_file_extent_item); - if (btrfs_file_extent_type(src, extent) == - BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_ram_bytes(src, extent); - extent_end = ALIGN(key.offset + len, - fs_info->sectorsize); - } else { - len = btrfs_file_extent_num_bytes(src, extent); - extent_end = key.offset + len; - } - i++; - - if (*last_extent == key.offset) { - *last_extent = extent_end; - continue; - } - offset = *last_extent; - len = key.offset - *last_extent; - ret = btrfs_insert_file_extent(trans, log, btrfs_ino(inode), - offset, 0, 0, len, 0, len, 0, 0, 0); - if (ret) - break; - *last_extent = extent_end; - } - - /* - * Check if there is a hole between the last extent found in our leaf - * and the first extent in the next leaf. If there is one, we need to - * log an explicit hole so that at replay time we can punch the hole. - */ - if (ret == 0 && - key.objectid == btrfs_ino(inode) && - key.type == BTRFS_EXTENT_DATA_KEY && - i == btrfs_header_nritems(src_path->nodes[0])) { - ret = btrfs_next_leaf(inode->root, src_path); - need_find_last_extent = true; - if (ret > 0) { - ret = 0; - } else if (ret == 0) { - btrfs_item_key_to_cpu(src_path->nodes[0], &key, - src_path->slots[0]); - if (key.objectid == btrfs_ino(inode) && - key.type == BTRFS_EXTENT_DATA_KEY && - *last_extent < key.offset) { - const u64 len = key.offset - *last_extent; - - ret = btrfs_insert_file_extent(trans, log, - btrfs_ino(inode), - *last_extent, 0, - 0, len, 0, len, - 0, 0, 0); - *last_extent += len; - } - } - } - /* - * Need to let the callers know we dropped the path so they should - * re-search. - */ - if (!ret && need_find_last_extent) - ret = 1; return ret; } @@ -4407,7 +4210,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, const u64 i_size = i_size_read(&inode->vfs_inode); const u64 ino = btrfs_ino(inode); struct btrfs_path *dst_path = NULL; - u64 last_extent = (u64)-1; + bool dropped_extents = false; int ins_nr = 0; int start_slot; int ret; @@ -4429,8 +4232,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, if (slot >= btrfs_header_nritems(leaf)) { if (ins_nr > 0) { ret = copy_items(trans, inode, dst_path, path, - &last_extent, start_slot, - ins_nr, 1, 0); + start_slot, ins_nr, 1, 0); if (ret < 0) goto out; ins_nr = 0; @@ -4454,8 +4256,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, path->slots[0]++; continue; } - if (last_extent == (u64)-1) { - last_extent = key.offset; + if (!dropped_extents) { /* * Avoid logging extent items logged in past fsync calls * and leading to duplicate keys in the log tree. @@ -4469,6 +4270,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, } while (ret == -EAGAIN); if (ret) goto out; + dropped_extents = true; } if (ins_nr == 0) start_slot = slot; @@ -4483,7 +4285,7 @@ static int btrfs_log_prealloc_extents(struct btrfs_trans_handle *trans, } } if (ins_nr > 0) { - ret = copy_items(trans, inode, dst_path, path, &last_extent, + ret = copy_items(trans, inode, dst_path, path, start_slot, ins_nr, 1, 0); if (ret > 0) ret = 0; @@ -4670,13 +4472,8 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, if (slot >= nritems) { if (ins_nr > 0) { - u64 last_extent = 0; - ret = copy_items(trans, inode, dst_path, path, - &last_extent, start_slot, - ins_nr, 1, 0); - /* can't be 1, extent items aren't processed */ - ASSERT(ret <= 0); + start_slot, ins_nr, 1, 0); if (ret < 0) return ret; ins_nr = 0; @@ -4700,13 +4497,8 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, cond_resched(); } if (ins_nr > 0) { - u64 last_extent = 0; - ret = copy_items(trans, inode, dst_path, path, - &last_extent, start_slot, - ins_nr, 1, 0); - /* can't be 1, extent items aren't processed */ - ASSERT(ret <= 0); + start_slot, ins_nr, 1, 0); if (ret < 0) return ret; } @@ -4715,100 +4507,119 @@ static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, } /* - * If the no holes feature is enabled we need to make sure any hole between the - * last extent and the i_size of our inode is explicitly marked in the log. This - * is to make sure that doing something like: - * - * 1) create file with 128Kb of data - * 2) truncate file to 64Kb - * 3) truncate file to 256Kb - * 4) fsync file - * 5) <crash/power failure> - * 6) mount fs and trigger log replay - * - * Will give us a file with a size of 256Kb, the first 64Kb of data match what - * the file had in its first 64Kb of data at step 1 and the last 192Kb of the - * file correspond to a hole. The presence of explicit holes in a log tree is - * what guarantees that log replay will remove/adjust file extent items in the - * fs/subvol tree. - * - * Here we do not need to care about holes between extents, that is already done - * by copy_items(). We also only need to do this in the full sync path, where we - * lookup for extents from the fs/subvol tree only. In the fast path case, we - * lookup the list of modified extent maps and if any represents a hole, we - * insert a corresponding extent representing a hole in the log tree. + * When using the NO_HOLES feature if we punched a hole that causes the + * deletion of entire leafs or all the extent items of the first leaf (the one + * that contains the inode item and references) we may end up not processing + * any extents, because there are no leafs with a generation matching the + * current transaction that have extent items for our inode. So we need to find + * if any holes exist and then log them. We also need to log holes after any + * truncate operation that changes the inode's size. */ -static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_inode *inode, - struct btrfs_path *path) +static int btrfs_log_holes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_inode *inode, + struct btrfs_path *path) { struct btrfs_fs_info *fs_info = root->fs_info; - int ret; struct btrfs_key key; - u64 hole_start; - u64 hole_size; - struct extent_buffer *leaf; - struct btrfs_root *log = root->log_root; const u64 ino = btrfs_ino(inode); const u64 i_size = i_size_read(&inode->vfs_inode); + u64 prev_extent_end = 0; + int ret; - if (!btrfs_fs_incompat(fs_info, NO_HOLES)) + if (!btrfs_fs_incompat(fs_info, NO_HOLES) || i_size == 0) return 0; key.objectid = ino; key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = (u64)-1; + key.offset = 0; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - ASSERT(ret != 0); if (ret < 0) return ret; - ASSERT(path->slots[0] > 0); - path->slots[0]--; - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - - if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { - /* inode does not have any extents */ - hole_start = 0; - hole_size = i_size; - } else { + while (true) { struct btrfs_file_extent_item *extent; + struct extent_buffer *leaf = path->nodes[0]; u64 len; - /* - * If there's an extent beyond i_size, an explicit hole was - * already inserted by copy_items(). - */ - if (key.offset >= i_size) - return 0; + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ret; + if (ret > 0) { + ret = 0; + break; + } + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) + break; + + /* We have a hole, log it. */ + if (prev_extent_end < key.offset) { + const u64 hole_len = key.offset - prev_extent_end; + + /* + * Release the path to avoid deadlocks with other code + * paths that search the root while holding locks on + * leafs from the log root. + */ + btrfs_release_path(path); + ret = btrfs_insert_file_extent(trans, root->log_root, + ino, prev_extent_end, 0, + 0, hole_len, 0, hole_len, + 0, 0, 0); + if (ret < 0) + return ret; + + /* + * Search for the same key again in the root. Since it's + * an extent item and we are holding the inode lock, the + * key must still exist. If it doesn't just emit warning + * and return an error to fall back to a transaction + * commit. + */ + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ret; + if (WARN_ON(ret > 0)) + return -ENOENT; + leaf = path->nodes[0]; + } extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - if (btrfs_file_extent_type(leaf, extent) == - BTRFS_FILE_EXTENT_INLINE) - return 0; + BTRFS_FILE_EXTENT_INLINE) { + len = btrfs_file_extent_ram_bytes(leaf, extent); + prev_extent_end = ALIGN(key.offset + len, + fs_info->sectorsize); + } else { + len = btrfs_file_extent_num_bytes(leaf, extent); + prev_extent_end = key.offset + len; + } - len = btrfs_file_extent_num_bytes(leaf, extent); - /* Last extent goes beyond i_size, no need to log a hole. */ - if (key.offset + len > i_size) - return 0; - hole_start = key.offset + len; - hole_size = i_size - hole_start; + path->slots[0]++; + cond_resched(); } - btrfs_release_path(path); - /* Last extent ends at i_size. */ - if (hole_size == 0) - return 0; + if (prev_extent_end < i_size) { + u64 hole_len; - hole_size = ALIGN(hole_size, fs_info->sectorsize); - ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0, - hole_size, 0, hole_size, 0, 0, 0); - return ret; + btrfs_release_path(path); + hole_len = ALIGN(i_size - prev_extent_end, fs_info->sectorsize); + ret = btrfs_insert_file_extent(trans, root->log_root, + ino, prev_extent_end, 0, 0, + hole_len, 0, hole_len, + 0, 0, 0); + if (ret < 0) + return ret; + } + + return 0; } /* @@ -5012,6 +4823,50 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, continue; } /* + * If the inode was already logged skip it - otherwise we can + * hit an infinite loop. Example: + * + * From the commit root (previous transaction) we have the + * following inodes: + * + * inode 257 a directory + * inode 258 with references "zz" and "zz_link" on inode 257 + * inode 259 with reference "a" on inode 257 + * + * And in the current (uncommitted) transaction we have: + * + * inode 257 a directory, unchanged + * inode 258 with references "a" and "a2" on inode 257 + * inode 259 with reference "zz_link" on inode 257 + * inode 261 with reference "zz" on inode 257 + * + * When logging inode 261 the following infinite loop could + * happen if we don't skip already logged inodes: + * + * - we detect inode 258 as a conflicting inode, with inode 261 + * on reference "zz", and log it; + * + * - we detect inode 259 as a conflicting inode, with inode 258 + * on reference "a", and log it; + * + * - we detect inode 258 as a conflicting inode, with inode 259 + * on reference "zz_link", and log it - again! After this we + * repeat the above steps forever. + */ + spin_lock(&BTRFS_I(inode)->lock); + /* + * Check the inode's logged_trans only instead of + * btrfs_inode_in_log(). This is because the last_log_commit of + * the inode is not updated when we only log that it exists and + * and it has the full sync bit set (see btrfs_log_inode()). + */ + if (BTRFS_I(inode)->logged_trans == trans->transid) { + spin_unlock(&BTRFS_I(inode)->lock); + btrfs_add_delayed_iput(inode); + continue; + } + spin_unlock(&BTRFS_I(inode)->lock); + /* * We are safe logging the other inode without acquiring its * lock as long as we log with the LOG_INODE_EXISTS mode. We * are safe against concurrent renames of the other inode as @@ -5110,7 +4965,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_key min_key; struct btrfs_key max_key; struct btrfs_root *log = root->log_root; - u64 last_extent = 0; int err = 0; int ret; int nritems; @@ -5288,7 +5142,7 @@ again: ins_start_slot = path->slots[0]; } ret = copy_items(trans, inode, dst_path, path, - &last_extent, ins_start_slot, + ins_start_slot, ins_nr, inode_only, logged_isize); if (ret < 0) { @@ -5311,17 +5165,13 @@ again: if (ins_nr == 0) goto next_slot; ret = copy_items(trans, inode, dst_path, path, - &last_extent, ins_start_slot, + ins_start_slot, ins_nr, inode_only, logged_isize); if (ret < 0) { err = ret; goto out_unlock; } ins_nr = 0; - if (ret) { - btrfs_release_path(path); - continue; - } goto next_slot; } @@ -5334,18 +5184,13 @@ again: goto next_slot; } - ret = copy_items(trans, inode, dst_path, path, &last_extent, + ret = copy_items(trans, inode, dst_path, path, ins_start_slot, ins_nr, inode_only, logged_isize); if (ret < 0) { err = ret; goto out_unlock; } - if (ret) { - ins_nr = 0; - btrfs_release_path(path); - continue; - } ins_nr = 1; ins_start_slot = path->slots[0]; next_slot: @@ -5359,13 +5204,12 @@ next_slot: } if (ins_nr) { ret = copy_items(trans, inode, dst_path, path, - &last_extent, ins_start_slot, + ins_start_slot, ins_nr, inode_only, logged_isize); if (ret < 0) { err = ret; goto out_unlock; } - ret = 0; ins_nr = 0; } btrfs_release_path(path); @@ -5380,14 +5224,13 @@ next_key: } } if (ins_nr) { - ret = copy_items(trans, inode, dst_path, path, &last_extent, + ret = copy_items(trans, inode, dst_path, path, ins_start_slot, ins_nr, inode_only, logged_isize); if (ret < 0) { err = ret; goto out_unlock; } - ret = 0; ins_nr = 0; } @@ -5400,7 +5243,7 @@ next_key: if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { btrfs_release_path(path); btrfs_release_path(dst_path); - err = btrfs_log_trailing_hole(trans, root, inode, path); + err = btrfs_log_holes(trans, root, inode, path); if (err) goto out_unlock; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9b78e720c697..9cfc668f91f4 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -30,6 +30,7 @@ #include "tree-checker.h" #include "space-info.h" #include "block-group.h" +#include "discard.h" const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = { @@ -66,6 +67,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 2, .devs_increment = 3, .ncopies = 3, + .nparity = 0, .raid_name = "raid1c3", .bg_flag = BTRFS_BLOCK_GROUP_RAID1C3, .mindev_error = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET, @@ -78,6 +80,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { .tolerated_failures = 3, .devs_increment = 4, .ncopies = 4, + .nparity = 0, .raid_name = "raid1c4", .bg_flag = BTRFS_BLOCK_GROUP_RAID1C4, .mindev_error = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET, @@ -438,39 +441,6 @@ static noinline struct btrfs_fs_devices *find_fsid( ASSERT(fsid); - if (metadata_fsid) { - /* - * Handle scanned device having completed its fsid change but - * belonging to a fs_devices that was created by first scanning - * a device which didn't have its fsid/metadata_uuid changed - * at all and the CHANGING_FSID_V2 flag set. - */ - list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (fs_devices->fsid_change && - memcmp(metadata_fsid, fs_devices->fsid, - BTRFS_FSID_SIZE) == 0 && - memcmp(fs_devices->fsid, fs_devices->metadata_uuid, - BTRFS_FSID_SIZE) == 0) { - return fs_devices; - } - } - /* - * Handle scanned device having completed its fsid change but - * belonging to a fs_devices that was created by a device that - * has an outdated pair of fsid/metadata_uuid and - * CHANGING_FSID_V2 flag set. - */ - list_for_each_entry(fs_devices, &fs_uuids, fs_list) { - if (fs_devices->fsid_change && - memcmp(fs_devices->metadata_uuid, - fs_devices->fsid, BTRFS_FSID_SIZE) != 0 && - memcmp(metadata_fsid, fs_devices->metadata_uuid, - BTRFS_FSID_SIZE) == 0) { - return fs_devices; - } - } - } - /* Handle non-split brain cases */ list_for_each_entry(fs_devices, &fs_uuids, fs_list) { if (metadata_fsid) { @@ -486,6 +456,47 @@ static noinline struct btrfs_fs_devices *find_fsid( return NULL; } +static struct btrfs_fs_devices *find_fsid_with_metadata_uuid( + struct btrfs_super_block *disk_super) +{ + + struct btrfs_fs_devices *fs_devices; + + /* + * Handle scanned device having completed its fsid change but + * belonging to a fs_devices that was created by first scanning + * a device which didn't have its fsid/metadata_uuid changed + * at all and the CHANGING_FSID_V2 flag set. + */ + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { + if (fs_devices->fsid_change && + memcmp(disk_super->metadata_uuid, fs_devices->fsid, + BTRFS_FSID_SIZE) == 0 && + memcmp(fs_devices->fsid, fs_devices->metadata_uuid, + BTRFS_FSID_SIZE) == 0) { + return fs_devices; + } + } + /* + * Handle scanned device having completed its fsid change but + * belonging to a fs_devices that was created by a device that + * has an outdated pair of fsid/metadata_uuid and + * CHANGING_FSID_V2 flag set. + */ + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { + if (fs_devices->fsid_change && + memcmp(fs_devices->metadata_uuid, + fs_devices->fsid, BTRFS_FSID_SIZE) != 0 && + memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid, + BTRFS_FSID_SIZE) == 0) { + return fs_devices; + } + } + + return find_fsid(disk_super->fsid, disk_super->metadata_uuid); +} + + static int btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder, int flush, struct block_device **bdev, @@ -669,7 +680,9 @@ error_brelse: /* * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices - * being created with a disk that has already completed its fsid change. + * being created with a disk that has already completed its fsid change. Such + * disk can belong to an fs which has its FSID changed or to one which doesn't. + * Handle both cases here. */ static struct btrfs_fs_devices *find_fsid_inprogress( struct btrfs_super_block *disk_super) @@ -685,7 +698,7 @@ static struct btrfs_fs_devices *find_fsid_inprogress( } } - return NULL; + return find_fsid(disk_super->fsid, NULL); } @@ -697,17 +710,54 @@ static struct btrfs_fs_devices *find_fsid_changed( /* * Handles the case where scanned device is part of an fs that had * multiple successful changes of FSID but curently device didn't - * observe it. Meaning our fsid will be different than theirs. + * observe it. Meaning our fsid will be different than theirs. We need + * to handle two subcases : + * 1 - The fs still continues to have different METADATA/FSID uuids. + * 2 - The fs is switched back to its original FSID (METADATA/FSID + * are equal). */ list_for_each_entry(fs_devices, &fs_uuids, fs_list) { + /* Changed UUIDs */ if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE) != 0 && memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid, BTRFS_FSID_SIZE) == 0 && memcmp(fs_devices->fsid, disk_super->fsid, - BTRFS_FSID_SIZE) != 0) { + BTRFS_FSID_SIZE) != 0) + return fs_devices; + + /* Unchanged UUIDs */ + if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid, + BTRFS_FSID_SIZE) == 0 && + memcmp(fs_devices->fsid, disk_super->metadata_uuid, + BTRFS_FSID_SIZE) == 0) + return fs_devices; + } + + return NULL; +} + +static struct btrfs_fs_devices *find_fsid_reverted_metadata( + struct btrfs_super_block *disk_super) +{ + struct btrfs_fs_devices *fs_devices; + + /* + * Handle the case where the scanned device is part of an fs whose last + * metadata UUID change reverted it to the original FSID. At the same + * time * fs_devices was first created by another constitutent device + * which didn't fully observe the operation. This results in an + * btrfs_fs_devices created with metadata/fsid different AND + * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the + * fs_devices equal to the FSID of the disk. + */ + list_for_each_entry(fs_devices, &fs_uuids, fs_list) { + if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid, + BTRFS_FSID_SIZE) != 0 && + memcmp(fs_devices->metadata_uuid, disk_super->fsid, + BTRFS_FSID_SIZE) == 0 && + fs_devices->fsid_change) return fs_devices; - } } return NULL; @@ -734,24 +784,16 @@ static noinline struct btrfs_device *device_list_add(const char *path, BTRFS_SUPER_FLAG_CHANGING_FSID_V2); if (fsid_change_in_progress) { - if (!has_metadata_uuid) { - /* - * When we have an image which has CHANGING_FSID_V2 set - * it might belong to either a filesystem which has - * disks with completed fsid change or it might belong - * to fs with no UUID changes in effect, handle both. - */ + if (!has_metadata_uuid) fs_devices = find_fsid_inprogress(disk_super); - if (!fs_devices) - fs_devices = find_fsid(disk_super->fsid, NULL); - } else { + else fs_devices = find_fsid_changed(disk_super); - } } else if (has_metadata_uuid) { - fs_devices = find_fsid(disk_super->fsid, - disk_super->metadata_uuid); + fs_devices = find_fsid_with_metadata_uuid(disk_super); } else { - fs_devices = find_fsid(disk_super->fsid, NULL); + fs_devices = find_fsid_reverted_metadata(disk_super); + if (!fs_devices) + fs_devices = find_fsid(disk_super->fsid, NULL); } @@ -781,12 +823,18 @@ static noinline struct btrfs_device *device_list_add(const char *path, * a device which had the CHANGING_FSID_V2 flag then replace the * metadata_uuid/fsid values of the fs_devices. */ - if (has_metadata_uuid && fs_devices->fsid_change && + if (fs_devices->fsid_change && found_transid > fs_devices->latest_generation) { memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); - memcpy(fs_devices->metadata_uuid, - disk_super->metadata_uuid, BTRFS_FSID_SIZE); + + if (has_metadata_uuid) + memcpy(fs_devices->metadata_uuid, + disk_super->metadata_uuid, + BTRFS_FSID_SIZE); + else + memcpy(fs_devices->metadata_uuid, + disk_super->fsid, BTRFS_FSID_SIZE); fs_devices->fsid_change = false; } @@ -1064,11 +1112,6 @@ static void btrfs_close_bdev(struct btrfs_device *device) static void btrfs_close_one_device(struct btrfs_device *device) { struct btrfs_fs_devices *fs_devices = device->fs_devices; - struct btrfs_device *new_device; - struct rcu_string *name; - - if (device->bdev) - fs_devices->open_devices--; if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) && device->devid != BTRFS_DEV_REPLACE_DEVID) { @@ -1080,23 +1123,22 @@ static void btrfs_close_one_device(struct btrfs_device *device) fs_devices->missing_devices--; btrfs_close_bdev(device); - - new_device = btrfs_alloc_device(NULL, &device->devid, - device->uuid); - BUG_ON(IS_ERR(new_device)); /* -ENOMEM */ - - /* Safe because we are under uuid_mutex */ - if (device->name) { - name = rcu_string_strdup(device->name->str, GFP_NOFS); - BUG_ON(!name); /* -ENOMEM */ - rcu_assign_pointer(new_device->name, name); + if (device->bdev) { + fs_devices->open_devices--; + device->bdev = NULL; } + clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); - list_replace_rcu(&device->dev_list, &new_device->dev_list); - new_device->fs_devices = device->fs_devices; + device->fs_info = NULL; + atomic_set(&device->dev_stats_ccnt, 0); + extent_io_tree_release(&device->alloc_state); - synchronize_rcu(); - btrfs_free_device(device); + /* Verify the device is back in a pristine state */ + ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)); + ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)); + ASSERT(list_empty(&device->dev_alloc_list)); + ASSERT(list_empty(&device->post_commit_list)); + ASSERT(atomic_read(&device->reada_in_flight) == 0); } static int close_fs_devices(struct btrfs_fs_devices *fs_devices) @@ -2130,7 +2172,6 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) { struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices; - WARN_ON(!tgtdev); mutex_lock(&fs_devices->device_list_mutex); btrfs_sysfs_rm_device_link(fs_devices, tgtdev); @@ -2875,6 +2916,7 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) { struct btrfs_root *root = fs_info->chunk_root; struct btrfs_trans_handle *trans; + struct btrfs_block_group *block_group; int ret; /* @@ -2898,6 +2940,12 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) if (ret) return ret; + block_group = btrfs_lookup_block_group(fs_info, chunk_offset); + if (!block_group) + return -ENOENT; + btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); + btrfs_put_block_group(block_group); + trans = btrfs_start_trans_remove_block_group(root->fs_info, chunk_offset); if (IS_ERR(trans)) { @@ -6111,75 +6159,6 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1); } -int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, - u64 physical, u64 **logical, int *naddrs, int *stripe_len) -{ - struct extent_map *em; - struct map_lookup *map; - u64 *buf; - u64 bytenr; - u64 length; - u64 stripe_nr; - u64 rmap_len; - int i, j, nr = 0; - - em = btrfs_get_chunk_map(fs_info, chunk_start, 1); - if (IS_ERR(em)) - return -EIO; - - map = em->map_lookup; - length = em->len; - rmap_len = map->stripe_len; - - if (map->type & BTRFS_BLOCK_GROUP_RAID10) - length = div_u64(length, map->num_stripes / map->sub_stripes); - else if (map->type & BTRFS_BLOCK_GROUP_RAID0) - length = div_u64(length, map->num_stripes); - else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - length = div_u64(length, nr_data_stripes(map)); - rmap_len = map->stripe_len * nr_data_stripes(map); - } - - buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS); - BUG_ON(!buf); /* -ENOMEM */ - - for (i = 0; i < map->num_stripes; i++) { - if (map->stripes[i].physical > physical || - map->stripes[i].physical + length <= physical) - continue; - - stripe_nr = physical - map->stripes[i].physical; - stripe_nr = div64_u64(stripe_nr, map->stripe_len); - - if (map->type & BTRFS_BLOCK_GROUP_RAID10) { - stripe_nr = stripe_nr * map->num_stripes + i; - stripe_nr = div_u64(stripe_nr, map->sub_stripes); - } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - stripe_nr = stripe_nr * map->num_stripes + i; - } /* else if RAID[56], multiply by nr_data_stripes(). - * Alternatively, just use rmap_len below instead of - * map->stripe_len */ - - bytenr = chunk_start + stripe_nr * rmap_len; - WARN_ON(nr >= map->num_stripes); - for (j = 0; j < nr; j++) { - if (buf[j] == bytenr) - break; - } - if (j == nr) { - WARN_ON(nr >= map->num_stripes); - buf[nr++] = bytenr; - } - } - - *logical = buf; - *naddrs = nr; - *stripe_len = rmap_len; - - free_extent_map(em); - return 0; -} - static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio) { bio->bi_private = bbio->private; @@ -6480,19 +6459,14 @@ static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes) { int index = btrfs_bg_flags_to_raid_index(type); int ncopies = btrfs_raid_array[index].ncopies; + const int nparity = btrfs_raid_array[index].nparity; int data_stripes; - switch (type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { - case BTRFS_BLOCK_GROUP_RAID5: - data_stripes = num_stripes - 1; - break; - case BTRFS_BLOCK_GROUP_RAID6: - data_stripes = num_stripes - 2; - break; - default: + if (nparity) + data_stripes = num_stripes - nparity; + else data_stripes = num_stripes / ncopies; - break; - } + return div_u64(chunk_len, data_stripes); } @@ -7331,6 +7305,8 @@ int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info, else btrfs_dev_stat_set(dev, i, 0); } + btrfs_info(fs_info, "device stats zeroed by %s (%d)", + current->comm, task_pid_nr(current)); } else { for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) if (stats->nr_items > i) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index fc1b564b9cfe..409f4816fb89 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -120,8 +120,6 @@ struct btrfs_device { /* per-device scrub information */ struct scrub_ctx *scrub_ctx; - struct btrfs_work work; - /* readahead state */ atomic_t reada_in_flight; u64 reada_next; @@ -138,6 +136,10 @@ struct btrfs_device { atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX]; struct extent_io_tree alloc_state; + + struct completion kobj_unregister; + /* For sysfs/FSID/devinfo/devid/ */ + struct kobject devid_kobj; }; /* @@ -168,7 +170,7 @@ btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \ write_seqcount_end(&dev->data_seqcount); \ preempt_enable(); \ } -#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT) +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) #define BTRFS_DEVICE_GETSET_FUNCS(name) \ static inline u64 \ btrfs_device_get_##name(const struct btrfs_device *dev) \ @@ -255,7 +257,7 @@ struct btrfs_fs_devices { struct btrfs_fs_info *fs_info; /* sysfs kobjects */ struct kobject fsid_kobj; - struct kobject *device_dir_kobj; + struct kobject *devices_kobj; struct completion kobj_unregister; }; @@ -417,8 +419,6 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, struct btrfs_bio **bbio_ret); int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 len, struct btrfs_io_geometry *io_geom); -int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, - u64 physical, u64 **logical, int *naddrs, int *stripe_len); int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type); diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index a6c90a003c12..05615a1099db 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -20,9 +20,13 @@ #include <linux/refcount.h> #include "compression.h" +/* workspace buffer size for s390 zlib hardware support */ +#define ZLIB_DFLTCC_BUF_SIZE (4 * PAGE_SIZE) + struct workspace { z_stream strm; char *buf; + unsigned int buf_size; struct list_head list; int level; }; @@ -61,7 +65,21 @@ struct list_head *zlib_alloc_workspace(unsigned int level) zlib_inflate_workspacesize()); workspace->strm.workspace = kvmalloc(workspacesize, GFP_KERNEL); workspace->level = level; - workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + workspace->buf = NULL; + /* + * In case of s390 zlib hardware support, allocate lager workspace + * buffer. If allocator fails, fall back to a single page buffer. + */ + if (zlib_deflate_dfltcc_enabled()) { + workspace->buf = kmalloc(ZLIB_DFLTCC_BUF_SIZE, + __GFP_NOMEMALLOC | __GFP_NORETRY | + __GFP_NOWARN | GFP_NOIO); + workspace->buf_size = ZLIB_DFLTCC_BUF_SIZE; + } + if (!workspace->buf) { + workspace->buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + workspace->buf_size = PAGE_SIZE; + } if (!workspace->strm.workspace || !workspace->buf) goto fail; @@ -85,6 +103,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, struct page *in_page = NULL; struct page *out_page = NULL; unsigned long bytes_left; + unsigned int in_buf_pages; unsigned long len = *total_out; unsigned long nr_dest_pages = *out_pages; const unsigned long max_out = nr_dest_pages * PAGE_SIZE; @@ -102,9 +121,6 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, workspace->strm.total_in = 0; workspace->strm.total_out = 0; - in_page = find_get_page(mapping, start >> PAGE_SHIFT); - data_in = kmap(in_page); - out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); if (out_page == NULL) { ret = -ENOMEM; @@ -114,12 +130,51 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, pages[0] = out_page; nr_pages = 1; - workspace->strm.next_in = data_in; + workspace->strm.next_in = workspace->buf; + workspace->strm.avail_in = 0; workspace->strm.next_out = cpage_out; workspace->strm.avail_out = PAGE_SIZE; - workspace->strm.avail_in = min(len, PAGE_SIZE); while (workspace->strm.total_in < len) { + /* + * Get next input pages and copy the contents to + * the workspace buffer if required. + */ + if (workspace->strm.avail_in == 0) { + bytes_left = len - workspace->strm.total_in; + in_buf_pages = min(DIV_ROUND_UP(bytes_left, PAGE_SIZE), + workspace->buf_size / PAGE_SIZE); + if (in_buf_pages > 1) { + int i; + + for (i = 0; i < in_buf_pages; i++) { + if (in_page) { + kunmap(in_page); + put_page(in_page); + } + in_page = find_get_page(mapping, + start >> PAGE_SHIFT); + data_in = kmap(in_page); + memcpy(workspace->buf + i * PAGE_SIZE, + data_in, PAGE_SIZE); + start += PAGE_SIZE; + } + workspace->strm.next_in = workspace->buf; + } else { + if (in_page) { + kunmap(in_page); + put_page(in_page); + } + in_page = find_get_page(mapping, + start >> PAGE_SHIFT); + data_in = kmap(in_page); + start += PAGE_SIZE; + workspace->strm.next_in = data_in; + } + workspace->strm.avail_in = min(bytes_left, + (unsigned long) workspace->buf_size); + } + ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH); if (ret != Z_OK) { pr_debug("BTRFS: deflate in loop returned %d\n", @@ -161,33 +216,43 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, /* we're all done */ if (workspace->strm.total_in >= len) break; - - /* we've read in a full page, get a new one */ - if (workspace->strm.avail_in == 0) { - if (workspace->strm.total_out > max_out) - break; - - bytes_left = len - workspace->strm.total_in; - kunmap(in_page); - put_page(in_page); - - start += PAGE_SIZE; - in_page = find_get_page(mapping, - start >> PAGE_SHIFT); - data_in = kmap(in_page); - workspace->strm.avail_in = min(bytes_left, - PAGE_SIZE); - workspace->strm.next_in = data_in; - } + if (workspace->strm.total_out > max_out) + break; } workspace->strm.avail_in = 0; - ret = zlib_deflate(&workspace->strm, Z_FINISH); - zlib_deflateEnd(&workspace->strm); - - if (ret != Z_STREAM_END) { - ret = -EIO; - goto out; + /* + * Call deflate with Z_FINISH flush parameter providing more output + * space but no more input data, until it returns with Z_STREAM_END. + */ + while (ret != Z_STREAM_END) { + ret = zlib_deflate(&workspace->strm, Z_FINISH); + if (ret == Z_STREAM_END) + break; + if (ret != Z_OK && ret != Z_BUF_ERROR) { + zlib_deflateEnd(&workspace->strm); + ret = -EIO; + goto out; + } else if (workspace->strm.avail_out == 0) { + /* get another page for the stream end */ + kunmap(out_page); + if (nr_pages == nr_dest_pages) { + out_page = NULL; + ret = -E2BIG; + goto out; + } + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (out_page == NULL) { + ret = -ENOMEM; + goto out; + } + cpage_out = kmap(out_page); + pages[nr_pages] = out_page; + nr_pages++; + workspace->strm.avail_out = PAGE_SIZE; + workspace->strm.next_out = cpage_out; + } } + zlib_deflateEnd(&workspace->strm); if (workspace->strm.total_out >= workspace->strm.total_in) { ret = -E2BIG; @@ -231,7 +296,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) workspace->strm.total_out = 0; workspace->strm.next_out = workspace->buf; - workspace->strm.avail_out = PAGE_SIZE; + workspace->strm.avail_out = workspace->buf_size; /* If it's deflate, and it's got no preset dictionary, then we can tell zlib to skip the adler32 check. */ @@ -270,7 +335,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) } workspace->strm.next_out = workspace->buf; - workspace->strm.avail_out = PAGE_SIZE; + workspace->strm.avail_out = workspace->buf_size; if (workspace->strm.avail_in == 0) { unsigned long tmp; @@ -320,7 +385,7 @@ int zlib_decompress(struct list_head *ws, unsigned char *data_in, workspace->strm.total_in = 0; workspace->strm.next_out = workspace->buf; - workspace->strm.avail_out = PAGE_SIZE; + workspace->strm.avail_out = workspace->buf_size; workspace->strm.total_out = 0; /* If it's deflate, and it's got no preset dictionary, then we can tell zlib to skip the adler32 check. */ @@ -364,7 +429,7 @@ int zlib_decompress(struct list_head *ws, unsigned char *data_in, buf_offset = 0; bytes = min(PAGE_SIZE - pg_offset, - PAGE_SIZE - buf_offset); + PAGE_SIZE - (buf_offset % PAGE_SIZE)); bytes = min(bytes, bytes_left); kaddr = kmap_atomic(dest_page); @@ -375,7 +440,7 @@ int zlib_decompress(struct list_head *ws, unsigned char *data_in, bytes_left -= bytes; next: workspace->strm.next_out = workspace->buf; - workspace->strm.avail_out = PAGE_SIZE; + workspace->strm.avail_out = workspace->buf_size; } if (ret != Z_STREAM_END && bytes_left != 0) diff --git a/fs/buffer.c b/fs/buffer.c index 18a87ec8a465..b8d28370cfd7 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1433,7 +1433,7 @@ static bool has_bh_in_lru(int cpu, void *dummy) void invalidate_bh_lrus(void) { - on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1, GFP_KERNEL); + on_each_cpu_cond(has_bh_in_lru, invalidate_bh_lru, NULL, 1); } EXPORT_SYMBOL_GPL(invalidate_bh_lrus); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 374db1bd57d1..145d46ba25ae 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -708,8 +708,10 @@ void ceph_mdsc_release_request(struct kref *kref) /* avoid calling iput_final() in mds dispatch threads */ ceph_async_iput(req->r_inode); } - if (req->r_parent) + if (req->r_parent) { ceph_put_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN); + ceph_async_iput(req->r_parent); + } ceph_async_iput(req->r_target_inode); if (req->r_dentry) dput(req->r_dentry); @@ -2676,8 +2678,10 @@ int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir, /* take CAP_PIN refs for r_inode, r_parent, r_old_dentry */ if (req->r_inode) ceph_get_cap_refs(ceph_inode(req->r_inode), CEPH_CAP_PIN); - if (req->r_parent) + if (req->r_parent) { ceph_get_cap_refs(ceph_inode(req->r_parent), CEPH_CAP_PIN); + ihold(req->r_parent); + } if (req->r_old_dentry_dir) ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), CEPH_CAP_PIN); diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 19f6e592b941..276e4b5ea8e0 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -611,12 +611,12 @@ static int cifs_stats_proc_open(struct inode *inode, struct file *file) return single_open(file, cifs_stats_proc_show, NULL); } -static const struct file_operations cifs_stats_proc_fops = { - .open = cifs_stats_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = cifs_stats_proc_write, +static const struct proc_ops cifs_stats_proc_ops = { + .proc_open = cifs_stats_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = cifs_stats_proc_write, }; #ifdef CONFIG_CIFS_SMB_DIRECT @@ -640,12 +640,12 @@ static int name##_open(struct inode *inode, struct file *file) \ return single_open(file, name##_proc_show, NULL); \ } \ \ -static const struct file_operations cifs_##name##_proc_fops = { \ - .open = name##_open, \ - .read = seq_read, \ - .llseek = seq_lseek, \ - .release = single_release, \ - .write = name##_write, \ +static const struct proc_ops cifs_##name##_proc_fops = { \ + .proc_open = name##_open, \ + .proc_read = seq_read, \ + .proc_lseek = seq_lseek, \ + .proc_release = single_release, \ + .proc_write = name##_write, \ } PROC_FILE_DEFINE(rdma_readwrite_threshold); @@ -659,11 +659,11 @@ PROC_FILE_DEFINE(smbd_receive_credit_max); #endif static struct proc_dir_entry *proc_fs_cifs; -static const struct file_operations cifsFYI_proc_fops; -static const struct file_operations cifs_lookup_cache_proc_fops; -static const struct file_operations traceSMB_proc_fops; -static const struct file_operations cifs_security_flags_proc_fops; -static const struct file_operations cifs_linux_ext_proc_fops; +static const struct proc_ops cifsFYI_proc_ops; +static const struct proc_ops cifs_lookup_cache_proc_ops; +static const struct proc_ops traceSMB_proc_ops; +static const struct proc_ops cifs_security_flags_proc_ops; +static const struct proc_ops cifs_linux_ext_proc_ops; void cifs_proc_init(void) @@ -678,18 +678,18 @@ cifs_proc_init(void) proc_create_single("open_files", 0400, proc_fs_cifs, cifs_debug_files_proc_show); - proc_create("Stats", 0644, proc_fs_cifs, &cifs_stats_proc_fops); - proc_create("cifsFYI", 0644, proc_fs_cifs, &cifsFYI_proc_fops); - proc_create("traceSMB", 0644, proc_fs_cifs, &traceSMB_proc_fops); + proc_create("Stats", 0644, proc_fs_cifs, &cifs_stats_proc_ops); + proc_create("cifsFYI", 0644, proc_fs_cifs, &cifsFYI_proc_ops); + proc_create("traceSMB", 0644, proc_fs_cifs, &traceSMB_proc_ops); proc_create("LinuxExtensionsEnabled", 0644, proc_fs_cifs, - &cifs_linux_ext_proc_fops); + &cifs_linux_ext_proc_ops); proc_create("SecurityFlags", 0644, proc_fs_cifs, - &cifs_security_flags_proc_fops); + &cifs_security_flags_proc_ops); proc_create("LookupCacheEnabled", 0644, proc_fs_cifs, - &cifs_lookup_cache_proc_fops); + &cifs_lookup_cache_proc_ops); #ifdef CONFIG_CIFS_DFS_UPCALL - proc_create("dfscache", 0644, proc_fs_cifs, &dfscache_proc_fops); + proc_create("dfscache", 0644, proc_fs_cifs, &dfscache_proc_ops); #endif #ifdef CONFIG_CIFS_SMB_DIRECT @@ -774,12 +774,12 @@ static ssize_t cifsFYI_proc_write(struct file *file, const char __user *buffer, return count; } -static const struct file_operations cifsFYI_proc_fops = { - .open = cifsFYI_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = cifsFYI_proc_write, +static const struct proc_ops cifsFYI_proc_ops = { + .proc_open = cifsFYI_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = cifsFYI_proc_write, }; static int cifs_linux_ext_proc_show(struct seq_file *m, void *v) @@ -805,12 +805,12 @@ static ssize_t cifs_linux_ext_proc_write(struct file *file, return count; } -static const struct file_operations cifs_linux_ext_proc_fops = { - .open = cifs_linux_ext_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = cifs_linux_ext_proc_write, +static const struct proc_ops cifs_linux_ext_proc_ops = { + .proc_open = cifs_linux_ext_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = cifs_linux_ext_proc_write, }; static int cifs_lookup_cache_proc_show(struct seq_file *m, void *v) @@ -836,12 +836,12 @@ static ssize_t cifs_lookup_cache_proc_write(struct file *file, return count; } -static const struct file_operations cifs_lookup_cache_proc_fops = { - .open = cifs_lookup_cache_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = cifs_lookup_cache_proc_write, +static const struct proc_ops cifs_lookup_cache_proc_ops = { + .proc_open = cifs_lookup_cache_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = cifs_lookup_cache_proc_write, }; static int traceSMB_proc_show(struct seq_file *m, void *v) @@ -867,12 +867,12 @@ static ssize_t traceSMB_proc_write(struct file *file, const char __user *buffer, return count; } -static const struct file_operations traceSMB_proc_fops = { - .open = traceSMB_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = traceSMB_proc_write, +static const struct proc_ops traceSMB_proc_ops = { + .proc_open = traceSMB_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = traceSMB_proc_write, }; static int cifs_security_flags_proc_show(struct seq_file *m, void *v) @@ -978,12 +978,12 @@ static ssize_t cifs_security_flags_proc_write(struct file *file, return count; } -static const struct file_operations cifs_security_flags_proc_fops = { - .open = cifs_security_flags_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = cifs_security_flags_proc_write, +static const struct proc_ops cifs_security_flags_proc_ops = { + .proc_open = cifs_security_flags_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = cifs_security_flags_proc_write, }; #else inline void cifs_proc_init(void) diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 41957b82d796..606f26d862dc 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -120,17 +120,17 @@ cifs_build_devname(char *nodename, const char *prepath) /** - * cifs_compose_mount_options - creates mount options for refferral + * cifs_compose_mount_options - creates mount options for referral * @sb_mountdata: parent/root DFS mount options (template) * @fullpath: full path in UNC format - * @ref: server's referral + * @ref: optional server's referral * @devname: optional pointer for saving device name * * creates mount options for submount based on template options sb_mountdata * and replacing unc,ip,prefixpath options with ones we've got form ref_unc. * * Returns: pointer to new mount options or ERR_PTR. - * Caller is responcible for freeing retunrned value if it is not error. + * Caller is responsible for freeing returned value if it is not error. */ char *cifs_compose_mount_options(const char *sb_mountdata, const char *fullpath, @@ -150,18 +150,27 @@ char *cifs_compose_mount_options(const char *sb_mountdata, if (sb_mountdata == NULL) return ERR_PTR(-EINVAL); - if (strlen(fullpath) - ref->path_consumed) { - prepath = fullpath + ref->path_consumed; - /* skip initial delimiter */ - if (*prepath == '/' || *prepath == '\\') - prepath++; - } + if (ref) { + if (strlen(fullpath) - ref->path_consumed) { + prepath = fullpath + ref->path_consumed; + /* skip initial delimiter */ + if (*prepath == '/' || *prepath == '\\') + prepath++; + } - name = cifs_build_devname(ref->node_name, prepath); - if (IS_ERR(name)) { - rc = PTR_ERR(name); - name = NULL; - goto compose_mount_options_err; + name = cifs_build_devname(ref->node_name, prepath); + if (IS_ERR(name)) { + rc = PTR_ERR(name); + name = NULL; + goto compose_mount_options_err; + } + } else { + name = cifs_build_devname((char *)fullpath, NULL); + if (IS_ERR(name)) { + rc = PTR_ERR(name); + name = NULL; + goto compose_mount_options_err; + } } rc = dns_resolve_server_name_to_ip(name, &srvIP); @@ -225,6 +234,8 @@ char *cifs_compose_mount_options(const char *sb_mountdata, if (devname) *devname = name; + else + kfree(name); /*cifs_dbg(FYI, "%s: parent mountdata: %s\n", __func__, sb_mountdata);*/ /*cifs_dbg(FYI, "%s: submount mountdata: %s\n", __func__, mountdata );*/ @@ -241,23 +252,23 @@ compose_mount_options_err: } /** - * cifs_dfs_do_refmount - mounts specified path using provided refferal + * cifs_dfs_do_mount - mounts specified path using DFS full path + * + * Always pass down @fullpath to smb3_do_mount() so we can use the root server + * to perform failover in case we failed to connect to the first target in the + * referral. + * * @cifs_sb: parent/root superblock * @fullpath: full path in UNC format - * @ref: server's referral */ -static struct vfsmount *cifs_dfs_do_refmount(struct dentry *mntpt, - struct cifs_sb_info *cifs_sb, - const char *fullpath, const struct dfs_info3_param *ref) +static struct vfsmount *cifs_dfs_do_mount(struct dentry *mntpt, + struct cifs_sb_info *cifs_sb, + const char *fullpath) { struct vfsmount *mnt; char *mountdata; char *devname; - /* - * Always pass down the DFS full path to smb3_do_mount() so we - * can use it later for failover. - */ devname = kstrndup(fullpath, strlen(fullpath), GFP_KERNEL); if (!devname) return ERR_PTR(-ENOMEM); @@ -266,7 +277,7 @@ static struct vfsmount *cifs_dfs_do_refmount(struct dentry *mntpt, /* strip first '\' from fullpath */ mountdata = cifs_compose_mount_options(cifs_sb->mountdata, - fullpath + 1, ref, NULL); + fullpath + 1, NULL, NULL); if (IS_ERR(mountdata)) { kfree(devname); return (struct vfsmount *)mountdata; @@ -278,28 +289,16 @@ static struct vfsmount *cifs_dfs_do_refmount(struct dentry *mntpt, return mnt; } -static void dump_referral(const struct dfs_info3_param *ref) -{ - cifs_dbg(FYI, "DFS: ref path: %s\n", ref->path_name); - cifs_dbg(FYI, "DFS: node path: %s\n", ref->node_name); - cifs_dbg(FYI, "DFS: fl: %d, srv_type: %d\n", - ref->flags, ref->server_type); - cifs_dbg(FYI, "DFS: ref_flags: %d, path_consumed: %d\n", - ref->ref_flag, ref->path_consumed); -} - /* * Create a vfsmount that we can automount */ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) { - struct dfs_info3_param referral = {0}; struct cifs_sb_info *cifs_sb; struct cifs_ses *ses; struct cifs_tcon *tcon; char *full_path, *root_path; unsigned int xid; - int len; int rc; struct vfsmount *mnt; @@ -357,7 +356,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) if (!rc) { rc = dfs_cache_find(xid, ses, cifs_sb->local_nls, cifs_remap(cifs_sb), full_path + 1, - &referral, NULL); + NULL, NULL); } free_xid(xid); @@ -366,26 +365,16 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) mnt = ERR_PTR(rc); goto free_root_path; } - - dump_referral(&referral); - - len = strlen(referral.node_name); - if (len < 2) { - cifs_dbg(VFS, "%s: Net Address path too short: %s\n", - __func__, referral.node_name); - mnt = ERR_PTR(-EINVAL); - goto free_dfs_ref; - } /* - * cifs_mount() will retry every available node server in case - * of failures. + * OK - we were able to get and cache a referral for @full_path. + * + * Now, pass it down to cifs_mount() and it will retry every available + * node server in case of failures - no need to do it here. */ - mnt = cifs_dfs_do_refmount(mntpt, cifs_sb, full_path, &referral); - cifs_dbg(FYI, "%s: cifs_dfs_do_refmount:%s , mnt:%p\n", __func__, - referral.node_name, mnt); + mnt = cifs_dfs_do_mount(mntpt, cifs_sb, full_path); + cifs_dbg(FYI, "%s: cifs_dfs_do_mount:%s , mnt:%p\n", __func__, + full_path + 1, mnt); -free_dfs_ref: - free_dfs_info_param(&referral); free_root_path: kfree(root_path); free_full_path: diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 96ae72b556ac..fb41e51dd574 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -802,6 +802,26 @@ static void parse_dacl(struct cifs_acl *pdacl, char *end_of_acl, return; } +unsigned int setup_authusers_ACE(struct cifs_ace *pntace) +{ + int i; + unsigned int ace_size = 20; + + pntace->type = ACCESS_ALLOWED_ACE_TYPE; + pntace->flags = 0x0; + pntace->access_req = cpu_to_le32(GENERIC_ALL); + pntace->sid.num_subauth = 1; + pntace->sid.revision = 1; + for (i = 0; i < NUM_AUTHS; i++) + pntace->sid.authority[i] = sid_authusers.authority[i]; + + pntace->sid.sub_auth[0] = sid_authusers.sub_auth[0]; + + /* size = 1 + 1 + 2 + 4 + 1 + 1 + 6 + (psid->num_subauth*4) */ + pntace->size = cpu_to_le16(ace_size); + return ace_size; +} + /* * Fill in the special SID based on the mode. See * http://technet.microsoft.com/en-us/library/hh509017(v=ws.10).aspx diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index b59dc7478130..b87456bae1a1 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -149,9 +149,12 @@ extern ssize_t cifs_file_copychunk_range(unsigned int xid, size_t len, unsigned int flags); extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); +extern void cifs_setsize(struct inode *inode, loff_t offset); +extern int cifs_truncate_page(struct address_space *mapping, loff_t from); + #ifdef CONFIG_CIFS_NFSD_EXPORT extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.24" +#define CIFS_VERSION "2.25" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 40705e862451..239338d57086 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1588,6 +1588,7 @@ struct mid_q_entry { mid_callback_t *callback; /* call completion callback */ mid_handle_t *handle; /* call handle mid callback */ void *callback_data; /* general purpose pointer for callback */ + struct task_struct *creator; void *resp_buf; /* pointer to received SMB header */ unsigned int resp_buf_size; int mid_state; /* wish this were enum but can not pass to wait_event */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 9c229408a251..948bf3474db1 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -213,6 +213,7 @@ extern struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *, const struct cifs_fid *, u32 *); extern int set_cifs_acl(struct cifs_ntsd *, __u32, struct inode *, const char *, int); +extern unsigned int setup_authusers_ACE(struct cifs_ace *pace); extern unsigned int setup_special_mode_ACE(struct cifs_ace *pace, __u64 nmode); extern void dequeue_mid(struct mid_q_entry *mid, bool malformed); @@ -596,6 +597,9 @@ bool is_ses_using_iface(struct cifs_ses *ses, struct cifs_server_iface *iface); void extract_unc_hostname(const char *unc, const char **h, size_t *len); int copy_path_name(char *dst, const char *src); +int smb2_parse_query_directory(struct cifs_tcon *tcon, struct kvec *rsp_iov, + int resp_buftype, + struct cifs_search_info *srch_inf); #ifdef CONFIG_CIFS_DFS_UPCALL static inline int get_dfs_path(const unsigned int xid, struct cifs_ses *ses, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index cc86a67225d1..a481296f417f 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -4619,7 +4619,7 @@ findFirstRetry: psrch_inf->unicode = false; psrch_inf->ntwrk_buf_start = (char *)pSMBr; - psrch_inf->smallBuf = 0; + psrch_inf->smallBuf = false; psrch_inf->srch_entries_start = (char *) &pSMBr->hdr.Protocol + le16_to_cpu(pSMBr->t2.DataOffset); @@ -4753,7 +4753,7 @@ int CIFSFindNext(const unsigned int xid, struct cifs_tcon *tcon, cifs_buf_release(psrch_inf->ntwrk_buf_start); psrch_inf->srch_entries_start = response_data; psrch_inf->ntwrk_buf_start = (char *)pSMB; - psrch_inf->smallBuf = 0; + psrch_inf->smallBuf = false; if (parms->EndofSearch) psrch_inf->endOfSearch = true; else diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 05ea0e2b7e0e..0aa3623ae0e1 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3709,8 +3709,10 @@ match_prepath(struct super_block *sb, struct cifs_mnt_data *mnt_data) { struct cifs_sb_info *old = CIFS_SB(sb); struct cifs_sb_info *new = mnt_data->cifs_sb; - bool old_set = old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH; - bool new_set = new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH; + bool old_set = (old->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) && + old->prepath; + bool new_set = (new->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) && + new->prepath; if (old_set && new_set && !strcmp(new->prepath, old->prepath)) return 1; diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index 2faa05860a48..43c1b43a07ec 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -5,11 +5,10 @@ * Copyright (c) 2018-2019 Paulo Alcantara <palcantara@suse.de> */ -#include <linux/rcupdate.h> -#include <linux/rculist.h> #include <linux/jhash.h> #include <linux/ktime.h> #include <linux/slab.h> +#include <linux/proc_fs.h> #include <linux/nls.h> #include <linux/workqueue.h> #include "cifsglob.h" @@ -22,67 +21,68 @@ #include "dfs_cache.h" -#define DFS_CACHE_HTABLE_SIZE 32 -#define DFS_CACHE_MAX_ENTRIES 64 +#define CACHE_HTABLE_SIZE 32 +#define CACHE_MAX_ENTRIES 64 #define IS_INTERLINK_SET(v) ((v) & (DFSREF_REFERRAL_SERVER | \ DFSREF_STORAGE_SERVER)) -struct dfs_cache_tgt { - char *t_name; - struct list_head t_list; +struct cache_dfs_tgt { + char *name; + struct list_head list; }; -struct dfs_cache_entry { - struct hlist_node ce_hlist; - const char *ce_path; - int ce_ttl; - int ce_srvtype; - int ce_flags; - struct timespec64 ce_etime; - int ce_path_consumed; - int ce_numtgts; - struct list_head ce_tlist; - struct dfs_cache_tgt *ce_tgthint; - struct rcu_head ce_rcu; +struct cache_entry { + struct hlist_node hlist; + const char *path; + int ttl; + int srvtype; + int flags; + struct timespec64 etime; + int path_consumed; + int numtgts; + struct list_head tlist; + struct cache_dfs_tgt *tgthint; }; -static struct kmem_cache *dfs_cache_slab __read_mostly; - -struct dfs_cache_vol_info { - char *vi_fullpath; - struct smb_vol vi_vol; - char *vi_mntdata; - struct list_head vi_list; +struct vol_info { + char *fullpath; + spinlock_t smb_vol_lock; + struct smb_vol smb_vol; + char *mntdata; + struct list_head list; + struct list_head rlist; + struct kref refcnt; }; -struct dfs_cache { - struct mutex dc_lock; - struct nls_table *dc_nlsc; - struct list_head dc_vol_list; - int dc_ttl; - struct delayed_work dc_refresh; -}; +static struct kmem_cache *cache_slab __read_mostly; +static struct workqueue_struct *dfscache_wq __read_mostly; -static struct dfs_cache dfs_cache; +static int cache_ttl; +static DEFINE_SPINLOCK(cache_ttl_lock); + +static struct nls_table *cache_nlsc; /* * Number of entries in the cache */ -static size_t dfs_cache_count; +static atomic_t cache_count; + +static struct hlist_head cache_htable[CACHE_HTABLE_SIZE]; +static DECLARE_RWSEM(htable_rw_lock); -static DEFINE_MUTEX(dfs_cache_list_lock); -static struct hlist_head dfs_cache_htable[DFS_CACHE_HTABLE_SIZE]; +static LIST_HEAD(vol_list); +static DEFINE_SPINLOCK(vol_list_lock); static void refresh_cache_worker(struct work_struct *work); -static inline bool is_path_valid(const char *path) -{ - return path && (strchr(path + 1, '\\') || strchr(path + 1, '/')); -} +static DECLARE_DELAYED_WORK(refresh_task, refresh_cache_worker); -static inline int get_normalized_path(const char *path, char **npath) +static int get_normalized_path(const char *path, char **npath) { + if (!path || strlen(path) < 3 || (*path != '\\' && *path != '/')) + return -EINVAL; + if (*path == '\\') { *npath = (char *)path; } else { @@ -100,57 +100,48 @@ static inline void free_normalized_path(const char *path, char *npath) kfree(npath); } -static inline bool cache_entry_expired(const struct dfs_cache_entry *ce) +static inline bool cache_entry_expired(const struct cache_entry *ce) { struct timespec64 ts; ktime_get_coarse_real_ts64(&ts); - return timespec64_compare(&ts, &ce->ce_etime) >= 0; + return timespec64_compare(&ts, &ce->etime) >= 0; } -static inline void free_tgts(struct dfs_cache_entry *ce) +static inline void free_tgts(struct cache_entry *ce) { - struct dfs_cache_tgt *t, *n; + struct cache_dfs_tgt *t, *n; - list_for_each_entry_safe(t, n, &ce->ce_tlist, t_list) { - list_del(&t->t_list); - kfree(t->t_name); + list_for_each_entry_safe(t, n, &ce->tlist, list) { + list_del(&t->list); + kfree(t->name); kfree(t); } } -static void free_cache_entry(struct rcu_head *rcu) +static inline void flush_cache_ent(struct cache_entry *ce) { - struct dfs_cache_entry *ce = container_of(rcu, struct dfs_cache_entry, - ce_rcu); - kmem_cache_free(dfs_cache_slab, ce); -} - -static inline void flush_cache_ent(struct dfs_cache_entry *ce) -{ - if (hlist_unhashed(&ce->ce_hlist)) - return; - - hlist_del_init_rcu(&ce->ce_hlist); - kfree_const(ce->ce_path); + hlist_del_init(&ce->hlist); + kfree(ce->path); free_tgts(ce); - dfs_cache_count--; - call_rcu(&ce->ce_rcu, free_cache_entry); + atomic_dec(&cache_count); + kmem_cache_free(cache_slab, ce); } static void flush_cache_ents(void) { int i; - rcu_read_lock(); - for (i = 0; i < DFS_CACHE_HTABLE_SIZE; i++) { - struct hlist_head *l = &dfs_cache_htable[i]; - struct dfs_cache_entry *ce; + for (i = 0; i < CACHE_HTABLE_SIZE; i++) { + struct hlist_head *l = &cache_htable[i]; + struct hlist_node *n; + struct cache_entry *ce; - hlist_for_each_entry_rcu(ce, l, ce_hlist) - flush_cache_ent(ce); + hlist_for_each_entry_safe(ce, n, l, hlist) { + if (!hlist_unhashed(&ce->hlist)) + flush_cache_ent(ce); + } } - rcu_read_unlock(); } /* @@ -158,36 +149,39 @@ static void flush_cache_ents(void) */ static int dfscache_proc_show(struct seq_file *m, void *v) { - int bucket; - struct dfs_cache_entry *ce; - struct dfs_cache_tgt *t; + int i; + struct cache_entry *ce; + struct cache_dfs_tgt *t; seq_puts(m, "DFS cache\n---------\n"); - mutex_lock(&dfs_cache_list_lock); - - rcu_read_lock(); - hash_for_each_rcu(dfs_cache_htable, bucket, ce, ce_hlist) { - seq_printf(m, - "cache entry: path=%s,type=%s,ttl=%d,etime=%ld," - "interlink=%s,path_consumed=%d,expired=%s\n", - ce->ce_path, - ce->ce_srvtype == DFS_TYPE_ROOT ? "root" : "link", - ce->ce_ttl, ce->ce_etime.tv_nsec, - IS_INTERLINK_SET(ce->ce_flags) ? "yes" : "no", - ce->ce_path_consumed, - cache_entry_expired(ce) ? "yes" : "no"); - - list_for_each_entry(t, &ce->ce_tlist, t_list) { - seq_printf(m, " %s%s\n", - t->t_name, - ce->ce_tgthint == t ? " (target hint)" : ""); + down_read(&htable_rw_lock); + for (i = 0; i < CACHE_HTABLE_SIZE; i++) { + struct hlist_head *l = &cache_htable[i]; + + hlist_for_each_entry(ce, l, hlist) { + if (hlist_unhashed(&ce->hlist)) + continue; + + seq_printf(m, + "cache entry: path=%s,type=%s,ttl=%d,etime=%ld," + "interlink=%s,path_consumed=%d,expired=%s\n", + ce->path, + ce->srvtype == DFS_TYPE_ROOT ? "root" : "link", + ce->ttl, ce->etime.tv_nsec, + IS_INTERLINK_SET(ce->flags) ? "yes" : "no", + ce->path_consumed, + cache_entry_expired(ce) ? "yes" : "no"); + + list_for_each_entry(t, &ce->tlist, list) { + seq_printf(m, " %s%s\n", + t->name, + ce->tgthint == t ? " (target hint)" : ""); + } } - } - rcu_read_unlock(); + up_read(&htable_rw_lock); - mutex_unlock(&dfs_cache_list_lock); return 0; } @@ -205,9 +199,10 @@ static ssize_t dfscache_proc_write(struct file *file, const char __user *buffer, return -EINVAL; cifs_dbg(FYI, "clearing dfs cache"); - mutex_lock(&dfs_cache_list_lock); + + down_write(&htable_rw_lock); flush_cache_ents(); - mutex_unlock(&dfs_cache_list_lock); + up_write(&htable_rw_lock); return count; } @@ -217,34 +212,34 @@ static int dfscache_proc_open(struct inode *inode, struct file *file) return single_open(file, dfscache_proc_show, NULL); } -const struct file_operations dfscache_proc_fops = { - .open = dfscache_proc_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, - .write = dfscache_proc_write, +const struct proc_ops dfscache_proc_ops = { + .proc_open = dfscache_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + .proc_write = dfscache_proc_write, }; #ifdef CONFIG_CIFS_DEBUG2 -static inline void dump_tgts(const struct dfs_cache_entry *ce) +static inline void dump_tgts(const struct cache_entry *ce) { - struct dfs_cache_tgt *t; + struct cache_dfs_tgt *t; cifs_dbg(FYI, "target list:\n"); - list_for_each_entry(t, &ce->ce_tlist, t_list) { - cifs_dbg(FYI, " %s%s\n", t->t_name, - ce->ce_tgthint == t ? " (target hint)" : ""); + list_for_each_entry(t, &ce->tlist, list) { + cifs_dbg(FYI, " %s%s\n", t->name, + ce->tgthint == t ? " (target hint)" : ""); } } -static inline void dump_ce(const struct dfs_cache_entry *ce) +static inline void dump_ce(const struct cache_entry *ce) { cifs_dbg(FYI, "cache entry: path=%s,type=%s,ttl=%d,etime=%ld," - "interlink=%s,path_consumed=%d,expired=%s\n", ce->ce_path, - ce->ce_srvtype == DFS_TYPE_ROOT ? "root" : "link", ce->ce_ttl, - ce->ce_etime.tv_nsec, - IS_INTERLINK_SET(ce->ce_flags) ? "yes" : "no", - ce->ce_path_consumed, + "interlink=%s,path_consumed=%d,expired=%s\n", ce->path, + ce->srvtype == DFS_TYPE_ROOT ? "root" : "link", ce->ttl, + ce->etime.tv_nsec, + IS_INTERLINK_SET(ce->flags) ? "yes" : "no", + ce->path_consumed, cache_entry_expired(ce) ? "yes" : "no"); dump_tgts(ce); } @@ -284,25 +279,34 @@ static inline void dump_refs(const struct dfs_info3_param *refs, int numrefs) */ int dfs_cache_init(void) { + int rc; int i; - dfs_cache_slab = kmem_cache_create("cifs_dfs_cache", - sizeof(struct dfs_cache_entry), 0, - SLAB_HWCACHE_ALIGN, NULL); - if (!dfs_cache_slab) + dfscache_wq = alloc_workqueue("cifs-dfscache", + WQ_FREEZABLE | WQ_MEM_RECLAIM, 1); + if (!dfscache_wq) return -ENOMEM; - for (i = 0; i < DFS_CACHE_HTABLE_SIZE; i++) - INIT_HLIST_HEAD(&dfs_cache_htable[i]); + cache_slab = kmem_cache_create("cifs_dfs_cache", + sizeof(struct cache_entry), 0, + SLAB_HWCACHE_ALIGN, NULL); + if (!cache_slab) { + rc = -ENOMEM; + goto out_destroy_wq; + } + + for (i = 0; i < CACHE_HTABLE_SIZE; i++) + INIT_HLIST_HEAD(&cache_htable[i]); - INIT_LIST_HEAD(&dfs_cache.dc_vol_list); - mutex_init(&dfs_cache.dc_lock); - INIT_DELAYED_WORK(&dfs_cache.dc_refresh, refresh_cache_worker); - dfs_cache.dc_ttl = -1; - dfs_cache.dc_nlsc = load_nls_default(); + atomic_set(&cache_count, 0); + cache_nlsc = load_nls_default(); cifs_dbg(FYI, "%s: initialized DFS referral cache\n", __func__); return 0; + +out_destroy_wq: + destroy_workqueue(dfscache_wq); + return rc; } static inline unsigned int cache_entry_hash(const void *data, int size) @@ -310,7 +314,7 @@ static inline unsigned int cache_entry_hash(const void *data, int size) unsigned int h; h = jhash(data, size, 0); - return h & (DFS_CACHE_HTABLE_SIZE - 1); + return h & (CACHE_HTABLE_SIZE - 1); } /* Check whether second path component of @path is SYSVOL or NETLOGON */ @@ -325,11 +329,11 @@ static inline bool is_sysvol_or_netlogon(const char *path) } /* Return target hint of a DFS cache entry */ -static inline char *get_tgt_name(const struct dfs_cache_entry *ce) +static inline char *get_tgt_name(const struct cache_entry *ce) { - struct dfs_cache_tgt *t = ce->ce_tgthint; + struct cache_dfs_tgt *t = ce->tgthint; - return t ? t->t_name : ERR_PTR(-ENOENT); + return t ? t->name : ERR_PTR(-ENOENT); } /* Return expire time out of a new entry's TTL */ @@ -346,19 +350,19 @@ static inline struct timespec64 get_expire_time(int ttl) } /* Allocate a new DFS target */ -static inline struct dfs_cache_tgt *alloc_tgt(const char *name) +static struct cache_dfs_tgt *alloc_target(const char *name) { - struct dfs_cache_tgt *t; + struct cache_dfs_tgt *t; - t = kmalloc(sizeof(*t), GFP_KERNEL); + t = kmalloc(sizeof(*t), GFP_ATOMIC); if (!t) return ERR_PTR(-ENOMEM); - t->t_name = kstrndup(name, strlen(name), GFP_KERNEL); - if (!t->t_name) { + t->name = kstrndup(name, strlen(name), GFP_ATOMIC); + if (!t->name) { kfree(t); return ERR_PTR(-ENOMEM); } - INIT_LIST_HEAD(&t->t_list); + INIT_LIST_HEAD(&t->list); return t; } @@ -367,180 +371,184 @@ static inline struct dfs_cache_tgt *alloc_tgt(const char *name) * target hint. */ static int copy_ref_data(const struct dfs_info3_param *refs, int numrefs, - struct dfs_cache_entry *ce, const char *tgthint) + struct cache_entry *ce, const char *tgthint) { int i; - ce->ce_ttl = refs[0].ttl; - ce->ce_etime = get_expire_time(ce->ce_ttl); - ce->ce_srvtype = refs[0].server_type; - ce->ce_flags = refs[0].ref_flag; - ce->ce_path_consumed = refs[0].path_consumed; + ce->ttl = refs[0].ttl; + ce->etime = get_expire_time(ce->ttl); + ce->srvtype = refs[0].server_type; + ce->flags = refs[0].ref_flag; + ce->path_consumed = refs[0].path_consumed; for (i = 0; i < numrefs; i++) { - struct dfs_cache_tgt *t; + struct cache_dfs_tgt *t; - t = alloc_tgt(refs[i].node_name); + t = alloc_target(refs[i].node_name); if (IS_ERR(t)) { free_tgts(ce); return PTR_ERR(t); } - if (tgthint && !strcasecmp(t->t_name, tgthint)) { - list_add(&t->t_list, &ce->ce_tlist); + if (tgthint && !strcasecmp(t->name, tgthint)) { + list_add(&t->list, &ce->tlist); tgthint = NULL; } else { - list_add_tail(&t->t_list, &ce->ce_tlist); + list_add_tail(&t->list, &ce->tlist); } - ce->ce_numtgts++; + ce->numtgts++; } - ce->ce_tgthint = list_first_entry_or_null(&ce->ce_tlist, - struct dfs_cache_tgt, t_list); + ce->tgthint = list_first_entry_or_null(&ce->tlist, + struct cache_dfs_tgt, list); return 0; } /* Allocate a new cache entry */ -static struct dfs_cache_entry * -alloc_cache_entry(const char *path, const struct dfs_info3_param *refs, - int numrefs) +static struct cache_entry *alloc_cache_entry(const char *path, + const struct dfs_info3_param *refs, + int numrefs) { - struct dfs_cache_entry *ce; + struct cache_entry *ce; int rc; - ce = kmem_cache_zalloc(dfs_cache_slab, GFP_KERNEL); + ce = kmem_cache_zalloc(cache_slab, GFP_KERNEL); if (!ce) return ERR_PTR(-ENOMEM); - ce->ce_path = kstrdup_const(path, GFP_KERNEL); - if (!ce->ce_path) { - kmem_cache_free(dfs_cache_slab, ce); + ce->path = kstrndup(path, strlen(path), GFP_KERNEL); + if (!ce->path) { + kmem_cache_free(cache_slab, ce); return ERR_PTR(-ENOMEM); } - INIT_HLIST_NODE(&ce->ce_hlist); - INIT_LIST_HEAD(&ce->ce_tlist); + INIT_HLIST_NODE(&ce->hlist); + INIT_LIST_HEAD(&ce->tlist); rc = copy_ref_data(refs, numrefs, ce, NULL); if (rc) { - kfree_const(ce->ce_path); - kmem_cache_free(dfs_cache_slab, ce); + kfree(ce->path); + kmem_cache_free(cache_slab, ce); ce = ERR_PTR(rc); } return ce; } +/* Must be called with htable_rw_lock held */ static void remove_oldest_entry(void) { - int bucket; - struct dfs_cache_entry *ce; - struct dfs_cache_entry *to_del = NULL; - - rcu_read_lock(); - hash_for_each_rcu(dfs_cache_htable, bucket, ce, ce_hlist) { - if (!to_del || timespec64_compare(&ce->ce_etime, - &to_del->ce_etime) < 0) - to_del = ce; + int i; + struct cache_entry *ce; + struct cache_entry *to_del = NULL; + + for (i = 0; i < CACHE_HTABLE_SIZE; i++) { + struct hlist_head *l = &cache_htable[i]; + + hlist_for_each_entry(ce, l, hlist) { + if (hlist_unhashed(&ce->hlist)) + continue; + if (!to_del || timespec64_compare(&ce->etime, + &to_del->etime) < 0) + to_del = ce; + } } + if (!to_del) { cifs_dbg(FYI, "%s: no entry to remove", __func__); - goto out; + return; } + cifs_dbg(FYI, "%s: removing entry", __func__); dump_ce(to_del); flush_cache_ent(to_del); -out: - rcu_read_unlock(); } /* Add a new DFS cache entry */ -static inline struct dfs_cache_entry * -add_cache_entry(unsigned int hash, const char *path, - const struct dfs_info3_param *refs, int numrefs) +static int add_cache_entry(const char *path, unsigned int hash, + struct dfs_info3_param *refs, int numrefs) { - struct dfs_cache_entry *ce; + struct cache_entry *ce; ce = alloc_cache_entry(path, refs, numrefs); if (IS_ERR(ce)) - return ce; + return PTR_ERR(ce); - hlist_add_head_rcu(&ce->ce_hlist, &dfs_cache_htable[hash]); - - mutex_lock(&dfs_cache.dc_lock); - if (dfs_cache.dc_ttl < 0) { - dfs_cache.dc_ttl = ce->ce_ttl; - queue_delayed_work(cifsiod_wq, &dfs_cache.dc_refresh, - dfs_cache.dc_ttl * HZ); + spin_lock(&cache_ttl_lock); + if (!cache_ttl) { + cache_ttl = ce->ttl; + queue_delayed_work(dfscache_wq, &refresh_task, cache_ttl * HZ); } else { - dfs_cache.dc_ttl = min_t(int, dfs_cache.dc_ttl, ce->ce_ttl); - mod_delayed_work(cifsiod_wq, &dfs_cache.dc_refresh, - dfs_cache.dc_ttl * HZ); + cache_ttl = min_t(int, cache_ttl, ce->ttl); + mod_delayed_work(dfscache_wq, &refresh_task, cache_ttl * HZ); } - mutex_unlock(&dfs_cache.dc_lock); + spin_unlock(&cache_ttl_lock); - return ce; + down_write(&htable_rw_lock); + hlist_add_head(&ce->hlist, &cache_htable[hash]); + dump_ce(ce); + up_write(&htable_rw_lock); + + return 0; } -static struct dfs_cache_entry *__find_cache_entry(unsigned int hash, - const char *path) +/* + * Find a DFS cache entry in hash table and optionally check prefix path against + * @path. + * Use whole path components in the match. + * Must be called with htable_rw_lock held. + * + * Return ERR_PTR(-ENOENT) if the entry is not found. + */ +static struct cache_entry *lookup_cache_entry(const char *path, + unsigned int *hash) { - struct dfs_cache_entry *ce; + struct cache_entry *ce; + unsigned int h; bool found = false; - rcu_read_lock(); - hlist_for_each_entry_rcu(ce, &dfs_cache_htable[hash], ce_hlist) { - if (!strcasecmp(path, ce->ce_path)) { -#ifdef CONFIG_CIFS_DEBUG2 - char *name = get_tgt_name(ce); + h = cache_entry_hash(path, strlen(path)); - if (IS_ERR(name)) { - rcu_read_unlock(); - return ERR_CAST(name); - } - cifs_dbg(FYI, "%s: cache hit\n", __func__); - cifs_dbg(FYI, "%s: target hint: %s\n", __func__, name); -#endif + hlist_for_each_entry(ce, &cache_htable[h], hlist) { + if (!strcasecmp(path, ce->path)) { found = true; + dump_ce(ce); break; } } - rcu_read_unlock(); - return found ? ce : ERR_PTR(-ENOENT); -} -/* - * Find a DFS cache entry in hash table and optionally check prefix path against - * @path. - * Use whole path components in the match. - * Return ERR_PTR(-ENOENT) if the entry is not found. - */ -static inline struct dfs_cache_entry *find_cache_entry(const char *path, - unsigned int *hash) -{ - *hash = cache_entry_hash(path, strlen(path)); - return __find_cache_entry(*hash, path); + if (!found) + ce = ERR_PTR(-ENOENT); + if (hash) + *hash = h; + + return ce; } -static inline void destroy_slab_cache(void) +static void __vol_release(struct vol_info *vi) { - rcu_barrier(); - kmem_cache_destroy(dfs_cache_slab); + kfree(vi->fullpath); + kfree(vi->mntdata); + cifs_cleanup_volume_info_contents(&vi->smb_vol); + kfree(vi); } -static inline void free_vol(struct dfs_cache_vol_info *vi) +static void vol_release(struct kref *kref) { - list_del(&vi->vi_list); - kfree(vi->vi_fullpath); - kfree(vi->vi_mntdata); - cifs_cleanup_volume_info_contents(&vi->vi_vol); - kfree(vi); + struct vol_info *vi = container_of(kref, struct vol_info, refcnt); + + spin_lock(&vol_list_lock); + list_del(&vi->list); + spin_unlock(&vol_list_lock); + __vol_release(vi); } static inline void free_vol_list(void) { - struct dfs_cache_vol_info *vi, *nvi; + struct vol_info *vi, *nvi; - list_for_each_entry_safe(vi, nvi, &dfs_cache.dc_vol_list, vi_list) - free_vol(vi); + list_for_each_entry_safe(vi, nvi, &vol_list, list) { + list_del_init(&vi->list); + __vol_release(vi); + } } /** @@ -548,83 +556,78 @@ static inline void free_vol_list(void) */ void dfs_cache_destroy(void) { - cancel_delayed_work_sync(&dfs_cache.dc_refresh); - unload_nls(dfs_cache.dc_nlsc); + cancel_delayed_work_sync(&refresh_task); + unload_nls(cache_nlsc); free_vol_list(); - mutex_destroy(&dfs_cache.dc_lock); - flush_cache_ents(); - destroy_slab_cache(); - mutex_destroy(&dfs_cache_list_lock); + kmem_cache_destroy(cache_slab); + destroy_workqueue(dfscache_wq); cifs_dbg(FYI, "%s: destroyed DFS referral cache\n", __func__); } -static inline struct dfs_cache_entry * -__update_cache_entry(const char *path, const struct dfs_info3_param *refs, - int numrefs) +/* Must be called with htable_rw_lock held */ +static int __update_cache_entry(const char *path, + const struct dfs_info3_param *refs, + int numrefs) { int rc; - unsigned int h; - struct dfs_cache_entry *ce; + struct cache_entry *ce; char *s, *th = NULL; - ce = find_cache_entry(path, &h); + ce = lookup_cache_entry(path, NULL); if (IS_ERR(ce)) - return ce; + return PTR_ERR(ce); - if (ce->ce_tgthint) { - s = ce->ce_tgthint->t_name; - th = kstrndup(s, strlen(s), GFP_KERNEL); + if (ce->tgthint) { + s = ce->tgthint->name; + th = kstrndup(s, strlen(s), GFP_ATOMIC); if (!th) - return ERR_PTR(-ENOMEM); + return -ENOMEM; } free_tgts(ce); - ce->ce_numtgts = 0; + ce->numtgts = 0; rc = copy_ref_data(refs, numrefs, ce, th); - kfree(th); - if (rc) - ce = ERR_PTR(rc); + kfree(th); - return ce; + return rc; } -/* Update an expired cache entry by getting a new DFS referral from server */ -static struct dfs_cache_entry * -update_cache_entry(const unsigned int xid, struct cifs_ses *ses, - const struct nls_table *nls_codepage, int remap, - const char *path, struct dfs_cache_entry *ce) +static int get_dfs_referral(const unsigned int xid, struct cifs_ses *ses, + const struct nls_table *nls_codepage, int remap, + const char *path, struct dfs_info3_param **refs, + int *numrefs) { - int rc; - struct dfs_info3_param *refs = NULL; - int numrefs = 0; + cifs_dbg(FYI, "%s: get an DFS referral for %s\n", __func__, path); - cifs_dbg(FYI, "%s: update expired cache entry\n", __func__); - /* - * Check if caller provided enough parameters to update an expired - * entry. - */ if (!ses || !ses->server || !ses->server->ops->get_dfs_refer) - return ERR_PTR(-ETIME); + return -EOPNOTSUPP; if (unlikely(!nls_codepage)) - return ERR_PTR(-ETIME); + return -EINVAL; - cifs_dbg(FYI, "%s: DFS referral request for %s\n", __func__, path); + *refs = NULL; + *numrefs = 0; - rc = ses->server->ops->get_dfs_refer(xid, ses, path, &refs, &numrefs, - nls_codepage, remap); - if (rc) - ce = ERR_PTR(rc); - else - ce = __update_cache_entry(path, refs, numrefs); + return ses->server->ops->get_dfs_refer(xid, ses, path, refs, numrefs, + nls_codepage, remap); +} - dump_refs(refs, numrefs); - free_dfs_info_array(refs, numrefs); +/* Update an expired cache entry by getting a new DFS referral from server */ +static int update_cache_entry(const char *path, + const struct dfs_info3_param *refs, + int numrefs) +{ - return ce; + int rc; + + down_write(&htable_rw_lock); + rc = __update_cache_entry(path, refs, numrefs); + up_write(&htable_rw_lock); + + return rc; } /* @@ -636,95 +639,86 @@ update_cache_entry(const unsigned int xid, struct cifs_ses *ses, * For interlinks, __cifs_dfs_mount() and expand_dfs_referral() are supposed to * handle them properly. */ -static struct dfs_cache_entry * -do_dfs_cache_find(const unsigned int xid, struct cifs_ses *ses, - const struct nls_table *nls_codepage, int remap, - const char *path, bool noreq) +static int __dfs_cache_find(const unsigned int xid, struct cifs_ses *ses, + const struct nls_table *nls_codepage, int remap, + const char *path, bool noreq) { int rc; - unsigned int h; - struct dfs_cache_entry *ce; - struct dfs_info3_param *nrefs; - int numnrefs; + unsigned int hash; + struct cache_entry *ce; + struct dfs_info3_param *refs = NULL; + int numrefs = 0; + bool newent = false; cifs_dbg(FYI, "%s: search path: %s\n", __func__, path); - ce = find_cache_entry(path, &h); - if (IS_ERR(ce)) { - cifs_dbg(FYI, "%s: cache miss\n", __func__); - /* - * If @noreq is set, no requests will be sent to the server for - * either updating or getting a new DFS referral. - */ - if (noreq) - return ce; - /* - * No cache entry was found, so check for valid parameters that - * will be required to get a new DFS referral and then create a - * new cache entry. - */ - if (!ses || !ses->server || !ses->server->ops->get_dfs_refer) { - ce = ERR_PTR(-EOPNOTSUPP); - return ce; - } - if (unlikely(!nls_codepage)) { - ce = ERR_PTR(-EINVAL); - return ce; - } + down_read(&htable_rw_lock); - nrefs = NULL; - numnrefs = 0; + ce = lookup_cache_entry(path, &hash); - cifs_dbg(FYI, "%s: DFS referral request for %s\n", __func__, - path); + /* + * If @noreq is set, no requests will be sent to the server. Just return + * the cache entry. + */ + if (noreq) { + up_read(&htable_rw_lock); + return PTR_ERR_OR_ZERO(ce); + } - rc = ses->server->ops->get_dfs_refer(xid, ses, path, &nrefs, - &numnrefs, nls_codepage, - remap); - if (rc) { - ce = ERR_PTR(rc); - return ce; + if (!IS_ERR(ce)) { + if (!cache_entry_expired(ce)) { + dump_ce(ce); + up_read(&htable_rw_lock); + return 0; } + } else { + newent = true; + } - dump_refs(nrefs, numnrefs); + up_read(&htable_rw_lock); - cifs_dbg(FYI, "%s: new cache entry\n", __func__); + /* + * No entry was found. + * + * Request a new DFS referral in order to create a new cache entry, or + * updating an existing one. + */ + rc = get_dfs_referral(xid, ses, nls_codepage, remap, path, + &refs, &numrefs); + if (rc) + return rc; - if (dfs_cache_count >= DFS_CACHE_MAX_ENTRIES) { - cifs_dbg(FYI, "%s: reached max cache size (%d)", - __func__, DFS_CACHE_MAX_ENTRIES); - remove_oldest_entry(); - } - ce = add_cache_entry(h, path, nrefs, numnrefs); - free_dfs_info_array(nrefs, numnrefs); + dump_refs(refs, numrefs); - if (IS_ERR(ce)) - return ce; + if (!newent) { + rc = update_cache_entry(path, refs, numrefs); + goto out_free_refs; + } - dfs_cache_count++; + if (atomic_read(&cache_count) >= CACHE_MAX_ENTRIES) { + cifs_dbg(FYI, "%s: reached max cache size (%d)", __func__, + CACHE_MAX_ENTRIES); + down_write(&htable_rw_lock); + remove_oldest_entry(); + up_write(&htable_rw_lock); } - dump_ce(ce); + rc = add_cache_entry(path, hash, refs, numrefs); + if (!rc) + atomic_inc(&cache_count); - /* Just return the found cache entry in case @noreq is set */ - if (noreq) - return ce; - - if (cache_entry_expired(ce)) { - cifs_dbg(FYI, "%s: expired cache entry\n", __func__); - ce = update_cache_entry(xid, ses, nls_codepage, remap, path, - ce); - if (IS_ERR(ce)) { - cifs_dbg(FYI, "%s: failed to update expired entry\n", - __func__); - } - } - return ce; +out_free_refs: + free_dfs_info_array(refs, numrefs); + return rc; } -/* Set up a new DFS referral from a given cache entry */ -static int setup_ref(const char *path, const struct dfs_cache_entry *ce, - struct dfs_info3_param *ref, const char *tgt) +/* + * Set up a DFS referral from a given cache entry. + * + * Must be called with htable_rw_lock held. + */ +static int setup_referral(const char *path, struct cache_entry *ce, + struct dfs_info3_param *ref, const char *target) { int rc; @@ -732,21 +726,20 @@ static int setup_ref(const char *path, const struct dfs_cache_entry *ce, memset(ref, 0, sizeof(*ref)); - ref->path_name = kstrndup(path, strlen(path), GFP_KERNEL); + ref->path_name = kstrndup(path, strlen(path), GFP_ATOMIC); if (!ref->path_name) return -ENOMEM; - ref->path_consumed = ce->ce_path_consumed; - - ref->node_name = kstrndup(tgt, strlen(tgt), GFP_KERNEL); + ref->node_name = kstrndup(target, strlen(target), GFP_ATOMIC); if (!ref->node_name) { rc = -ENOMEM; goto err_free_path; } - ref->ttl = ce->ce_ttl; - ref->server_type = ce->ce_srvtype; - ref->ref_flag = ce->ce_flags; + ref->path_consumed = ce->path_consumed; + ref->ttl = ce->ttl; + ref->server_type = ce->srvtype; + ref->ref_flag = ce->flags; return 0; @@ -757,38 +750,37 @@ err_free_path: } /* Return target list of a DFS cache entry */ -static int get_tgt_list(const struct dfs_cache_entry *ce, - struct dfs_cache_tgt_list *tl) +static int get_targets(struct cache_entry *ce, struct dfs_cache_tgt_list *tl) { int rc; struct list_head *head = &tl->tl_list; - struct dfs_cache_tgt *t; + struct cache_dfs_tgt *t; struct dfs_cache_tgt_iterator *it, *nit; memset(tl, 0, sizeof(*tl)); INIT_LIST_HEAD(head); - list_for_each_entry(t, &ce->ce_tlist, t_list) { - it = kzalloc(sizeof(*it), GFP_KERNEL); + list_for_each_entry(t, &ce->tlist, list) { + it = kzalloc(sizeof(*it), GFP_ATOMIC); if (!it) { rc = -ENOMEM; goto err_free_it; } - it->it_name = kstrndup(t->t_name, strlen(t->t_name), - GFP_KERNEL); + it->it_name = kstrndup(t->name, strlen(t->name), GFP_ATOMIC); if (!it->it_name) { kfree(it); rc = -ENOMEM; goto err_free_it; } - if (ce->ce_tgthint == t) + if (ce->tgthint == t) list_add(&it->it_list, head); else list_add_tail(&it->it_list, head); } - tl->tl_numtgts = ce->ce_numtgts; + + tl->tl_numtgts = ce->numtgts; return 0; @@ -829,28 +821,35 @@ int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses, { int rc; char *npath; - struct dfs_cache_entry *ce; - - if (unlikely(!is_path_valid(path))) - return -EINVAL; + struct cache_entry *ce; rc = get_normalized_path(path, &npath); if (rc) return rc; - mutex_lock(&dfs_cache_list_lock); - ce = do_dfs_cache_find(xid, ses, nls_codepage, remap, npath, false); - if (!IS_ERR(ce)) { - if (ref) - rc = setup_ref(path, ce, ref, get_tgt_name(ce)); - else - rc = 0; - if (!rc && tgt_list) - rc = get_tgt_list(ce, tgt_list); - } else { + rc = __dfs_cache_find(xid, ses, nls_codepage, remap, npath, false); + if (rc) + goto out_free_path; + + down_read(&htable_rw_lock); + + ce = lookup_cache_entry(npath, NULL); + if (IS_ERR(ce)) { + up_read(&htable_rw_lock); rc = PTR_ERR(ce); + goto out_free_path; } - mutex_unlock(&dfs_cache_list_lock); + + if (ref) + rc = setup_referral(path, ce, ref, get_tgt_name(ce)); + else + rc = 0; + if (!rc && tgt_list) + rc = get_targets(ce, tgt_list); + + up_read(&htable_rw_lock); + +out_free_path: free_normalized_path(path, npath); return rc; } @@ -876,31 +875,33 @@ int dfs_cache_noreq_find(const char *path, struct dfs_info3_param *ref, { int rc; char *npath; - struct dfs_cache_entry *ce; - - if (unlikely(!is_path_valid(path))) - return -EINVAL; + struct cache_entry *ce; rc = get_normalized_path(path, &npath); if (rc) return rc; - mutex_lock(&dfs_cache_list_lock); - ce = do_dfs_cache_find(0, NULL, NULL, 0, npath, true); + cifs_dbg(FYI, "%s: path: %s\n", __func__, npath); + + down_read(&htable_rw_lock); + + ce = lookup_cache_entry(npath, NULL); if (IS_ERR(ce)) { rc = PTR_ERR(ce); - goto out; + goto out_unlock; } if (ref) - rc = setup_ref(path, ce, ref, get_tgt_name(ce)); + rc = setup_referral(path, ce, ref, get_tgt_name(ce)); else rc = 0; if (!rc && tgt_list) - rc = get_tgt_list(ce, tgt_list); -out: - mutex_unlock(&dfs_cache_list_lock); + rc = get_targets(ce, tgt_list); + +out_unlock: + up_read(&htable_rw_lock); free_normalized_path(path, npath); + return rc; } @@ -929,44 +930,46 @@ int dfs_cache_update_tgthint(const unsigned int xid, struct cifs_ses *ses, { int rc; char *npath; - struct dfs_cache_entry *ce; - struct dfs_cache_tgt *t; - - if (unlikely(!is_path_valid(path))) - return -EINVAL; + struct cache_entry *ce; + struct cache_dfs_tgt *t; rc = get_normalized_path(path, &npath); if (rc) return rc; - cifs_dbg(FYI, "%s: path: %s\n", __func__, npath); + cifs_dbg(FYI, "%s: update target hint - path: %s\n", __func__, npath); - mutex_lock(&dfs_cache_list_lock); - ce = do_dfs_cache_find(xid, ses, nls_codepage, remap, npath, false); + rc = __dfs_cache_find(xid, ses, nls_codepage, remap, npath, false); + if (rc) + goto out_free_path; + + down_write(&htable_rw_lock); + + ce = lookup_cache_entry(npath, NULL); if (IS_ERR(ce)) { rc = PTR_ERR(ce); - goto out; + goto out_unlock; } - rc = 0; - - t = ce->ce_tgthint; + t = ce->tgthint; - if (likely(!strcasecmp(it->it_name, t->t_name))) - goto out; + if (likely(!strcasecmp(it->it_name, t->name))) + goto out_unlock; - list_for_each_entry(t, &ce->ce_tlist, t_list) { - if (!strcasecmp(t->t_name, it->it_name)) { - ce->ce_tgthint = t; + list_for_each_entry(t, &ce->tlist, list) { + if (!strcasecmp(t->name, it->it_name)) { + ce->tgthint = t; cifs_dbg(FYI, "%s: new target hint: %s\n", __func__, it->it_name); break; } } -out: - mutex_unlock(&dfs_cache_list_lock); +out_unlock: + up_write(&htable_rw_lock); +out_free_path: free_normalized_path(path, npath); + return rc; } @@ -989,10 +992,10 @@ int dfs_cache_noreq_update_tgthint(const char *path, { int rc; char *npath; - struct dfs_cache_entry *ce; - struct dfs_cache_tgt *t; + struct cache_entry *ce; + struct cache_dfs_tgt *t; - if (unlikely(!is_path_valid(path)) || !it) + if (!it) return -EINVAL; rc = get_normalized_path(path, &npath); @@ -1001,33 +1004,33 @@ int dfs_cache_noreq_update_tgthint(const char *path, cifs_dbg(FYI, "%s: path: %s\n", __func__, npath); - mutex_lock(&dfs_cache_list_lock); + down_write(&htable_rw_lock); - ce = do_dfs_cache_find(0, NULL, NULL, 0, npath, true); + ce = lookup_cache_entry(npath, NULL); if (IS_ERR(ce)) { rc = PTR_ERR(ce); - goto out; + goto out_unlock; } rc = 0; + t = ce->tgthint; - t = ce->ce_tgthint; + if (unlikely(!strcasecmp(it->it_name, t->name))) + goto out_unlock; - if (unlikely(!strcasecmp(it->it_name, t->t_name))) - goto out; - - list_for_each_entry(t, &ce->ce_tlist, t_list) { - if (!strcasecmp(t->t_name, it->it_name)) { - ce->ce_tgthint = t; + list_for_each_entry(t, &ce->tlist, list) { + if (!strcasecmp(t->name, it->it_name)) { + ce->tgthint = t; cifs_dbg(FYI, "%s: new target hint: %s\n", __func__, it->it_name); break; } } -out: - mutex_unlock(&dfs_cache_list_lock); +out_unlock: + up_write(&htable_rw_lock); free_normalized_path(path, npath); + return rc; } @@ -1047,13 +1050,10 @@ int dfs_cache_get_tgt_referral(const char *path, { int rc; char *npath; - struct dfs_cache_entry *ce; - unsigned int h; + struct cache_entry *ce; if (!it || !ref) return -EINVAL; - if (unlikely(!is_path_valid(path))) - return -EINVAL; rc = get_normalized_path(path, &npath); if (rc) @@ -1061,21 +1061,22 @@ int dfs_cache_get_tgt_referral(const char *path, cifs_dbg(FYI, "%s: path: %s\n", __func__, npath); - mutex_lock(&dfs_cache_list_lock); + down_read(&htable_rw_lock); - ce = find_cache_entry(npath, &h); + ce = lookup_cache_entry(npath, NULL); if (IS_ERR(ce)) { rc = PTR_ERR(ce); - goto out; + goto out_unlock; } cifs_dbg(FYI, "%s: target name: %s\n", __func__, it->it_name); - rc = setup_ref(path, ce, ref, it->it_name); + rc = setup_referral(path, ce, ref, it->it_name); -out: - mutex_unlock(&dfs_cache_list_lock); +out_unlock: + up_read(&htable_rw_lock); free_normalized_path(path, npath); + return rc; } @@ -1085,7 +1086,7 @@ static int dup_vol(struct smb_vol *vol, struct smb_vol *new) if (vol->username) { new->username = kstrndup(vol->username, strlen(vol->username), - GFP_KERNEL); + GFP_KERNEL); if (!new->username) return -ENOMEM; } @@ -1103,7 +1104,7 @@ static int dup_vol(struct smb_vol *vol, struct smb_vol *new) } if (vol->domainname) { new->domainname = kstrndup(vol->domainname, - strlen(vol->domainname), GFP_KERNEL); + strlen(vol->domainname), GFP_KERNEL); if (!new->domainname) goto err_free_unc; } @@ -1150,7 +1151,7 @@ err_free_username: int dfs_cache_add_vol(char *mntdata, struct smb_vol *vol, const char *fullpath) { int rc; - struct dfs_cache_vol_info *vi; + struct vol_info *vi; if (!vol || !fullpath || !mntdata) return -EINVAL; @@ -1161,38 +1162,41 @@ int dfs_cache_add_vol(char *mntdata, struct smb_vol *vol, const char *fullpath) if (!vi) return -ENOMEM; - vi->vi_fullpath = kstrndup(fullpath, strlen(fullpath), GFP_KERNEL); - if (!vi->vi_fullpath) { + vi->fullpath = kstrndup(fullpath, strlen(fullpath), GFP_KERNEL); + if (!vi->fullpath) { rc = -ENOMEM; goto err_free_vi; } - rc = dup_vol(vol, &vi->vi_vol); + rc = dup_vol(vol, &vi->smb_vol); if (rc) goto err_free_fullpath; - vi->vi_mntdata = mntdata; + vi->mntdata = mntdata; + spin_lock_init(&vi->smb_vol_lock); + kref_init(&vi->refcnt); + + spin_lock(&vol_list_lock); + list_add_tail(&vi->list, &vol_list); + spin_unlock(&vol_list_lock); - mutex_lock(&dfs_cache.dc_lock); - list_add_tail(&vi->vi_list, &dfs_cache.dc_vol_list); - mutex_unlock(&dfs_cache.dc_lock); return 0; err_free_fullpath: - kfree(vi->vi_fullpath); + kfree(vi->fullpath); err_free_vi: kfree(vi); return rc; } -static inline struct dfs_cache_vol_info *find_vol(const char *fullpath) +/* Must be called with vol_list_lock held */ +static struct vol_info *find_vol(const char *fullpath) { - struct dfs_cache_vol_info *vi; + struct vol_info *vi; - list_for_each_entry(vi, &dfs_cache.dc_vol_list, vi_list) { - cifs_dbg(FYI, "%s: vi->vi_fullpath: %s\n", __func__, - vi->vi_fullpath); - if (!strcasecmp(vi->vi_fullpath, fullpath)) + list_for_each_entry(vi, &vol_list, list) { + cifs_dbg(FYI, "%s: vi->fullpath: %s\n", __func__, vi->fullpath); + if (!strcasecmp(vi->fullpath, fullpath)) return vi; } return ERR_PTR(-ENOENT); @@ -1208,30 +1212,31 @@ static inline struct dfs_cache_vol_info *find_vol(const char *fullpath) */ int dfs_cache_update_vol(const char *fullpath, struct TCP_Server_Info *server) { - int rc; - struct dfs_cache_vol_info *vi; + struct vol_info *vi; if (!fullpath || !server) return -EINVAL; cifs_dbg(FYI, "%s: fullpath: %s\n", __func__, fullpath); - mutex_lock(&dfs_cache.dc_lock); - + spin_lock(&vol_list_lock); vi = find_vol(fullpath); if (IS_ERR(vi)) { - rc = PTR_ERR(vi); - goto out; + spin_unlock(&vol_list_lock); + return PTR_ERR(vi); } + kref_get(&vi->refcnt); + spin_unlock(&vol_list_lock); cifs_dbg(FYI, "%s: updating volume info\n", __func__); - memcpy(&vi->vi_vol.dstaddr, &server->dstaddr, - sizeof(vi->vi_vol.dstaddr)); - rc = 0; + spin_lock(&vi->smb_vol_lock); + memcpy(&vi->smb_vol.dstaddr, &server->dstaddr, + sizeof(vi->smb_vol.dstaddr)); + spin_unlock(&vi->smb_vol_lock); -out: - mutex_unlock(&dfs_cache.dc_lock); - return rc; + kref_put(&vi->refcnt, vol_release); + + return 0; } /** @@ -1241,18 +1246,18 @@ out: */ void dfs_cache_del_vol(const char *fullpath) { - struct dfs_cache_vol_info *vi; + struct vol_info *vi; if (!fullpath || !*fullpath) return; cifs_dbg(FYI, "%s: fullpath: %s\n", __func__, fullpath); - mutex_lock(&dfs_cache.dc_lock); + spin_lock(&vol_list_lock); vi = find_vol(fullpath); - if (!IS_ERR(vi)) - free_vol(vi); - mutex_unlock(&dfs_cache.dc_lock); + spin_unlock(&vol_list_lock); + + kref_put(&vi->refcnt, vol_release); } /* Get all tcons that are within a DFS namespace and can be refreshed */ @@ -1280,7 +1285,7 @@ static void get_tcons(struct TCP_Server_Info *server, struct list_head *head) spin_unlock(&cifs_tcp_ses_lock); } -static inline bool is_dfs_link(const char *path) +static bool is_dfs_link(const char *path) { char *s; @@ -1290,7 +1295,7 @@ static inline bool is_dfs_link(const char *path) return !!strchr(s + 1, '\\'); } -static inline char *get_dfs_root(const char *path) +static char *get_dfs_root(const char *path) { char *s, *npath; @@ -1309,31 +1314,67 @@ static inline char *get_dfs_root(const char *path) return npath; } +static inline void put_tcp_server(struct TCP_Server_Info *server) +{ + cifs_put_tcp_session(server, 0); +} + +static struct TCP_Server_Info *get_tcp_server(struct smb_vol *vol) +{ + struct TCP_Server_Info *server; + + server = cifs_find_tcp_session(vol); + if (IS_ERR_OR_NULL(server)) + return NULL; + + spin_lock(&GlobalMid_Lock); + if (server->tcpStatus != CifsGood) { + spin_unlock(&GlobalMid_Lock); + put_tcp_server(server); + return NULL; + } + spin_unlock(&GlobalMid_Lock); + + return server; +} + /* Find root SMB session out of a DFS link path */ -static struct cifs_ses *find_root_ses(struct dfs_cache_vol_info *vi, - struct cifs_tcon *tcon, const char *path) +static struct cifs_ses *find_root_ses(struct vol_info *vi, + struct cifs_tcon *tcon, + const char *path) { char *rpath; int rc; + struct cache_entry *ce; struct dfs_info3_param ref = {0}; char *mdata = NULL, *devname = NULL; struct TCP_Server_Info *server; struct cifs_ses *ses; - struct smb_vol vol; + struct smb_vol vol = {NULL}; rpath = get_dfs_root(path); if (IS_ERR(rpath)) return ERR_CAST(rpath); - memset(&vol, 0, sizeof(vol)); + down_read(&htable_rw_lock); + + ce = lookup_cache_entry(rpath, NULL); + if (IS_ERR(ce)) { + up_read(&htable_rw_lock); + ses = ERR_CAST(ce); + goto out; + } - rc = dfs_cache_noreq_find(rpath, &ref, NULL); + rc = setup_referral(path, ce, &ref, get_tgt_name(ce)); if (rc) { + up_read(&htable_rw_lock); ses = ERR_PTR(rc); goto out; } - mdata = cifs_compose_mount_options(vi->vi_mntdata, rpath, &ref, + up_read(&htable_rw_lock); + + mdata = cifs_compose_mount_options(vi->mntdata, rpath, &ref, &devname); free_dfs_info_param(&ref); @@ -1351,13 +1392,8 @@ static struct cifs_ses *find_root_ses(struct dfs_cache_vol_info *vi, goto out; } - server = cifs_find_tcp_session(&vol); - if (IS_ERR_OR_NULL(server)) { - ses = ERR_PTR(-EHOSTDOWN); - goto out; - } - if (server->tcpStatus != CifsGood) { - cifs_put_tcp_session(server, 0); + server = get_tcp_server(&vol); + if (!server) { ses = ERR_PTR(-EHOSTDOWN); goto out; } @@ -1373,17 +1409,15 @@ out: } /* Refresh DFS cache entry from a given tcon */ -static void do_refresh_tcon(struct dfs_cache *dc, struct dfs_cache_vol_info *vi, - struct cifs_tcon *tcon) +static int refresh_tcon(struct vol_info *vi, struct cifs_tcon *tcon) { int rc = 0; unsigned int xid; char *path, *npath; - unsigned int h; - struct dfs_cache_entry *ce; + struct cache_entry *ce; + struct cifs_ses *root_ses = NULL, *ses; struct dfs_info3_param *refs = NULL; int numrefs = 0; - struct cifs_ses *root_ses = NULL, *ses; xid = get_xid(); @@ -1391,19 +1425,23 @@ static void do_refresh_tcon(struct dfs_cache *dc, struct dfs_cache_vol_info *vi, rc = get_normalized_path(path, &npath); if (rc) - goto out; + goto out_free_xid; - mutex_lock(&dfs_cache_list_lock); - ce = find_cache_entry(npath, &h); - mutex_unlock(&dfs_cache_list_lock); + down_read(&htable_rw_lock); + ce = lookup_cache_entry(npath, NULL); if (IS_ERR(ce)) { rc = PTR_ERR(ce); - goto out; + up_read(&htable_rw_lock); + goto out_free_path; } - if (!cache_entry_expired(ce)) - goto out; + if (!cache_entry_expired(ce)) { + up_read(&htable_rw_lock); + goto out_free_path; + } + + up_read(&htable_rw_lock); /* If it's a DFS Link, then use root SMB session for refreshing it */ if (is_dfs_link(npath)) { @@ -1411,35 +1449,29 @@ static void do_refresh_tcon(struct dfs_cache *dc, struct dfs_cache_vol_info *vi, if (IS_ERR(ses)) { rc = PTR_ERR(ses); root_ses = NULL; - goto out; + goto out_free_path; } } else { ses = tcon->ses; } - if (unlikely(!ses->server->ops->get_dfs_refer)) { - rc = -EOPNOTSUPP; - } else { - rc = ses->server->ops->get_dfs_refer(xid, ses, path, &refs, - &numrefs, dc->dc_nlsc, - tcon->remap); - if (!rc) { - mutex_lock(&dfs_cache_list_lock); - ce = __update_cache_entry(npath, refs, numrefs); - mutex_unlock(&dfs_cache_list_lock); - dump_refs(refs, numrefs); - free_dfs_info_array(refs, numrefs); - if (IS_ERR(ce)) - rc = PTR_ERR(ce); - } + rc = get_dfs_referral(xid, ses, cache_nlsc, tcon->remap, npath, &refs, + &numrefs); + if (!rc) { + dump_refs(refs, numrefs); + rc = update_cache_entry(npath, refs, numrefs); + free_dfs_info_array(refs, numrefs); } -out: if (root_ses) cifs_put_smb_ses(root_ses); - free_xid(xid); +out_free_path: free_normalized_path(path, npath); + +out_free_xid: + free_xid(xid); + return rc; } /* @@ -1448,30 +1480,61 @@ out: */ static void refresh_cache_worker(struct work_struct *work) { - struct dfs_cache *dc = container_of(work, struct dfs_cache, - dc_refresh.work); - struct dfs_cache_vol_info *vi; + struct vol_info *vi, *nvi; struct TCP_Server_Info *server; - LIST_HEAD(list); + LIST_HEAD(vols); + LIST_HEAD(tcons); struct cifs_tcon *tcon, *ntcon; + int rc; - mutex_lock(&dc->dc_lock); - - list_for_each_entry(vi, &dc->dc_vol_list, vi_list) { - server = cifs_find_tcp_session(&vi->vi_vol); - if (IS_ERR_OR_NULL(server)) + /* + * Find SMB volumes that are eligible (server->tcpStatus == CifsGood) + * for refreshing. + */ + spin_lock(&vol_list_lock); + list_for_each_entry(vi, &vol_list, list) { + server = get_tcp_server(&vi->smb_vol); + if (!server) continue; - if (server->tcpStatus != CifsGood) - goto next; - get_tcons(server, &list); - list_for_each_entry_safe(tcon, ntcon, &list, ulist) { - do_refresh_tcon(dc, vi, tcon); + + kref_get(&vi->refcnt); + list_add_tail(&vi->rlist, &vols); + put_tcp_server(server); + } + spin_unlock(&vol_list_lock); + + /* Walk through all TCONs and refresh any expired cache entry */ + list_for_each_entry_safe(vi, nvi, &vols, rlist) { + spin_lock(&vi->smb_vol_lock); + server = get_tcp_server(&vi->smb_vol); + spin_unlock(&vi->smb_vol_lock); + + if (!server) + goto next_vol; + + get_tcons(server, &tcons); + rc = 0; + + list_for_each_entry_safe(tcon, ntcon, &tcons, ulist) { + /* + * Skip tcp server if any of its tcons failed to refresh + * (possibily due to reconnects). + */ + if (!rc) + rc = refresh_tcon(vi, tcon); + list_del_init(&tcon->ulist); cifs_put_tcon(tcon); } -next: - cifs_put_tcp_session(server, 0); + + put_tcp_server(server); + +next_vol: + list_del_init(&vi->rlist); + kref_put(&vi->refcnt, vol_release); } - queue_delayed_work(cifsiod_wq, &dc->dc_refresh, dc->dc_ttl * HZ); - mutex_unlock(&dc->dc_lock); + + spin_lock(&cache_ttl_lock); + queue_delayed_work(dfscache_wq, &refresh_task, cache_ttl * HZ); + spin_unlock(&cache_ttl_lock); } diff --git a/fs/cifs/dfs_cache.h b/fs/cifs/dfs_cache.h index 76c732943f5f..99ee44f8ad07 100644 --- a/fs/cifs/dfs_cache.h +++ b/fs/cifs/dfs_cache.h @@ -24,7 +24,7 @@ struct dfs_cache_tgt_iterator { extern int dfs_cache_init(void); extern void dfs_cache_destroy(void); -extern const struct file_operations dfscache_proc_fops; +extern const struct proc_ops dfscache_proc_ops; extern int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses, const struct nls_table *nls_codepage, int remap, diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 043288b5c728..a4e8f7d445ac 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2921,7 +2921,7 @@ cifs_write_from_iter(loff_t offset, size_t len, struct iov_iter *from, "direct_writev couldn't get user pages " "(rc=%zd) iter type %d iov_offset %zd " "count %zd\n", - result, from->type, + result, iov_iter_type(from), from->iov_offset, from->count); dump_stack(); @@ -3132,7 +3132,7 @@ static ssize_t __cifs_writev( * In this case, fall back to non-direct write function. * this could be improved by getting pages directly in ITER_KVEC */ - if (direct && from->type & ITER_KVEC) { + if (direct && iov_iter_is_kvec(from)) { cifs_dbg(FYI, "use non-direct cifs_writev for kvec I/O\n"); direct = false; } @@ -3652,7 +3652,7 @@ cifs_send_async_read(loff_t offset, size_t len, struct cifsFileInfo *open_file, "couldn't get user pages (rc=%zd)" " iter type %d" " iov_offset %zd count %zd\n", - result, direct_iov.type, + result, iov_iter_type(&direct_iov), direct_iov.iov_offset, direct_iov.count); dump_stack(); @@ -3863,7 +3863,7 @@ static ssize_t __cifs_readv( * fall back to data copy read path * this could be improved by getting pages directly in ITER_KVEC */ - if (direct && to->type & ITER_KVEC) { + if (direct && iov_iter_is_kvec(to)) { cifs_dbg(FYI, "use non-direct cifs_user_readv for kvec I/O\n"); direct = false; } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index ca76a9287456..9b547f7f5f5d 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -2228,7 +2228,7 @@ int cifs_fiemap(struct inode *inode, struct fiemap_extent_info *fei, u64 start, return -ENOTSUPP; } -static int cifs_truncate_page(struct address_space *mapping, loff_t from) +int cifs_truncate_page(struct address_space *mapping, loff_t from) { pgoff_t index = from >> PAGE_SHIFT; unsigned offset = from & (PAGE_SIZE - 1); @@ -2245,7 +2245,7 @@ static int cifs_truncate_page(struct address_space *mapping, loff_t from) return rc; } -static void cifs_setsize(struct inode *inode, loff_t offset) +void cifs_setsize(struct inode *inode, loff_t offset) { struct cifsInodeInfo *cifs_i = CIFS_I(inode); diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 0516fc482d43..0511aaf451d4 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -743,7 +743,7 @@ __smb2_handle_cancelled_cmd(struct cifs_tcon *tcon, __u16 cmd, __u64 mid, { struct close_cancelled_open *cancelled; - cancelled = kzalloc(sizeof(*cancelled), GFP_KERNEL); + cancelled = kzalloc(sizeof(*cancelled), GFP_ATOMIC); if (!cancelled) return -ENOMEM; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 6250370c1170..6787fce26f20 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -12,6 +12,7 @@ #include <linux/uuid.h> #include <linux/sort.h> #include <crypto/aead.h> +#include "cifsfs.h" #include "cifsglob.h" #include "smb2pdu.h" #include "smb2proto.h" @@ -804,7 +805,7 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, struct cifs_fid *pfid) sizeof(struct smb2_file_all_info), &rsp_iov[1], sizeof(struct smb2_file_all_info), (char *)&tcon->crfid.file_all_info)) - tcon->crfid.file_all_info_is_valid = 1; + tcon->crfid.file_all_info_is_valid = true; oshr_exit: mutex_unlock(&tcon->crfid.fid_mutex); @@ -1523,7 +1524,9 @@ smb2_ioctl_query_info(const unsigned int xid, COMPOUND_FID, COMPOUND_FID, qi.info_type, true, buffer, qi.output_buffer_length, - CIFSMaxBufSize); + CIFSMaxBufSize - + MAX_SMB2_CREATE_RESPONSE_SIZE - + MAX_SMB2_CLOSE_RESPONSE_SIZE); } } else if (qi.flags == PASSTHRU_SET_INFO) { /* Can eventually relax perm check since server enforces too */ @@ -2053,14 +2056,33 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_search_info *srch_inf) { __le16 *utf16_path; - int rc; - __u8 oplock = SMB2_OPLOCK_LEVEL_NONE; + struct smb_rqst rqst[2]; + struct kvec rsp_iov[2]; + int resp_buftype[2]; + struct kvec open_iov[SMB2_CREATE_IOV_SIZE]; + struct kvec qd_iov[SMB2_QUERY_DIRECTORY_IOV_SIZE]; + int rc, flags = 0; + u8 oplock = SMB2_OPLOCK_LEVEL_NONE; struct cifs_open_parms oparms; + struct smb2_query_directory_rsp *qd_rsp = NULL; + struct smb2_create_rsp *op_rsp = NULL; utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); if (!utf16_path) return -ENOMEM; + if (smb3_encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + + memset(rqst, 0, sizeof(rqst)); + resp_buftype[0] = resp_buftype[1] = CIFS_NO_BUFFER; + memset(rsp_iov, 0, sizeof(rsp_iov)); + + /* Open */ + memset(&open_iov, 0, sizeof(open_iov)); + rqst[0].rq_iov = open_iov; + rqst[0].rq_nvec = SMB2_CREATE_IOV_SIZE; + oparms.tcon = tcon; oparms.desired_access = FILE_READ_ATTRIBUTES | FILE_READ_DATA; oparms.disposition = FILE_OPEN; @@ -2071,22 +2093,75 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, oparms.fid = fid; oparms.reconnect = false; - rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL, NULL); - kfree(utf16_path); - if (rc) { - cifs_dbg(FYI, "open dir failed rc=%d\n", rc); - return rc; - } + rc = SMB2_open_init(tcon, &rqst[0], &oplock, &oparms, utf16_path); + if (rc) + goto qdf_free; + smb2_set_next_command(tcon, &rqst[0]); + /* Query directory */ srch_inf->entries_in_buffer = 0; srch_inf->index_of_last_entry = 2; - rc = SMB2_query_directory(xid, tcon, fid->persistent_fid, - fid->volatile_fid, 0, srch_inf); - if (rc) { - cifs_dbg(FYI, "query directory failed rc=%d\n", rc); + memset(&qd_iov, 0, sizeof(qd_iov)); + rqst[1].rq_iov = qd_iov; + rqst[1].rq_nvec = SMB2_QUERY_DIRECTORY_IOV_SIZE; + + rc = SMB2_query_directory_init(xid, tcon, &rqst[1], + COMPOUND_FID, COMPOUND_FID, + 0, srch_inf->info_level); + if (rc) + goto qdf_free; + + smb2_set_related(&rqst[1]); + + rc = compound_send_recv(xid, tcon->ses, flags, 2, rqst, + resp_buftype, rsp_iov); + + /* If the open failed there is nothing to do */ + op_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base; + if (op_rsp == NULL || op_rsp->sync_hdr.Status != STATUS_SUCCESS) { + cifs_dbg(FYI, "query_dir_first: open failed rc=%d\n", rc); + goto qdf_free; + } + fid->persistent_fid = op_rsp->PersistentFileId; + fid->volatile_fid = op_rsp->VolatileFileId; + + /* Anything else than ENODATA means a genuine error */ + if (rc && rc != -ENODATA) { SMB2_close(xid, tcon, fid->persistent_fid, fid->volatile_fid); + cifs_dbg(FYI, "query_dir_first: query directory failed rc=%d\n", rc); + trace_smb3_query_dir_err(xid, fid->persistent_fid, + tcon->tid, tcon->ses->Suid, 0, 0, rc); + goto qdf_free; + } + + qd_rsp = (struct smb2_query_directory_rsp *)rsp_iov[1].iov_base; + if (qd_rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) { + trace_smb3_query_dir_done(xid, fid->persistent_fid, + tcon->tid, tcon->ses->Suid, 0, 0); + srch_inf->endOfSearch = true; + rc = 0; + goto qdf_free; + } + + rc = smb2_parse_query_directory(tcon, &rsp_iov[1], resp_buftype[1], + srch_inf); + if (rc) { + trace_smb3_query_dir_err(xid, fid->persistent_fid, tcon->tid, + tcon->ses->Suid, 0, 0, rc); + goto qdf_free; } + resp_buftype[1] = CIFS_NO_BUFFER; + + trace_smb3_query_dir_done(xid, fid->persistent_fid, tcon->tid, + tcon->ses->Suid, 0, srch_inf->entries_in_buffer); + + qdf_free: + kfree(utf16_path); + SMB2_open_free(&rqst[0]); + SMB2_query_directory_free(&rqst[1]); + free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); + free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); return rc; } @@ -2697,7 +2772,10 @@ smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon, rc = SMB2_ioctl_init(tcon, &rqst[1], fid.persistent_fid, fid.volatile_fid, FSCTL_GET_REPARSE_POINT, - true /* is_fctl */, NULL, 0, CIFSMaxBufSize); + true /* is_fctl */, NULL, 0, + CIFSMaxBufSize - + MAX_SMB2_CREATE_RESPONSE_SIZE - + MAX_SMB2_CLOSE_RESPONSE_SIZE); if (rc) goto querty_exit; @@ -3095,28 +3173,32 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, } /* + * Extending the file + */ + if ((keep_size == false) && i_size_read(inode) < off + len) { + if ((cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) == 0) + smb2_set_sparse(xid, tcon, cfile, inode, false); + + eof = cpu_to_le64(off + len); + rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, cfile->pid, &eof); + if (rc == 0) { + cifsi->server_eof = off + len; + cifs_setsize(inode, off + len); + cifs_truncate_page(inode->i_mapping, inode->i_size); + truncate_setsize(inode, off + len); + } + goto out; + } + + /* * Files are non-sparse by default so falloc may be a no-op - * Must check if file sparse. If not sparse, and not extending - * then no need to do anything since file already allocated + * Must check if file sparse. If not sparse, and since we are not + * extending then no need to do anything since file already allocated */ if ((cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) == 0) { - if (keep_size == true) - rc = 0; - /* check if extending file */ - else if (i_size_read(inode) >= off + len) - /* not extending file and already not sparse */ - rc = 0; - /* BB: in future add else clause to extend file */ - else - rc = -EOPNOTSUPP; - if (rc) - trace_smb3_falloc_err(xid, cfile->fid.persistent_fid, - tcon->tid, tcon->ses->Suid, off, len, rc); - else - trace_smb3_falloc_done(xid, cfile->fid.persistent_fid, - tcon->tid, tcon->ses->Suid, off, len); - free_xid(xid); - return rc; + rc = 0; + goto out; } if ((keep_size == true) || (i_size_read(inode) >= off + len)) { @@ -3130,25 +3212,14 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, */ if ((off > 8192) || (off + len + 8192 < i_size_read(inode))) { rc = -EOPNOTSUPP; - trace_smb3_falloc_err(xid, cfile->fid.persistent_fid, - tcon->tid, tcon->ses->Suid, off, len, rc); - free_xid(xid); - return rc; - } - - smb2_set_sparse(xid, tcon, cfile, inode, false); - rc = 0; - } else { - smb2_set_sparse(xid, tcon, cfile, inode, false); - rc = 0; - if (i_size_read(inode) < off + len) { - eof = cpu_to_le64(off + len); - rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, - cfile->fid.volatile_fid, cfile->pid, - &eof); + goto out; } } + smb2_set_sparse(xid, tcon, cfile, inode, false); + rc = 0; + +out: if (rc) trace_smb3_falloc_err(xid, cfile->fid.persistent_fid, tcon->tid, tcon->ses->Suid, off, len, rc); diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 9434f6dd8df3..14f209f7376f 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -312,7 +312,7 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) if (server->tcpStatus != CifsNeedReconnect) break; - if (--retries) + if (retries && --retries) continue; /* @@ -2199,13 +2199,14 @@ create_sd_buf(umode_t mode, unsigned int *len) struct cifs_ace *pace; unsigned int sdlen, acelen; - *len = roundup(sizeof(struct crt_sd_ctxt) + sizeof(struct cifs_ace), 8); + *len = roundup(sizeof(struct crt_sd_ctxt) + sizeof(struct cifs_ace) * 2, + 8); buf = kzalloc(*len, GFP_KERNEL); if (buf == NULL) return buf; sdlen = sizeof(struct smb3_sd) + sizeof(struct smb3_acl) + - sizeof(struct cifs_ace); + 2 * sizeof(struct cifs_ace); buf->ccontext.DataOffset = cpu_to_le16(offsetof (struct crt_sd_ctxt, sd)); @@ -2232,8 +2233,12 @@ create_sd_buf(umode_t mode, unsigned int *len) /* create one ACE to hold the mode embedded in reserved special SID */ pace = (struct cifs_ace *)(sizeof(struct crt_sd_ctxt) + (char *)buf); acelen = setup_special_mode_ACE(pace, (__u64)mode); + /* and one more ACE to allow access for authenticated users */ + pace = (struct cifs_ace *)(acelen + (sizeof(struct crt_sd_ctxt) + + (char *)buf)); + acelen += setup_authusers_ACE(pace); buf->acl.AclSize = cpu_to_le16(sizeof(struct cifs_acl) + acelen); - buf->acl.AceCount = cpu_to_le16(1); + buf->acl.AceCount = cpu_to_le16(2); return buf; } @@ -4296,56 +4301,38 @@ num_entries(char *bufstart, char *end_of_buf, char **lastentry, size_t size) /* * Readdir/FindFirst */ -int -SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, - u64 persistent_fid, u64 volatile_fid, int index, - struct cifs_search_info *srch_inf) +int SMB2_query_directory_init(const unsigned int xid, + struct cifs_tcon *tcon, struct smb_rqst *rqst, + u64 persistent_fid, u64 volatile_fid, + int index, int info_level) { - struct smb_rqst rqst; + struct TCP_Server_Info *server = tcon->ses->server; struct smb2_query_directory_req *req; - struct smb2_query_directory_rsp *rsp = NULL; - struct kvec iov[2]; - struct kvec rsp_iov; - int rc = 0; - int len; - int resp_buftype = CIFS_NO_BUFFER; unsigned char *bufptr; - struct TCP_Server_Info *server; - struct cifs_ses *ses = tcon->ses; __le16 asteriks = cpu_to_le16('*'); - char *end_of_smb; - unsigned int output_size = CIFSMaxBufSize; - size_t info_buf_size; - int flags = 0; + unsigned int output_size = CIFSMaxBufSize - + MAX_SMB2_CREATE_RESPONSE_SIZE - + MAX_SMB2_CLOSE_RESPONSE_SIZE; unsigned int total_len; - - if (ses && (ses->server)) - server = ses->server; - else - return -EIO; + struct kvec *iov = rqst->rq_iov; + int len, rc; rc = smb2_plain_req_init(SMB2_QUERY_DIRECTORY, tcon, (void **) &req, &total_len); if (rc) return rc; - if (smb3_encryption_required(tcon)) - flags |= CIFS_TRANSFORM_REQ; - - switch (srch_inf->info_level) { + switch (info_level) { case SMB_FIND_FILE_DIRECTORY_INFO: req->FileInformationClass = FILE_DIRECTORY_INFORMATION; - info_buf_size = sizeof(FILE_DIRECTORY_INFO) - 1; break; case SMB_FIND_FILE_ID_FULL_DIR_INFO: req->FileInformationClass = FILEID_FULL_DIRECTORY_INFORMATION; - info_buf_size = sizeof(SEARCH_ID_FULL_DIR_INFO) - 1; break; default: cifs_tcon_dbg(VFS, "info level %u isn't supported\n", - srch_inf->info_level); - rc = -EINVAL; - goto qdir_exit; + info_level); + return -EINVAL; } req->FileIndex = cpu_to_le32(index); @@ -4374,40 +4361,50 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, iov[1].iov_base = (char *)(req->Buffer); iov[1].iov_len = len; - memset(&rqst, 0, sizeof(struct smb_rqst)); - rqst.rq_iov = iov; - rqst.rq_nvec = 2; - trace_smb3_query_dir_enter(xid, persistent_fid, tcon->tid, tcon->ses->Suid, index, output_size); - rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); - cifs_small_buf_release(req); - rsp = (struct smb2_query_directory_rsp *)rsp_iov.iov_base; + return 0; +} - if (rc) { - if (rc == -ENODATA && - rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) { - trace_smb3_query_dir_done(xid, persistent_fid, - tcon->tid, tcon->ses->Suid, index, 0); - srch_inf->endOfSearch = true; - rc = 0; - } else { - trace_smb3_query_dir_err(xid, persistent_fid, tcon->tid, - tcon->ses->Suid, index, 0, rc); - cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE); - } - goto qdir_exit; +void SMB2_query_directory_free(struct smb_rqst *rqst) +{ + if (rqst && rqst->rq_iov) { + cifs_small_buf_release(rqst->rq_iov[0].iov_base); /* request */ + } +} + +int +smb2_parse_query_directory(struct cifs_tcon *tcon, + struct kvec *rsp_iov, + int resp_buftype, + struct cifs_search_info *srch_inf) +{ + struct smb2_query_directory_rsp *rsp; + size_t info_buf_size; + char *end_of_smb; + int rc; + + rsp = (struct smb2_query_directory_rsp *)rsp_iov->iov_base; + + switch (srch_inf->info_level) { + case SMB_FIND_FILE_DIRECTORY_INFO: + info_buf_size = sizeof(FILE_DIRECTORY_INFO) - 1; + break; + case SMB_FIND_FILE_ID_FULL_DIR_INFO: + info_buf_size = sizeof(SEARCH_ID_FULL_DIR_INFO) - 1; + break; + default: + cifs_tcon_dbg(VFS, "info level %u isn't supported\n", + srch_inf->info_level); + return -EINVAL; } rc = smb2_validate_iov(le16_to_cpu(rsp->OutputBufferOffset), - le32_to_cpu(rsp->OutputBufferLength), &rsp_iov, + le32_to_cpu(rsp->OutputBufferLength), rsp_iov, info_buf_size); - if (rc) { - trace_smb3_query_dir_err(xid, persistent_fid, tcon->tid, - tcon->ses->Suid, index, 0, rc); - goto qdir_exit; - } + if (rc) + return rc; srch_inf->unicode = true; @@ -4420,7 +4417,7 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, srch_inf->ntwrk_buf_start = (char *)rsp; srch_inf->srch_entries_start = srch_inf->last_entry = (char *)rsp + le16_to_cpu(rsp->OutputBufferOffset); - end_of_smb = rsp_iov.iov_len + (char *)rsp; + end_of_smb = rsp_iov->iov_len + (char *)rsp; srch_inf->entries_in_buffer = num_entries(srch_inf->srch_entries_start, end_of_smb, &srch_inf->last_entry, info_buf_size); @@ -4435,11 +4432,72 @@ SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, else cifs_tcon_dbg(VFS, "illegal search buffer type\n"); + return 0; +} + +int +SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, + u64 persistent_fid, u64 volatile_fid, int index, + struct cifs_search_info *srch_inf) +{ + struct smb_rqst rqst; + struct kvec iov[SMB2_QUERY_DIRECTORY_IOV_SIZE]; + struct smb2_query_directory_rsp *rsp = NULL; + int resp_buftype = CIFS_NO_BUFFER; + struct kvec rsp_iov; + int rc = 0; + struct cifs_ses *ses = tcon->ses; + int flags = 0; + + if (!ses || !(ses->server)) + return -EIO; + + if (smb3_encryption_required(tcon)) + flags |= CIFS_TRANSFORM_REQ; + + memset(&rqst, 0, sizeof(struct smb_rqst)); + memset(&iov, 0, sizeof(iov)); + rqst.rq_iov = iov; + rqst.rq_nvec = SMB2_QUERY_DIRECTORY_IOV_SIZE; + + rc = SMB2_query_directory_init(xid, tcon, &rqst, persistent_fid, + volatile_fid, index, + srch_inf->info_level); + if (rc) + goto qdir_exit; + + rc = cifs_send_recv(xid, ses, &rqst, &resp_buftype, flags, &rsp_iov); + rsp = (struct smb2_query_directory_rsp *)rsp_iov.iov_base; + + if (rc) { + if (rc == -ENODATA && + rsp->sync_hdr.Status == STATUS_NO_MORE_FILES) { + trace_smb3_query_dir_done(xid, persistent_fid, + tcon->tid, tcon->ses->Suid, index, 0); + srch_inf->endOfSearch = true; + rc = 0; + } else { + trace_smb3_query_dir_err(xid, persistent_fid, tcon->tid, + tcon->ses->Suid, index, 0, rc); + cifs_stats_fail_inc(tcon, SMB2_QUERY_DIRECTORY_HE); + } + goto qdir_exit; + } + + rc = smb2_parse_query_directory(tcon, &rsp_iov, resp_buftype, + srch_inf); + if (rc) { + trace_smb3_query_dir_err(xid, persistent_fid, tcon->tid, + tcon->ses->Suid, index, 0, rc); + goto qdir_exit; + } + resp_buftype = CIFS_NO_BUFFER; + trace_smb3_query_dir_done(xid, persistent_fid, tcon->tid, tcon->ses->Suid, index, srch_inf->entries_in_buffer); - return rc; qdir_exit: + SMB2_query_directory_free(&rqst); free_rsp_buf(resp_buftype, rsp); return rc; } diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 7b1c379fdf7a..4c43dbd1e089 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -1282,6 +1282,8 @@ struct smb2_echo_rsp { #define SMB2_INDEX_SPECIFIED 0x04 #define SMB2_REOPEN 0x10 +#define SMB2_QUERY_DIRECTORY_IOV_SIZE 2 + struct smb2_query_directory_req { struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 33 */ diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 27d29f2eb6c8..6c678e00046f 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -197,6 +197,11 @@ extern int SMB2_echo(struct TCP_Server_Info *server); extern int SMB2_query_directory(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, int index, struct cifs_search_info *srch_inf); +extern int SMB2_query_directory_init(unsigned int xid, struct cifs_tcon *tcon, + struct smb_rqst *rqst, + u64 persistent_fid, u64 volatile_fid, + int index, int info_level); +extern void SMB2_query_directory_free(struct smb_rqst *rqst); extern int SMB2_set_eof(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, u32 pid, __le64 *eof); diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 387c88704c52..fe6acfce3390 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -685,6 +685,8 @@ smb2_mid_entry_alloc(const struct smb2_sync_hdr *shdr, * The default is for the mid to be synchronous, so the * default callback just wakes up the current task. */ + get_task_struct(current); + temp->creator = current; temp->callback = cifs_wake_up_task; temp->callback_data = current; diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 3d2e11f85cba..cb3ee916f527 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -76,6 +76,8 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct TCP_Server_Info *server) * The default is for the mid to be synchronous, so the * default callback just wakes up the current task. */ + get_task_struct(current); + temp->creator = current; temp->callback = cifs_wake_up_task; temp->callback_data = current; @@ -158,6 +160,7 @@ static void _cifs_mid_q_entry_release(struct kref *refcount) } } #endif + put_task_struct(midEntry->creator); mempool_free(midEntry, cifs_mid_poolp); } diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index db4ba8f6077e..b8299173ea7e 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -32,7 +32,8 @@ #include "cifs_unicode.h" #define MAX_EA_VALUE_SIZE CIFSMaxBufSize -#define CIFS_XATTR_CIFS_ACL "system.cifs_acl" +#define CIFS_XATTR_CIFS_ACL "system.cifs_acl" /* DACL only */ +#define CIFS_XATTR_CIFS_NTSD "system.cifs_ntsd" /* owner plus DACL */ #define CIFS_XATTR_ATTRIB "cifs.dosattrib" /* full name: user.cifs.dosattrib */ #define CIFS_XATTR_CREATETIME "cifs.creationtime" /* user.cifs.creationtime */ /* @@ -40,12 +41,62 @@ * confusing users and using the 20+ year old term 'cifs' when it is no longer * secure, replaced by SMB2 (then even more highly secure SMB3) many years ago */ -#define SMB3_XATTR_CIFS_ACL "system.smb3_acl" +#define SMB3_XATTR_CIFS_ACL "system.smb3_acl" /* DACL only */ +#define SMB3_XATTR_CIFS_NTSD "system.smb3_ntsd" /* owner plus DACL */ #define SMB3_XATTR_ATTRIB "smb3.dosattrib" /* full name: user.smb3.dosattrib */ #define SMB3_XATTR_CREATETIME "smb3.creationtime" /* user.smb3.creationtime */ /* BB need to add server (Samba e.g) support for security and trusted prefix */ -enum { XATTR_USER, XATTR_CIFS_ACL, XATTR_ACL_ACCESS, XATTR_ACL_DEFAULT }; +enum { XATTR_USER, XATTR_CIFS_ACL, XATTR_ACL_ACCESS, XATTR_ACL_DEFAULT, + XATTR_CIFS_NTSD }; + +static int cifs_attrib_set(unsigned int xid, struct cifs_tcon *pTcon, + struct inode *inode, char *full_path, + const void *value, size_t size) +{ + ssize_t rc = -EOPNOTSUPP; + __u32 *pattrib = (__u32 *)value; + __u32 attrib; + FILE_BASIC_INFO info_buf; + + if ((value == NULL) || (size != sizeof(__u32))) + return -ERANGE; + + memset(&info_buf, 0, sizeof(info_buf)); + attrib = *pattrib; + info_buf.Attributes = cpu_to_le32(attrib); + if (pTcon->ses->server->ops->set_file_info) + rc = pTcon->ses->server->ops->set_file_info(inode, full_path, + &info_buf, xid); + if (rc == 0) + CIFS_I(inode)->cifsAttrs = attrib; + + return rc; +} + +static int cifs_creation_time_set(unsigned int xid, struct cifs_tcon *pTcon, + struct inode *inode, char *full_path, + const void *value, size_t size) +{ + ssize_t rc = -EOPNOTSUPP; + __u64 *pcreation_time = (__u64 *)value; + __u64 creation_time; + FILE_BASIC_INFO info_buf; + + if ((value == NULL) || (size != sizeof(__u64))) + return -ERANGE; + + memset(&info_buf, 0, sizeof(info_buf)); + creation_time = *pcreation_time; + info_buf.CreationTime = cpu_to_le64(creation_time); + if (pTcon->ses->server->ops->set_file_info) + rc = pTcon->ses->server->ops->set_file_info(inode, full_path, + &info_buf, xid); + if (rc == 0) + CIFS_I(inode)->createtime = creation_time; + + return rc; +} static int cifs_xattr_set(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, @@ -86,6 +137,23 @@ static int cifs_xattr_set(const struct xattr_handler *handler, switch (handler->flags) { case XATTR_USER: + cifs_dbg(FYI, "%s:setting user xattr %s\n", __func__, name); + if ((strcmp(name, CIFS_XATTR_ATTRIB) == 0) || + (strcmp(name, SMB3_XATTR_ATTRIB) == 0)) { + rc = cifs_attrib_set(xid, pTcon, inode, full_path, + value, size); + if (rc == 0) /* force revalidate of the inode */ + CIFS_I(inode)->time = 0; + break; + } else if ((strcmp(name, CIFS_XATTR_CREATETIME) == 0) || + (strcmp(name, SMB3_XATTR_CREATETIME) == 0)) { + rc = cifs_creation_time_set(xid, pTcon, inode, + full_path, value, size); + if (rc == 0) /* force revalidate of the inode */ + CIFS_I(inode)->time = 0; + break; + } + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto out; @@ -95,7 +163,8 @@ static int cifs_xattr_set(const struct xattr_handler *handler, cifs_sb->local_nls, cifs_sb); break; - case XATTR_CIFS_ACL: { + case XATTR_CIFS_ACL: + case XATTR_CIFS_NTSD: { struct cifs_ntsd *pacl; if (!value) @@ -106,12 +175,25 @@ static int cifs_xattr_set(const struct xattr_handler *handler, } else { memcpy(pacl, value, size); if (value && - pTcon->ses->server->ops->set_acl) - rc = pTcon->ses->server->ops->set_acl(pacl, - size, inode, - full_path, CIFS_ACL_DACL); - else + pTcon->ses->server->ops->set_acl) { + rc = 0; + if (handler->flags == XATTR_CIFS_NTSD) { + /* set owner and DACL */ + rc = pTcon->ses->server->ops->set_acl( + pacl, size, inode, + full_path, + CIFS_ACL_OWNER); + } + if (rc == 0) { + /* set DACL */ + rc = pTcon->ses->server->ops->set_acl( + pacl, size, inode, + full_path, + CIFS_ACL_DACL); + } + } else { rc = -EOPNOTSUPP; + } if (rc == 0) /* force revalidate of the inode */ CIFS_I(inode)->time = 0; kfree(pacl); @@ -179,7 +261,7 @@ static int cifs_creation_time_get(struct dentry *dentry, struct inode *inode, void *value, size_t size) { ssize_t rc; - __u64 * pcreatetime; + __u64 *pcreatetime; rc = cifs_revalidate_dentry_attr(dentry); if (rc) @@ -244,7 +326,9 @@ static int cifs_xattr_get(const struct xattr_handler *handler, full_path, name, value, size, cifs_sb); break; - case XATTR_CIFS_ACL: { + case XATTR_CIFS_ACL: + case XATTR_CIFS_NTSD: { + /* the whole ntsd is fetched regardless */ u32 acllen; struct cifs_ntsd *pacl; @@ -382,6 +466,26 @@ static const struct xattr_handler smb3_acl_xattr_handler = { .set = cifs_xattr_set, }; +static const struct xattr_handler cifs_cifs_ntsd_xattr_handler = { + .name = CIFS_XATTR_CIFS_NTSD, + .flags = XATTR_CIFS_NTSD, + .get = cifs_xattr_get, + .set = cifs_xattr_set, +}; + +/* + * Although this is just an alias for the above, need to move away from + * confusing users and using the 20 year old term 'cifs' when it is no + * longer secure and was replaced by SMB2/SMB3 a long time ago, and + * SMB3 and later are highly secure. + */ +static const struct xattr_handler smb3_ntsd_xattr_handler = { + .name = SMB3_XATTR_CIFS_NTSD, + .flags = XATTR_CIFS_NTSD, + .get = cifs_xattr_get, + .set = cifs_xattr_set, +}; + static const struct xattr_handler cifs_posix_acl_access_xattr_handler = { .name = XATTR_NAME_POSIX_ACL_ACCESS, .flags = XATTR_ACL_ACCESS, @@ -401,6 +505,8 @@ const struct xattr_handler *cifs_xattr_handlers[] = { &cifs_os2_xattr_handler, &cifs_cifs_acl_xattr_handler, &smb3_acl_xattr_handler, /* alias for above since avoiding "cifs" */ + &cifs_cifs_ntsd_xattr_handler, + &smb3_ntsd_xattr_handler, /* alias for above since avoiding "cifs" */ &cifs_posix_acl_access_xattr_handler, &cifs_posix_acl_default_xattr_handler, NULL diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c deleted file mode 100644 index 358ea2ecf36b..000000000000 --- a/fs/compat_ioctl.c +++ /dev/null @@ -1,261 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * ioctl32.c: Conversion between 32bit and 64bit native ioctls. - * - * Copyright (C) 1997-2000 Jakub Jelinek (jakub@redhat.com) - * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be) - * Copyright (C) 2001,2002 Andi Kleen, SuSE Labs - * Copyright (C) 2003 Pavel Machek (pavel@ucw.cz) - * - * These routines maintain argument size conversion between 32bit and 64bit - * ioctls. - */ - -#include <linux/types.h> -#include <linux/compat.h> -#include <linux/kernel.h> -#include <linux/capability.h> -#include <linux/compiler.h> -#include <linux/sched.h> -#include <linux/smp.h> -#include <linux/ioctl.h> -#include <linux/if.h> -#include <linux/raid/md_u.h> -#include <linux/falloc.h> -#include <linux/file.h> -#include <linux/ppp-ioctl.h> -#include <linux/if_pppox.h> -#include <linux/tty.h> -#include <linux/vt_kern.h> -#include <linux/blkdev.h> -#include <linux/serial.h> -#include <linux/ctype.h> -#include <linux/syscalls.h> -#include <linux/gfp.h> -#include <linux/cec.h> - -#include "internal.h" - -#ifdef CONFIG_BLOCK -#include <linux/cdrom.h> -#include <linux/fd.h> -#include <scsi/scsi.h> -#include <scsi/scsi_ioctl.h> -#include <scsi/sg.h> -#endif - -#include <linux/uaccess.h> -#include <linux/watchdog.h> - -#include <linux/hiddev.h> - - -#include <linux/sort.h> - -/* - * simple reversible transform to make our table more evenly - * distributed after sorting. - */ -#define XFORM(i) (((i) ^ ((i) << 27) ^ ((i) << 17)) & 0xffffffff) - -#define COMPATIBLE_IOCTL(cmd) XFORM((u32)cmd), -static unsigned int ioctl_pointer[] = { -#ifdef CONFIG_BLOCK -/* Big S */ -COMPATIBLE_IOCTL(SCSI_IOCTL_GET_IDLUN) -COMPATIBLE_IOCTL(SCSI_IOCTL_DOORLOCK) -COMPATIBLE_IOCTL(SCSI_IOCTL_DOORUNLOCK) -COMPATIBLE_IOCTL(SCSI_IOCTL_TEST_UNIT_READY) -COMPATIBLE_IOCTL(SCSI_IOCTL_GET_BUS_NUMBER) -COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND) -COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST) -COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI) -#endif -#ifdef CONFIG_BLOCK -/* SG stuff */ -COMPATIBLE_IOCTL(SG_IO) -COMPATIBLE_IOCTL(SG_GET_REQUEST_TABLE) -COMPATIBLE_IOCTL(SG_SET_TIMEOUT) -COMPATIBLE_IOCTL(SG_GET_TIMEOUT) -COMPATIBLE_IOCTL(SG_EMULATED_HOST) -COMPATIBLE_IOCTL(SG_GET_TRANSFORM) -COMPATIBLE_IOCTL(SG_SET_RESERVED_SIZE) -COMPATIBLE_IOCTL(SG_GET_RESERVED_SIZE) -COMPATIBLE_IOCTL(SG_GET_SCSI_ID) -COMPATIBLE_IOCTL(SG_SET_FORCE_LOW_DMA) -COMPATIBLE_IOCTL(SG_GET_LOW_DMA) -COMPATIBLE_IOCTL(SG_SET_FORCE_PACK_ID) -COMPATIBLE_IOCTL(SG_GET_PACK_ID) -COMPATIBLE_IOCTL(SG_GET_NUM_WAITING) -COMPATIBLE_IOCTL(SG_SET_DEBUG) -COMPATIBLE_IOCTL(SG_GET_SG_TABLESIZE) -COMPATIBLE_IOCTL(SG_GET_COMMAND_Q) -COMPATIBLE_IOCTL(SG_SET_COMMAND_Q) -COMPATIBLE_IOCTL(SG_GET_VERSION_NUM) -COMPATIBLE_IOCTL(SG_NEXT_CMD_LEN) -COMPATIBLE_IOCTL(SG_SCSI_RESET) -COMPATIBLE_IOCTL(SG_GET_REQUEST_TABLE) -COMPATIBLE_IOCTL(SG_SET_KEEP_ORPHAN) -COMPATIBLE_IOCTL(SG_GET_KEEP_ORPHAN) -#endif -}; - -/* - * Convert common ioctl arguments based on their command number - * - * Please do not add any code in here. Instead, implement - * a compat_ioctl operation in the place that handleѕ the - * ioctl for the native case. - */ -static long do_ioctl_trans(unsigned int cmd, - unsigned long arg, struct file *file) -{ - return -ENOIOCTLCMD; -} - -static int compat_ioctl_check_table(unsigned int xcmd) -{ -#ifdef CONFIG_BLOCK - int i; - const int max = ARRAY_SIZE(ioctl_pointer) - 1; - - BUILD_BUG_ON(max >= (1 << 16)); - - /* guess initial offset into table, assuming a - normalized distribution */ - i = ((xcmd >> 16) * max) >> 16; - - /* do linear search up first, until greater or equal */ - while (ioctl_pointer[i] < xcmd && i < max) - i++; - - /* then do linear search down */ - while (ioctl_pointer[i] > xcmd && i > 0) - i--; - - return ioctl_pointer[i] == xcmd; -#else - return 0; -#endif -} - -COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, - compat_ulong_t, arg32) -{ - unsigned long arg = arg32; - struct fd f = fdget(fd); - int error = -EBADF; - if (!f.file) - goto out; - - /* RED-PEN how should LSM module know it's handling 32bit? */ - error = security_file_ioctl(f.file, cmd, arg); - if (error) - goto out_fput; - - switch (cmd) { - /* these are never seen by ->ioctl(), no argument or int argument */ - case FIOCLEX: - case FIONCLEX: - case FIFREEZE: - case FITHAW: - case FICLONE: - goto do_ioctl; - /* these are never seen by ->ioctl(), pointer argument */ - case FIONBIO: - case FIOASYNC: - case FIOQSIZE: - case FS_IOC_FIEMAP: - case FIGETBSZ: - case FICLONERANGE: - case FIDEDUPERANGE: - goto found_handler; - /* - * The next group is the stuff handled inside file_ioctl(). - * For regular files these never reach ->ioctl(); for - * devices, sockets, etc. they do and one (FIONREAD) is - * even accepted in some cases. In all those cases - * argument has the same type, so we can handle these - * here, shunting them towards do_vfs_ioctl(). - * ->compat_ioctl() will never see any of those. - */ - /* pointer argument, never actually handled by ->ioctl() */ - case FIBMAP: - goto found_handler; - /* handled by some ->ioctl(); always a pointer to int */ - case FIONREAD: - goto found_handler; - /* these get messy on amd64 due to alignment differences */ -#if defined(CONFIG_X86_64) - case FS_IOC_RESVSP_32: - case FS_IOC_RESVSP64_32: - error = compat_ioctl_preallocate(f.file, 0, compat_ptr(arg)); - goto out_fput; - case FS_IOC_UNRESVSP_32: - case FS_IOC_UNRESVSP64_32: - error = compat_ioctl_preallocate(f.file, FALLOC_FL_PUNCH_HOLE, - compat_ptr(arg)); - goto out_fput; - case FS_IOC_ZERO_RANGE_32: - error = compat_ioctl_preallocate(f.file, FALLOC_FL_ZERO_RANGE, - compat_ptr(arg)); - goto out_fput; -#else - case FS_IOC_RESVSP: - case FS_IOC_RESVSP64: - case FS_IOC_UNRESVSP: - case FS_IOC_UNRESVSP64: - case FS_IOC_ZERO_RANGE: - goto found_handler; -#endif - - default: - if (f.file->f_op->compat_ioctl) { - error = f.file->f_op->compat_ioctl(f.file, cmd, arg); - if (error != -ENOIOCTLCMD) - goto out_fput; - } - - if (!f.file->f_op->unlocked_ioctl) - goto do_ioctl; - break; - } - - if (compat_ioctl_check_table(XFORM(cmd))) - goto found_handler; - - error = do_ioctl_trans(cmd, arg, f.file); - if (error == -ENOIOCTLCMD) - error = -ENOTTY; - - goto out_fput; - - found_handler: - arg = (unsigned long)compat_ptr(arg); - do_ioctl: - error = do_vfs_ioctl(f.file, fd, cmd, arg); - out_fput: - fdput(f); - out: - return error; -} - -static int __init init_sys32_ioctl_cmp(const void *p, const void *q) -{ - unsigned int a, b; - a = *(unsigned int *)p; - b = *(unsigned int *)q; - if (a > b) - return 1; - if (a < b) - return -1; - return 0; -} - -static int __init init_sys32_ioctl(void) -{ - sort(ioctl_pointer, ARRAY_SIZE(ioctl_pointer), sizeof(*ioctl_pointer), - init_sys32_ioctl_cmp, NULL); - return 0; -} -__initcall(init_sys32_ioctl); diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig index ff5a1746cbae..8046d7c7a3e9 100644 --- a/fs/crypto/Kconfig +++ b/fs/crypto/Kconfig @@ -2,13 +2,8 @@ config FS_ENCRYPTION bool "FS Encryption (Per-file encryption)" select CRYPTO - select CRYPTO_AES - select CRYPTO_CBC - select CRYPTO_ECB - select CRYPTO_XTS - select CRYPTO_CTS - select CRYPTO_SHA512 - select CRYPTO_HMAC + select CRYPTO_HASH + select CRYPTO_SKCIPHER select KEYS help Enable encryption of files and directories. This @@ -16,3 +11,16 @@ config FS_ENCRYPTION efficient since it avoids caching the encrypted and decrypted pages in the page cache. Currently Ext4, F2FS and UBIFS make use of this feature. + +# Filesystems supporting encryption must select this if FS_ENCRYPTION. This +# allows the algorithms to be built as modules when all the filesystems are. +config FS_ENCRYPTION_ALGS + tristate + select CRYPTO_AES + select CRYPTO_CBC + select CRYPTO_CTS + select CRYPTO_ECB + select CRYPTO_HMAC + select CRYPTO_SHA256 + select CRYPTO_SHA512 + select CRYPTO_XTS diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c index 1f4b8a277060..4fa18fff9c4e 100644 --- a/fs/crypto/bio.c +++ b/fs/crypto/bio.c @@ -41,53 +41,101 @@ void fscrypt_decrypt_bio(struct bio *bio) } EXPORT_SYMBOL(fscrypt_decrypt_bio); +/** + * fscrypt_zeroout_range() - zero out a range of blocks in an encrypted file + * @inode: the file's inode + * @lblk: the first file logical block to zero out + * @pblk: the first filesystem physical block to zero out + * @len: number of blocks to zero out + * + * Zero out filesystem blocks in an encrypted regular file on-disk, i.e. write + * ciphertext blocks which decrypt to the all-zeroes block. The blocks must be + * both logically and physically contiguous. It's also assumed that the + * filesystem only uses a single block device, ->s_bdev. + * + * Note that since each block uses a different IV, this involves writing a + * different ciphertext to each block; we can't simply reuse the same one. + * + * Return: 0 on success; -errno on failure. + */ int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, - sector_t pblk, unsigned int len) + sector_t pblk, unsigned int len) { const unsigned int blockbits = inode->i_blkbits; const unsigned int blocksize = 1 << blockbits; - struct page *ciphertext_page; + const unsigned int blocks_per_page_bits = PAGE_SHIFT - blockbits; + const unsigned int blocks_per_page = 1 << blocks_per_page_bits; + struct page *pages[16]; /* write up to 16 pages at a time */ + unsigned int nr_pages; + unsigned int i; + unsigned int offset; struct bio *bio; - int ret, err = 0; + int ret, err; - ciphertext_page = fscrypt_alloc_bounce_page(GFP_NOWAIT); - if (!ciphertext_page) - return -ENOMEM; + if (len == 0) + return 0; - while (len--) { - err = fscrypt_crypt_block(inode, FS_ENCRYPT, lblk, - ZERO_PAGE(0), ciphertext_page, - blocksize, 0, GFP_NOFS); - if (err) - goto errout; + BUILD_BUG_ON(ARRAY_SIZE(pages) > BIO_MAX_PAGES); + nr_pages = min_t(unsigned int, ARRAY_SIZE(pages), + (len + blocks_per_page - 1) >> blocks_per_page_bits); - bio = bio_alloc(GFP_NOWAIT, 1); - if (!bio) { - err = -ENOMEM; - goto errout; - } + /* + * We need at least one page for ciphertext. Allocate the first one + * from a mempool, with __GFP_DIRECT_RECLAIM set so that it can't fail. + * + * Any additional page allocations are allowed to fail, as they only + * help performance, and waiting on the mempool for them could deadlock. + */ + for (i = 0; i < nr_pages; i++) { + pages[i] = fscrypt_alloc_bounce_page(i == 0 ? GFP_NOFS : + GFP_NOWAIT | __GFP_NOWARN); + if (!pages[i]) + break; + } + nr_pages = i; + if (WARN_ON(nr_pages <= 0)) + return -EINVAL; + + /* This always succeeds since __GFP_DIRECT_RECLAIM is set. */ + bio = bio_alloc(GFP_NOFS, nr_pages); + + do { bio_set_dev(bio, inode->i_sb->s_bdev); bio->bi_iter.bi_sector = pblk << (blockbits - 9); bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - ret = bio_add_page(bio, ciphertext_page, blocksize, 0); - if (WARN_ON(ret != blocksize)) { - /* should never happen! */ - bio_put(bio); - err = -EIO; - goto errout; - } + + i = 0; + offset = 0; + do { + err = fscrypt_crypt_block(inode, FS_ENCRYPT, lblk, + ZERO_PAGE(0), pages[i], + blocksize, offset, GFP_NOFS); + if (err) + goto out; + lblk++; + pblk++; + len--; + offset += blocksize; + if (offset == PAGE_SIZE || len == 0) { + ret = bio_add_page(bio, pages[i++], offset, 0); + if (WARN_ON(ret != offset)) { + err = -EIO; + goto out; + } + offset = 0; + } + } while (i != nr_pages && len != 0); + err = submit_bio_wait(bio); - if (err == 0 && bio->bi_status) - err = -EIO; - bio_put(bio); if (err) - goto errout; - lblk++; - pblk++; - } + goto out; + bio_reset(bio); + } while (len != 0); err = 0; -errout: - fscrypt_free_bounce_page(ciphertext_page); +out: + bio_put(bio); + for (i = 0; i < nr_pages; i++) + fscrypt_free_bounce_page(pages[i]); return err; } EXPORT_SYMBOL(fscrypt_zeroout_range); diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 3719efa546c6..1ecaac7ee3cb 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -25,8 +25,6 @@ #include <linux/module.h> #include <linux/scatterlist.h> #include <linux/ratelimit.h> -#include <linux/dcache.h> -#include <linux/namei.h> #include <crypto/skcipher.h> #include "fscrypt_private.h" @@ -140,7 +138,7 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw, * multiple of the filesystem's block size. * @offs: Byte offset within @page of the first block to encrypt. Must be * a multiple of the filesystem's block size. - * @gfp_flags: Memory allocation flags + * @gfp_flags: Memory allocation flags. See details below. * * A new bounce page is allocated, and the specified block(s) are encrypted into * it. In the bounce page, the ciphertext block(s) will be located at the same @@ -150,6 +148,11 @@ int fscrypt_crypt_block(const struct inode *inode, fscrypt_direction_t rw, * * This is for use by the filesystem's ->writepages() method. * + * The bounce page allocation is mempool-backed, so it will always succeed when + * @gfp_flags includes __GFP_DIRECT_RECLAIM, e.g. when it's GFP_NOFS. However, + * only the first page of each bio can be allocated this way. To prevent + * deadlocks, for any additional pages a mask like GFP_NOWAIT must be used. + * * Return: the new encrypted bounce page on success; an ERR_PTR() on failure */ struct page *fscrypt_encrypt_pagecache_blocks(struct page *page, @@ -286,54 +289,6 @@ int fscrypt_decrypt_block_inplace(const struct inode *inode, struct page *page, } EXPORT_SYMBOL(fscrypt_decrypt_block_inplace); -/* - * Validate dentries in encrypted directories to make sure we aren't potentially - * caching stale dentries after a key has been added. - */ -static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) -{ - struct dentry *dir; - int err; - int valid; - - /* - * Plaintext names are always valid, since fscrypt doesn't support - * reverting to ciphertext names without evicting the directory's inode - * -- which implies eviction of the dentries in the directory. - */ - if (!(dentry->d_flags & DCACHE_ENCRYPTED_NAME)) - return 1; - - /* - * Ciphertext name; valid if the directory's key is still unavailable. - * - * Although fscrypt forbids rename() on ciphertext names, we still must - * use dget_parent() here rather than use ->d_parent directly. That's - * because a corrupted fs image may contain directory hard links, which - * the VFS handles by moving the directory's dentry tree in the dcache - * each time ->lookup() finds the directory and it already has a dentry - * elsewhere. Thus ->d_parent can be changing, and we must safely grab - * a reference to some ->d_parent to prevent it from being freed. - */ - - if (flags & LOOKUP_RCU) - return -ECHILD; - - dir = dget_parent(dentry); - err = fscrypt_get_encryption_info(d_inode(dir)); - valid = !fscrypt_has_encryption_key(d_inode(dir)); - dput(dir); - - if (err < 0) - return err; - - return valid; -} - -const struct dentry_operations fscrypt_d_ops = { - .d_revalidate = fscrypt_d_revalidate, -}; - /** * fscrypt_initialize() - allocate major buffers for fs encryption. * @cop_flags: fscrypt operations flags diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 3da3707c10e3..4c212442a8f7 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -11,10 +11,87 @@ * This has not yet undergone a rigorous security audit. */ +#include <linux/namei.h> #include <linux/scatterlist.h> +#include <crypto/hash.h> +#include <crypto/sha.h> #include <crypto/skcipher.h> #include "fscrypt_private.h" +/** + * struct fscrypt_nokey_name - identifier for directory entry when key is absent + * + * When userspace lists an encrypted directory without access to the key, the + * filesystem must present a unique "no-key name" for each filename that allows + * it to find the directory entry again if requested. Naively, that would just + * mean using the ciphertext filenames. However, since the ciphertext filenames + * can contain illegal characters ('\0' and '/'), they must be encoded in some + * way. We use base64. But that can cause names to exceed NAME_MAX (255 + * bytes), so we also need to use a strong hash to abbreviate long names. + * + * The filesystem may also need another kind of hash, the "dirhash", to quickly + * find the directory entry. Since filesystems normally compute the dirhash + * over the on-disk filename (i.e. the ciphertext), it's not computable from + * no-key names that abbreviate the ciphertext using the strong hash to fit in + * NAME_MAX. It's also not computable if it's a keyed hash taken over the + * plaintext (but it may still be available in the on-disk directory entry); + * casefolded directories use this type of dirhash. At least in these cases, + * each no-key name must include the name's dirhash too. + * + * To meet all these requirements, we base64-encode the following + * variable-length structure. It contains the dirhash, or 0's if the filesystem + * didn't provide one; up to 149 bytes of the ciphertext name; and for + * ciphertexts longer than 149 bytes, also the SHA-256 of the remaining bytes. + * + * This ensures that each no-key name contains everything needed to find the + * directory entry again, contains only legal characters, doesn't exceed + * NAME_MAX, is unambiguous unless there's a SHA-256 collision, and that we only + * take the performance hit of SHA-256 on very long filenames (which are rare). + */ +struct fscrypt_nokey_name { + u32 dirhash[2]; + u8 bytes[149]; + u8 sha256[SHA256_DIGEST_SIZE]; +}; /* 189 bytes => 252 bytes base64-encoded, which is <= NAME_MAX (255) */ + +/* + * Decoded size of max-size nokey name, i.e. a name that was abbreviated using + * the strong hash and thus includes the 'sha256' field. This isn't simply + * sizeof(struct fscrypt_nokey_name), as the padding at the end isn't included. + */ +#define FSCRYPT_NOKEY_NAME_MAX offsetofend(struct fscrypt_nokey_name, sha256) + +static struct crypto_shash *sha256_hash_tfm; + +static int fscrypt_do_sha256(const u8 *data, unsigned int data_len, u8 *result) +{ + struct crypto_shash *tfm = READ_ONCE(sha256_hash_tfm); + + if (unlikely(!tfm)) { + struct crypto_shash *prev_tfm; + + tfm = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(tfm)) { + fscrypt_err(NULL, + "Error allocating SHA-256 transform: %ld", + PTR_ERR(tfm)); + return PTR_ERR(tfm); + } + prev_tfm = cmpxchg(&sha256_hash_tfm, NULL, tfm); + if (prev_tfm) { + crypto_free_shash(tfm); + tfm = prev_tfm; + } + } + { + SHASH_DESC_ON_STACK(desc, tfm); + + desc->tfm = tfm; + + return crypto_shash_digest(desc, data, data_len, result); + } +} + static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) { if (str->len == 1 && str->name[0] == '.') @@ -27,19 +104,19 @@ static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) } /** - * fname_encrypt() - encrypt a filename + * fscrypt_fname_encrypt() - encrypt a filename * * The output buffer must be at least as large as the input buffer. * Any extra space is filled with NUL padding before encryption. * * Return: 0 on success, -errno on failure */ -int fname_encrypt(struct inode *inode, const struct qstr *iname, - u8 *out, unsigned int olen) +int fscrypt_fname_encrypt(const struct inode *inode, const struct qstr *iname, + u8 *out, unsigned int olen) { struct skcipher_request *req = NULL; DECLARE_CRYPTO_WAIT(wait); - struct fscrypt_info *ci = inode->i_crypt_info; + const struct fscrypt_info *ci = inode->i_crypt_info; struct crypto_skcipher *tfm = ci->ci_ctfm; union fscrypt_iv iv; struct scatterlist sg; @@ -85,14 +162,14 @@ int fname_encrypt(struct inode *inode, const struct qstr *iname, * * Return: 0 on success, -errno on failure */ -static int fname_decrypt(struct inode *inode, - const struct fscrypt_str *iname, - struct fscrypt_str *oname) +static int fname_decrypt(const struct inode *inode, + const struct fscrypt_str *iname, + struct fscrypt_str *oname) { struct skcipher_request *req = NULL; DECLARE_CRYPTO_WAIT(wait); struct scatterlist src_sg, dst_sg; - struct fscrypt_info *ci = inode->i_crypt_info; + const struct fscrypt_info *ci = inode->i_crypt_info; struct crypto_skcipher *tfm = ci->ci_ctfm; union fscrypt_iv iv; int res; @@ -206,9 +283,7 @@ int fscrypt_fname_alloc_buffer(const struct inode *inode, u32 max_encrypted_len, struct fscrypt_str *crypto_str) { - const u32 max_encoded_len = - max_t(u32, BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE), - 1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name))); + const u32 max_encoded_len = BASE64_CHARS(FSCRYPT_NOKEY_NAME_MAX); u32 max_presented_len; max_presented_len = max(max_encoded_len, max_encrypted_len); @@ -241,19 +316,21 @@ EXPORT_SYMBOL(fscrypt_fname_free_buffer); * * The caller must have allocated sufficient memory for the @oname string. * - * If the key is available, we'll decrypt the disk name; otherwise, we'll encode - * it for presentation. Short names are directly base64-encoded, while long - * names are encoded in fscrypt_digested_name format. + * If the key is available, we'll decrypt the disk name. Otherwise, we'll + * encode it for presentation in fscrypt_nokey_name format. + * See struct fscrypt_nokey_name for details. * * Return: 0 on success, -errno on failure */ -int fscrypt_fname_disk_to_usr(struct inode *inode, - u32 hash, u32 minor_hash, - const struct fscrypt_str *iname, - struct fscrypt_str *oname) +int fscrypt_fname_disk_to_usr(const struct inode *inode, + u32 hash, u32 minor_hash, + const struct fscrypt_str *iname, + struct fscrypt_str *oname) { const struct qstr qname = FSTR_TO_QSTR(iname); - struct fscrypt_digested_name digested_name; + struct fscrypt_nokey_name nokey_name; + u32 size; /* size of the unencoded no-key name */ + int err; if (fscrypt_is_dot_dotdot(&qname)) { oname->name[0] = '.'; @@ -268,24 +345,37 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, if (fscrypt_has_encryption_key(inode)) return fname_decrypt(inode, iname, oname); - if (iname->len <= FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE) { - oname->len = base64_encode(iname->name, iname->len, - oname->name); - return 0; - } + /* + * Sanity check that struct fscrypt_nokey_name doesn't have padding + * between fields and that its encoded size never exceeds NAME_MAX. + */ + BUILD_BUG_ON(offsetofend(struct fscrypt_nokey_name, dirhash) != + offsetof(struct fscrypt_nokey_name, bytes)); + BUILD_BUG_ON(offsetofend(struct fscrypt_nokey_name, bytes) != + offsetof(struct fscrypt_nokey_name, sha256)); + BUILD_BUG_ON(BASE64_CHARS(FSCRYPT_NOKEY_NAME_MAX) > NAME_MAX); + if (hash) { - digested_name.hash = hash; - digested_name.minor_hash = minor_hash; + nokey_name.dirhash[0] = hash; + nokey_name.dirhash[1] = minor_hash; + } else { + nokey_name.dirhash[0] = 0; + nokey_name.dirhash[1] = 0; + } + if (iname->len <= sizeof(nokey_name.bytes)) { + memcpy(nokey_name.bytes, iname->name, iname->len); + size = offsetof(struct fscrypt_nokey_name, bytes[iname->len]); } else { - digested_name.hash = 0; - digested_name.minor_hash = 0; + memcpy(nokey_name.bytes, iname->name, sizeof(nokey_name.bytes)); + /* Compute strong hash of remaining part of name. */ + err = fscrypt_do_sha256(&iname->name[sizeof(nokey_name.bytes)], + iname->len - sizeof(nokey_name.bytes), + nokey_name.sha256); + if (err) + return err; + size = FSCRYPT_NOKEY_NAME_MAX; } - memcpy(digested_name.digest, - FSCRYPT_FNAME_DIGEST(iname->name, iname->len), - FSCRYPT_FNAME_DIGEST_SIZE); - oname->name[0] = '_'; - oname->len = 1 + base64_encode((const u8 *)&digested_name, - sizeof(digested_name), oname->name + 1); + oname->len = base64_encode((const u8 *)&nokey_name, size, oname->name); return 0; } EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); @@ -306,8 +396,7 @@ EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); * get the disk_name. * * Else, for keyless @lookup operations, @iname is the presented ciphertext, so - * we decode it to get either the ciphertext disk_name (for short names) or the - * fscrypt_digested_name (for long names). Non-@lookup operations will be + * we decode it to get the fscrypt_nokey_name. Non-@lookup operations will be * impossible in this case, so we fail them with ENOKEY. * * If successful, fscrypt_free_filename() must be called later to clean up. @@ -317,8 +406,8 @@ EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct fscrypt_name *fname) { + struct fscrypt_nokey_name *nokey_name; int ret; - int digested; memset(fname, 0, sizeof(struct fscrypt_name)); fname->usr_fname = iname; @@ -342,8 +431,8 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, if (!fname->crypto_buf.name) return -ENOMEM; - ret = fname_encrypt(dir, iname, fname->crypto_buf.name, - fname->crypto_buf.len); + ret = fscrypt_fname_encrypt(dir, iname, fname->crypto_buf.name, + fname->crypto_buf.len); if (ret) goto errout; fname->disk_name.name = fname->crypto_buf.name; @@ -358,40 +447,31 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, * We don't have the key and we are doing a lookup; decode the * user-supplied name */ - if (iname->name[0] == '_') { - if (iname->len != - 1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name))) - return -ENOENT; - digested = 1; - } else { - if (iname->len > - BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE)) - return -ENOENT; - digested = 0; - } - fname->crypto_buf.name = - kmalloc(max_t(size_t, FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE, - sizeof(struct fscrypt_digested_name)), - GFP_KERNEL); + if (iname->len > BASE64_CHARS(FSCRYPT_NOKEY_NAME_MAX)) + return -ENOENT; + + fname->crypto_buf.name = kmalloc(FSCRYPT_NOKEY_NAME_MAX, GFP_KERNEL); if (fname->crypto_buf.name == NULL) return -ENOMEM; - ret = base64_decode(iname->name + digested, iname->len - digested, - fname->crypto_buf.name); - if (ret < 0) { + ret = base64_decode(iname->name, iname->len, fname->crypto_buf.name); + if (ret < (int)offsetof(struct fscrypt_nokey_name, bytes[1]) || + (ret > offsetof(struct fscrypt_nokey_name, sha256) && + ret != FSCRYPT_NOKEY_NAME_MAX)) { ret = -ENOENT; goto errout; } fname->crypto_buf.len = ret; - if (digested) { - const struct fscrypt_digested_name *n = - (const void *)fname->crypto_buf.name; - fname->hash = n->hash; - fname->minor_hash = n->minor_hash; - } else { - fname->disk_name.name = fname->crypto_buf.name; - fname->disk_name.len = fname->crypto_buf.len; + + nokey_name = (void *)fname->crypto_buf.name; + fname->hash = nokey_name->dirhash[0]; + fname->minor_hash = nokey_name->dirhash[1]; + if (ret != FSCRYPT_NOKEY_NAME_MAX) { + /* The full ciphertext filename is available. */ + fname->disk_name.name = nokey_name->bytes; + fname->disk_name.len = + ret - offsetof(struct fscrypt_nokey_name, bytes); } return 0; @@ -400,3 +480,109 @@ errout: return ret; } EXPORT_SYMBOL(fscrypt_setup_filename); + +/** + * fscrypt_match_name() - test whether the given name matches a directory entry + * @fname: the name being searched for + * @de_name: the name from the directory entry + * @de_name_len: the length of @de_name in bytes + * + * Normally @fname->disk_name will be set, and in that case we simply compare + * that to the name stored in the directory entry. The only exception is that + * if we don't have the key for an encrypted directory and the name we're + * looking for is very long, then we won't have the full disk_name and instead + * we'll need to match against a fscrypt_nokey_name that includes a strong hash. + * + * Return: %true if the name matches, otherwise %false. + */ +bool fscrypt_match_name(const struct fscrypt_name *fname, + const u8 *de_name, u32 de_name_len) +{ + const struct fscrypt_nokey_name *nokey_name = + (const void *)fname->crypto_buf.name; + u8 sha256[SHA256_DIGEST_SIZE]; + + if (likely(fname->disk_name.name)) { + if (de_name_len != fname->disk_name.len) + return false; + return !memcmp(de_name, fname->disk_name.name, de_name_len); + } + if (de_name_len <= sizeof(nokey_name->bytes)) + return false; + if (memcmp(de_name, nokey_name->bytes, sizeof(nokey_name->bytes))) + return false; + if (fscrypt_do_sha256(&de_name[sizeof(nokey_name->bytes)], + de_name_len - sizeof(nokey_name->bytes), sha256)) + return false; + return !memcmp(sha256, nokey_name->sha256, sizeof(sha256)); +} +EXPORT_SYMBOL_GPL(fscrypt_match_name); + +/** + * fscrypt_fname_siphash() - calculate the SipHash of a filename + * @dir: the parent directory + * @name: the filename to calculate the SipHash of + * + * Given a plaintext filename @name and a directory @dir which uses SipHash as + * its dirhash method and has had its fscrypt key set up, this function + * calculates the SipHash of that name using the directory's secret dirhash key. + * + * Return: the SipHash of @name using the hash key of @dir + */ +u64 fscrypt_fname_siphash(const struct inode *dir, const struct qstr *name) +{ + const struct fscrypt_info *ci = dir->i_crypt_info; + + WARN_ON(!ci->ci_dirhash_key_initialized); + + return siphash(name->name, name->len, &ci->ci_dirhash_key); +} +EXPORT_SYMBOL_GPL(fscrypt_fname_siphash); + +/* + * Validate dentries in encrypted directories to make sure we aren't potentially + * caching stale dentries after a key has been added. + */ +static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct dentry *dir; + int err; + int valid; + + /* + * Plaintext names are always valid, since fscrypt doesn't support + * reverting to ciphertext names without evicting the directory's inode + * -- which implies eviction of the dentries in the directory. + */ + if (!(dentry->d_flags & DCACHE_ENCRYPTED_NAME)) + return 1; + + /* + * Ciphertext name; valid if the directory's key is still unavailable. + * + * Although fscrypt forbids rename() on ciphertext names, we still must + * use dget_parent() here rather than use ->d_parent directly. That's + * because a corrupted fs image may contain directory hard links, which + * the VFS handles by moving the directory's dentry tree in the dcache + * each time ->lookup() finds the directory and it already has a dentry + * elsewhere. Thus ->d_parent can be changing, and we must safely grab + * a reference to some ->d_parent to prevent it from being freed. + */ + + if (flags & LOOKUP_RCU) + return -ECHILD; + + dir = dget_parent(dentry); + err = fscrypt_get_encryption_info(d_inode(dir)); + valid = !fscrypt_has_encryption_key(d_inode(dir)); + dput(dir); + + if (err < 0) + return err; + + return valid; +} + +const struct dentry_operations fscrypt_d_ops = { + .d_revalidate = fscrypt_d_revalidate, +}; diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 130b50e5a011..9aae851409e5 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -12,6 +12,7 @@ #define _FSCRYPT_PRIVATE_H #include <linux/fscrypt.h> +#include <linux/siphash.h> #include <crypto/hash.h> #define CONST_STRLEN(str) (sizeof(str) - 1) @@ -136,12 +137,6 @@ fscrypt_policy_flags(const union fscrypt_policy *policy) BUG(); } -static inline bool -fscrypt_is_direct_key_policy(const union fscrypt_policy *policy) -{ - return fscrypt_policy_flags(policy) & FSCRYPT_POLICY_FLAG_DIRECT_KEY; -} - /** * For encrypted symlinks, the ciphertext length is stored at the beginning * of the string in little-endian format. @@ -194,6 +189,14 @@ struct fscrypt_info { */ struct fscrypt_direct_key *ci_direct_key; + /* + * This inode's hash key for filenames. This is a 128-bit SipHash-2-4 + * key. This is only set for directories that use a keyed dirhash over + * the plaintext filenames -- currently just casefolded directories. + */ + siphash_key_t ci_dirhash_key; + bool ci_dirhash_key_initialized; + /* The encryption policy used by this inode */ union fscrypt_policy ci_policy; @@ -206,24 +209,6 @@ typedef enum { FS_ENCRYPT, } fscrypt_direction_t; -static inline bool fscrypt_valid_enc_modes(u32 contents_mode, - u32 filenames_mode) -{ - if (contents_mode == FSCRYPT_MODE_AES_128_CBC && - filenames_mode == FSCRYPT_MODE_AES_128_CTS) - return true; - - if (contents_mode == FSCRYPT_MODE_AES_256_XTS && - filenames_mode == FSCRYPT_MODE_AES_256_CTS) - return true; - - if (contents_mode == FSCRYPT_MODE_ADIANTUM && - filenames_mode == FSCRYPT_MODE_ADIANTUM) - return true; - - return false; -} - /* crypto.c */ extern struct kmem_cache *fscrypt_info_cachep; extern int fscrypt_initialize(unsigned int cop_flags); @@ -233,7 +218,6 @@ extern int fscrypt_crypt_block(const struct inode *inode, unsigned int len, unsigned int offs, gfp_t gfp_flags); extern struct page *fscrypt_alloc_bounce_page(gfp_t gfp_flags); -extern const struct dentry_operations fscrypt_d_ops; extern void __printf(3, 4) __cold fscrypt_msg(const struct inode *inode, const char *level, const char *fmt, ...); @@ -260,11 +244,13 @@ void fscrypt_generate_iv(union fscrypt_iv *iv, u64 lblk_num, const struct fscrypt_info *ci); /* fname.c */ -extern int fname_encrypt(struct inode *inode, const struct qstr *iname, - u8 *out, unsigned int olen); +extern int fscrypt_fname_encrypt(const struct inode *inode, + const struct qstr *iname, + u8 *out, unsigned int olen); extern bool fscrypt_fname_encrypted_size(const struct inode *inode, u32 orig_len, u32 max_len, u32 *encrypted_len_ret); +extern const struct dentry_operations fscrypt_d_ops; /* hkdf.c */ @@ -283,11 +269,12 @@ extern int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, * output doesn't reveal another. */ #define HKDF_CONTEXT_KEY_IDENTIFIER 1 -#define HKDF_CONTEXT_PER_FILE_KEY 2 +#define HKDF_CONTEXT_PER_FILE_ENC_KEY 2 #define HKDF_CONTEXT_DIRECT_KEY 3 #define HKDF_CONTEXT_IV_INO_LBLK_64_KEY 4 +#define HKDF_CONTEXT_DIRHASH_KEY 5 -extern int fscrypt_hkdf_expand(struct fscrypt_hkdf *hkdf, u8 context, +extern int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, const u8 *info, unsigned int infolen, u8 *okm, unsigned int okmlen); @@ -448,18 +435,17 @@ struct fscrypt_mode { int logged_impl_name; }; -static inline bool -fscrypt_mode_supports_direct_key(const struct fscrypt_mode *mode) -{ - return mode->ivsize >= offsetofend(union fscrypt_iv, nonce); -} +extern struct fscrypt_mode fscrypt_modes[]; extern struct crypto_skcipher * fscrypt_allocate_skcipher(struct fscrypt_mode *mode, const u8 *raw_key, const struct inode *inode); -extern int fscrypt_set_derived_key(struct fscrypt_info *ci, - const u8 *derived_key); +extern int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, + const u8 *raw_key); + +extern int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, + const struct fscrypt_master_key *mk); /* keysetup_v1.c */ diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c index f21873e1b467..efb95bd19a89 100644 --- a/fs/crypto/hkdf.c +++ b/fs/crypto/hkdf.c @@ -112,7 +112,7 @@ out: * adds to its application-specific info strings to guarantee that it doesn't * accidentally repeat an info string when using HKDF for different purposes.) */ -int fscrypt_hkdf_expand(struct fscrypt_hkdf *hkdf, u8 context, +int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, const u8 *info, unsigned int infolen, u8 *okm, unsigned int okmlen) { diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c index bb3b7fcfdd48..5ef861742921 100644 --- a/fs/crypto/hooks.c +++ b/fs/crypto/hooks.c @@ -5,6 +5,8 @@ * Encryption hooks for higher-level filesystem operations. */ +#include <linux/key.h> + #include "fscrypt_private.h" /** @@ -122,6 +124,48 @@ int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry, } EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup); +/** + * fscrypt_prepare_setflags() - prepare to change flags with FS_IOC_SETFLAGS + * @inode: the inode on which flags are being changed + * @oldflags: the old flags + * @flags: the new flags + * + * The caller should be holding i_rwsem for write. + * + * Return: 0 on success; -errno if the flags change isn't allowed or if + * another error occurs. + */ +int fscrypt_prepare_setflags(struct inode *inode, + unsigned int oldflags, unsigned int flags) +{ + struct fscrypt_info *ci; + struct fscrypt_master_key *mk; + int err; + + /* + * When the CASEFOLD flag is set on an encrypted directory, we must + * derive the secret key needed for the dirhash. This is only possible + * if the directory uses a v2 encryption policy. + */ + if (IS_ENCRYPTED(inode) && (flags & ~oldflags & FS_CASEFOLD_FL)) { + err = fscrypt_require_key(inode); + if (err) + return err; + ci = inode->i_crypt_info; + if (ci->ci_policy.version != FSCRYPT_POLICY_V2) + return -EINVAL; + mk = ci->ci_master_key->payload.data[0]; + down_read(&mk->mk_secret_sem); + if (is_master_key_secret_present(&mk->mk_secret)) + err = fscrypt_derive_dirhash_key(ci, mk); + else + err = -ENOKEY; + up_read(&mk->mk_secret_sem); + return err; + } + return 0; +} + int __fscrypt_prepare_symlink(struct inode *dir, unsigned int len, unsigned int max_len, struct fscrypt_str *disk_link) @@ -188,7 +232,8 @@ int __fscrypt_encrypt_symlink(struct inode *inode, const char *target, ciphertext_len = disk_link->len - sizeof(*sd); sd->len = cpu_to_le16(ciphertext_len); - err = fname_encrypt(inode, &iname, sd->encrypted_path, ciphertext_len); + err = fscrypt_fname_encrypt(inode, &iname, sd->encrypted_path, + ciphertext_len); if (err) goto err_free_sd; diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index 40cca351273f..ab41b25d4fa1 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -465,6 +465,109 @@ out_unlock: return err; } +static int fscrypt_provisioning_key_preparse(struct key_preparsed_payload *prep) +{ + const struct fscrypt_provisioning_key_payload *payload = prep->data; + + if (prep->datalen < sizeof(*payload) + FSCRYPT_MIN_KEY_SIZE || + prep->datalen > sizeof(*payload) + FSCRYPT_MAX_KEY_SIZE) + return -EINVAL; + + if (payload->type != FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR && + payload->type != FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) + return -EINVAL; + + if (payload->__reserved) + return -EINVAL; + + prep->payload.data[0] = kmemdup(payload, prep->datalen, GFP_KERNEL); + if (!prep->payload.data[0]) + return -ENOMEM; + + prep->quotalen = prep->datalen; + return 0; +} + +static void fscrypt_provisioning_key_free_preparse( + struct key_preparsed_payload *prep) +{ + kzfree(prep->payload.data[0]); +} + +static void fscrypt_provisioning_key_describe(const struct key *key, + struct seq_file *m) +{ + seq_puts(m, key->description); + if (key_is_positive(key)) { + const struct fscrypt_provisioning_key_payload *payload = + key->payload.data[0]; + + seq_printf(m, ": %u [%u]", key->datalen, payload->type); + } +} + +static void fscrypt_provisioning_key_destroy(struct key *key) +{ + kzfree(key->payload.data[0]); +} + +static struct key_type key_type_fscrypt_provisioning = { + .name = "fscrypt-provisioning", + .preparse = fscrypt_provisioning_key_preparse, + .free_preparse = fscrypt_provisioning_key_free_preparse, + .instantiate = generic_key_instantiate, + .describe = fscrypt_provisioning_key_describe, + .destroy = fscrypt_provisioning_key_destroy, +}; + +/* + * Retrieve the raw key from the Linux keyring key specified by 'key_id', and + * store it into 'secret'. + * + * The key must be of type "fscrypt-provisioning" and must have the field + * fscrypt_provisioning_key_payload::type set to 'type', indicating that it's + * only usable with fscrypt with the particular KDF version identified by + * 'type'. We don't use the "logon" key type because there's no way to + * completely restrict the use of such keys; they can be used by any kernel API + * that accepts "logon" keys and doesn't require a specific service prefix. + * + * The ability to specify the key via Linux keyring key is intended for cases + * where userspace needs to re-add keys after the filesystem is unmounted and + * re-mounted. Most users should just provide the raw key directly instead. + */ +static int get_keyring_key(u32 key_id, u32 type, + struct fscrypt_master_key_secret *secret) +{ + key_ref_t ref; + struct key *key; + const struct fscrypt_provisioning_key_payload *payload; + int err; + + ref = lookup_user_key(key_id, 0, KEY_NEED_SEARCH); + if (IS_ERR(ref)) + return PTR_ERR(ref); + key = key_ref_to_ptr(ref); + + if (key->type != &key_type_fscrypt_provisioning) + goto bad_key; + payload = key->payload.data[0]; + + /* Don't allow fscrypt v1 keys to be used as v2 keys and vice versa. */ + if (payload->type != type) + goto bad_key; + + secret->size = key->datalen - sizeof(*payload); + memcpy(secret->raw, payload->raw, secret->size); + err = 0; + goto out_put; + +bad_key: + err = -EKEYREJECTED; +out_put: + key_ref_put(ref); + return err; +} + /* * Add a master encryption key to the filesystem, causing all files which were * encrypted with it to appear "unlocked" (decrypted) when accessed. @@ -503,18 +606,25 @@ int fscrypt_ioctl_add_key(struct file *filp, void __user *_uarg) if (!valid_key_spec(&arg.key_spec)) return -EINVAL; - if (arg.raw_size < FSCRYPT_MIN_KEY_SIZE || - arg.raw_size > FSCRYPT_MAX_KEY_SIZE) - return -EINVAL; - if (memchr_inv(arg.__reserved, 0, sizeof(arg.__reserved))) return -EINVAL; memset(&secret, 0, sizeof(secret)); - secret.size = arg.raw_size; - err = -EFAULT; - if (copy_from_user(secret.raw, uarg->raw, secret.size)) - goto out_wipe_secret; + if (arg.key_id) { + if (arg.raw_size != 0) + return -EINVAL; + err = get_keyring_key(arg.key_id, arg.key_spec.type, &secret); + if (err) + goto out_wipe_secret; + } else { + if (arg.raw_size < FSCRYPT_MIN_KEY_SIZE || + arg.raw_size > FSCRYPT_MAX_KEY_SIZE) + return -EINVAL; + secret.size = arg.raw_size; + err = -EFAULT; + if (copy_from_user(secret.raw, uarg->raw, secret.size)) + goto out_wipe_secret; + } switch (arg.key_spec.type) { case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR: @@ -666,9 +776,6 @@ static int check_for_busy_inodes(struct super_block *sb, struct list_head *pos; size_t busy_count = 0; unsigned long ino; - struct dentry *dentry; - char _path[256]; - char *path = NULL; spin_lock(&mk->mk_decrypted_inodes_lock); @@ -687,22 +794,14 @@ static int check_for_busy_inodes(struct super_block *sb, struct fscrypt_info, ci_master_key_link)->ci_inode; ino = inode->i_ino; - dentry = d_find_alias(inode); } spin_unlock(&mk->mk_decrypted_inodes_lock); - if (dentry) { - path = dentry_path(dentry, _path, sizeof(_path)); - dput(dentry); - } - if (IS_ERR_OR_NULL(path)) - path = "(unknown)"; - fscrypt_warn(NULL, - "%s: %zu inode(s) still busy after removing key with %s %*phN, including ino %lu (%s)", + "%s: %zu inode(s) still busy after removing key with %s %*phN, including ino %lu", sb->s_id, busy_count, master_key_spec_type(&mk->mk_spec), master_key_spec_len(&mk->mk_spec), (u8 *)&mk->mk_spec.u, - ino, path); + ino); return -EBUSY; } @@ -978,8 +1077,14 @@ int __init fscrypt_init_keyring(void) if (err) goto err_unregister_fscrypt; + err = register_key_type(&key_type_fscrypt_provisioning); + if (err) + goto err_unregister_fscrypt_user; + return 0; +err_unregister_fscrypt_user: + unregister_key_type(&key_type_fscrypt_user); err_unregister_fscrypt: unregister_key_type(&key_type_fscrypt); return err; diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index f577bb6613f9..65cb09fa6ead 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -13,7 +13,7 @@ #include "fscrypt_private.h" -static struct fscrypt_mode available_modes[] = { +struct fscrypt_mode fscrypt_modes[] = { [FSCRYPT_MODE_AES_256_XTS] = { .friendly_name = "AES-256-XTS", .cipher_str = "xts(aes)", @@ -51,10 +51,10 @@ select_encryption_mode(const union fscrypt_policy *policy, const struct inode *inode) { if (S_ISREG(inode->i_mode)) - return &available_modes[fscrypt_policy_contents_mode(policy)]; + return &fscrypt_modes[fscrypt_policy_contents_mode(policy)]; if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - return &available_modes[fscrypt_policy_fnames_mode(policy)]; + return &fscrypt_modes[fscrypt_policy_fnames_mode(policy)]; WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n", inode->i_ino, (inode->i_mode & S_IFMT)); @@ -89,8 +89,11 @@ struct crypto_skcipher *fscrypt_allocate_skcipher(struct fscrypt_mode *mode, * first time a mode is used. */ pr_info("fscrypt: %s using implementation \"%s\"\n", - mode->friendly_name, - crypto_skcipher_alg(tfm)->base.cra_driver_name); + mode->friendly_name, crypto_skcipher_driver_name(tfm)); + } + if (WARN_ON(crypto_skcipher_ivsize(tfm) != mode->ivsize)) { + err = -EINVAL; + goto err_free_tfm; } crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); err = crypto_skcipher_setkey(tfm, raw_key, mode->keysize); @@ -104,12 +107,12 @@ err_free_tfm: return ERR_PTR(err); } -/* Given the per-file key, set up the file's crypto transform object */ -int fscrypt_set_derived_key(struct fscrypt_info *ci, const u8 *derived_key) +/* Given a per-file encryption key, set up the file's crypto transform object */ +int fscrypt_set_per_file_enc_key(struct fscrypt_info *ci, const u8 *raw_key) { struct crypto_skcipher *tfm; - tfm = fscrypt_allocate_skcipher(ci->ci_mode, derived_key, ci->ci_inode); + tfm = fscrypt_allocate_skcipher(ci->ci_mode, raw_key, ci->ci_inode); if (IS_ERR(tfm)) return PTR_ERR(tfm); @@ -118,15 +121,15 @@ int fscrypt_set_derived_key(struct fscrypt_info *ci, const u8 *derived_key) return 0; } -static int setup_per_mode_key(struct fscrypt_info *ci, - struct fscrypt_master_key *mk, - struct crypto_skcipher **tfms, - u8 hkdf_context, bool include_fs_uuid) +static int setup_per_mode_enc_key(struct fscrypt_info *ci, + struct fscrypt_master_key *mk, + struct crypto_skcipher **tfms, + u8 hkdf_context, bool include_fs_uuid) { const struct inode *inode = ci->ci_inode; const struct super_block *sb = inode->i_sb; struct fscrypt_mode *mode = ci->ci_mode; - u8 mode_num = mode - available_modes; + const u8 mode_num = mode - fscrypt_modes; struct crypto_skcipher *tfm, *prev_tfm; u8 mode_key[FSCRYPT_MAX_KEY_SIZE]; u8 hkdf_info[sizeof(mode_num) + sizeof(sb->s_uuid)]; @@ -171,29 +174,37 @@ done: return 0; } +int fscrypt_derive_dirhash_key(struct fscrypt_info *ci, + const struct fscrypt_master_key *mk) +{ + int err; + + err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, HKDF_CONTEXT_DIRHASH_KEY, + ci->ci_nonce, FS_KEY_DERIVATION_NONCE_SIZE, + (u8 *)&ci->ci_dirhash_key, + sizeof(ci->ci_dirhash_key)); + if (err) + return err; + ci->ci_dirhash_key_initialized = true; + return 0; +} + static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, struct fscrypt_master_key *mk) { - u8 derived_key[FSCRYPT_MAX_KEY_SIZE]; int err; if (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) { /* - * DIRECT_KEY: instead of deriving per-file keys, the per-file - * nonce will be included in all the IVs. But unlike v1 - * policies, for v2 policies in this case we don't encrypt with - * the master key directly but rather derive a per-mode key. - * This ensures that the master key is consistently used only - * for HKDF, avoiding key reuse issues. + * DIRECT_KEY: instead of deriving per-file encryption keys, the + * per-file nonce will be included in all the IVs. But unlike + * v1 policies, for v2 policies in this case we don't encrypt + * with the master key directly but rather derive a per-mode + * encryption key. This ensures that the master key is + * consistently used only for HKDF, avoiding key reuse issues. */ - if (!fscrypt_mode_supports_direct_key(ci->ci_mode)) { - fscrypt_warn(ci->ci_inode, - "Direct key flag not allowed with %s", - ci->ci_mode->friendly_name); - return -EINVAL; - } - return setup_per_mode_key(ci, mk, mk->mk_direct_tfms, - HKDF_CONTEXT_DIRECT_KEY, false); + err = setup_per_mode_enc_key(ci, mk, mk->mk_direct_tfms, + HKDF_CONTEXT_DIRECT_KEY, false); } else if (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) { /* @@ -202,21 +213,34 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_info *ci, * the IVs. This format is optimized for use with inline * encryption hardware compliant with the UFS or eMMC standards. */ - return setup_per_mode_key(ci, mk, mk->mk_iv_ino_lblk_64_tfms, - HKDF_CONTEXT_IV_INO_LBLK_64_KEY, - true); + err = setup_per_mode_enc_key(ci, mk, mk->mk_iv_ino_lblk_64_tfms, + HKDF_CONTEXT_IV_INO_LBLK_64_KEY, + true); + } else { + u8 derived_key[FSCRYPT_MAX_KEY_SIZE]; + + err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, + HKDF_CONTEXT_PER_FILE_ENC_KEY, + ci->ci_nonce, + FS_KEY_DERIVATION_NONCE_SIZE, + derived_key, ci->ci_mode->keysize); + if (err) + return err; + + err = fscrypt_set_per_file_enc_key(ci, derived_key); + memzero_explicit(derived_key, ci->ci_mode->keysize); } - - err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, - HKDF_CONTEXT_PER_FILE_KEY, - ci->ci_nonce, FS_KEY_DERIVATION_NONCE_SIZE, - derived_key, ci->ci_mode->keysize); if (err) return err; - err = fscrypt_set_derived_key(ci, derived_key); - memzero_explicit(derived_key, ci->ci_mode->keysize); - return err; + /* Derive a secret dirhash key for directories that need it. */ + if (S_ISDIR(ci->ci_inode->i_mode) && IS_CASEFOLDED(ci->ci_inode)) { + err = fscrypt_derive_dirhash_key(ci, mk); + if (err) + return err; + } + + return 0; } /* diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c index 5298ef22aa85..801b48c0cd7f 100644 --- a/fs/crypto/keysetup_v1.c +++ b/fs/crypto/keysetup_v1.c @@ -9,7 +9,7 @@ * This file implements compatibility functions for the original encryption * policy version ("v1"), including: * - * - Deriving per-file keys using the AES-128-ECB based KDF + * - Deriving per-file encryption keys using the AES-128-ECB based KDF * (rather than the new method of using HKDF-SHA512) * * - Retrieving fscrypt master keys from process-subscribed keyrings @@ -253,23 +253,8 @@ err_free_dk: static int setup_v1_file_key_direct(struct fscrypt_info *ci, const u8 *raw_master_key) { - const struct fscrypt_mode *mode = ci->ci_mode; struct fscrypt_direct_key *dk; - if (!fscrypt_mode_supports_direct_key(mode)) { - fscrypt_warn(ci->ci_inode, - "Direct key mode not allowed with %s", - mode->friendly_name); - return -EINVAL; - } - - if (ci->ci_policy.v1.contents_encryption_mode != - ci->ci_policy.v1.filenames_encryption_mode) { - fscrypt_warn(ci->ci_inode, - "Direct key mode not allowed with different contents and filenames modes"); - return -EINVAL; - } - dk = fscrypt_get_direct_key(ci, raw_master_key); if (IS_ERR(dk)) return PTR_ERR(dk); @@ -298,7 +283,7 @@ static int setup_v1_file_key_derived(struct fscrypt_info *ci, if (err) goto out; - err = fscrypt_set_derived_key(ci, derived_key); + err = fscrypt_set_per_file_enc_key(ci, derived_key); out: kzfree(derived_key); return err; diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 96f528071bed..cf2a9d26ef7d 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -29,6 +29,43 @@ bool fscrypt_policies_equal(const union fscrypt_policy *policy1, return !memcmp(policy1, policy2, fscrypt_policy_size(policy1)); } +static bool fscrypt_valid_enc_modes(u32 contents_mode, u32 filenames_mode) +{ + if (contents_mode == FSCRYPT_MODE_AES_256_XTS && + filenames_mode == FSCRYPT_MODE_AES_256_CTS) + return true; + + if (contents_mode == FSCRYPT_MODE_AES_128_CBC && + filenames_mode == FSCRYPT_MODE_AES_128_CTS) + return true; + + if (contents_mode == FSCRYPT_MODE_ADIANTUM && + filenames_mode == FSCRYPT_MODE_ADIANTUM) + return true; + + return false; +} + +static bool supported_direct_key_modes(const struct inode *inode, + u32 contents_mode, u32 filenames_mode) +{ + const struct fscrypt_mode *mode; + + if (contents_mode != filenames_mode) { + fscrypt_warn(inode, + "Direct key flag not allowed with different contents and filenames modes"); + return false; + } + mode = &fscrypt_modes[contents_mode]; + + if (mode->ivsize < offsetofend(union fscrypt_iv, nonce)) { + fscrypt_warn(inode, "Direct key flag not allowed with %s", + mode->friendly_name); + return false; + } + return true; +} + static bool supported_iv_ino_lblk_64_policy( const struct fscrypt_policy_v2 *policy, const struct inode *inode) @@ -63,13 +100,82 @@ static bool supported_iv_ino_lblk_64_policy( return true; } +static bool fscrypt_supported_v1_policy(const struct fscrypt_policy_v1 *policy, + const struct inode *inode) +{ + if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode, + policy->filenames_encryption_mode)) { + fscrypt_warn(inode, + "Unsupported encryption modes (contents %d, filenames %d)", + policy->contents_encryption_mode, + policy->filenames_encryption_mode); + return false; + } + + if (policy->flags & ~(FSCRYPT_POLICY_FLAGS_PAD_MASK | + FSCRYPT_POLICY_FLAG_DIRECT_KEY)) { + fscrypt_warn(inode, "Unsupported encryption flags (0x%02x)", + policy->flags); + return false; + } + + if ((policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) && + !supported_direct_key_modes(inode, policy->contents_encryption_mode, + policy->filenames_encryption_mode)) + return false; + + if (IS_CASEFOLDED(inode)) { + /* With v1, there's no way to derive dirhash keys. */ + fscrypt_warn(inode, + "v1 policies can't be used on casefolded directories"); + return false; + } + + return true; +} + +static bool fscrypt_supported_v2_policy(const struct fscrypt_policy_v2 *policy, + const struct inode *inode) +{ + if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode, + policy->filenames_encryption_mode)) { + fscrypt_warn(inode, + "Unsupported encryption modes (contents %d, filenames %d)", + policy->contents_encryption_mode, + policy->filenames_encryption_mode); + return false; + } + + if (policy->flags & ~FSCRYPT_POLICY_FLAGS_VALID) { + fscrypt_warn(inode, "Unsupported encryption flags (0x%02x)", + policy->flags); + return false; + } + + if ((policy->flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) && + !supported_direct_key_modes(inode, policy->contents_encryption_mode, + policy->filenames_encryption_mode)) + return false; + + if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) && + !supported_iv_ino_lblk_64_policy(policy, inode)) + return false; + + if (memchr_inv(policy->__reserved, 0, sizeof(policy->__reserved))) { + fscrypt_warn(inode, "Reserved bits set in encryption policy"); + return false; + } + + return true; +} + /** * fscrypt_supported_policy - check whether an encryption policy is supported * * Given an encryption policy, check whether all its encryption modes and other - * settings are supported by this kernel. (But we don't currently don't check - * for crypto API support here, so attempting to use an algorithm not configured - * into the crypto API will still fail later.) + * settings are supported by this kernel on the given inode. (But we don't + * currently don't check for crypto API support here, so attempting to use an + * algorithm not configured into the crypto API will still fail later.) * * Return: %true if supported, else %false */ @@ -77,60 +183,10 @@ bool fscrypt_supported_policy(const union fscrypt_policy *policy_u, const struct inode *inode) { switch (policy_u->version) { - case FSCRYPT_POLICY_V1: { - const struct fscrypt_policy_v1 *policy = &policy_u->v1; - - if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode, - policy->filenames_encryption_mode)) { - fscrypt_warn(inode, - "Unsupported encryption modes (contents %d, filenames %d)", - policy->contents_encryption_mode, - policy->filenames_encryption_mode); - return false; - } - - if (policy->flags & ~(FSCRYPT_POLICY_FLAGS_PAD_MASK | - FSCRYPT_POLICY_FLAG_DIRECT_KEY)) { - fscrypt_warn(inode, - "Unsupported encryption flags (0x%02x)", - policy->flags); - return false; - } - - return true; - } - case FSCRYPT_POLICY_V2: { - const struct fscrypt_policy_v2 *policy = &policy_u->v2; - - if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode, - policy->filenames_encryption_mode)) { - fscrypt_warn(inode, - "Unsupported encryption modes (contents %d, filenames %d)", - policy->contents_encryption_mode, - policy->filenames_encryption_mode); - return false; - } - - if (policy->flags & ~FSCRYPT_POLICY_FLAGS_VALID) { - fscrypt_warn(inode, - "Unsupported encryption flags (0x%02x)", - policy->flags); - return false; - } - - if ((policy->flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64) && - !supported_iv_ino_lblk_64_policy(policy, inode)) - return false; - - if (memchr_inv(policy->__reserved, 0, - sizeof(policy->__reserved))) { - fscrypt_warn(inode, - "Reserved bits set in encryption policy"); - return false; - } - - return true; - } + case FSCRYPT_POLICY_V1: + return fscrypt_supported_v1_policy(&policy_u->v1, inode); + case FSCRYPT_POLICY_V2: + return fscrypt_supported_v2_policy(&policy_u->v2, inode); } return false; } diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index dede25247b81..634b09d18b77 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -142,18 +142,21 @@ EXPORT_SYMBOL_GPL(debugfs_file_put); * We also need to exclude any file that has ways to write or alter it as root * can bypass the permissions check. */ -static bool debugfs_is_locked_down(struct inode *inode, - struct file *filp, - const struct file_operations *real_fops) +static int debugfs_locked_down(struct inode *inode, + struct file *filp, + const struct file_operations *real_fops) { if ((inode->i_mode & 07777) == 0444 && !(filp->f_mode & FMODE_WRITE) && !real_fops->unlocked_ioctl && !real_fops->compat_ioctl && !real_fops->mmap) - return false; + return 0; - return security_locked_down(LOCKDOWN_DEBUGFS); + if (security_locked_down(LOCKDOWN_DEBUGFS)) + return -EPERM; + + return 0; } static int open_proxy_open(struct inode *inode, struct file *filp) @@ -168,7 +171,7 @@ static int open_proxy_open(struct inode *inode, struct file *filp) real_fops = debugfs_real_fops(filp); - r = debugfs_is_locked_down(inode, filp, real_fops); + r = debugfs_locked_down(inode, filp, real_fops); if (r) goto out; @@ -298,7 +301,7 @@ static int full_proxy_open(struct inode *inode, struct file *filp) real_fops = debugfs_real_fops(filp); - r = debugfs_is_locked_down(inode, filp, real_fops); + r = debugfs_locked_down(inode, filp, real_fops); if (r) goto out; @@ -496,10 +499,10 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n"); * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be + * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be * returned. * - * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will + * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will * be returned. */ struct dentry *debugfs_create_u32(const char *name, umode_t mode, @@ -581,10 +584,10 @@ DEFINE_DEBUGFS_ATTRIBUTE(fops_ulong_wo, NULL, debugfs_ulong_set, "%llu\n"); * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be + * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be * returned. * - * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will + * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will * be returned. */ struct dentry *debugfs_create_ulong(const char *name, umode_t mode, @@ -846,10 +849,10 @@ static const struct file_operations fops_bool_wo = { * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be + * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be * returned. * - * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will + * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will * be returned. */ struct dentry *debugfs_create_bool(const char *name, umode_t mode, @@ -899,10 +902,10 @@ static const struct file_operations fops_blob = { * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be + * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be * returned. * - * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will + * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will * be returned. */ struct dentry *debugfs_create_blob(const char *name, umode_t mode, @@ -1091,10 +1094,10 @@ static const struct file_operations fops_regset32 = { * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be + * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be * returned. * - * If debugfs is not enabled in the kernel, the value %ERR_PTR(-ENODEV) will + * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will * be returned. */ struct dentry *debugfs_create_regset32(const char *name, umode_t mode, @@ -1158,4 +1161,3 @@ struct dentry *debugfs_create_devm_seqfile(struct device *dev, const char *name, &debugfs_devm_entry_ops); } EXPORT_SYMBOL_GPL(debugfs_create_devm_seqfile); - diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index f4d8df5e4714..dc6cffc4feba 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -423,7 +423,7 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode, * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be + * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be * returned. * * If debugfs is not enabled in the kernel, the value -%ENODEV will be @@ -502,7 +502,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_file_unsafe); * This function will return a pointer to a dentry if it succeeds. This * pointer must be passed to the debugfs_remove() function when the file is * to be removed (no automatic cleanup happens if your module is unloaded, - * you are responsible here.) If an error occurs, %ERR_PTR(-ERROR) will be + * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be * returned. * * If debugfs is not enabled in the kernel, the value -%ENODEV wil |