diff options
Diffstat (limited to 'fs/btrfs/btrfs_inode.h')
-rw-r--r-- | fs/btrfs/btrfs_inode.h | 266 |
1 files changed, 169 insertions, 97 deletions
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 4e12a477d32e..54c2ccb36b61 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -7,12 +7,20 @@ #define BTRFS_INODE_H #include <linux/hash.h> +#include <linux/refcount.h> #include "extent_map.h" #include "extent_io.h" #include "ordered-data.h" #include "delayed-inode.h" /* + * Since we search a directory based on f_pos (struct dir_context::pos) we have + * to start at 2 since '.' and '..' have f_pos of 0 and 1 respectively, so + * everybody else has to start at 2 (see btrfs_real_readdir() and dir_emit_dots()). + */ +#define BTRFS_DIR_START_INDEX 2 + +/* * ordered_data_close is set by truncate when a file that used * to have good data has been truncated to zero. When it is set * the btrfs file release call will add this inode to the @@ -20,16 +28,45 @@ * new data the application may have written before commit. */ enum { - BTRFS_INODE_ORDERED_DATA_CLOSE, + BTRFS_INODE_FLUSH_ON_CLOSE, BTRFS_INODE_DUMMY, BTRFS_INODE_IN_DEFRAG, BTRFS_INODE_HAS_ASYNC_EXTENT, + /* + * Always set under the VFS' inode lock, otherwise it can cause races + * during fsync (we start as a fast fsync and then end up in a full + * fsync racing with ordered extent completion). + */ BTRFS_INODE_NEEDS_FULL_SYNC, BTRFS_INODE_COPY_EVERYTHING, BTRFS_INODE_IN_DELALLOC_LIST, - BTRFS_INODE_READDIO_NEED_LOCK, BTRFS_INODE_HAS_PROPS, BTRFS_INODE_SNAPSHOT_FLUSH, + /* + * Set and used when logging an inode and it serves to signal that an + * inode does not have xattrs, so subsequent fsyncs can avoid searching + * for xattrs to log. This bit must be cleared whenever a xattr is added + * to an inode. + */ + BTRFS_INODE_NO_XATTRS, + /* + * Set when we are in a context where we need to start a transaction and + * have dirty pages with the respective file range locked. This is to + * ensure that when reserving space for the transaction, if we are low + * on available space and need to flush delalloc, we will not flush + * delalloc for this inode, because that could result in a deadlock (on + * the file range, inode's io_tree). + */ + BTRFS_INODE_NO_DELALLOC_FLUSH, + /* + * Set when we are working on enabling verity for a file. Computing and + * writing the whole Merkle tree can take a while so we want to prevent + * races where two separate tasks attempt to simultaneously start verity + * on the same file. + */ + BTRFS_INODE_VERITY_IN_PROGRESS, + /* Set when this inode is a free space inode. */ + BTRFS_INODE_FREE_SPACE_INODE, }; /* in memory btrfs inode */ @@ -45,7 +82,8 @@ struct btrfs_inode { /* * Lock for counters and all fields used to determine if the inode is in * the log or not (last_trans, last_sub_trans, last_log_commit, - * logged_trans). + * logged_trans), to access/update new_delalloc_bytes and to update the + * VFS' inode number of bytes used. */ spinlock_t lock; @@ -58,7 +96,14 @@ struct btrfs_inode { /* special utility tree used to record which mirrors have already been * tried when checksums fail for a given block */ - struct extent_io_tree io_failure_tree; + struct rb_root io_failure_tree; + spinlock_t io_failure_lock; + + /* + * Keep track of where the inode has extent items mapped in order to + * make sure the i_size adjustments are accurate + */ + struct extent_io_tree file_extent_tree; /* held while logging the inode in tree-log.c */ struct mutex log_mutex; @@ -103,17 +148,26 @@ struct btrfs_inode { /* a local copy of root's last_log_commit */ int last_log_commit; - /* total number of bytes pending delalloc, used by stat to calc the - * real block usage of the file + /* + * Total number of bytes pending delalloc, used by stat to calculate the + * real block usage of the file. This is used only for files. */ u64 delalloc_bytes; - /* - * Total number of bytes pending delalloc that fall within a file - * range that is either a hole or beyond EOF (and no prealloc extent - * exists in the range). This is always <= delalloc_bytes. - */ - u64 new_delalloc_bytes; + union { + /* + * Total number of bytes pending delalloc that fall within a file + * range that is either a hole or beyond EOF (and no prealloc extent + * exists in the range). This is always <= delalloc_bytes and this + * is used only for files. + */ + u64 new_delalloc_bytes; + /* + * The offset of the last dir index key that was logged. + * This is used only for directories. + */ + u64 last_dir_index_offset; + }; /* * total number of bytes pending defrag, used by stat to check whether @@ -129,8 +183,9 @@ struct btrfs_inode { u64 disk_i_size; /* - * if this is a directory then index_cnt is the counter for the index - * number for new files that are created + * If this is a directory then index_cnt is the counter for the index + * number for new files that are created. For an empty directory, this + * must be initialized to BTRFS_DIR_START_INDEX. */ u64 index_cnt; @@ -145,13 +200,26 @@ struct btrfs_inode { u64 last_unlink_trans; /* + * The id/generation of the last transaction where this inode was + * either the source or the destination of a clone/dedupe operation. + * Used when logging an inode to know if there are shared extents that + * need special care when logging checksum items, to avoid duplicate + * checksum items in a log (which can lead to a corruption where we end + * up with missing checksum ranges after log replay). + * Protected by the vfs inode lock. + */ + u64 last_reflink_trans; + + /* * Number of bytes outstanding that are going to need csums. This is * used in ENOSPC accounting. */ u64 csum_bytes; - /* flags field from the on disk inode */ + /* Backwards incompatible flags, lower half of inode_item::flags */ u32 flags; + /* Read-only compatibility flags, upper half of inode_item::flags */ + u32 ro_flags; /* * Counters to keep track of the number of extent item's we may use due @@ -181,16 +249,7 @@ struct btrfs_inode { /* Hook into fs_info->delayed_iputs */ struct list_head delayed_iput; - /* - * To avoid races between lockless (i_mutex not held) direct IO writes - * and concurrent fsync requests. Direct IO writes must acquire read - * access on this semaphore for creating an extent map and its - * corresponding ordered extent. The fast fsync path must acquire write - * access on this semaphore before it collects ordered extents and - * extent maps. - */ - struct rw_semaphore dio_sem; - + struct rw_semaphore i_mmap_lock; struct inode vfs_inode; }; @@ -211,26 +270,31 @@ static inline unsigned long btrfs_inode_hash(u64 objectid, return (unsigned long)h; } -static inline void btrfs_insert_inode_hash(struct inode *inode) -{ - unsigned long h = btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root); - - __insert_inode_hash(inode, h); -} +#if BITS_PER_LONG == 32 +/* + * On 32 bit systems the i_ino of struct inode is 32 bits (unsigned long), so + * we use the inode's location objectid which is a u64 to avoid truncation. + */ static inline u64 btrfs_ino(const struct btrfs_inode *inode) { u64 ino = inode->location.objectid; - /* - * !ino: btree_inode - * type == BTRFS_ROOT_ITEM_KEY: subvol dir - */ - if (!ino || inode->location.type == BTRFS_ROOT_ITEM_KEY) + /* type == BTRFS_ROOT_ITEM_KEY: subvol dir */ + if (inode->location.type == BTRFS_ROOT_ITEM_KEY) ino = inode->vfs_inode.i_ino; return ino; } +#else + +static inline u64 btrfs_ino(const struct btrfs_inode *inode) +{ + return inode->vfs_inode.i_ino; +} + +#endif + static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size) { i_size_write(&inode->vfs_inode, size); @@ -239,14 +303,7 @@ static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size) static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode) { - struct btrfs_root *root = inode->root; - - if (root == root->fs_info->tree_root && - btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID) - return true; - if (inode->location.objectid == BTRFS_FREE_INO_OBJECTID) - return true; - return false; + return test_bit(BTRFS_INODE_FREE_SPACE_INODE, &inode->runtime_flags); } static inline bool is_data_inode(struct inode *inode) @@ -265,73 +322,89 @@ static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode, mod); } -static inline int btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation) +/* + * Called every time after doing a buffered, direct IO or memory mapped write. + * + * This is to ensure that if we write to a file that was previously fsynced in + * the current transaction, then try to fsync it again in the same transaction, + * we will know that there were changes in the file and that it needs to be + * logged. + */ +static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode) +{ + spin_lock(&inode->lock); + inode->last_sub_trans = inode->root->log_transid; + spin_unlock(&inode->lock); +} + +/* + * Should be called while holding the inode's VFS lock in exclusive mode or in a + * context where no one else can access the inode concurrently (during inode + * creation or when loading an inode from disk). + */ +static inline void btrfs_set_inode_full_sync(struct btrfs_inode *inode) +{ + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); + /* + * The inode may have been part of a reflink operation in the last + * transaction that modified it, and then a fsync has reset the + * last_reflink_trans to avoid subsequent fsyncs in the same + * transaction to do unnecessary work. So update last_reflink_trans + * to the last_trans value (we have to be pessimistic and assume a + * reflink happened). + * + * The ->last_trans is protected by the inode's spinlock and we can + * have a concurrent ordered extent completion update it. Also set + * last_reflink_trans to ->last_trans only if the former is less than + * the later, because we can be called in a context where + * last_reflink_trans was set to the current transaction generation + * while ->last_trans was not yet updated in the current transaction, + * and therefore has a lower value. + */ + spin_lock(&inode->lock); + if (inode->last_reflink_trans < inode->last_trans) + inode->last_reflink_trans = inode->last_trans; + spin_unlock(&inode->lock); +} + +static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation) { - int ret = 0; + bool ret = false; spin_lock(&inode->lock); if (inode->logged_trans == generation && inode->last_sub_trans <= inode->last_log_commit && - inode->last_sub_trans <= inode->root->last_log_commit) { - /* - * After a ranged fsync we might have left some extent maps - * (that fall outside the fsync's range). So return false - * here if the list isn't empty, to make sure btrfs_log_inode() - * will be called and process those extent maps. - */ - smp_mb(); - if (list_empty(&inode->extent_tree.modified_extents)) - ret = 1; - } + inode->last_sub_trans <= inode->root->last_log_commit) + ret = true; spin_unlock(&inode->lock); return ret; } -#define BTRFS_DIO_ORIG_BIO_SUBMITTED 0x1 - -struct btrfs_dio_private { - struct inode *inode; - unsigned long flags; - u64 logical_offset; - u64 disk_bytenr; - u64 bytes; - void *private; - - /* number of bios pending for this dio */ - atomic_t pending_bios; - - /* IO errors */ - int errors; - - /* orig_bio is our btrfs_io_bio */ - struct bio *orig_bio; - - /* dio_bio came from fs/direct-io.c */ - struct bio *dio_bio; - - /* - * The original bio may be split to several sub-bios, this is - * done during endio of sub-bios - */ - blk_status_t (*subio_endio)(struct inode *, struct btrfs_io_bio *, - blk_status_t); -}; +/* + * Check if the inode has flags compatible with compression + */ +static inline bool btrfs_inode_can_compress(const struct btrfs_inode *inode) +{ + if (inode->flags & BTRFS_INODE_NODATACOW || + inode->flags & BTRFS_INODE_NODATASUM) + return false; + return true; +} /* - * Disable DIO read nolock optimization, so new dio readers will be forced - * to grab i_mutex. It is used to avoid the endless truncate due to - * nonlocked dio read. + * btrfs_inode_item stores flags in a u64, btrfs_inode stores them in two + * separate u32s. These two functions convert between the two representations. */ -static inline void btrfs_inode_block_unlocked_dio(struct btrfs_inode *inode) +static inline u64 btrfs_inode_combine_flags(u32 flags, u32 ro_flags) { - set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags); - smp_mb(); + return (flags | ((u64)ro_flags << 32)); } -static inline void btrfs_inode_resume_unlocked_dio(struct btrfs_inode *inode) +static inline void btrfs_inode_split_flags(u64 inode_item_flags, + u32 *flags, u32 *ro_flags) { - smp_mb__before_atomic(); - clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags); + *flags = (u32)inode_item_flags; + *ro_flags = (u32)(inode_item_flags >> 32); } /* Array of bytes with variable length, hexadecimal format 0x1234 */ @@ -342,8 +415,7 @@ static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode, u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num) { struct btrfs_root *root = inode->root; - struct btrfs_super_block *sb = root->fs_info->super_copy; - const u16 csum_size = btrfs_super_csum_size(sb); + const u32 csum_size = root->fs_info->csum_size; /* Output minus objectid, which is more meaningful */ if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID) |