diff options
Diffstat (limited to 'fs')
908 files changed, 49933 insertions, 40693 deletions
diff --git a/fs/9p/cache.c b/fs/9p/cache.c index f2ba131cede1..cebba4eaa0b5 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -16,186 +16,59 @@ #include "v9fs.h" #include "cache.h" -#define CACHETAG_LEN 11 - -struct fscache_netfs v9fs_cache_netfs = { - .name = "9p", - .version = 0, -}; - -/* - * v9fs_random_cachetag - Generate a random tag to be associated - * with a new cache session. - * - * The value of jiffies is used for a fairly randomly cache tag. - */ - -static -int v9fs_random_cachetag(struct v9fs_session_info *v9ses) +int v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses, + const char *dev_name) { - v9ses->cachetag = kmalloc(CACHETAG_LEN, GFP_KERNEL); - if (!v9ses->cachetag) - return -ENOMEM; + struct fscache_volume *vcookie; + char *name, *p; - return scnprintf(v9ses->cachetag, CACHETAG_LEN, "%lu", jiffies); -} - -const struct fscache_cookie_def v9fs_cache_session_index_def = { - .name = "9P.session", - .type = FSCACHE_COOKIE_TYPE_INDEX, -}; + name = kasprintf(GFP_KERNEL, "9p,%s,%s", + dev_name, v9ses->cachetag ?: v9ses->aname); + if (!name) + return -ENOMEM; -void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses) -{ - /* If no cache session tag was specified, we generate a random one. */ - if (!v9ses->cachetag) { - if (v9fs_random_cachetag(v9ses) < 0) { - v9ses->fscache = NULL; - kfree(v9ses->cachetag); - v9ses->cachetag = NULL; - return; + for (p = name; *p; p++) + if (*p == '/') + *p = ';'; + + vcookie = fscache_acquire_volume(name, NULL, NULL, 0); + p9_debug(P9_DEBUG_FSC, "session %p get volume %p (%s)\n", + v9ses, vcookie, name); + if (IS_ERR(vcookie)) { + if (vcookie != ERR_PTR(-EBUSY)) { + kfree(name); + return PTR_ERR(vcookie); } + pr_err("Cache volume key already in use (%s)\n", name); + vcookie = NULL; } - - v9ses->fscache = fscache_acquire_cookie(v9fs_cache_netfs.primary_index, - &v9fs_cache_session_index_def, - v9ses->cachetag, - strlen(v9ses->cachetag), - NULL, 0, - v9ses, 0, true); - p9_debug(P9_DEBUG_FSC, "session %p get cookie %p\n", - v9ses, v9ses->fscache); -} - -void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses) -{ - p9_debug(P9_DEBUG_FSC, "session %p put cookie %p\n", - v9ses, v9ses->fscache); - fscache_relinquish_cookie(v9ses->fscache, NULL, false); - v9ses->fscache = NULL; + v9ses->fscache = vcookie; + kfree(name); + return 0; } -static enum -fscache_checkaux v9fs_cache_inode_check_aux(void *cookie_netfs_data, - const void *buffer, - uint16_t buflen, - loff_t object_size) -{ - const struct v9fs_inode *v9inode = cookie_netfs_data; - - if (buflen != sizeof(v9inode->qid.version)) - return FSCACHE_CHECKAUX_OBSOLETE; - - if (memcmp(buffer, &v9inode->qid.version, - sizeof(v9inode->qid.version))) - return FSCACHE_CHECKAUX_OBSOLETE; - - return FSCACHE_CHECKAUX_OKAY; -} - -const struct fscache_cookie_def v9fs_cache_inode_index_def = { - .name = "9p.inode", - .type = FSCACHE_COOKIE_TYPE_DATAFILE, - .check_aux = v9fs_cache_inode_check_aux, -}; - void v9fs_cache_inode_get_cookie(struct inode *inode) { - struct v9fs_inode *v9inode; + struct v9fs_inode *v9inode = V9FS_I(inode); struct v9fs_session_info *v9ses; + __le32 version; + __le64 path; if (!S_ISREG(inode->i_mode)) return; - - v9inode = V9FS_I(inode); - if (v9inode->fscache) + if (WARN_ON(v9fs_inode_cookie(v9inode))) return; + version = cpu_to_le32(v9inode->qid.version); + path = cpu_to_le64(v9inode->qid.path); v9ses = v9fs_inode2v9ses(inode); - v9inode->fscache = fscache_acquire_cookie(v9ses->fscache, - &v9fs_cache_inode_index_def, - &v9inode->qid.path, - sizeof(v9inode->qid.path), - &v9inode->qid.version, - sizeof(v9inode->qid.version), - v9inode, - i_size_read(&v9inode->vfs_inode), - true); + v9inode->netfs.cache = + fscache_acquire_cookie(v9fs_session_cache(v9ses), + 0, + &path, sizeof(path), + &version, sizeof(version), + i_size_read(&v9inode->netfs.inode)); p9_debug(P9_DEBUG_FSC, "inode %p get cookie %p\n", - inode, v9inode->fscache); -} - -void v9fs_cache_inode_put_cookie(struct inode *inode) -{ - struct v9fs_inode *v9inode = V9FS_I(inode); - - if (!v9inode->fscache) - return; - p9_debug(P9_DEBUG_FSC, "inode %p put cookie %p\n", - inode, v9inode->fscache); - - fscache_relinquish_cookie(v9inode->fscache, &v9inode->qid.version, - false); - v9inode->fscache = NULL; -} - -void v9fs_cache_inode_flush_cookie(struct inode *inode) -{ - struct v9fs_inode *v9inode = V9FS_I(inode); - - if (!v9inode->fscache) - return; - p9_debug(P9_DEBUG_FSC, "inode %p flush cookie %p\n", - inode, v9inode->fscache); - - fscache_relinquish_cookie(v9inode->fscache, NULL, true); - v9inode->fscache = NULL; -} - -void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp) -{ - struct v9fs_inode *v9inode = V9FS_I(inode); - - if (!v9inode->fscache) - return; - - mutex_lock(&v9inode->fscache_lock); - - if ((filp->f_flags & O_ACCMODE) != O_RDONLY) - v9fs_cache_inode_flush_cookie(inode); - else - v9fs_cache_inode_get_cookie(inode); - - mutex_unlock(&v9inode->fscache_lock); -} - -void v9fs_cache_inode_reset_cookie(struct inode *inode) -{ - struct v9fs_inode *v9inode = V9FS_I(inode); - struct v9fs_session_info *v9ses; - struct fscache_cookie *old; - - if (!v9inode->fscache) - return; - - old = v9inode->fscache; - - mutex_lock(&v9inode->fscache_lock); - fscache_relinquish_cookie(v9inode->fscache, NULL, true); - - v9ses = v9fs_inode2v9ses(inode); - v9inode->fscache = fscache_acquire_cookie(v9ses->fscache, - &v9fs_cache_inode_index_def, - &v9inode->qid.path, - sizeof(v9inode->qid.path), - &v9inode->qid.version, - sizeof(v9inode->qid.version), - v9inode, - i_size_read(&v9inode->vfs_inode), - true); - p9_debug(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p\n", - inode, old, v9inode->fscache); - - mutex_unlock(&v9inode->fscache_lock); + inode, v9fs_inode_cookie(v9inode)); } diff --git a/fs/9p/cache.h b/fs/9p/cache.h index 7480b4b49fea..1923affcdc62 100644 --- a/fs/9p/cache.h +++ b/fs/9p/cache.h @@ -7,26 +7,15 @@ #ifndef _9P_CACHE_H #define _9P_CACHE_H -#define FSCACHE_USE_NEW_IO_API + #include <linux/fscache.h> #ifdef CONFIG_9P_FSCACHE -extern struct fscache_netfs v9fs_cache_netfs; -extern const struct fscache_cookie_def v9fs_cache_session_index_def; -extern const struct fscache_cookie_def v9fs_cache_inode_index_def; - -extern void v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses); -extern void v9fs_cache_session_put_cookie(struct v9fs_session_info *v9ses); +extern int v9fs_cache_session_get_cookie(struct v9fs_session_info *v9ses, + const char *dev_name); extern void v9fs_cache_inode_get_cookie(struct inode *inode); -extern void v9fs_cache_inode_put_cookie(struct inode *inode); -extern void v9fs_cache_inode_flush_cookie(struct inode *inode); -extern void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp); -extern void v9fs_cache_inode_reset_cookie(struct inode *inode); - -extern int __v9fs_cache_register(void); -extern void __v9fs_cache_unregister(void); #else /* CONFIG_9P_FSCACHE */ @@ -34,13 +23,5 @@ static inline void v9fs_cache_inode_get_cookie(struct inode *inode) { } -static inline void v9fs_cache_inode_put_cookie(struct inode *inode) -{ -} - -static inline void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *file) -{ -} - #endif /* CONFIG_9P_FSCACHE */ #endif /* _9P_CACHE_H */ diff --git a/fs/9p/fid.c b/fs/9p/fid.c index 6aab046c98e2..baf2b152229e 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -96,12 +96,8 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any) dentry, dentry, from_kuid(&init_user_ns, uid), any); ret = NULL; - - if (d_inode(dentry)) - ret = v9fs_fid_find_inode(d_inode(dentry), uid); - /* we'll recheck under lock if there's anything to look in */ - if (!ret && dentry->d_fsdata) { + if (dentry->d_fsdata) { struct hlist_head *h = (struct hlist_head *)&dentry->d_fsdata; spin_lock(&dentry->d_lock); @@ -113,6 +109,9 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any) } } spin_unlock(&dentry->d_lock); + } else { + if (dentry->d_inode) + ret = v9fs_fid_find_inode(dentry->d_inode, uid); } return ret; @@ -153,7 +152,7 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry, const unsigned char **wnames, *uname; int i, n, l, clone, access; struct v9fs_session_info *v9ses; - struct p9_fid *fid, *old_fid = NULL; + struct p9_fid *fid, *old_fid; v9ses = v9fs_dentry2v9ses(dentry); access = v9ses->flags & V9FS_ACCESS_MASK; @@ -195,13 +194,12 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry, if (IS_ERR(fid)) return fid; + refcount_inc(&fid->count); v9fs_fid_add(dentry->d_sb->s_root, fid); } /* If we are root ourself just return that */ - if (dentry->d_sb->s_root == dentry) { - refcount_inc(&fid->count); + if (dentry->d_sb->s_root == dentry) return fid; - } /* * Do a multipath walk with attached root. * When walking parent we need to make sure we @@ -213,6 +211,7 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry, fid = ERR_PTR(n); goto err_out; } + old_fid = fid; clone = 1; i = 0; while (i < n) { @@ -222,19 +221,15 @@ static struct p9_fid *v9fs_fid_lookup_with_uid(struct dentry *dentry, * walk to ensure none of the patch component change */ fid = p9_client_walk(fid, l, &wnames[i], clone); + /* non-cloning walk will return the same fid */ + if (fid != old_fid) { + p9_client_clunk(old_fid); + old_fid = fid; + } if (IS_ERR(fid)) { - if (old_fid) { - /* - * If we fail, clunk fid which are mapping - * to path component and not the last component - * of the path. - */ - p9_client_clunk(old_fid); - } kfree(wnames); goto err_out; } - old_fid = fid; i += l; clone = 0; } diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index e32dd5f7721b..0129de2ea31a 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -469,7 +469,11 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, #ifdef CONFIG_9P_FSCACHE /* register the session for caching */ - v9fs_cache_session_get_cookie(v9ses); + if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) { + rc = v9fs_cache_session_get_cookie(v9ses, dev_name); + if (rc < 0) + goto err_clnt; + } #endif spin_lock(&v9fs_sessionlist_lock); list_add(&v9ses->slist, &v9fs_sessionlist); @@ -502,8 +506,7 @@ void v9fs_session_close(struct v9fs_session_info *v9ses) } #ifdef CONFIG_9P_FSCACHE - if (v9ses->fscache) - v9fs_cache_session_put_cookie(v9ses); + fscache_relinquish_volume(v9fs_session_cache(v9ses), NULL, false); kfree(v9ses->cachetag); #endif kfree(v9ses->uname); @@ -620,11 +623,9 @@ static void v9fs_sysfs_cleanup(void) static void v9fs_inode_init_once(void *foo) { struct v9fs_inode *v9inode = (struct v9fs_inode *)foo; -#ifdef CONFIG_9P_FSCACHE - v9inode->fscache = NULL; -#endif + memset(&v9inode->qid, 0, sizeof(v9inode->qid)); - inode_init_once(&v9inode->vfs_inode); + inode_init_once(&v9inode->netfs.inode); } /** @@ -665,20 +666,12 @@ static int v9fs_cache_register(void) ret = v9fs_init_inode_cache(); if (ret < 0) return ret; -#ifdef CONFIG_9P_FSCACHE - ret = fscache_register_netfs(&v9fs_cache_netfs); - if (ret < 0) - v9fs_destroy_inode_cache(); -#endif return ret; } static void v9fs_cache_unregister(void) { v9fs_destroy_inode_cache(); -#ifdef CONFIG_9P_FSCACHE - fscache_unregister_netfs(&v9fs_cache_netfs); -#endif } /** diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 1647a8e63671..6acabc2e7dc9 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -9,6 +9,7 @@ #define FS_9P_V9FS_H #include <linux/backing-dev.h> +#include <linux/netfs.h> /** * enum p9_session_flags - option flags for each 9P session @@ -89,7 +90,7 @@ struct v9fs_session_info { unsigned int cache; #ifdef CONFIG_9P_FSCACHE char *cachetag; - struct fscache_cookie *fscache; + struct fscache_volume *fscache; #endif char *uname; /* user name to mount as */ @@ -108,31 +109,37 @@ struct v9fs_session_info { #define V9FS_INO_INVALID_ATTR 0x01 struct v9fs_inode { -#ifdef CONFIG_9P_FSCACHE - struct mutex fscache_lock; - struct fscache_cookie *fscache; -#endif + struct netfs_inode netfs; /* Netfslib context and vfs inode */ struct p9_qid qid; unsigned int cache_validity; struct p9_fid *writeback_fid; struct mutex v_mutex; - struct inode vfs_inode; }; static inline struct v9fs_inode *V9FS_I(const struct inode *inode) { - return container_of(inode, struct v9fs_inode, vfs_inode); + return container_of(inode, struct v9fs_inode, netfs.inode); } static inline struct fscache_cookie *v9fs_inode_cookie(struct v9fs_inode *v9inode) { #ifdef CONFIG_9P_FSCACHE - return v9inode->fscache; + return netfs_i_cookie(&v9inode->netfs); +#else + return NULL; +#endif +} + +static inline struct fscache_volume *v9fs_session_cache(struct v9fs_session_info *v9ses) +{ +#ifdef CONFIG_9P_FSCACHE + return v9ses->fscache; #else return NULL; #endif } + extern int v9fs_show_options(struct seq_file *m, struct dentry *root); struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, @@ -154,6 +161,7 @@ extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses, extern const struct inode_operations v9fs_dir_inode_operations_dotl; extern const struct inode_operations v9fs_file_inode_operations_dotl; extern const struct inode_operations v9fs_symlink_inode_operations_dotl; +extern const struct netfs_request_ops v9fs_req_ops; extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, struct super_block *sb, int new); diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index fac918ccb305..d0833fa69faf 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -16,6 +16,7 @@ #include <linux/pagemap.h> #include <linux/idr.h> #include <linux/sched.h> +#include <linux/swap.h> #include <linux/uio.h> #include <linux/netfs.h> #include <net/9p/9p.h> @@ -27,12 +28,12 @@ #include "fid.h" /** - * v9fs_req_issue_op - Issue a read from 9P + * v9fs_issue_read - Issue a read from 9P * @subreq: The read to make */ -static void v9fs_req_issue_op(struct netfs_read_subrequest *subreq) +static void v9fs_issue_read(struct netfs_io_subrequest *subreq) { - struct netfs_read_request *rreq = subreq->rreq; + struct netfs_io_request *rreq = subreq->rreq; struct p9_fid *fid = rreq->netfs_priv; struct iov_iter to; loff_t pos = subreq->start + subreq->transferred; @@ -42,129 +43,124 @@ static void v9fs_req_issue_op(struct netfs_read_subrequest *subreq) iov_iter_xarray(&to, READ, &rreq->mapping->i_pages, pos, len); total = p9_client_read(fid, pos, &to, &err); + + /* if we just extended the file size, any portion not in + * cache won't be on server and is zeroes */ + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + netfs_subreq_terminated(subreq, err ?: total, false); } /** - * v9fs_init_rreq - Initialise a read request + * v9fs_init_request - Initialise a read request * @rreq: The read request * @file: The file being read from */ -static void v9fs_init_rreq(struct netfs_read_request *rreq, struct file *file) +static int v9fs_init_request(struct netfs_io_request *rreq, struct file *file) { + struct inode *inode = file_inode(file); + struct v9fs_inode *v9inode = V9FS_I(inode); struct p9_fid *fid = file->private_data; + BUG_ON(!fid); + + /* we might need to read from a fid that was opened write-only + * for read-modify-write of page cache, use the writeback fid + * for that */ + if (rreq->origin == NETFS_READ_FOR_WRITE && + (fid->mode & O_ACCMODE) == O_WRONLY) { + fid = v9inode->writeback_fid; + BUG_ON(!fid); + } + refcount_inc(&fid->count); rreq->netfs_priv = fid; + return 0; } /** - * v9fs_req_cleanup - Cleanup request initialized by v9fs_init_rreq - * @mapping: unused mapping of request to cleanup - * @priv: private data to cleanup, a fid, guaranted non-null. + * v9fs_free_request - Cleanup request initialized by v9fs_init_rreq + * @rreq: The I/O request to clean up */ -static void v9fs_req_cleanup(struct address_space *mapping, void *priv) +static void v9fs_free_request(struct netfs_io_request *rreq) { - struct p9_fid *fid = priv; + struct p9_fid *fid = rreq->netfs_priv; p9_client_clunk(fid); } /** - * v9fs_is_cache_enabled - Determine if caching is enabled for an inode - * @inode: The inode to check - */ -static bool v9fs_is_cache_enabled(struct inode *inode) -{ - struct fscache_cookie *cookie = v9fs_inode_cookie(V9FS_I(inode)); - - return fscache_cookie_enabled(cookie) && !hlist_empty(&cookie->backing_objects); -} - -/** * v9fs_begin_cache_operation - Begin a cache operation for a read * @rreq: The read request */ -static int v9fs_begin_cache_operation(struct netfs_read_request *rreq) +static int v9fs_begin_cache_operation(struct netfs_io_request *rreq) { +#ifdef CONFIG_9P_FSCACHE struct fscache_cookie *cookie = v9fs_inode_cookie(V9FS_I(rreq->inode)); - return fscache_begin_read_operation(rreq, cookie); + return fscache_begin_read_operation(&rreq->cache_resources, cookie); +#else + return -ENOBUFS; +#endif } -static const struct netfs_read_request_ops v9fs_req_ops = { - .init_rreq = v9fs_init_rreq, - .is_cache_enabled = v9fs_is_cache_enabled, +const struct netfs_request_ops v9fs_req_ops = { + .init_request = v9fs_init_request, + .free_request = v9fs_free_request, .begin_cache_operation = v9fs_begin_cache_operation, - .issue_op = v9fs_req_issue_op, - .cleanup = v9fs_req_cleanup, + .issue_read = v9fs_issue_read, }; /** - * v9fs_vfs_readpage - read an entire page in from 9P - * @file: file being read - * @page: structure to page - * - */ -static int v9fs_vfs_readpage(struct file *file, struct page *page) -{ - struct folio *folio = page_folio(page); - - return netfs_readpage(file, folio, &v9fs_req_ops, NULL); -} - -/** - * v9fs_vfs_readahead - read a set of pages from 9P - * @ractl: The readahead parameters - */ -static void v9fs_vfs_readahead(struct readahead_control *ractl) -{ - netfs_readahead(ractl, &v9fs_req_ops, NULL); -} - -/** - * v9fs_release_page - release the private state associated with a page - * @page: The page to be released + * v9fs_release_folio - release the private state associated with a folio + * @folio: The folio to be released * @gfp: The caller's allocation restrictions * - * Returns 1 if the page can be released, false otherwise. + * Returns true if the page can be released, false otherwise. */ -static int v9fs_release_page(struct page *page, gfp_t gfp) +static bool v9fs_release_folio(struct folio *folio, gfp_t gfp) { - struct folio *folio = page_folio(page); + struct inode *inode = folio_inode(folio); if (folio_test_private(folio)) - return 0; + return false; #ifdef CONFIG_9P_FSCACHE if (folio_test_fscache(folio)) { - if (!(gfp & __GFP_DIRECT_RECLAIM) || !(gfp & __GFP_FS)) - return 0; + if (current_is_kswapd() || !(gfp & __GFP_FS)) + return false; folio_wait_fscache(folio); } #endif - return 1; + fscache_note_page_release(v9fs_inode_cookie(V9FS_I(inode))); + return true; } -/** - * v9fs_invalidate_page - Invalidate a page completely or partially - * @page: The page to be invalidated - * @offset: offset of the invalidated region - * @length: length of the invalidated region - */ - -static void v9fs_invalidate_page(struct page *page, unsigned int offset, - unsigned int length) +static void v9fs_invalidate_folio(struct folio *folio, size_t offset, + size_t length) { - struct folio *folio = page_folio(page); - folio_wait_fscache(folio); } +static void v9fs_write_to_cache_done(void *priv, ssize_t transferred_or_error, + bool was_async) +{ + struct v9fs_inode *v9inode = priv; + __le32 version; + + if (IS_ERR_VALUE(transferred_or_error) && + transferred_or_error != -ENOBUFS) { + version = cpu_to_le32(v9inode->qid.version); + fscache_invalidate(v9fs_inode_cookie(v9inode), &version, + i_size_read(&v9inode->netfs.inode), 0); + } +} + static int v9fs_vfs_write_folio_locked(struct folio *folio) { struct inode *inode = folio_inode(folio); struct v9fs_inode *v9inode = V9FS_I(inode); + struct fscache_cookie *cookie = v9fs_inode_cookie(v9inode); loff_t start = folio_pos(folio); loff_t i_size = i_size_read(inode); struct iov_iter from; @@ -181,10 +177,21 @@ static int v9fs_vfs_write_folio_locked(struct folio *folio) /* We should have writeback_fid always set */ BUG_ON(!v9inode->writeback_fid); + folio_wait_fscache(folio); folio_start_writeback(folio); p9_client_write(v9inode->writeback_fid, start, &from, &err); + if (err == 0 && + fscache_cookie_enabled(cookie) && + test_bit(FSCACHE_COOKIE_IS_CACHING, &cookie->flags)) { + folio_start_fscache(folio); + fscache_write_to_cache(v9fs_inode_cookie(v9inode), + folio_mapping(folio), start, len, i_size, + v9fs_write_to_cache_done, v9inode, + true); + } + folio_end_writeback(folio); return err; } @@ -211,16 +218,8 @@ static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc) return retval; } -/** - * v9fs_launder_page - Writeback a dirty page - * @page: The page to be cleaned up - * - * Returns 0 on success. - */ - -static int v9fs_launder_page(struct page *page) +static int v9fs_launder_folio(struct folio *folio) { - struct folio *folio = page_folio(page); int retval; if (folio_clear_dirty_for_io(folio)) { @@ -272,7 +271,7 @@ v9fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) } static int v9fs_write_begin(struct file *filp, struct address_space *mapping, - loff_t pos, unsigned int len, unsigned int flags, + loff_t pos, unsigned int len, struct page **subpagep, void **fsdata) { int retval; @@ -287,8 +286,7 @@ static int v9fs_write_begin(struct file *filp, struct address_space *mapping, * file. We need to do this before we get a lock on the page in case * there's more than one writer competing for the same cache block. */ - retval = netfs_write_begin(filp, mapping, pos, len, flags, &folio, fsdata, - &v9fs_req_ops, NULL); + retval = netfs_write_begin(&v9inode->netfs, filp, mapping, pos, len, &folio, fsdata); if (retval < 0) return retval; @@ -303,6 +301,7 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping, loff_t last_pos = pos + copied; struct folio *folio = page_folio(subpage); struct inode *inode = mapping->host; + struct v9fs_inode *v9inode = V9FS_I(inode); p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping); @@ -322,6 +321,7 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping, if (last_pos > inode->i_size) { inode_add_bytes(inode, last_pos - inode->i_size); i_size_write(inode, last_pos); + fscache_update_cookie(v9fs_inode_cookie(v9inode), NULL, &last_pos); } folio_mark_dirty(folio); out: @@ -331,16 +331,30 @@ out: return copied; } +#ifdef CONFIG_9P_FSCACHE +/* + * Mark a page as having been made dirty and thus needing writeback. We also + * need to pin the cache object to write back to. + */ +static bool v9fs_dirty_folio(struct address_space *mapping, struct folio *folio) +{ + struct v9fs_inode *v9inode = V9FS_I(mapping->host); + + return fscache_dirty_folio(mapping, folio, v9fs_inode_cookie(v9inode)); +} +#else +#define v9fs_dirty_folio filemap_dirty_folio +#endif const struct address_space_operations v9fs_addr_operations = { - .readpage = v9fs_vfs_readpage, - .readahead = v9fs_vfs_readahead, - .set_page_dirty = __set_page_dirty_nobuffers, + .read_folio = netfs_read_folio, + .readahead = netfs_readahead, + .dirty_folio = v9fs_dirty_folio, .writepage = v9fs_vfs_writepage, .write_begin = v9fs_write_begin, .write_end = v9fs_write_end, - .releasepage = v9fs_release_page, - .invalidatepage = v9fs_invalidate_page, - .launder_page = v9fs_launder_page, + .release_folio = v9fs_release_folio, + .invalidate_folio = v9fs_invalidate_folio, + .launder_folio = v9fs_launder_folio, .direct_IO = v9fs_direct_IO, }; diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 8c854d8cb0cd..958680f7f23e 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -17,6 +17,7 @@ #include <linux/idr.h> #include <linux/slab.h> #include <linux/uio.h> +#include <linux/fscache.h> #include <net/9p/9p.h> #include <net/9p/client.h> @@ -205,7 +206,10 @@ static int v9fs_dir_readdir_dotl(struct file *file, struct dir_context *ctx) int v9fs_dir_release(struct inode *inode, struct file *filp) { + struct v9fs_inode *v9inode = V9FS_I(inode); struct p9_fid *fid; + __le32 version; + loff_t i_size; fid = filp->private_data; p9_debug(P9_DEBUG_VFS, "inode: %p filp: %p fid: %d\n", @@ -216,6 +220,15 @@ int v9fs_dir_release(struct inode *inode, struct file *filp) spin_unlock(&inode->i_lock); p9_client_clunk(fid); } + + if ((filp->f_mode & FMODE_WRITE)) { + version = cpu_to_le32(v9inode->qid.version); + i_size = i_size_read(inode); + fscache_unuse_cookie(v9fs_inode_cookie(v9inode), + &version, &i_size); + } else { + fscache_unuse_cookie(v9fs_inode_cookie(v9inode), NULL, NULL); + } return 0; } diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 612e297f3763..2573c08f335c 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -93,7 +93,8 @@ int v9fs_file_open(struct inode *inode, struct file *file) } mutex_unlock(&v9inode->v_mutex); if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) - v9fs_cache_inode_set_cookie(inode, file); + fscache_use_cookie(v9fs_inode_cookie(v9inode), + file->f_mode & FMODE_WRITE); v9fs_open_fid_add(inode, fid); return 0; out_error: @@ -114,7 +115,6 @@ out_error: static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl) { - int res = 0; struct inode *inode = file_inode(filp); p9_debug(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl); @@ -124,7 +124,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl) invalidate_mapping_pages(&inode->i_data, 0, -1); } - return res; + return 0; } static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) @@ -139,8 +139,7 @@ static int v9fs_file_do_lock(struct file *filp, int cmd, struct file_lock *fl) fid = filp->private_data; BUG_ON(fid == NULL); - if ((fl->fl_flags & FL_POSIX) != FL_POSIX) - BUG(); + BUG_ON((fl->fl_flags & FL_POSIX) != FL_POSIX); res = locks_lock_file_wait(filp, fl); if (res < 0) diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 328c338ff304..3d8297714772 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -228,17 +228,13 @@ struct inode *v9fs_alloc_inode(struct super_block *sb) { struct v9fs_inode *v9inode; - v9inode = kmem_cache_alloc(v9fs_inode_cache, GFP_KERNEL); + v9inode = alloc_inode_sb(sb, v9fs_inode_cache, GFP_KERNEL); if (!v9inode) return NULL; -#ifdef CONFIG_9P_FSCACHE - v9inode->fscache = NULL; - mutex_init(&v9inode->fscache_lock); -#endif v9inode->writeback_fid = NULL; v9inode->cache_validity = 0; mutex_init(&v9inode->v_mutex); - return &v9inode->vfs_inode; + return &v9inode->netfs.inode; } /** @@ -251,6 +247,15 @@ void v9fs_free_inode(struct inode *inode) kmem_cache_free(v9fs_inode_cache, V9FS_I(inode)); } +/* + * Set parameters for the netfs library + */ +static void v9fs_set_netfs_context(struct inode *inode) +{ + struct v9fs_inode *v9inode = V9FS_I(inode); + netfs_inode_init(&v9inode->netfs, &v9fs_req_ops); +} + int v9fs_init_inode(struct v9fs_session_info *v9ses, struct inode *inode, umode_t mode, dev_t rdev) { @@ -339,6 +344,8 @@ int v9fs_init_inode(struct v9fs_session_info *v9ses, err = -EINVAL; goto error; } + + v9fs_set_netfs_context(inode); error: return err; @@ -381,12 +388,16 @@ struct inode *v9fs_get_inode(struct super_block *sb, umode_t mode, dev_t rdev) void v9fs_evict_inode(struct inode *inode) { struct v9fs_inode *v9inode = V9FS_I(inode); + __le32 version; truncate_inode_pages_final(&inode->i_data); + version = cpu_to_le32(v9inode->qid.version); + fscache_clear_inode_writeback(v9fs_inode_cookie(v9inode), inode, + &version); clear_inode(inode); filemap_fdatawrite(&inode->i_data); - v9fs_cache_inode_put_cookie(inode); + fscache_relinquish_cookie(v9fs_inode_cookie(v9inode), false); /* clunk the fid stashed in writeback_fid */ if (v9inode->writeback_fid) { p9_client_clunk(v9inode->writeback_fid); @@ -869,7 +880,8 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, file->private_data = fid; if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) - v9fs_cache_inode_set_cookie(d_inode(dentry), file); + fscache_use_cookie(v9fs_inode_cookie(v9inode), + file->f_mode & FMODE_WRITE); v9fs_open_fid_add(inode, fid); file->f_mode |= FMODE_CREATED; @@ -1072,6 +1084,8 @@ static int v9fs_vfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *iattr) { int retval, use_dentry = 0; + struct inode *inode = d_inode(dentry); + struct v9fs_inode *v9inode = V9FS_I(inode); struct v9fs_session_info *v9ses; struct p9_fid *fid = NULL; struct p9_wstat wstat; @@ -1117,7 +1131,7 @@ static int v9fs_vfs_setattr(struct user_namespace *mnt_userns, /* Write all dirty data */ if (d_is_reg(dentry)) - filemap_write_and_wait(d_inode(dentry)->i_mapping); + filemap_write_and_wait(inode->i_mapping); retval = p9_client_wstat(fid, &wstat); @@ -1128,13 +1142,15 @@ static int v9fs_vfs_setattr(struct user_namespace *mnt_userns, return retval; if ((iattr->ia_valid & ATTR_SIZE) && - iattr->ia_size != i_size_read(d_inode(dentry))) - truncate_setsize(d_inode(dentry), iattr->ia_size); + iattr->ia_size != i_size_read(inode)) { + truncate_setsize(inode, iattr->ia_size); + fscache_resize_cookie(v9fs_inode_cookie(v9inode), iattr->ia_size); + } - v9fs_invalidate_inode_attr(d_inode(dentry)); + v9fs_invalidate_inode_attr(inode); - setattr_copy(&init_user_ns, d_inode(dentry), iattr); - mark_inode_dirty(d_inode(dentry)); + setattr_copy(&init_user_ns, inode, iattr); + mark_inode_dirty(inode); return 0; } @@ -1235,15 +1251,15 @@ static const char *v9fs_vfs_get_link(struct dentry *dentry, return ERR_PTR(-ECHILD); v9ses = v9fs_dentry2v9ses(dentry); - fid = v9fs_fid_lookup(dentry); + if (!v9fs_proto_dotu(v9ses)) + return ERR_PTR(-EBADF); + p9_debug(P9_DEBUG_VFS, "%pd\n", dentry); + fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) return ERR_CAST(fid); - if (!v9fs_proto_dotu(v9ses)) - return ERR_PTR(-EBADF); - st = p9_client_stat(fid); p9_client_clunk(fid); if (IS_ERR(st)) diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 7dee89ba32e7..b6eb1160296c 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -274,6 +274,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, if (IS_ERR(ofid)) { err = PTR_ERR(ofid); p9_debug(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err); + p9_client_clunk(dfid); goto out; } @@ -285,6 +286,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, if (err) { p9_debug(P9_DEBUG_VFS, "Failed to get acl values in creat %d\n", err); + p9_client_clunk(dfid); goto error; } err = p9_client_create_dotl(ofid, name, v9fs_open_to_dotl_flags(flags), @@ -292,6 +294,7 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, if (err < 0) { p9_debug(P9_DEBUG_VFS, "p9_client_open_dotl failed in creat %d\n", err); + p9_client_clunk(dfid); goto error; } v9fs_invalidate_inode_attr(dir); @@ -344,7 +347,8 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, goto err_clunk_old_fid; file->private_data = ofid; if (v9ses->cache == CACHE_LOOSE || v9ses->cache == CACHE_FSCACHE) - v9fs_cache_inode_set_cookie(inode, file); + fscache_use_cookie(v9fs_inode_cookie(v9inode), + file->f_mode & FMODE_WRITE); v9fs_open_fid_add(inode, ofid); file->f_mode |= FMODE_CREATED; out: @@ -551,7 +555,10 @@ int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns, { int retval, use_dentry = 0; struct p9_fid *fid = NULL; - struct p9_iattr_dotl p9attr; + struct p9_iattr_dotl p9attr = { + .uid = INVALID_UID, + .gid = INVALID_GID, + }; struct inode *inode = d_inode(dentry); p9_debug(P9_DEBUG_VFS, "\n"); @@ -561,14 +568,22 @@ int v9fs_vfs_setattr_dotl(struct user_namespace *mnt_userns, return retval; p9attr.valid = v9fs_mapped_iattr_valid(iattr->ia_valid); - p9attr.mode = iattr->ia_mode; - p9attr.uid = iattr->ia_uid; - p9attr.gid = iattr->ia_gid; - p9attr.size = iattr->ia_size; - p9attr.atime_sec = iattr->ia_atime.tv_sec; - p9attr.atime_nsec = iattr->ia_atime.tv_nsec; - p9attr.mtime_sec = iattr->ia_mtime.tv_sec; - p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec; + if (iattr->ia_valid & ATTR_MODE) + p9attr.mode = iattr->ia_mode; + if (iattr->ia_valid & ATTR_UID) + p9attr.uid = iattr->ia_uid; + if (iattr->ia_valid & ATTR_GID) + p9attr.gid = iattr->ia_gid; + if (iattr->ia_valid & ATTR_SIZE) + p9attr.size = iattr->ia_size; + if (iattr->ia_valid & ATTR_ATIME_SET) { + p9attr.atime_sec = iattr->ia_atime.tv_sec; + p9attr.atime_nsec = iattr->ia_atime.tv_nsec; + } + if (iattr->ia_valid & ATTR_MTIME_SET) { + p9attr.mtime_sec = iattr->ia_mtime.tv_sec; + p9attr.mtime_nsec = iattr->ia_mtime.tv_nsec; + } if (iattr->ia_valid & ATTR_FILE) { fid = iattr->ia_file->private_data; diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index b739e02f5ef7..97e23b4e6982 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -20,6 +20,7 @@ #include <linux/slab.h> #include <linux/statfs.h> #include <linux/magic.h> +#include <linux/fscache.h> #include <net/9p/9p.h> #include <net/9p/client.h> @@ -309,6 +310,7 @@ static int v9fs_write_inode(struct inode *inode, __mark_inode_dirty(inode, I_DIRTY_DATASYNC); return ret; } + fscache_unpin_writeback(wbc, v9fs_inode_cookie(v9inode)); return 0; } @@ -332,6 +334,7 @@ static int v9fs_write_inode_dotl(struct inode *inode, __mark_inode_dirty(inode, I_DIRTY_DATASYNC); return ret; } + fscache_unpin_writeback(wbc, v9fs_inode_cookie(v9inode)); return 0; } diff --git a/fs/Kconfig b/fs/Kconfig index a6313a969bc5..5976eb33535f 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -15,11 +15,11 @@ config VALIDATE_FS_PARSER Enable this to perform validation of the parameter description for a filesystem when it is registered. -if BLOCK - config FS_IOMAP bool +if BLOCK + source "fs/ext2/Kconfig" source "fs/ext4/Kconfig" source "fs/jbd2/Kconfig" @@ -42,11 +42,13 @@ source "fs/nilfs2/Kconfig" source "fs/f2fs/Kconfig" source "fs/zonefs/Kconfig" +endif # BLOCK + config FS_DAX bool "File system based Direct Access (DAX) support" depends on MMU depends on !(ARM || MIPS || SPARC) - select DEV_PAGEMAP_OPS if (ZONE_DEVICE && !FS_DAX_LIMITED) + depends on ZONE_DEVICE || FS_DAX_LIMITED select FS_IOMAP select DAX help @@ -89,8 +91,6 @@ config FS_DAX_PMD config FS_DAX_LIMITED bool -endif # BLOCK - # Posix ACL utility routines # # Note: Posix ACLs can be implemented without these helpers. Never use @@ -245,19 +245,27 @@ config HUGETLBFS config HUGETLB_PAGE def_bool HUGETLBFS -config HUGETLB_PAGE_FREE_VMEMMAP +# +# Select this config option from the architecture Kconfig, if it is preferred +# to enable the feature of minimizing overhead of struct page associated with +# each HugeTLB page. +# +config ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP + bool + +config HUGETLB_PAGE_OPTIMIZE_VMEMMAP def_bool HUGETLB_PAGE - depends on X86_64 + depends on ARCH_WANT_HUGETLB_PAGE_OPTIMIZE_VMEMMAP depends on SPARSEMEM_VMEMMAP -config HUGETLB_PAGE_FREE_VMEMMAP_DEFAULT_ON - bool "Default freeing vmemmap pages of HugeTLB to on" +config HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON + bool "Default optimizing vmemmap pages of HugeTLB to on" default n - depends on HUGETLB_PAGE_FREE_VMEMMAP + depends on HUGETLB_PAGE_OPTIMIZE_VMEMMAP help - When using HUGETLB_PAGE_FREE_VMEMMAP, the freeing unused vmemmap + When using HUGETLB_PAGE_OPTIMIZE_VMEMMAP, the optimizing unused vmemmap pages associated with each HugeTLB page is default off. Say Y here - to enable freeing vmemmap pages of HugeTLB by default. It can then + to enable optimizing vmemmap pages of HugeTLB by default. It can then be disabled on the command line via hugetlb_free_vmemmap=off. config MEMFD_CREATE @@ -344,7 +352,7 @@ config LOCKD config LOCKD_V4 bool - depends on NFSD_V3 || NFS_V3 + depends on NFSD || NFS_V3 depends on FILE_LOCKING default y @@ -369,8 +377,8 @@ source "fs/ksmbd/Kconfig" config SMBFS_COMMON tristate - default y if CIFS=y - default m if CIFS=m + default y if CIFS=y || SMB_SERVER=y + default m if CIFS=m || SMB_SERVER=m source "fs/coda/Kconfig" source "fs/afs/Kconfig" diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 4d5ae61580aa..21e154516bf2 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -28,6 +28,16 @@ config BINFMT_ELF ld.so (check the file <file:Documentation/Changes> for location and latest version). +config BINFMT_ELF_KUNIT_TEST + bool "Build KUnit tests for ELF binary support" if !KUNIT_ALL_TESTS + depends on KUNIT=y && BINFMT_ELF=y + default KUNIT_ALL_TESTS + help + This builds the ELF loader KUnit tests, which try to gather + prior bug fixes into a regression test collection. This is really + only needed for debugging. Note that with CONFIG_COMPAT=y, the + compat_binfmt_elf KUnit test is also created. + config COMPAT_BINFMT_ELF def_bool y depends on COMPAT && BINFMT_ELF @@ -36,6 +46,9 @@ config COMPAT_BINFMT_ELF config ARCH_BINFMT_ELF_STATE bool +config ARCH_BINFMT_ELF_EXTRA_PHDRS + bool + config ARCH_HAVE_ELF_PROT bool @@ -45,7 +58,7 @@ config ARCH_USE_GNU_PROPERTY config BINFMT_ELF_FDPIC bool "Kernel support for FDPIC ELF binaries" default y if !BINFMT_ELF - depends on (ARM || (SUPERH && !MMU)) + depends on ARM || ((M68K || SUPERH) && !MMU) select ELFCORE help ELF FDPIC binaries are based on ELF, but allow the individual load @@ -129,12 +142,6 @@ config BINFMT_ZFLAT help Support FLAT format compressed binaries -config BINFMT_SHARED_FLAT - bool "Enable shared FLAT support" - depends on BINFMT_FLAT - help - Support FLAT shared libraries - config HAVE_AOUT def_bool n diff --git a/fs/Makefile b/fs/Makefile index 84c5e4cdfee5..208a74e0b00e 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -6,6 +6,8 @@ # Rewritten to use lists instead of if-statements. # +obj-$(CONFIG_SYSCTL) += sysctls.o + obj-y := open.o read_write.o file_table.o super.o \ char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \ ioctl.o readdir.o select.o dcache.o inode.o \ @@ -94,7 +96,7 @@ obj-$(CONFIG_EXPORTFS) += exportfs/ obj-$(CONFIG_NFSD) += nfsd/ obj-$(CONFIG_LOCKD) += lockd/ obj-$(CONFIG_NLS) += nls/ -obj-$(CONFIG_UNICODE) += unicode/ +obj-y += unicode/ obj-$(CONFIG_SYSV_FS) += sysv/ obj-$(CONFIG_SMBFS_COMMON) += smbfs_common/ obj-$(CONFIG_CIFS) += cifs/ diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index adbb3a1edcbf..ee22278b0cfc 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -38,9 +38,9 @@ static int adfs_writepage(struct page *page, struct writeback_control *wbc) return block_write_full_page(page, adfs_get_block, wbc); } -static int adfs_readpage(struct file *file, struct page *page) +static int adfs_read_folio(struct file *file, struct folio *folio) { - return block_read_full_page(page, adfs_get_block); + return block_read_full_folio(folio, adfs_get_block); } static void adfs_write_failed(struct address_space *mapping, loff_t to) @@ -52,13 +52,13 @@ static void adfs_write_failed(struct address_space *mapping, loff_t to) } static int adfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { int ret; *pagep = NULL; - ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata, adfs_get_block, &ADFS_I(mapping->host)->mmu_private); if (unlikely(ret)) @@ -73,8 +73,9 @@ static sector_t _adfs_bmap(struct address_space *mapping, sector_t block) } static const struct address_space_operations adfs_aops = { - .set_page_dirty = __set_page_dirty_buffers, - .readpage = adfs_readpage, + .dirty_folio = block_dirty_folio, + .invalidate_folio = block_invalidate_folio, + .read_folio = adfs_read_folio, .writepage = adfs_writepage, .write_begin = adfs_write_begin, .write_end = generic_write_end, @@ -355,7 +356,6 @@ int adfs_write_inode(struct inode *inode, struct writeback_control *wbc) { struct super_block *sb = inode->i_sb; struct object_info obj; - int ret; obj.indaddr = ADFS_I(inode)->indaddr; obj.name_len = 0; @@ -365,6 +365,5 @@ int adfs_write_inode(struct inode *inode, struct writeback_control *wbc) obj.attr = ADFS_I(inode)->attr; obj.size = inode->i_size; - ret = adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL); - return ret; + return adfs_dir_update(sb, &obj, wbc->sync_mode == WB_SYNC_ALL); } diff --git a/fs/adfs/super.c b/fs/adfs/super.c index bdbd26e571ed..e8bfc38239cd 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -220,7 +220,7 @@ static struct kmem_cache *adfs_inode_cachep; static struct inode *adfs_alloc_inode(struct super_block *sb) { struct adfs_inode_info *ei; - ei = kmem_cache_alloc(adfs_inode_cachep, GFP_KERNEL); + ei = alloc_inode_sb(sb, adfs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; diff --git a/fs/affs/file.c b/fs/affs/file.c index 75ebd2b576ca..cd00a4c68a12 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -375,9 +375,9 @@ static int affs_writepage(struct page *page, struct writeback_control *wbc) return block_write_full_page(page, affs_get_block, wbc); } -static int affs_readpage(struct file *file, struct page *page) +static int affs_read_folio(struct file *file, struct folio *folio) { - return block_read_full_page(page, affs_get_block); + return block_read_full_folio(folio, affs_get_block); } static void affs_write_failed(struct address_space *mapping, loff_t to) @@ -414,13 +414,13 @@ affs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) } static int affs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { int ret; *pagep = NULL; - ret = cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, + ret = cont_write_begin(file, mapping, pos, len, pagep, fsdata, affs_get_block, &AFFS_I(mapping->host)->mmu_private); if (unlikely(ret)) @@ -453,8 +453,9 @@ static sector_t _affs_bmap(struct address_space *mapping, sector_t block) } const struct address_space_operations affs_aops = { - .set_page_dirty = __set_page_dirty_buffers, - .readpage = affs_readpage, + .dirty_folio = block_dirty_folio, + .invalidate_folio = block_invalidate_folio, + .read_folio = affs_read_folio, .writepage = affs_writepage, .write_begin = affs_write_begin, .write_end = affs_write_end, @@ -628,8 +629,9 @@ out: } static int -affs_readpage_ofs(struct file *file, struct page *page) +affs_read_folio_ofs(struct file *file, struct folio *folio) { + struct page *page = &folio->page; struct inode *inode = page->mapping->host; u32 to; int err; @@ -649,7 +651,7 @@ affs_readpage_ofs(struct file *file, struct page *page) } static int affs_write_begin_ofs(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { struct inode *inode = mapping->host; @@ -669,7 +671,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping } index = pos >> PAGE_SHIFT; - page = grab_cache_page_write_begin(mapping, index, flags); + page = grab_cache_page_write_begin(mapping, index); if (!page) return -ENOMEM; *pagep = page; @@ -834,8 +836,9 @@ err_bh: } const struct address_space_operations affs_aops_ofs = { - .set_page_dirty = __set_page_dirty_buffers, - .readpage = affs_readpage_ofs, + .dirty_folio = block_dirty_folio, + .invalidate_folio = block_invalidate_folio, + .read_folio = affs_read_folio_ofs, //.writepage = affs_writepage_ofs, .write_begin = affs_write_begin_ofs, .write_end = affs_write_end_ofs @@ -885,7 +888,7 @@ affs_truncate(struct inode *inode) loff_t isize = inode->i_size; int res; - res = mapping->a_ops->write_begin(NULL, mapping, isize, 0, 0, &page, &fsdata); + res = mapping->a_ops->write_begin(NULL, mapping, isize, 0, &page, &fsdata); if (!res) res = mapping->a_ops->write_end(NULL, mapping, isize, 0, 0, page, fsdata); else diff --git a/fs/affs/super.c b/fs/affs/super.c index c609005a9eaa..4c5f30a83336 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -100,7 +100,7 @@ static struct inode *affs_alloc_inode(struct super_block *sb) { struct affs_inode_info *i; - i = kmem_cache_alloc(affs_inode_cachep, GFP_KERNEL); + i = alloc_inode_sb(sb, affs_inode_cachep, GFP_KERNEL); if (!i) return NULL; diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c index a7531b26e8f0..31d6446dc166 100644 --- a/fs/affs/symlink.c +++ b/fs/affs/symlink.c @@ -11,8 +11,9 @@ #include "affs.h" -static int affs_symlink_readpage(struct file *file, struct page *page) +static int affs_symlink_read_folio(struct file *file, struct folio *folio) { + struct page *page = &folio->page; struct buffer_head *bh; struct inode *inode = page->mapping->host; char *link = page_address(page); @@ -67,7 +68,7 @@ fail: } const struct address_space_operations affs_symlink_aops = { - .readpage = affs_symlink_readpage, + .read_folio = affs_symlink_read_folio, }; const struct inode_operations affs_symlink_inode_operations = { diff --git a/fs/afs/Makefile b/fs/afs/Makefile index 75c4e4043d1d..e8956b65d7ff 100644 --- a/fs/afs/Makefile +++ b/fs/afs/Makefile @@ -3,10 +3,7 @@ # Makefile for Red Hat Linux AFS client. # -afs-cache-$(CONFIG_AFS_FSCACHE) := cache.o - kafs-y := \ - $(afs-cache-y) \ addr_list.o \ callback.o \ cell.o \ diff --git a/fs/afs/cache.c b/fs/afs/cache.c deleted file mode 100644 index 037af93e3aba..000000000000 --- a/fs/afs/cache.c +++ /dev/null @@ -1,68 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* AFS caching stuff - * - * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - */ - -#include <linux/sched.h> -#include "internal.h" - -static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, - const void *buffer, - uint16_t buflen, - loff_t object_size); - -struct fscache_netfs afs_cache_netfs = { - .name = "afs", - .version = 2, -}; - -struct fscache_cookie_def afs_cell_cache_index_def = { - .name = "AFS.cell", - .type = FSCACHE_COOKIE_TYPE_INDEX, -}; - -struct fscache_cookie_def afs_volume_cache_index_def = { - .name = "AFS.volume", - .type = FSCACHE_COOKIE_TYPE_INDEX, -}; - -struct fscache_cookie_def afs_vnode_cache_index_def = { - .name = "AFS.vnode", - .type = FSCACHE_COOKIE_TYPE_DATAFILE, - .check_aux = afs_vnode_cache_check_aux, -}; - -/* - * check that the auxiliary data indicates that the entry is still valid - */ -static enum fscache_checkaux afs_vnode_cache_check_aux(void *cookie_netfs_data, - const void *buffer, - uint16_t buflen, - loff_t object_size) -{ - struct afs_vnode *vnode = cookie_netfs_data; - struct afs_vnode_cache_aux aux; - - _enter("{%llx,%x,%llx},%p,%u", - vnode->fid.vnode, vnode->fid.unique, vnode->status.data_version, - buffer, buflen); - - memcpy(&aux, buffer, sizeof(aux)); - - /* check the size of the data is what we're expecting */ - if (buflen != sizeof(aux)) { - _leave(" = OBSOLETE [len %hx != %zx]", buflen, sizeof(aux)); - return FSCACHE_CHECKAUX_OBSOLETE; - } - - if (vnode->status.data_version != aux.data_version) { - _leave(" = OBSOLETE [vers %llx != %llx]", - aux.data_version, vnode->status.data_version); - return FSCACHE_CHECKAUX_OBSOLETE; - } - - _leave(" = SUCCESS"); - return FSCACHE_CHECKAUX_OKAY; -} diff --git a/fs/afs/callback.c b/fs/afs/callback.c index 1b4d5809808d..a484fa642808 100644 --- a/fs/afs/callback.c +++ b/fs/afs/callback.c @@ -30,7 +30,7 @@ void afs_invalidate_mmap_work(struct work_struct *work) { struct afs_vnode *vnode = container_of(work, struct afs_vnode, cb_work); - unmap_mapping_pages(vnode->vfs_inode.i_mapping, 0, 0, false); + unmap_mapping_pages(vnode->netfs.inode.i_mapping, 0, 0, false); } void afs_server_init_callback_work(struct work_struct *work) diff --git a/fs/afs/cell.c b/fs/afs/cell.c index d88407fb9bc0..07ad744eef77 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -680,13 +680,6 @@ static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell) return ret; } -#ifdef CONFIG_AFS_FSCACHE - cell->cache = fscache_acquire_cookie(afs_cache_netfs.primary_index, - &afs_cell_cache_index_def, - cell->name, strlen(cell->name), - NULL, 0, - cell, 0, true); -#endif ret = afs_proc_cell_setup(cell); if (ret < 0) return ret; @@ -723,11 +716,6 @@ static void afs_deactivate_cell(struct afs_net *net, struct afs_cell *cell) afs_dynroot_rmdir(net, cell); mutex_unlock(&net->proc_cells_lock); -#ifdef CONFIG_AFS_FSCACHE - fscache_relinquish_cookie(cell->cache, NULL, false); - cell->cache = NULL; -#endif - _leave(""); } diff --git a/fs/afs/dir.c b/fs/afs/dir.c index da9b4f8577a1..56ae5cd5184f 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -41,11 +41,12 @@ static int afs_symlink(struct user_namespace *mnt_userns, struct inode *dir, static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); -static int afs_dir_releasepage(struct page *page, gfp_t gfp_flags); -static void afs_dir_invalidatepage(struct page *page, unsigned int offset, - unsigned int length); +static bool afs_dir_release_folio(struct folio *folio, gfp_t gfp_flags); +static void afs_dir_invalidate_folio(struct folio *folio, size_t offset, + size_t length); -static int afs_dir_set_page_dirty(struct page *page) +static bool afs_dir_dirty_folio(struct address_space *mapping, + struct folio *folio) { BUG(); /* This should never happen. */ } @@ -73,9 +74,9 @@ const struct inode_operations afs_dir_inode_operations = { }; const struct address_space_operations afs_dir_aops = { - .set_page_dirty = afs_dir_set_page_dirty, - .releasepage = afs_dir_releasepage, - .invalidatepage = afs_dir_invalidatepage, + .dirty_folio = afs_dir_dirty_folio, + .release_folio = afs_dir_release_folio, + .invalidate_folio = afs_dir_invalidate_folio, }; const struct dentry_operations afs_fs_dentry_operations = { @@ -108,7 +109,7 @@ struct afs_lookup_cookie { */ static void afs_dir_read_cleanup(struct afs_read *req) { - struct address_space *mapping = req->vnode->vfs_inode.i_mapping; + struct address_space *mapping = req->vnode->netfs.inode.i_mapping; struct folio *folio; pgoff_t last = req->nr_pages - 1; @@ -152,7 +153,7 @@ static bool afs_dir_check_folio(struct afs_vnode *dvnode, struct folio *folio, block = kmap_local_folio(folio, offset); if (block->hdr.magic != AFS_DIR_MAGIC) { printk("kAFS: %s(%lx): [%llx] bad magic %zx/%zx is %04hx\n", - __func__, dvnode->vfs_inode.i_ino, + __func__, dvnode->netfs.inode.i_ino, pos, offset, size, ntohs(block->hdr.magic)); trace_afs_dir_check_failed(dvnode, pos + offset, i_size); kunmap_local(block); @@ -182,7 +183,7 @@ error: static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req) { union afs_xdr_dir_block *block; - struct address_space *mapping = dvnode->vfs_inode.i_mapping; + struct address_space *mapping = dvnode->netfs.inode.i_mapping; struct folio *folio; pgoff_t last = req->nr_pages - 1; size_t offset, size; @@ -216,7 +217,7 @@ static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req) */ static int afs_dir_check(struct afs_vnode *dvnode, struct afs_read *req) { - struct address_space *mapping = dvnode->vfs_inode.i_mapping; + struct address_space *mapping = dvnode->netfs.inode.i_mapping; struct folio *folio; pgoff_t last = req->nr_pages - 1; int ret = 0; @@ -268,7 +269,7 @@ static int afs_dir_open(struct inode *inode, struct file *file) static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key) __acquires(&dvnode->validate_lock) { - struct address_space *mapping = dvnode->vfs_inode.i_mapping; + struct address_space *mapping = dvnode->netfs.inode.i_mapping; struct afs_read *req; loff_t i_size; int nr_pages, i; @@ -286,7 +287,7 @@ static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key) req->cleanup = afs_dir_read_cleanup; expand: - i_size = i_size_read(&dvnode->vfs_inode); + i_size = i_size_read(&dvnode->netfs.inode); if (i_size < 2048) { ret = afs_bad(dvnode, afs_file_error_dir_small); goto error; @@ -304,7 +305,7 @@ expand: req->actual_len = i_size; /* May change */ req->len = nr_pages * PAGE_SIZE; /* We can ask for more than there is */ req->data_version = dvnode->status.data_version; /* May change */ - iov_iter_xarray(&req->def_iter, READ, &dvnode->vfs_inode.i_mapping->i_pages, + iov_iter_xarray(&req->def_iter, READ, &dvnode->netfs.inode.i_mapping->i_pages, 0, i_size); req->iter = &req->def_iter; @@ -462,8 +463,11 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode, } /* skip if starts before the current position */ - if (offset < curr) + if (offset < curr) { + if (next > curr) + ctx->pos = blkoff + next * sizeof(union afs_xdr_dirent); continue; + } /* found the next entry */ if (!dir_emit(ctx, dire->u.name, nlen, @@ -893,7 +897,7 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, out_op: if (op->error == 0) { - inode = &op->file[1].vnode->vfs_inode; + inode = &op->file[1].vnode->netfs.inode; op->file[1].vnode = NULL; } @@ -1135,7 +1139,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) afs_stat_v(dir, n_reval); /* search the directory for this vnode */ - ret = afs_do_lookup_one(&dir->vfs_inode, dentry, &fid, key, &dir_version); + ret = afs_do_lookup_one(&dir->netfs.inode, dentry, &fid, key, &dir_version); switch (ret) { case 0: /* the filename maps to something */ @@ -1166,7 +1170,7 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) _debug("%pd: file deleted (uq %u -> %u I:%u)", dentry, fid.unique, vnode->fid.unique, - vnode->vfs_inode.i_generation); + vnode->netfs.inode.i_generation); goto not_found; } goto out_valid; @@ -1364,7 +1368,7 @@ static void afs_dir_remove_subdir(struct dentry *dentry) if (d_really_is_positive(dentry)) { struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); - clear_nlink(&vnode->vfs_inode); + clear_nlink(&vnode->netfs.inode); set_bit(AFS_VNODE_DELETED, &vnode->flags); clear_bit(AFS_VNODE_CB_PROMISED, &vnode->flags); clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); @@ -1483,8 +1487,8 @@ static void afs_dir_remove_link(struct afs_operation *op) /* Already done */ } else if (test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) { write_seqlock(&vnode->cb_lock); - drop_nlink(&vnode->vfs_inode); - if (vnode->vfs_inode.i_nlink == 0) { + drop_nlink(&vnode->netfs.inode); + if (vnode->netfs.inode.i_nlink == 0) { set_bit(AFS_VNODE_DELETED, &vnode->flags); __afs_break_callback(vnode, afs_cb_break_for_unlink); } @@ -1500,7 +1504,7 @@ static void afs_dir_remove_link(struct afs_operation *op) op->error = ret; } - _debug("nlink %d [val %d]", vnode->vfs_inode.i_nlink, op->error); + _debug("nlink %d [val %d]", vnode->netfs.inode.i_nlink, op->error); } static void afs_unlink_success(struct afs_operation *op) @@ -1676,8 +1680,8 @@ static void afs_link_success(struct afs_operation *op) afs_update_dentry_version(op, dvp, op->dentry); if (op->dentry_2->d_parent == op->dentry->d_parent) afs_update_dentry_version(op, dvp, op->dentry_2); - ihold(&vp->vnode->vfs_inode); - d_instantiate(op->dentry, &vp->vnode->vfs_inode); + ihold(&vp->vnode->netfs.inode); + d_instantiate(op->dentry, &vp->vnode->netfs.inode); } static void afs_link_put(struct afs_operation *op) @@ -2001,9 +2005,8 @@ error: * Release a directory folio and clean up its private state if it's not busy * - return true if the folio can now be released, false if not */ -static int afs_dir_releasepage(struct page *subpage, gfp_t gfp_flags) +static bool afs_dir_release_folio(struct folio *folio, gfp_t gfp_flags) { - struct folio *folio = page_folio(subpage); struct afs_vnode *dvnode = AFS_FS_I(folio_inode(folio)); _enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, folio_index(folio)); @@ -2019,13 +2022,12 @@ static int afs_dir_releasepage(struct page *subpage, gfp_t gfp_flags) /* * Invalidate part or all of a folio. */ -static void afs_dir_invalidatepage(struct page *subpage, unsigned int offset, - unsigned int length) +static void afs_dir_invalidate_folio(struct folio *folio, size_t offset, + size_t length) { - struct folio *folio = page_folio(subpage); struct afs_vnode *dvnode = AFS_FS_I(folio_inode(folio)); - _enter("{%lu},%u,%u", folio_index(folio), offset, length); + _enter("{%lu},%zu,%zu", folio->index, offset, length); BUG_ON(!folio_test_locked(folio)); diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c index d98e109ecee9..0ab7752d1b75 100644 --- a/fs/afs/dir_edit.c +++ b/fs/afs/dir_edit.c @@ -109,7 +109,7 @@ static void afs_clear_contig_bits(union afs_xdr_dir_block *block, */ static struct folio *afs_dir_get_folio(struct afs_vnode *vnode, pgoff_t index) { - struct address_space *mapping = vnode->vfs_inode.i_mapping; + struct address_space *mapping = vnode->netfs.inode.i_mapping; struct folio *folio; folio = __filemap_get_folio(mapping, index, @@ -216,7 +216,7 @@ void afs_edit_dir_add(struct afs_vnode *vnode, _enter(",,{%d,%s},", name->len, name->name); - i_size = i_size_read(&vnode->vfs_inode); + i_size = i_size_read(&vnode->netfs.inode); if (i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS || (i_size & (AFS_DIR_BLOCK_SIZE - 1))) { clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); @@ -336,7 +336,7 @@ found_space: if (b < AFS_DIR_BLOCKS_WITH_CTR) meta->meta.alloc_ctrs[b] -= need_slots; - inode_inc_iversion_raw(&vnode->vfs_inode); + inode_inc_iversion_raw(&vnode->netfs.inode); afs_stat_v(vnode, n_dir_cr); _debug("Insert %s in %u[%u]", name->name, b, slot); @@ -383,7 +383,7 @@ void afs_edit_dir_remove(struct afs_vnode *vnode, _enter(",,{%d,%s},", name->len, name->name); - i_size = i_size_read(&vnode->vfs_inode); + i_size = i_size_read(&vnode->netfs.inode); if (i_size < AFS_DIR_BLOCK_SIZE || i_size > AFS_DIR_BLOCK_SIZE * AFS_DIR_MAX_BLOCKS || (i_size & (AFS_DIR_BLOCK_SIZE - 1))) { @@ -463,7 +463,7 @@ found_dirent: if (b < AFS_DIR_BLOCKS_WITH_CTR) meta->meta.alloc_ctrs[b] += need_slots; - inode_set_iversion_raw(&vnode->vfs_inode, vnode->status.data_version); + inode_set_iversion_raw(&vnode->netfs.inode, vnode->status.data_version); afs_stat_v(vnode, n_dir_rm); _debug("Remove %s from %u[%u]", name->name, b, slot); diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c index 45cfd50a9521..bb5807e87fa4 100644 --- a/fs/afs/dir_silly.c +++ b/fs/afs/dir_silly.c @@ -131,7 +131,7 @@ int afs_sillyrename(struct afs_vnode *dvnode, struct afs_vnode *vnode, goto out; } while (!d_is_negative(sdentry)); - ihold(&vnode->vfs_inode); + ihold(&vnode->netfs.inode); ret = afs_do_silly_rename(dvnode, vnode, dentry, sdentry, key); switch (ret) { @@ -148,7 +148,7 @@ int afs_sillyrename(struct afs_vnode *dvnode, struct afs_vnode *vnode, d_drop(sdentry); } - iput(&vnode->vfs_inode); + iput(&vnode->netfs.inode); dput(sdentry); out: _leave(" = %d", ret); diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c index db832cc931c8..d7d9402ff718 100644 --- a/fs/afs/dynroot.c +++ b/fs/afs/dynroot.c @@ -76,6 +76,7 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root) /* there shouldn't be an existing inode */ BUG_ON(!(inode->i_state & I_NEW)); + netfs_inode_init(&vnode->netfs, NULL); inode->i_size = 0; inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; if (root) { diff --git a/fs/afs/file.c b/fs/afs/file.c index afe4b803f84b..42118a4f3383 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -14,17 +14,16 @@ #include <linux/gfp.h> #include <linux/task_io_accounting_ops.h> #include <linux/mm.h> +#include <linux/swap.h> #include <linux/netfs.h> #include "internal.h" static int afs_file_mmap(struct file *file, struct vm_area_struct *vma); -static int afs_readpage(struct file *file, struct page *page); -static int afs_symlink_readpage(struct file *file, struct page *page); -static void afs_invalidatepage(struct page *page, unsigned int offset, - unsigned int length); -static int afs_releasepage(struct page *page, gfp_t gfp_flags); +static int afs_symlink_read_folio(struct file *file, struct folio *folio); +static void afs_invalidate_folio(struct folio *folio, size_t offset, + size_t length); +static bool afs_release_folio(struct folio *folio, gfp_t gfp_flags); -static void afs_readahead(struct readahead_control *ractl); static ssize_t afs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter); static void afs_vm_open(struct vm_area_struct *area); static void afs_vm_close(struct vm_area_struct *area); @@ -51,12 +50,12 @@ const struct inode_operations afs_file_inode_operations = { }; const struct address_space_operations afs_file_aops = { - .readpage = afs_readpage, - .readahead = afs_readahead, - .set_page_dirty = afs_set_page_dirty, - .launder_page = afs_launder_page, - .releasepage = afs_releasepage, - .invalidatepage = afs_invalidatepage, + .read_folio = netfs_read_folio, + .readahead = netfs_readahead, + .dirty_folio = afs_dirty_folio, + .launder_folio = afs_launder_folio, + .release_folio = afs_release_folio, + .invalidate_folio = afs_invalidate_folio, .write_begin = afs_write_begin, .write_end = afs_write_end, .writepage = afs_writepage, @@ -64,9 +63,9 @@ const struct address_space_operations afs_file_aops = { }; const struct address_space_operations afs_symlink_aops = { - .readpage = afs_symlink_readpage, - .releasepage = afs_releasepage, - .invalidatepage = afs_invalidatepage, + .read_folio = afs_symlink_read_folio, + .release_folio = afs_release_folio, + .invalidate_folio = afs_invalidate_folio, }; static const struct vm_operations_struct afs_vm_ops = { @@ -158,7 +157,9 @@ int afs_open(struct inode *inode, struct file *file) if (file->f_flags & O_TRUNC) set_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); - + + fscache_use_cookie(afs_vnode_cache(vnode), file->f_mode & FMODE_WRITE); + file->private_data = af; _leave(" = 0"); return 0; @@ -177,8 +178,10 @@ error: */ int afs_release(struct inode *inode, struct file *file) { + struct afs_vnode_cache_aux aux; struct afs_vnode *vnode = AFS_FS_I(inode); struct afs_file *af = file->private_data; + loff_t i_size; int ret = 0; _enter("{%llx:%llu},", vnode->fid.vid, vnode->fid.vnode); @@ -189,6 +192,15 @@ int afs_release(struct inode *inode, struct file *file) file->private_data = NULL; if (af->wb) afs_put_wb_key(af->wb); + + if ((file->f_mode & FMODE_WRITE)) { + i_size = i_size_read(&vnode->netfs.inode); + afs_set_cache_aux(vnode, &aux); + fscache_unuse_cookie(afs_vnode_cache(vnode), &aux, &i_size); + } else { + fscache_unuse_cookie(afs_vnode_cache(vnode), NULL, NULL); + } + key_put(af->key); kfree(af); afs_prune_wb_keys(vnode); @@ -226,7 +238,7 @@ void afs_put_read(struct afs_read *req) static void afs_fetch_data_notify(struct afs_operation *op) { struct afs_read *req = op->fetch.req; - struct netfs_read_subrequest *subreq = req->subreq; + struct netfs_io_subrequest *subreq = req->subreq; int error = op->error; if (error == -ECONNABORTED) @@ -296,7 +308,7 @@ int afs_fetch_data(struct afs_vnode *vnode, struct afs_read *req) return afs_do_sync_operation(op); } -static void afs_req_issue_op(struct netfs_read_subrequest *subreq) +static void afs_issue_read(struct netfs_io_subrequest *subreq) { struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode); struct afs_read *fsreq; @@ -313,18 +325,17 @@ static void afs_req_issue_op(struct netfs_read_subrequest *subreq) fsreq->iter = &fsreq->def_iter; iov_iter_xarray(&fsreq->def_iter, READ, - &fsreq->vnode->vfs_inode.i_mapping->i_pages, + &fsreq->vnode->netfs.inode.i_mapping->i_pages, fsreq->pos, fsreq->len); afs_fetch_data(fsreq->vnode, fsreq); afs_put_read(fsreq); } -static int afs_symlink_readpage(struct file *file, struct page *page) +static int afs_symlink_read_folio(struct file *file, struct folio *folio) { - struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); + struct afs_vnode *vnode = AFS_FS_I(folio->mapping->host); struct afs_read *fsreq; - struct folio *folio = page_folio(page); int ret; fsreq = afs_alloc_read(GFP_NOFS); @@ -335,33 +346,32 @@ static int afs_symlink_readpage(struct file *file, struct page *page) fsreq->len = folio_size(folio); fsreq->vnode = vnode; fsreq->iter = &fsreq->def_iter; - iov_iter_xarray(&fsreq->def_iter, READ, &page->mapping->i_pages, + iov_iter_xarray(&fsreq->def_iter, READ, &folio->mapping->i_pages, fsreq->pos, fsreq->len); ret = afs_fetch_data(fsreq->vnode, fsreq); if (ret == 0) - SetPageUptodate(page); - unlock_page(page); + folio_mark_uptodate(folio); + folio_unlock(folio); return ret; } -static void afs_init_rreq(struct netfs_read_request *rreq, struct file *file) +static int afs_init_request(struct netfs_io_request *rreq, struct file *file) { rreq->netfs_priv = key_get(afs_file_key(file)); + return 0; } -static bool afs_is_cache_enabled(struct inode *inode) -{ - struct fscache_cookie *cookie = afs_vnode_cache(AFS_FS_I(inode)); - - return fscache_cookie_enabled(cookie) && !hlist_empty(&cookie->backing_objects); -} - -static int afs_begin_cache_operation(struct netfs_read_request *rreq) +static int afs_begin_cache_operation(struct netfs_io_request *rreq) { +#ifdef CONFIG_AFS_FSCACHE struct afs_vnode *vnode = AFS_FS_I(rreq->inode); - return fscache_begin_read_operation(rreq, afs_vnode_cache(vnode)); + return fscache_begin_read_operation(&rreq->cache_resources, + afs_vnode_cache(vnode)); +#else + return -ENOBUFS; +#endif } static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len, @@ -372,38 +382,31 @@ static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len, return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? -ESTALE : 0; } -static void afs_priv_cleanup(struct address_space *mapping, void *netfs_priv) +static void afs_free_request(struct netfs_io_request *rreq) { - key_put(netfs_priv); + key_put(rreq->netfs_priv); } -const struct netfs_read_request_ops afs_req_ops = { - .init_rreq = afs_init_rreq, - .is_cache_enabled = afs_is_cache_enabled, +const struct netfs_request_ops afs_req_ops = { + .init_request = afs_init_request, + .free_request = afs_free_request, .begin_cache_operation = afs_begin_cache_operation, .check_write_begin = afs_check_write_begin, - .issue_op = afs_req_issue_op, - .cleanup = afs_priv_cleanup, + .issue_read = afs_issue_read, }; -static int afs_readpage(struct file *file, struct page *page) +int afs_write_inode(struct inode *inode, struct writeback_control *wbc) { - struct folio *folio = page_folio(page); - - return netfs_readpage(file, folio, &afs_req_ops, NULL); -} - -static void afs_readahead(struct readahead_control *ractl) -{ - netfs_readahead(ractl, &afs_req_ops, NULL); + fscache_unpin_writeback(wbc, afs_vnode_cache(AFS_FS_I(inode))); + return 0; } /* * Adjust the dirty region of the page on truncation or full invalidation, * getting rid of the markers altogether if the region is entirely invalidated. */ -static void afs_invalidate_dirty(struct folio *folio, unsigned int offset, - unsigned int length) +static void afs_invalidate_dirty(struct folio *folio, size_t offset, + size_t length) { struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio)); unsigned long priv; @@ -460,16 +463,14 @@ full_invalidate: * - release a page and clean up its private data if offset is 0 (indicating * the entire page) */ -static void afs_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) +static void afs_invalidate_folio(struct folio *folio, size_t offset, + size_t length) { - struct folio *folio = page_folio(page); - - _enter("{%lu},%u,%u", folio_index(folio), offset, length); + _enter("{%lu},%zu,%zu", folio->index, offset, length); - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); - if (PagePrivate(page)) + if (folio_get_private(folio)) afs_invalidate_dirty(folio, offset, length); folio_wait_fscache(folio); @@ -480,23 +481,23 @@ static void afs_invalidatepage(struct page *page, unsigned int offset, * release a page and clean up its private state if it's not busy * - return true if the page can now be released, false if not */ -static int afs_releasepage(struct page *page, gfp_t gfp_flags) +static bool afs_release_folio(struct folio *folio, gfp_t gfp) { - struct folio *folio = page_folio(page); struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio)); _enter("{{%llx:%llu}[%lu],%lx},%x", vnode->fid.vid, vnode->fid.vnode, folio_index(folio), folio->flags, - gfp_flags); + gfp); - /* deny if page is being written to the cache and the caller hasn't + /* deny if folio is being written to the cache and the caller hasn't * elected to wait */ #ifdef CONFIG_AFS_FSCACHE if (folio_test_fscache(folio)) { - if (!(gfp_flags & __GFP_DIRECT_RECLAIM) || !(gfp_flags & __GFP_FS)) + if (current_is_kswapd() || !(gfp & __GFP_FS)) return false; folio_wait_fscache(folio); } + fscache_note_page_release(afs_vnode_cache(vnode)); #endif if (folio_test_private(folio)) { diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c index d222dfbe976b..7a3803ce3a22 100644 --- a/fs/afs/fs_operation.c +++ b/fs/afs/fs_operation.c @@ -232,14 +232,14 @@ int afs_put_operation(struct afs_operation *op) if (op->file[1].modification && op->file[1].vnode != op->file[0].vnode) clear_bit(AFS_VNODE_MODIFYING, &op->file[1].vnode->flags); if (op->file[0].put_vnode) - iput(&op->file[0].vnode->vfs_inode); + iput(&op->file[0].vnode->netfs.inode); if (op->file[1].put_vnode) - iput(&op->file[1].vnode->vfs_inode); + iput(&op->file[1].vnode->netfs.inode); if (op->more_files) { for (i = 0; i < op->nr_files - 2; i++) if (op->more_files[i].put_vnode) - iput(&op->more_files[i].vnode->vfs_inode); + iput(&op->more_files[i].vnode->netfs.inode); kfree(op->more_files); } diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 16906eb592d9..64dab70d4a4f 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -54,6 +54,14 @@ static noinline void dump_vnode(struct afs_vnode *vnode, struct afs_vnode *paren } /* + * Set parameters for the netfs library + */ +static void afs_set_netfs_context(struct afs_vnode *vnode) +{ + netfs_inode_init(&vnode->netfs, &afs_req_ops); +} + +/* * Initialise an inode from the vnode status. */ static int afs_inode_init_from_status(struct afs_operation *op, @@ -88,7 +96,7 @@ static int afs_inode_init_from_status(struct afs_operation *op, inode->i_flags |= S_NOATIME; inode->i_uid = make_kuid(&init_user_ns, status->owner); inode->i_gid = make_kgid(&init_user_ns, status->group); - set_nlink(&vnode->vfs_inode, status->nlink); + set_nlink(&vnode->netfs.inode, status->nlink); switch (status->type) { case AFS_FTYPE_FILE: @@ -128,9 +136,10 @@ static int afs_inode_init_from_status(struct afs_operation *op, } afs_set_i_size(vnode, status->size); + afs_set_netfs_context(vnode); vnode->invalid_before = status->data_version; - inode_set_iversion_raw(&vnode->vfs_inode, status->data_version); + inode_set_iversion_raw(&vnode->netfs.inode, status->data_version); if (!vp->scb.have_cb) { /* it's a symlink we just created (the fileserver @@ -154,7 +163,7 @@ static void afs_apply_status(struct afs_operation *op, { struct afs_file_status *status = &vp->scb.status; struct afs_vnode *vnode = vp->vnode; - struct inode *inode = &vnode->vfs_inode; + struct inode *inode = &vnode->netfs.inode; struct timespec64 t; umode_t mode; bool data_changed = false; @@ -237,6 +246,7 @@ static void afs_apply_status(struct afs_operation *op, * idea of what the size should be that's not the same as * what's on the server. */ + vnode->netfs.remote_i_size = status->size; if (change_size) { afs_set_i_size(vnode, status->size); inode->i_ctime = t; @@ -279,7 +289,7 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v */ if (vp->scb.status.abort_code == VNOVNODE) { set_bit(AFS_VNODE_DELETED, &vnode->flags); - clear_nlink(&vnode->vfs_inode); + clear_nlink(&vnode->netfs.inode); __afs_break_callback(vnode, afs_cb_break_for_deleted); op->flags &= ~AFS_OPERATION_DIR_CONFLICT; } @@ -296,8 +306,8 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v if (vp->scb.have_cb) afs_apply_callback(op, vp); } else if (vp->op_unlinked && !(op->flags & AFS_OPERATION_DIR_CONFLICT)) { - drop_nlink(&vnode->vfs_inode); - if (vnode->vfs_inode.i_nlink == 0) { + drop_nlink(&vnode->netfs.inode); + if (vnode->netfs.inode.i_nlink == 0) { set_bit(AFS_VNODE_DELETED, &vnode->flags); __afs_break_callback(vnode, afs_cb_break_for_deleted); } @@ -316,7 +326,7 @@ static void afs_fetch_status_success(struct afs_operation *op) struct afs_vnode *vnode = vp->vnode; int ret; - if (vnode->vfs_inode.i_state & I_NEW) { + if (vnode->netfs.inode.i_state & I_NEW) { ret = afs_inode_init_from_status(op, vp, vnode); op->error = ret; if (ret == 0) @@ -413,28 +423,31 @@ static void afs_get_inode_cache(struct afs_vnode *vnode) { #ifdef CONFIG_AFS_FSCACHE struct { - u32 vnode_id; - u32 unique; - u32 vnode_id_ext[2]; /* Allow for a 96-bit key */ + __be32 vnode_id; + __be32 unique; + __be32 vnode_id_ext[2]; /* Allow for a 96-bit key */ } __packed key; struct afs_vnode_cache_aux aux; if (vnode->status.type != AFS_FTYPE_FILE) { - vnode->cache = NULL; + vnode->netfs.cache = NULL; return; } - key.vnode_id = vnode->fid.vnode; - key.unique = vnode->fid.unique; - key.vnode_id_ext[0] = vnode->fid.vnode >> 32; - key.vnode_id_ext[1] = vnode->fid.vnode_hi; - aux.data_version = vnode->status.data_version; - - vnode->cache = fscache_acquire_cookie(vnode->volume->cache, - &afs_vnode_cache_index_def, - &key, sizeof(key), - &aux, sizeof(aux), - vnode, vnode->status.size, true); + key.vnode_id = htonl(vnode->fid.vnode); + key.unique = htonl(vnode->fid.unique); + key.vnode_id_ext[0] = htonl(vnode->fid.vnode >> 32); + key.vnode_id_ext[1] = htonl(vnode->fid.vnode_hi); + afs_set_cache_aux(vnode, &aux); + + afs_vnode_set_cache(vnode, + fscache_acquire_cookie( + vnode->volume->cache, + vnode->status.type == AFS_FTYPE_FILE ? + 0 : FSCACHE_ADV_SINGLE_CHUNK, + &key, sizeof(key), + &aux, sizeof(aux), + vnode->status.size)); #endif } @@ -444,7 +457,7 @@ static void afs_get_inode_cache(struct afs_vnode *vnode) struct inode *afs_iget(struct afs_operation *op, struct afs_vnode_param *vp) { struct afs_vnode_param *dvp = &op->file[0]; - struct super_block *sb = dvp->vnode->vfs_inode.i_sb; + struct super_block *sb = dvp->vnode->netfs.inode.i_sb; struct afs_vnode *vnode; struct inode *inode; int ret; @@ -527,6 +540,7 @@ struct inode *afs_root_iget(struct super_block *sb, struct key *key) vnode = AFS_FS_I(inode); vnode->cb_v_break = as->volume->cb_v_break, + afs_set_netfs_context(vnode); op = afs_alloc_operation(key, as->volume); if (IS_ERR(op)) { @@ -563,17 +577,15 @@ static void afs_zap_data(struct afs_vnode *vnode) { _enter("{%llx:%llu}", vnode->fid.vid, vnode->fid.vnode); -#ifdef CONFIG_AFS_FSCACHE - fscache_invalidate(vnode->cache); -#endif + afs_invalidate_cache(vnode, 0); /* nuke all the non-dirty pages that aren't locked, mapped or being * written back in a regular file and completely discard the pages in a * directory or symlink */ - if (S_ISREG(vnode->vfs_inode.i_mode)) - invalidate_remote_inode(&vnode->vfs_inode); + if (S_ISREG(vnode->netfs.inode.i_mode)) + invalidate_remote_inode(&vnode->netfs.inode); else - invalidate_inode_pages2(vnode->vfs_inode.i_mapping); + invalidate_inode_pages2(vnode->netfs.inode.i_mapping); } /* @@ -671,8 +683,8 @@ int afs_validate(struct afs_vnode *vnode, struct key *key) key_serial(key)); if (unlikely(test_bit(AFS_VNODE_DELETED, &vnode->flags))) { - if (vnode->vfs_inode.i_nlink) - clear_nlink(&vnode->vfs_inode); + if (vnode->netfs.inode.i_nlink) + clear_nlink(&vnode->netfs.inode); goto valid; } @@ -728,10 +740,23 @@ int afs_getattr(struct user_namespace *mnt_userns, const struct path *path, { struct inode *inode = d_inode(path->dentry); struct afs_vnode *vnode = AFS_FS_I(inode); - int seq = 0; + struct key *key; + int ret, seq = 0; _enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation); + if (vnode->volume && + !(query_flags & AT_STATX_DONT_SYNC) && + !test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) { + key = afs_request_key(vnode->volume->cell); + if (IS_ERR(key)) + return PTR_ERR(key); + ret = afs_validate(vnode, key); + key_put(key); + if (ret < 0) + return ret; + } + do { read_seqbegin_or_lock(&vnode->cb_lock, &seq); generic_fillattr(&init_user_ns, inode, stat); @@ -762,9 +787,8 @@ int afs_drop_inode(struct inode *inode) */ void afs_evict_inode(struct inode *inode) { - struct afs_vnode *vnode; - - vnode = AFS_FS_I(inode); + struct afs_vnode_cache_aux aux; + struct afs_vnode *vnode = AFS_FS_I(inode); _enter("{%llx:%llu.%d}", vnode->fid.vid, @@ -776,6 +800,9 @@ void afs_evict_inode(struct inode *inode) ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode); truncate_inode_pages_final(&inode->i_data); + + afs_set_cache_aux(vnode, &aux); + fscache_clear_inode_writeback(afs_vnode_cache(vnode), inode, &aux); clear_inode(inode); while (!list_empty(&vnode->wb_keys)) { @@ -785,16 +812,8 @@ void afs_evict_inode(struct inode *inode) afs_put_wb_key(wbk); } -#ifdef CONFIG_AFS_FSCACHE - { - struct afs_vnode_cache_aux aux; - - aux.data_version = vnode->status.data_version; - fscache_relinquish_cookie(vnode->cache, &aux, - test_bit(AFS_VNODE_DELETED, &vnode->flags)); - vnode->cache = NULL; - } -#endif + fscache_relinquish_cookie(afs_vnode_cache(vnode), + test_bit(AFS_VNODE_DELETED, &vnode->flags)); afs_prune_wb_keys(vnode); afs_put_permits(rcu_access_pointer(vnode->permit_cache)); @@ -808,7 +827,7 @@ void afs_evict_inode(struct inode *inode) static void afs_setattr_success(struct afs_operation *op) { struct afs_vnode_param *vp = &op->file[0]; - struct inode *inode = &vp->vnode->vfs_inode; + struct inode *inode = &vp->vnode->netfs.inode; loff_t old_i_size = i_size_read(inode); op->setattr.old_i_size = old_i_size; @@ -825,7 +844,7 @@ static void afs_setattr_success(struct afs_operation *op) static void afs_setattr_edit_file(struct afs_operation *op) { struct afs_vnode_param *vp = &op->file[0]; - struct inode *inode = &vp->vnode->vfs_inode; + struct inode *inode = &vp->vnode->netfs.inode; if (op->setattr.attr->ia_valid & ATTR_SIZE) { loff_t size = op->setattr.attr->ia_size; @@ -833,6 +852,9 @@ static void afs_setattr_edit_file(struct afs_operation *op) if (size < i_size) truncate_pagecache(inode, size); + if (size != i_size) + fscache_resize_cookie(afs_vnode_cache(vp->vnode), + vp->scb.status.size); } } @@ -849,40 +871,67 @@ static const struct afs_operation_ops afs_setattr_operation = { int afs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *attr) { + const unsigned int supported = + ATTR_SIZE | ATTR_MODE | ATTR_UID | ATTR_GID | + ATTR_MTIME | ATTR_MTIME_SET | ATTR_TIMES_SET | ATTR_TOUCH; struct afs_operation *op; struct afs_vnode *vnode = AFS_FS_I(d_inode(dentry)); + struct inode *inode = &vnode->netfs.inode; + loff_t i_size; int ret; _enter("{%llx:%llu},{n=%pd},%x", vnode->fid.vid, vnode->fid.vnode, dentry, attr->ia_valid); - if (!(attr->ia_valid & (ATTR_SIZE | ATTR_MODE | ATTR_UID | ATTR_GID | - ATTR_MTIME | ATTR_MTIME_SET | ATTR_TIMES_SET | - ATTR_TOUCH))) { + if (!(attr->ia_valid & supported)) { _leave(" = 0 [unsupported]"); return 0; } + i_size = i_size_read(inode); if (attr->ia_valid & ATTR_SIZE) { - if (!S_ISREG(vnode->vfs_inode.i_mode)) + if (!S_ISREG(inode->i_mode)) return -EISDIR; - ret = inode_newsize_ok(&vnode->vfs_inode, attr->ia_size); + ret = inode_newsize_ok(inode, attr->ia_size); if (ret) return ret; - if (attr->ia_size == i_size_read(&vnode->vfs_inode)) + if (attr->ia_size == i_size) attr->ia_valid &= ~ATTR_SIZE; } - /* flush any dirty data outstanding on a regular file */ - if (S_ISREG(vnode->vfs_inode.i_mode)) - filemap_write_and_wait(vnode->vfs_inode.i_mapping); + fscache_use_cookie(afs_vnode_cache(vnode), true); /* Prevent any new writebacks from starting whilst we do this. */ down_write(&vnode->validate_lock); + if ((attr->ia_valid & ATTR_SIZE) && S_ISREG(inode->i_mode)) { + loff_t size = attr->ia_size; + + /* Wait for any outstanding writes to the server to complete */ + loff_t from = min(size, i_size); + loff_t to = max(size, i_size); + ret = filemap_fdatawait_range(inode->i_mapping, from, to); + if (ret < 0) + goto out_unlock; + + /* Don't talk to the server if we're just shortening in-memory + * writes that haven't gone to the server yet. + */ + if (!(attr->ia_valid & (supported & ~ATTR_SIZE & ~ATTR_MTIME)) && + attr->ia_size < i_size && + attr->ia_size > vnode->status.size) { + truncate_pagecache(inode, attr->ia_size); + fscache_resize_cookie(afs_vnode_cache(vnode), + attr->ia_size); + i_size_write(inode, attr->ia_size); + ret = 0; + goto out_unlock; + } + } + op = afs_alloc_operation(((attr->ia_valid & ATTR_FILE) ? afs_file_key(attr->ia_file) : NULL), vnode->volume); @@ -907,6 +956,7 @@ int afs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, out_unlock: up_write(&vnode->validate_lock); + fscache_unuse_cookie(afs_vnode_cache(vnode), NULL, NULL); _leave(" = %d", ret); return ret; } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index aa4c0d6c9780..a6f25d9e75b5 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -14,7 +14,6 @@ #include <linux/key.h> #include <linux/workqueue.h> #include <linux/sched.h> -#define FSCACHE_USE_NEW_IO_API #include <linux/fscache.h> #include <linux/backing-dev.h> #include <linux/uuid.h> @@ -208,7 +207,7 @@ struct afs_read { loff_t file_size; /* File size returned by server */ struct key *key; /* The key to use to reissue the read */ struct afs_vnode *vnode; /* The file being read into. */ - struct netfs_read_subrequest *subreq; /* Fscache helper read request this belongs to */ + struct netfs_io_subrequest *subreq; /* Fscache helper read request this belongs to */ afs_dataversion_t data_version; /* Version number returned by server */ refcount_t usage; unsigned int call_debug_id; @@ -312,7 +311,7 @@ struct afs_net { atomic_t n_lookup; /* Number of lookups done */ atomic_t n_reval; /* Number of dentries needing revalidation */ atomic_t n_inval; /* Number of invalidations by the server */ - atomic_t n_relpg; /* Number of invalidations by releasepage */ + atomic_t n_relpg; /* Number of invalidations by release_folio */ atomic_t n_read_dir; /* Number of directory pages read */ atomic_t n_dir_cr; /* Number of directory entry creation edits */ atomic_t n_dir_rm; /* Number of directory entry removal edits */ @@ -364,9 +363,6 @@ struct afs_cell { struct key *anonymous_key; /* anonymous user key for this cell */ struct work_struct manager; /* Manager for init/deinit/dns */ struct hlist_node proc_link; /* /proc cell list link */ -#ifdef CONFIG_AFS_FSCACHE - struct fscache_cookie *cache; /* caching cookie */ -#endif time64_t dns_expiry; /* Time AFSDB/SRV record expires */ time64_t last_inactive; /* Time of last drop of usage count */ atomic_t ref; /* Struct refcount */ @@ -590,7 +586,7 @@ struct afs_volume { #define AFS_VOLUME_BUSY 5 /* - T if volume busy notice given */ #define AFS_VOLUME_MAYBE_NO_IBULK 6 /* - T if some servers don't have InlineBulkStatus */ #ifdef CONFIG_AFS_FSCACHE - struct fscache_cookie *cache; /* caching cookie */ + struct fscache_volume *cache; /* Caching cookie */ #endif struct afs_server_list __rcu *servers; /* List of servers on which volume resides */ rwlock_t servers_lock; /* Lock for ->servers */ @@ -623,15 +619,11 @@ enum afs_lock_state { * leak from one inode to another. */ struct afs_vnode { - struct inode vfs_inode; /* the VFS's inode record */ - + struct netfs_inode netfs; /* Netfslib context and vfs inode */ struct afs_volume *volume; /* volume on which vnode resides */ struct afs_fid fid; /* the file identifier for this inode */ struct afs_file_status status; /* AFS status info for this file */ afs_dataversion_t invalid_before; /* Child dentries are invalid before this */ -#ifdef CONFIG_AFS_FSCACHE - struct fscache_cookie *cache; /* caching cookie */ -#endif struct afs_permits __rcu *permit_cache; /* cache of permits so far obtained */ struct mutex io_lock; /* Lock for serialising I/O on this mutex */ struct rw_semaphore validate_lock; /* lock for validating this vnode */ @@ -678,12 +670,20 @@ struct afs_vnode { static inline struct fscache_cookie *afs_vnode_cache(struct afs_vnode *vnode) { #ifdef CONFIG_AFS_FSCACHE - return vnode->cache; + return netfs_i_cookie(&vnode->netfs); #else return NULL; #endif } +static inline void afs_vnode_set_cache(struct afs_vnode *vnode, + struct fscache_cookie *cookie) +{ +#ifdef CONFIG_AFS_FSCACHE + vnode->netfs.cache = cookie; +#endif +} + /* * cached security record for one user's attempt to access a vnode */ @@ -872,9 +872,24 @@ struct afs_operation { * Cache auxiliary data. */ struct afs_vnode_cache_aux { - u64 data_version; + __be64 data_version; } __packed; +static inline void afs_set_cache_aux(struct afs_vnode *vnode, + struct afs_vnode_cache_aux *aux) +{ + aux->data_version = cpu_to_be64(vnode->status.data_version); +} + +static inline void afs_invalidate_cache(struct afs_vnode *vnode, unsigned int flags) +{ + struct afs_vnode_cache_aux aux; + + afs_set_cache_aux(vnode, &aux); + fscache_invalidate(afs_vnode_cache(vnode), &aux, + i_size_read(&vnode->netfs.inode), flags); +} + /* * We use folio->private to hold the amount of the folio that we've written to, * splitting the field into two parts. However, we need to represent a range @@ -962,13 +977,6 @@ extern void afs_merge_fs_addr6(struct afs_addr_list *, __be32 *, u16); */ #ifdef CONFIG_AFS_FSCACHE extern struct fscache_netfs afs_cache_netfs; -extern struct fscache_cookie_def afs_cell_cache_index_def; -extern struct fscache_cookie_def afs_volume_cache_index_def; -extern struct fscache_cookie_def afs_vnode_cache_index_def; -#else -#define afs_cell_cache_index_def (*(struct fscache_cookie_def *) NULL) -#define afs_volume_cache_index_def (*(struct fscache_cookie_def *) NULL) -#define afs_vnode_cache_index_def (*(struct fscache_cookie_def *) NULL) #endif /* @@ -1059,7 +1067,7 @@ extern const struct address_space_operations afs_file_aops; extern const struct address_space_operations afs_symlink_aops; extern const struct inode_operations afs_file_inode_operations; extern const struct file_operations afs_file_operations; -extern const struct netfs_read_request_ops afs_req_ops; +extern const struct netfs_request_ops afs_req_ops; extern int afs_cache_wb_key(struct afs_vnode *, struct afs_file *); extern void afs_put_wb_key(struct afs_wb_key *); @@ -1068,6 +1076,7 @@ extern int afs_release(struct inode *, struct file *); extern int afs_fetch_data(struct afs_vnode *, struct afs_read *); extern struct afs_read *afs_alloc_read(gfp_t); extern void afs_put_read(struct afs_read *); +extern int afs_write_inode(struct inode *, struct writeback_control *); static inline struct afs_read *afs_get_read(struct afs_read *req) { @@ -1203,7 +1212,7 @@ static inline struct afs_net *afs_i2net(struct inode *inode) static inline struct afs_net *afs_v2net(struct afs_vnode *vnode) { - return afs_i2net(&vnode->vfs_inode); + return afs_i2net(&vnode->netfs.inode); } static inline struct afs_net *afs_sock2net(struct sock *sk) @@ -1506,7 +1515,7 @@ extern struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *, * volume.c */ extern struct afs_volume *afs_create_volume(struct afs_fs_context *); -extern void afs_activate_volume(struct afs_volume *); +extern int afs_activate_volume(struct afs_volume *); extern void afs_deactivate_volume(struct afs_volume *); extern struct afs_volume *afs_get_volume(struct afs_volume *, enum afs_volume_trace); extern void afs_put_volume(struct afs_net *, struct afs_volume *, enum afs_volume_trace); @@ -1515,9 +1524,13 @@ extern int afs_check_volume_status(struct afs_volume *, struct afs_operation *); /* * write.c */ -extern int afs_set_page_dirty(struct page *); +#ifdef CONFIG_AFS_FSCACHE +bool afs_dirty_folio(struct address_space *, struct folio *); +#else +#define afs_dirty_folio filemap_dirty_folio +#endif extern int afs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata); extern int afs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, @@ -1528,7 +1541,7 @@ extern ssize_t afs_file_write(struct kiocb *, struct iov_iter *); extern int afs_fsync(struct file *, loff_t, loff_t, int); extern vm_fault_t afs_page_mkwrite(struct vm_fault *vmf); extern void afs_prune_wb_keys(struct afs_vnode *); -extern int afs_launder_page(struct page *); +int afs_launder_folio(struct folio *); /* * xattr.c @@ -1575,12 +1588,12 @@ extern void yfs_fs_store_opaque_acl2(struct afs_operation *); */ static inline struct afs_vnode *AFS_FS_I(struct inode *inode) { - return container_of(inode, struct afs_vnode, vfs_inode); + return container_of(inode, struct afs_vnode, netfs.inode); } static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode) { - return &vnode->vfs_inode; + return &vnode->netfs.inode; } /* @@ -1603,8 +1616,8 @@ static inline void afs_update_dentry_version(struct afs_operation *op, */ static inline void afs_set_i_size(struct afs_vnode *vnode, u64 size) { - i_size_write(&vnode->vfs_inode, size); - vnode->vfs_inode.i_blocks = ((size + 1023) >> 10) << 1; + i_size_write(&vnode->netfs.inode, size); + vnode->netfs.inode.i_blocks = ((size + 1023) >> 10) << 1; } /* diff --git a/fs/afs/main.c b/fs/afs/main.c index 179004b15566..eae288c8d40a 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -186,13 +186,6 @@ static int __init afs_init(void) if (!afs_lock_manager) goto error_lockmgr; -#ifdef CONFIG_AFS_FSCACHE - /* we want to be able to cache */ - ret = fscache_register_netfs(&afs_cache_netfs); - if (ret < 0) - goto error_cache; -#endif - ret = register_pernet_device(&afs_net_ops); if (ret < 0) goto error_net; @@ -215,10 +208,6 @@ error_proc: error_fs: unregister_pernet_device(&afs_net_ops); error_net: -#ifdef CONFIG_AFS_FSCACHE - fscache_unregister_netfs(&afs_cache_netfs); -error_cache: -#endif destroy_workqueue(afs_lock_manager); error_lockmgr: destroy_workqueue(afs_async_calls); @@ -245,9 +234,6 @@ static void __exit afs_exit(void) proc_remove(afs_proc_symlink); afs_fs_exit(); unregister_pernet_device(&afs_net_ops); -#ifdef CONFIG_AFS_FSCACHE - fscache_unregister_netfs(&afs_cache_netfs); -#endif destroy_workqueue(afs_lock_manager); destroy_workqueue(afs_async_calls); destroy_workqueue(afs_wq); diff --git a/fs/afs/misc.c b/fs/afs/misc.c index 1d1a8debe472..933e67fcdab1 100644 --- a/fs/afs/misc.c +++ b/fs/afs/misc.c @@ -163,8 +163,11 @@ void afs_prioritise_error(struct afs_error *e, int error, u32 abort_code) return; case -ECONNABORTED: + error = afs_abort_to_error(abort_code); + fallthrough; + case -ENETRESET: /* Responded, but we seem to have changed address */ e->responded = true; - e->error = afs_abort_to_error(abort_code); + e->error = error; return; } } diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 065a28bfa3f1..e1b863449296 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -227,7 +227,7 @@ static int afs_proc_cell_volumes_show(struct seq_file *m, void *v) static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos) __acquires(cell->proc_lock) { - struct afs_cell *cell = PDE_DATA(file_inode(m->file)); + struct afs_cell *cell = pde_data(file_inode(m->file)); rcu_read_lock(); return seq_hlist_start_head_rcu(&cell->proc_volumes, *_pos); @@ -236,7 +236,7 @@ static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos) static void *afs_proc_cell_volumes_next(struct seq_file *m, void *v, loff_t *_pos) { - struct afs_cell *cell = PDE_DATA(file_inode(m->file)); + struct afs_cell *cell = pde_data(file_inode(m->file)); return seq_hlist_next_rcu(v, &cell->proc_volumes, _pos); } @@ -322,7 +322,7 @@ static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos) { struct afs_vl_seq_net_private *priv = m->private; struct afs_vlserver_list *vllist; - struct afs_cell *cell = PDE_DATA(file_inode(m->file)); + struct afs_cell *cell = pde_data(file_inode(m->file)); loff_t pos = *_pos; rcu_read_lock(); diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c index 79e1a5f6701b..a840c3588ebb 100644 --- a/fs/afs/rotate.c +++ b/fs/afs/rotate.c @@ -292,6 +292,10 @@ bool afs_select_fileserver(struct afs_operation *op) op->error = error; goto iterate_address; + case -ENETRESET: + pr_warn("kAFS: Peer reset %s (op=%x)\n", + op->type ? op->type->name : "???", op->debug_id); + fallthrough; case -ECONNRESET: _debug("call reset"); op->error = error; diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 23a1a92d64bb..a5434f3e57c6 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -537,6 +537,8 @@ static void afs_deliver_to_call(struct afs_call *call) case -ENODATA: case -EBADMSG: case -EMSGSIZE: + case -ENOMEM: + case -EFAULT: abort_code = RXGEN_CC_UNMARSHAL; if (state != AFS_CALL_CL_AWAIT_REPLY) abort_code = RXGEN_SS_UNMARSHAL; @@ -544,7 +546,7 @@ static void afs_deliver_to_call(struct afs_call *call) abort_code, ret, "KUM"); goto local_abort; default: - abort_code = RX_USER_ABORT; + abort_code = RX_CALL_DEAD; rxrpc_kernel_abort_call(call->net->socket, call->rxcall, abort_code, ret, "KER"); goto local_abort; @@ -836,7 +838,7 @@ void afs_send_empty_reply(struct afs_call *call) case -ENOMEM: _debug("oom"); rxrpc_kernel_abort_call(net->socket, call->rxcall, - RX_USER_ABORT, -ENOMEM, "KOO"); + RXGEN_SS_MARSHAL, -ENOMEM, "KOO"); fallthrough; default: _leave(" [error]"); @@ -878,7 +880,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len) if (n == -ENOMEM) { _debug("oom"); rxrpc_kernel_abort_call(net->socket, call->rxcall, - RX_USER_ABORT, -ENOMEM, "KOO"); + RXGEN_SS_MARSHAL, -ENOMEM, "KOO"); } _leave(" [error]"); } diff --git a/fs/afs/security.c b/fs/afs/security.c index 3c7a8fc4f93f..7c6a63a30394 100644 --- a/fs/afs/security.c +++ b/fs/afs/security.c @@ -219,8 +219,7 @@ void afs_cache_permit(struct afs_vnode *vnode, struct key *key, * yet. */ size++; - new = kzalloc(sizeof(struct afs_permits) + - sizeof(struct afs_permit) * size, GFP_NOFS); + new = kzalloc(struct_size(new, permits, size), GFP_NOFS); if (!new) goto out_put; diff --git a/fs/afs/super.c b/fs/afs/super.c index 34c68724c98b..95d713074dc8 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -55,6 +55,7 @@ int afs_net_id; static const struct super_operations afs_super_ops = { .statfs = afs_statfs, .alloc_inode = afs_alloc_inode, + .write_inode = afs_write_inode, .drop_inode = afs_drop_inode, .destroy_inode = afs_destroy_inode, .free_inode = afs_free_inode, @@ -658,7 +659,7 @@ static void afs_i_init_once(void *_vnode) struct afs_vnode *vnode = _vnode; memset(vnode, 0, sizeof(*vnode)); - inode_init_once(&vnode->vfs_inode); + inode_init_once(&vnode->netfs.inode); mutex_init(&vnode->io_lock); init_rwsem(&vnode->validate_lock); spin_lock_init(&vnode->wb_lock); @@ -678,7 +679,7 @@ static struct inode *afs_alloc_inode(struct super_block *sb) { struct afs_vnode *vnode; - vnode = kmem_cache_alloc(afs_inode_cachep, GFP_KERNEL); + vnode = alloc_inode_sb(sb, afs_inode_cachep, GFP_KERNEL); if (!vnode) return NULL; @@ -687,13 +688,11 @@ static struct inode *afs_alloc_inode(struct super_block *sb) /* Reset anything that shouldn't leak from one inode to the next. */ memset(&vnode->fid, 0, sizeof(vnode->fid)); memset(&vnode->status, 0, sizeof(vnode->status)); + afs_vnode_set_cache(vnode, NULL); vnode->volume = NULL; vnode->lock_key = NULL; vnode->permit_cache = NULL; -#ifdef CONFIG_AFS_FSCACHE - vnode->cache = NULL; -#endif vnode->flags = 1 << AFS_VNODE_UNSET; vnode->lock_state = AFS_VNODE_LOCK_NONE; @@ -701,8 +700,8 @@ static struct inode *afs_alloc_inode(struct super_block *sb) init_rwsem(&vnode->rmdir_lock); INIT_WORK(&vnode->cb_work, afs_invalidate_mmap_work); - _leave(" = %p", &vnode->vfs_inode); - return &vnode->vfs_inode; + _leave(" = %p", &vnode->netfs.inode); + return &vnode->netfs.inode; } static void afs_free_inode(struct inode *inode) diff --git a/fs/afs/volume.c b/fs/afs/volume.c index f84194b791d3..cc665cef0abe 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -9,8 +9,7 @@ #include <linux/slab.h> #include "internal.h" -unsigned __read_mostly afs_volume_gc_delay = 10; -unsigned __read_mostly afs_volume_record_life = 60 * 60; +static unsigned __read_mostly afs_volume_record_life = 60 * 60; /* * Insert a volume into a cell. If there's an existing volume record, that is @@ -268,15 +267,30 @@ void afs_put_volume(struct afs_net *net, struct afs_volume *volume, /* * Activate a volume. */ -void afs_activate_volume(struct afs_volume *volume) +int afs_activate_volume(struct afs_volume *volume) { #ifdef CONFIG_AFS_FSCACHE - volume->cache = fscache_acquire_cookie(volume->cell->cache, - &afs_volume_cache_index_def, - &volume->vid, sizeof(volume->vid), - NULL, 0, - volume, 0, true); + struct fscache_volume *vcookie; + char *name; + + name = kasprintf(GFP_KERNEL, "afs,%s,%llx", + volume->cell->name, volume->vid); + if (!name) + return -ENOMEM; + + vcookie = fscache_acquire_volume(name, NULL, NULL, 0); + if (IS_ERR(vcookie)) { + if (vcookie != ERR_PTR(-EBUSY)) { + kfree(name); + return PTR_ERR(vcookie); + } + pr_err("AFS: Cache volume key already in use (%s)\n", name); + vcookie = NULL; + } + volume->cache = vcookie; + kfree(name); #endif + return 0; } /* @@ -287,7 +301,7 @@ void afs_deactivate_volume(struct afs_volume *volume) _enter("%s", volume->name); #ifdef CONFIG_AFS_FSCACHE - fscache_relinquish_cookie(volume->cache, NULL, + fscache_relinquish_volume(volume->cache, NULL, test_bit(AFS_VOLUME_DELETED, &volume->flags)); volume->cache = NULL; #endif diff --git a/fs/afs/write.c b/fs/afs/write.c index ca4909baf5e6..2c885b22de34 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -12,23 +12,37 @@ #include <linux/writeback.h> #include <linux/pagevec.h> #include <linux/netfs.h> -#include <linux/fscache.h> #include "internal.h" +static void afs_write_to_cache(struct afs_vnode *vnode, loff_t start, size_t len, + loff_t i_size, bool caching); + +#ifdef CONFIG_AFS_FSCACHE /* - * mark a page as having been made dirty and thus needing writeback + * Mark a page as having been made dirty and thus needing writeback. We also + * need to pin the cache object to write back to. */ -int afs_set_page_dirty(struct page *page) +bool afs_dirty_folio(struct address_space *mapping, struct folio *folio) +{ + return fscache_dirty_folio(mapping, folio, + afs_vnode_cache(AFS_FS_I(mapping->host))); +} +static void afs_folio_start_fscache(bool caching, struct folio *folio) +{ + if (caching) + folio_start_fscache(folio); +} +#else +static void afs_folio_start_fscache(bool caching, struct folio *folio) { - _enter(""); - return __set_page_dirty_nobuffers(page); } +#endif /* * prepare to perform part of a write to a page */ int afs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **_page, void **fsdata) { struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); @@ -46,8 +60,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping, * file. We need to do this before we get a lock on the page in case * there's more than one writer competing for the same cache block. */ - ret = netfs_write_begin(file, mapping, pos, len, flags, &folio, fsdata, - &afs_req_ops, NULL); + ret = netfs_write_begin(&vnode->netfs, file, mapping, pos, len, &folio, fsdata); if (ret < 0) return ret; @@ -114,7 +127,7 @@ int afs_write_end(struct file *file, struct address_space *mapping, unsigned long priv; unsigned int f, from = offset_in_folio(folio, pos); unsigned int t, to = from + copied; - loff_t i_size, maybe_i_size; + loff_t i_size, write_end_pos; _enter("{%llx:%llu},{%lx}", vnode->fid.vid, vnode->fid.vnode, folio_index(folio)); @@ -131,15 +144,16 @@ int afs_write_end(struct file *file, struct address_space *mapping, if (copied == 0) goto out; - maybe_i_size = pos + copied; + write_end_pos = pos + copied; - i_size = i_size_read(&vnode->vfs_inode); - if (maybe_i_size > i_size) { + i_size = i_size_read(&vnode->netfs.inode); + if (write_end_pos > i_size) { write_seqlock(&vnode->cb_lock); - i_size = i_size_read(&vnode->vfs_inode); - if (maybe_i_size > i_size) - afs_set_i_size(vnode, maybe_i_size); + i_size = i_size_read(&vnode->netfs.inode); + if (write_end_pos > i_size) + afs_set_i_size(vnode, write_end_pos); write_sequnlock(&vnode->cb_lock); + fscache_update_cookie(afs_vnode_cache(vnode), NULL, &write_end_pos); } if (folio_test_private(folio)) { @@ -243,7 +257,7 @@ static void afs_redirty_pages(struct writeback_control *wbc, */ static void afs_pages_written_back(struct afs_vnode *vnode, loff_t start, unsigned int len) { - struct address_space *mapping = vnode->vfs_inode.i_mapping; + struct address_space *mapping = vnode->netfs.inode.i_mapping; struct folio *folio; pgoff_t end; @@ -342,7 +356,7 @@ static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t { struct afs_operation *op; struct afs_wb_key *wbk = NULL; - loff_t size = iov_iter_count(iter), i_size; + loff_t size = iov_iter_count(iter); int ret = -ENOKEY; _enter("%s{%llx:%llu.%u},%llx,%llx", @@ -364,17 +378,15 @@ static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t return -ENOMEM; } - i_size = i_size_read(&vnode->vfs_inode); - afs_op_set_vnode(op, 0, vnode); op->file[0].dv_delta = 1; op->file[0].modification = true; op->store.write_iter = iter; op->store.pos = pos; op->store.size = size; - op->store.i_size = max(pos + size, i_size); + op->store.i_size = max(pos + size, vnode->netfs.remote_i_size); op->store.laundering = laundering; - op->mtime = vnode->vfs_inode.i_mtime; + op->mtime = vnode->netfs.inode.i_mtime; op->flags |= AFS_OPERATION_UNINTR; op->ops = &afs_store_data_operation; @@ -418,6 +430,7 @@ static void afs_extend_writeback(struct address_space *mapping, loff_t start, loff_t max_len, bool new_content, + bool caching, unsigned int *_len) { struct pagevec pvec; @@ -464,7 +477,9 @@ static void afs_extend_writeback(struct address_space *mapping, folio_put(folio); break; } - if (!folio_test_dirty(folio) || folio_test_writeback(folio)) { + if (!folio_test_dirty(folio) || + folio_test_writeback(folio) || + folio_test_fscache(folio)) { folio_unlock(folio); folio_put(folio); break; @@ -512,6 +527,7 @@ static void afs_extend_writeback(struct address_space *mapping, BUG(); if (folio_start_writeback(folio)) BUG(); + afs_folio_start_fscache(caching, folio); *_count -= folio_nr_pages(folio); folio_unlock(folio); @@ -537,8 +553,9 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping, struct iov_iter iter; unsigned long priv; unsigned int offset, to, len, max_len; - loff_t i_size = i_size_read(&vnode->vfs_inode); + loff_t i_size = i_size_read(&vnode->netfs.inode); bool new_content = test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); + bool caching = fscache_cookie_enabled(afs_vnode_cache(vnode)); long count = wbc->nr_to_write; int ret; @@ -546,6 +563,7 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping, if (folio_start_writeback(folio)) BUG(); + afs_folio_start_fscache(caching, folio); count -= folio_nr_pages(folio); @@ -572,7 +590,8 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping, if (len < max_len && (to == folio_size(folio) || new_content)) afs_extend_writeback(mapping, vnode, &count, - start, max_len, new_content, &len); + start, max_len, new_content, + caching, &len); len = min_t(loff_t, len, max_len); } @@ -585,12 +604,18 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping, if (start < i_size) { _debug("write back %x @%llx [%llx]", len, start, i_size); + /* Speculatively write to the cache. We have to fix this up + * later if the store fails. + */ + afs_write_to_cache(vnode, start, len, i_size, caching); + iov_iter_xarray(&iter, WRITE, &mapping->i_pages, start, len); ret = afs_store_data(vnode, &iter, start, false); } else { _debug("write discard %x @%llx [%llx]", len, start, i_size); /* The dirty region was entirely beyond the EOF. */ + fscache_clear_page_bits(mapping, start, len, caching); afs_pages_written_back(vnode, start, len); ret = 0; } @@ -610,6 +635,7 @@ static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping, case -EKEYEXPIRED: case -EKEYREJECTED: case -EKEYREVOKED: + case -ENETRESET: afs_redirty_pages(wbc, mapping, start, len); mapping_set_error(mapping, ret); break; @@ -649,6 +675,10 @@ int afs_writepage(struct page *subpage, struct writeback_control *wbc) _enter("{%lx},", folio_index(folio)); +#ifdef CONFIG_AFS_FSCACHE + folio_wait_fscache(folio); +#endif + start = folio_index(folio) * PAGE_SIZE; ret = afs_write_back_from_locked_folio(folio_mapping(folio), wbc, folio, start, LLONG_MAX - start); @@ -671,7 +701,7 @@ static int afs_writepages_region(struct address_space *mapping, struct folio *folio; struct page *head_page; ssize_t ret; - int n; + int n, skips = 0; _enter("%llx,%llx,", start, end); @@ -714,11 +744,23 @@ static int afs_writepages_region(struct address_space *mapping, continue; } - if (folio_test_writeback(folio)) { + if (folio_test_writeback(folio) || + folio_test_fscache(folio)) { folio_unlock(folio); - if (wbc->sync_mode != WB_SYNC_NONE) + if (wbc->sync_mode != WB_SYNC_NONE) { folio_wait_writeback(folio); +#ifdef CONFIG_AFS_FSCACHE + folio_wait_fscache(folio); +#endif + } else { + start += folio_size(folio); + } folio_put(folio); + if (wbc->sync_mode == WB_SYNC_NONE) { + if (skips >= 5 || need_resched()) + break; + skips++; + } continue; } @@ -802,7 +844,7 @@ ssize_t afs_file_write(struct kiocb *iocb, struct iov_iter *from) _enter("{%llx:%llu},{%zu},", vnode->fid.vid, vnode->fid.vnode, count); - if (IS_SWAPFILE(&vnode->vfs_inode)) { + if (IS_SWAPFILE(&vnode->netfs.inode)) { printk(KERN_INFO "AFS: Attempt to write to active swap file!\n"); return -EBUSY; @@ -915,8 +957,8 @@ void afs_prune_wb_keys(struct afs_vnode *vnode) /* Discard unused keys */ spin_lock(&vnode->wb_lock); - if (!mapping_tagged(&vnode->vfs_inode.i_data, PAGECACHE_TAG_WRITEBACK) && - !mapping_tagged(&vnode->vfs_inode.i_data, PAGECACHE_TAG_DIRTY)) { + if (!mapping_tagged(&vnode->netfs.inode.i_data, PAGECACHE_TAG_WRITEBACK) && + !mapping_tagged(&vnode->netfs.inode.i_data, PAGECACHE_TAG_DIRTY)) { list_for_each_entry_safe(wbk, tmp, &vnode->wb_keys, vnode_link) { if (refcount_read(&wbk->usage) == 1) list_move(&wbk->vnode_link, &graveyard); @@ -935,9 +977,8 @@ void afs_prune_wb_keys(struct afs_vnode *vnode) /* * Clean up a page during invalidation. */ -int afs_launder_page(struct page *subpage) +int afs_launder_folio(struct folio *folio) { - struct folio *folio = page_folio(subpage); struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio)); struct iov_iter iter; struct bio_vec bv[1]; @@ -945,7 +986,7 @@ int afs_launder_page(struct page *subpage) unsigned int f, t; int ret = 0; - _enter("{%lx}", folio_index(folio)); + _enter("{%lx}", folio->index); priv = (unsigned long)folio_get_private(folio); if (folio_clear_dirty_for_io(folio)) { @@ -970,3 +1011,28 @@ int afs_launder_page(struct page *subpage) folio_wait_fscache(folio); return ret; } + +/* + * Deal with the completion of writing the data to the cache. + */ +static void afs_write_to_cache_done(void *priv, ssize_t transferred_or_error, + bool was_async) +{ + struct afs_vnode *vnode = priv; + + if (IS_ERR_VALUE(transferred_or_error) && + transferred_or_error != -ENOBUFS) + afs_invalidate_cache(vnode, 0); +} + +/* + * Save the write to the cache also. + */ +static void afs_write_to_cache(struct afs_vnode *vnode, + loff_t start, size_t len, loff_t i_size, + bool caching) +{ + fscache_write_to_cache(afs_vnode_cache(vnode), + vnode->netfs.inode.i_mapping, start, len, i_size, + afs_write_to_cache_done, vnode, caching); +} @@ -220,9 +220,35 @@ struct aio_kiocb { /*------ sysctl variables----*/ static DEFINE_SPINLOCK(aio_nr_lock); -unsigned long aio_nr; /* current system wide number of aio requests */ -unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */ +static unsigned long aio_nr; /* current system wide number of aio requests */ +static unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */ /*----end sysctl variables---*/ +#ifdef CONFIG_SYSCTL +static struct ctl_table aio_sysctls[] = { + { + .procname = "aio-nr", + .data = &aio_nr, + .maxlen = sizeof(aio_nr), + .mode = 0444, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "aio-max-nr", + .data = &aio_max_nr, + .maxlen = sizeof(aio_max_nr), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + {} +}; + +static void __init aio_sysctl_init(void) +{ + register_sysctl_init("fs", aio_sysctls); +} +#else +#define aio_sysctl_init() do { } while (0) +#endif static struct kmem_cache *kiocb_cachep; static struct kmem_cache *kioctx_cachep; @@ -275,6 +301,7 @@ static int __init aio_setup(void) kiocb_cachep = KMEM_CACHE(aio_kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC); kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC); + aio_sysctl_init(); return 0; } __initcall(aio_setup); @@ -451,7 +478,7 @@ out: #endif static const struct address_space_operations aio_ctx_aops = { - .set_page_dirty = __set_page_dirty_no_writeback, + .dirty_folio = noop_dirty_folio, #if IS_ENABLED(CONFIG_MIGRATION) .migratepage = aio_migratepage, #endif @@ -1451,7 +1478,6 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb) req->ki_flags = iocb_flags(req->ki_filp); if (iocb->aio_flags & IOCB_FLAG_RESFD) req->ki_flags |= IOCB_EVENTFD; - req->ki_hint = ki_hint_validate(file_write_hint(req->ki_filp)); if (iocb->aio_flags & IOCB_FLAG_IOPRIO) { /* * If the IOCB_FLAG_IOPRIO flag of aio_flags is set, then @@ -1526,7 +1552,6 @@ static int aio_read(struct kiocb *req, const struct iocb *iocb, file = req->ki_filp; if (unlikely(!(file->f_mode & FMODE_READ))) return -EBADF; - ret = -EINVAL; if (unlikely(!file->f_op->read_iter)) return -EINVAL; diff --git a/fs/attr.c b/fs/attr.c index 66899b6e9bd8..dbe996b0dedf 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -61,9 +61,15 @@ static bool chgrp_ok(struct user_namespace *mnt_userns, const struct inode *inode, kgid_t gid) { kgid_t kgid = i_gid_into_mnt(mnt_userns, inode); - if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode)) && - (in_group_p(gid) || gid_eq(gid, inode->i_gid))) - return true; + if (uid_eq(current_fsuid(), i_uid_into_mnt(mnt_userns, inode))) { + kgid_t mapped_gid; + + if (gid_eq(gid, inode->i_gid)) + return true; + mapped_gid = mapped_kgid_fs(mnt_userns, i_user_ns(inode), gid); + if (in_group_p(mapped_gid)) + return true; + } if (capable_wrt_inode_uidgid(mnt_userns, inode, CAP_CHOWN)) return true; if (gid_eq(kgid, INVALID_GID) && @@ -123,12 +129,20 @@ int setattr_prepare(struct user_namespace *mnt_userns, struct dentry *dentry, /* Make sure a caller can chmod. */ if (ia_valid & ATTR_MODE) { + kgid_t mapped_gid; + if (!inode_owner_or_capable(mnt_userns, inode)) return -EPERM; + + if (ia_valid & ATTR_GID) + mapped_gid = mapped_kgid_fs(mnt_userns, + i_user_ns(inode), attr->ia_gid); + else + mapped_gid = i_gid_into_mnt(mnt_userns, inode); + /* Also check the setgid bit! */ - if (!in_group_p((ia_valid & ATTR_GID) ? attr->ia_gid : - i_gid_into_mnt(mnt_userns, inode)) && - !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) + if (!in_group_p(mapped_gid) && + !capable_wrt_inode_uidgid(mnt_userns, inode, CAP_FSETID)) attr->ia_mode &= ~S_ISGID; } diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index c1ba13d19024..be383fa46b12 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -40,7 +40,7 @@ MODULE_LICENSE("GPL"); static int befs_readdir(struct file *, struct dir_context *); static int befs_get_block(struct inode *, sector_t, struct buffer_head *, int); -static int befs_readpage(struct file *file, struct page *page); +static int befs_read_folio(struct file *file, struct folio *folio); static sector_t befs_bmap(struct address_space *mapping, sector_t block); static struct dentry *befs_lookup(struct inode *, struct dentry *, unsigned int); @@ -48,7 +48,7 @@ static struct inode *befs_iget(struct super_block *, unsigned long); static struct inode *befs_alloc_inode(struct super_block *sb); static void befs_free_inode(struct inode *inode); static void befs_destroy_inodecache(void); -static int befs_symlink_readpage(struct file *, struct page *); +static int befs_symlink_read_folio(struct file *, struct folio *); static int befs_utf2nls(struct super_block *sb, const char *in, int in_len, char **out, int *out_len); static int befs_nls2utf(struct super_block *sb, const char *in, int in_len, @@ -87,12 +87,12 @@ static const struct inode_operations befs_dir_inode_operations = { }; static const struct address_space_operations befs_aops = { - .readpage = befs_readpage, + .read_folio = befs_read_folio, .bmap = befs_bmap, }; static const struct address_space_operations befs_symlink_aops = { - .readpage = befs_symlink_readpage, + .read_folio = befs_symlink_read_folio, }; static const struct export_operations befs_export_operations = { @@ -102,16 +102,16 @@ static const struct export_operations befs_export_operations = { }; /* - * Called by generic_file_read() to read a page of data + * Called by generic_file_read() to read a folio of data * * In turn, simply calls a generic block read function and * passes it the address of befs_get_block, for mapping file * positions to disk blocks. */ static int -befs_readpage(struct file *file, struct page *page) +befs_read_folio(struct file *file, struct folio *folio) { - return block_read_full_page(page, befs_get_block); + return block_read_full_folio(folio, befs_get_block); } static sector_t @@ -277,7 +277,7 @@ befs_alloc_inode(struct super_block *sb) { struct befs_inode_info *bi; - bi = kmem_cache_alloc(befs_inode_cachep, GFP_KERNEL); + bi = alloc_inode_sb(sb, befs_inode_cachep, GFP_KERNEL); if (!bi) return NULL; return &bi->vfs_inode; @@ -468,8 +468,9 @@ befs_destroy_inodecache(void) * The data stream become link name. Unless the LONG_SYMLINK * flag is set. */ -static int befs_symlink_readpage(struct file *unused, struct page *page) +static int befs_symlink_read_folio(struct file *unused, struct folio *folio) { + struct page *page = &folio->page; struct inode *inode = page->mapping->host; struct super_block *sb = inode->i_sb; struct befs_inode_info *befs_ino = BEFS_I(inode); diff --git a/fs/bfs/file.c b/fs/bfs/file.c index 7f8544abf636..57ae5ee6deec 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c @@ -155,9 +155,9 @@ static int bfs_writepage(struct page *page, struct writeback_control *wbc) return block_write_full_page(page, bfs_get_block, wbc); } -static int bfs_readpage(struct file *file, struct page *page) +static int bfs_read_folio(struct file *file, struct folio *folio) { - return block_read_full_page(page, bfs_get_block); + return block_read_full_folio(folio, bfs_get_block); } static void bfs_write_failed(struct address_space *mapping, loff_t to) @@ -169,13 +169,12 @@ static void bfs_write_failed(struct address_space *mapping, loff_t to) } static int bfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { int ret; - ret = block_write_begin(mapping, pos, len, flags, pagep, - bfs_get_block); + ret = block_write_begin(mapping, pos, len, pagep, bfs_get_block); if (unlikely(ret)) bfs_write_failed(mapping, pos + len); @@ -188,8 +187,9 @@ static sector_t bfs_bmap(struct address_space *mapping, sector_t block) } const struct address_space_operations bfs_aops = { - .set_page_dirty = __set_page_dirty_buffers, - .readpage = bfs_readpage, + .dirty_folio = block_dirty_folio, + .invalidate_folio = block_invalidate_folio, + .read_folio = bfs_read_folio, .writepage = bfs_writepage, .write_begin = bfs_write_begin, .write_end = generic_write_end, diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index fd691e4815c5..1926bec2c850 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -239,7 +239,7 @@ static struct kmem_cache *bfs_inode_cachep; static struct inode *bfs_alloc_inode(struct super_block *sb) { struct bfs_inode_info *bi; - bi = kmem_cache_alloc(bfs_inode_cachep, GFP_KERNEL); + bi = alloc_inode_sb(sb, bfs_inode_cachep, GFP_KERNEL); if (!bi) return NULL; return &bi->vfs_inode; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index f8c7f26f1fbb..63c7ebb0da89 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -93,7 +93,7 @@ static int elf_core_dump(struct coredump_params *cprm); #define ELF_CORE_EFLAGS 0 #endif -#define ELF_PAGESTART(_v) ((_v) & ~(unsigned long)(ELF_MIN_ALIGN-1)) +#define ELF_PAGESTART(_v) ((_v) & ~(int)(ELF_MIN_ALIGN-1)) #define ELF_PAGEOFFSET(_v) ((_v) & (ELF_MIN_ALIGN-1)) #define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1)) @@ -101,8 +101,10 @@ static struct linux_binfmt elf_format = { .module = THIS_MODULE, .load_binary = load_elf_binary, .load_shlib = load_elf_library, +#ifdef CONFIG_COREDUMP .core_dump = elf_core_dump, .min_coredump = ELF_EXEC_PAGESIZE, +#endif }; #define BAD_ADDR(x) (unlikely((unsigned long)(x) >= TASK_SIZE)) @@ -170,8 +172,8 @@ static int padzero(unsigned long elf_bss) static int create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, - unsigned long load_addr, unsigned long interp_load_addr, - unsigned long e_entry) + unsigned long interp_load_addr, + unsigned long e_entry, unsigned long phdr_addr) { struct mm_struct *mm = current->mm; unsigned long p = bprm->p; @@ -257,7 +259,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP); NEW_AUX_ENT(AT_PAGESZ, ELF_EXEC_PAGESIZE); NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC); - NEW_AUX_ENT(AT_PHDR, load_addr + exec->e_phoff); + NEW_AUX_ENT(AT_PHDR, phdr_addr); NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr)); NEW_AUX_ENT(AT_PHNUM, exec->e_phnum); NEW_AUX_ENT(AT_BASE, interp_load_addr); @@ -399,22 +401,21 @@ static unsigned long elf_map(struct file *filep, unsigned long addr, return(map_addr); } -static unsigned long total_mapping_size(const struct elf_phdr *cmds, int nr) +static unsigned long total_mapping_size(const struct elf_phdr *phdr, int nr) { - int i, first_idx = -1, last_idx = -1; + elf_addr_t min_addr = -1; + elf_addr_t max_addr = 0; + bool pt_load = false; + int i; for (i = 0; i < nr; i++) { - if (cmds[i].p_type == PT_LOAD) { - last_idx = i; - if (first_idx == -1) - first_idx = i; + if (phdr[i].p_type == PT_LOAD) { + min_addr = min(min_addr, ELF_PAGESTART(phdr[i].p_vaddr)); + max_addr = max(max_addr, phdr[i].p_vaddr + phdr[i].p_memsz); + pt_load = true; } } - if (first_idx == -1) - return 0; - - return cmds[last_idx].p_vaddr + cmds[last_idx].p_memsz - - ELF_PAGESTART(cmds[first_idx].p_vaddr); + return pt_load ? (max_addr - min_addr) : 0; } static int elf_read(struct file *file, void *buf, size_t len, loff_t pos) @@ -823,8 +824,8 @@ static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr, static int load_elf_binary(struct linux_binprm *bprm) { struct file *interpreter = NULL; /* to shut gcc up */ - unsigned long load_addr = 0, load_bias = 0; - int load_addr_set = 0; + unsigned long load_bias = 0, phdr_addr = 0; + int first_pt_load = 1; unsigned long error; struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; struct elf_phdr *elf_property_phdata = NULL; @@ -1074,12 +1075,12 @@ out_free_interp: vaddr = elf_ppnt->p_vaddr; /* - * The first time through the loop, load_addr_set is false: + * The first time through the loop, first_pt_load is true: * layout will be calculated. Once set, use MAP_FIXED since * we know we've already safely mapped the entire region with * MAP_FIXED_NOREPLACE in the once-per-binary logic following. */ - if (load_addr_set) { + if (!first_pt_load) { elf_flags |= MAP_FIXED; } else if (elf_ex->e_type == ET_EXEC) { /* @@ -1135,14 +1136,25 @@ out_free_interp: * is then page aligned. */ load_bias = ELF_PAGESTART(load_bias - vaddr); - } - /* - * Calculate the entire size of the ELF mapping (total_size). - * (Note that load_addr_set is set to true later once the - * initial mapping is performed.) - */ - if (!load_addr_set) { + /* + * Calculate the entire size of the ELF mapping + * (total_size), used for the initial mapping, + * due to load_addr_set which is set to true later + * once the initial mapping is performed. + * + * Note that this is only sensible when the LOAD + * segments are contiguous (or overlapping). If + * used for LOADs that are far apart, this would + * cause the holes between LOADs to be mapped, + * running the risk of having the mapping fail, + * as it would be larger than the ELF file itself. + * + * As a result, only ET_DYN does this, since + * some ET_EXEC (e.g. ia64) may have large virtual + * memory holes between LOADs. + * + */ total_size = total_mapping_size(elf_phdata, elf_ex->e_phnum); if (!total_size) { @@ -1159,16 +1171,25 @@ out_free_interp: goto out_free_dentry; } - if (!load_addr_set) { - load_addr_set = 1; - load_addr = (elf_ppnt->p_vaddr - elf_ppnt->p_offset); + if (first_pt_load) { + first_pt_load = 0; if (elf_ex->e_type == ET_DYN) { load_bias += error - ELF_PAGESTART(load_bias + vaddr); - load_addr += load_bias; reloc_func_desc = load_bias; } } + + /* + * Figure out which segment in the file contains the Program + * Header table, and map to the associated memory address. + */ + if (elf_ppnt->p_offset <= elf_ex->e_phoff && + elf_ex->e_phoff < elf_ppnt->p_offset + elf_ppnt->p_filesz) { + phdr_addr = elf_ex->e_phoff - elf_ppnt->p_offset + + elf_ppnt->p_vaddr; + } + k = elf_ppnt->p_vaddr; if ((elf_ppnt->p_flags & PF_X) && k < start_code) start_code = k; @@ -1204,6 +1225,7 @@ out_free_interp: } e_entry = elf_ex->e_entry + load_bias; + phdr_addr += load_bias; elf_bss += load_bias; elf_brk += load_bias; start_code += load_bias; @@ -1267,8 +1289,8 @@ out_free_interp: goto out; #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */ - retval = create_elf_tables(bprm, elf_ex, - load_addr, interp_load_addr, e_entry); + retval = create_elf_tables(bprm, elf_ex, interp_load_addr, + e_entry, phdr_addr); if (retval < 0) goto out; @@ -1585,7 +1607,7 @@ static int fill_psinfo(struct elf_prpsinfo *psinfo, struct task_struct *p, SET_UID(psinfo->pr_uid, from_kuid_munged(cred->user_ns, cred->uid)); SET_GID(psinfo->pr_gid, from_kgid_munged(cred->user_ns, cred->gid)); rcu_read_unlock(); - strncpy(psinfo->pr_fname, p->comm, sizeof(psinfo->pr_fname)); + get_task_comm(psinfo->pr_fname, p); return 0; } @@ -1619,17 +1641,16 @@ static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata, * long file_ofs * followed by COUNT filenames in ASCII: "FILE1" NUL "FILE2" NUL... */ -static int fill_files_note(struct memelfnote *note) +static int fill_files_note(struct memelfnote *note, struct coredump_params *cprm) { - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; unsigned count, size, names_ofs, remaining, n; user_long_t *data; user_long_t *start_end_ofs; char *name_base, *name_curpos; + int i; /* *Estimated* file count and total data size needed */ - count = mm->map_count; + count = cprm->vma_count; if (count > UINT_MAX / 64) return -EINVAL; size = count * 64; @@ -1651,11 +1672,12 @@ static int fill_files_note(struct memelfnote *note) name_base = name_curpos = ((char *)data) + names_ofs; remaining = size - names_ofs; count = 0; - for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) { + for (i = 0; i < cprm->vma_count; i++) { + struct core_vma_metadata *m = &cprm->vma_meta[i]; struct file *file; const char *filename; - file = vma->vm_file; + file = m->file; if (!file) continue; filename = file_path(file, name_curpos, remaining); @@ -1675,9 +1697,9 @@ static int fill_files_note(struct memelfnote *note) memmove(name_curpos, filename, n); name_curpos += n; - *start_end_ofs++ = vma->vm_start; - *start_end_ofs++ = vma->vm_end; - *start_end_ofs++ = vma->vm_pgoff; + *start_end_ofs++ = m->start; + *start_end_ofs++ = m->end; + *start_end_ofs++ = m->pgoff; count++; } @@ -1688,7 +1710,7 @@ static int fill_files_note(struct memelfnote *note) * Count usually is less than mm->map_count, * we need to move filenames down. */ - n = mm->map_count - count; + n = cprm->vma_count - count; if (n != 0) { unsigned shift_bytes = n * 3 * sizeof(data[0]); memmove(name_base - shift_bytes, name_base, @@ -1744,9 +1766,9 @@ static void do_thread_regset_writeback(struct task_struct *task, static int fill_thread_core_info(struct elf_thread_core_info *t, const struct user_regset_view *view, - long signr, size_t *total) + long signr, struct elf_note_info *info) { - unsigned int i; + unsigned int note_iter, view_iter; /* * NT_PRSTATUS is the one special case, because the regset data @@ -1760,17 +1782,17 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, fill_note(&t->notes[0], "CORE", NT_PRSTATUS, PRSTATUS_SIZE, &t->prstatus); - *total += notesize(&t->notes[0]); + info->size += notesize(&t->notes[0]); do_thread_regset_writeback(t->task, &view->regsets[0]); /* * Each other regset might generate a note too. For each regset - * that has no core_note_type or is inactive, we leave t->notes[i] - * all zero and we'll know to skip writing it later. + * that has no core_note_type or is inactive, skip it. */ - for (i = 1; i < view->n; ++i) { - const struct user_regset *regset = &view->regsets[i]; + note_iter = 1; + for (view_iter = 1; view_iter < view->n; ++view_iter) { + const struct user_regset *regset = &view->regsets[view_iter]; int note_type = regset->core_note_type; bool is_fpreg = note_type == NT_PRFPREG; void *data; @@ -1786,13 +1808,17 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, if (ret < 0) continue; + if (WARN_ON_ONCE(note_iter >= info->thread_notes)) + break; + if (is_fpreg) SET_PR_FPVALID(&t->prstatus); - fill_note(&t->notes[i], is_fpreg ? "CORE" : "LINUX", + fill_note(&t->notes[note_iter], is_fpreg ? "CORE" : "LINUX", note_type, ret, data); - *total += notesize(&t->notes[i]); + info->size += notesize(&t->notes[note_iter]); + note_iter++; } return 1; @@ -1800,7 +1826,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, static int fill_note_info(struct elfhdr *elf, int phdrs, struct elf_note_info *info, - const kernel_siginfo_t *siginfo, struct pt_regs *regs) + struct coredump_params *cprm) { struct task_struct *dump_task = current; const struct user_regset_view *view = task_user_regset_view(dump_task); @@ -1872,7 +1898,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, * Now fill in each thread's information. */ for (t = info->thread; t != NULL; t = t->next) - if (!fill_thread_core_info(t, view, siginfo->si_signo, &info->size)) + if (!fill_thread_core_info(t, view, cprm->siginfo->si_signo, info)) return 0; /* @@ -1881,13 +1907,13 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, fill_psinfo(psinfo, dump_task->group_leader, dump_task->mm); info->size += notesize(&info->psinfo); - fill_siginfo_note(&info->signote, &info->csigdata, siginfo); + fill_siginfo_note(&info->signote, &info->csigdata, cprm->siginfo); info->size += notesize(&info->signote); fill_auxv_note(&info->auxv, current->mm); info->size += notesize(&info->auxv); - if (fill_files_note(&info->files) == 0) + if (fill_files_note(&info->files, cprm) == 0) info->size += notesize(&info->files); return 1; @@ -2029,7 +2055,7 @@ static int elf_note_info_init(struct elf_note_info *info) static int fill_note_info(struct elfhdr *elf, int phdrs, struct elf_note_info *info, - const kernel_siginfo_t *siginfo, struct pt_regs *regs) + struct coredump_params *cprm) { struct core_thread *ct; struct elf_thread_status *ets; @@ -2050,13 +2076,13 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, list_for_each_entry(ets, &info->thread_list, list) { int sz; - sz = elf_dump_thread_status(siginfo->si_signo, ets); + sz = elf_dump_thread_status(cprm->siginfo->si_signo, ets); info->thread_status_size += sz; } /* now collect the dump for the current */ memset(info->prstatus, 0, sizeof(*info->prstatus)); - fill_prstatus(&info->prstatus->common, current, siginfo->si_signo); - elf_core_copy_regs(&info->prstatus->pr_reg, regs); + fill_prstatus(&info->prstatus->common, current, cprm->siginfo->si_signo); + elf_core_copy_regs(&info->prstatus->pr_reg, cprm->regs); /* Set up header */ fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS); @@ -2072,18 +2098,18 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, fill_note(info->notes + 1, "CORE", NT_PRPSINFO, sizeof(*info->psinfo), info->psinfo); - fill_siginfo_note(info->notes + 2, &info->csigdata, siginfo); + fill_siginfo_note(info->notes + 2, &info->csigdata, cprm->siginfo); fill_auxv_note(info->notes + 3, current->mm); info->numnote = 4; - if (fill_files_note(info->notes + info->numnote) == 0) { + if (fill_files_note(info->notes + info->numnote, cprm) == 0) { info->notes_files = info->notes + info->numnote; info->numnote++; } /* Try to dump the FPU. */ - info->prstatus->pr_fpvalid = elf_core_copy_task_fpregs(current, regs, - info->fpu); + info->prstatus->pr_fpvalid = + elf_core_copy_task_fpregs(current, cprm->regs, info->fpu); if (info->prstatus->pr_fpvalid) fill_note(info->notes + info->numnote++, "CORE", NT_PRFPREG, sizeof(*info->fpu), info->fpu); @@ -2169,8 +2195,7 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, static int elf_core_dump(struct coredump_params *cprm) { int has_dumped = 0; - int vma_count, segs, i; - size_t vma_data_size; + int segs, i; struct elfhdr elf; loff_t offset = 0, dataoff; struct elf_note_info info = { }; @@ -2178,16 +2203,12 @@ static int elf_core_dump(struct coredump_params *cprm) struct elf_shdr *shdr4extnum = NULL; Elf_Half e_phnum; elf_addr_t e_shoff; - struct core_vma_metadata *vma_meta; - - if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size)) - return 0; /* * The number of segs are recored into ELF header as 16bit value. * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here. */ - segs = vma_count + elf_core_extra_phdrs(); + segs = cprm->vma_count + elf_core_extra_phdrs(); /* for notes section */ segs++; @@ -2201,7 +2222,7 @@ static int elf_core_dump(struct coredump_params *cprm) * Collect all the non-memory information about the process for the * notes. This also sets up the file header. */ - if (!fill_note_info(&elf, e_phnum, &info, cprm->siginfo, cprm->regs)) + if (!fill_note_info(&elf, e_phnum, &info, cprm)) goto end_coredump; has_dumped = 1; @@ -2226,7 +2247,7 @@ static int elf_core_dump(struct coredump_params *cprm) dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); - offset += vma_data_size; + offset += cprm->vma_data_size; offset += elf_core_extra_data_size(); e_shoff = offset; @@ -2246,8 +2267,8 @@ static int elf_core_dump(struct coredump_params *cprm) goto end_coredump; /* Write program headers for segments dump */ - for (i = 0; i < vma_count; i++) { - struct core_vma_metadata *meta = vma_meta + i; + for (i = 0; i < cprm->vma_count; i++) { + struct core_vma_metadata *meta = cprm->vma_meta + i; struct elf_phdr phdr; phdr.p_type = PT_LOAD; @@ -2284,8 +2305,8 @@ static int elf_core_dump(struct coredump_params *cprm) /* Align to page */ dump_skip_to(cprm, dataoff); - for (i = 0; i < vma_count; i++) { - struct core_vma_metadata *meta = vma_meta + i; + for (i = 0; i < cprm->vma_count; i++) { + struct core_vma_metadata *meta = cprm->vma_meta + i; if (!dump_user_range(cprm, meta->start, meta->dump_size)) goto end_coredump; @@ -2302,7 +2323,6 @@ static int elf_core_dump(struct coredump_params *cprm) end_coredump: free_note_info(&info); kfree(shdr4extnum); - kvfree(vma_meta); kfree(phdr4note); return has_dumped; } @@ -2324,3 +2344,7 @@ static void __exit exit_elf_binfmt(void) core_initcall(init_elf_binfmt); module_exit(exit_elf_binfmt); MODULE_LICENSE("GPL"); + +#ifdef CONFIG_BINFMT_ELF_KUNIT_TEST +#include "binfmt_elf_test.c" +#endif diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index c6f588dc4a9d..08d0c8797828 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -83,8 +83,8 @@ static struct linux_binfmt elf_fdpic_format = { .load_binary = load_elf_fdpic_binary, #ifdef CONFIG_ELF_CORE .core_dump = elf_fdpic_core_dump, -#endif .min_coredump = ELF_EXEC_PAGESIZE, +#endif }; static int __init init_elf_fdpic_binfmt(void) @@ -1465,7 +1465,7 @@ static bool elf_fdpic_dump_segments(struct coredump_params *cprm, static int elf_fdpic_core_dump(struct coredump_params *cprm) { int has_dumped = 0; - int vma_count, segs; + int segs; int i; struct elfhdr *elf = NULL; loff_t offset = 0, dataoff; @@ -1480,8 +1480,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) elf_addr_t e_shoff; struct core_thread *ct; struct elf_thread_status *tmp; - struct core_vma_metadata *vma_meta = NULL; - size_t vma_data_size; /* alloc memory for large data structures: too large to be on stack */ elf = kmalloc(sizeof(*elf), GFP_KERNEL); @@ -1491,9 +1489,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) if (!psinfo) goto end_coredump; - if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size)) - goto end_coredump; - for (ct = current->signal->core_state->dumper.next; ct; ct = ct->next) { tmp = elf_dump_thread_status(cprm->siginfo->si_signo, @@ -1513,7 +1508,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) tmp->next = thread_list; thread_list = tmp; - segs = vma_count + elf_core_extra_phdrs(); + segs = cprm->vma_count + elf_core_extra_phdrs(); /* for notes section */ segs++; @@ -1558,7 +1553,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) /* Page-align dumped data */ dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); - offset += vma_data_size; + offset += cprm->vma_data_size; offset += elf_core_extra_data_size(); e_shoff = offset; @@ -1578,8 +1573,8 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) goto end_coredump; /* write program headers for segments dump */ - for (i = 0; i < vma_count; i++) { - struct core_vma_metadata *meta = vma_meta + i; + for (i = 0; i < cprm->vma_count; i++) { + struct core_vma_metadata *meta = cprm->vma_meta + i; struct elf_phdr phdr; size_t sz; @@ -1628,7 +1623,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) dump_skip_to(cprm, dataoff); - if (!elf_fdpic_dump_segments(cprm, vma_meta, vma_count)) + if (!elf_fdpic_dump_segments(cprm, cprm->vma_meta, cprm->vma_count)) goto end_coredump; if (!elf_core_write_extra_data(cprm)) @@ -1652,7 +1647,6 @@ end_coredump: thread_list = thread_list->next; kfree(tmp); } - kvfree(vma_meta); kfree(phdr4note); kfree(elf); kfree(psinfo); diff --git a/fs/binfmt_elf_test.c b/fs/binfmt_elf_test.c new file mode 100644 index 000000000000..11d734fec366 --- /dev/null +++ b/fs/binfmt_elf_test.c @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <kunit/test.h> + +static void total_mapping_size_test(struct kunit *test) +{ + struct elf_phdr empty[] = { + { .p_type = PT_LOAD, .p_vaddr = 0, .p_memsz = 0, }, + { .p_type = PT_INTERP, .p_vaddr = 10, .p_memsz = 999999, }, + }; + /* + * readelf -lW /bin/mount | grep '^ .*0x0' | awk '{print "\t\t{ .p_type = PT_" \ + * $1 ", .p_vaddr = " $3 ", .p_memsz = " $6 ", },"}' + */ + struct elf_phdr mount[] = { + { .p_type = PT_PHDR, .p_vaddr = 0x00000040, .p_memsz = 0x0002d8, }, + { .p_type = PT_INTERP, .p_vaddr = 0x00000318, .p_memsz = 0x00001c, }, + { .p_type = PT_LOAD, .p_vaddr = 0x00000000, .p_memsz = 0x0033a8, }, + { .p_type = PT_LOAD, .p_vaddr = 0x00004000, .p_memsz = 0x005c91, }, + { .p_type = PT_LOAD, .p_vaddr = 0x0000a000, .p_memsz = 0x0022f8, }, + { .p_type = PT_LOAD, .p_vaddr = 0x0000d330, .p_memsz = 0x000d40, }, + { .p_type = PT_DYNAMIC, .p_vaddr = 0x0000d928, .p_memsz = 0x000200, }, + { .p_type = PT_NOTE, .p_vaddr = 0x00000338, .p_memsz = 0x000030, }, + { .p_type = PT_NOTE, .p_vaddr = 0x00000368, .p_memsz = 0x000044, }, + { .p_type = PT_GNU_PROPERTY, .p_vaddr = 0x00000338, .p_memsz = 0x000030, }, + { .p_type = PT_GNU_EH_FRAME, .p_vaddr = 0x0000b490, .p_memsz = 0x0001ec, }, + { .p_type = PT_GNU_STACK, .p_vaddr = 0x00000000, .p_memsz = 0x000000, }, + { .p_type = PT_GNU_RELRO, .p_vaddr = 0x0000d330, .p_memsz = 0x000cd0, }, + }; + size_t mount_size = 0xE070; + /* https://lore.kernel.org/linux-fsdevel/YfF18Dy85mCntXrx@fractal.localdomain */ + struct elf_phdr unordered[] = { + { .p_type = PT_LOAD, .p_vaddr = 0x00000000, .p_memsz = 0x0033a8, }, + { .p_type = PT_LOAD, .p_vaddr = 0x0000d330, .p_memsz = 0x000d40, }, + { .p_type = PT_LOAD, .p_vaddr = 0x00004000, .p_memsz = 0x005c91, }, + { .p_type = PT_LOAD, .p_vaddr = 0x0000a000, .p_memsz = 0x0022f8, }, + }; + + /* No headers, no size. */ + KUNIT_EXPECT_EQ(test, total_mapping_size(NULL, 0), 0); + KUNIT_EXPECT_EQ(test, total_mapping_size(empty, 0), 0); + /* Empty headers, no size. */ + KUNIT_EXPECT_EQ(test, total_mapping_size(empty, 1), 0); + /* No PT_LOAD headers, no size. */ + KUNIT_EXPECT_EQ(test, total_mapping_size(&empty[1], 1), 0); + /* Empty PT_LOAD and non-PT_LOAD headers, no size. */ + KUNIT_EXPECT_EQ(test, total_mapping_size(empty, 2), 0); + + /* Normal set of PT_LOADS, and expected size. */ + KUNIT_EXPECT_EQ(test, total_mapping_size(mount, ARRAY_SIZE(mount)), mount_size); + /* Unordered PT_LOADs result in same size. */ + KUNIT_EXPECT_EQ(test, total_mapping_size(unordered, ARRAY_SIZE(unordered)), mount_size); +} + +static struct kunit_case binfmt_elf_test_cases[] = { + KUNIT_CASE(total_mapping_size_test), + {}, +}; + +static struct kunit_suite binfmt_elf_test_suite = { + .name = KBUILD_MODNAME, + .test_cases = binfmt_elf_test_cases, +}; + +kunit_test_suite(binfmt_elf_test_suite); diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 5d776f80ee50..c26545d71d39 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -68,11 +68,7 @@ #define RELOC_FAILED 0xff00ff01 /* Relocation incorrect somewhere */ #define UNLOADED_LIB 0x7ff000ff /* Placeholder for unused library */ -#ifdef CONFIG_BINFMT_SHARED_FLAT -#define MAX_SHARED_LIBS (4) -#else -#define MAX_SHARED_LIBS (1) -#endif +#define MAX_SHARED_LIBS (1) #ifdef CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET #define DATA_START_OFFSET_WORDS (0) @@ -92,32 +88,13 @@ struct lib_info { } lib_list[MAX_SHARED_LIBS]; }; -#ifdef CONFIG_BINFMT_SHARED_FLAT -static int load_flat_shared_library(int id, struct lib_info *p); -#endif - static int load_flat_binary(struct linux_binprm *); -static int flat_core_dump(struct coredump_params *cprm); static struct linux_binfmt flat_format = { .module = THIS_MODULE, .load_binary = load_flat_binary, - .core_dump = flat_core_dump, - .min_coredump = PAGE_SIZE }; -/****************************************************************************/ -/* - * Routine writes a core dump image in the current directory. - * Currently only a stub-function. - */ - -static int flat_core_dump(struct coredump_params *cprm) -{ - pr_warn("Process %s:%d received signr %d and should have core dumped\n", - current->comm, current->pid, cprm->siginfo->si_signo); - return 1; -} /****************************************************************************/ /* @@ -322,51 +299,18 @@ out_free: /****************************************************************************/ static unsigned long -calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp) +calc_reloc(unsigned long r, struct lib_info *p) { unsigned long addr; - int id; unsigned long start_brk; unsigned long start_data; unsigned long text_len; unsigned long start_code; -#ifdef CONFIG_BINFMT_SHARED_FLAT - if (r == 0) - id = curid; /* Relocs of 0 are always self referring */ - else { - id = (r >> 24) & 0xff; /* Find ID for this reloc */ - r &= 0x00ffffff; /* Trim ID off here */ - } - if (id >= MAX_SHARED_LIBS) { - pr_err("reference 0x%lx to shared library %d", r, id); - goto failed; - } - if (curid != id) { - if (internalp) { - pr_err("reloc address 0x%lx not in same module " - "(%d != %d)", r, curid, id); - goto failed; - } else if (!p->lib_list[id].loaded && - load_flat_shared_library(id, p) < 0) { - pr_err("failed to load library %d", id); - goto failed; - } - /* Check versioning information (i.e. time stamps) */ - if (p->lib_list[id].build_date && p->lib_list[curid].build_date && - p->lib_list[curid].build_date < p->lib_list[id].build_date) { - pr_err("library %d is younger than %d", id, curid); - goto failed; - } - } -#else - id = 0; -#endif - - start_brk = p->lib_list[id].start_brk; - start_data = p->lib_list[id].start_data; - start_code = p->lib_list[id].start_code; - text_len = p->lib_list[id].text_len; + start_brk = p->lib_list[0].start_brk; + start_data = p->lib_list[0].start_data; + start_code = p->lib_list[0].start_code; + text_len = p->lib_list[0].text_len; if (r > start_brk - start_data + text_len) { pr_err("reloc outside program 0x%lx (0 - 0x%lx/0x%lx)", @@ -433,8 +377,32 @@ static void old_reloc(unsigned long rl) /****************************************************************************/ +static inline u32 __user *skip_got_header(u32 __user *rp) +{ + if (IS_ENABLED(CONFIG_RISCV)) { + /* + * RISC-V has a 16 byte GOT PLT header for elf64-riscv + * and 8 byte GOT PLT header for elf32-riscv. + * Skip the whole GOT PLT header, since it is reserved + * for the dynamic linker (ld.so). + */ + u32 rp_val0, rp_val1; + + if (get_user(rp_val0, rp)) + return rp; + if (get_user(rp_val1, rp + 1)) + return rp; + + if (rp_val0 == 0xffffffff && rp_val1 == 0xffffffff) + rp += 4; + else if (rp_val0 == 0xffffffff) + rp += 2; + } + return rp; +} + static int load_flat_file(struct linux_binprm *bprm, - struct lib_info *libinfo, int id, unsigned long *extra_stack) + struct lib_info *libinfo, unsigned long *extra_stack) { struct flat_hdr *hdr; unsigned long textpos, datapos, realdatastart; @@ -486,14 +454,6 @@ static int load_flat_file(struct linux_binprm *bprm, goto err; } - /* Don't allow old format executables to use shared libraries */ - if (rev == OLD_FLAT_VERSION && id != 0) { - pr_err("shared libraries are not available before rev 0x%lx\n", - FLAT_VERSION); - ret = -ENOEXEC; - goto err; - } - /* * fix up the flags for the older format, there were all kinds * of endian hacks, this only works for the simple cases @@ -544,15 +504,13 @@ static int load_flat_file(struct linux_binprm *bprm, } /* Flush all traces of the currently running executable */ - if (id == 0) { - ret = begin_new_exec(bprm); - if (ret) - goto err; + ret = begin_new_exec(bprm); + if (ret) + goto err; - /* OK, This is the point of no return */ - set_personality(PER_LINUX_32BIT); - setup_new_exec(bprm); - } + /* OK, This is the point of no return */ + set_personality(PER_LINUX_32BIT); + setup_new_exec(bprm); /* * calculate the extra space we need to map in @@ -732,42 +690,40 @@ static int load_flat_file(struct linux_binprm *bprm, text_len -= sizeof(struct flat_hdr); /* the real code len */ /* The main program needs a little extra setup in the task structure */ - if (id == 0) { - current->mm->start_code = start_code; - current->mm->end_code = end_code; - current->mm->start_data = datapos; - current->mm->end_data = datapos + data_len; - /* - * set up the brk stuff, uses any slack left in data/bss/stack - * allocation. We put the brk after the bss (between the bss - * and stack) like other platforms. - * Userspace code relies on the stack pointer starting out at - * an address right at the end of a page. - */ - current->mm->start_brk = datapos + data_len + bss_len; - current->mm->brk = (current->mm->start_brk + 3) & ~3; + current->mm->start_code = start_code; + current->mm->end_code = end_code; + current->mm->start_data = datapos; + current->mm->end_data = datapos + data_len; + /* + * set up the brk stuff, uses any slack left in data/bss/stack + * allocation. We put the brk after the bss (between the bss + * and stack) like other platforms. + * Userspace code relies on the stack pointer starting out at + * an address right at the end of a page. + */ + current->mm->start_brk = datapos + data_len + bss_len; + current->mm->brk = (current->mm->start_brk + 3) & ~3; #ifndef CONFIG_MMU - current->mm->context.end_brk = memp + memp_size - stack_len; + current->mm->context.end_brk = memp + memp_size - stack_len; #endif - } if (flags & FLAT_FLAG_KTRACE) { pr_info("Mapping is %lx, Entry point is %x, data_start is %x\n", textpos, 0x00ffffff&ntohl(hdr->entry), ntohl(hdr->data_start)); pr_info("%s %s: TEXT=%lx-%lx DATA=%lx-%lx BSS=%lx-%lx\n", - id ? "Lib" : "Load", bprm->filename, + "Load", bprm->filename, start_code, end_code, datapos, datapos + data_len, datapos + data_len, (datapos + data_len + bss_len + 3) & ~3); } /* Store the current module values into the global library structure */ - libinfo->lib_list[id].start_code = start_code; - libinfo->lib_list[id].start_data = datapos; - libinfo->lib_list[id].start_brk = datapos + data_len + bss_len; - libinfo->lib_list[id].text_len = text_len; - libinfo->lib_list[id].loaded = 1; - libinfo->lib_list[id].entry = (0x00ffffff & ntohl(hdr->entry)) + textpos; - libinfo->lib_list[id].build_date = ntohl(hdr->build_date); + libinfo->lib_list[0].start_code = start_code; + libinfo->lib_list[0].start_data = datapos; + libinfo->lib_list[0].start_brk = datapos + data_len + bss_len; + libinfo->lib_list[0].text_len = text_len; + libinfo->lib_list[0].loaded = 1; + libinfo->lib_list[0].entry = (0x00ffffff & ntohl(hdr->entry)) + textpos; + libinfo->lib_list[0].build_date = ntohl(hdr->build_date); /* * We just load the allocations into some temporary memory to @@ -782,14 +738,15 @@ static int load_flat_file(struct linux_binprm *bprm, * image. */ if (flags & FLAT_FLAG_GOTPIC) { - for (rp = (u32 __user *)datapos; ; rp++) { + rp = skip_got_header((u32 __user *) datapos); + for (; ; rp++) { u32 addr, rp_val; if (get_user(rp_val, rp)) return -EFAULT; if (rp_val == 0xffffffff) break; if (rp_val) { - addr = calc_reloc(rp_val, libinfo, id, 0); + addr = calc_reloc(rp_val, libinfo); if (addr == RELOC_FAILED) { ret = -ENOEXEC; goto err; @@ -825,7 +782,7 @@ static int load_flat_file(struct linux_binprm *bprm, return -EFAULT; relval = ntohl(tmp); addr = flat_get_relocate_addr(relval); - rp = (u32 __user *)calc_reloc(addr, libinfo, id, 1); + rp = (u32 __user *)calc_reloc(addr, libinfo); if (rp == (u32 __user *)RELOC_FAILED) { ret = -ENOEXEC; goto err; @@ -848,7 +805,7 @@ static int load_flat_file(struct linux_binprm *bprm, */ addr = ntohl((__force __be32)addr); } - addr = calc_reloc(addr, libinfo, id, 0); + addr = calc_reloc(addr, libinfo); if (addr == RELOC_FAILED) { ret = -ENOEXEC; goto err; @@ -876,7 +833,7 @@ static int load_flat_file(struct linux_binprm *bprm, /* zero the BSS, BRK and stack areas */ if (clear_user((void __user *)(datapos + data_len), bss_len + (memp + memp_size - stack_len - /* end brk */ - libinfo->lib_list[id].start_brk) + /* start brk */ + libinfo->lib_list[0].start_brk) + /* start brk */ stack_len)) return -EFAULT; @@ -887,49 +844,6 @@ err: /****************************************************************************/ -#ifdef CONFIG_BINFMT_SHARED_FLAT - -/* - * Load a shared library into memory. The library gets its own data - * segment (including bss) but not argv/argc/environ. - */ - -static int load_flat_shared_library(int id, struct lib_info *libs) -{ - /* - * This is a fake bprm struct; only the members "buf", "file" and - * "filename" are actually used. - */ - struct linux_binprm bprm; - int res; - char buf[16]; - loff_t pos = 0; - - memset(&bprm, 0, sizeof(bprm)); - - /* Create the file name */ - sprintf(buf, "/lib/lib%d.so", id); - - /* Open the file up */ - bprm.filename = buf; - bprm.file = open_exec(bprm.filename); - res = PTR_ERR(bprm.file); - if (IS_ERR(bprm.file)) - return res; - - res = kernel_read(bprm.file, bprm.buf, BINPRM_BUF_SIZE, &pos); - - if (res >= 0) - res = load_flat_file(&bprm, libs, id, NULL); - - allow_write_access(bprm.file); - fput(bprm.file); - - return res; -} - -#endif /* CONFIG_BINFMT_SHARED_FLAT */ -/****************************************************************************/ /* * These are the functions used to load flat style executables and shared @@ -961,7 +875,7 @@ static int load_flat_binary(struct linux_binprm *bprm) stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */ stack_len = ALIGN(stack_len, FLAT_STACK_ALIGN); - res = load_flat_file(bprm, &libinfo, 0, &stack_len); + res = load_flat_file(bprm, &libinfo, &stack_len); if (res < 0) return res; @@ -1006,20 +920,6 @@ static int load_flat_binary(struct linux_binprm *bprm) */ start_addr = libinfo.lib_list[0].entry; -#ifdef CONFIG_BINFMT_SHARED_FLAT - for (i = MAX_SHARED_LIBS-1; i > 0; i--) { - if (libinfo.lib_list[i].loaded) { - /* Push previos first to call address */ - unsigned long __user *sp; - current->mm->start_stack -= sizeof(unsigned long); - sp = (unsigned long __user *)current->mm->start_stack; - if (put_user(start_addr, sp)) - return -EFAULT; - start_addr = libinfo.lib_list[i].entry; - } - } -#endif - #ifdef FLAT_PLAT_INIT FLAT_PLAT_INIT(regs); #endif diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 520a0f6a7d9e..183e5c4aed34 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -18,8 +18,7 @@ config BTRFS_FS select RAID6_PQ select XOR_BLOCKS select SRCU - depends on !PPC_256K_PAGES # powerpc - depends on !PAGE_SIZE_256KB # hexagon + depends on PAGE_SIZE_LESS_THAN_256KB help Btrfs is a general purpose copy-on-write filesystem with extents, diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 4188ba3fd8c3..99f9995670ea 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -17,6 +17,7 @@ subdir-ccflags-y += $(condflags) subdir-ccflags-y += -Wno-missing-field-initializers subdir-ccflags-y += -Wno-sign-compare subdir-ccflags-y += -Wno-type-limits +subdir-ccflags-y += -Wno-shift-negative-value obj-$(CONFIG_BTRFS_FS) := btrfs.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 0a0d0eccee4e..548d6a5477b4 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -55,9 +55,8 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu) return acl; } -static int __btrfs_set_acl(struct btrfs_trans_handle *trans, - struct user_namespace *mnt_userns, - struct inode *inode, struct posix_acl *acl, int type) +int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, + struct posix_acl *acl, int type) { int ret, size = 0; const char *name; @@ -123,40 +122,8 @@ int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, if (ret) return ret; } - ret = __btrfs_set_acl(NULL, mnt_userns, inode, acl, type); + ret = __btrfs_set_acl(NULL, inode, acl, type); if (ret) inode->i_mode = old_mode; return ret; } - -int btrfs_init_acl(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir) -{ - struct posix_acl *default_acl, *acl; - int ret = 0; - - /* this happens with subvols */ - if (!dir) - return 0; - - ret = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); - if (ret) - return ret; - - if (default_acl) { - ret = __btrfs_set_acl(trans, &init_user_ns, inode, default_acl, - ACL_TYPE_DEFAULT); - posix_acl_release(default_acl); - } - - if (acl) { - if (!ret) - ret = __btrfs_set_acl(trans, &init_user_ns, inode, acl, - ACL_TYPE_ACCESS); - posix_acl_release(acl); - } - - if (!default_acl && !acl) - cache_no_acl(inode); - return ret; -} diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 43c89952b7d2..aac240430efe 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -15,13 +15,12 @@ enum { WORK_DONE_BIT, WORK_ORDER_DONE_BIT, - WORK_HIGH_PRIO_BIT, }; #define NO_THRESHOLD (-1) #define DFT_THRESHOLD (32) -struct __btrfs_workqueue { +struct btrfs_workqueue { struct workqueue_struct *normal_wq; /* File system this workqueue services */ @@ -48,12 +47,7 @@ struct __btrfs_workqueue { spinlock_t thres_lock; }; -struct btrfs_workqueue { - struct __btrfs_workqueue *normal; - struct __btrfs_workqueue *high; -}; - -struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq) +struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct btrfs_workqueue *wq) { return wq->fs_info; } @@ -66,22 +60,22 @@ struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work) bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq) { /* - * We could compare wq->normal->pending with num_online_cpus() + * We could compare wq->pending with num_online_cpus() * to support "thresh == NO_THRESHOLD" case, but it requires * moving up atomic_inc/dec in thresh_queue/exec_hook. Let's * postpone it until someone needs the support of that case. */ - if (wq->normal->thresh == NO_THRESHOLD) + if (wq->thresh == NO_THRESHOLD) return false; - return atomic_read(&wq->normal->pending) > wq->normal->thresh * 2; + return atomic_read(&wq->pending) > wq->thresh * 2; } -static struct __btrfs_workqueue * -__btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name, - unsigned int flags, int limit_active, int thresh) +struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, + const char *name, unsigned int flags, + int limit_active, int thresh) { - struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL); + struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL); if (!ret) return NULL; @@ -105,12 +99,8 @@ __btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name, ret->thresh = thresh; } - if (flags & WQ_HIGHPRI) - ret->normal_wq = alloc_workqueue("btrfs-%s-high", flags, - ret->current_active, name); - else - ret->normal_wq = alloc_workqueue("btrfs-%s", flags, - ret->current_active, name); + ret->normal_wq = alloc_workqueue("btrfs-%s", flags, ret->current_active, + name); if (!ret->normal_wq) { kfree(ret); return NULL; @@ -119,41 +109,7 @@ __btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name, INIT_LIST_HEAD(&ret->ordered_list); spin_lock_init(&ret->list_lock); spin_lock_init(&ret->thres_lock); - trace_btrfs_workqueue_alloc(ret, name, flags & WQ_HIGHPRI); - return ret; -} - -static inline void -__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq); - -struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, - const char *name, - unsigned int flags, - int limit_active, - int thresh) -{ - struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL); - - if (!ret) - return NULL; - - ret->normal = __btrfs_alloc_workqueue(fs_info, name, - flags & ~WQ_HIGHPRI, - limit_active, thresh); - if (!ret->normal) { - kfree(ret); - return NULL; - } - - if (flags & WQ_HIGHPRI) { - ret->high = __btrfs_alloc_workqueue(fs_info, name, flags, - limit_active, thresh); - if (!ret->high) { - __btrfs_destroy_workqueue(ret->normal); - kfree(ret); - return NULL; - } - } + trace_btrfs_workqueue_alloc(ret, name); return ret; } @@ -162,7 +118,7 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, * This hook WILL be called in IRQ handler context, * so workqueue_set_max_active MUST NOT be called in this hook */ -static inline void thresh_queue_hook(struct __btrfs_workqueue *wq) +static inline void thresh_queue_hook(struct btrfs_workqueue *wq) { if (wq->thresh == NO_THRESHOLD) return; @@ -174,7 +130,7 @@ static inline void thresh_queue_hook(struct __btrfs_workqueue *wq) * This hook is called in kthread content. * So workqueue_set_max_active is called here. */ -static inline void thresh_exec_hook(struct __btrfs_workqueue *wq) +static inline void thresh_exec_hook(struct btrfs_workqueue *wq) { int new_current_active; long pending; @@ -217,7 +173,7 @@ out: } } -static void run_ordered_work(struct __btrfs_workqueue *wq, +static void run_ordered_work(struct btrfs_workqueue *wq, struct btrfs_work *self) { struct list_head *list = &wq->ordered_list; @@ -305,7 +261,7 @@ static void btrfs_work_helper(struct work_struct *normal_work) { struct btrfs_work *work = container_of(normal_work, struct btrfs_work, normal_work); - struct __btrfs_workqueue *wq; + struct btrfs_workqueue *wq = work->wq; int need_order = 0; /* @@ -318,7 +274,6 @@ static void btrfs_work_helper(struct work_struct *normal_work) */ if (work->ordered_func) need_order = 1; - wq = work->wq; trace_btrfs_work_sched(work); thresh_exec_hook(wq); @@ -350,8 +305,7 @@ void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func, work->flags = 0; } -static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq, - struct btrfs_work *work) +void btrfs_queue_work(struct btrfs_workqueue *wq, struct btrfs_work *work) { unsigned long flags; @@ -366,54 +320,22 @@ static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq, queue_work(wq->normal_wq, &work->normal_work); } -void btrfs_queue_work(struct btrfs_workqueue *wq, - struct btrfs_work *work) -{ - struct __btrfs_workqueue *dest_wq; - - if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags) && wq->high) - dest_wq = wq->high; - else - dest_wq = wq->normal; - __btrfs_queue_work(dest_wq, work); -} - -static inline void -__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq) -{ - destroy_workqueue(wq->normal_wq); - trace_btrfs_workqueue_destroy(wq); - kfree(wq); -} - void btrfs_destroy_workqueue(struct btrfs_workqueue *wq) { if (!wq) return; - if (wq->high) - __btrfs_destroy_workqueue(wq->high); - __btrfs_destroy_workqueue(wq->normal); + destroy_workqueue(wq->normal_wq); + trace_btrfs_workqueue_destroy(wq); kfree(wq); } void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int limit_active) { - if (!wq) - return; - wq->normal->limit_active = limit_active; - if (wq->high) - wq->high->limit_active = limit_active; -} - -void btrfs_set_work_high_priority(struct btrfs_work *work) -{ - set_bit(WORK_HIGH_PRIO_BIT, &work->flags); + if (wq) + wq->limit_active = limit_active; } void btrfs_flush_workqueue(struct btrfs_workqueue *wq) { - if (wq->high) - flush_workqueue(wq->high->normal_wq); - - flush_workqueue(wq->normal->normal_wq); + flush_workqueue(wq->normal_wq); } diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index 3204daa51b95..07960529b360 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -11,8 +11,6 @@ struct btrfs_fs_info; struct btrfs_workqueue; -/* Internal use only */ -struct __btrfs_workqueue; struct btrfs_work; typedef void (*btrfs_func_t)(struct btrfs_work *arg); typedef void (*btrfs_work_func_t)(struct work_struct *arg); @@ -25,7 +23,7 @@ struct btrfs_work { /* Don't touch things below */ struct work_struct normal_work; struct list_head ordered_list; - struct __btrfs_workqueue *wq; + struct btrfs_workqueue *wq; unsigned long flags; }; @@ -40,9 +38,8 @@ void btrfs_queue_work(struct btrfs_workqueue *wq, struct btrfs_work *work); void btrfs_destroy_workqueue(struct btrfs_workqueue *wq); void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max); -void btrfs_set_work_high_priority(struct btrfs_work *work); struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work); -struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq); +struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct btrfs_workqueue *wq); bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq); void btrfs_flush_workqueue(struct btrfs_workqueue *wq); diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index c9ee579bc5a6..ebc392ea1d74 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -789,11 +789,13 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info, if (IS_ERR(eb)) { free_pref(ref); return PTR_ERR(eb); - } else if (!extent_buffer_uptodate(eb)) { + } + if (!extent_buffer_uptodate(eb)) { free_pref(ref); free_extent_buffer(eb); return -EIO; } + if (lock) btrfs_tree_read_lock(eb); if (btrfs_header_level(eb) == 0) @@ -1335,7 +1337,8 @@ again: if (IS_ERR(eb)) { ret = PTR_ERR(eb); goto out; - } else if (!extent_buffer_uptodate(eb)) { + } + if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); ret = -EIO; goto out; diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 1db24e6d6d90..ede389f2602d 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -124,7 +124,16 @@ void btrfs_put_block_group(struct btrfs_block_group *cache) { if (refcount_dec_and_test(&cache->refs)) { WARN_ON(cache->pinned > 0); - WARN_ON(cache->reserved > 0); + /* + * If there was a failure to cleanup a log tree, very likely due + * to an IO failure on a writeback attempt of one or more of its + * extent buffers, we could not do proper (and cheap) unaccounting + * of their reserved space, so don't warn on reserved > 0 in that + * case. + */ + if (!(cache->flags & BTRFS_BLOCK_GROUP_METADATA) || + !BTRFS_FS_LOG_CLEANUP_ERROR(cache->fs_info)) + WARN_ON(cache->reserved > 0); /* * A block_group shouldn't be on the discard_list anymore. @@ -159,11 +168,12 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, struct rb_node **p; struct rb_node *parent = NULL; struct btrfs_block_group *cache; + bool leftmost = true; ASSERT(block_group->length != 0); - spin_lock(&info->block_group_cache_lock); - p = &info->block_group_cache_tree.rb_node; + write_lock(&info->block_group_cache_lock); + p = &info->block_group_cache_tree.rb_root.rb_node; while (*p) { parent = *p; @@ -172,20 +182,18 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, p = &(*p)->rb_left; } else if (block_group->start > cache->start) { p = &(*p)->rb_right; + leftmost = false; } else { - spin_unlock(&info->block_group_cache_lock); + write_unlock(&info->block_group_cache_lock); return -EEXIST; } } rb_link_node(&block_group->cache_node, parent, p); - rb_insert_color(&block_group->cache_node, - &info->block_group_cache_tree); + rb_insert_color_cached(&block_group->cache_node, + &info->block_group_cache_tree, leftmost); - if (info->first_logical_byte > block_group->start) - info->first_logical_byte = block_group->start; - - spin_unlock(&info->block_group_cache_lock); + write_unlock(&info->block_group_cache_lock); return 0; } @@ -201,8 +209,8 @@ static struct btrfs_block_group *block_group_cache_tree_search( struct rb_node *n; u64 end, start; - spin_lock(&info->block_group_cache_lock); - n = info->block_group_cache_tree.rb_node; + read_lock(&info->block_group_cache_lock); + n = info->block_group_cache_tree.rb_root.rb_node; while (n) { cache = rb_entry(n, struct btrfs_block_group, cache_node); @@ -224,12 +232,9 @@ static struct btrfs_block_group *block_group_cache_tree_search( break; } } - if (ret) { + if (ret) btrfs_get_block_group(ret); - if (bytenr == 0 && info->first_logical_byte > ret->start) - info->first_logical_byte = ret->start; - } - spin_unlock(&info->block_group_cache_lock); + read_unlock(&info->block_group_cache_lock); return ret; } @@ -258,15 +263,15 @@ struct btrfs_block_group *btrfs_next_block_group( struct btrfs_fs_info *fs_info = cache->fs_info; struct rb_node *node; - spin_lock(&fs_info->block_group_cache_lock); + read_lock(&fs_info->block_group_cache_lock); /* If our block group was removed, we need a full search. */ if (RB_EMPTY_NODE(&cache->cache_node)) { const u64 next_bytenr = cache->start + cache->length; - spin_unlock(&fs_info->block_group_cache_lock); + read_unlock(&fs_info->block_group_cache_lock); btrfs_put_block_group(cache); - cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache; + return btrfs_lookup_first_block_group(fs_info, next_bytenr); } node = rb_next(&cache->cache_node); btrfs_put_block_group(cache); @@ -275,46 +280,70 @@ struct btrfs_block_group *btrfs_next_block_group( btrfs_get_block_group(cache); } else cache = NULL; - spin_unlock(&fs_info->block_group_cache_lock); + read_unlock(&fs_info->block_group_cache_lock); return cache; } -bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) +/** + * Check if we can do a NOCOW write for a given extent. + * + * @fs_info: The filesystem information object. + * @bytenr: Logical start address of the extent. + * + * Check if we can do a NOCOW write for the given extent, and increments the + * number of NOCOW writers in the block group that contains the extent, as long + * as the block group exists and it's currently not in read-only mode. + * + * Returns: A non-NULL block group pointer if we can do a NOCOW write, the caller + * is responsible for calling btrfs_dec_nocow_writers() later. + * + * Or NULL if we can not do a NOCOW write + */ +struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, + u64 bytenr) { struct btrfs_block_group *bg; - bool ret = true; + bool can_nocow = true; bg = btrfs_lookup_block_group(fs_info, bytenr); if (!bg) - return false; + return NULL; spin_lock(&bg->lock); if (bg->ro) - ret = false; + can_nocow = false; else atomic_inc(&bg->nocow_writers); spin_unlock(&bg->lock); - /* No put on block group, done by btrfs_dec_nocow_writers */ - if (!ret) + if (!can_nocow) { btrfs_put_block_group(bg); + return NULL; + } - return ret; + /* No put on block group, done by btrfs_dec_nocow_writers(). */ + return bg; } -void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr) +/** + * Decrement the number of NOCOW writers in a block group. + * + * @bg: The block group. + * + * This is meant to be called after a previous call to btrfs_inc_nocow_writers(), + * and on the block group returned by that call. Typically this is called after + * creating an ordered extent for a NOCOW write, to prevent races with scrub and + * relocation. + * + * After this call, the caller should not use the block group anymore. It it wants + * to use it, then it should get a reference on it before calling this function. + */ +void btrfs_dec_nocow_writers(struct btrfs_block_group *bg) { - struct btrfs_block_group *bg; - - bg = btrfs_lookup_block_group(fs_info, bytenr); - ASSERT(bg); if (atomic_dec_and_test(&bg->nocow_writers)) wake_up_var(&bg->nocow_writers); - /* - * Once for our lookup and once for the lookup done by a previous call - * to btrfs_inc_nocow_writers() - */ - btrfs_put_block_group(bg); + + /* For the lookup done by a previous call to btrfs_inc_nocow_writers(). */ btrfs_put_block_group(bg); } @@ -763,10 +792,10 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only cache->has_caching_ctl = 1; spin_unlock(&cache->lock); - spin_lock(&fs_info->block_group_cache_lock); + write_lock(&fs_info->block_group_cache_lock); refcount_inc(&caching_ctl->count); list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups); - spin_unlock(&fs_info->block_group_cache_lock); + write_unlock(&fs_info->block_group_cache_lock); btrfs_get_block_group(cache); @@ -948,17 +977,15 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, if (ret) goto out; - spin_lock(&fs_info->block_group_cache_lock); - rb_erase(&block_group->cache_node, - &fs_info->block_group_cache_tree); + write_lock(&fs_info->block_group_cache_lock); + rb_erase_cached(&block_group->cache_node, + &fs_info->block_group_cache_tree); RB_CLEAR_NODE(&block_group->cache_node); /* Once for the block groups rbtree */ btrfs_put_block_group(block_group); - if (fs_info->first_logical_byte == block_group->start) - fs_info->first_logical_byte = (u64)-1; - spin_unlock(&fs_info->block_group_cache_lock); + write_unlock(&fs_info->block_group_cache_lock); down_write(&block_group->space_info->groups_sem); /* @@ -983,7 +1010,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, if (block_group->cached == BTRFS_CACHE_STARTED) btrfs_wait_block_group_cache_done(block_group); if (block_group->has_caching_ctl) { - spin_lock(&fs_info->block_group_cache_lock); + write_lock(&fs_info->block_group_cache_lock); if (!caching_ctl) { struct btrfs_caching_control *ctl; @@ -997,7 +1024,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, } if (caching_ctl) list_del_init(&caching_ctl->list); - spin_unlock(&fs_info->block_group_cache_lock); + write_unlock(&fs_info->block_group_cache_lock); if (caching_ctl) { /* Once for the caching bgs list and once for us. */ btrfs_put_caching_control(caching_ctl); @@ -1358,6 +1385,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) goto next; } + ret = btrfs_zone_finish(block_group); + if (ret < 0) { + btrfs_dec_block_group_ro(block_group); + if (ret == -EAGAIN) + ret = 0; + goto next; + } + /* * Want to do this before we do anything else so we can recover * properly if we fail to join the transaction. @@ -1503,6 +1538,13 @@ static int reclaim_bgs_cmp(void *unused, const struct list_head *a, return bg1->used > bg2->used; } +static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info) +{ + if (btrfs_is_zoned(fs_info)) + return btrfs_zoned_should_reclaim(fs_info); + return true; +} + void btrfs_reclaim_bgs_work(struct work_struct *work) { struct btrfs_fs_info *fs_info = @@ -1513,15 +1555,23 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) return; - if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) + if (!btrfs_should_reclaim(fs_info)) return; + sb_start_write(fs_info->sb); + + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { + sb_end_write(fs_info->sb); + return; + } + /* * Long running balances can keep us blocked here for eternity, so * simply skip reclaim if we're unable to get the mutex. */ if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) { btrfs_exclop_finish(fs_info); + sb_end_write(fs_info->sb); return; } @@ -1596,6 +1646,7 @@ next: spin_unlock(&fs_info->unused_bgs_lock); mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_exclop_finish(fs_info); + sb_end_write(fs_info->sb); } void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info) @@ -1677,35 +1728,13 @@ static int find_first_block_group(struct btrfs_fs_info *fs_info, struct btrfs_root *root = btrfs_block_group_root(fs_info); int ret; struct btrfs_key found_key; - struct extent_buffer *leaf; - int slot; - - ret = btrfs_search_slot(NULL, root, key, path, 0, 0); - if (ret < 0) - return ret; - - while (1) { - slot = path->slots[0]; - leaf = path->nodes[0]; - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret == 0) - continue; - if (ret < 0) - goto out; - break; - } - btrfs_item_key_to_cpu(leaf, &found_key, slot); + btrfs_for_each_slot(root, key, &found_key, path, ret) { if (found_key.objectid >= key->objectid && found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { - ret = read_bg_from_eb(fs_info, &found_key, path); - break; + return read_bg_from_eb(fs_info, &found_key, path); } - - path->slots[0]++; } -out: return ret; } @@ -1997,6 +2026,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, cache->length = key->offset; cache->used = btrfs_stack_block_group_used(bgi); cache->flags = btrfs_stack_block_group_flags(bgi); + cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi); set_free_space_tree_thresholds(cache); @@ -2279,7 +2309,7 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans, spin_lock(&block_group->lock); btrfs_set_stack_block_group_used(&bgi, block_group->used); btrfs_set_stack_block_group_chunk_objectid(&bgi, - BTRFS_FIRST_CHUNK_TREE_OBJECTID); + block_group->global_root_id); btrfs_set_stack_block_group_flags(&bgi, block_group->flags); key.objectid = block_group->start; key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; @@ -2435,6 +2465,27 @@ next: btrfs_trans_release_chunk_metadata(trans); } +/* + * For extent tree v2 we use the block_group_item->chunk_offset to point at our + * global root id. For v1 it's always set to BTRFS_FIRST_CHUNK_TREE_OBJECTID. + */ +static u64 calculate_global_root_id(struct btrfs_fs_info *fs_info, u64 offset) +{ + u64 div = SZ_1G; + u64 index; + + if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) + return BTRFS_FIRST_CHUNK_TREE_OBJECTID; + + /* If we have a smaller fs index based on 128MiB. */ + if (btrfs_super_total_bytes(fs_info->super_copy) <= (SZ_1G * 10ULL)) + div = SZ_128M; + + offset = div64_u64(offset, div); + div64_u64_rem(offset, fs_info->nr_global_roots, &index); + return index; +} + struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, u64 type, u64 chunk_offset, u64 size) @@ -2455,6 +2506,8 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran cache->flags = type; cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; + cache->global_root_id = calculate_global_root_id(fs_info, cache->start); + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) cache->needs_free_space = 1; @@ -2464,12 +2517,6 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran return ERR_PTR(ret); } - /* - * New block group is likely to be used soon. Try to activate it now. - * Failure is OK for now. - */ - btrfs_zone_activate(cache); - ret = exclude_super_stripes(cache); if (ret) { /* We may have excluded something, so call this just in case */ @@ -2544,6 +2591,19 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, int ret; bool dirty_bg_running; + /* + * This can only happen when we are doing read-only scrub on read-only + * mount. + * In that case we should not start a new transaction on read-only fs. + * Thus here we skip all chunk allocations. + */ + if (sb_rdonly(fs_info->sb)) { + mutex_lock(&fs_info->ro_block_group_mutex); + ret = inc_block_group_ro(cache, 0); + mutex_unlock(&fs_info->ro_block_group_mutex); + return ret; + } + do { trans = btrfs_join_transaction(root); if (IS_ERR(trans)) @@ -2671,7 +2731,7 @@ static int update_block_group_item(struct btrfs_trans_handle *trans, bi = btrfs_item_ptr_offset(leaf, path->slots[0]); btrfs_set_stack_block_group_used(&bgi, cache->used); btrfs_set_stack_block_group_chunk_objectid(&bgi, - BTRFS_FIRST_CHUNK_TREE_OBJECTID); + cache->global_root_id); btrfs_set_stack_block_group_flags(&bgi, cache->flags); write_extent_buffer(leaf, &bgi, bi, sizeof(bgi)); btrfs_mark_buffer_dirty(leaf); @@ -2894,7 +2954,6 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) struct btrfs_path *path = NULL; LIST_HEAD(dirty); struct list_head *io = &cur_trans->io_bgs; - int num_started = 0; int loops = 0; spin_lock(&cur_trans->dirty_bgs_lock); @@ -2960,7 +3019,6 @@ again: cache->io_ctl.inode = NULL; ret = btrfs_write_out_cache(trans, cache, path); if (ret == 0 && cache->io_ctl.inode) { - num_started++; should_put = 0; /* @@ -3061,7 +3119,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) int should_put; struct btrfs_path *path; struct list_head *io = &cur_trans->io_bgs; - int num_started = 0; path = btrfs_alloc_path(); if (!path) @@ -3119,7 +3176,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) cache->io_ctl.inode = NULL; ret = btrfs_write_out_cache(trans, cache, path); if (ret == 0 && cache->io_ctl.inode) { - num_started++; should_put = 0; list_add_tail(&cache->io_list, io); } else { @@ -3178,6 +3234,31 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) return ret; } +static inline bool should_reclaim_block_group(struct btrfs_block_group *bg, + u64 bytes_freed) +{ + const struct btrfs_space_info *space_info = bg->space_info; + const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold); + const u64 new_val = bg->used; + const u64 old_val = new_val + bytes_freed; + u64 thresh; + + if (reclaim_thresh == 0) + return false; + + thresh = div_factor_fine(bg->length, reclaim_thresh); + + /* + * If we were below the threshold before don't reclaim, we are likely a + * brand new block group and we don't want to relocate new block groups. + */ + if (old_val < thresh) + return false; + if (new_val >= thresh) + return false; + return true; +} + int btrfs_update_block_group(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, bool alloc) { @@ -3200,6 +3281,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&info->delalloc_root_lock); while (total) { + bool reclaim; + cache = btrfs_lookup_block_group(info, bytenr); if (!cache) { ret = -ENOENT; @@ -3245,6 +3328,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, cache->space_info, num_bytes); cache->space_info->bytes_used -= num_bytes; cache->space_info->disk_used -= num_bytes * factor; + + reclaim = should_reclaim_block_group(cache, num_bytes); spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); @@ -3271,6 +3356,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, if (!alloc && old_val == 0) { if (!btrfs_test_opt(info, DISCARD_ASYNC)) btrfs_mark_bg_unused(cache); + } else if (!alloc && reclaim) { + btrfs_mark_bg_to_reclaim(cache); } btrfs_put_block_group(cache); @@ -3403,7 +3490,7 @@ int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type) return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); } -static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags) +static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags) { struct btrfs_block_group *bg; int ret; @@ -3490,7 +3577,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags) out: btrfs_trans_release_chunk_metadata(trans); - return ret; + if (ret) + return ERR_PTR(ret); + + btrfs_get_block_group(bg); + return bg; } /* @@ -3605,10 +3696,17 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_space_info *space_info; + struct btrfs_block_group *ret_bg; bool wait_for_alloc = false; bool should_alloc = false; + bool from_extent_allocation = false; int ret = 0; + if (force == CHUNK_ALLOC_FORCE_FOR_EXTENT) { + from_extent_allocation = true; + force = CHUNK_ALLOC_FORCE; + } + /* Don't re-enter if we're already allocating a chunk */ if (trans->allocating_chunk) return -ENOSPC; @@ -3698,9 +3796,22 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, force_metadata_allocation(fs_info); } - ret = do_chunk_alloc(trans, flags); + ret_bg = do_chunk_alloc(trans, flags); trans->allocating_chunk = false; + if (IS_ERR(ret_bg)) { + ret = PTR_ERR(ret_bg); + } else if (from_extent_allocation) { + /* + * New block group is likely to be used soon. Try to activate + * it now. Failure is OK for now. + */ + btrfs_zone_activate(ret_bg); + } + + if (!ret) + btrfs_put_block_group(ret_bg); + spin_lock(&space_info->lock); if (ret < 0) { if (ret == -ENOSPC) @@ -3891,14 +4002,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) struct btrfs_caching_control *caching_ctl; struct rb_node *n; - spin_lock(&info->block_group_cache_lock); + write_lock(&info->block_group_cache_lock); while (!list_empty(&info->caching_block_groups)) { caching_ctl = list_entry(info->caching_block_groups.next, struct btrfs_caching_control, list); list_del(&caching_ctl->list); btrfs_put_caching_control(caching_ctl); } - spin_unlock(&info->block_group_cache_lock); + write_unlock(&info->block_group_cache_lock); spin_lock(&info->unused_bgs_lock); while (!list_empty(&info->unused_bgs)) { @@ -3928,14 +4039,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) } spin_unlock(&info->zone_active_bgs_lock); - spin_lock(&info->block_group_cache_lock); - while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { + write_lock(&info->block_group_cache_lock); + while ((n = rb_last(&info->block_group_cache_tree.rb_root)) != NULL) { block_group = rb_entry(n, struct btrfs_block_group, cache_node); - rb_erase(&block_group->cache_node, - &info->block_group_cache_tree); + rb_erase_cached(&block_group->cache_node, + &info->block_group_cache_tree); RB_CLEAR_NODE(&block_group->cache_node); - spin_unlock(&info->block_group_cache_lock); + write_unlock(&info->block_group_cache_lock); down_write(&block_group->space_info->groups_sem); list_del(&block_group->list); @@ -3958,9 +4069,9 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) ASSERT(block_group->swap_extents == 0); btrfs_put_block_group(block_group); - spin_lock(&info->block_group_cache_lock); + write_lock(&info->block_group_cache_lock); } - spin_unlock(&info->block_group_cache_lock); + write_unlock(&info->block_group_cache_lock); btrfs_release_global_block_rsv(info); @@ -3974,9 +4085,22 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) * important and indicates a real bug if this happens. */ if (WARN_ON(space_info->bytes_pinned > 0 || - space_info->bytes_reserved > 0 || space_info->bytes_may_use > 0)) btrfs_dump_space_info(info, space_info, 0, 0); + + /* + * If there was a failure to cleanup a log tree, very likely due + * to an IO failure on a writeback attempt of one or more of its + * extent buffers, we could not do proper (and cheap) unaccounting + * of their reserved space, so don't warn on bytes_reserved > 0 in + * that case. + */ + if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) || + !BTRFS_FS_LOG_CLEANUP_ERROR(info)) { + if (WARN_ON(space_info->bytes_reserved > 0)) + btrfs_dump_space_info(info, space_info, 0, 0); + } + WARN_ON(space_info->reclaim_size > 0); list_del(&space_info->list); btrfs_sysfs_remove_space_info(space_info); diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 5878b7ce3b78..35e0e860cc0b 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -35,11 +35,15 @@ enum btrfs_discard_state { * the FS with empty chunks * * CHUNK_ALLOC_FORCE means it must try to allocate one + * + * CHUNK_ALLOC_FORCE_FOR_EXTENT like CHUNK_ALLOC_FORCE but called from + * find_free_extent() that also activaes the zone */ enum btrfs_chunk_alloc_enum { CHUNK_ALLOC_NO_FORCE, CHUNK_ALLOC_LIMITED, CHUNK_ALLOC_FORCE, + CHUNK_ALLOC_FORCE_FOR_EXTENT, }; struct btrfs_caching_control { @@ -68,6 +72,7 @@ struct btrfs_block_group { u64 bytes_super; u64 flags; u64 cache_generation; + u64 global_root_id; /* * If the free space extent count exceeds this number, convert the block @@ -99,6 +104,7 @@ struct btrfs_block_group { unsigned int relocating_repair:1; unsigned int chunk_item_inserted:1; unsigned int zone_is_active:1; + unsigned int zoned_data_reloc_ongoing:1; int disk_cache_state; @@ -207,6 +213,8 @@ struct btrfs_block_group { u64 meta_write_pointer; struct map_lookup *physical_map; struct list_head active_bg_list; + struct work_struct zone_finish_work; + struct extent_buffer *last_eb; }; static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group) @@ -249,8 +257,9 @@ void btrfs_put_block_group(struct btrfs_block_group *cache); void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info, const u64 start); void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg); -bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); -void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr); +struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, + u64 bytenr); +void btrfs_dec_nocow_writers(struct btrfs_block_group *bg); void btrfs_wait_nocow_writers(struct btrfs_block_group *bg); void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache, u64 num_bytes); diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index b3e46aabc3d8..33811e896623 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -14,6 +14,13 @@ #include "delayed-inode.h" /* + * Since we search a directory based on f_pos (struct dir_context::pos) we have + * to start at 2 since '.' and '..' have f_pos of 0 and 1 respectively, so + * everybody else has to start at 2 (see btrfs_real_readdir() and dir_emit_dots()). + */ +#define BTRFS_DIR_START_INDEX 2 + +/* * ordered_data_close is set by truncate when a file that used * to have good data has been truncated to zero. When it is set * the btrfs file release call will add this inode to the @@ -173,8 +180,9 @@ struct btrfs_inode { u64 disk_i_size; /* - * if this is a directory then index_cnt is the counter for the index - * number for new files that are created + * If this is a directory then index_cnt is the counter for the index + * number for new files that are created. For an empty directory, this + * must be initialized to BTRFS_DIR_START_INDEX. */ u64 index_cnt; @@ -333,6 +341,36 @@ static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode) spin_unlock(&inode->lock); } +/* + * Should be called while holding the inode's VFS lock in exclusive mode or in a + * context where no one else can access the inode concurrently (during inode + * creation or when loading an inode from disk). + */ +static inline void btrfs_set_inode_full_sync(struct btrfs_inode *inode) +{ + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); + /* + * The inode may have been part of a reflink operation in the last + * transaction that modified it, and then a fsync has reset the + * last_reflink_trans to avoid subsequent fsyncs in the same + * transaction to do unnecessary work. So update last_reflink_trans + * to the last_trans value (we have to be pessimistic and assume a + * reflink happened). + * + * The ->last_trans is protected by the inode's spinlock and we can + * have a concurrent ordered extent completion update it. Also set + * last_reflink_trans to ->last_trans only if the former is less than + * the later, because we can be called in a context where + * last_reflink_trans was set to the current transaction generation + * while ->last_trans was not yet updated in the current transaction, + * and therefore has a lower value. + */ + spin_lock(&inode->lock); + if (inode->last_reflink_trans < inode->last_trans) + inode->last_reflink_trans = inode->last_trans; + spin_unlock(&inode->lock); +} + static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation) { bool ret = false; @@ -346,30 +384,16 @@ static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation) return ret; } -struct btrfs_dio_private { - struct inode *inode; - - /* - * Since DIO can use anonymous page, we cannot use page_offset() to - * grab the file offset, thus need a dedicated member for file offset. - */ - u64 file_offset; - u64 disk_bytenr; - /* Used for bio::bi_size */ - u32 bytes; - - /* - * References to this structure. There is one reference per in-flight - * bio plus one while we're still setting up. - */ - refcount_t refs; - - /* dio_bio came from fs/direct-io.c */ - struct bio *dio_bio; - - /* Array of checksums */ - u8 csums[]; -}; +/* + * Check if the inode has flags compatible with compression + */ +static inline bool btrfs_inode_can_compress(const struct btrfs_inode *inode) +{ + if (inode->flags & BTRFS_INODE_NODATACOW || + inode->flags & BTRFS_INODE_NODATASUM) + return false; + return true; +} /* * btrfs_inode_item stores flags in a u64, btrfs_inode stores them in two diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 7e9f90fa0388..5d20137b7b67 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -78,7 +78,6 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/mutex.h> -#include <linux/genhd.h> #include <linux/blkdev.h> #include <linux/mm.h> #include <linux/string.h> @@ -1553,21 +1552,18 @@ static int btrfsic_read_block(struct btrfsic_state *state, return -ENOMEM; block_ctx->datav = block_ctx->mem_to_free; block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages); - for (i = 0; i < num_pages; i++) { - block_ctx->pagev[i] = alloc_page(GFP_NOFS); - if (!block_ctx->pagev[i]) - return -1; - } + ret = btrfs_alloc_page_array(num_pages, block_ctx->pagev); + if (ret) + return ret; dev_bytenr = block_ctx->dev_bytenr; for (i = 0; i < num_pages;) { struct bio *bio; unsigned int j; - bio = btrfs_bio_alloc(num_pages - i); - bio_set_dev(bio, block_ctx->dev->bdev); + bio = bio_alloc(block_ctx->dev->bdev, num_pages - i, + REQ_OP_READ, GFP_NOFS); bio->bi_iter.bi_sector = dev_bytenr >> 9; - bio->bi_opf = REQ_OP_READ; for (j = i; j < num_pages; j++) { ret = bio_add_page(bio, block_ctx->pagev[j], @@ -2034,7 +2030,7 @@ continue_loop: static void btrfsic_bio_end_io(struct bio *bp) { - struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private; + struct btrfsic_block *block = bp->bi_private; int iodone_w_error; /* mutex is not held! This is not save if IO is not yet completed @@ -2636,100 +2632,93 @@ static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev) &btrfsic_dev_state_hashtable); } -static void __btrfsic_submit_bio(struct bio *bio) +static void btrfsic_check_write_bio(struct bio *bio, struct btrfsic_dev_state *dev_state) { - struct btrfsic_dev_state *dev_state; + unsigned int segs = bio_segments(bio); + u64 dev_bytenr = 512 * bio->bi_iter.bi_sector; + u64 cur_bytenr = dev_bytenr; + struct bvec_iter iter; + struct bio_vec bvec; + char **mapped_datav; + int bio_is_patched = 0; + int i = 0; + + if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) + pr_info( +"submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", + bio_op(bio), bio->bi_opf, segs, + bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev); - if (!btrfsic_is_initialized) + mapped_datav = kmalloc_array(segs, sizeof(*mapped_datav), GFP_NOFS); + if (!mapped_datav) return; - mutex_lock(&btrfsic_mutex); - /* since btrfsic_submit_bio() is also called before - * btrfsic_mount(), this might return NULL */ - dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev); - if (NULL != dev_state && - (bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) { - int i = 0; - u64 dev_bytenr; - u64 cur_bytenr; - struct bio_vec bvec; - struct bvec_iter iter; - int bio_is_patched; - char **mapped_datav; - unsigned int segs = bio_segments(bio); - - dev_bytenr = 512 * bio->bi_iter.bi_sector; - bio_is_patched = 0; - if (dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n", - bio_op(bio), bio->bi_opf, segs, - bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev); - - mapped_datav = kmalloc_array(segs, - sizeof(*mapped_datav), GFP_NOFS); - if (!mapped_datav) - goto leave; - cur_bytenr = dev_bytenr; - - bio_for_each_segment(bvec, bio, iter) { - BUG_ON(bvec.bv_len != PAGE_SIZE); - mapped_datav[i] = page_address(bvec.bv_page); - i++; - - if (dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE) - pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n", - i, cur_bytenr, bvec.bv_len, bvec.bv_offset); - cur_bytenr += bvec.bv_len; - } - btrfsic_process_written_block(dev_state, dev_bytenr, - mapped_datav, segs, - bio, &bio_is_patched, - bio->bi_opf); - kfree(mapped_datav); - } else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) { - if (dev_state->state->print_mask & - BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) - pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n", - bio_op(bio), bio->bi_opf, bio->bi_bdev); - if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) { - if ((dev_state->state->print_mask & - (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | - BTRFSIC_PRINT_MASK_VERBOSE))) - pr_info( -"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n", - dev_state->bdev); - } else { - struct btrfsic_block *const block = - &dev_state->dummy_block_for_bio_bh_flush; + bio_for_each_segment(bvec, bio, iter) { + BUG_ON(bvec.bv_len != PAGE_SIZE); + mapped_datav[i] = page_address(bvec.bv_page); + i++; - block->is_iodone = 0; - block->never_written = 0; - block->iodone_w_error = 0; - block->flush_gen = dev_state->last_flush_gen + 1; - block->submit_bio_bh_rw = bio->bi_opf; - block->orig_bio_private = bio->bi_private; - block->orig_bio_end_io = bio->bi_end_io; - block->next_in_same_bio = NULL; - bio->bi_private = block; - bio->bi_end_io = btrfsic_bio_end_io; - } + if (dev_state->state->print_mask & + BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE) + pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n", + i, cur_bytenr, bvec.bv_len, bvec.bv_offset); + cur_bytenr += bvec.bv_len; } -leave: - mutex_unlock(&btrfsic_mutex); + + btrfsic_process_written_block(dev_state, dev_bytenr, mapped_datav, segs, + bio, &bio_is_patched, bio->bi_opf); + kfree(mapped_datav); } -void btrfsic_submit_bio(struct bio *bio) +static void btrfsic_check_flush_bio(struct bio *bio, struct btrfsic_dev_state *dev_state) { - __btrfsic_submit_bio(bio); - submit_bio(bio); + if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH) + pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n", + bio_op(bio), bio->bi_opf, bio->bi_bdev); + + if (dev_state->dummy_block_for_bio_bh_flush.is_iodone) { + struct btrfsic_block *const block = + &dev_state->dummy_block_for_bio_bh_flush; + + block->is_iodone = 0; + block->never_written = 0; + block->iodone_w_error = 0; + block->flush_gen = dev_state->last_flush_gen + 1; + block->submit_bio_bh_rw = bio->bi_opf; + block->orig_bio_private = bio->bi_private; + block->orig_bio_end_io = bio->bi_end_io; + block->next_in_same_bio = NULL; + bio->bi_private = block; + bio->bi_end_io = btrfsic_bio_end_io; + } else if ((dev_state->state->print_mask & + (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH | + BTRFSIC_PRINT_MASK_VERBOSE))) { + pr_info( +"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n", + dev_state->bdev); + } } -int btrfsic_submit_bio_wait(struct bio *bio) +void btrfsic_check_bio(struct bio *bio) { - __btrfsic_submit_bio(bio); - return submit_bio_wait(bio); + struct btrfsic_dev_state *dev_state; + + if (!btrfsic_is_initialized) + return; + + /* + * We can be called before btrfsic_mount, so there might not be a + * dev_state. + */ + dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev); + mutex_lock(&btrfsic_mutex); + if (dev_state) { + if (bio_op(bio) == REQ_OP_WRITE && bio_has_data(bio)) + btrfsic_check_write_bio(bio, dev_state); + else if (bio->bi_opf & REQ_PREFLUSH) + btrfsic_check_flush_bio(bio, dev_state); + } + mutex_unlock(&btrfsic_mutex); } int btrfsic_mount(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h index bcc730a06cb5..e4c8aed7996f 100644 --- a/fs/btrfs/check-integrity.h +++ b/fs/btrfs/check-integrity.h @@ -7,11 +7,9 @@ #define BTRFS_CHECK_INTEGRITY_H #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY -void btrfsic_submit_bio(struct bio *bio); -int btrfsic_submit_bio_wait(struct bio *bio); +void btrfsic_check_bio(struct bio *bio); #else -#define btrfsic_submit_bio submit_bio -#define btrfsic_submit_bio_wait submit_bio_wait +static inline void btrfsic_check_bio(struct bio *bio) { } #endif int btrfsic_mount(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 71e5b2e9a1ba..f4564f32f6d9 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -219,7 +219,7 @@ static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *b bi_size += bvec->bv_len; if (bio->bi_status) - cb->errors = 1; + cb->status = bio->bi_status; ASSERT(bi_size && bi_size <= cb->compressed_len); last_io = refcount_sub_and_test(bi_size >> fs_info->sectorsize_bits, @@ -234,7 +234,7 @@ static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *b return last_io; } -static void finish_compressed_bio_read(struct compressed_bio *cb, struct bio *bio) +static void finish_compressed_bio_read(struct compressed_bio *cb) { unsigned int index; struct page *page; @@ -247,19 +247,18 @@ static void finish_compressed_bio_read(struct compressed_bio *cb, struct bio *bi } /* Do io completion on the original bio */ - if (cb->errors) { - bio_io_error(cb->orig_bio); + if (cb->status != BLK_STS_OK) { + cb->orig_bio->bi_status = cb->status; + bio_endio(cb->orig_bio); } else { struct bio_vec *bvec; struct bvec_iter_all iter_all; - ASSERT(bio); - ASSERT(!bio->bi_status); /* * We have verified the checksum already, set page checked so * the end_io handlers know about it */ - ASSERT(!bio_flagged(bio, BIO_CLONED)); + ASSERT(!bio_flagged(cb->orig_bio, BIO_CLONED)); bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) { u64 bvec_start = page_offset(bvec->bv_page) + bvec->bv_offset; @@ -308,7 +307,7 @@ static void end_compressed_bio_read(struct bio *bio) * Some IO in this cb have failed, just skip checksum as there * is no way it could be correct. */ - if (cb->errors == 1) + if (cb->status != BLK_STS_OK) goto csum_failed; inode = cb->inode; @@ -324,8 +323,8 @@ static void end_compressed_bio_read(struct bio *bio) csum_failed: if (ret) - cb->errors = 1; - finish_compressed_bio_read(cb, bio); + cb->status = errno_to_blk_status(ret); + finish_compressed_bio_read(cb); out: bio_put(bio); } @@ -342,11 +341,12 @@ static noinline void end_compressed_writeback(struct inode *inode, unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; struct page *pages[16]; unsigned long nr_pages = end_index - index + 1; + const int errno = blk_status_to_errno(cb->status); int i; int ret; - if (cb->errors) - mapping_set_error(inode->i_mapping, -EIO); + if (errno) + mapping_set_error(inode->i_mapping, errno); while (nr_pages > 0) { ret = find_get_pages_contig(inode->i_mapping, index, @@ -358,7 +358,7 @@ static noinline void end_compressed_writeback(struct inode *inode, continue; } for (i = 0; i < ret; i++) { - if (cb->errors) + if (errno) SetPageError(pages[i]); btrfs_page_clamp_clear_writeback(fs_info, pages[i], cb->start, cb->len); @@ -381,9 +381,10 @@ static void finish_compressed_bio_write(struct compressed_bio *cb) */ btrfs_writepage_endio_finish_ordered(BTRFS_I(inode), NULL, cb->start, cb->start + cb->len - 1, - !cb->errors); + cb->status == BLK_STS_OK); - end_compressed_writeback(inode, cb); + if (cb->writeback) + end_compressed_writeback(inode, cb); /* Note, our inode could be gone now */ /* @@ -424,7 +425,6 @@ out: } static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info, - struct compressed_bio *cb, struct bio *bio, int mirror_num) { blk_status_t ret; @@ -506,7 +506,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, struct page **compressed_pages, unsigned int nr_pages, unsigned int write_flags, - struct cgroup_subsys_state *blkcg_css) + struct cgroup_subsys_state *blkcg_css, + bool writeback) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *bio = NULL; @@ -524,16 +525,20 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, if (!cb) return BLK_STS_RESOURCE; refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits); - cb->errors = 0; + cb->status = BLK_STS_OK; cb->inode = &inode->vfs_inode; cb->start = start; cb->len = len; cb->mirror_num = 0; cb->compressed_pages = compressed_pages; cb->compressed_len = compressed_len; + cb->writeback = writeback; cb->orig_bio = NULL; cb->nr_pages = nr_pages; + if (blkcg_css) + kthread_associate_blkcg(blkcg_css); + while (cur_disk_bytenr < disk_start + compressed_len) { u64 offset = cur_disk_bytenr - disk_start; unsigned int index = offset >> PAGE_SHIFT; @@ -552,6 +557,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, bio = NULL; goto finish_cb; } + if (blkcg_css) + bio->bi_opf |= REQ_CGROUP_PUNT; } /* * We should never reach next_stripe_start start as we will @@ -591,12 +598,12 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, if (submit) { if (!skip_sum) { - ret = btrfs_csum_one_bio(inode, bio, start, 1); + ret = btrfs_csum_one_bio(inode, bio, start, true); if (ret) goto finish_cb; } - ret = submit_compressed_bio(fs_info, cb, bio, 0); + ret = submit_compressed_bio(fs_info, bio, 0); if (ret) goto finish_cb; bio = NULL; @@ -609,6 +616,9 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, return 0; finish_cb: + if (blkcg_css) + kthread_associate_blkcg(NULL); + if (bio) { bio->bi_status = ret; bio_endio(bio); @@ -791,15 +801,13 @@ static noinline int add_ra_bio_pages(struct inode *inode, * After the compressed pages are read, we copy the bytes into the * bio we were passed and then call the bio end_io calls */ -blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags) +void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + int mirror_num) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map_tree *em_tree; struct compressed_bio *cb; unsigned int compressed_len; - unsigned int nr_pages; - unsigned int pg_index; struct bio *comp_bio = NULL; const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; u64 cur_disk_byte = disk_bytenr; @@ -808,8 +816,9 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, u64 em_len; u64 em_start; struct extent_map *em; - blk_status_t ret = BLK_STS_RESOURCE; - int faili = 0; + blk_status_t ret; + int ret2; + int i; u8 *sums; em_tree = &BTRFS_I(inode)->extent_tree; @@ -821,17 +830,21 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize); read_unlock(&em_tree->lock); - if (!em) - return BLK_STS_IOERR; + if (!em) { + ret = BLK_STS_IOERR; + goto out; + } ASSERT(em->compress_type != BTRFS_COMPRESS_NONE); compressed_len = em->block_len; cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); - if (!cb) + if (!cb) { + ret = BLK_STS_RESOURCE; goto out; + } refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits); - cb->errors = 0; + cb->status = BLK_STS_OK; cb->inode = inode; cb->mirror_num = mirror_num; sums = cb->sums; @@ -840,30 +853,26 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, em_len = em->len; em_start = em->start; - free_extent_map(em); - em = NULL; - cb->len = bio->bi_iter.bi_size; cb->compressed_len = compressed_len; - cb->compress_type = extent_compress_type(bio_flags); + cb->compress_type = em->compress_type; cb->orig_bio = bio; - nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE); - cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *), - GFP_NOFS); - if (!cb->compressed_pages) - goto fail1; - - for (pg_index = 0; pg_index < nr_pages; pg_index++) { - cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS); - if (!cb->compressed_pages[pg_index]) { - faili = pg_index - 1; - ret = BLK_STS_RESOURCE; - goto fail2; - } + free_extent_map(em); + em = NULL; + + cb->nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE); + cb->compressed_pages = kcalloc(cb->nr_pages, sizeof(struct page *), GFP_NOFS); + if (!cb->compressed_pages) { + ret = BLK_STS_RESOURCE; + goto fail; + } + + ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages); + if (ret2) { + ret = BLK_STS_RESOURCE; + goto fail; } - faili = nr_pages - 1; - cb->nr_pages = nr_pages; add_ra_bio_pages(inode, em_start + em_len, cb); @@ -932,26 +941,29 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, fs_info->sectorsize); sums += fs_info->csum_size * nr_sectors; - ret = submit_compressed_bio(fs_info, cb, comp_bio, mirror_num); + ret = submit_compressed_bio(fs_info, comp_bio, mirror_num); if (ret) goto finish_cb; comp_bio = NULL; } } - return 0; + return; -fail2: - while (faili >= 0) { - __free_page(cb->compressed_pages[faili]); - faili--; +fail: + if (cb->compressed_pages) { + for (i = 0; i < cb->nr_pages; i++) { + if (cb->compressed_pages[i]) + __free_page(cb->compressed_pages[i]); + } } kfree(cb->compressed_pages); -fail1: kfree(cb); out: free_extent_map(em); - return ret; + bio->bi_status = ret; + bio_endio(bio); + return; finish_cb: if (comp_bio) { comp_bio->bi_status = ret; @@ -959,7 +971,7 @@ finish_cb: } /* All bytes of @cb is submitted, endio will free @cb */ if (cur_disk_byte == disk_bytenr + compressed_len) - return ret; + return; wait_var_event(cb, refcount_read(&cb->pending_sectors) == (disk_bytenr + compressed_len - cur_disk_byte) >> @@ -970,8 +982,7 @@ finish_cb: */ ASSERT(refcount_read(&cb->pending_sectors)); /* Now we are the only one referring @cb, can finish it safely. */ - finish_compressed_bio_read(cb, NULL); - return ret; + finish_compressed_bio_read(cb); } /* diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 56eef0821e3e..2707404389a5 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -22,6 +22,8 @@ struct btrfs_inode; /* Maximum length of compressed data stored on disk */ #define BTRFS_MAX_COMPRESSED (SZ_128K) +static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); + /* Maximum size of data before compression */ #define BTRFS_MAX_UNCOMPRESSED (SZ_128K) @@ -52,8 +54,11 @@ struct compressed_bio { /* The compression algorithm for this bio */ u8 compress_type; + /* Whether this is a write for writeback. */ + bool writeback; + /* IO errors */ - u8 errors; + blk_status_t status; int mirror_num; /* for reads, this is the bio we are copying the data into */ @@ -95,9 +100,10 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, struct page **compressed_pages, unsigned int nr_pages, unsigned int write_flags, - struct cgroup_subsys_state *blkcg_css); -blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags); + struct cgroup_subsys_state *blkcg_css, + bool writeback); +void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + int mirror_num); unsigned int btrfs_compress_str2level(unsigned int type, const char *str); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a7db3f6f1b7b..6e556031a8f3 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -16,6 +16,7 @@ #include "volumes.h" #include "qgroup.h" #include "tree-mod-log.h" +#include "tree-checker.h" static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level); @@ -342,7 +343,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, int level = btrfs_header_level(buf); ret = btrfs_set_disk_extent_flags(trans, buf, - new_flags, level, 0); + new_flags, level); if (ret) return ret; } @@ -846,9 +847,11 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, btrfs_header_owner(parent), btrfs_node_ptr_generation(parent, slot), level - 1, &first_key); - if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) + return eb; + if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); - eb = ERR_PTR(-EIO); + return ERR_PTR(-EIO); } return eb; @@ -1388,12 +1391,13 @@ static noinline void unlock_up(struct btrfs_path *path, int level, } /* - * helper function for btrfs_search_slot. The goal is to find a block - * in cache without setting the path to blocking. If we find the block - * we return zero and the path is unchanged. + * Helper function for btrfs_search_slot() and other functions that do a search + * on a btree. The goal is to find a tree block in the cache (the radix tree at + * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read + * its pages from disk. * - * If we can't find the block, we set the path blocking and do some - * reada. -EAGAIN is returned and the search must be repeated. + * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the + * whole btree search, starting again from the current root node. */ static int read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, @@ -1407,12 +1411,21 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, struct btrfs_key first_key; int ret; int parent_level; + bool unlock_up; + unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]); blocknr = btrfs_node_blockptr(*eb_ret, slot); gen = btrfs_node_ptr_generation(*eb_ret, slot); parent_level = btrfs_header_level(*eb_ret); btrfs_node_key_to_cpu(*eb_ret, &first_key, slot); + /* + * If we need to read an extent buffer from disk and we are holding locks + * on upper level nodes, we unlock all the upper nodes before reading the + * extent buffer, and then return -EAGAIN to the caller as it needs to + * restart the search. We don't release the lock on the current level + * because we need to walk this node to figure out which blocks to read. + */ tmp = find_extent_buffer(fs_info, blocknr); if (tmp) { if (p->reada == READA_FORWARD_ALWAYS) @@ -1434,47 +1447,61 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, return 0; } + if (unlock_up) + btrfs_unlock_up_safe(p, level + 1); + /* now we're allowed to do a blocking uptodate check */ - ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key); - if (!ret) { - *eb_ret = tmp; - return 0; + ret = btrfs_read_extent_buffer(tmp, gen, parent_level - 1, &first_key); + if (ret) { + free_extent_buffer(tmp); + btrfs_release_path(p); + return -EIO; } - free_extent_buffer(tmp); - btrfs_release_path(p); - return -EIO; + if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) { + free_extent_buffer(tmp); + btrfs_release_path(p); + return -EUCLEAN; + } + + if (unlock_up) + ret = -EAGAIN; + + goto out; } - /* - * reduce lock contention at high levels - * of the btree by dropping locks before - * we read. Don't release the lock on the current - * level because we need to walk this node to figure - * out which blocks to read. - */ - btrfs_unlock_up_safe(p, level + 1); + if (unlock_up) { + btrfs_unlock_up_safe(p, level + 1); + ret = -EAGAIN; + } else { + ret = 0; + } if (p->reada != READA_NONE) reada_for_search(fs_info, p, level, slot, key->objectid); - ret = -EAGAIN; tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid, gen, parent_level - 1, &first_key); - if (!IS_ERR(tmp)) { - /* - * If the read above didn't mark this buffer up to date, - * it will never end up being up to date. Set ret to EIO now - * and give up so that our caller doesn't loop forever - * on our EAGAINs. - */ - if (!extent_buffer_uptodate(tmp)) - ret = -EIO; - free_extent_buffer(tmp); + if (IS_ERR(tmp)) { + btrfs_release_path(p); + return PTR_ERR(tmp); + } + /* + * If the read above didn't mark this buffer up to date, + * it will never end up being up to date. Set ret to EIO now + * and give up so that our caller doesn't loop forever + * on our EAGAINs. + */ + if (!extent_buffer_uptodate(tmp)) + ret = -EIO; + +out: + if (ret == 0) { + *eb_ret = tmp; } else { - ret = PTR_ERR(tmp); + free_extent_buffer(tmp); + btrfs_release_path(p); } - btrfs_release_path(p); return ret; } @@ -2277,6 +2304,43 @@ int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key, return ret; } +/** + * Search for a valid slot for the given path. + * + * @root: The root node of the tree. + * @key: Will contain a valid item if found. + * @path: The starting point to validate the slot. + * + * Return: 0 if the item is valid + * 1 if not found + * <0 if error. + */ +int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, + struct btrfs_path *path) +{ + while (1) { + int ret; + const int slot = path->slots[0]; + const struct extent_buffer *leaf = path->nodes[0]; + + /* This is where we start walking the path. */ + if (slot >= btrfs_header_nritems(leaf)) { + /* + * If we've reached the last slot in this leaf we need + * to go to the next leaf and reset the path. + */ + ret = btrfs_next_leaf(root, path); + if (ret) + return ret; + continue; + } + /* Store the found, valid item in @key. */ + btrfs_item_key_to_cpu(leaf, key, slot); + break; + } + return 0; +} + /* * adjust the pointers going up the tree, starting at level * making sure the right key of each node is points to 'key'. @@ -2990,16 +3054,11 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (free_space < data_size) goto out_unlock; - /* cow and double check */ ret = btrfs_cow_block(trans, root, right, upper, slot + 1, &right, BTRFS_NESTING_RIGHT_COW); if (ret) goto out_unlock; - free_space = btrfs_leaf_free_space(right); - if (free_space < data_size) - goto out_unlock; - left_nritems = btrfs_header_nritems(left); if (left_nritems == 0) goto out_unlock; @@ -3224,7 +3283,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root goto out; } - /* cow and double check */ ret = btrfs_cow_block(trans, root, left, path->nodes[1], slot - 1, &left, BTRFS_NESTING_LEFT_COW); @@ -3235,12 +3293,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root goto out; } - free_space = btrfs_leaf_free_space(left); - if (free_space < data_size) { - ret = 1; - goto out; - } - if (check_sibling_keys(left, right)) { ret = -EUCLEAN; goto out; @@ -4170,24 +4222,22 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; - u32 last_off; - u32 dsize = 0; int ret = 0; int wret; - int i; u32 nritems; leaf = path->nodes[0]; - last_off = btrfs_item_offset(leaf, slot + nr - 1); - - for (i = 0; i < nr; i++) - dsize += btrfs_item_size(leaf, slot + i); - nritems = btrfs_header_nritems(leaf); if (slot + nr != nritems) { - int data_end = leaf_data_end(leaf); + const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1); + const int data_end = leaf_data_end(leaf); struct btrfs_map_token token; + u32 dsize = 0; + int i; + + for (i = 0; i < nr; i++) + dsize += btrfs_item_size(leaf, slot + i); memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET + data_end + dsize, @@ -4227,24 +4277,50 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, fixup_low_keys(path, &disk_key, 1); } - /* delete the leaf if it is mostly empty */ + /* + * Try to delete the leaf if it is mostly empty. We do this by + * trying to move all its items into its left and right neighbours. + * If we can't move all the items, then we don't delete it - it's + * not ideal, but future insertions might fill the leaf with more + * items, or items from other leaves might be moved later into our + * leaf due to deletions on those leaves. + */ if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) { + u32 min_push_space; + /* push_leaf_left fixes the path. * make sure the path still points to our leaf * for possible call to del_ptr below */ slot = path->slots[1]; atomic_inc(&leaf->refs); - - wret = push_leaf_left(trans, root, path, 1, 1, - 1, (u32)-1); + /* + * We want to be able to at least push one item to the + * left neighbour leaf, and that's the first item. + */ + min_push_space = sizeof(struct btrfs_item) + + btrfs_item_size(leaf, 0); + wret = push_leaf_left(trans, root, path, 0, + min_push_space, 1, (u32)-1); if (wret < 0 && wret != -ENOSPC) ret = wret; if (path->nodes[0] == leaf && btrfs_header_nritems(leaf)) { - wret = push_leaf_right(trans, root, path, 1, - 1, 1, 0); + /* + * If we were not able to push all items from our + * leaf to its left neighbour, then attempt to + * either push all the remaining items to the + * right neighbour or none. There's no advantage + * in pushing only some items, instead of all, as + * it's pointless to end up with a leaf having + * too few items while the neighbours can be full + * or nearly full. + */ + nritems = btrfs_header_nritems(leaf); + min_push_space = leaf_space_used(leaf, 0, nritems); + wret = push_leaf_right(trans, root, path, 0, + min_push_space, 1, 0); if (wret < 0 && wret != -ENOSPC) ret = wret; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b4a9b1c58d22..415bf1823fb3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -49,6 +49,7 @@ extern struct kmem_cache *btrfs_free_space_bitmap_cachep; struct btrfs_ordered_sum; struct btrfs_ref; struct btrfs_bio; +struct btrfs_ioctl_encoded_io_args; #define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */ @@ -145,6 +146,11 @@ enum { BTRFS_FS_STATE_DUMMY_FS_INFO, BTRFS_FS_STATE_NO_CSUMS, + + /* Indicates there was an error cleaning up a log tree. */ + BTRFS_FS_STATE_LOG_CLEANUP_ERROR, + + BTRFS_FS_STATE_COUNT }; #define BTRFS_BACKREF_REV_MAX 256 @@ -271,8 +277,14 @@ struct btrfs_super_block { /* the UUID written into btree blocks */ u8 metadata_uuid[BTRFS_FSID_SIZE]; + /* Extent tree v2 */ + __le64 block_group_root; + __le64 block_group_root_generation; + u8 block_group_root_level; + /* future expansion */ - __le64 reserved[28]; + u8 reserved8[7]; + __le64 reserved[25]; u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS]; @@ -297,6 +309,26 @@ static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL +#ifdef CONFIG_BTRFS_DEBUG +/* + * Extent tree v2 supported only with CONFIG_BTRFS_DEBUG + */ +#define BTRFS_FEATURE_INCOMPAT_SUPP \ + (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ + BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ + BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \ + BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \ + BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \ + BTRFS_FEATURE_INCOMPAT_RAID56 | \ + BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \ + BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \ + BTRFS_FEATURE_INCOMPAT_NO_HOLES | \ + BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ + BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ + BTRFS_FEATURE_INCOMPAT_ZONED | \ + BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2) +#else #define BTRFS_FEATURE_INCOMPAT_SUPP \ (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \ @@ -311,6 +343,7 @@ static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE); BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \ BTRFS_FEATURE_INCOMPAT_RAID1C34 | \ BTRFS_FEATURE_INCOMPAT_ZONED) +#endif #define BTRFS_FEATURE_INCOMPAT_SAFE_SET \ (BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF) @@ -599,6 +632,9 @@ enum { /* Indicate that we want the transaction kthread to commit right now. */ BTRFS_FS_COMMIT_TRANS, + /* Indicate we have half completed snapshot deletions pending. */ + BTRFS_FS_UNFINISHED_DROPS, + #if BITS_PER_LONG == 32 /* Indicate if we have error/warn message printed on 32bit systems */ BTRFS_FS_32BIT_ERROR, @@ -630,6 +666,7 @@ struct btrfs_fs_info { struct btrfs_root *quota_root; struct btrfs_root *uuid_root; struct btrfs_root *data_reloc_root; + struct btrfs_root *block_group_root; /* the log root tree is a directory of all the other log roots */ struct btrfs_root *log_root_tree; @@ -638,13 +675,13 @@ struct btrfs_fs_info { rwlock_t global_root_lock; struct rb_root global_root_tree; - spinlock_t fs_roots_radix_lock; - struct radix_tree_root fs_roots_radix; + /* The xarray that holds all the FS roots */ + spinlock_t fs_roots_lock; + struct xarray fs_roots; /* block group cache stuff */ - spinlock_t block_group_cache_lock; - u64 first_logical_byte; - struct rb_root block_group_cache_tree; + rwlock_t block_group_cache_lock; + struct rb_root_cached block_group_cache_tree; /* keep track of unallocated space */ atomic64_t free_chunk_space; @@ -811,12 +848,13 @@ struct btrfs_fs_info { * two */ struct btrfs_workqueue *workers; + struct btrfs_workqueue *hipri_workers; struct btrfs_workqueue *delalloc_workers; struct btrfs_workqueue *flush_workers; struct btrfs_workqueue *endio_workers; struct btrfs_workqueue *endio_meta_workers; struct btrfs_workqueue *endio_raid56_workers; - struct btrfs_workqueue *rmw_workers; + struct workqueue_struct *rmw_workers; struct btrfs_workqueue *endio_meta_write_workers; struct btrfs_workqueue *endio_write_workers; struct btrfs_workqueue *endio_freespace_worker; @@ -909,9 +947,9 @@ struct btrfs_fs_info { * running. */ refcount_t scrub_workers_refcnt; - struct btrfs_workqueue *scrub_workers; - struct btrfs_workqueue *scrub_wr_completion_workers; - struct btrfs_workqueue *scrub_parity_workers; + struct workqueue_struct *scrub_workers; + struct workqueue_struct *scrub_wr_completion_workers; + struct workqueue_struct *scrub_parity_workers; struct btrfs_subpage_info *subpage_info; struct btrfs_discard_ctl discard_ctl; @@ -957,10 +995,10 @@ struct btrfs_fs_info { struct btrfs_delayed_root *delayed_root; - /* Extent buffer radix tree */ + /* Extent buffer xarray */ spinlock_t buffer_lock; /* Entries are eb->start / sectorsize */ - struct radix_tree_root buffer_radix; + struct xarray extent_buffers; /* next backup root to be overwritten */ int backup_root_index; @@ -1008,10 +1046,7 @@ struct btrfs_fs_info { * Zone size > 0 when in ZONED mode, otherwise it's used for a check * if the mode is enabled */ - union { - u64 zone_size; - u64 zoned; - }; + u64 zone_size; struct mutex zoned_meta_io_lock; spinlock_t treelog_bg_lock; @@ -1023,6 +1058,9 @@ struct btrfs_fs_info { */ spinlock_t relocation_bg_lock; u64 data_reloc_bg; + struct mutex zoned_data_reloc_io_lock; + + u64 nr_global_roots; spinlock_t zone_active_bgs_lock; struct list_head zone_active_bgs; @@ -1081,7 +1119,8 @@ enum { */ BTRFS_ROOT_SHAREABLE, BTRFS_ROOT_TRACK_DIRTY, - BTRFS_ROOT_IN_RADIX, + /* The root is tracked in fs_info::fs_roots */ + BTRFS_ROOT_REGISTERED, BTRFS_ROOT_ORPHAN_ITEM_INSERTED, BTRFS_ROOT_DEFRAG_RUNNING, BTRFS_ROOT_FORCE_COW, @@ -1103,8 +1142,15 @@ enum { BTRFS_ROOT_QGROUP_FLUSHING, /* We started the orphan cleanup for this root. */ BTRFS_ROOT_ORPHAN_CLEANUP, + /* This root has a drop operation that was started previously. */ + BTRFS_ROOT_UNFINISHED_DROP, }; +static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info) +{ + clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags); +} + /* * Record swapped tree blocks of a subvolume tree for delayed subtree trace * code. For detail check comment in fs/btrfs/qgroup.c. @@ -1178,10 +1224,10 @@ struct btrfs_root { struct rb_root inode_tree; /* - * radix tree that keeps track of delayed nodes of every inode, - * protected by inode_lock + * Xarray that keeps track of delayed nodes of every inode, protected + * by inode_lock */ - struct radix_tree_root delayed_nodes_tree; + struct xarray delayed_nodes; /* * right now this just gets used so that a root has its own devid * for stat. It may be used for more later @@ -1284,6 +1330,8 @@ struct btrfs_replace_extent_info { * existing extent into a file range. */ bool is_new_extent; + /* Indicate if we should update the inode's mtime and ctime. */ + bool update_times; /* Meaningful only if is_new_extent is true. */ int qgroup_reserved; /* @@ -1596,25 +1644,25 @@ DECLARE_BTRFS_SETGET_BITS(64) static inline u##bits btrfs_##name(const struct extent_buffer *eb, \ const type *s) \ { \ - BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ return btrfs_get_##bits(eb, s, offsetof(type, member)); \ } \ static inline void btrfs_set_##name(const struct extent_buffer *eb, type *s, \ u##bits val) \ { \ - BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ btrfs_set_##bits(eb, s, offsetof(type, member), val); \ } \ static inline u##bits btrfs_token_##name(struct btrfs_map_token *token, \ const type *s) \ { \ - BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ return btrfs_get_token_##bits(token, s, offsetof(type, member));\ } \ static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ type *s, u##bits val) \ { \ - BUILD_BUG_ON(sizeof(u##bits) != sizeof(((type *)0))->member); \ + static_assert(sizeof(u##bits) == sizeof(((type *)0))->member); \ btrfs_set_token_##bits(token, s, offsetof(type, member), val); \ } @@ -1645,8 +1693,8 @@ static inline void btrfs_set_##name(type *s, u##bits val) \ static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb, struct btrfs_dev_item *s) { - BUILD_BUG_ON(sizeof(u64) != - sizeof(((struct btrfs_dev_item *)0))->total_bytes); + static_assert(sizeof(u64) == + sizeof(((struct btrfs_dev_item *)0))->total_bytes); return btrfs_get_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes)); } @@ -1654,8 +1702,8 @@ static inline void btrfs_set_device_total_bytes(const struct extent_buffer *eb, struct btrfs_dev_item *s, u64 val) { - BUILD_BUG_ON(sizeof(u64) != - sizeof(((struct btrfs_dev_item *)0))->total_bytes); + static_assert(sizeof(u64) == + sizeof(((struct btrfs_dev_item *)0))->total_bytes); WARN_ON(!IS_ALIGNED(val, eb->fs_info->sectorsize)); btrfs_set_64(eb, s, offsetof(struct btrfs_dev_item, total_bytes), val); } @@ -2315,6 +2363,17 @@ BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup, BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup, num_devices, 64); +/* + * For extent tree v2 we overload the extent root with the block group root, as + * we will have multiple extent roots. + */ +BTRFS_SETGET_STACK_FUNCS(backup_block_group_root, struct btrfs_root_backup, + extent_root, 64); +BTRFS_SETGET_STACK_FUNCS(backup_block_group_root_gen, struct btrfs_root_backup, + extent_root_gen, 64); +BTRFS_SETGET_STACK_FUNCS(backup_block_group_root_level, + struct btrfs_root_backup, extent_root_level, 8); + /* struct btrfs_balance_item */ BTRFS_SETGET_FUNCS(balance_flags, struct btrfs_balance_item, flags, 64); @@ -2449,6 +2508,13 @@ BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block, BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64); BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block, uuid_tree_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_block_group_root, struct btrfs_super_block, + block_group_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_block_group_root_generation, + struct btrfs_super_block, + block_group_root_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_block_group_root_level, struct btrfs_super_block, + block_group_root_level, 8); int btrfs_super_csum_size(const struct btrfs_super_block *s); const char *btrfs_super_csum_name(u16 csum_type); @@ -2719,7 +2785,8 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes); int btrfs_exclude_logged_extents(struct extent_buffer *eb); int btrfs_cross_ref_exist(struct btrfs_root *root, - u64 objectid, u64 offset, u64 bytenr, bool strict); + u64 objectid, u64 offset, u64 bytenr, bool strict, + struct btrfs_path *path); struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, @@ -2746,8 +2813,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, int full_backref); int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, - struct extent_buffer *eb, u64 flags, - int level, int is_data); + struct extent_buffer *eb, u64 flags, int level); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, @@ -2826,7 +2892,8 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv); void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); -int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes); +int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, + u64 disk_num_bytes, bool noflush); u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); @@ -2973,6 +3040,35 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key, struct btrfs_path *path); +int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, + struct btrfs_path *path); + +/* + * Search in @root for a given @key, and store the slot found in @found_key. + * + * @root: The root node of the tree. + * @key: The key we are looking for. + * @found_key: Will hold the found item. + * @path: Holds the current slot/leaf. + * @iter_ret: Contains the value returned from btrfs_search_slot or + * btrfs_get_next_valid_item, whichever was executed last. + * + * The @iter_ret is an output variable that will contain the return value of + * btrfs_search_slot, if it encountered an error, or the value returned from + * btrfs_get_next_valid_item otherwise. That return value can be 0, if a valid + * slot was found, 1 if there were no more leaves, and <0 if there was an error. + * + * It's recommended to use a separate variable for iter_ret and then use it to + * set the function return value so there's no confusion of the 0/1/errno + * values stemming from btrfs_search_slot. + */ +#define btrfs_for_each_slot(root, key, found_key, path, iter_ret) \ + for (iter_ret = btrfs_search_slot(NULL, (root), (key), (path), 0, 0); \ + (iter_ret) >= 0 && \ + (iter_ret = btrfs_get_next_valid_item((root), (found_key), (path))) == 0; \ + (path)->slots[0]++ \ + ) + static inline int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *p, u64 time_seq) { @@ -3124,7 +3220,6 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset); /* file-item.c */ -struct btrfs_dio_private; int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len); blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst); @@ -3142,7 +3237,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums); blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio, - u64 file_start, int contig); + u64 offset, bool one_ordered); int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit); void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, @@ -3158,8 +3253,8 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz u64 btrfs_file_extent_end(const struct btrfs_path *path); /* inode.c */ -blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags); +void btrfs_submit_data_bio(struct inode *inode, struct bio *bio, + int mirror_num, enum btrfs_compression_type compress_type); unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, u32 bio_offset, struct page *page, u64 start, u64 end); @@ -3189,10 +3284,28 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, unsigned int extra_bits, struct extent_state **cached_state); -int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, - struct btrfs_root *new_root, - struct btrfs_root *parent_root, - struct user_namespace *mnt_userns); +struct btrfs_new_inode_args { + /* Input */ + struct inode *dir; + struct dentry *dentry; + struct inode *inode; + bool orphan; + bool subvol; + + /* + * Output from btrfs_new_inode_prepare(), input to + * btrfs_create_new_inode(). + */ + struct posix_acl *default_acl; + struct posix_acl *acl; +}; +int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, + unsigned int *trans_num_items); +int btrfs_create_new_inode(struct btrfs_trans_handle *trans, + struct btrfs_new_inode_args *args); +void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args); +struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, + struct inode *dir); void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, unsigned *bits); void btrfs_clear_delalloc_extent(struct inode *inode, @@ -3203,7 +3316,6 @@ void btrfs_split_delalloc_extent(struct inode *inode, struct extent_state *orig, u64 split); void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end); vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); -int btrfs_readpage(struct file *file, struct page *page); void btrfs_evict_inode(struct inode *inode); int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); struct inode *btrfs_alloc_inode(struct super_block *sb); @@ -3243,9 +3355,14 @@ int btrfs_writepage_cow_fixup(struct page *page); void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, struct page *page, u64 start, u64 end, bool uptodate); +ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, + struct btrfs_ioctl_encoded_io_args *encoded); +ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, + const struct btrfs_ioctl_encoded_io_args *encoded); + +ssize_t btrfs_dio_rw(struct kiocb *iocb, struct iov_iter *iter, size_t done_before); + extern const struct dentry_operations btrfs_dentry_operations; -extern const struct iomap_ops btrfs_dio_iomap_ops; -extern const struct iomap_dio_ops btrfs_dio_ops; /* Inode locking type flags, by default the exclusive lock is taken */ #define BTRFS_ILOCK_SHARED (1U << 0) @@ -3257,6 +3374,7 @@ void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags); void btrfs_update_inode_bytes(struct btrfs_inode *inode, const u64 add_bytes, const u64 del_bytes); +void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end); /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); @@ -3288,7 +3406,7 @@ void btrfs_exclop_balance(struct btrfs_fs_info *fs_info, int __init btrfs_auto_defrag_init(void); void __cold btrfs_auto_defrag_exit(void); int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans, - struct btrfs_inode *inode); + struct btrfs_inode *inode, u32 extent_thresh); int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info); void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info); int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); @@ -3305,6 +3423,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, struct btrfs_trans_handle **trans_out); int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 start, u64 end); +ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, + const struct btrfs_ioctl_encoded_io_args *encoded); int btrfs_release_file(struct inode *inode, struct file *file); int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, @@ -3330,11 +3450,29 @@ void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { } -#ifdef CONFIG_PRINTK +#ifdef CONFIG_PRINTK_INDEX + +#define btrfs_printk(fs_info, fmt, args...) \ +do { \ + printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); \ + _btrfs_printk(fs_info, fmt, ##args); \ +} while (0) + __printf(2, 3) __cold -void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); +void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); + +#elif defined(CONFIG_PRINTK) + +#define btrfs_printk(fs_info, fmt, args...) \ + _btrfs_printk(fs_info, fmt, ##args) + +__printf(2, 3) +__cold +void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); + #else + #define btrfs_printk(fs_info, fmt, args...) \ btrfs_no_printk(fs_info, fmt, ##args) #endif @@ -3585,14 +3723,30 @@ do { \ __LINE__, (errno)); \ } while (0) +#ifdef CONFIG_PRINTK_INDEX + #define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ -do { \ - __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ - (errno), fmt, ##args); \ +do { \ + printk_index_subsys_emit( \ + "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", \ + KERN_CRIT, fmt); \ + __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ + (errno), fmt, ##args); \ } while (0) +#else + +#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ + __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ + (errno), fmt, ##args) + +#endif + #define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \ &(fs_info)->fs_state))) +#define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \ + (unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \ + &(fs_info)->fs_state))) __printf(5, 6) __cold @@ -3740,15 +3894,16 @@ static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag) struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu); int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type); -int btrfs_init_acl(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir); +int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, + struct posix_acl *acl, int type); #else #define btrfs_get_acl NULL #define btrfs_set_acl NULL -static inline int btrfs_init_acl(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir) +static inline int __btrfs_set_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct posix_acl *acl, + int type) { - return 0; + return -EOPNOTSUPP; } #endif @@ -3758,7 +3913,7 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btrfs_recover_relocation(struct btrfs_root *root); +int btrfs_recover_relocation(struct btrfs_fs_info *fs_info); int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len); int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, @@ -3853,7 +4008,7 @@ static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) { - return fs_info->zoned != 0; + return fs_info->zone_size > 0; } static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) @@ -3870,5 +4025,8 @@ static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) #define PageOrdered(page) PagePrivate2(page) #define SetPageOrdered(page) SetPagePrivate2(page) #define ClearPageOrdered(page) ClearPagePrivate2(page) +#define folio_test_ordered(folio) folio_test_private_2(folio) +#define folio_set_ordered(folio) folio_set_private_2(folio) +#define folio_clear_ordered(folio) folio_clear_private_2(folio) #endif diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index fb46a28f5065..36ab0859a263 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -270,11 +270,11 @@ static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info *fs_info, } static void calc_inode_reservations(struct btrfs_fs_info *fs_info, - u64 num_bytes, u64 *meta_reserve, - u64 *qgroup_reserve) + u64 num_bytes, u64 disk_num_bytes, + u64 *meta_reserve, u64 *qgroup_reserve) { u64 nr_extents = count_max_extents(num_bytes); - u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, num_bytes); + u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes); u64 inode_update = btrfs_calc_metadata_size(fs_info, 1); *meta_reserve = btrfs_calc_insert_metadata_size(fs_info, @@ -288,7 +288,8 @@ static void calc_inode_reservations(struct btrfs_fs_info *fs_info, *qgroup_reserve = nr_extents * fs_info->nodesize; } -int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) +int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, + u64 disk_num_bytes, bool noflush) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -307,7 +308,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) * If we have a transaction open (can happen if we call truncate_block * from truncate), then we need FLUSH_LIMIT so we don't deadlock. */ - if (btrfs_is_free_space_inode(inode)) { + if (noflush || btrfs_is_free_space_inode(inode)) { flush = BTRFS_RESERVE_NO_FLUSH; } else { if (current->journal_info) @@ -318,6 +319,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) } num_bytes = ALIGN(num_bytes, fs_info->sectorsize); + disk_num_bytes = ALIGN(disk_num_bytes, fs_info->sectorsize); /* * We always want to do it this way, every other way is wrong and ends @@ -329,9 +331,10 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) * everything out and try again, which is bad. This way we just * over-reserve slightly, and clean up the mess when we are done. */ - calc_inode_reservations(fs_info, num_bytes, &meta_reserve, - &qgroup_reserve); - ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true); + calc_inode_reservations(fs_info, num_bytes, disk_num_bytes, + &meta_reserve, &qgroup_reserve); + ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true, + noflush); if (ret) return ret; ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, meta_reserve, flush); @@ -349,7 +352,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) spin_lock(&inode->lock); nr_extents = count_max_extents(num_bytes); btrfs_mod_outstanding_extents(inode, nr_extents); - inode->csum_bytes += num_bytes; + inode->csum_bytes += disk_num_bytes; btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); @@ -454,7 +457,7 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, ret = btrfs_check_data_free_space(inode, reserved, start, len); if (ret < 0) return ret; - ret = btrfs_delalloc_reserve_metadata(inode, len); + ret = btrfs_delalloc_reserve_metadata(inode, len, len, false); if (ret < 0) { btrfs_free_reserved_data_space(inode, *reserved, start, len); extent_changeset_free(*reserved); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 748bf6b0d860..66779ab3ed4a 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -78,7 +78,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( } spin_lock(&root->inode_lock); - node = radix_tree_lookup(&root->delayed_nodes_tree, ino); + node = xa_load(&root->delayed_nodes, ino); if (node) { if (btrfs_inode->delayed_node) { @@ -90,9 +90,9 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( /* * It's possible that we're racing into the middle of removing - * this node from the radix tree. In this case, the refcount + * this node from the xarray. In this case, the refcount * was zero and it should never go back to one. Just return - * NULL like it was never in the radix at all; our release + * NULL like it was never in the xarray at all; our release * function is in the process of removing it. * * Some implementations of refcount_inc refuse to bump the @@ -100,7 +100,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( * here, refcount_inc() may decide to just WARN_ONCE() instead * of actually bumping the refcount. * - * If this node is properly in the radix, we want to bump the + * If this node is properly in the xarray, we want to bump the * refcount twice, once for the inode and once for this get * operation. */ @@ -128,36 +128,30 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( u64 ino = btrfs_ino(btrfs_inode); int ret; -again: - node = btrfs_get_delayed_node(btrfs_inode); - if (node) - return node; + do { + node = btrfs_get_delayed_node(btrfs_inode); + if (node) + return node; - node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS); - if (!node) - return ERR_PTR(-ENOMEM); - btrfs_init_delayed_node(node, root, ino); + node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS); + if (!node) + return ERR_PTR(-ENOMEM); + btrfs_init_delayed_node(node, root, ino); - /* cached in the btrfs inode and can be accessed */ - refcount_set(&node->refs, 2); + /* Cached in the inode and can be accessed */ + refcount_set(&node->refs, 2); - ret = radix_tree_preload(GFP_NOFS); - if (ret) { - kmem_cache_free(delayed_node_cache, node); - return ERR_PTR(ret); - } - - spin_lock(&root->inode_lock); - ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node); - if (ret == -EEXIST) { - spin_unlock(&root->inode_lock); - kmem_cache_free(delayed_node_cache, node); - radix_tree_preload_end(); - goto again; - } + spin_lock(&root->inode_lock); + ret = xa_insert(&root->delayed_nodes, ino, node, GFP_NOFS); + if (ret) { + spin_unlock(&root->inode_lock); + kmem_cache_free(delayed_node_cache, node); + if (ret != -EBUSY) + return ERR_PTR(ret); + } + } while (ret); btrfs_inode->delayed_node = node; spin_unlock(&root->inode_lock); - radix_tree_preload_end(); return node; } @@ -276,8 +270,7 @@ static void __btrfs_release_delayed_node( * back up. We can delete it now. */ ASSERT(refcount_read(&delayed_node->refs) == 0); - radix_tree_delete(&root->delayed_nodes_tree, - delayed_node->inode_id); + xa_erase(&root->delayed_nodes, delayed_node->inode_id); spin_unlock(&root->inode_lock); kmem_cache_free(delayed_node_cache, delayed_node); } @@ -1870,34 +1863,35 @@ void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode) void btrfs_kill_all_delayed_nodes(struct btrfs_root *root) { - u64 inode_id = 0; + unsigned long index = 0; + struct btrfs_delayed_node *delayed_node; struct btrfs_delayed_node *delayed_nodes[8]; - int i, n; while (1) { + int n = 0; + spin_lock(&root->inode_lock); - n = radix_tree_gang_lookup(&root->delayed_nodes_tree, - (void **)delayed_nodes, inode_id, - ARRAY_SIZE(delayed_nodes)); - if (!n) { + if (xa_empty(&root->delayed_nodes)) { spin_unlock(&root->inode_lock); - break; + return; } - inode_id = delayed_nodes[n - 1]->inode_id + 1; - for (i = 0; i < n; i++) { + xa_for_each_start(&root->delayed_nodes, index, delayed_node, index) { /* * Don't increase refs in case the node is dead and * about to be removed from the tree in the loop below */ - if (!refcount_inc_not_zero(&delayed_nodes[i]->refs)) - delayed_nodes[i] = NULL; + if (refcount_inc_not_zero(&delayed_node->refs)) { + delayed_nodes[n] = delayed_node; + n++; + } + if (n >= ARRAY_SIZE(delayed_nodes)) + break; } + index++; spin_unlock(&root->inode_lock); - for (i = 0; i < n; i++) { - if (!delayed_nodes[i]) - continue; + for (int i = 0; i < n; i++) { __btrfs_kill_delayed_node(delayed_nodes[i]); btrfs_release_delayed_node(delayed_nodes[i]); } diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 4176df149d04..99f37fca2e96 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -930,7 +930,6 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, is_system = (generic_ref->tree_ref.owning_root == BTRFS_CHUNK_TREE_OBJECTID); ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action); - BUG_ON(extent_op && extent_op->is_data); ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); if (!ref) return -ENOMEM; @@ -1103,8 +1102,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, return -ENOMEM; init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0, - BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data, - false); + BTRFS_UPDATE_DELAYED_HEAD, false, false); head_ref->extent_op = extent_op; delayed_refs = &trans->transaction->delayed_refs; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 91a3aabad150..d6304b690ec4 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -58,7 +58,6 @@ struct btrfs_delayed_extent_op { u8 level; bool update_key; bool update_flags; - bool is_data; u64 flags_to_set; }; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 62b9651ea662..a7dd6ba25e99 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -243,6 +243,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, struct btrfs_device *srcdev, struct btrfs_device **device_out) { + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; struct block_device *bdev; struct rcu_string *name; @@ -271,7 +272,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, sync_blockdev(bdev); - list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { + list_for_each_entry(device, &fs_devices->devices, dev_list) { if (device->bdev == bdev) { btrfs_err(fs_info, "target device is in the filesystem!"); @@ -302,6 +303,9 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, goto error; } rcu_assign_pointer(device->name, name); + ret = lookup_bdev(device_path, &device->devt); + if (ret) + goto error; set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state); device->generation = 0; @@ -320,17 +324,17 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, device->mode = FMODE_EXCL; device->dev_stats_valid = 1; set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE); - device->fs_devices = fs_info->fs_devices; + device->fs_devices = fs_devices; ret = btrfs_get_dev_zone_info(device, false); if (ret) goto error; - mutex_lock(&fs_info->fs_devices->device_list_mutex); - list_add(&device->dev_list, &fs_info->fs_devices->devices); - fs_info->fs_devices->num_devices++; - fs_info->fs_devices->open_devices++; - mutex_unlock(&fs_info->fs_devices->device_list_mutex); + mutex_lock(&fs_devices->device_list_mutex); + list_add(&device->dev_list, &fs_devices->devices); + fs_devices->num_devices++; + fs_devices->open_devices++; + mutex_unlock(&fs_devices->device_list_mutex); *device_out = device; return 0; @@ -470,6 +474,7 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info, struct btrfs_dev_extent *dev_extent = NULL; struct btrfs_block_group *cache; struct btrfs_trans_handle *trans; + int iter_ret = 0; int ret = 0; u64 chunk_offset; @@ -520,29 +525,8 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info, key.type = BTRFS_DEV_EXTENT_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto free_path; - if (ret > 0) { - if (path->slots[0] >= - btrfs_header_nritems(path->nodes[0])) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto free_path; - if (ret > 0) { - ret = 0; - goto free_path; - } - } else { - ret = 0; - } - } - - while (1) { + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { struct extent_buffer *leaf = path->nodes[0]; - int slot = path->slots[0]; - - btrfs_item_key_to_cpu(leaf, &found_key, slot); if (found_key.objectid != src_dev->devid) break; @@ -553,30 +537,23 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info, if (found_key.offset < key.offset) break; - dev_extent = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent); + dev_extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dev_extent); cache = btrfs_lookup_block_group(fs_info, chunk_offset); if (!cache) - goto skip; + continue; spin_lock(&cache->lock); cache->to_copy = 1; spin_unlock(&cache->lock); btrfs_put_block_group(cache); - -skip: - ret = btrfs_next_item(root, path); - if (ret != 0) { - if (ret > 0) - ret = 0; - break; - } } + if (iter_ret < 0) + ret = iter_ret; -free_path: btrfs_free_path(path); unlock: mutex_unlock(&fs_info->chunk_mutex); @@ -730,7 +707,12 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1); - /* Commit dev_replace state and reserve 1 item for it. */ + /* + * Commit dev_replace state and reserve 1 item for it. + * This is crucial to ensure we won't miss copying extents for new block + * groups that are allocated after we started the device replace, and + * must be done after setting up the device replace state. + */ trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -872,6 +854,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, int scrub_ret) { struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *tgt_device; struct btrfs_device *src_device; struct btrfs_root *root = fs_info->tree_root; @@ -921,12 +904,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, WARN_ON(ret); /* Prevent write_all_supers() during the finishing procedure */ - mutex_lock(&fs_info->fs_devices->device_list_mutex); + mutex_lock(&fs_devices->device_list_mutex); /* Prevent new chunks being allocated on the source device */ mutex_lock(&fs_info->chunk_mutex); if (!list_empty(&src_device->post_commit_list)) { - mutex_unlock(&fs_info->fs_devices->device_list_mutex); + mutex_unlock(&fs_devices->device_list_mutex); mutex_unlock(&fs_info->chunk_mutex); } else { break; @@ -963,7 +946,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, error: up_write(&dev_replace->rwsem); mutex_unlock(&fs_info->chunk_mutex); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); + mutex_unlock(&fs_devices->device_list_mutex); btrfs_rm_dev_replace_blocked(fs_info); if (tgt_device) btrfs_destroy_dev_replace_tgtdev(tgt_device); @@ -992,8 +975,8 @@ error: btrfs_assign_next_active_device(src_device, tgt_device); - list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); - fs_info->fs_devices->rw_devices++; + list_add(&tgt_device->dev_alloc_list, &fs_devices->alloc_list); + fs_devices->rw_devices++; up_write(&dev_replace->rwsem); btrfs_rm_dev_replace_blocked(fs_info); @@ -1016,7 +999,7 @@ error: * belong to this filesystem. */ mutex_unlock(&fs_info->chunk_mutex); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); + mutex_unlock(&fs_devices->device_list_mutex); /* replace the sysfs entry */ btrfs_sysfs_remove_device(src_device); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 3b532bab0755..72fb2c518a2b 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -325,36 +325,15 @@ btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path, u64 dirid, const char *name, int name_len) { - struct extent_buffer *leaf; struct btrfs_dir_item *di; struct btrfs_key key; - u32 nritems; int ret; key.objectid = dirid; key.type = BTRFS_DIR_INDEX_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - return ERR_PTR(ret); - - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - - while (1) { - if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - return ERR_PTR(ret); - if (ret > 0) - break; - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - continue; - } - - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_for_each_slot(root, &key, &key, path, ret) { if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY) break; @@ -362,10 +341,12 @@ btrfs_search_dir_index_item(struct btrfs_root *root, name, name_len); if (di) return di; - - path->slots[0]++; } - return NULL; + /* Adjust return code if the key was not found in the next leaf. */ + if (ret > 0) + ret = 0; + + return ERR_PTR(ret); } struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 87a5addbedf6..4ba005c41983 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -5,7 +5,6 @@ #include <linux/fs.h> #include <linux/blkdev.h> -#include <linux/radix-tree.h> #include <linux/writeback.h> #include <linux/workqueue.h> #include <linux/kthread.h> @@ -374,9 +373,9 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level, * @level: expected level, mandatory check * @first_key: expected key of first slot, skip check if NULL */ -static int btree_read_extent_buffer_pages(struct extent_buffer *eb, - u64 parent_transid, int level, - struct btrfs_key *first_key) +int btrfs_read_extent_buffer(struct extent_buffer *eb, + u64 parent_transid, int level, + struct btrfs_key *first_key) { struct btrfs_fs_info *fs_info = eb->fs_info; struct extent_io_tree *io_tree; @@ -441,17 +440,31 @@ static int csum_one_extent_buffer(struct extent_buffer *eb) else ret = btrfs_check_leaf_full(eb); - if (ret < 0) { - btrfs_print_tree(eb, 0); + if (ret < 0) + goto error; + + /* + * Also check the generation, the eb reached here must be newer than + * last committed. Or something seriously wrong happened. + */ + if (unlikely(btrfs_header_generation(eb) <= fs_info->last_trans_committed)) { + ret = -EUCLEAN; btrfs_err(fs_info, - "block=%llu write time tree block corruption detected", - eb->start); - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); - return ret; + "block=%llu bad generation, have %llu expect > %llu", + eb->start, btrfs_header_generation(eb), + fs_info->last_trans_committed); + goto error; } write_extent_buffer(eb, result, 0, fs_info->csum_size); return 0; + +error: + btrfs_print_tree(eb, 0); + btrfs_err(fs_info, "block=%llu write time tree block corruption detected", + eb->start); + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + return ret; } /* Checksum all dirty extent buffers in one bio_vec */ @@ -472,7 +485,7 @@ static int csum_dirty_subpage_buffers(struct btrfs_fs_info *fs_info, uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur, fs_info->nodesize); - /* A dirty eb shouldn't disappear from buffer_radix */ + /* A dirty eb shouldn't disappear from extent_buffers */ if (WARN_ON(!eb)) return -EUCLEAN; @@ -505,7 +518,7 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec u64 found_start; struct extent_buffer *eb; - if (fs_info->sectorsize < PAGE_SIZE) + if (fs_info->nodesize < PAGE_SIZE) return csum_dirty_subpage_buffers(fs_info, bvec); eb = (struct extent_buffer *)page->private; @@ -690,7 +703,7 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, ASSERT(page->private); - if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE) + if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) return validate_subpage_buffer(page, start, end, mirror); eb = (struct extent_buffer *)page->private; @@ -836,8 +849,7 @@ static void run_one_async_free(struct btrfs_work *work) } blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 dio_file_offset, + int mirror_num, u64 dio_file_offset, extent_submit_bio_start_t *submit_bio_start) { struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; @@ -860,9 +872,9 @@ blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, async->status = 0; if (op_is_sync(bio->bi_opf)) - btrfs_set_work_high_priority(&async->work); - - btrfs_queue_work(fs_info->workers, &async->work); + btrfs_queue_work(fs_info->hipri_workers, &async->work); + else + btrfs_queue_work(fs_info->workers, &async->work); return 0; } @@ -906,8 +918,7 @@ static bool should_async_write(struct btrfs_fs_info *fs_info, return true; } -blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags) +void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); blk_status_t ret; @@ -919,31 +930,25 @@ blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, */ ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_METADATA); - if (ret) - goto out_w_error; - ret = btrfs_map_bio(fs_info, bio, mirror_num); + if (!ret) + ret = btrfs_map_bio(fs_info, bio, mirror_num); } else if (!should_async_write(fs_info, BTRFS_I(inode))) { ret = btree_csum_one_bio(bio); - if (ret) - goto out_w_error; - ret = btrfs_map_bio(fs_info, bio, mirror_num); + if (!ret) + ret = btrfs_map_bio(fs_info, bio, mirror_num); } else { /* * kthread helpers are used to submit writes so that * checksumming can happen in parallel across all CPUs */ ret = btrfs_wq_submit_bio(inode, bio, mirror_num, 0, - 0, btree_submit_bio_start); + btree_submit_bio_start); } - if (ret) - goto out_w_error; - return 0; - -out_w_error: - bio->bi_status = ret; - bio_endio(bio); - return ret; + if (ret) { + bio->bi_status = ret; + bio_endio(bio); + } } #ifdef CONFIG_MIGRATION @@ -991,49 +996,48 @@ static int btree_writepages(struct address_space *mapping, return btree_write_cache_pages(mapping, wbc); } -static int btree_releasepage(struct page *page, gfp_t gfp_flags) +static bool btree_release_folio(struct folio *folio, gfp_t gfp_flags) { - if (PageWriteback(page) || PageDirty(page)) - return 0; + if (folio_test_writeback(folio) || folio_test_dirty(folio)) + return false; - return try_release_extent_buffer(page); + return try_release_extent_buffer(&folio->page); } -static void btree_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) +static void btree_invalidate_folio(struct folio *folio, size_t offset, + size_t length) { struct extent_io_tree *tree; - tree = &BTRFS_I(page->mapping->host)->io_tree; - extent_invalidatepage(tree, page, offset); - btree_releasepage(page, GFP_NOFS); - if (PagePrivate(page)) { - btrfs_warn(BTRFS_I(page->mapping->host)->root->fs_info, - "page private not zero on page %llu", - (unsigned long long)page_offset(page)); - detach_page_private(page); + tree = &BTRFS_I(folio->mapping->host)->io_tree; + extent_invalidate_folio(tree, folio, offset); + btree_release_folio(folio, GFP_NOFS); + if (folio_get_private(folio)) { + btrfs_warn(BTRFS_I(folio->mapping->host)->root->fs_info, + "folio private not zero on folio %llu", + (unsigned long long)folio_pos(folio)); + folio_detach_private(folio); } } -static int btree_set_page_dirty(struct page *page) -{ #ifdef DEBUG - struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); +static bool btree_dirty_folio(struct address_space *mapping, + struct folio *folio) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(mapping->host->i_sb); struct btrfs_subpage *subpage; struct extent_buffer *eb; int cur_bit = 0; - u64 page_start = page_offset(page); + u64 page_start = folio_pos(folio); if (fs_info->sectorsize == PAGE_SIZE) { - BUG_ON(!PagePrivate(page)); - eb = (struct extent_buffer *)page->private; + eb = folio_get_private(folio); BUG_ON(!eb); BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); BUG_ON(!atomic_read(&eb->refs)); btrfs_assert_tree_write_locked(eb); - return __set_page_dirty_nobuffers(page); + return filemap_dirty_folio(mapping, folio); } - ASSERT(PagePrivate(page) && page->private); - subpage = (struct btrfs_subpage *)page->private; + subpage = folio_get_private(folio); ASSERT(subpage->dirty_bitmap); while (cur_bit < BTRFS_SUBPAGE_BITMAP_SIZE) { @@ -1059,18 +1063,20 @@ static int btree_set_page_dirty(struct page *page) cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits); } -#endif - return __set_page_dirty_nobuffers(page); + return filemap_dirty_folio(mapping, folio); } +#else +#define btree_dirty_folio filemap_dirty_folio +#endif static const struct address_space_operations btree_aops = { .writepages = btree_writepages, - .releasepage = btree_releasepage, - .invalidatepage = btree_invalidatepage, + .release_folio = btree_release_folio, + .invalidate_folio = btree_invalidate_folio, #ifdef CONFIG_MIGRATION .migratepage = btree_migratepage, #endif - .set_page_dirty = btree_set_page_dirty, + .dirty_folio = btree_dirty_folio, }; struct extent_buffer *btrfs_find_create_tree_block( @@ -1103,12 +1109,15 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, if (IS_ERR(buf)) return buf; - ret = btree_read_extent_buffer_pages(buf, parent_transid, - level, first_key); + ret = btrfs_read_extent_buffer(buf, parent_transid, level, first_key); if (ret) { free_extent_buffer_stale(buf); return ERR_PTR(ret); } + if (btrfs_check_eb_owner(buf, owner_root)) { + free_extent_buffer_stale(buf); + return ERR_PTR(-EUCLEAN); + } return buf; } @@ -1149,7 +1158,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, root->nr_delalloc_inodes = 0; root->nr_ordered_extents = 0; root->inode_tree = RB_ROOT; - INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC); + xa_init_flags(&root->delayed_nodes, GFP_ATOMIC); btrfs_init_root_block_rsv(root); @@ -1201,9 +1210,9 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks); #ifdef CONFIG_BTRFS_DEBUG INIT_LIST_HEAD(&root->leak_list); - spin_lock(&fs_info->fs_roots_radix_lock); + spin_lock(&fs_info->fs_roots_lock); list_add_tail(&root->leak_list, &fs_info->allocated_roots); - spin_unlock(&fs_info->fs_roots_radix_lock); + spin_unlock(&fs_info->fs_roots_lock); #endif } @@ -1289,12 +1298,33 @@ struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info, return root; } +static u64 btrfs_global_root_id(struct btrfs_fs_info *fs_info, u64 bytenr) +{ + struct btrfs_block_group *block_group; + u64 ret; + + if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) + return 0; + + if (bytenr) + block_group = btrfs_lookup_block_group(fs_info, bytenr); + else + block_group = btrfs_lookup_first_block_group(fs_info, bytenr); + ASSERT(block_group); + if (!block_group) + return 0; + ret = block_group->global_root_id; + btrfs_put_block_group(block_group); + + return ret; +} + struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr) { struct btrfs_key key = { .objectid = BTRFS_CSUM_TREE_OBJECTID, .type = BTRFS_ROOT_ITEM_KEY, - .offset = 0, + .offset = btrfs_global_root_id(fs_info, bytenr), }; return btrfs_global_root(fs_info, &key); @@ -1305,7 +1335,7 @@ struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr) struct btrfs_key key = { .objectid = BTRFS_EXTENT_TREE_OBJECTID, .type = BTRFS_ROOT_ITEM_KEY, - .offset = 0, + .offset = btrfs_global_root_id(fs_info, bytenr), }; return btrfs_global_root(fs_info, &key); @@ -1522,10 +1552,28 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, ret = PTR_ERR(root->node); root->node = NULL; goto fail; - } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) { + } + if (!btrfs_buffer_uptodate(root->node, generation, 0)) { ret = -EIO; goto fail; } + + /* + * For real fs, and not log/reloc trees, root owner must + * match its root node owner + */ + if (!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state) && + root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID && + root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && + root->root_key.objectid != btrfs_header_owner(root->node)) { + btrfs_crit(fs_info, +"root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu", + root->root_key.objectid, root->node->start, + btrfs_header_owner(root->node), + root->root_key.objectid); + ret = -EUCLEAN; + goto fail; + } root->commit_root = btrfs_root_node(root); return root; fail: @@ -1611,12 +1659,11 @@ static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, { struct btrfs_root *root; - spin_lock(&fs_info->fs_roots_radix_lock); - root = radix_tree_lookup(&fs_info->fs_roots_radix, - (unsigned long)root_id); + spin_lock(&fs_info->fs_roots_lock); + root = xa_load(&fs_info->fs_roots, (unsigned long)root_id); if (root) root = btrfs_grab_root(root); - spin_unlock(&fs_info->fs_roots_radix_lock); + spin_unlock(&fs_info->fs_roots_lock); return root; } @@ -1658,20 +1705,14 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info, { int ret; - ret = radix_tree_preload(GFP_NOFS); - if (ret) - return ret; - - spin_lock(&fs_info->fs_roots_radix_lock); - ret = radix_tree_insert(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid, - root); + spin_lock(&fs_info->fs_roots_lock); + ret = xa_insert(&fs_info->fs_roots, (unsigned long)root->root_key.objectid, + root, GFP_NOFS); if (ret == 0) { btrfs_grab_root(root); - set_bit(BTRFS_ROOT_IN_RADIX, &root->state); + set_bit(BTRFS_ROOT_REGISTERED, &root->state); } - spin_unlock(&fs_info->fs_roots_radix_lock); - radix_tree_preload_end(); + spin_unlock(&fs_info->fs_roots_lock); return ret; } @@ -1727,6 +1768,7 @@ void btrfs_free_fs_info(struct btrfs_fs_info *fs_info) btrfs_put_root(fs_info->uuid_root); btrfs_put_root(fs_info->fs_root); btrfs_put_root(fs_info->data_reloc_root); + btrfs_put_root(fs_info->block_group_root); btrfs_check_leaked_roots(fs_info); btrfs_extent_buffer_leak_debug_check(fs_info); kfree(fs_info->super_copy); @@ -1812,9 +1854,10 @@ again: ret = btrfs_insert_fs_root(fs_info, root); if (ret) { - btrfs_put_root(root); - if (ret == -EEXIST) + if (ret == -EEXIST) { + btrfs_put_root(root); goto again; + } goto fail; } return root; @@ -1925,8 +1968,7 @@ static void end_workqueue_fn(struct btrfs_work *work) static int cleaner_kthread(void *arg) { - struct btrfs_root *root = arg; - struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_fs_info *fs_info = arg; int again; while (1) { @@ -1959,7 +2001,7 @@ static int cleaner_kthread(void *arg) btrfs_run_delayed_iputs(fs_info); - again = btrfs_clean_one_deleted_snapshot(root); + again = btrfs_clean_one_deleted_snapshot(fs_info); mutex_unlock(&fs_info->cleaner_mutex); /* @@ -2095,8 +2137,6 @@ static void backup_super_roots(struct btrfs_fs_info *info) { const int next_backup = info->backup_root_index; struct btrfs_root_backup *root_backup; - struct btrfs_root *extent_root = btrfs_extent_root(info, 0); - struct btrfs_root *csum_root = btrfs_csum_root(info, 0); root_backup = info->super_for_commit->super_roots + next_backup; @@ -2121,11 +2161,30 @@ static void backup_super_roots(struct btrfs_fs_info *info) btrfs_set_backup_chunk_root_level(root_backup, btrfs_header_level(info->chunk_root->node)); - btrfs_set_backup_extent_root(root_backup, extent_root->node->start); - btrfs_set_backup_extent_root_gen(root_backup, - btrfs_header_generation(extent_root->node)); - btrfs_set_backup_extent_root_level(root_backup, - btrfs_header_level(extent_root->node)); + if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) { + btrfs_set_backup_block_group_root(root_backup, + info->block_group_root->node->start); + btrfs_set_backup_block_group_root_gen(root_backup, + btrfs_header_generation(info->block_group_root->node)); + btrfs_set_backup_block_group_root_level(root_backup, + btrfs_header_level(info->block_group_root->node)); + } else { + struct btrfs_root *extent_root = btrfs_extent_root(info, 0); + struct btrfs_root *csum_root = btrfs_csum_root(info, 0); + + btrfs_set_backup_extent_root(root_backup, + extent_root->node->start); + btrfs_set_backup_extent_root_gen(root_backup, + btrfs_header_generation(extent_root->node)); + btrfs_set_backup_extent_root_level(root_backup, + btrfs_header_level(extent_root->node)); + + btrfs_set_backup_csum_root(root_backup, csum_root->node->start); + btrfs_set_backup_csum_root_gen(root_backup, + btrfs_header_generation(csum_root->node)); + btrfs_set_backup_csum_root_level(root_backup, + btrfs_header_level(csum_root->node)); + } /* * we might commit during log recovery, which happens before we set @@ -2146,12 +2205,6 @@ static void backup_super_roots(struct btrfs_fs_info *info) btrfs_set_backup_dev_root_level(root_backup, btrfs_header_level(info->dev_root->node)); - btrfs_set_backup_csum_root(root_backup, csum_root->node->start); - btrfs_set_backup_csum_root_gen(root_backup, - btrfs_header_generation(csum_root->node)); - btrfs_set_backup_csum_root_level(root_backup, - btrfs_header_level(csum_root->node)); - btrfs_set_backup_total_bytes(root_backup, btrfs_super_total_bytes(info->super_copy)); btrfs_set_backup_bytes_used(root_backup, @@ -2217,10 +2270,12 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) { btrfs_destroy_workqueue(fs_info->fixup_workers); btrfs_destroy_workqueue(fs_info->delalloc_workers); + btrfs_destroy_workqueue(fs_info->hipri_workers); btrfs_destroy_workqueue(fs_info->workers); btrfs_destroy_workqueue(fs_info->endio_workers); btrfs_destroy_workqueue(fs_info->endio_raid56_workers); - btrfs_destroy_workqueue(fs_info->rmw_workers); + if (fs_info->rmw_workers) + destroy_workqueue(fs_info->rmw_workers); btrfs_destroy_workqueue(fs_info->endio_write_workers); btrfs_destroy_workqueue(fs_info->endio_freespace_worker); btrfs_destroy_workqueue(fs_info->delayed_workers); @@ -2269,6 +2324,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, bool free_chunk_root) free_root_extent_buffers(info->uuid_root); free_root_extent_buffers(info->fs_root); free_root_extent_buffers(info->data_reloc_root); + free_root_extent_buffers(info->block_group_root); if (free_chunk_root) free_root_extent_buffers(info->chunk_root); } @@ -2286,9 +2342,9 @@ void btrfs_put_root(struct btrfs_root *root) btrfs_drew_lock_destroy(&root->snapshot_lock); free_root_extent_buffers(root); #ifdef CONFIG_BTRFS_DEBUG - spin_lock(&root->fs_info->fs_roots_radix_lock); + spin_lock(&root->fs_info->fs_roots_lock); list_del_init(&root->leak_list); - spin_unlock(&root->fs_info->fs_roots_radix_lock); + spin_unlock(&root->fs_info->fs_roots_lock); #endif kfree(root); } @@ -2296,28 +2352,21 @@ void btrfs_put_root(struct btrfs_root *root) void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) { - int ret; - struct btrfs_root *gang[8]; - int i; + struct btrfs_root *root; + unsigned long index = 0; while (!list_empty(&fs_info->dead_roots)) { - gang[0] = list_entry(fs_info->dead_roots.next, - struct btrfs_root, root_list); - list_del(&gang[0]->root_list); + root = list_entry(fs_info->dead_roots.next, + struct btrfs_root, root_list); + list_del(&root->root_list); - if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) - btrfs_drop_and_free_fs_root(fs_info, gang[0]); - btrfs_put_root(gang[0]); + if (test_bit(BTRFS_ROOT_REGISTERED, &root->state)) + btrfs_drop_and_free_fs_root(fs_info, root); + btrfs_put_root(root); } - while (1) { - ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, - (void **)gang, 0, - ARRAY_SIZE(gang)); - if (!ret) - break; - for (i = 0; i < ret; i++) - btrfs_drop_and_free_fs_root(fs_info, gang[i]); + xa_for_each(&fs_info->fs_roots, index, root) { + btrfs_drop_and_free_fs_root(fs_info, root); } } @@ -2394,7 +2443,9 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; fs_info->workers = - btrfs_alloc_workqueue(fs_info, "worker", + btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16); + fs_info->hipri_workers = + btrfs_alloc_workqueue(fs_info, "worker-high", flags | WQ_HIGHPRI, max_active, 16); fs_info->delalloc_workers = @@ -2426,8 +2477,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) fs_info->endio_raid56_workers = btrfs_alloc_workqueue(fs_info, "endio-raid56", flags, max_active, 4); - fs_info->rmw_workers = - btrfs_alloc_workqueue(fs_info, "rmw", flags, max_active, 2); + fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active); fs_info->endio_write_workers = btrfs_alloc_workqueue(fs_info, "endio-write", flags, max_active, 2); @@ -2442,8 +2492,8 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) fs_info->discard_ctl.discard_workers = alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1); - if (!(fs_info->workers && fs_info->delalloc_workers && - fs_info->flush_workers && + if (!(fs_info->workers && fs_info->hipri_workers && + fs_info->delalloc_workers && fs_info->flush_workers && fs_info->endio_workers && fs_info->endio_meta_workers && fs_info->endio_meta_write_workers && fs_info->endio_write_workers && fs_info->endio_raid56_workers && @@ -2504,11 +2554,13 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, log_tree_root->node = NULL; btrfs_put_root(log_tree_root); return ret; - } else if (!extent_buffer_uptodate(log_tree_root->node)) { + } + if (!extent_buffer_uptodate(log_tree_root->node)) { btrfs_err(fs_info, "failed to read log tree"); btrfs_put_root(log_tree_root); return -EIO; } + /* returns with log_tree_root freed on success */ ret = btrfs_recover_log_trees(log_tree_root); if (ret) { @@ -2533,6 +2585,7 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root, { struct btrfs_fs_info *fs_info = tree_root->fs_info; struct btrfs_root *root; + u64 max_global_id = 0; int ret; struct btrfs_key key = { .objectid = objectid, @@ -2568,6 +2621,13 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root, break; btrfs_release_path(path); + /* + * Just worry about this for extent tree, it'll be the same for + * everybody. + */ + if (objectid == BTRFS_EXTENT_TREE_OBJECTID) + max_global_id = max(max_global_id, key.offset); + found = true; root = read_tree_root_path(tree_root, path, &key); if (IS_ERR(root)) { @@ -2585,6 +2645,9 @@ static int load_global_roots_objectid(struct btrfs_root *tree_root, } btrfs_release_path(path); + if (objectid == BTRFS_EXTENT_TREE_OBJECTID) + fs_info->nr_global_roots = max_global_id + 1; + if (!found || ret) { if (objectid == BTRFS_CSUM_TREE_OBJECTID) set_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); @@ -2752,12 +2815,14 @@ static int validate_super(struct btrfs_fs_info *fs_info, } /* - * For 4K page size, we only support 4K sector size. - * For 64K page size, we support 64K and 4K sector sizes. + * We only support at most two sectorsizes: 4K and PAGE_SIZE. + * + * We can support 16K sectorsize with 64K page size without problem, + * but such sectorsize/pagesize combination doesn't make much sense. + * 4K will be our future standard, PAGE_SIZE is supported from the very + * beginning. */ - if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) || - (PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K && - sectorsize != SZ_64K))) { + if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && sectorsize != PAGE_SIZE)) { btrfs_err(fs_info, "sectorsize %llu not yet supported for page size %lu", sectorsize, PAGE_SIZE); @@ -2930,6 +2995,56 @@ out: return ret; } +static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int level) +{ + int ret = 0; + + root->node = read_tree_block(root->fs_info, bytenr, + root->root_key.objectid, gen, level, NULL); + if (IS_ERR(root->node)) { + ret = PTR_ERR(root->node); + root->node = NULL; + return ret; + } + if (!extent_buffer_uptodate(root->node)) { + free_extent_buffer(root->node); + root->node = NULL; + return -EIO; + } + + btrfs_set_root_node(&root->root_item, root->node); + root->commit_root = btrfs_root_node(root); + btrfs_set_root_refs(&root->root_item, 1); + return ret; +} + +static int load_important_roots(struct btrfs_fs_info *fs_info) +{ + struct btrfs_super_block *sb = fs_info->super_copy; + u64 gen, bytenr; + int level, ret; + + bytenr = btrfs_super_root(sb); + gen = btrfs_super_generation(sb); + level = btrfs_super_root_level(sb); + ret = load_super_root(fs_info->tree_root, bytenr, gen, level); + if (ret) { + btrfs_warn(fs_info, "couldn't read tree root"); + return ret; + } + + if (!btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) + return 0; + + bytenr = btrfs_super_block_group_root(sb); + gen = btrfs_super_block_group_root_generation(sb); + level = btrfs_super_block_group_root_level(sb); + ret = load_super_root(fs_info->block_group_root, bytenr, gen, level); + if (ret) + btrfs_warn(fs_info, "couldn't read block group root"); + return ret; +} + static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) { int backup_index = find_newest_super_backup(fs_info); @@ -2939,10 +3054,17 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) int ret = 0; int i; - for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) { - u64 generation; - int level; + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) { + struct btrfs_root *root; + root = btrfs_alloc_root(fs_info, BTRFS_BLOCK_GROUP_TREE_OBJECTID, + GFP_KERNEL); + if (!root) + return -ENOMEM; + fs_info->block_group_root = root; + } + + for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) { if (handle_error) { if (!IS_ERR(tree_root->node)) free_extent_buffer(tree_root->node); @@ -2967,29 +3089,13 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) if (ret < 0) return ret; } - generation = btrfs_super_generation(sb); - level = btrfs_super_root_level(sb); - tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb), - BTRFS_ROOT_TREE_OBJECTID, - generation, level, NULL); - if (IS_ERR(tree_root->node)) { - handle_error = true; - ret = PTR_ERR(tree_root->node); - tree_root->node = NULL; - btrfs_warn(fs_info, "couldn't read tree root"); - continue; - } else if (!extent_buffer_uptodate(tree_root->node)) { + ret = load_important_roots(fs_info); + if (ret) { handle_error = true; - ret = -EIO; - btrfs_warn(fs_info, "error while reading tree root"); continue; } - btrfs_set_root_node(&tree_root->root_item, tree_root->node); - tree_root->commit_root = btrfs_root_node(tree_root); - btrfs_set_root_refs(&tree_root->root_item, 1); - /* * No need to hold btrfs_root::objectid_mutex since the fs * hasn't been fully initialised and we are the only user @@ -3009,8 +3115,8 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) } /* All successful */ - fs_info->generation = generation; - fs_info->last_trans_committed = generation; + fs_info->generation = btrfs_header_generation(tree_root->node); + fs_info->last_trans_committed = fs_info->generation; fs_info->last_reloc_trans = 0; /* Always begin writing backup roots after the one being used */ @@ -3028,8 +3134,8 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) { - INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); - INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); + xa_init_flags(&fs_info->fs_roots, GFP_ATOMIC); + xa_init_flags(&fs_info->extent_buffers, GFP_ATOMIC); INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->delayed_iputs); @@ -3037,7 +3143,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&fs_info->caching_block_groups); spin_lock_init(&fs_info->delalloc_root_lock); spin_lock_init(&fs_info->trans_lock); - spin_lock_init(&fs_info->fs_roots_radix_lock); + spin_lock_init(&fs_info->fs_roots_lock); spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); spin_lock_init(&fs_info->super_lock); @@ -3053,6 +3159,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->delalloc_root_mutex); mutex_init(&fs_info->zoned_meta_io_lock); + mutex_init(&fs_info->zoned_data_reloc_io_lock); seqlock_init(&fs_info->profiles_lock); INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); @@ -3104,9 +3211,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) btrfs_init_balance(fs_info); btrfs_init_async_reclaim_work(fs_info); - spin_lock_init(&fs_info->block_group_cache_lock); - fs_info->block_group_cache_tree = RB_ROOT; - fs_info->first_logical_byte = (u64)-1; + rwlock_init(&fs_info->block_group_cache_lock); + fs_info->block_group_cache_tree = RB_ROOT_CACHED; extent_io_tree_init(fs_info, &fs_info->excluded_extents, IO_TREE_FS_EXCLUDED_EXTENTS, NULL); @@ -3190,7 +3296,7 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block static int btrfs_uuid_rescan_kthread(void *data) { - struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data; + struct btrfs_fs_info *fs_info = data; int ret; /* @@ -3268,7 +3374,7 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) /* * btrfs_find_orphan_roots() is responsible for finding all the dead * roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load - * them into the fs_info->fs_roots_radix tree. This must be done before + * them into the fs_info->fs_roots. This must be done before * calling btrfs_orphan_cleanup() on the tree root. If we don't do it * first, then btrfs_orphan_cleanup() will delete a dead root's orphan * item before the root's tree is deleted - this means that if we unmount @@ -3293,7 +3399,7 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) up_read(&fs_info->cleanup_work_sem); mutex_lock(&fs_info->cleaner_mutex); - ret = btrfs_recover_relocation(fs_info->tree_root); + ret = btrfs_recover_relocation(fs_info); mutex_unlock(&fs_info->cleaner_mutex); if (ret < 0) { btrfs_warn(fs_info, "failed to recover relocation: %d", ret); @@ -3506,7 +3612,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device ~BTRFS_FEATURE_INCOMPAT_SUPP; if (features) { btrfs_err(fs_info, - "cannot mount because of unsupported optional features (%llx)", + "cannot mount because of unsupported optional features (0x%llx)", features); err = -EINVAL; goto fail_alloc; @@ -3544,7 +3650,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device ~BTRFS_FEATURE_COMPAT_RO_SUPP; if (!sb_rdonly(sb) && features) { btrfs_err(fs_info, - "cannot mount read-write because of unsupported optional features (%llx)", + "cannot mount read-write because of unsupported optional features (0x%llx)", features); err = -EINVAL; goto fail_alloc; @@ -3553,17 +3659,20 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (sectorsize < PAGE_SIZE) { struct btrfs_subpage_info *subpage_info; + /* + * V1 space cache has some hardcoded PAGE_SIZE usage, and is + * going to be deprecated. + * + * Force to use v2 cache for subpage case. + */ + btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE); + btrfs_set_and_info(fs_info, FREE_SPACE_TREE, + "forcing free space tree for sector size %u with page size %lu", + sectorsize, PAGE_SIZE); + btrfs_warn(fs_info, "read-write for sector size %u with page size %lu is experimental", sectorsize, PAGE_SIZE); - if (btrfs_super_incompat_flags(fs_info->super_copy) & - BTRFS_FEATURE_INCOMPAT_RAID56) { - btrfs_err(fs_info, - "RAID56 is not yet supported for sector size %u with page size %lu", - sectorsize, PAGE_SIZE); - err = -EINVAL; - goto fail_alloc; - } subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL); if (!subpage_info) goto fail_alloc; @@ -3594,21 +3703,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device generation = btrfs_super_chunk_root_generation(disk_super); level = btrfs_super_chunk_root_level(disk_super); - - chunk_root->node = read_tree_block(fs_info, - btrfs_super_chunk_root(disk_super), - BTRFS_CHUNK_TREE_OBJECTID, - generation, level, NULL); - if (IS_ERR(chunk_root->node) || - !extent_buffer_uptodate(chunk_root->node)) { + ret = load_super_root(chunk_root, btrfs_super_chunk_root(disk_super), + generation, level); + if (ret) { btrfs_err(fs_info, "failed to read chunk root"); - if (!IS_ERR(chunk_root->node)) - free_extent_buffer(chunk_root->node); - chunk_root->node = NULL; goto fail_tree_roots; } - btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); - chunk_root->commit_root = btrfs_root_node(chunk_root); read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, offsetof(struct btrfs_header, chunk_tree_uuid), @@ -3728,7 +3828,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device goto fail_sysfs; } - fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, + fs_info->cleaner_kthread = kthread_run(cleaner_kthread, fs_info, "btrfs-cleaner"); if (IS_ERR(fs_info->cleaner_kthread)) goto fail_sysfs; @@ -3813,6 +3913,10 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device set_bit(BTRFS_FS_OPEN, &fs_info->flags); + /* Kick the cleaner thread so it'll start deleting snapshots. */ + if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags)) + wake_up_process(fs_info->cleaner_kthread); + clear_oneshot: btrfs_clear_oneshot_options(fs_info); return 0; @@ -4029,8 +4133,9 @@ static int write_dev_supers(struct btrfs_device *device, * to do I/O, so we don't lose the ability to do integrity * checking. */ - bio = bio_alloc(GFP_NOFS, 1); - bio_set_dev(bio, device->bdev); + bio = bio_alloc(device->bdev, 1, + REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO, + GFP_NOFS); bio->bi_iter.bi_sector = bytenr >> SECTOR_SHIFT; bio->bi_private = device; bio->bi_end_io = btrfs_end_super_write; @@ -4042,11 +4147,11 @@ static int write_dev_supers(struct btrfs_device *device, * go down lazy and there's a short window where the on-disk * copies might still contain the older version. */ - bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_META | REQ_PRIO; if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER)) bio->bi_opf |= REQ_FUA; - btrfsic_submit_bio(bio); + btrfsic_check_bio(bio); + submit_bio(bio); if (btrfs_advance_sb_log(device, i)) errors++; @@ -4127,6 +4232,7 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) */ static void btrfs_end_empty_barrier(struct bio *bio) { + bio_uninit(bio); complete(bio->bi_private); } @@ -4136,7 +4242,7 @@ static void btrfs_end_empty_barrier(struct bio *bio) */ static void write_dev_flush(struct btrfs_device *device) { - struct bio *bio = device->flush_bio; + struct bio *bio = &device->flush_bio; #ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY /* @@ -4149,19 +4255,18 @@ static void write_dev_flush(struct btrfs_device *device) * of simplicity, since this is a debug tool and not meant for use in * non-debug builds. */ - struct request_queue *q = bdev_get_queue(device->bdev); - if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) + if (!bdev_write_cache(device->bdev)) return; #endif - bio_reset(bio); + bio_init(bio, device->bdev, NULL, 0, + REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH); bio->bi_end_io = btrfs_end_empty_barrier; - bio_set_dev(bio, device->bdev); - bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; init_completion(&device->flush_wait); bio->bi_private = &device->flush_wait; - btrfsic_submit_bio(bio); + btrfsic_check_bio(bio); + submit_bio(bio); set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state); } @@ -4170,7 +4275,7 @@ static void write_dev_flush(struct btrfs_device *device) */ static blk_status_t wait_dev_flush(struct btrfs_device *device) { - struct bio *bio = device->flush_bio; + struct bio *bio = &device->flush_bio; if (!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state)) return BLK_STS_OK; @@ -4394,12 +4499,11 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, { bool drop_ref = false; - spin_lock(&fs_info->fs_roots_radix_lock); - radix_tree_delete(&fs_info->fs_roots_radix, - (unsigned long)root->root_key.objectid); - if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state)) + spin_lock(&fs_info->fs_roots_lock); + xa_erase(&fs_info->fs_roots, (unsigned long)root->root_key.objectid); + if (test_and_clear_bit(BTRFS_ROOT_REGISTERED, &root->state)) drop_ref = true; - spin_unlock(&fs_info->fs_roots_radix_lock); + spin_unlock(&fs_info->fs_roots_lock); if (BTRFS_FS_ERROR(fs_info)) { ASSERT(root->log_root == NULL); @@ -4415,50 +4519,48 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) { - u64 root_objectid = 0; - struct btrfs_root *gang[8]; - int i = 0; + struct btrfs_root *roots[8]; + unsigned long index = 0; + int i; int err = 0; - unsigned int ret = 0; + int grabbed; while (1) { - spin_lock(&fs_info->fs_roots_radix_lock); - ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, - (void **)gang, root_objectid, - ARRAY_SIZE(gang)); - if (!ret) { - spin_unlock(&fs_info->fs_roots_radix_lock); - break; + struct btrfs_root *root; + + spin_lock(&fs_info->fs_roots_lock); + if (!xa_find(&fs_info->fs_roots, &index, ULONG_MAX, XA_PRESENT)) { + spin_unlock(&fs_info->fs_roots_lock); + return err; } - root_objectid = gang[ret - 1]->root_key.objectid + 1; - for (i = 0; i < ret; i++) { - /* Avoid to grab roots in dead_roots */ - if (btrfs_root_refs(&gang[i]->root_item) == 0) { - gang[i] = NULL; - continue; - } - /* grab all the search result for later use */ - gang[i] = btrfs_grab_root(gang[i]); + grabbed = 0; + xa_for_each_start(&fs_info->fs_roots, index, root, index) { + /* Avoid grabbing roots in dead_roots */ + if (btrfs_root_refs(&root->root_item) > 0) + roots[grabbed++] = btrfs_grab_root(root); + if (grabbed >= ARRAY_SIZE(roots)) + break; } - spin_unlock(&fs_info->fs_roots_radix_lock); + spin_unlock(&fs_info->fs_roots_lock); - for (i = 0; i < ret; i++) { - if (!gang[i]) + for (i = 0; i < grabbed; i++) { + if (!roots[i]) continue; - root_objectid = gang[i]->root_key.objectid; - err = btrfs_orphan_cleanup(gang[i]); + index = roots[i]->root_key.objectid; + err = btrfs_orphan_cleanup(roots[i]); if (err) - break; - btrfs_put_root(gang[i]); + goto out; + btrfs_put_root(roots[i]); } - root_objectid++; + index++; } - /* release the uncleaned roots due to error */ - for (; i < ret; i++) { - if (gang[i]) - btrfs_put_root(gang[i]); +out: + /* Release the roots that remain uncleaned due to error */ + for (; i < grabbed; i++) { + if (roots[i]) + btrfs_put_root(roots[i]); } return err; } @@ -4530,6 +4632,17 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) int ret; set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags); + + /* + * We may have the reclaim task running and relocating a data block group, + * in which case it may create delayed iputs. So stop it before we park + * the cleaner kthread otherwise we can get new delayed iputs after + * parking the cleaner, and that can make the async reclaim task to hang + * if it's waiting for delayed iputs to complete, since the cleaner is + * parked and can not run delayed iputs - this will make us hang when + * trying to stop the async reclaim task. + */ + cancel_work_sync(&fs_info->reclaim_bgs_work); /* * We don't want the cleaner to start new transactions, add more delayed * iputs, etc. while we're closing. We can't use kthread_stop() yet @@ -4538,6 +4651,12 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) */ kthread_park(fs_info->cleaner_kthread); + /* + * If we had UNFINISHED_DROPS we could still be processing them, so + * clear that bit and wake up relocation so it can stop. + */ + btrfs_wake_unfinished_drop(fs_info); + /* wait for the qgroup rescan worker to stop */ btrfs_qgroup_wait_for_completion(fs_info, false); @@ -4564,8 +4683,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) cancel_work_sync(&fs_info->async_data_reclaim_work); cancel_work_sync(&fs_info->preempt_reclaim_work); - cancel_work_sync(&fs_info->reclaim_bgs_work); - /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); @@ -4747,13 +4864,6 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info) __btrfs_btree_balance_dirty(fs_info, 0); } -int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level, - struct btrfs_key *first_key) -{ - return btree_read_extent_buffer_pages(buf, parent_transid, - level, first_key); -} - static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info) { /* cleanup FS via transaction */ @@ -4769,31 +4879,28 @@ static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info) static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info) { - struct btrfs_root *gang[8]; - u64 root_objectid = 0; - int ret; - - spin_lock(&fs_info->fs_roots_radix_lock); - while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, - (void **)gang, root_objectid, - ARRAY_SIZE(gang))) != 0) { - int i; + unsigned long index = 0; + int grabbed = 0; + struct btrfs_root *roots[8]; - for (i = 0; i < ret; i++) - gang[i] = btrfs_grab_root(gang[i]); - spin_unlock(&fs_info->fs_roots_radix_lock); + spin_lock(&fs_info->fs_roots_lock); + while ((grabbed = xa_extract(&fs_info->fs_roots, (void **)roots, index, + ULONG_MAX, 8, XA_PRESENT))) { + for (int i = 0; i < grabbed; i++) + roots[i] = btrfs_grab_root(roots[i]); + spin_unlock(&fs_info->fs_roots_lock); - for (i = 0; i < ret; i++) { - if (!gang[i]) + for (int i = 0; i < grabbed; i++) { + if (!roots[i]) continue; - root_objectid = gang[i]->root_key.objectid; - btrfs_free_log(NULL, gang[i]); - btrfs_put_root(gang[i]); + index = roots[i]->root_key.objectid; + btrfs_free_log(NULL, roots[i]); + btrfs_put_root(roots[i]); } - root_objectid++; - spin_lock(&fs_info->fs_roots_radix_lock); + index++; + spin_lock(&fs_info->fs_roots_lock); } - spin_unlock(&fs_info->fs_roots_radix_lock); + spin_unlock(&fs_info->fs_roots_lock); btrfs_free_log_root_tree(NULL, fs_info); } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 5e8bef4b7563..4ee8c42c9f78 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -87,8 +87,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, struct page *page, u64 start, u64 end, int mirror); -blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags); +void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); #endif @@ -111,6 +110,8 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root) static inline struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info) { + if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) + return fs_info->block_group_root; return btrfs_extent_root(fs_info, 0); } @@ -118,13 +119,12 @@ void btrfs_put_root(struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); -int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level, - struct btrfs_key *first_key); +int btrfs_read_extent_buffer(struct extent_buffer *buf, u64 parent_transid, + int level, struct btrfs_key *first_key); blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, enum btrfs_wq_endio_type metadata); blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 dio_file_offset, + int mirror_num, u64 dio_file_offset, extent_submit_bio_start_t *submit_bio_start); blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio, int mirror_num); diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 04083ee5ae6e..c3eb52dbe61c 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -244,8 +244,8 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, u32 bits); int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, u32 bits); -int extent_invalidatepage(struct extent_io_tree *tree, - struct page *page, unsigned long offset); +int extent_invalidate_folio(struct extent_io_tree *tree, + struct folio *folio, size_t offset); bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, u64 *end, u64 max_bytes, struct extent_state **cached_state); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d89273c4b6b8..4157ecc27d4b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -598,7 +598,7 @@ fail: static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, - int refs_to_drop, int *last_ref) + int refs_to_drop) { struct btrfs_key key; struct btrfs_extent_data_ref *ref1 = NULL; @@ -631,7 +631,6 @@ static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans, if (num_refs == 0) { ret = btrfs_del_item(trans, root, path); - *last_ref = 1; } else { if (key.type == BTRFS_EXTENT_DATA_REF_KEY) btrfs_set_extent_data_ref_count(leaf, ref1, num_refs); @@ -896,7 +895,13 @@ again: err = -ENOENT; while (1) { if (ptr >= end) { - WARN_ON(ptr > end); + if (ptr > end) { + err = -EUCLEAN; + btrfs_print_leaf(path->nodes[0]); + btrfs_crit(fs_info, +"overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu", + path->slots[0], root_objectid, owner, offset, parent); + } break; } iref = (struct btrfs_extent_inline_ref *)ptr; @@ -1072,8 +1077,7 @@ static noinline_for_stack void update_inline_extent_backref(struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, int refs_to_mod, - struct btrfs_delayed_extent_op *extent_op, - int *last_ref) + struct btrfs_delayed_extent_op *extent_op) { struct extent_buffer *leaf = path->nodes[0]; struct btrfs_extent_item *ei; @@ -1121,7 +1125,6 @@ void update_inline_extent_backref(struct btrfs_path *path, else btrfs_set_shared_data_ref_count(leaf, sref, refs); } else { - *last_ref = 1; size = btrfs_extent_inline_ref_size(type); item_size = btrfs_item_size(leaf, path->slots[0]); ptr = (unsigned long)iref; @@ -1166,8 +1169,7 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, } return -EUCLEAN; } - update_inline_extent_backref(path, iref, refs_to_add, - extent_op, NULL); + update_inline_extent_backref(path, iref, refs_to_add, extent_op); } else if (ret == -ENOENT) { setup_inline_extent_backref(trans->fs_info, path, iref, parent, root_objectid, owner, offset, @@ -1181,21 +1183,17 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_extent_inline_ref *iref, - int refs_to_drop, int is_data, int *last_ref) + int refs_to_drop, int is_data) { int ret = 0; BUG_ON(!is_data && refs_to_drop != 1); - if (iref) { - update_inline_extent_backref(path, iref, -refs_to_drop, NULL, - last_ref); - } else if (is_data) { - ret = remove_extent_data_ref(trans, root, path, refs_to_drop, - last_ref); - } else { - *last_ref = 1; + if (iref) + update_inline_extent_backref(path, iref, -refs_to_drop, NULL); + else if (is_data) + ret = remove_extent_data_ref(trans, root, path, refs_to_drop); + else ret = btrfs_del_item(trans, root, path); - } return ret; } @@ -1247,7 +1245,7 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, if (size) { ret = blkdev_issue_discard(bdev, start >> 9, size >> 9, - GFP_NOFS, 0); + GFP_NOFS); if (!ret) *discarded_bytes += size; else if (ret != -EOPNOTSUPP) @@ -1264,7 +1262,7 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, if (bytes_left) { ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9, - GFP_NOFS, 0); + GFP_NOFS); if (!ret) *discarded_bytes += bytes_left; } @@ -1299,7 +1297,7 @@ static int do_discard_extent(struct btrfs_io_stripe *stripe, u64 *bytes) ret = btrfs_reset_device_zone(dev_replace->tgtdev, phys, len, &discarded); discarded += src_disc; - } else if (blk_queue_discard(bdev_get_queue(stripe->dev->bdev))) { + } else if (bdev_max_discard_sectors(stripe->dev->bdev)) { ret = btrfs_issue_discard(dev->bdev, phys, len, &discarded); } else { ret = 0; @@ -1585,12 +1583,12 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, u32 item_size; int ret; int err = 0; - int metadata = !extent_op->is_data; + int metadata = 1; if (TRANS_ABORTED(trans)) return 0; - if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA)) metadata = 0; path = btrfs_alloc_path(); @@ -2188,7 +2186,7 @@ out: int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct extent_buffer *eb, u64 flags, - int level, int is_data) + int level) { struct btrfs_delayed_extent_op *extent_op; int ret; @@ -2200,7 +2198,6 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, extent_op->flags_to_set = flags; extent_op->update_flags = true; extent_op->update_key = false; - extent_op->is_data = is_data ? true : false; extent_op->level = level; ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op); @@ -2365,15 +2362,10 @@ out: } int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, - u64 bytenr, bool strict) + u64 bytenr, bool strict, struct btrfs_path *path) { - struct btrfs_path *path; int ret; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - do { ret = check_committed_ref(root, path, objectid, offset, bytenr, strict); @@ -2384,7 +2376,7 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, } while (ret == -EAGAIN); out: - btrfs_free_path(path); + btrfs_release_path(path); if (btrfs_is_data_reloc_root(root)) WARN_ON(ret > 0); return ret; @@ -2505,24 +2497,21 @@ static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) return ret; } -static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start) +static u64 first_logical_byte(struct btrfs_fs_info *fs_info) { - struct btrfs_block_group *cache; - u64 bytenr; + struct rb_node *leftmost; + u64 bytenr = 0; - spin_lock(&fs_info->block_group_cache_lock); - bytenr = fs_info->first_logical_byte; - spin_unlock(&fs_info->block_group_cache_lock); - - if (bytenr < (u64)-1) - return bytenr; - - cache = btrfs_lookup_first_block_group(fs_info, search_start); - if (!cache) - return 0; + read_lock(&fs_info->block_group_cache_lock); + /* Get the block group with the lowest logical start address. */ + leftmost = rb_first_cached(&fs_info->block_group_cache_tree); + if (leftmost) { + struct btrfs_block_group *bg; - bytenr = cache->start; - btrfs_put_block_group(cache); + bg = rb_entry(leftmost, struct btrfs_block_group, cache_node); + bytenr = bg->start; + } + read_unlock(&fs_info->block_group_cache_lock); return bytenr; } @@ -2766,12 +2755,11 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, spin_unlock(&cache->lock); if (!readonly && return_free_space && global_rsv->space_info == space_info) { - u64 to_add = len; - spin_lock(&global_rsv->lock); if (!global_rsv->full) { - to_add = min(len, global_rsv->size - - global_rsv->reserved); + u64 to_add = min(len, global_rsv->size - + global_rsv->reserved); + global_rsv->reserved += to_add; btrfs_space_info_update_bytes_may_use(fs_info, space_info, to_add); @@ -2862,6 +2850,35 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) return 0; } +static int do_free_extent_accounting(struct btrfs_trans_handle *trans, + u64 bytenr, u64 num_bytes, bool is_data) +{ + int ret; + + if (is_data) { + struct btrfs_root *csum_root; + + csum_root = btrfs_csum_root(trans->fs_info, bytenr); + ret = btrfs_del_csums(trans, csum_root, bytenr, num_bytes); + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } + } + + ret = add_to_free_space_tree(trans, bytenr, num_bytes); + if (ret) { + btrfs_abort_transaction(trans, ret); + return ret; + } + + ret = btrfs_update_block_group(trans, bytenr, num_bytes, false); + if (ret) + btrfs_abort_transaction(trans, ret); + + return ret; +} + /* * Drop one or more refs of @node. * @@ -2943,7 +2960,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, u64 refs; u64 bytenr = node->bytenr; u64 num_bytes = node->num_bytes; - int last_ref = 0; bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA); extent_root = btrfs_extent_root(info, bytenr); @@ -3010,8 +3026,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } /* Must be SHARED_* item, remove the backref first */ ret = remove_extent_backref(trans, extent_root, path, - NULL, refs_to_drop, is_data, - &last_ref); + NULL, refs_to_drop, is_data); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -3136,8 +3151,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } if (found_extent) { ret = remove_extent_backref(trans, extent_root, path, - iref, refs_to_drop, is_data, - &last_ref); + iref, refs_to_drop, is_data); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -3182,7 +3196,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } - last_ref = 1; ret = btrfs_del_items(trans, extent_root, path, path->slots[0], num_to_del); if (ret) { @@ -3191,28 +3204,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - if (is_data) { - struct btrfs_root *csum_root; - csum_root = btrfs_csum_root(info, bytenr); - ret = btrfs_del_csums(trans, csum_root, bytenr, - num_bytes); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out; - } - } - - ret = add_to_free_space_tree(trans, bytenr, num_bytes); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out; - } - - ret = btrfs_update_block_group(trans, bytenr, num_bytes, false); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto out; - } + ret = do_free_extent_accounting(trans, bytenr, num_bytes, is_data); } btrfs_release_path(path); @@ -3808,8 +3800,7 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, /* Check RO and no space case before trying to activate it */ spin_lock(&block_group->lock); - if (block_group->ro || - block_group->alloc_offset == block_group->zone_capacity) { + if (block_group->ro || btrfs_zoned_bg_is_full(block_group)) { ret = 1; /* * May need to clear fs_info->{treelog,data_reloc}_bg. @@ -3841,7 +3832,7 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, block_group->start == fs_info->data_reloc_bg || fs_info->data_reloc_bg == 0); - if (block_group->ro) { + if (block_group->ro || block_group->zoned_data_reloc_ongoing) { ret = 1; goto out; } @@ -3903,8 +3894,24 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, out: if (ret && ffe_ctl->for_treelog) fs_info->treelog_bg = 0; - if (ret && ffe_ctl->for_data_reloc) + if (ret && ffe_ctl->for_data_reloc && + fs_info->data_reloc_bg == block_group->start) { + /* + * Do not allow further allocations from this block group. + * Compared to increasing the ->ro, setting the + * ->zoned_data_reloc_ongoing flag still allows nocow + * writers to come in. See btrfs_inc_nocow_writers(). + * + * We need to disable an allocation to avoid an allocation of + * regular (non-relocation data) extent. With mix of relocation + * extents and regular extents, we can dispatch WRITE commands + * (for relocation extents) and ZONE APPEND commands (for + * regular extents) at the same time to the same zone, which + * easily break the write pointer. + */ + block_group->zoned_data_reloc_ongoing = 1; fs_info->data_reloc_bg = 0; + } spin_unlock(&fs_info->relocation_bg_lock); spin_unlock(&fs_info->treelog_bg_lock); spin_unlock(&block_group->lock); @@ -4087,7 +4094,7 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, } ret = btrfs_chunk_alloc(trans, ffe_ctl->flags, - CHUNK_ALLOC_FORCE); + CHUNK_ALLOC_FORCE_FOR_EXTENT); /* Do not bail out on ENOSPC since we can do more. */ if (ret == -ENOSPC) @@ -4277,7 +4284,7 @@ static noinline int find_free_extent(struct btrfs_root *root, return ret; ffe_ctl->search_start = max(ffe_ctl->search_start, - first_logical_byte(fs_info, 0)); + first_logical_byte(fs_info)); ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte); if (ffe_ctl->search_start == ffe_ctl->hint_byte) { block_group = btrfs_lookup_block_group(fs_info, @@ -4605,6 +4612,28 @@ int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, return ret; } +static int alloc_reserved_extent(struct btrfs_trans_handle *trans, u64 bytenr, + u64 num_bytes) +{ + struct btrfs_fs_info *fs_info = trans->fs_info; + int ret; + + ret = remove_from_free_space_tree(trans, bytenr, num_bytes); + if (ret) + return ret; + + ret = btrfs_update_block_group(trans, bytenr, num_bytes, true); + if (ret) { + ASSERT(!ret); + btrfs_err(fs_info, "update block group failed for %llu %llu", + bytenr, num_bytes); + return ret; + } + + trace_btrfs_reserved_extent_alloc(fs_info, bytenr, num_bytes); + return 0; +} + static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, u64 flags, u64 owner, u64 offset, @@ -4665,18 +4694,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset); - if (ret) - return ret; - - ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, true); - if (ret) { /* -ENOENT, logic error */ - btrfs_err(fs_info, "update block group failed for %llu %llu", - ins->objectid, ins->offset); - BUG(); - } - trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset); - return ret; + return alloc_reserved_extent(trans, ins->objectid, ins->offset); } static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, @@ -4694,7 +4712,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_delayed_tree_ref *ref; u32 size = sizeof(*extent_item) + sizeof(*iref); - u64 num_bytes; u64 flags = extent_op->flags_to_set; bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA); @@ -4704,12 +4721,10 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, if (skinny_metadata) { extent_key.offset = ref->level; extent_key.type = BTRFS_METADATA_ITEM_KEY; - num_bytes = fs_info->nodesize; } else { extent_key.offset = node->num_bytes; extent_key.type = BTRFS_EXTENT_ITEM_KEY; size += sizeof(*block_info); - num_bytes = node->num_bytes; } path = btrfs_alloc_path(); @@ -4754,22 +4769,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); - ret = remove_from_free_space_tree(trans, extent_key.objectid, - num_bytes); - if (ret) - return ret; - - ret = btrfs_update_block_group(trans, extent_key.objectid, - fs_info->nodesize, true); - if (ret) { /* -ENOENT, logic error */ - btrfs_err(fs_info, "update block group failed for %llu %llu", - extent_key.objectid, extent_key.offset); - BUG(); - } - - trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid, - fs_info->nodesize); - return ret; + return alloc_reserved_extent(trans, node->bytenr, fs_info->nodesize); } int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans, @@ -4971,7 +4971,6 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, extent_op->flags_to_set = flags; extent_op->update_key = skinny_metadata ? false : true; extent_op->update_flags = true; - extent_op->is_data = false; extent_op->level = level; btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, @@ -5156,7 +5155,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, ret = btrfs_dec_ref(trans, root, eb, 0); BUG_ON(ret); /* -ENOMEM */ ret = btrfs_set_disk_extent_flags(trans, eb, flag, - btrfs_header_level(eb), 0); + btrfs_header_level(eb)); BUG_ON(ret); /* -ENOMEM */ wc->flags[level] |= flag; } @@ -5622,6 +5621,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) int ret; int level; bool root_dropped = false; + bool unfinished_drop = false; btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid); @@ -5664,6 +5664,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) * already dropped. */ set_bit(BTRFS_ROOT_DELETING, &root->state); + unfinished_drop = test_bit(BTRFS_ROOT_UNFINISHED_DROP, &root->state); + if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { level = btrfs_header_level(root->node); path->nodes[level] = btrfs_lock_root_node(root); @@ -5827,7 +5829,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) btrfs_qgroup_convert_reserved_meta(root, INT_MAX); btrfs_qgroup_free_meta_all_pertrans(root); - if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) + if (test_bit(BTRFS_ROOT_REGISTERED, &root->state)) btrfs_add_dropped_root(trans, root); else btrfs_put_root(root); @@ -5839,6 +5841,13 @@ out_free: btrfs_free_path(path); out: /* + * We were an unfinished drop root, check to see if there are any + * pending, and if not clear and wake up any waiters. + */ + if (!err && unfinished_drop) + btrfs_maybe_wake_unfinished_drop(fs_info); + + /* * So if we need to stop dropping the snapshot for whatever reason we * need to make sure to add it back to the dead root list so that we * keep trying to do the work later. This also cleans up roots if we @@ -5989,7 +5998,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) *trimmed = 0; /* Discard not supported = nothing to do. */ - if (!blk_queue_discard(bdev_get_queue(device->bdev))) + if (!bdev_max_discard_sectors(device->bdev)) return 0; /* Not writable = nothing to do. */ diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d6d48ecf823c..04e36343da3a 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -6,13 +6,13 @@ #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/page-flags.h> +#include <linux/sched/mm.h> #include <linux/spinlock.h> #include <linux/blkdev.h> #include <linux/swap.h> #include <linux/writeback.h> #include <linux/pagevec.h> #include <linux/prefetch.h> -#include <linux/cleancache.h> #include <linux/fsverity.h> #include "misc.h" #include "extent_io.h" @@ -29,6 +29,7 @@ #include "subpage.h" #include "zoned.h" #include "block-group.h" +#include "compression.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -76,6 +77,7 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) if (!fs_info->allocated_ebs.next) return; + WARN_ON(!list_empty(&fs_info->allocated_ebs)); spin_lock_irqsave(&fs_info->eb_leak_lock, flags); while (!list_empty(&fs_info->allocated_ebs)) { eb = list_first_entry(&fs_info->allocated_ebs, @@ -136,6 +138,17 @@ struct tree_entry { struct rb_node rb_node; }; +/* + * Structure to record info about the bio being assembled, and other info like + * how many bytes are there before stripe/ordered extent boundary. + */ +struct btrfs_bio_ctrl { + struct bio *bio; + enum btrfs_compression_type compress_type; + u32 len_to_stripe_boundary; + u32 len_to_oe_boundary; +}; + struct extent_page_data { struct btrfs_bio_ctrl bio_ctrl; /* tells writepage not to lock the state bits for this range @@ -165,24 +178,27 @@ static int add_extent_changeset(struct extent_state *state, u32 bits, return ret; } -int __must_check submit_one_bio(struct bio *bio, int mirror_num, - unsigned long bio_flags) +static void submit_one_bio(struct bio *bio, int mirror_num, + enum btrfs_compression_type compress_type) { - blk_status_t ret = 0; struct extent_io_tree *tree = bio->bi_private; bio->bi_private = NULL; /* Caller should ensure the bio has at least some range added */ ASSERT(bio->bi_iter.bi_size); + if (is_data_inode(tree->private_data)) - ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num, - bio_flags); + btrfs_submit_data_bio(tree->private_data, bio, mirror_num, + compress_type); else - ret = btrfs_submit_metadata_bio(tree->private_data, bio, - mirror_num, bio_flags); - - return blk_status_to_errno(ret); + btrfs_submit_metadata_bio(tree->private_data, bio, mirror_num); + /* + * Above submission hooks will handle the error by ending the bio, + * which will do the cleanup properly. So here we should not return + * any error, or the caller of submit_extent_page() will do cleanup + * again, causing problems. + */ } /* Cleanup unsubmitted bios */ @@ -203,13 +219,12 @@ static void end_write_bio(struct extent_page_data *epd, int ret) * Return 0 if everything is OK. * Return <0 for error. */ -static int __must_check flush_write_bio(struct extent_page_data *epd) +static void flush_write_bio(struct extent_page_data *epd) { - int ret = 0; struct bio *bio = epd->bio_ctrl.bio; if (bio) { - ret = submit_one_bio(bio, 0, 0); + submit_one_bio(bio, 0, 0); /* * Clean up of epd->bio is handled by its endio function. * And endio is either triggered by successful bio execution @@ -219,7 +234,6 @@ static int __must_check flush_write_bio(struct extent_page_data *epd) */ epd->bio_ctrl.bio = NULL; } - return ret; } int __init extent_state_cache_init(void) @@ -1508,17 +1522,17 @@ void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) { + struct address_space *mapping = inode->i_mapping; unsigned long index = start >> PAGE_SHIFT; unsigned long end_index = end >> PAGE_SHIFT; - struct page *page; + struct folio *folio; while (index <= end_index) { - page = find_get_page(inode->i_mapping, index); - BUG_ON(!page); /* Pages should be in the extent_io_tree */ - __set_page_dirty_nobuffers(page); - account_page_redirty(page); - put_page(page); - index++; + folio = filemap_get_folio(mapping, index); + filemap_dirty_folio(mapping, folio); + folio_account_redirty(folio); + index += folio_nr_pages(folio); + folio_put(folio); } } @@ -2304,12 +2318,13 @@ static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, u64 length, u64 logical, struct page *page, unsigned int pg_offset, int mirror_num) { - struct bio *bio; struct btrfs_device *dev; + struct bio_vec bvec; + struct bio bio; u64 map_length = 0; u64 sector; struct btrfs_io_context *bioc = NULL; - int ret; + int ret = 0; ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); BUG_ON(!mirror_num); @@ -2317,8 +2332,6 @@ static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, if (btrfs_repair_one_zone(fs_info, logical)) return 0; - bio = btrfs_bio_alloc(1); - bio->bi_iter.bi_size = 0; map_length = length; /* @@ -2336,52 +2349,50 @@ static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, */ ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, &map_length, &bioc, 0); - if (ret) { - btrfs_bio_counter_dec(fs_info); - bio_put(bio); - return -EIO; - } + if (ret) + goto out_counter_dec; ASSERT(bioc->mirror_num == 1); } else { ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length, &bioc, mirror_num); - if (ret) { - btrfs_bio_counter_dec(fs_info); - bio_put(bio); - return -EIO; - } + if (ret) + goto out_counter_dec; BUG_ON(mirror_num != bioc->mirror_num); } sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9; - bio->bi_iter.bi_sector = sector; dev = bioc->stripes[bioc->mirror_num - 1].dev; btrfs_put_bioc(bioc); + if (!dev || !dev->bdev || !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { - btrfs_bio_counter_dec(fs_info); - bio_put(bio); - return -EIO; + ret = -EIO; + goto out_counter_dec; } - bio_set_dev(bio, dev->bdev); - bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; - bio_add_page(bio, page, length, pg_offset); - if (btrfsic_submit_bio_wait(bio)) { + bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); + bio.bi_iter.bi_sector = sector; + __bio_add_page(&bio, page, length, pg_offset); + + btrfsic_check_bio(&bio); + ret = submit_bio_wait(&bio); + if (ret) { /* try to remap that extent elsewhere? */ - btrfs_bio_counter_dec(fs_info); - bio_put(bio); btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); - return -EIO; + goto out_bio_uninit; } btrfs_info_rl_in_rcu(fs_info, "read error corrected: ino %llu off %llu (dev %s sector %llu)", ino, start, rcu_str_deref(dev->name), sector); + ret = 0; + +out_bio_uninit: + bio_uninit(&bio); +out_counter_dec: btrfs_bio_counter_dec(fs_info); - bio_put(bio); - return 0; + return ret; } int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) @@ -2528,7 +2539,7 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode failrec->start = start; failrec->len = sectorsize; failrec->this_mirror = 0; - failrec->bio_flags = 0; + failrec->compress_type = BTRFS_COMPRESS_NONE; read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, start, failrec->len); @@ -2552,8 +2563,7 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode logical = em->block_start + logical; if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { logical = em->block_start; - failrec->bio_flags = EXTENT_BIO_COMPRESSED; - extent_set_compress_type(&failrec->bio_flags, em->compress_type); + failrec->compress_type = em->compress_type; } btrfs_debug(fs_info, @@ -2611,6 +2621,7 @@ static bool btrfs_check_repairable(struct inode *inode, * a good copy of the failed sector and if we succeed, we have setup * everything for repair_io_failure to do the rest for us. */ + ASSERT(failed_mirror); failrec->failed_mirror = failed_mirror; failrec->this_mirror++; if (failrec->this_mirror == failed_mirror) @@ -2640,7 +2651,6 @@ int btrfs_repair_one_sector(struct inode *inode, const int icsum = bio_offset >> fs_info->sectorsize_bits; struct bio *repair_bio; struct btrfs_bio *repair_bbio; - blk_status_t status; btrfs_debug(fs_info, "repair read error: read error at %llu", start); @@ -2659,6 +2669,7 @@ int btrfs_repair_one_sector(struct inode *inode, repair_bio = btrfs_bio_alloc(1); repair_bbio = btrfs_bio(repair_bio); + repair_bbio->file_offset = start; repair_bio->bi_opf = REQ_OP_READ; repair_bio->bi_end_io = failed_bio->bi_end_io; repair_bio->bi_iter.bi_sector = failrec->logical >> 9; @@ -2679,13 +2690,13 @@ int btrfs_repair_one_sector(struct inode *inode, "repair read error: submitting new read to mirror %d", failrec->this_mirror); - status = submit_bio_hook(inode, repair_bio, failrec->this_mirror, - failrec->bio_flags); - if (status) { - free_io_failure(failure_tree, tree, failrec); - bio_put(repair_bio); - } - return blk_status_to_errno(status); + /* + * At this point we have a bio, so any errors from submit_bio_hook() + * will be handled by the endio on the repair_bio, so we can't return an + * error here. + */ + submit_bio_hook(inode, repair_bio, failrec->this_mirror, failrec->compress_type); + return BLK_STS_OK; } static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) @@ -2710,18 +2721,19 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) btrfs_page_set_error(fs_info, page, start, len); } - if (fs_info->sectorsize == PAGE_SIZE) + if (!btrfs_is_subpage(fs_info, page)) unlock_page(page); else btrfs_subpage_end_reader(fs_info, page, start, len); } -static blk_status_t submit_read_repair(struct inode *inode, - struct bio *failed_bio, u32 bio_offset, - struct page *page, unsigned int pgoff, - u64 start, u64 end, int failed_mirror, - unsigned int error_bitmap, - submit_bio_hook_t *submit_bio_hook) +static blk_status_t submit_data_read_repair(struct inode *inode, + struct bio *failed_bio, + u32 bio_offset, struct page *page, + unsigned int pgoff, + u64 start, u64 end, + int failed_mirror, + unsigned int error_bitmap) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const u32 sectorsize = fs_info->sectorsize; @@ -2731,6 +2743,9 @@ static blk_status_t submit_read_repair(struct inode *inode, BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); + /* This repair is only for data */ + ASSERT(is_data_inode(inode)); + /* We're here because we had some read errors or csum mismatch */ ASSERT(error_bitmap); @@ -2759,7 +2774,7 @@ static blk_status_t submit_read_repair(struct inode *inode, ret = btrfs_repair_one_sector(inode, failed_bio, bio_offset + offset, page, pgoff + offset, start + offset, - failed_mirror, submit_bio_hook); + failed_mirror, btrfs_submit_data_bio); if (!ret) { /* * We have submitted the read repair, the page release @@ -2943,7 +2958,7 @@ update: static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) { ASSERT(PageLocked(page)); - if (fs_info->sectorsize == PAGE_SIZE) + if (!btrfs_is_subpage(fs_info, page)) return; ASSERT(PagePrivate(page)); @@ -2951,7 +2966,7 @@ static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) } /* - * Find extent buffer for a givne bytenr. + * Find extent buffer for a given bytenr. * * This is for end_bio_extent_readpage(), thus we can't do any unsafe locking * in endio context. @@ -2965,16 +2980,14 @@ static struct extent_buffer *find_extent_buffer_readpage( * For regular sectorsize, we can use page->private to grab extent * buffer */ - if (fs_info->sectorsize == PAGE_SIZE) { + if (fs_info->nodesize >= PAGE_SIZE) { ASSERT(PagePrivate(page) && page->private); return (struct extent_buffer *)page->private; } - /* For subpage case, we need to lookup buffer radix tree */ - rcu_read_lock(); - eb = radix_tree_lookup(&fs_info->buffer_radix, - bytenr >> fs_info->sectorsize_bits); - rcu_read_unlock(); + /* For subpage case, we need to lookup extent buffer xarray */ + eb = xa_load(&fs_info->extent_buffers, + bytenr >> fs_info->sectorsize_bits); ASSERT(eb); return eb; } @@ -3069,13 +3082,21 @@ static void end_bio_extent_readpage(struct bio *bio) if (is_data_inode(inode)) { /* - * btrfs_submit_read_repair() will handle all the good + * If we failed to submit the IO at all we'll have a + * mirror_num == 0, in which case we need to just mark + * the page with an error and unlock it and carry on. + */ + if (mirror == 0) + goto readpage_ok; + + /* + * submit_data_read_repair() will handle all the good * and bad sectors, we just continue to the next bvec. */ - submit_read_repair(inode, bio, bio_offset, page, - start - page_offset(page), start, - end, mirror, error_bitmap, - btrfs_submit_data_bio); + submit_data_read_repair(inode, bio, bio_offset, page, + start - page_offset(page), + start, end, mirror, + error_bitmap); ASSERT(bio_offset + len > bio_offset); bio_offset += len; @@ -3124,6 +3145,42 @@ readpage_ok: bio_put(bio); } +/** + * Populate every free slot in a provided array with pages. + * + * @nr_pages: number of pages to allocate + * @page_array: the array to fill with pages; any existing non-null entries in + * the array will be skipped + * + * Return: 0 if all pages were able to be allocated; + * -ENOMEM otherwise, and the caller is responsible for freeing all + * non-null page pointers in the array. + */ +int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array) +{ + unsigned int allocated; + + for (allocated = 0; allocated < nr_pages;) { + unsigned int last = allocated; + + allocated = alloc_pages_bulk_array(GFP_NOFS, nr_pages, page_array); + + if (allocated == nr_pages) + return 0; + + /* + * During this iteration, no page could be allocated, even + * though alloc_pages_bulk_array() falls back to alloc_page() + * if it could not bulk-allocate. So we must be out of memory. + */ + if (allocated == last) + return -ENOMEM; + + memalloc_retry_wait(GFP_NOFS); + } + return 0; +} + /* * Initialize the members up to but not including 'bio'. Use after allocating a * new bio by bio_alloc_bioset as it does not initialize the bytes outside of @@ -3144,18 +3201,18 @@ struct bio *btrfs_bio_alloc(unsigned int nr_iovecs) struct bio *bio; ASSERT(0 < nr_iovecs && nr_iovecs <= BIO_MAX_VECS); - bio = bio_alloc_bioset(GFP_NOFS, nr_iovecs, &btrfs_bioset); + bio = bio_alloc_bioset(NULL, nr_iovecs, 0, GFP_NOFS, &btrfs_bioset); btrfs_bio_init(btrfs_bio(bio)); return bio; } -struct bio *btrfs_bio_clone(struct bio *bio) +struct bio *btrfs_bio_clone(struct block_device *bdev, struct bio *bio) { struct btrfs_bio *bbio; struct bio *new; /* Bio allocation backed by a bioset does not fail */ - new = bio_clone_fast(bio, GFP_NOFS, &btrfs_bioset); + new = bio_alloc_clone(bdev, bio, GFP_NOFS, &btrfs_bioset); bbio = btrfs_bio(new); btrfs_bio_init(bbio); bbio->iter = bio->bi_iter; @@ -3170,7 +3227,7 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size) ASSERT(offset <= UINT_MAX && size <= UINT_MAX); /* this will never fail when it's backed by a bioset */ - bio = bio_clone_fast(orig, GFP_NOFS, &btrfs_bioset); + bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset); ASSERT(bio); bbio = btrfs_bio(bio); @@ -3190,7 +3247,7 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size) * a contiguous page to the previous one * @size: portion of page that we want to write * @pg_offset: starting offset in the page - * @bio_flags: flags of the current bio to see if we can merge them + * @compress_type: compression type of the current bio to see if we can merge them * * Attempt to add a page to bio considering stripe alignment etc. * @@ -3202,7 +3259,7 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, struct page *page, u64 disk_bytenr, unsigned int size, unsigned int pg_offset, - unsigned long bio_flags) + enum btrfs_compression_type compress_type) { struct bio *bio = bio_ctrl->bio; u32 bio_size = bio->bi_iter.bi_size; @@ -3214,10 +3271,10 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, ASSERT(bio); /* The limit should be calculated when bio_ctrl->bio is allocated */ ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary); - if (bio_ctrl->bio_flags != bio_flags) + if (bio_ctrl->compress_type != compress_type) return 0; - if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED) + if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) contig = bio->bi_iter.bi_sector == sector; else contig = bio_end_sector(bio) == sector; @@ -3260,7 +3317,7 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, * The split happens for real compressed bio, which happens in * btrfs_submit_compressed_read/write(). */ - if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED) { + if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { bio_ctrl->len_to_oe_boundary = U32_MAX; bio_ctrl->len_to_stripe_boundary = U32_MAX; return 0; @@ -3303,7 +3360,7 @@ static int alloc_new_bio(struct btrfs_inode *inode, unsigned int opf, bio_end_io_t end_io_func, u64 disk_bytenr, u32 offset, u64 file_offset, - unsigned long bio_flags) + enum btrfs_compression_type compress_type) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *bio; @@ -3314,37 +3371,49 @@ static int alloc_new_bio(struct btrfs_inode *inode, * For compressed page range, its disk_bytenr is always @disk_bytenr * passed in, no matter if we have added any range into previous bio. */ - if (bio_flags & EXTENT_BIO_COMPRESSED) + if (compress_type != BTRFS_COMPRESS_NONE) bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; else bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT; bio_ctrl->bio = bio; - bio_ctrl->bio_flags = bio_flags; + bio_ctrl->compress_type = compress_type; bio->bi_end_io = end_io_func; bio->bi_private = &inode->io_tree; - bio->bi_write_hint = inode->vfs_inode.i_write_hint; bio->bi_opf = opf; ret = calc_bio_boundaries(bio_ctrl, inode, file_offset); if (ret < 0) goto error; - if (wbc) { - struct block_device *bdev; - bdev = fs_info->fs_devices->latest_dev->bdev; - bio_set_dev(bio, bdev); - wbc_init_bio(wbc, bio); - } - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - struct btrfs_device *device; + if (wbc) { + /* + * For Zone append we need the correct block_device that we are + * going to write to set in the bio to be able to respect the + * hardware limitation. Look it up here: + */ + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + struct btrfs_device *dev; + + dev = btrfs_zoned_get_device(fs_info, disk_bytenr, + fs_info->sectorsize); + if (IS_ERR(dev)) { + ret = PTR_ERR(dev); + goto error; + } - device = btrfs_zoned_get_device(fs_info, disk_bytenr, - fs_info->sectorsize); - if (IS_ERR(device)) { - ret = PTR_ERR(device); - goto error; + bio_set_dev(bio, dev->bdev); + } else { + /* + * Otherwise pick the last added device to support + * cgroup writeback. For multi-device file systems this + * means blk-cgroup policies have to always be set on the + * last added/replaced device. This is a bit odd but has + * been like that for a long time. + */ + bio_set_dev(bio, fs_info->fs_devices->latest_dev->bdev); } - - btrfs_bio(bio)->device = device; + wbc_init_bio(wbc, bio); + } else { + ASSERT(bio_op(bio) != REQ_OP_ZONE_APPEND); } return 0; error: @@ -3366,7 +3435,7 @@ error: * @end_io_func: end_io callback for new bio * @mirror_num: desired mirror to read/write * @prev_bio_flags: flags of previous bio to see if we can merge the current one - * @bio_flags: flags of the current bio to see if we can merge them + * @compress_type: compress type for current bio */ static int submit_extent_page(unsigned int opf, struct writeback_control *wbc, @@ -3375,7 +3444,7 @@ static int submit_extent_page(unsigned int opf, size_t size, unsigned long pg_offset, bio_end_io_t end_io_func, int mirror_num, - unsigned long bio_flags, + enum btrfs_compression_type compress_type, bool force_bio_submit) { int ret = 0; @@ -3387,10 +3456,8 @@ static int submit_extent_page(unsigned int opf, ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE && pg_offset + size <= PAGE_SIZE); if (force_bio_submit && bio_ctrl->bio) { - ret = submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->bio_flags); + submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->compress_type); bio_ctrl->bio = NULL; - if (ret < 0) - return ret; } while (cur < pg_offset + size) { @@ -3402,7 +3469,7 @@ static int submit_extent_page(unsigned int opf, ret = alloc_new_bio(inode, bio_ctrl, wbc, opf, end_io_func, disk_bytenr, offset, page_offset(page) + cur, - bio_flags); + compress_type); if (ret < 0) return ret; } @@ -3410,14 +3477,14 @@ static int submit_extent_page(unsigned int opf, * We must go through btrfs_bio_add_page() to ensure each * page range won't cross various boundaries. */ - if (bio_flags & EXTENT_BIO_COMPRESSED) + if (compress_type != BTRFS_COMPRESS_NONE) added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr, size - offset, pg_offset + offset, - bio_flags); + compress_type); else added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr + offset, size - offset, - pg_offset + offset, bio_flags); + pg_offset + offset, compress_type); /* Metadata page range should never be split */ if (!is_data_inode(&inode->vfs_inode)) @@ -3431,11 +3498,8 @@ static int submit_extent_page(unsigned int opf, if (added < size - offset) { /* The bio should contain some page(s) */ ASSERT(bio_ctrl->bio->bi_iter.bi_size); - ret = submit_one_bio(bio_ctrl->bio, mirror_num, - bio_ctrl->bio_flags); + submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->compress_type); bio_ctrl->bio = NULL; - if (ret < 0) - return ret; } cur += added; } @@ -3458,7 +3522,7 @@ static int attach_extent_buffer_page(struct extent_buffer *eb, if (page->mapping) lockdep_assert_held(&page->mapping->private_lock); - if (fs_info->sectorsize == PAGE_SIZE) { + if (fs_info->nodesize >= PAGE_SIZE) { if (!PagePrivate(page)) attach_page_private(page, eb); else @@ -3493,7 +3557,7 @@ int set_page_extent_mapped(struct page *page) fs_info = btrfs_sb(page->mapping->host->i_sb); - if (fs_info->sectorsize < PAGE_SIZE) + if (btrfs_is_subpage(fs_info, page)) return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA); attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE); @@ -3510,7 +3574,7 @@ void clear_page_extent_mapped(struct page *page) return; fs_info = btrfs_sb(page->mapping->host->i_sb); - if (fs_info->sectorsize < PAGE_SIZE) + if (btrfs_is_subpage(fs_info, page)) return btrfs_detach_subpage(fs_info, page); detach_page_private(page); @@ -3535,7 +3599,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, } em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len); - if (em_cached && !IS_ERR_OR_NULL(em)) { + if (em_cached && !IS_ERR(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); *em_cached = em; @@ -3549,7 +3613,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, * XXX JDM: This needs looking at to ensure proper page locking * return 0 on success, otherwise return error */ -int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, +static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, struct btrfs_bio_ctrl *bio_ctrl, unsigned int read_flags, u64 *prev_em_start) { @@ -3564,7 +3628,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, u64 cur_end; struct extent_map *em; int ret = 0; - int nr = 0; size_t pg_offset = 0; size_t iosize; size_t blocksize = inode->i_sb->s_blocksize; @@ -3578,15 +3641,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, goto out; } - if (!PageUptodate(page)) { - if (cleancache_get_page(page) == 0) { - BUG_ON(blocksize != PAGE_SIZE); - unlock_extent(tree, start, end); - unlock_page(page); - goto out; - } - } - if (page->index == last_byte >> PAGE_SHIFT) { size_t zero_offset = offset_in_page(last_byte); @@ -3618,25 +3672,23 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, } em = __get_extent_map(inode, page, pg_offset, cur, end - cur + 1, em_cached); - if (IS_ERR_OR_NULL(em)) { + if (IS_ERR(em)) { unlock_extent(tree, cur, end); end_page_read(page, false, cur, end + 1 - cur); + ret = PTR_ERR(em); break; } extent_offset = cur - em->start; BUG_ON(extent_map_end(em) <= cur); BUG_ON(end < cur); - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { - this_bio_flag |= EXTENT_BIO_COMPRESSED; - extent_set_compress_type(&this_bio_flag, - em->compress_type); - } + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + this_bio_flag = em->compress_type; iosize = min(extent_map_end(em) - cur, end - cur + 1); cur_end = min(extent_map_end(em) - 1, end); iosize = ALIGN(iosize, blocksize); - if (this_bio_flag & EXTENT_BIO_COMPRESSED) + if (this_bio_flag != BTRFS_COMPRESS_NONE) disk_bytenr = em->block_start; else disk_bytenr = em->block_start + extent_offset; @@ -3731,11 +3783,13 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, end_bio_extent_readpage, 0, this_bio_flag, force_bio_submit); - if (!ret) { - nr++; - } else { - unlock_extent(tree, cur, cur + iosize - 1); - end_page_read(page, false, cur, iosize); + if (ret) { + /* + * We have to unlock the remaining range, or the page + * will never be unlocked. + */ + unlock_extent(tree, cur, end); + end_page_read(page, false, cur, end + 1 - cur); goto out; } cur = cur + iosize; @@ -3745,6 +3799,27 @@ out: return ret; } +int btrfs_read_folio(struct file *file, struct folio *folio) +{ + struct page *page = &folio->page; + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + u64 start = page_offset(page); + u64 end = start + PAGE_SIZE - 1; + struct btrfs_bio_ctrl bio_ctrl = { 0 }; + int ret; + + btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); + + ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL); + /* + * If btrfs_do_readpage() failed we will want to submit the assembled + * bio to do the cleanup. + */ + if (bio_ctrl.bio) + submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.compress_type); + return ret; +} + static inline void contiguous_readpages(struct page *pages[], int nr_pages, u64 start, u64 end, struct extent_map **em_cached, @@ -3763,12 +3838,6 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages, } } -static void update_nr_written(struct writeback_control *wbc, - unsigned long nr_written) -{ - wbc->nr_to_write -= nr_written; -} - /* * helper for __extent_writepage, doing all of the delayed allocation setup. * @@ -3868,7 +3937,7 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info, * For regular sector size == page size case, since one page only * contains one sector, we return the page offset directly. */ - if (fs_info->sectorsize == PAGE_SIZE) { + if (!btrfs_is_subpage(fs_info, page)) { *start = page_offset(page); *end = page_offset(page) + PAGE_SIZE; return; @@ -3911,10 +3980,12 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, u64 extent_offset; u64 block_start; struct extent_map *em; + int saved_ret = 0; int ret = 0; int nr = 0; u32 opf = REQ_OP_WRITE; const unsigned int write_flags = wbc_to_write_flags(wbc); + bool has_error = false; bool compressed; ret = btrfs_writepage_cow_fixup(page); @@ -3929,7 +4000,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * we don't want to touch the inode after unlocking the page, * so we update the mapping writeback index now */ - update_nr_written(wbc, 1); + wbc->nr_to_write--; while (cur <= end) { u64 disk_bytenr; @@ -3961,9 +4032,12 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, } em = btrfs_get_extent(inode, NULL, 0, cur, end - cur + 1); - if (IS_ERR_OR_NULL(em)) { + if (IS_ERR(em)) { btrfs_page_set_error(fs_info, page, cur, end - cur + 1); ret = PTR_ERR_OR_ZERO(em); + has_error = true; + if (!saved_ret) + saved_ret = ret; break; } @@ -4027,6 +4101,10 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, end_bio_extent_writepage, 0, 0, false); if (ret) { + has_error = true; + if (!saved_ret) + saved_ret = ret; + btrfs_page_set_error(fs_info, page, cur, iosize); if (PageWriteback(page)) btrfs_page_clear_writeback(fs_info, page, cur, @@ -4040,8 +4118,10 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * If we finish without problem, we should not only clear page dirty, * but also empty subpage dirty bits */ - if (!ret) + if (!has_error) btrfs_page_assert_not_dirty(fs_info, page); + else + ret = saved_ret; *nr_ret = nr; return ret; } @@ -4058,6 +4138,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, static int __extent_writepage(struct page *page, struct writeback_control *wbc, struct extent_page_data *epd) { + struct folio *folio = page_folio(page); struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const u64 page_start = page_offset(page); @@ -4078,8 +4159,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, pg_offset = offset_in_page(i_size); if (page->index > end_index || (page->index == end_index && !pg_offset)) { - page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); - unlock_page(page); + folio_invalidate(folio, 0, folio_size(folio)); + folio_unlock(folio); return 0; } @@ -4171,9 +4252,6 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb) static void end_extent_buffer_writeback(struct extent_buffer *eb) { - if (test_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags)) - btrfs_zone_finish_endio(eb->fs_info, eb->start, eb->len); - clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); smp_mb__after_atomic(); wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); @@ -4193,14 +4271,12 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb struct extent_page_data *epd) { struct btrfs_fs_info *fs_info = eb->fs_info; - int i, num_pages, failed_page_nr; + int i, num_pages; int flush = 0; int ret = 0; if (!btrfs_try_tree_write_lock(eb)) { - ret = flush_write_bio(epd); - if (ret < 0) - return ret; + flush_write_bio(epd); flush = 1; btrfs_tree_lock(eb); } @@ -4210,9 +4286,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb if (!epd->sync_io) return 0; if (!flush) { - ret = flush_write_bio(epd); - if (ret < 0) - return ret; + flush_write_bio(epd); flush = 1; } while (1) { @@ -4250,7 +4324,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb * Subpage metadata doesn't use page locking at all, so we can skip * the page locking. */ - if (!ret || fs_info->sectorsize < PAGE_SIZE) + if (!ret || fs_info->nodesize < PAGE_SIZE) return ret; num_pages = num_extent_pages(eb); @@ -4259,14 +4333,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb if (!trylock_page(p)) { if (!flush) { - int err; - - err = flush_write_bio(epd); - if (err < 0) { - ret = err; - failed_page_nr = i; - goto err_unlock; - } + flush_write_bio(epd); flush = 1; } lock_page(p); @@ -4274,25 +4341,6 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb } return ret; -err_unlock: - /* Unlock already locked pages */ - for (i = 0; i < failed_page_nr; i++) - unlock_page(eb->pages[i]); - /* - * Clear EXTENT_BUFFER_WRITEBACK and wake up anyone waiting on it. - * Also set back EXTENT_BUFFER_DIRTY so future attempts to this eb can - * be made and undo everything done before. - */ - btrfs_tree_lock(eb); - spin_lock(&eb->refs_lock); - set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); - end_extent_buffer_writeback(eb); - spin_unlock(&eb->refs_lock); - percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len, - fs_info->dirty_metadata_batch); - btrfs_clear_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); - btrfs_tree_unlock(eb); - return ret; } static void set_btree_ioerr(struct page *page, struct extent_buffer *eb) @@ -4387,8 +4435,8 @@ static struct extent_buffer *find_extent_buffer_nolock( struct extent_buffer *eb; rcu_read_lock(); - eb = radix_tree_lookup(&fs_info->buffer_radix, - start >> fs_info->sectorsize_bits); + eb = xa_load(&fs_info->extent_buffers, + start >> fs_info->sectorsize_bits); if (eb && atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); return eb; @@ -4410,7 +4458,7 @@ static void end_bio_subpage_eb_writepage(struct bio *bio) struct bvec_iter_all iter_all; fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb); - ASSERT(fs_info->sectorsize < PAGE_SIZE); + ASSERT(fs_info->nodesize < PAGE_SIZE); ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, bio, iter_all) { @@ -4562,7 +4610,7 @@ static int write_one_subpage_eb(struct extent_buffer *eb, * dirty anymore, we have submitted a page. Update nr_written in wbc. */ if (no_dirty_ebs) - update_nr_written(wbc, 1); + wbc->nr_to_write--; return ret; } @@ -4598,7 +4646,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, break; } disk_bytenr += PAGE_SIZE; - update_nr_written(wbc, 1); + wbc->nr_to_write--; unlock_page(p); } @@ -4737,7 +4785,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, if (!PagePrivate(page)) return 0; - if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE) + if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) return submit_eb_subpage(page, wbc, epd); spin_lock(&mapping->private_lock); @@ -4790,11 +4838,11 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, return ret; } if (cache) { - /* Impiles write in zoned mode */ + /* + * Implies write in zoned mode. Mark the last eb in a block group. + */ + btrfs_schedule_zone_finish_bg(cache, eb); btrfs_put_block_group(cache); - /* Mark the last eb in a block group */ - if (cache->seq_zone && eb->start + eb->len == cache->zone_capacity) - set_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags); } ret = write_one_eb(eb, wbc, epd); free_extent_buffer(eb); @@ -4912,13 +4960,19 @@ retry: * if the fs already has error. */ if (!BTRFS_FS_ERROR(fs_info)) { - ret = flush_write_bio(&epd); + flush_write_bio(&epd); } else { ret = -EROFS; end_write_bio(&epd, ret); } out: btrfs_zoned_meta_io_unlock(fs_info); + /* + * We can get ret > 0 from submit_extent_page() indicating how many ebs + * were submitted. Reset it to 0 to avoid false alerts for the caller. + */ + if (ret > 0) + ret = 0; return ret; } @@ -5020,8 +5074,7 @@ retry: * tmpfs file mapping */ if (!trylock_page(page)) { - ret = flush_write_bio(epd); - BUG_ON(ret < 0); + flush_write_bio(epd); lock_page(page); } @@ -5031,10 +5084,8 @@ retry: } if (wbc->sync_mode != WB_SYNC_NONE) { - if (PageWriteback(page)) { - ret = flush_write_bio(epd); - BUG_ON(ret < 0); - } + if (PageWriteback(page)) + flush_write_bio(epd); wait_on_page_writeback(page); } @@ -5074,9 +5125,8 @@ retry: * page in our current bio, and thus deadlock, so flush the * write bio here. */ - ret = flush_write_bio(epd); - if (!ret) - goto retry; + flush_write_bio(epd); + goto retry; } if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) @@ -5102,8 +5152,7 @@ int extent_write_full_page(struct page *page, struct writeback_control *wbc) return ret; } - ret = flush_write_bio(&epd); - ASSERT(ret <= 0); + flush_write_bio(&epd); return ret; } @@ -5165,7 +5214,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end) } if (!found_error) - ret = flush_write_bio(&epd); + flush_write_bio(&epd); else end_write_bio(&epd, ret); @@ -5192,13 +5241,14 @@ int extent_writepages(struct address_space *mapping, */ btrfs_zoned_data_reloc_lock(BTRFS_I(inode)); ret = extent_write_cache_pages(mapping, wbc, &epd); - btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); ASSERT(ret <= 0); if (ret < 0) { + btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); end_write_bio(&epd, ret); return ret; } - ret = flush_write_bio(&epd); + flush_write_bio(&epd); + btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); return ret; } @@ -5221,24 +5271,22 @@ void extent_readahead(struct readahead_control *rac) if (em_cached) free_extent_map(em_cached); - if (bio_ctrl.bio) { - if (submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags)) - return; - } + if (bio_ctrl.bio) + submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.compress_type); } /* - * basic invalidatepage code, this waits on any locked or writeback - * ranges corresponding to the page, and then deletes any extent state + * basic invalidate_folio code, this waits on any locked or writeback + * ranges corresponding to the folio, and then deletes any extent state * records from the tree */ -int extent_invalidatepage(struct extent_io_tree *tree, - struct page *page, unsigned long offset) +int extent_invalidate_folio(struct extent_io_tree *tree, + struct folio *folio, size_t offset) { struct extent_state *cached_state = NULL; - u64 start = page_offset(page); - u64 end = start + PAGE_SIZE - 1; - size_t blocksize = page->mapping->host->i_sb->s_blocksize; + u64 start = folio_pos(folio); + u64 end = start + folio_size(folio) - 1; + size_t blocksize = folio->mapping->host->i_sb->s_blocksize; /* This function is only called for the btree inode */ ASSERT(tree->owner == IO_TREE_BTREE_INODE_IO); @@ -5248,7 +5296,7 @@ int extent_invalidatepage(struct extent_io_tree *tree, return 0; lock_extent_bits(tree, start, end, &cached_state); - wait_on_page_writeback(page); + folio_wait_writeback(folio); /* * Currently for btree io tree, only EXTENT_LOCKED is utilized, @@ -5260,7 +5308,7 @@ int extent_invalidatepage(struct extent_io_tree *tree, } /* - * a helper for releasepage, this tests for areas of the page that + * a helper for release_folio, this tests for areas of the page that * are locked or under IO and drops the related state bits if it is safe * to drop the page. */ @@ -5296,7 +5344,7 @@ static int try_release_extent_state(struct extent_io_tree *tree, } /* - * a helper for releasepage. As long as there are no locked extents + * a helper for release_folio. As long as there are no locked extents * in the range corresponding to the page, both state records and extent * map records are removed */ @@ -5400,7 +5448,7 @@ static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode, break; len = ALIGN(len, sectorsize); em = btrfs_get_extent_fiemap(inode, offset, len); - if (IS_ERR_OR_NULL(em)) + if (IS_ERR(em)) return em; /* if this isn't a hole return it */ @@ -5793,7 +5841,7 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag return; } - if (fs_info->sectorsize == PAGE_SIZE) { + if (fs_info->nodesize >= PAGE_SIZE) { /* * We do this since we'll remove the pages after we've * removed the eb from the radix tree, so we could race @@ -5900,9 +5948,9 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) { int i; - struct page *p; struct extent_buffer *new; int num_pages = num_extent_pages(src); + int ret; new = __alloc_extent_buffer(src->fs_info, src->start, src->len); if (new == NULL) @@ -5915,22 +5963,23 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) */ set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); + memset(new->pages, 0, sizeof(*new->pages) * num_pages); + ret = btrfs_alloc_page_array(num_pages, new->pages); + if (ret) { + btrfs_release_extent_buffer(new); + return NULL; + } + for (i = 0; i < num_pages; i++) { int ret; + struct page *p = new->pages[i]; - p = alloc_page(GFP_NOFS); - if (!p) { - btrfs_release_extent_buffer(new); - return NULL; - } ret = attach_extent_buffer_page(new, p, NULL); if (ret < 0) { - put_page(p); btrfs_release_extent_buffer(new); return NULL; } WARN_ON(PageDirty(p)); - new->pages[i] = p; copy_page(page_address(p), page_address(src->pages[i])); } set_extent_buffer_uptodate(new); @@ -5944,31 +5993,36 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, struct extent_buffer *eb; int num_pages; int i; + int ret; eb = __alloc_extent_buffer(fs_info, start, len); if (!eb) return NULL; num_pages = num_extent_pages(eb); + ret = btrfs_alloc_page_array(num_pages, eb->pages); + if (ret) + goto err; + for (i = 0; i < num_pages; i++) { - int ret; + struct page *p = eb->pages[i]; - eb->pages[i] = alloc_page(GFP_NOFS); - if (!eb->pages[i]) - goto err; - ret = attach_extent_buffer_page(eb, eb->pages[i], NULL); + ret = attach_extent_buffer_page(eb, p, NULL); if (ret < 0) goto err; } + set_extent_buffer_uptodate(eb); btrfs_set_header_nritems(eb, 0); set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); return eb; err: - for (; i > 0; i--) { - detach_extent_buffer_page(eb, eb->pages[i - 1]); - __free_page(eb->pages[i - 1]); + for (i = 0; i < num_pages; i++) { + if (eb->pages[i]) { + detach_extent_buffer_page(eb, eb->pages[i]); + __free_page(eb->pages[i]); + } } __free_extent_buffer(eb); return NULL; @@ -5990,10 +6044,10 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) * * It is only cleared in two cases: freeing the last non-tree * reference to the extent_buffer when its STALE bit is set or - * calling releasepage when the tree reference is the only reference. + * calling release_folio when the tree reference is the only reference. * * In both cases, care is taken to ensure that the extent_buffer's - * pages are not under io. However, releasepage can be concurrently + * pages are not under io. However, release_folio can be concurrently * called with creating new references, which is prone to race * conditions between the calls to check_buffer_tree_ref in those * codepaths and clearing TREE_REF in try_release_extent_buffer. @@ -6075,24 +6129,22 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, if (!eb) return ERR_PTR(-ENOMEM); eb->fs_info = fs_info; -again: - ret = radix_tree_preload(GFP_NOFS); - if (ret) { - exists = ERR_PTR(ret); - goto free_eb; - } - spin_lock(&fs_info->buffer_lock); - ret = radix_tree_insert(&fs_info->buffer_radix, - start >> fs_info->sectorsize_bits, eb); - spin_unlock(&fs_info->buffer_lock); - radix_tree_preload_end(); - if (ret == -EEXIST) { - exists = find_extent_buffer(fs_info, start); - if (exists) + + do { + ret = xa_insert(&fs_info->extent_buffers, + start >> fs_info->sectorsize_bits, + eb, GFP_NOFS); + if (ret == -ENOMEM) { + exists = ERR_PTR(ret); goto free_eb; - else - goto again; - } + } + if (ret == -EBUSY) { + exists = find_extent_buffer(fs_info, start); + if (exists) + goto free_eb; + } + } while (ret); + check_buffer_tree_ref(eb); set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); @@ -6113,7 +6165,7 @@ static struct extent_buffer *grab_extent_buffer( * don't try to insert two ebs for the same bytenr. So here we always * return NULL and just continue. */ - if (fs_info->sectorsize < PAGE_SIZE) + if (fs_info->nodesize < PAGE_SIZE) return NULL; /* Page not yet attached to an extent buffer */ @@ -6135,6 +6187,30 @@ static struct extent_buffer *grab_extent_buffer( return NULL; } +static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) +{ + if (!IS_ALIGNED(start, fs_info->sectorsize)) { + btrfs_err(fs_info, "bad tree block start %llu", start); + return -EINVAL; + } + + if (fs_info->nodesize < PAGE_SIZE && + offset_in_page(start) + fs_info->nodesize > PAGE_SIZE) { + btrfs_err(fs_info, + "tree block crosses page boundary, start %llu nodesize %u", + start, fs_info->nodesize); + return -EINVAL; + } + if (fs_info->nodesize >= PAGE_SIZE && + !IS_ALIGNED(start, PAGE_SIZE)) { + btrfs_err(fs_info, + "tree block is not page aligned, start %llu nodesize %u", + start, fs_info->nodesize); + return -EINVAL; + } + return 0; +} + struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, u64 owner_root, int level) { @@ -6149,10 +6225,8 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, int uptodate = 1; int ret; - if (!IS_ALIGNED(start, fs_info->sectorsize)) { - btrfs_err(fs_info, "bad tree block start %llu", start); + if (check_eb_alignment(fs_info, start)) return ERR_PTR(-EINVAL); - } #if BITS_PER_LONG == 32 if (start >= MAX_LFS_FILESIZE) { @@ -6165,14 +6239,6 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, btrfs_warn_32bit_limit(fs_info); #endif - if (fs_info->sectorsize < PAGE_SIZE && - offset_in_page(start) + len > PAGE_SIZE) { - btrfs_err(fs_info, - "tree block crosses page boundary, start %llu nodesize %lu", - start, len); - return ERR_PTR(-EINVAL); - } - eb = find_extent_buffer(fs_info, start); if (eb) return eb; @@ -6202,7 +6268,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, * page, but it may change in the future for 16K page size * support, so we still preallocate the memory in the loop. */ - if (fs_info->sectorsize < PAGE_SIZE) { + if (fs_info->nodesize < PAGE_SIZE) { prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA); if (IS_ERR(prealloc)) { ret = PTR_ERR(prealloc); @@ -6246,39 +6312,36 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, /* * We can't unlock the pages just yet since the extent buffer * hasn't been properly inserted in the radix tree, this - * opens a race with btree_releasepage which can free a page + * opens a race with btree_release_folio which can free a page * while we are still filling in all pages for the buffer and * we could crash. */ } if (uptodate) set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); -again: - ret = radix_tree_preload(GFP_NOFS); - if (ret) { - exists = ERR_PTR(ret); - goto free_eb; - } - - spin_lock(&fs_info->buffer_lock); - ret = radix_tree_insert(&fs_info->buffer_radix, - start >> fs_info->sectorsize_bits, eb); - spin_unlock(&fs_info->buffer_lock); - radix_tree_preload_end(); - if (ret == -EEXIST) { - exists = find_extent_buffer(fs_info, start); - if (exists) + + do { + ret = xa_insert(&fs_info->extent_buffers, + start >> fs_info->sectorsize_bits, + eb, GFP_NOFS); + if (ret == -ENOMEM) { + exists = ERR_PTR(ret); goto free_eb; - else - goto again; - } + } + if (ret == -EBUSY) { + exists = find_extent_buffer(fs_info, start); + if (exists) + goto free_eb; + } + } while (ret); + /* add one reference for the tree */ check_buffer_tree_ref(eb); set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); /* * Now it's safe to unlock the pages because any calls to - * btree_releasepage will correctly detect that a page belongs to a + * btree_release_folio will correctly detect that a page belongs to a * live buffer and won't free them prematurely. */ for (i = 0; i < num_pages; i++) @@ -6316,10 +6379,8 @@ static int release_extent_buffer(struct extent_buffer *eb) spin_unlock(&eb->refs_lock); - spin_lock(&fs_info->buffer_lock); - radix_tree_delete(&fs_info->buffer_radix, - eb->start >> fs_info->sectorsize_bits); - spin_unlock(&fs_info->buffer_lock); + xa_erase(&fs_info->extent_buffers, + eb->start >> fs_info->sectorsize_bits); } else { spin_unlock(&eb->refs_lock); } @@ -6421,7 +6482,7 @@ void clear_extent_buffer_dirty(const struct extent_buffer *eb) int num_pages; struct page *page; - if (eb->fs_info->sectorsize < PAGE_SIZE) + if (eb->fs_info->nodesize < PAGE_SIZE) return clear_subpage_extent_buffer_dirty(eb); num_pages = num_extent_pages(eb); @@ -6453,7 +6514,7 @@ bool set_extent_buffer_dirty(struct extent_buffer *eb) WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); if (!was_dirty) { - bool subpage = eb->fs_info->sectorsize < PAGE_SIZE; + bool subpage = eb->fs_info->nodesize < PAGE_SIZE; /* * For subpage case, we can have other extent buffers in the @@ -6493,9 +6554,18 @@ void clear_extent_buffer_uptodate(struct extent_buffer *eb) num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { page = eb->pages[i]; - if (page) - btrfs_page_clear_uptodate(fs_info, page, - eb->start, eb->len); + if (!page) + continue; + + /* + * This is special handling for metadata subpage, as regular + * btrfs_is_subpage() can not handle cloned/dummy metadata. + */ + if (fs_info->nodesize >= PAGE_SIZE) + ClearPageUptodate(page); + else + btrfs_subpage_clear_uptodate(fs_info, page, eb->start, + eb->len); } } @@ -6510,7 +6580,16 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb) num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { page = eb->pages[i]; - btrfs_page_set_uptodate(fs_info, page, eb->start, eb->len); + + /* + * This is special handling for metadata subpage, as regular + * btrfs_is_subpage() can not handle cloned/dummy metadata. + */ + if (fs_info->nodesize >= PAGE_SIZE) + SetPageUptodate(page); + else + btrfs_subpage_set_uptodate(fs_info, page, eb->start, + eb->len); } } @@ -6566,12 +6645,8 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, atomic_dec(&eb->io_pages); } if (bio_ctrl.bio) { - int tmp; - - tmp = submit_one_bio(bio_ctrl.bio, mirror_num, 0); + submit_one_bio(bio_ctrl.bio, mirror_num, 0); bio_ctrl.bio = NULL; - if (tmp < 0) - return tmp; } if (ret || wait != WAIT_COMPLETE) return ret; @@ -6605,7 +6680,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))) return -EIO; - if (eb->fs_info->sectorsize < PAGE_SIZE) + if (eb->fs_info->nodesize < PAGE_SIZE) return read_extent_buffer_subpage(eb, wait, mirror_num); num_pages = num_extent_pages(eb); @@ -6648,7 +6723,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) eb->read_mirror = 0; atomic_set(&eb->io_pages, num_reads); /* - * It is possible for releasepage to clear the TREE_REF bit before we + * It is possible for release_folio to clear the TREE_REF bit before we * set io_pages. See check_buffer_tree_ref for a more detailed comment. */ check_buffer_tree_ref(eb); @@ -6684,10 +6759,8 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) } if (bio_ctrl.bio) { - err = submit_one_bio(bio_ctrl.bio, mirror_num, bio_ctrl.bio_flags); + submit_one_bio(bio_ctrl.bio, mirror_num, bio_ctrl.compress_type); bio_ctrl.bio = NULL; - if (err) - return err; } if (ret || wait != WAIT_COMPLETE) @@ -6851,14 +6924,24 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb, { struct btrfs_fs_info *fs_info = eb->fs_info; - if (fs_info->sectorsize < PAGE_SIZE) { - bool uptodate; + /* + * If we are using the commit root we could potentially clear a page + * Uptodate while we're using the extent buffer that we've previously + * looked up. We don't want to complain in this case, as the page was + * valid before, we just didn't write it out. Instead we want to catch + * the case where we didn't actually read the block properly, which + * would have !PageUptodate && !PageError, as we clear PageError before + * reading. + */ + if (fs_info->nodesize < PAGE_SIZE) { + bool uptodate, error; uptodate = btrfs_subpage_test_uptodate(fs_info, page, eb->start, eb->len); - WARN_ON(!uptodate); + error = btrfs_subpage_test_error(fs_info, page, eb->start, eb->len); + WARN_ON(!uptodate && !error); } else { - WARN_ON(!PageUptodate(page)); + WARN_ON(!PageUptodate(page) && !PageError(page)); } } @@ -6952,7 +7035,7 @@ void copy_extent_buffer_full(const struct extent_buffer *dst, ASSERT(dst->len == src->len); - if (dst->fs_info->sectorsize == PAGE_SIZE) { + if (dst->fs_info->nodesize >= PAGE_SIZE) { num_pages = num_extent_pages(dst); for (i = 0; i < num_pages; i++) copy_page(page_address(dst->pages[i]), @@ -6961,7 +7044,7 @@ void copy_extent_buffer_full(const struct extent_buffer *dst, size_t src_offset = get_eb_offset_in_page(src, 0); size_t dst_offset = get_eb_offset_in_page(dst, 0); - ASSERT(src->fs_info->sectorsize < PAGE_SIZE); + ASSERT(src->fs_info->nodesize < PAGE_SIZE); memcpy(page_address(dst->pages[0]) + dst_offset, page_address(src->pages[0]) + src_offset, src->len); @@ -7242,42 +7325,25 @@ void memmove_extent_buffer(const struct extent_buffer *dst, } } -#define GANG_LOOKUP_SIZE 16 static struct extent_buffer *get_next_extent_buffer( struct btrfs_fs_info *fs_info, struct page *page, u64 bytenr) { - struct extent_buffer *gang[GANG_LOOKUP_SIZE]; - struct extent_buffer *found = NULL; + struct extent_buffer *eb; + unsigned long index; u64 page_start = page_offset(page); - u64 cur = page_start; ASSERT(in_range(bytenr, page_start, PAGE_SIZE)); lockdep_assert_held(&fs_info->buffer_lock); - while (cur < page_start + PAGE_SIZE) { - int ret; - int i; - - ret = radix_tree_gang_lookup(&fs_info->buffer_radix, - (void **)gang, cur >> fs_info->sectorsize_bits, - min_t(unsigned int, GANG_LOOKUP_SIZE, - PAGE_SIZE / fs_info->nodesize)); - if (ret == 0) - goto out; - for (i = 0; i < ret; i++) { - /* Already beyond page end */ - if (gang[i]->start >= page_start + PAGE_SIZE) - goto out; - /* Found one */ - if (gang[i]->start >= bytenr) { - found = gang[i]; - goto out; - } - } - cur = gang[ret - 1]->start + gang[ret - 1]->len; + xa_for_each_start(&fs_info->extent_buffers, index, eb, + page_start >> fs_info->sectorsize_bits) { + if (in_range(eb->start, page_start, PAGE_SIZE)) + return eb; + else if (eb->start >= page_start + PAGE_SIZE) + /* Already beyond page end */ + return NULL; } -out: - return found; + return NULL; } static int try_release_subpage_extent_buffer(struct page *page) @@ -7354,7 +7420,7 @@ int try_release_extent_buffer(struct page *page) { struct extent_buffer *eb; - if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE) + if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) return try_release_subpage_extent_buffer(page); /* diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 0399cf8e3c32..23d4103c8831 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -7,15 +7,9 @@ #include <linux/refcount.h> #include <linux/fiemap.h> #include <linux/btrfs_tree.h> +#include "compression.h" #include "ulist.h" -/* - * flags for bio submission. The high bits indicate the compression - * type for this bio - */ -#define EXTENT_BIO_COMPRESSED 1 -#define EXTENT_BIO_FLAG_SHIFT 16 - enum { EXTENT_BUFFER_UPTODATE, EXTENT_BUFFER_DIRTY, @@ -32,7 +26,6 @@ enum { /* write IO error */ EXTENT_BUFFER_WRITE_ERR, EXTENT_BUFFER_NO_CHECK, - EXTENT_BUFFER_ZONE_FINISH, }; /* these are flags for __process_pages_contig */ @@ -71,9 +64,9 @@ struct btrfs_fs_info; struct io_failure_record; struct extent_io_tree; -typedef blk_status_t (submit_bio_hook_t)(struct inode *inode, struct bio *bio, +typedef void (submit_bio_hook_t)(struct inode *inode, struct bio *bio, int mirror_num, - unsigned long bio_flags); + enum btrfs_compression_type compress_type); typedef blk_status_t (extent_submit_bio_start_t)(struct inode *inode, struct bio *bio, u64 dio_file_offset); @@ -103,22 +96,11 @@ struct extent_buffer { }; /* - * Structure to record info about the bio being assembled, and other info like - * how many bytes are there before stripe/ordered extent boundary. - */ -struct btrfs_bio_ctrl { - struct bio *bio; - unsigned long bio_flags; - u32 len_to_stripe_boundary; - u32 len_to_oe_boundary; -}; - -/* * Structure to record how many bytes and which ranges are set/cleared */ struct extent_changeset { /* How many bytes are set/cleared in this operation */ - unsigned int bytes_changed; + u64 bytes_changed; /* Changed ranges */ struct ulist range_changed; @@ -158,17 +140,6 @@ static inline void extent_changeset_free(struct extent_changeset *changeset) kfree(changeset); } -static inline void extent_set_compress_type(unsigned long *bio_flags, - int compress_type) -{ - *bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT; -} - -static inline int extent_compress_type(unsigned long bio_flags) -{ - return bio_flags >> EXTENT_BIO_FLAG_SHIFT; -} - struct extent_map_tree; typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode, @@ -178,11 +149,7 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode, int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); -int __must_check submit_one_bio(struct bio *bio, int mirror_num, - unsigned long bio_flags); -int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, - struct btrfs_bio_ctrl *bio_ctrl, - unsigned int read_flags, u64 *prev_em_start); +int btrfs_read_folio(struct file *file, struct folio *folio); int extent_write_full_page(struct page *page, struct writeback_control *wbc); int extent_write_locked_range(struct inode *inode, u64 start, u64 end); int extent_writepages(struct address_space *mapping, @@ -277,8 +244,10 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct page *locked_page, u32 bits_to_clear, unsigned long page_ops); + +int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array); struct bio *btrfs_bio_alloc(unsigned int nr_iovecs); -struct bio *btrfs_bio_clone(struct bio *bio); +struct bio *btrfs_bio_clone(struct block_device *bdev, struct bio *bio); struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size); void end_extent_writepage(struct page *page, int err, u64 start, u64 end); @@ -297,7 +266,7 @@ struct io_failure_record { u64 start; u64 len; u64 logical; - unsigned long bio_flags; + enum btrfs_compression_type compress_type; int this_mirror; int failed_mirror; }; diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 5a36add21305..6fee14ce2e6b 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -261,6 +261,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start; em->mod_start = merge->mod_start; em->generation = max(em->generation, merge->generation); + set_bit(EXTENT_FLAG_MERGED, &em->flags); rb_erase_cached(&merge->rb_node, &tree->map); RB_CLEAR_NODE(&merge->rb_node); @@ -278,6 +279,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) RB_CLEAR_NODE(&merge->rb_node); em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; em->generation = max(em->generation, merge->generation); + set_bit(EXTENT_FLAG_MERGED, &em->flags); free_extent_map(merge); } } @@ -490,6 +492,8 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree, */ void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) { + lockdep_assert_held_write(&tree->lock); + WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); rb_erase_cached(&em->rb_node, &tree->map); if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) @@ -504,6 +508,8 @@ void replace_extent_mapping(struct extent_map_tree *tree, struct extent_map *new, int modified) { + lockdep_assert_held_write(&tree->lock); + WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags)); ASSERT(extent_map_in_tree(cur)); if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags)) diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 8e217337dff9..d2fa32ffe304 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -25,6 +25,8 @@ enum { EXTENT_FLAG_FILLING, /* filesystem extent mapping type */ EXTENT_FLAG_FS_MAPPING, + /* This em is merged from two or more physically adjacent ems */ + EXTENT_FLAG_MERGED, }; struct extent_map { @@ -40,6 +42,12 @@ struct extent_map { u64 ram_bytes; u64 block_start; u64 block_len; + + /* + * Generation of the extent map, for merged em it's the highest + * generation of all merged ems. + * For non-merged extents, it's from btrfs_file_extent_item::generation. + */ u64 generation; unsigned long flags; /* Used for chunk mappings, flag EXTENT_FLAG_FS_MAPPING must be set */ diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 90c5c38836ab..c828f971a346 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -305,7 +305,7 @@ found: read_extent_buffer(path->nodes[0], dst, (unsigned long)item, ret * csum_size); out: - if (ret == -ENOENT) + if (ret == -ENOENT || ret == -EFBIG) ret = 0; return ret; } @@ -368,6 +368,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_bio *bbio = NULL; struct btrfs_path *path; const u32 sectorsize = fs_info->sectorsize; const u32 csum_size = fs_info->csum_size; @@ -377,6 +378,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst u8 *csum; const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits; int count = 0; + blk_status_t ret = BLK_STS_OK; if ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) || test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state)) @@ -400,7 +402,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst return BLK_STS_RESOURCE; if (!dst) { - struct btrfs_bio *bbio = btrfs_bio(bio); + bbio = btrfs_bio(bio); if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); @@ -456,21 +458,27 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst count = search_csum_tree(fs_info, path, cur_disk_bytenr, search_len, csum_dst); - if (count <= 0) { - /* - * Either we hit a critical error or we didn't find - * the csum. - * Either way, we put zero into the csums dst, and skip - * to the next sector. - */ + if (count < 0) { + ret = errno_to_blk_status(count); + if (bbio) + btrfs_bio_free_csum(bbio); + break; + } + + /* + * We didn't find a csum for this range. We need to make sure + * we complain loudly about this, because we are not NODATASUM. + * + * However for the DATA_RELOC inode we could potentially be + * relocating data extents for a NODATASUM inode, so the inode + * itself won't be marked with NODATASUM, but the extent we're + * copying is in fact NODATASUM. If we don't find a csum we + * assume this is the case. + */ + if (count == 0) { memset(csum_dst, 0, csum_size); count = 1; - /* - * For data reloc inode, we need to mark the range - * NODATASUM so that balance won't report false csum - * error. - */ if (BTRFS_I(inode)->root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) { u64 file_offset; @@ -491,7 +499,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst } btrfs_free_path(path); - return BLK_STS_OK; + return ret; } int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, @@ -612,32 +620,33 @@ fail: return ret; } -/* - * btrfs_csum_one_bio - Calculates checksums of the data contained inside a bio +/** + * Calculate checksums of the data contained inside a bio + * * @inode: Owner of the data inside the bio * @bio: Contains the data to be checksummed - * @file_start: offset in file this bio begins to describe - * @contig: Boolean. If true/1 means all bio vecs in this bio are - * contiguous and they begin at @file_start in the file. False/0 - * means this bio can contain potentially discontiguous bio vecs |