aboutsummaryrefslogtreecommitdiffstats
path: root/fs/libfs.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/libfs.c')
-rw-r--r--fs/libfs.c353
1 files changed, 279 insertions, 74 deletions
diff --git a/fs/libfs.c b/fs/libfs.c
index c686bd9caac6..682d56345a1c 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -15,21 +15,25 @@
#include <linux/mutex.h>
#include <linux/namei.h>
#include <linux/exportfs.h>
+#include <linux/iversion.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h> /* sync_mapping_buffers */
#include <linux/fs_context.h>
#include <linux/pseudo_fs.h>
#include <linux/fsnotify.h>
+#include <linux/unicode.h>
+#include <linux/fscrypt.h>
#include <linux/uaccess.h>
#include "internal.h"
-int simple_getattr(const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned int query_flags)
+int simple_getattr(struct user_namespace *mnt_userns, const struct path *path,
+ struct kstat *stat, u32 request_mask,
+ unsigned int query_flags)
{
struct inode *inode = d_inode(path->dentry);
- generic_fillattr(inode, stat);
+ generic_fillattr(&init_user_ns, inode, stat);
stat->blocks = inode->i_mapping->nrpages << (PAGE_SHIFT - 9);
return 0;
}
@@ -137,11 +141,11 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
switch (whence) {
case 1:
offset += file->f_pos;
- /* fall through */
+ fallthrough;
case 0:
if (offset >= 0)
break;
- /* fall through */
+ fallthrough;
default:
return -EINVAL;
}
@@ -445,16 +449,43 @@ int simple_rmdir(struct inode *dir, struct dentry *dentry)
}
EXPORT_SYMBOL(simple_rmdir);
-int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
- struct inode *new_dir, struct dentry *new_dentry,
- unsigned int flags)
+int simple_rename_exchange(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ bool old_is_dir = d_is_dir(old_dentry);
+ bool new_is_dir = d_is_dir(new_dentry);
+
+ if (old_dir != new_dir && old_is_dir != new_is_dir) {
+ if (old_is_dir) {
+ drop_nlink(old_dir);
+ inc_nlink(new_dir);
+ } else {
+ drop_nlink(new_dir);
+ inc_nlink(old_dir);
+ }
+ }
+ old_dir->i_ctime = old_dir->i_mtime =
+ new_dir->i_ctime = new_dir->i_mtime =
+ d_inode(old_dentry)->i_ctime =
+ d_inode(new_dentry)->i_ctime = current_time(old_dir);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(simple_rename_exchange);
+
+int simple_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
+ struct dentry *old_dentry, struct inode *new_dir,
+ struct dentry *new_dentry, unsigned int flags)
{
struct inode *inode = d_inode(old_dentry);
int they_are_dirs = d_is_dir(old_dentry);
- if (flags & ~RENAME_NOREPLACE)
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
return -EINVAL;
+ if (flags & RENAME_EXCHANGE)
+ return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);
+
if (!simple_empty(new_dentry))
return -ENOTEMPTY;
@@ -478,6 +509,7 @@ EXPORT_SYMBOL(simple_rename);
/**
* simple_setattr - setattr for simple filesystem
+ * @mnt_userns: user namespace of the target mount
* @dentry: dentry
* @iattr: iattr structure
*
@@ -490,35 +522,35 @@ EXPORT_SYMBOL(simple_rename);
* on simple regular filesystems. Anything that needs to change on-disk
* or wire state on size changes needs its own setattr method.
*/
-int simple_setattr(struct dentry *dentry, struct iattr *iattr)
+int simple_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
+ struct iattr *iattr)
{
struct inode *inode = d_inode(dentry);
int error;
- error = setattr_prepare(dentry, iattr);
+ error = setattr_prepare(mnt_userns, dentry, iattr);
if (error)
return error;
if (iattr->ia_valid & ATTR_SIZE)
truncate_setsize(inode, iattr->ia_size);
- setattr_copy(inode, iattr);
+ setattr_copy(mnt_userns, inode, iattr);
mark_inode_dirty(inode);
return 0;
}
EXPORT_SYMBOL(simple_setattr);
-int simple_readpage(struct file *file, struct page *page)
+static int simple_read_folio(struct file *file, struct folio *folio)
{
- clear_highpage(page);
- flush_dcache_page(page);
- SetPageUptodate(page);
- unlock_page(page);
+ folio_zero_range(folio, 0, folio_size(folio));
+ flush_dcache_folio(folio);
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
return 0;
}
-EXPORT_SYMBOL(simple_readpage);
int simple_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
struct page *page;
@@ -526,7 +558,7 @@ int simple_write_begin(struct file *file, struct address_space *mapping,
index = pos >> PAGE_SHIFT;
- page = grab_cache_page_write_begin(mapping, index, flags);
+ page = grab_cache_page_write_begin(mapping, index);
if (!page)
return -ENOMEM;
@@ -561,9 +593,9 @@ EXPORT_SYMBOL(simple_write_begin);
* should extend on what's done here with a call to mark_inode_dirty() in the
* case that i_size has changed.
*
- * Use *ONLY* with simple_readpage()
+ * Use *ONLY* with simple_read_folio()
*/
-int simple_write_end(struct file *file, struct address_space *mapping,
+static int simple_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
@@ -592,7 +624,17 @@ int simple_write_end(struct file *file, struct address_space *mapping,
return copied;
}
-EXPORT_SYMBOL(simple_write_end);
+
+/*
+ * Provides ramfs-style behavior: data in the pagecache, but no writeback.
+ */
+const struct address_space_operations ram_aops = {
+ .read_folio = simple_read_folio,
+ .write_begin = simple_write_begin,
+ .write_end = simple_write_end,
+ .dirty_folio = noop_dirty_folio,
+};
+EXPORT_SYMBOL(ram_aops);
/*
* the inodes created here are not hashed. If you use iunique to generate
@@ -891,7 +933,7 @@ int simple_attr_open(struct inode *inode, struct file *file,
{
struct simple_attr *attr;
- attr = kmalloc(sizeof(*attr), GFP_KERNEL);
+ attr = kzalloc(sizeof(*attr), GFP_KERNEL);
if (!attr)
return -ENOMEM;
@@ -931,9 +973,11 @@ ssize_t simple_attr_read(struct file *file, char __user *buf,
if (ret)
return ret;
- if (*ppos) { /* continued read */
+ if (*ppos && attr->get_buf[0]) {
+ /* continued read */
size = strlen(attr->get_buf);
- } else { /* first read */
+ } else {
+ /* first read */
u64 val;
ret = attr->get(attr->data, &val);
if (ret)
@@ -955,7 +999,7 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf,
size_t len, loff_t *ppos)
{
struct simple_attr *attr;
- u64 val;
+ unsigned long long val;
size_t size;
ssize_t ret;
@@ -973,7 +1017,9 @@ ssize_t simple_attr_write(struct file *file, const char __user *buf,
goto out;
attr->set_buf[size] = '\0';
- val = simple_strtoll(attr->set_buf, NULL, 0);
+ ret = kstrtoull(attr->set_buf, 0, &val);
+ if (ret)
+ goto out;
ret = attr->set(attr->data, val);
if (ret == 0)
ret = len; /* on success, claim we got the whole input */
@@ -1111,7 +1157,7 @@ int generic_file_fsync(struct file *file, loff_t start, loff_t end,
err = __generic_file_fsync(file, start, end, datasync);
if (err)
return err;
- return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
+ return blkdev_issue_flush(inode->i_sb->s_bdev);
}
EXPORT_SYMBOL(generic_file_fsync);
@@ -1153,33 +1199,6 @@ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync)
}
EXPORT_SYMBOL(noop_fsync);
-int noop_set_page_dirty(struct page *page)
-{
- /*
- * Unlike __set_page_dirty_no_writeback that handles dirty page
- * tracking in the page object, dax does all dirty tracking in
- * the inode address_space in response to mkwrite faults. In the
- * dax case we only need to worry about potentially dirty CPU
- * caches, not dirty page cache pages to write back.
- *
- * This callback is defined to prevent fallback to
- * __set_page_dirty_buffers() in set_page_dirty().
- */
- return 0;
-}
-EXPORT_SYMBOL_GPL(noop_set_page_dirty);
-
-void noop_invalidatepage(struct page *page, unsigned int offset,
- unsigned int length)
-{
- /*
- * There is no page cache to invalidate in the dax case, however
- * we need this callback defined to prevent falling back to
- * block_invalidatepage() in do_invalidatepage().
- */
-}
-EXPORT_SYMBOL_GPL(noop_invalidatepage);
-
ssize_t noop_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
{
/*
@@ -1199,24 +1218,10 @@ void kfree_link(void *p)
}
EXPORT_SYMBOL(kfree_link);
-/*
- * nop .set_page_dirty method so that people can use .page_mkwrite on
- * anon inodes.
- */
-static int anon_set_page_dirty(struct page *page)
-{
- return 0;
-};
-
-/*
- * A single inode exists for all anon_inode files. Contrary to pipes,
- * anon_inode inodes have no associated per-instance data, so we need
- * only allocate one of them.
- */
struct inode *alloc_anon_inode(struct super_block *s)
{
static const struct address_space_operations anon_aops = {
- .set_page_dirty = anon_set_page_dirty,
+ .dirty_folio = noop_dirty_folio,
};
struct inode *inode = new_inode_pseudo(s);
@@ -1294,15 +1299,17 @@ static struct dentry *empty_dir_lookup(struct inode *dir, struct dentry *dentry,
return ERR_PTR(-ENOENT);
}
-static int empty_dir_getattr(const struct path *path, struct kstat *stat,
+static int empty_dir_getattr(struct user_namespace *mnt_userns,
+ const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int query_flags)
{
struct inode *inode = d_inode(path->dentry);
- generic_fillattr(inode, stat);
+ generic_fillattr(&init_user_ns, inode, stat);
return 0;
}
-static int empty_dir_setattr(struct dentry *dentry, struct iattr *attr)
+static int empty_dir_setattr(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct iattr *attr)
{
return -EPERM;
}
@@ -1361,3 +1368,201 @@ bool is_empty_dir_inode(struct inode *inode)
return (inode->i_fop == &empty_dir_operations) &&
(inode->i_op == &empty_dir_inode_operations);
}
+
+#if IS_ENABLED(CONFIG_UNICODE)
+/*
+ * Determine if the name of a dentry should be casefolded.
+ *
+ * Return: if names will need casefolding
+ */
+static bool needs_casefold(const struct inode *dir)
+{
+ return IS_CASEFOLDED(dir) && dir->i_sb->s_encoding;
+}
+
+/**
+ * generic_ci_d_compare - generic d_compare implementation for casefolding filesystems
+ * @dentry: dentry whose name we are checking against
+ * @len: len of name of dentry
+ * @str: str pointer to name of dentry
+ * @name: Name to compare against
+ *
+ * Return: 0 if names match, 1 if mismatch, or -ERRNO
+ */
+static int generic_ci_d_compare(const struct dentry *dentry, unsigned int len,
+ const char *str, const struct qstr *name)
+{
+ const struct dentry *parent = READ_ONCE(dentry->d_parent);
+ const struct inode *dir = READ_ONCE(parent->d_inode);
+ const struct super_block *sb = dentry->d_sb;
+ const struct unicode_map *um = sb->s_encoding;
+ struct qstr qstr = QSTR_INIT(str, len);
+ char strbuf[DNAME_INLINE_LEN];
+ int ret;
+
+ if (!dir || !needs_casefold(dir))
+ goto fallback;
+ /*
+ * If the dentry name is stored in-line, then it may be concurrently
+ * modified by a rename. If this happens, the VFS will eventually retry
+ * the lookup, so it doesn't matter what ->d_compare() returns.
+ * However, it's unsafe to call utf8_strncasecmp() with an unstable
+ * string. Therefore, we have to copy the name into a temporary buffer.
+ */
+ if (len <= DNAME_INLINE_LEN - 1) {
+ memcpy(strbuf, str, len);
+ strbuf[len] = 0;
+ qstr.name = strbuf;
+ /* prevent compiler from optimizing out the temporary buffer */
+ barrier();
+ }
+ ret = utf8_strncasecmp(um, name, &qstr);
+ if (ret >= 0)
+ return ret;
+
+ if (sb_has_strict_encoding(sb))
+ return -EINVAL;
+fallback:
+ if (len != name->len)
+ return 1;
+ return !!memcmp(str, name->name, len);
+}
+
+/**
+ * generic_ci_d_hash - generic d_hash implementation for casefolding filesystems
+ * @dentry: dentry of the parent directory
+ * @str: qstr of name whose hash we should fill in
+ *
+ * Return: 0 if hash was successful or unchanged, and -EINVAL on error
+ */
+static int generic_ci_d_hash(const struct dentry *dentry, struct qstr *str)
+{
+ const struct inode *dir = READ_ONCE(dentry->d_inode);
+ struct super_block *sb = dentry->d_sb;
+ const struct unicode_map *um = sb->s_encoding;
+ int ret = 0;
+
+ if (!dir || !needs_casefold(dir))
+ return 0;
+
+ ret = utf8_casefold_hash(um, dentry, str);
+ if (ret < 0 && sb_has_strict_encoding(sb))
+ return -EINVAL;
+ return 0;
+}
+
+static const struct dentry_operations generic_ci_dentry_ops = {
+ .d_hash = generic_ci_d_hash,
+ .d_compare = generic_ci_d_compare,
+};
+#endif
+
+#ifdef CONFIG_FS_ENCRYPTION
+static const struct dentry_operations generic_encrypted_dentry_ops = {
+ .d_revalidate = fscrypt_d_revalidate,
+};
+#endif
+
+#if defined(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_UNICODE)
+static const struct dentry_operations generic_encrypted_ci_dentry_ops = {
+ .d_hash = generic_ci_d_hash,
+ .d_compare = generic_ci_d_compare,
+ .d_revalidate = fscrypt_d_revalidate,
+};
+#endif
+
+/**
+ * generic_set_encrypted_ci_d_ops - helper for setting d_ops for given dentry
+ * @dentry: dentry to set ops on
+ *
+ * Casefolded directories need d_hash and d_compare set, so that the dentries
+ * contained in them are handled case-insensitively. Note that these operations
+ * are needed on the parent directory rather than on the dentries in it, and
+ * while the casefolding flag can be toggled on and off on an empty directory,
+ * dentry_operations can't be changed later. As a result, if the filesystem has
+ * casefolding support enabled at all, we have to give all dentries the
+ * casefolding operations even if their inode doesn't have the casefolding flag
+ * currently (and thus the casefolding ops would be no-ops for now).
+ *
+ * Encryption works differently in that the only dentry operation it needs is
+ * d_revalidate, which it only needs on dentries that have the no-key name flag.
+ * The no-key flag can't be set "later", so we don't have to worry about that.
+ *
+ * Finally, to maximize compatibility with overlayfs (which isn't compatible
+ * with certain dentry operations) and to avoid taking an unnecessary
+ * performance hit, we use custom dentry_operations for each possible
+ * combination rather than always installing all operations.
+ */
+void generic_set_encrypted_ci_d_ops(struct dentry *dentry)
+{
+#ifdef CONFIG_FS_ENCRYPTION
+ bool needs_encrypt_ops = dentry->d_flags & DCACHE_NOKEY_NAME;
+#endif
+#if IS_ENABLED(CONFIG_UNICODE)
+ bool needs_ci_ops = dentry->d_sb->s_encoding;
+#endif
+#if defined(CONFIG_FS_ENCRYPTION) && IS_ENABLED(CONFIG_UNICODE)
+ if (needs_encrypt_ops && needs_ci_ops) {
+ d_set_d_op(dentry, &generic_encrypted_ci_dentry_ops);
+ return;
+ }
+#endif
+#ifdef CONFIG_FS_ENCRYPTION
+ if (needs_encrypt_ops) {
+ d_set_d_op(dentry, &generic_encrypted_dentry_ops);
+ return;
+ }
+#endif
+#if IS_ENABLED(CONFIG_UNICODE)
+ if (needs_ci_ops) {
+ d_set_d_op(dentry, &generic_ci_dentry_ops);
+ return;
+ }
+#endif
+}
+EXPORT_SYMBOL(generic_set_encrypted_ci_d_ops);
+
+/**
+ * inode_maybe_inc_iversion - increments i_version
+ * @inode: inode with the i_version that should be updated
+ * @force: increment the counter even if it's not necessary?
+ *
+ * Every time the inode is modified, the i_version field must be seen to have
+ * changed by any observer.
+ *
+ * If "force" is set or the QUERIED flag is set, then ensure that we increment
+ * the value, and clear the queried flag.
+ *
+ * In the common case where neither is set, then we can return "false" without
+ * updating i_version.
+ *
+ * If this function returns false, and no other metadata has changed, then we
+ * can avoid logging the metadata.
+ */
+bool inode_maybe_inc_iversion(struct inode *inode, bool force)
+{
+ u64 cur, new;
+
+ /*
+ * The i_version field is not strictly ordered with any other inode
+ * information, but the legacy inode_inc_iversion code used a spinlock
+ * to serialize increments.
+ *
+ * Here, we add full memory barriers to ensure that any de-facto
+ * ordering with other info is preserved.
+ *
+ * This barrier pairs with the barrier in inode_query_iversion()
+ */
+ smp_mb();
+ cur = inode_peek_iversion_raw(inode);
+ do {
+ /* If flag is clear then we needn't do anything */
+ if (!force && !(cur & I_VERSION_QUERIED))
+ return false;
+
+ /* Since lowest bit is flag, add 2 to avoid it */
+ new = (cur & ~I_VERSION_QUERIED) + I_VERSION_INCREMENT;
+ } while (!atomic64_try_cmpxchg(&inode->i_version, &cur, new));
+ return true;
+}
+EXPORT_SYMBOL(inode_maybe_inc_iversion);