 mm/shmem.c | 2162 +++++++++++++++++++++++++++++++++++++++++----------------------------------
 1 file changed, 1129 insertions(+), 1033 deletions(-)
diff --git a/mm/shmem.c b/mm/shmem.c
index aad3ba74b0e9..c1d8b8a1aa3b 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -28,18 +28,18 @@
#include <linux/ramfs.h>
#include <linux/pagemap.h>
#include <linux/file.h>
+#include <linux/fileattr.h>
#include <linux/mm.h>
#include <linux/random.h>
#include <linux/sched/signal.h>
#include <linux/export.h>
#include <linux/swap.h>
#include <linux/uio.h>
-#include <linux/khugepaged.h>
#include <linux/hugetlb.h>
-#include <linux/frontswap.h>
#include <linux/fs_parser.h>
-
-#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */
+#include <linux/swapfile.h>
+#include <linux/iversion.h>
+#include "swap.h"
static struct vfsmount *shm_mnt;
@@ -60,7 +60,6 @@ static struct vfsmount *shm_mnt;
#include <linux/backing-dev.h>
#include <linux/shmem_fs.h>
#include <linux/writeback.h>
-#include <linux/blkdev.h>
#include <linux/pagevec.h>
#include <linux/percpu_counter.h>
#include <linux/falloc.h>
@@ -82,7 +81,6 @@ static struct vfsmount *shm_mnt;
#include <linux/uuid.h>
#include <linux/uaccess.h>
-#include <asm/pgtable.h>
#include "internal.h"
@@ -97,7 +95,7 @@ static struct vfsmount *shm_mnt;
/*
* shmem_fallocate communicates with shmem_fault or shmem_writepage via
- * inode->i_private (with i_mutex making sure that it has only one user at
+ * inode->i_private (with i_rwsem making sure that it has only one user at
* a time): we would prefer not to enlarge the shmem inode just for that.
*/
struct shmem_falloc {
@@ -115,11 +113,13 @@ struct shmem_options {
kuid_t uid;
kgid_t gid;
umode_t mode;
+ bool full_inums;
int huge;
int seen;
#define SHMEM_SEEN_BLOCKS 1
#define SHMEM_SEEN_INODES 2
#define SHMEM_SEEN_HUGE 4
+#define SHMEM_SEEN_INUMS 8
};
#ifdef CONFIG_TMPFS
@@ -136,24 +136,10 @@ static unsigned long shmem_default_max_inodes(void)
}
#endif
-static bool shmem_should_replace_page(struct page *page, gfp_t gfp);
-static int shmem_replace_page(struct page **pagep, gfp_t gfp,
- struct shmem_inode_info *info, pgoff_t index);
-static int shmem_swapin_page(struct inode *inode, pgoff_t index,
- struct page **pagep, enum sgp_type sgp,
+static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
+ struct folio **foliop, enum sgp_type sgp,
gfp_t gfp, struct vm_area_struct *vma,
vm_fault_t *fault_type);
-static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
- struct page **pagep, enum sgp_type sgp,
- gfp_t gfp, struct vm_area_struct *vma,
- struct vm_fault *vmf, vm_fault_t *fault_type);
-
-int shmem_getpage(struct inode *inode, pgoff_t index,
- struct page **pagep, enum sgp_type sgp)
-{
- return shmem_getpage_gfp(inode, index, pagep, sgp,
- mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL);
-}
static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
{
@@ -194,7 +180,7 @@ static inline int shmem_reacct_size(unsigned long flags,
/*
* ... whereas tmpfs objects are accounted incrementally as
* pages are allocated, in order to allow large sparse files.
- * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
+ * shmem_get_folio reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
* so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
*/
static inline int shmem_acct_block(unsigned long flags, long pages)
@@ -245,7 +231,7 @@ static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages)
}
static const struct super_operations shmem_ops;
-static const struct address_space_operations shmem_aops;
+const struct address_space_operations shmem_aops;
static const struct file_operations shmem_file_operations;
static const struct inode_operations shmem_inode_operations;
static const struct inode_operations shmem_dir_inode_operations;
@@ -261,18 +247,79 @@ bool vma_is_shmem(struct vm_area_struct *vma)
static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex);
-static int shmem_reserve_inode(struct super_block *sb)
+/*
+ * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and
+ * produces a novel ino for the newly allocated inode.
+ *
+ * It may also be called when making a hard link to permit the space needed by
+ * each dentry. However, in that case, no new inode number is needed since that
+ * internally draws from another pool of inode numbers (currently global
+ * get_next_ino()). This case is indicated by passing NULL as inop.
+ */
+#define SHMEM_INO_BATCH 1024
+static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
- if (sbinfo->max_inodes) {
- spin_lock(&sbinfo->stat_lock);
- if (!sbinfo->free_inodes) {
- spin_unlock(&sbinfo->stat_lock);
- return -ENOSPC;
+ ino_t ino;
+
+ if (!(sb->s_flags & SB_KERNMOUNT)) {
+ raw_spin_lock(&sbinfo->stat_lock);
+ if (sbinfo->max_inodes) {
+ if (!sbinfo->free_inodes) {
+ raw_spin_unlock(&sbinfo->stat_lock);
+ return -ENOSPC;
+ }
+ sbinfo->free_inodes--;
+ }
+ if (inop) {
+ ino = sbinfo->next_ino++;
+ if (unlikely(is_zero_ino(ino)))
+ ino = sbinfo->next_ino++;
+ if (unlikely(!sbinfo->full_inums &&
+ ino > UINT_MAX)) {
+ /*
+ * Emulate get_next_ino uint wraparound for
+ * compatibility
+ */
+ if (IS_ENABLED(CONFIG_64BIT))
+ pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n",
+ __func__, MINOR(sb->s_dev));
+ sbinfo->next_ino = 1;
+ ino = sbinfo->next_ino++;
+ }
+ *inop = ino;
}
- sbinfo->free_inodes--;
- spin_unlock(&sbinfo->stat_lock);
+ raw_spin_unlock(&sbinfo->stat_lock);
+ } else if (inop) {
+ /*
+ * __shmem_file_setup, one of our callers, is lock-free: it
+ * doesn't hold stat_lock in shmem_reserve_inode since
+ * max_inodes is always 0, and is called from potentially
+ * unknown contexts. As such, use a per-cpu batched allocator
+ * which doesn't require the per-sb stat_lock unless we are at
+ * the batch boundary.
+ *
+ * We don't need to worry about inode{32,64} since SB_KERNMOUNT
+ * shmem mounts are not exposed to userspace, so we don't need
+ * to worry about things like glibc compatibility.
+ */
+ ino_t *next_ino;
+
+ next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
+ ino = *next_ino;
+ if (unlikely(ino % SHMEM_INO_BATCH == 0)) {
+ raw_spin_lock(&sbinfo->stat_lock);
+ ino = sbinfo->next_ino;
+ sbinfo->next_ino += SHMEM_INO_BATCH;
+ raw_spin_unlock(&sbinfo->stat_lock);
+ if (unlikely(is_zero_ino(ino)))
+ ino++;
+ }
+ *inop = ino;
+ *next_ino = ++ino;
+ put_cpu();
}
+
return 0;
}
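
The SB_KERNMOUNT branch above is a per-CPU batched ID allocator: the shared counter is refilled in SHMEM_INO_BATCH-sized chunks, so stat_lock is taken on roughly one allocation in 1024. A minimal userspace sketch of the same pattern, with a thread-local cursor standing in for the per-cpu variable (illustrative names, not kernel API):

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define BATCH 1024

static pthread_mutex_t stat_lock = PTHREAD_MUTEX_INITIALIZER;
static uint64_t next_id = 1;            /* shared pool, like sbinfo->next_ino */
static __thread uint64_t cpu_next_id;   /* stand-in for the per-cpu cursor */

static uint64_t alloc_id(void)
{
	uint64_t id = cpu_next_id;

	/* Batch exhausted (or first use on this thread): refill under lock. */
	if (id % BATCH == 0) {
		pthread_mutex_lock(&stat_lock);
		id = next_id;
		next_id += BATCH;
		pthread_mutex_unlock(&stat_lock);
		if (id == 0)            /* skip the reserved zero ID on wrap */
			id++;
	}
	cpu_next_id = id + 1;
	return id;
}

int main(void)
{
	for (int i = 0; i < 3; i++)
		printf("%llu\n", (unsigned long long)alloc_id());
	return 0;
}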
@@ -280,9 +327,9 @@ static void shmem_free_inode(struct super_block *sb)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
if (sbinfo->max_inodes) {
- spin_lock(&sbinfo->stat_lock);
+ raw_spin_lock(&sbinfo->stat_lock);
sbinfo->free_inodes++;
- spin_unlock(&sbinfo->stat_lock);
+ raw_spin_unlock(&sbinfo->stat_lock);
}
}
@@ -336,7 +383,7 @@ void shmem_uncharge(struct inode *inode, long pages)
struct shmem_inode_info *info = SHMEM_I(inode);
unsigned long flags;
- /* nrpages adjustment done by __delete_from_page_cache() or caller */
+ /* nrpages adjustment done by __filemap_remove_folio() or caller */
spin_lock_irqsave(&info->lock, flags);
info->alloced -= pages;
@@ -410,10 +457,45 @@ static bool shmem_confirm_swap(struct address_space *mapping,
#define SHMEM_HUGE_DENY (-1)
#define SHMEM_HUGE_FORCE (-2)
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* ifdef here to avoid bloating shmem.o when not necessary */
-static int shmem_huge __read_mostly;
+static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;
+
+bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode,
+ pgoff_t index, bool shmem_huge_force)
+{
+ loff_t i_size;
+
+ if (!S_ISREG(inode->i_mode))
+ return false;
+ if (vma && ((vma->vm_flags & VM_NOHUGEPAGE) ||
+ test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)))
+ return false;
+ if (shmem_huge_force)
+ return true;
+ if (shmem_huge == SHMEM_HUGE_FORCE)
+ return true;
+ if (shmem_huge == SHMEM_HUGE_DENY)
+ return false;
+
+ switch (SHMEM_SB(inode->i_sb)->huge) {
+ case SHMEM_HUGE_ALWAYS:
+ return true;
+ case SHMEM_HUGE_WITHIN_SIZE:
+ index = round_up(index + 1, HPAGE_PMD_NR);
+ i_size = round_up(i_size_read(inode), PAGE_SIZE);
+ if (i_size >> PAGE_SHIFT >= index)
+ return true;
+ fallthrough;
+ case SHMEM_HUGE_ADVISE:
+ if (vma && (vma->vm_flags & VM_HUGEPAGE))
+ return true;
+ fallthrough;
+ default:
+ return false;
+ }
+}
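
A worked example of the SHMEM_HUGE_WITHIN_SIZE arm above, assuming 4KiB pages and 2MiB huge pages (so HPAGE_PMD_NR = 512): for a fault at index 100 into a 3MiB file, round_up(101, 512) = 512 and i_size >> PAGE_SHIFT = 768, so 768 >= 512 and a huge page is allowed; for a fault at index 600 into the same file, round_up(601, 512) = 1024 exceeds 768, so the check fails and control falls through to the SHMEM_HUGE_ADVISE test of VM_HUGEPAGE.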
#if defined(CONFIG_SYSFS)
static int shmem_parse_huge(const char *str)
@@ -464,9 +546,9 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
LIST_HEAD(to_remove);
struct inode *inode;
struct shmem_inode_info *info;
- struct page *page;
+ struct folio *folio;
unsigned long batch = sc ? sc->nr_to_scan : 128;
- int removed = 0, split = 0;
+ int split = 0;
if (list_empty(&sbinfo->shrinklist))
return SHRINK_STOP;
@@ -481,7 +563,6 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
/* inode is about to be evicted */
if (!inode) {
list_del_init(&info->shrinklist);
- removed++;
goto next;
}
@@ -489,12 +570,12 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
if (round_up(inode->i_size, PAGE_SIZE) ==
round_up(inode->i_size, HPAGE_PMD_SIZE)) {
list_move(&info->shrinklist, &to_remove);
- removed++;
goto next;
}
list_move(&info->shrinklist, &list);
next:
+ sbinfo->shrinklist_len--;
if (!--batch)
break;
}
@@ -509,57 +590,64 @@ next:
list_for_each_safe(pos, next, &list) {
int ret;
+ pgoff_t index;
info = list_entry(pos, struct shmem_inode_info, shrinklist);
inode = &info->vfs_inode;
if (nr_to_split && split >= nr_to_split)
- goto leave;
+ goto move_back;
- page = find_get_page(inode->i_mapping,
- (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT);
- if (!page)
+ index = (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT;
+ folio = filemap_get_folio(inode->i_mapping, index);
+ if (!folio)
goto drop;
/* No huge page at the end of the file: nothing to split */
- if (!PageTransHuge(page)) {
- put_page(page);
+ if (!folio_test_large(folio)) {
+ folio_put(folio);
goto drop;
}
/*
- * Leave the inode on the list if we failed to lock
- * the page at this time.
+ * Move the inode on the list back to shrinklist if we failed
+ * to lock the page at this time.
*
* Waiting for the lock may lead to deadlock in the
* reclaim path.
*/
- if (!trylock_page(page)) {
- put_page(page);
- goto leave;
+ if (!folio_trylock(folio)) {
+ folio_put(folio);
+ goto move_back;
}
- ret = split_huge_page(page);
- unlock_page(page);
- put_page(page);
+ ret = split_folio(folio);
+ folio_unlock(folio);
+ folio_put(folio);
- /* If split failed leave the inode on the list */
+ /* If split failed move the inode on the list back to shrinklist */
if (ret)
- goto leave;
+ goto move_back;
split++;
drop:
list_del_init(&info->shrinklist);
- removed++;
-leave:
+ goto put;
+move_back:
+ /*
+ * Make sure the inode is either on the global list or deleted
+ * from any local list before iput() since it could be deleted
+ * in another thread once we put the inode (then the local list
+ * is corrupted).
+ */
+ spin_lock(&sbinfo->shrinklist_lock);
+ list_move(&info->shrinklist, &sbinfo->shrinklist);
+ sbinfo->shrinklist_len++;
+ spin_unlock(&sbinfo->shrinklist_lock);
+put:
iput(inode);
}
- spin_lock(&sbinfo->shrinklist_lock);
- list_splice_tail(&list, &sbinfo->shrinklist);
- sbinfo->shrinklist_len -= removed;
- spin_unlock(&sbinfo->shrinklist_lock);
-
return split;
}
@@ -580,100 +668,109 @@ static long shmem_unused_huge_count(struct super_block *sb,
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
return READ_ONCE(sbinfo->shrinklist_len);
}
-#else /* !CONFIG_TRANSPARENT_HUGE_PAGECACHE */
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
#define shmem_huge SHMEM_HUGE_DENY
-static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
- struct shrink_control *sc, unsigned long nr_to_split)
+bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode,
+ pgoff_t index, bool shmem_huge_force)
{
- return 0;
+ return false;
}
-#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */
-static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo)
+static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
+ struct shrink_control *sc, unsigned long nr_to_split)
{
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
- (shmem_huge == SHMEM_HUGE_FORCE || sbinfo->huge) &&
- shmem_huge != SHMEM_HUGE_DENY)
- return true;
- return false;
+ return 0;
}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/*
- * Like add_to_page_cache_locked, but error if expected item has gone.
+ * Like filemap_add_folio, but error if expected item has gone.
*/
-static int shmem_add_to_page_cache(struct page *page,
+static int shmem_add_to_page_cache(struct folio *folio,
struct address_space *mapping,
- pgoff_t index, void *expected, gfp_t gfp)
+ pgoff_t index, void *expected, gfp_t gfp,
+ struct mm_struct *charge_mm)
{
- XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page));
- unsigned long i = 0;
- unsigned long nr = compound_nr(page);
+ XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio));
+ long nr = folio_nr_pages(folio);
+ int error;
- VM_BUG_ON_PAGE(PageTail(page), page);
- VM_BUG_ON_PAGE(index != round_down(index, nr), page);
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
- VM_BUG_ON(expected && PageTransHuge(page));
+ VM_BUG_ON_FOLIO(index != round_down(index, nr), folio);
+ VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
+ VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);
+ VM_BUG_ON(expected && folio_test_large(folio));
- page_ref_add(page, nr);
- page->mapping = mapping;
- page->index = index;
+ folio_ref_add(folio, nr);
+ folio->mapping = mapping;
+ folio->index = index;
+
+ if (!folio_test_swapcache(folio)) {
+ error = mem_cgroup_charge(folio, charge_mm, gfp);
+ if (error) {
+ if (folio_test_pmd_mappable(folio)) {
+ count_vm_event(THP_FILE_FALLBACK);
+ count_vm_event(THP_FILE_FALLBACK_CHARGE);
+ }
+ goto error;
+ }
+ }
+ folio_throttle_swaprate(folio, gfp);
do {
- void *entry;
xas_lock_irq(&xas);
- entry = xas_find_conflict(&xas);
- if (entry != expected)
+ if (expected != xas_find_conflict(&xas)) {
xas_set_err(&xas, -EEXIST);
- xas_create_range(&xas);
- if (xas_error(&xas))
goto unlock;
-next:
- xas_store(&xas, page);
- if (++i < nr) {
- xas_next(&xas);
- goto next;
}
- if (PageTransHuge(page)) {
+ if (expected && xas_find_conflict(&xas)) {
+ xas_set_err(&xas, -EEXIST);
+ goto unlock;
+ }
+ xas_store(&xas, folio);
+ if (xas_error(&xas))
+ goto unlock;
+ if (folio_test_pmd_mappable(folio)) {
count_vm_event(THP_FILE_ALLOC);
- __inc_node_page_state(page, NR_SHMEM_THPS);
+ __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr);
}
mapping->nrpages += nr;
- __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
- __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr);
+ __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
+ __lruvec_stat_mod_folio(folio, NR_SHMEM, nr);
unlock:
xas_unlock_irq(&xas);
} while (xas_nomem(&xas, gfp));
if (xas_error(&xas)) {
- page->mapping = NULL;
- page_ref_sub(page, nr);
- return xas_error(&xas);
+ error = xas_error(&xas);
+ goto error;
}
return 0;
+error:
+ folio->mapping = NULL;
+ folio_ref_sub(folio, nr);
+ return error;
}
/*
- * Like delete_from_page_cache, but substitutes swap for page.
+ * Like delete_from_page_cache, but substitutes swap for @folio.
*/
-static void shmem_delete_from_page_cache(struct page *page, void *radswap)
+static void shmem_delete_from_page_cache(struct folio *folio, void *radswap)
{
- struct address_space *mapping = page->mapping;
+ struct address_space *mapping = folio->mapping;
+ long nr = folio_nr_pages(folio);
int error;
- VM_BUG_ON_PAGE(PageCompound(page), page);
-
xa_lock_irq(&mapping->i_pages);
- error = shmem_replace_entry(mapping, page->index, page, radswap);
- page->mapping = NULL;
- mapping->nrpages--;
- __dec_node_page_state(page, NR_FILE_PAGES);
- __dec_node_page_state(page, NR_SHMEM);
+ error = shmem_replace_entry(mapping, folio->index, folio, radswap);
+ folio->mapping = NULL;
+ mapping->nrpages -= nr;
+ __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
+ __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
xa_unlock_irq(&mapping->i_pages);
- put_page(page);
+ folio_put(folio);
BUG_ON(error);
}
@@ -696,7 +793,7 @@ static int shmem_free_swap(struct address_space *mapping,
* Determine (in bytes) how many of the shmem object's pages mapped by the
* given offsets are swapped out.
*
- * This is safe to call without i_mutex or the i_pages lock thanks to RCU,
+ * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
* as long as the inode doesn't go away and racy results are not a problem.
*/
unsigned long shmem_partial_swap_usage(struct address_space *mapping,
@@ -728,7 +825,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
* Determine (in bytes) how many of the shmem object's pages mapped by the
* given vma is swapped out.
*
- * This is safe to call without i_mutex or the i_pages lock thanks to RCU,
+ * This is safe to call without i_rwsem or the i_pages lock thanks to RCU,
* as long as the inode doesn't go away and racy results are not a problem.
*/
unsigned long shmem_swap_usage(struct vm_area_struct *vma)
@@ -753,9 +850,8 @@ unsigned long shmem_swap_usage(struct vm_area_struct *vma)
return swapped << PAGE_SHIFT;
/* Here comes the more involved part */
- return shmem_partial_swap_usage(mapping,
- linear_page_index(vma, vma->vm_start),
- linear_page_index(vma, vma->vm_end));
+ return shmem_partial_swap_usage(mapping, vma->vm_pgoff,
+ vma->vm_pgoff + vma_pages(vma));
}
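
To illustrate the arithmetic in the new call: a VMA that maps 1MiB (256 pages) of the file starting at file offset 4MiB has vm_pgoff = 1024 and vma_pages(vma) = 256, so the scan covers page indices [1024, 1280), the same range the removed linear_page_index() calls derived from vm_start and vm_end.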
/*
@@ -763,31 +859,42 @@ unsigned long shmem_swap_usage(struct vm_area_struct *vma)
*/
void shmem_unlock_mapping(struct address_space *mapping)
{
- struct pagevec pvec;
- pgoff_t indices[PAGEVEC_SIZE];
+ struct folio_batch fbatch;
pgoff_t index = 0;
- pagevec_init(&pvec);
+ folio_batch_init(&fbatch);
/*
* Minor point, but we might as well stop if someone else SHM_LOCKs it.
*/
- while (!mapping_unevictable(mapping)) {
- /*
- * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it
- * has finished, if it hits a row of PAGEVEC_SIZE swap entries.
- */
- pvec.nr = find_get_entries(mapping, index,
- PAGEVEC_SIZE, pvec.pages, indices);
- if (!pvec.nr)
- break;
- index = indices[pvec.nr - 1] + 1;
- pagevec_remove_exceptionals(&pvec);
- check_move_unevictable_pages(&pvec);
- pagevec_release(&pvec);
+ while (!mapping_unevictable(mapping) &&
+ filemap_get_folios(mapping, &index, ~0UL, &fbatch)) {
+ check_move_unevictable_folios(&fbatch);
+ folio_batch_release(&fbatch);
cond_resched();
}
}
+static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
+{
+ struct folio *folio;
+
+ /*
+ * At first avoid shmem_get_folio(,,,SGP_READ): that fails
+ * beyond i_size, and reports fallocated pages as holes.
+ */
+ folio = __filemap_get_folio(inode->i_mapping, index,
+ FGP_ENTRY | FGP_LOCK, 0);
+ if (!xa_is_value(folio))
+ return folio;
+ /*
+ * But read a page back from swap if any of it is within i_size
+ * (although in some cases this is just a waste of time).
+ */
+ folio = NULL;
+ shmem_get_folio(inode, index, &folio, SGP_READ);
+ return folio;
+}
+
/*
* Remove range of pages and swap entries from page cache, and free them.
* If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
@@ -799,10 +906,10 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
struct shmem_inode_info *info = SHMEM_I(inode);
pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT;
pgoff_t end = (lend + 1) >> PAGE_SHIFT;
- unsigned int partial_start = lstart & (PAGE_SIZE - 1);
- unsigned int partial_end = (lend + 1) & (PAGE_SIZE - 1);
- struct pagevec pvec;
+ struct folio_batch fbatch;
pgoff_t indices[PAGEVEC_SIZE];
+ struct folio *folio;
+ bool same_folio;
long nr_swaps_freed = 0;
pgoff_t index;
int i;
@@ -810,104 +917,68 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
if (lend == -1)
end = -1; /* unsigned, so actually very big */
- pagevec_init(&pvec);
+ if (info->fallocend > start && info->fallocend <= end && !unfalloc)
+ info->fallocend = start;
+
+ folio_batch_init(&fbatch);
index = start;
- while (index < end) {
- pvec.nr = find_get_entries(mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE),
- pvec.pages, indices);
- if (!pvec.nr)
- break;
- for (i = 0; i < pagevec_count(&pvec); i++) {
- struct page *page = pvec.pages[i];
+ while (index < end && find_lock_entries(mapping, index, end - 1,
+ &fbatch, indices)) {
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ folio = fbatch.folios[i];
index = indices[i];
- if (index >= end)
- break;
- if (xa_is_value(page)) {
+ if (xa_is_value(folio)) {
if (unfalloc)
continue;
nr_swaps_freed += !shmem_free_swap(mapping,
- index, page);
+ index, folio);
continue;
}
+ index += folio_nr_pages(folio) - 1;
- VM_BUG_ON_PAGE(page_to_pgoff(page) != index, page);
-
- if (!trylock_page(page))
- continue;
-
- if (PageTransTail(page)) {
- /* Middle of THP: zero out the page */
- clear_highpage(page);
- unlock_page(page);
- continue;
- } else if (PageTransHuge(page)) {
- if (index == round_down(end, HPAGE_PMD_NR)) {
- /*
- * Range ends in the middle of THP:
- * zero out the page
- */
- clear_highpage(page);
- unlock_page(page);
- continue;
- }
- index += HPAGE_PMD_NR - 1;
- i += HPAGE_PMD_NR - 1;
- }
-
- if (!unfalloc || !PageUptodate(page)) {
- VM_BUG_ON_PAGE(PageTail(page), page);
- if (page_mapping(page) == mapping) {
- VM_BUG_ON_PAGE(PageWriteback(page), page);
- truncate_inode_page(mapping, page);
- }
- }
- unlock_page(page);
+ if (!unfalloc || !folio_test_uptodate(folio))
+ truncate_inode_folio(mapping, folio);
+ folio_unlock(folio);
}
- pagevec_remove_exceptionals(&pvec);
- pagevec_release(&pvec);
+ folio_batch_remove_exceptionals(&fbatch);
+ folio_batch_release(&fbatch);
cond_resched();
index++;
}
- if (partial_start) {
- struct page *page = NULL;
- shmem_getpage(inode, start - 1, &page, SGP_READ);
- if (page) {
- unsigned int top = PAGE_SIZE;
- if (start > end) {
- top = partial_end;
- partial_end = 0;
- }
- zero_user_segment(page, partial_start, top);
- set_page_dirty(page);
- unlock_page(page);
- put_page(page);
+ same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
+ folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT);
+ if (folio) {
+ same_folio = lend < folio_pos(folio) + folio_size(folio);
+ folio_mark_dirty(folio);
+ if (!truncate_inode_partial_folio(folio, lstart, lend)) {
+ start = folio->index + folio_nr_pages(folio);
+ if (same_folio)
+ end = folio->index;
}
+ folio_unlock(folio);
+ folio_put(folio);
+ folio = NULL;
}
- if (partial_end) {
- struct page *page = NULL;
- shmem_getpage(inode, end, &page, SGP_READ);
- if (page) {
- zero_user_segment(page, 0, partial_end);
- set_page_dirty(page);
- unlock_page(page);
- put_page(page);
- }
+
+ if (!same_folio)
+ folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT);
+ if (folio) {
+ folio_mark_dirty(folio);
+ if (!truncate_inode_partial_folio(folio, lstart, lend))
+ end = folio->index;
+ folio_unlock(folio);
+ folio_put(folio);
}
- if (start >= end)
- return;
index = start;
while (index < end) {
cond_resched();
- pvec.nr = find_get_entries(mapping, index,
- min(end - index, (pgoff_t)PAGEVEC_SIZE),
- pvec.pages, indices);
- if (!pvec.nr) {
+ if (!find_get_entries(mapping, index, end - 1, &fbatch,
+ indices)) {
/* If all gone or hole-punch or unfalloc, we're done */
if (index == start || end != -1)
break;
@@ -915,17 +986,14 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
index = start;
continue;
}
- for (i = 0; i < pagevec_count(&pvec); i++) {
- struct page *page = pvec.pages[i];
+ for (i = 0; i < folio_batch_count(&fbatch); i++) {
+ folio = fbatch.folios[i];
index = indices[i];
- if (index >= end)
- break;
-
- if (xa_is_value(page)) {
+ if (xa_is_value(folio)) {
if (unfalloc)
continue;
- if (shmem_free_swap(mapping, index, page)) {
+ if (shmem_free_swap(mapping, index, folio)) {
/* Swap was replaced by page: retry */
index--;
break;
@@ -934,50 +1002,24 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
continue;
}
- lock_page(page);
+ folio_lock(folio);
- if (PageTransTail(page)) {
- /* Middle of THP: zero out the page */
- clear_highpage(page);
- unlock_page(page);
- /*
- * Partial thp truncate due 'start' in middle
- * of THP: don't need to look on these pages
- * again on !pvec.nr restart.
- */
- if (index != round_down(end, HPAGE_PMD_NR))
- start++;
- continue;
- } else if (PageTransHuge(page)) {
- if (index == round_down(end, HPAGE_PMD_NR)) {
- /*
- * Range ends in the middle of THP:
- * zero out the page
- */
- clear_highpage(page);
- unlock_page(page);
- continue;
- }
- index += HPAGE_PMD_NR - 1;
- i += HPAGE_PMD_NR - 1;
- }
-
- if (!unfalloc || !PageUptodate(page)) {
- VM_BUG_ON_PAGE(PageTail(page), page);
- if (page_mapping(page) == mapping) {
- VM_BUG_ON_PAGE(PageWriteback(page), page);
- truncate_inode_page(mapping, page);
- } else {
+ if (!unfalloc || !folio_test_uptodate(folio)) {
+ if (folio_mapping(folio) != mapping) {
/* Page was replaced by swap: retry */
- unlock_page(page);
+ folio_unlock(folio);
index--;
break;
}
+ VM_BUG_ON_FOLIO(folio_test_writeback(folio),
+ folio);
+ truncate_inode_folio(mapping, folio);
}
- unlock_page(page);
+ index = folio->index + folio_nr_pages(folio) - 1;
+ folio_unlock(folio);
}
- pagevec_remove_exceptionals(&pvec);
- pagevec_release(&pvec);
+ folio_batch_remove_exceptionals(&fbatch);
+ folio_batch_release(&fbatch);
index++;
}
@@ -991,37 +1033,55 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
{
shmem_undo_range(inode, lstart, lend, false);
inode->i_ctime = inode->i_mtime = current_time(inode);
+ inode_inc_iversion(inode);
}
EXPORT_SYMBOL_GPL(shmem_truncate_range);
-static int shmem_getattr(const struct path *path, struct kstat *stat,
+static int shmem_getattr(struct user_namespace *mnt_userns,
+ const struct path *path, struct kstat *stat,
u32 request_mask, unsigned int query_flags)
{
struct inode *inode = path->dentry->d_inode;
struct shmem_inode_info *info = SHMEM_I(inode);
- struct shmem_sb_info *sb_info = SHMEM_SB(inode->i_sb);
if (info->alloced - info->swapped != inode->i_mapping->nrpages) {
spin_lock_irq(&info->lock);
shmem_recalc_inode(inode);
spin_unlock_irq(&info->lock);
}
- generic_fillattr(inode, stat);
-
- if (is_huge_enabled(sb_info))
+ if (info->fsflags & FS_APPEND_FL)
+ stat->attributes |= STATX_ATTR_APPEND;
+ if (info->fsflags & FS_IMMUTABLE_FL)
+ stat->attributes |= STATX_ATTR_IMMUTABLE;
+ if (info->fsflags & FS_NODUMP_FL)
+ stat->attributes |= STATX_ATTR_NODUMP;
+ stat->attributes_mask |= (STATX_ATTR_APPEND |
+ STATX_ATTR_IMMUTABLE |
+ STATX_ATTR_NODUMP);
+ generic_fillattr(&init_user_ns, inode, stat);
+
+ if (shmem_is_huge(NULL, inode, 0, false))
stat->blksize = HPAGE_PMD_SIZE;
+ if (request_mask & STATX_BTIME) {
+ stat->result_mask |= STATX_BTIME;
+ stat->btime.tv_sec = info->i_crtime.tv_sec;
+ stat->btime.tv_nsec = info->i_crtime.tv_nsec;
+ }
+
return 0;
}
-static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
+static int shmem_setattr(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
struct shmem_inode_info *info = SHMEM_I(inode);
- struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
int error;
+ bool update_mtime = false;
+ bool update_ctime = true;
- error = setattr_prepare(dentry, attr);
+ error = setattr_prepare(&init_user_ns, dentry, attr);
if (error)
return error;
@@ -1029,7 +1089,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
loff_t oldsize = inode->i_size;
loff_t newsize = attr->ia_size;
- /* protected by i_mutex */
+ /* protected by i_rwsem */
if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
(newsize > oldsize && (info->seals & F_SEAL_GROW)))
return -EPERM;
@@ -1040,7 +1100,9 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
if (error)
return error;
i_size_write(inode, newsize);
- inode->i_ctime = inode->i_mtime = current_time(inode);
+ update_mtime = true;
+ } else {
+ update_ctime = false;
}
if (newsize <= oldsize) {
loff_t holebegin = round_up(newsize, PAGE_SIZE);
@@ -1054,30 +1116,18 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr)
if (oldsize > holebegin)
unmap_mapping_range(inode->i_mapping,
holebegin, 0, 1);
-
- /*
- * Part of the huge page can be beyond i_size: subject
- * to shrink under memory pressure.
- */
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) {
- spin_lock(&sbinfo->shrinklist_lock);
- /*
- * _careful to defend against unlocked access to
- * ->shrink_list in shmem_unused_huge_shrink()
- */
- if (list_empty_careful(&info->shrinklist)) {
- list_add_tail(&info->shrinklist,
- &sbinfo->shrinklist);
- sbinfo->shrinklist_len++;
- }
- spin_unlock(&sbinfo->shrinklist_lock);
- }
}
}
- setattr_copy(inode, attr);
+ setattr_copy(&init_user_ns, inode, attr);
if (attr->ia_valid & ATTR_MODE)
- error = posix_acl_chmod(inode, inode->i_mode);
+ error = posix_acl_chmod(&init_user_ns, inode, inode->i_mode);
+ if (!error && update_ctime) {
+ inode->i_ctime = current_time(inode);
+ if (update_mtime)
+ inode->i_mtime = inode->i_ctime;
+ inode_inc_iversion(inode);
+ }
return error;
}
@@ -1086,9 +1136,10 @@ static void shmem_evict_inode(struct inode *inode)
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
- if (inode->i_mapping->a_ops == &shmem_aops) {
+ if (shmem_mapping(inode->i_mapping)) {
shmem_unacct_size(info->flags, inode->i_size);
inode->i_size = 0;
+ mapping_set_exiting(inode->i_mapping);
shmem_truncate_range(inode, 0, (loff_t)-1);
if (!list_empty(&info->shrinklist)) {
spin_lock(&sbinfo->shrinklist_lock);
@@ -1116,75 +1167,68 @@ static void shmem_evict_inode(struct inode *inode)
clear_inode(inode);
}
-extern struct swap_info_struct *swap_info[];
-
static int shmem_find_swap_entries(struct address_space *mapping,
- pgoff_t start, unsigned int nr_entries,
- struct page **entries, pgoff_t *indices,
- unsigned int type, bool frontswap)
+ pgoff_t start, struct folio_batch *fbatch,
+ pgoff_t *indices, unsigned int type)
{
XA_STATE(xas, &mapping->i_pages, start);
- struct page *page;
+ struct folio *folio;
swp_entry_t entry;
- unsigned int ret = 0;
-
- if (!nr_entries)
- return 0;
rcu_read_lock();
- xas_for_each(&xas, page, ULONG_MAX) {
- if (xas_retry(&xas, page))
+ xas_for_each(&xas, folio, ULONG_MAX) {
+ if (xas_retry(&xas, folio))
continue;
- if (!xa_is_value(page))
+ if (!xa_is_value(folio))
continue;
- entry = radix_to_swp_entry(page);
+ entry = radix_to_swp_entry(folio);
+ /*
+ * swapin error entries can be found in the mapping. But they're
+ * deliberately ignored here as we've done everything we can do.
+ */
if (swp_type(entry) != type)
continue;
- if (frontswap &&
- !frontswap_test(swap_info[type], swp_offset(entry)))
- continue;
- indices[ret] = xas.xa_index;
- entries[ret] = page;
+ indices[folio_batch_count(fbatch)] = xas.xa_index;
+ if (!folio_batch_add(fbatch, folio))
+ break;
if (need_resched()) {
xas_pause(&xas);
cond_resched_rcu();
}
- if (++ret == nr_entries)
- break;
}
rcu_read_unlock();
- return ret;
+ return xas.xa_index;
}
/*
* Move the swapped pages for an inode to page cache. Returns the count
* of pages swapped in, or the error in case of failure.
*/
-static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec,
- pgoff_t *indices)
+static int shmem_unuse_swap_entries(struct inode *inode,
+ struct folio_batch *fbatch, pgoff_t *indices)
{
int i = 0;
int ret = 0;
int error = 0;
struct address_space *mapping = inode->i_mapping;
- for (i = 0; i < pvec.nr; i++) {
- struct page *page = pvec.pages[i];
+ for (i = 0; i < folio_batch_count(fbatch); i++) {
+ struct folio *folio = fbatch->folios[i];
- if (!xa_is_value(page))
+ if (!xa_is_value(folio))
continue;
- error = shmem_swapin_page(inode, indices[i],
- &page, SGP_CACHE,
+ error = shmem_swapin_folio(inode, indices[i],
+ &folio, SGP_CACHE,
mapping_gfp_mask(mapping),
NULL, NULL);
if (error == 0) {
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
ret++;
}
if (error == -ENOMEM)
@@ -1197,44 +1241,27 @@ static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec,
/*
* If swap found in inode, free it and move page from swapcache to filecache.
*/
-static int shmem_unuse_inode(struct inode *inode, unsigned int type,
- bool frontswap, unsigned long *fs_pages_to_unuse)
+static int shmem_unuse_inode(struct inode *inode, unsigned int type)
{
struct address_space *mapping = inode->i_mapping;
pgoff_t start = 0;
- struct pagevec pvec;
+ struct folio_batch fbatch;
pgoff_t indices[PAGEVEC_SIZE];
- bool frontswap_partial = (frontswap && *fs_pages_to_unuse > 0);
int ret = 0;
- pagevec_init(&pvec);
do {
- unsigned int nr_entries = PAGEVEC_SIZE;
-
- if (frontswap_partial && *fs_pages_to_unuse < PAGEVEC_SIZE)
- nr_entries = *fs_pages_to_unuse;
-
- pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries,
- pvec.pages, indices,
- type, frontswap);
- if (pvec.nr == 0) {
+ folio_batch_init(&fbatch);
+ shmem_find_swap_entries(mapping, start, &fbatch, indices, type);
+ if (folio_batch_count(&fbatch) == 0) {
ret = 0;
break;
}
- ret = shmem_unuse_swap_entries(inode, pvec, indices);
+ ret = shmem_unuse_swap_entries(inode, &fbatch, indices);
if (ret < 0)
break;
- if (frontswap_partial) {
- *fs_pages_to_unuse -= ret;
- if (*fs_pages_to_unuse == 0) {
- ret = FRONTSWAP_PAGES_UNUSED;
- break;
- }
- }
-
- start = indices[pvec.nr - 1];
+ start = indices[folio_batch_count(&fbatch) - 1];
} while (true);
return ret;
@@ -1245,8 +1272,7 @@ static int shmem_unuse_inode(struct inode *inode, unsigned int type,
* device 'type' back into memory, so the swap device can be
* unused.
*/
-int shmem_unuse(unsigned int type, bool frontswap,
- unsigned long *fs_pages_to_unuse)
+int shmem_unuse(unsigned int type)
{
struct shmem_inode_info *info, *next;
int error = 0;
@@ -1269,8 +1295,7 @@ int shmem_unuse(unsigned int type, bool frontswap,
atomic_inc(&info->stop_eviction);
mutex_unlock(&shmem_swaplist_mutex);
- error = shmem_unuse_inode(&info->vfs_inode, type, frontswap,
- fs_pages_to_unuse);
+ error = shmem_unuse_inode(&info->vfs_inode, type);
cond_resched();
mutex_lock(&shmem_swaplist_mutex);
@@ -1292,16 +1317,30 @@ int shmem_unuse(unsigned int type, bool frontswap,
*/
static int shmem_writepage(struct page *page, struct writeback_control *wbc)
{
+ struct folio *folio = page_folio(page);
struct shmem_inode_info *info;
struct address_space *mapping;
struct inode *inode;
swp_entry_t swap;
pgoff_t index;
- VM_BUG_ON_PAGE(PageCompound(page), page);
- BUG_ON(!PageLocked(page));
- mapping = page->mapping;
- index = page->index;
+ /*
+ * If /sys/kernel/mm/transparent_hugepage/shmem_enabled is "always" or
+ * "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages,
+ * and its shmem_writeback() needs them to be split when swapping.
+ */
+ if (folio_test_large(folio)) {
+ /* Ensure the subpages are still dirty */
+ folio_test_set_dirty(folio);
+ if (split_huge_page(page) < 0)
+ goto redirty;
+ folio = page_folio(page);
+ folio_clear_dirty(folio);
+ }
+
+ BUG_ON(!folio_test_locked(folio));
+ mapping = folio->mapping;
+ index = folio->index;
inode = mapping->host;
info = SHMEM_I(inode);
if (info->flags & VM_LOCKED)
@@ -1324,15 +1363,15 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
/*
* This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
* value into swapfile.c, the only way we can correctly account for a
- * fallocated page arriving here is now to initialize it and write it.
+ * fallocated folio arriving here is now to initialize it and write it.
*
- * That's okay for a page already fallocated earlier, but if we have
+ * That's okay for a folio already fallocated earlier, but if we have
* not yet completed the fallocation, then (a) we want to keep track
- * of this page in case we have to undo it, and (b) it may not be a
+ * of this folio in case we have to undo it, and (b) it may not be a
* good idea to continue anyway, once we're pushing into swap. So
- * reactivate the page, and let shmem_fallocate() quit when too many.
+ * reactivate the folio, and let shmem_fallocate() quit when too many.
*/
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
if (inode->i_private) {
struct shmem_falloc *shmem_falloc;
spin_lock(&inode->i_lock);
@@ -1348,18 +1387,18 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
if (shmem_falloc)
goto redirty;
}
- clear_highpage(page);
- flush_dcache_page(page);
- SetPageUptodate(page);
+ folio_zero_range(folio, 0, folio_size(folio));
+ flush_dcache_folio(folio);
+ folio_mark_uptodate(folio);
}
- swap = get_swap_page(page);
+ swap = folio_alloc_swap(folio);
if (!swap.val)
goto redirty;
/*
* Add inode to shmem_unuse()'s list of swapped-out inodes,
- * if it's not already there. Do it now before the page is
+ * if it's not already there. Do it now before the folio is
* moved to swap cache, when its pagelock no longer protects
* the inode from eviction. But don't unlock the mutex until
* we've incremented swapped, because shmem_unuse_inode() will
@@ -1369,29 +1408,30 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
if (list_empty(&info->swaplist))
list_add(&info->swaplist, &shmem_swaplist);
- if (add_to_swap_cache(page, swap,
- __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN) == 0) {
+ if (add_to_swap_cache(folio, swap,
+ __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
+ NULL) == 0) {
spin_lock_irq(&info->lock);
shmem_recalc_inode(inode);
info->swapped++;
spin_unlock_irq(&info->lock);
swap_shmem_alloc(swap);
- shmem_delete_from_page_cache(page, swp_to_radix_entry(swap));
+ shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap));
mutex_unlock(&shmem_swaplist_mutex);
- BUG_ON(page_mapped(page));
- swap_writepage(page, wbc);
+ BUG_ON(folio_mapped(folio));
+ swap_writepage(&folio->page, wbc);
return 0;
}
mutex_unlock(&shmem_swaplist_mutex);
- put_swap_page(page, swap);
+ put_swap_folio(folio, swap);
redirty:
- set_page_dirty(page);
+ folio_mark_dirty(folio);
if (wbc->for_reclaim)
- return AOP_WRITEPAGE_ACTIVATE; /* Return with page locked */
- unlock_page(page);
+ return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */
+ folio_unlock(folio);
return 0;
}
@@ -1412,10 +1452,10 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo)
{
struct mempolicy *mpol = NULL;
if (sbinfo->mpol) {
- spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */
+ raw_spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */
mpol = sbinfo->mpol;
mpol_get(mpol);
- spin_unlock(&sbinfo->stat_lock);
+ raw_spin_unlock(&sbinfo->stat_lock);
}
return mpol;
}
@@ -1448,32 +1488,55 @@ static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma)
mpol_cond_put(vma->vm_policy);
}
-static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp,
+static struct folio *shmem_swapin(swp_entry_t swap, gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
{
struct vm_area_struct pvma;
struct page *page;
- struct vm_fault vmf;
+ struct vm_fault vmf = {
+ .vma = &pvma,
+ };
shmem_pseudo_vma_init(&pvma, info, index);
- vmf.vma = &pvma;
- vmf.address = 0;
page = swap_cluster_readahead(swap, gfp, &vmf);
shmem_pseudo_vma_destroy(&pvma);
- return page;
+ if (!page)
+ return NULL;
+ return page_folio(page);
+}
+
+/*
+ * Make sure huge_gfp is always more limited than limit_gfp.
+ * Some of the flags set permissions, while others set limitations.
+ */
+static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
+{
+ gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM;
+ gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY;
+ gfp_t zoneflags = limit_gfp & GFP_ZONEMASK;
+ gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK);
+
+ /* Allow allocations only from the originally specified zones. */
+ result |= zoneflags;
+
+ /*
+ * Minimize the result gfp by taking the union with the deny flags,
+ * and the intersection of the allow flags.
+ */
+ result |= (limit_gfp & denyflags);
+ result |= (huge_gfp & limit_gfp) & allowflags;
+
+ return result;
}
-static struct page *shmem_alloc_hugepage(gfp_t gfp,
+static struct folio *shmem_alloc_hugefolio(gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
{
struct vm_area_struct pvma;
struct address_space *mapping = info->vfs_inode.i_mapping;
pgoff_t hindex;
- struct page *page;
-
- if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
- return NULL;
+ struct folio *folio;
hindex = round_down(index, HPAGE_PMD_NR);
if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1,
@@ -1481,37 +1544,35 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp,
return NULL;
shmem_pseudo_vma_init(&pvma, info, hindex);
- page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN,
- HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true);
+ folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, &pvma, 0, true);
shmem_pseudo_vma_destroy(&pvma);
- if (page)
- prep_transhuge_page(page);
- return page;
+ if (!folio)
+ count_vm_event(THP_FILE_FALLBACK);
+ return folio;
}
-static struct page *shmem_alloc_page(gfp_t gfp,
+static struct folio *shmem_alloc_folio(gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
{
struct vm_area_struct pvma;
- struct page *page;
+ struct folio *folio;
shmem_pseudo_vma_init(&pvma, info, index);
- page = alloc_page_vma(gfp, &pvma, 0);
+ folio = vma_alloc_folio(gfp, 0, &pvma, 0, false);
shmem_pseudo_vma_destroy(&pvma);
- return page;
+ return folio;
}
-static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
- struct inode *inode,
+static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode,
pgoff_t index, bool huge)
{
struct shmem_inode_info *info = SHMEM_I(inode);
- struct page *page;
+ struct folio *folio;
int nr;
int err = -ENOSPC;
- if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
huge = false;
nr = huge ? HPAGE_PMD_NR : 1;
@@ -1519,13 +1580,13 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp,
goto failed;
if (huge)
- page = shmem_alloc_hugepage(gfp, info, index);
+ folio = shmem_alloc_hugefolio(gfp, info, index);
else
- page = shmem_alloc_page(gfp, info, index);
- if (page) {
- __SetPageLocked(page);
- __SetPageSwapBacked(page);
- return page;
+ folio = shmem_alloc_folio(gfp, info, index);
+ if (folio) {
+ __folio_set_locked(folio);
+ __folio_set_swapbacked(folio);
+ return folio;
}
err = -ENOMEM;
@@ -1536,7 +1597,7 @@ failed:
/*
* When a page is moved from swapcache to shmem filecache (either by the
- * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of
+ * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of
* shmem_unuse_inode()), it may have been read in earlier from swap, in
* ignorance of the mapping it belongs to. If that mapping has special
* constraints (like the gma500 GEM driver, which requires RAM below 4GB),
@@ -1546,53 +1607,57 @@ failed:
* NUMA mempolicy, and applied also to anonymous pages in do_swap_page();
* but for now it is a simple matter of zone.
*/
-static bool shmem_should_replace_page(struct page *page, gfp_t gfp)
+static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp)
{
- return page_zonenum(page) > gfp_zone(gfp);
+ return folio_zonenum(folio) > gfp_zone(gfp);
}
-static int shmem_replace_page(struct page **pagep, gfp_t gfp,
+static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
struct shmem_inode_info *info, pgoff_t index)
{
- struct page *oldpage, *newpage;
+ struct folio *old, *new;
struct address_space *swap_mapping;
swp_entry_t entry;
pgoff_t swap_index;
int error;
- oldpage = *pagep;
- entry.val = page_private(oldpage);
+ old = *foliop;
+ entry = folio_swap_entry(old);
swap_index = swp_offset(entry);
- swap_mapping = page_mapping(oldpage);
+ swap_mapping = swap_address_space(entry);
/*
* We have arrived here because our zones are constrained, so don't
* limit chance of success by further cpuset and node constraints.
*/
gfp &= ~GFP_CONSTRAINT_MASK;
- newpage = shmem_alloc_page(gfp, info, index);
- if (!newpage)
+ VM_BUG_ON_FOLIO(folio_test_large(old), old);
+ new = shmem_alloc_folio(gfp, info, index);
+ if (!new)
return -ENOMEM;
- get_page(newpage);
- copy_highpage(newpage, oldpage);
- flush_dcache_page(newpage);
+ folio_get(new);
+ folio_copy(new, old);
+ flush_dcache_folio(new);
- __SetPageLocked(newpage);
- __SetPageSwapBacked(newpage);
- SetPageUptodate(newpage);
- set_page_private(newpage, entry.val);
- SetPageSwapCache(newpage);
+ __folio_set_locked(new);
+ __folio_set_swapbacked(new);
+ folio_mark_uptodate(new);
+ folio_set_swap_entry(new, entry);
+ folio_set_swapcache(new);
/*
* Our caller will very soon move newpage out of swapcache, but it's
* a nice clean interface for us to replace oldpage by newpage there.
*/
xa_lock_irq(&swap_mapping->i_pages);
- error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage);
+ error = shmem_replace_entry(swap_mapping, swap_index, old, new);
if (!error) {
- __inc_node_page_state(newpage, NR_FILE_PAGES);
- __dec_node_page_state(oldpage, NR_FILE_PAGES);
+ mem_cgroup_migrate(old, new);
+ __lruvec_stat_mod_folio(new, NR_FILE_PAGES, 1);
+ __lruvec_stat_mod_folio(new, NR_SHMEM, 1);
+ __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -1);
+ __lruvec_stat_mod_folio(old, NR_SHMEM, -1);
}
xa_unlock_irq(&swap_mapping->i_pages);
@@ -1602,48 +1667,78 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
* both PageSwapCache and page_private after getting page lock;
* but be defensive. Reverse old to newpage for clear and free.
*/
- oldpage = newpage;
+ old = new;
} else {
- mem_cgroup_migrate(oldpage, newpage);
- lru_cache_add_anon(newpage);
- *pagep = newpage;
+ folio_add_lru(new);
+ *foliop = new;
}
- ClearPageSwapCache(oldpage);
- set_page_private(oldpage, 0);
+ folio_clear_swapcache(old);
+ old->private = NULL;
- unlock_page(oldpage);
- put_page(oldpage);
- put_page(oldpage);
+ folio_unlock(old);
+ folio_put_refs(old, 2);
return error;
}
+static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
+ struct folio *folio, swp_entry_t swap)
+{
+ struct address_space *mapping = inode->i_mapping;
+ struct shmem_inode_info *info = SHMEM_I(inode);
+ swp_entry_t swapin_error;
+ void *old;
+
+ swapin_error = make_swapin_error_entry(&folio->page);
+ old = xa_cmpxchg_irq(&mapping->i_pages, index,
+ swp_to_radix_entry(swap),
+ swp_to_radix_entry(swapin_error), 0);
+ if (old != swp_to_radix_entry(swap))
+ return;
+
+ folio_wait_writeback(folio);
+ delete_from_swap_cache(folio);
+ spin_lock_irq(&info->lock);
+ /*
+ * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks won't
+ * be 0 when inode is released and thus trigger WARN_ON(inode->i_blocks) in
+ * shmem_evict_inode.
+ */
+ info->alloced--;
+ info->swapped--;
+ shmem_recalc_inode(inode);
+ spin_unlock_irq(&info->lock);
+ swap_free(swap);
+}
+
/*
- * Swap in the page pointed to by *pagep.
- * Caller has to make sure that *pagep contains a valid swapped page.
- * Returns 0 and the page in pagep if success. On failure, returns the
- * the error code and NULL in *pagep.
+ * Swap in the folio pointed to by *foliop.
+ * Caller has to make sure that *foliop contains a valid swapped folio.
+ * Returns 0 and the folio in foliop if success. On failure, returns the
+ * error code and NULL in *foliop.
*/
-static int shmem_swapin_page(struct inode *inode, pgoff_t index,
- struct page **pagep, enum sgp_type sgp,
+static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
+ struct folio **foliop, enum sgp_type sgp,
gfp_t gfp, struct vm_area_struct *vma,
vm_fault_t *fault_type)
{
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode);
- struct mm_struct *charge_mm = vma ? vma->vm_mm : current->mm;
- struct mem_cgroup *memcg;
- struct page *page;
+ struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL;
+ struct folio *folio = NULL;
swp_entry_t swap;
int error;
- VM_BUG_ON(!*pagep || !xa_is_value(*pagep));
- swap = radix_to_swp_entry(*pagep);
- *pagep = NULL;
+ VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
+ swap = radix_to_swp_entry(*foliop);
+ *foliop = NULL;
+
+ if (is_swapin_error_entry(swap))
+ return -EIO;
/* Look it up and read it in.. */
- page = lookup_swap_cache(swap, NULL, 0);
- if (!page) {
+ folio = swap_cache_get_folio(swap, NULL, 0);
+ if (!folio) {
/* Or update major stats only when swapin succeeds?? */
if (fault_type) {
*fault_type |= VM_FAULT_MAJOR;
@@ -1651,114 +1746,101 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index,
count_memcg_event_mm(charge_mm, PGMAJFAULT);
}
/* Here we actually start the io */
- page = shmem_swapin(swap, gfp, info, index);
- if (!page) {
+ folio = shmem_swapin(swap, gfp, info, index);
+ if (!folio) {
error = -ENOMEM;
goto failed;
}
}
- /* We have to do this with page locked to prevent races */
- lock_page(page);
- if (!PageSwapCache(page) || page_private(page) != swap.val ||
+ /* We have to do this with folio locked to prevent races */
+ folio_lock(folio);
+ if (!folio_test_swapcache(folio) ||
+ folio_swap_entry(folio).val != swap.val ||
!shmem_confirm_swap(mapping, index, swap)) {
error = -EEXIST;
goto unlock;
}
- if (!PageUptodate(page)) {
+ if (!folio_test_uptodate(folio)) {
error = -EIO;
goto failed;
}
- wait_on_page_writeback(page);
+ folio_wait_writeback(folio);
+
+ /*
+ * Some architectures may have to restore extra metadata to the
+ * folio after reading from swap.
+ */
+ arch_swap_restore(swap, folio);
- if (shmem_should_replace_page(page, gfp)) {
- error = shmem_replace_page(&page, gfp, info, index);
+ if (shmem_should_replace_folio(folio, gfp)) {
+ error = shmem_replace_folio(&folio, gfp, info, index);
if (error)
goto failed;
}
- error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
- false);
- if (!error) {
- error = shmem_add_to_page_cache(page, mapping, index,
- swp_to_radix_entry(swap), gfp);
- /*
- * We already confirmed swap under page lock, and make
- * no memory allocation here, so usually no possibility
- * of error; but free_swap_and_cache() only trylocks a
- * page, so it is just possible that the entry has been
- * truncated or holepunched since swap was confirmed.
- * shmem_undo_range() will have done some of the
- * unaccounting, now delete_from_swap_cache() will do
- * the rest.
- */
- if (error) {
- mem_cgroup_cancel_charge(page, memcg, false);
- delete_from_swap_cache(page);
- }
- }
+ error = shmem_add_to_page_cache(folio, mapping, index,
+ swp_to_radix_entry(swap), gfp,
+ charge_mm);
if (error)
goto failed;
- mem_cgroup_commit_charge(page, memcg, true, false);
-
spin_lock_irq(&info->lock);
info->swapped--;
shmem_recalc_inode(inode);
spin_unlock_irq(&info->lock);
if (sgp == SGP_WRITE)
- mark_page_accessed(page);
+ folio_mark_accessed(folio);
- delete_from_swap_cache(page);
- set_page_dirty(page);
+ delete_from_swap_cache(folio);
+ folio_mark_dirty(folio);
swap_free(swap);
- *pagep = page;
+ *foliop = folio;
return 0;
failed:
if (!shmem_confirm_swap(mapping, index, swap))
error = -EEXIST;
+ if (error == -EIO)
+ shmem_set_folio_swapin_error(inode, index, folio, swap);
unlock:
- if (page) {
- unlock_page(page);
- put_page(page);
+ if (folio) {
+ folio_unlock(folio);
+ folio_put(folio);
}
return error;
}
/*
- * shmem_getpage_gfp - find page in cache, or get from swap, or allocate
+ * shmem_get_folio_gfp - find page in cache, or get from swap, or allocate
*
* If we allocate a new one we do not mark it dirty. That's up to the
* vm. If we swap it in we mark it dirty since we also free the swap
* entry since a page cannot live in both the swap and page cache.
*
- * vmf and fault_type are only supplied by shmem_fault:
+ * vma, vmf, and fault_type are only supplied by shmem_fault:
* otherwise they are NULL.
*/
-static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
- struct page **pagep, enum sgp_type sgp, gfp_t gfp,
- struct vm_area_struct *vma, struct vm_fault *vmf,
- vm_fault_t *fault_type)
+static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
+ struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
+ struct vm_area_struct *vma, struct vm_fault *vmf,
+ vm_fault_t *fault_type)
{
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo;
struct mm_struct *charge_mm;
- struct mem_cgroup *memcg;
- struct page *page;
- enum sgp_type sgp_huge = sgp;
+ struct folio *folio;
pgoff_t hindex = index;
+ gfp_t huge_gfp;
int error;
int once = 0;
int alloced = 0;
if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT))
return -EFBIG;
- if (sgp == SGP_NOHUGE || sgp == SGP_HUGE)
- sgp = SGP_CACHE;
repeat:
if (sgp <= SGP_CACHE &&
((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
@@ -1766,38 +1848,53 @@ repeat:
}
sbinfo = SHMEM_SB(inode->i_sb);
- charge_mm = vma ? vma->vm_mm : current->mm;
+ charge_mm = vma ? vma->vm_mm : NULL;
- page = find_lock_entry(mapping, index);
- if (xa_is_value(page)) {
- error = shmem_swapin_page(inode, index, &page,
+ folio = __filemap_get_folio(mapping, index, FGP_ENTRY | FGP_LOCK, 0);
+ if (folio && vma && userfaultfd_minor(vma)) {
+ if (!xa_is_value(folio)) {
+ folio_unlock(folio);
+ folio_put(folio);
+ }
+ *fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
+ return 0;
+ }
+
+ if (xa_is_value(folio)) {
+ error = shmem_swapin_folio(inode, index, &folio,
sgp, gfp, vma, fault_type);
if (error == -EEXIST)
goto repeat;
- *pagep = page;
+ *foliop = folio;
return error;
}
- if (page && sgp == SGP_WRITE)
- mark_page_accessed(page);
-
- /* fallocated page? */
- if (page && !PageUptodate(page)) {
+ if (folio) {
+ hindex = folio->index;
+ if (sgp == SGP_WRITE)
+ folio_mark_accessed(folio);
+ if (folio_test_uptodate(folio))
+ goto out;
+ /* fallocated folio */
if (sgp != SGP_READ)
goto clear;
- unlock_page(page);
- put_page(page);
- page = NULL;
+ folio_unlock(folio);
+ folio_put(folio);
}
- if (page || sgp == SGP_READ) {
- *pagep = page;
+
+ /*
+ * SGP_READ: succeed on hole, with NULL folio, letting caller zero.
+ * SGP_NOALLOC: fail on hole, with NULL folio, letting caller fail.
+ */
+ *foliop = NULL;
+ if (sgp == SGP_READ)
return 0;
- }
+ if (sgp == SGP_NOALLOC)
+ return -ENOENT;
/*
- * Fast cache lookup did not find it:
- * bring it back from swap or allocate.
+ * Fast cache lookup and swap lookup did not find it: allocate.
*/
if (vma && userfaultfd_missing(vma)) {
@@ -1805,48 +1902,25 @@ repeat:
return 0;
}
- /* shmem_symlink() */
- if (mapping->a_ops != &shmem_aops)
- goto alloc_nohuge;
- if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE)
+ if (!shmem_is_huge(vma, inode, index, false))
goto alloc_nohuge;
- if (shmem_huge == SHMEM_HUGE_FORCE)
- goto alloc_huge;
- switch (sbinfo->huge) {
- loff_t i_size;
- pgoff_t off;
- case SHMEM_HUGE_NEVER:
- goto alloc_nohuge;
- case SHMEM_HUGE_WITHIN_SIZE:
- off = round_up(index, HPAGE_PMD_NR);
- i_size = round_up(i_size_read(inode), PAGE_SIZE);
- if (i_size >= HPAGE_PMD_SIZE &&
- i_size >> PAGE_SHIFT >= off)
- goto alloc_huge;
- /* fallthrough */
- case SHMEM_HUGE_ADVISE:
- if (sgp_huge == SGP_HUGE)
- goto alloc_huge;
- /* TODO: implement fadvise() hints */
- goto alloc_nohuge;
- }
-alloc_huge:
- page = shmem_alloc_and_acct_page(gfp, inode, index, true);
- if (IS_ERR(page)) {
+ huge_gfp = vma_thp_gfp_mask(vma);
+ huge_gfp = limit_gfp_mask(huge_gfp, gfp);
+ folio = shmem_alloc_and_acct_folio(huge_gfp, inode, index, true);
+ if (IS_ERR(folio)) {
alloc_nohuge:
- page = shmem_alloc_and_acct_page(gfp, inode,
- index, false);
+ folio = shmem_alloc_and_acct_folio(gfp, inode, index, false);
}
- if (IS_ERR(page)) {
+ if (IS_ERR(folio)) {
int retry = 5;
- error = PTR_ERR(page);
- page = NULL;
+ error = PTR_ERR(folio);
+ folio = NULL;
if (error != -ENOSPC)
goto unlock;
/*
- * Try to reclaim some space by splitting a huge page
+ * Try to reclaim some space by splitting a large folio
* beyond i_size on the filesystem.
*/
while (retry--) {
@@ -1861,41 +1935,30 @@ alloc_nohuge:
goto unlock;
}
- if (PageTransHuge(page))
- hindex = round_down(index, HPAGE_PMD_NR);
- else
- hindex = index;
+ hindex = round_down(index, folio_nr_pages(folio));
if (sgp == SGP_WRITE)
- __SetPageReferenced(page);
+ __folio_set_referenced(folio);
- error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
- PageTransHuge(page));
+ error = shmem_add_to_page_cache(folio, mapping, hindex,
+ NULL, gfp & GFP_RECLAIM_MASK,
+ charge_mm);
if (error)
goto unacct;
- error = shmem_add_to_page_cache(page, mapping, hindex,
- NULL, gfp & GFP_RECLAIM_MASK);
- if (error) {
- mem_cgroup_cancel_charge(page, memcg,
- PageTransHuge(page));
- goto unacct;
- }
- mem_cgroup_commit_charge(page, memcg, false,
- PageTransHuge(page));
- lru_cache_add_anon(page);
+ folio_add_lru(folio);
spin_lock_irq(&info->lock);
- info->alloced += compound_nr(page);
- inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page);
+ info->alloced += folio_nr_pages(folio);
+ inode->i_blocks += (blkcnt_t)BLOCKS_PER_PAGE << folio_order(folio);
shmem_recalc_inode(inode);
spin_unlock_irq(&info->lock);
alloced = true;
- if (PageTransHuge(page) &&
+ if (folio_test_pmd_mappable(folio) &&
DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
- hindex + HPAGE_PMD_NR - 1) {
+ folio_next_index(folio) - 1) {
/*
- * Part of the huge page is beyond i_size: subject
+ * Part of the large folio is beyond i_size: subject
* to shrink under memory pressure.
*/
spin_lock(&sbinfo->shrinklist_lock);
@@ -1912,33 +1975,31 @@ alloc_nohuge:
}
/*
- * Let SGP_FALLOC use the SGP_WRITE optimization on a new page.
+ * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio.
*/
if (sgp == SGP_FALLOC)
sgp = SGP_WRITE;
clear:
/*
- * Let SGP_WRITE caller clear ends if write does not fill page;
- * but SGP_FALLOC on a page fallocated earlier must initialize
+ * Let SGP_WRITE caller clear ends if write does not fill folio;
+ * but SGP_FALLOC on a folio fallocated earlier must initialize
* it now, lest undo on failure cancel our earlier guarantee.
*/
- if (sgp != SGP_WRITE && !PageUptodate(page)) {
- struct page *head = compound_head(page);
- int i;
+ if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) {
+ long i, n = folio_nr_pages(folio);
- for (i = 0; i < compound_nr(head); i++) {
- clear_highpage(head + i);
- flush_dcache_page(head + i);
- }
- SetPageUptodate(head);
+ for (i = 0; i < n; i++)
+ clear_highpage(folio_page(folio, i));
+ flush_dcache_folio(folio);
+ folio_mark_uptodate(folio);
}
/* Perhaps the file has been truncated since we checked */
if (sgp <= SGP_CACHE &&
((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) {
if (alloced) {
- ClearPageDirty(page);
- delete_from_page_cache(page);
+ folio_clear_dirty(folio);
+ filemap_remove_folio(folio);
spin_lock_irq(&info->lock);
shmem_recalc_inode(inode);
spin_unlock_irq(&info->lock);
@@ -1946,24 +2007,25 @@ clear:
error = -EINVAL;
goto unlock;
}
- *pagep = page + index - hindex;
+out:
+ *foliop = folio;
return 0;
/*
* Error recovery.
*/
unacct:
- shmem_inode_unacct_blocks(inode, compound_nr(page));
+ shmem_inode_unacct_blocks(inode, folio_nr_pages(folio));
- if (PageTransHuge(page)) {
- unlock_page(page);
- put_page(page);
+ if (folio_test_large(folio)) {
+ folio_unlock(folio);
+ folio_put(folio);
goto alloc_nohuge;
}
unlock:
- if (page) {
- unlock_page(page);
- put_page(page);
+ if (folio) {
+ folio_unlock(folio);
+ folio_put(folio);
}
if (error == -ENOSPC && !once++) {
spin_lock_irq(&info->lock);
@@ -1976,6 +2038,13 @@ unlock:
return error;
}
+int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop,
+ enum sgp_type sgp)
+{
+ return shmem_get_folio_gfp(inode, index, foliop, sgp,
+ mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL);
+}
+
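
For illustration only (not part of the patch), a hypothetical caller of the new shmem_get_folio() has to honour the SGP_READ convention documented above: success with *foliop == NULL means a hole, and any folio returned comes back locked. A minimal sketch, with demo_peek() being a made-up name:

	/* Hypothetical caller: demo_peek() is not in the tree. */
	static int demo_peek(struct inode *inode, pgoff_t index)
	{
		struct folio *folio;
		int err = shmem_get_folio(inode, index, &folio, SGP_READ);

		if (err)
			return err;
		if (!folio)
			return 0;	/* hole: caller treats it as zeroes */
		/* ... copy data out of the locked folio here ... */
		folio_unlock(folio);
		folio_put(folio);
		return 0;
	}
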
/*
* This is like autoremove_wake_function, but it removes the wait queue
* entry unconditionally - even if something else had already woken the
@@ -1993,14 +2062,14 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
struct vm_area_struct *vma = vmf->vma;
struct inode *inode = file_inode(vma->vm_file);
gfp_t gfp = mapping_gfp_mask(inode->i_mapping);
- enum sgp_type sgp;
+ struct folio *folio = NULL;
int err;
vm_fault_t ret = VM_FAULT_LOCKED;
/*
* Trinity finds that probing a hole which tmpfs is punching can
* prevent the hole-punch from ever completing: which in turn
- * locks writers out with its hold on i_mutex. So refrain from
+ * locks writers out with its hold on i_rwsem. So refrain from
* faulting pages into the hole while it's being punched. Although
* shmem_undo_range() does remove the additions, it may be unable to
* keep up, as each new page needs its own unmap_mapping_range() call,
@@ -2011,7 +2080,7 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
* we just need to make racing faults a rare case.
*
* The implementation below would be much simpler if we just used a
- * standard mutex or completion: but we cannot take i_mutex in fault,
+ * standard mutex or completion: but we cannot take i_rwsem in fault,
* and bloating every shmem inode for this unlikely case would be sad.
*/
if (unlikely(inode->i_private)) {
@@ -2056,18 +2125,12 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
spin_unlock(&inode->i_lock);
}
- sgp = SGP_CACHE;
-
- if ((vma->vm_flags & VM_NOHUGEPAGE) ||
- test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
- sgp = SGP_NOHUGE;
- else if (vma->vm_flags & VM_HUGEPAGE)
- sgp = SGP_HUGE;
-
- err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp,
+ err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE,
gfp, vma, vmf, &ret);
if (err)
return vmf_error(err);
+ if (folio)
+ vmf->page = folio_file_page(folio, vmf->pgoff);
return ret;
}
@@ -2089,7 +2152,7 @@ unsigned long shmem_get_unmapped_area(struct file *file,
get_area = current->mm->get_unmapped_area;
addr = get_area(file, uaddr, len, pgoff, flags);
- if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE))
+ if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
return addr;
if (IS_ERR_VALUE(addr))
return addr;
@@ -2178,91 +2241,114 @@ static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
}
#endif
-int shmem_lock(struct file *file, int lock, struct user_struct *user)
+int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
{
struct inode *inode = file_inode(file);
struct shmem_inode_info *info = SHMEM_I(inode);
int retval = -ENOMEM;
- spin_lock_irq(&info->lock);
+ /*
+ * What serializes the accesses to info->flags?
+ * ipc_lock_object() when called from shmctl_do_lock(),
+ * no serialization needed when called from shm_destroy().
+ */
if (lock && !(info->flags & VM_LOCKED)) {
- if (!user_shm_lock(inode->i_size, user))
+ if (!user_shm_lock(inode->i_size, ucounts))
goto out_nomem;
info->flags |= VM_LOCKED;
mapping_set_unevictable(file->f_mapping);
}
- if (!lock && (info->flags & VM_LOCKED) && user) {
- user_shm_unlock(inode->i_size, user);
+ if (!lock && (info->flags & VM_LOCKED) && ucounts) {
+ user_shm_unlock(inode->i_size, ucounts);
info->flags &= ~VM_LOCKED;
mapping_clear_unevictable(file->f_mapping);
}
retval = 0;
out_nomem:
- spin_unlock_irq(&info->lock);
return retval;
}
static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
{
struct shmem_inode_info *info = SHMEM_I(file_inode(file));
+ int ret;
- if (info->seals & F_SEAL_FUTURE_WRITE) {
- /*
- * New PROT_WRITE and MAP_SHARED mmaps are not allowed when
- * "future write" seal active.
- */
- if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE))
- return -EPERM;
+ ret = seal_check_future_write(info->seals, vma);
+ if (ret)
+ return ret;
- /*
- * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as
- * MAP_SHARED and read-only, take care to not allow mprotect to
- * revert protections on such mappings. Do this only for shared
- * mappings. For private mappings, don't need to mask
- * VM_MAYWRITE as we still want them to be COW-writable.
- */
- if (vma->vm_flags & VM_SHARED)
- vma->vm_flags &= ~(VM_MAYWRITE);
- }
+ /* arm64 - allow memory tagging on RAM-based files */
+ vma->vm_flags |= VM_MTE_ALLOWED;
file_accessed(file);
vma->vm_ops = &shmem_vm_ops;
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
- ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
- (vma->vm_end & HPAGE_PMD_MASK)) {
- khugepaged_enter(vma, vma->vm_flags);
- }
return 0;
}
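
The seal handling that moved into seal_check_future_write() keeps the behaviour the removed comment described, and it is observable from userspace. A sketch, assuming a glibc exposing memfd_create() (error handling trimmed):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <sys/mman.h>
	#include <unistd.h>

	/* After F_SEAL_FUTURE_WRITE, a new shared writable mmap fails in
	 * shmem_mmap() with EPERM; existing mappings are unaffected. */
	int main(void)
	{
		int fd = memfd_create("demo", MFD_ALLOW_SEALING);
		void *p;

		if (fd < 0 || ftruncate(fd, 4096))
			return 1;
		fcntl(fd, F_ADD_SEALS, F_SEAL_FUTURE_WRITE);
		p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED,
			 fd, 0);
		return p == MAP_FAILED ? 0 : 1;	/* expect MAP_FAILED */
	}
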
-static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir,
+#ifdef CONFIG_TMPFS_XATTR
+static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
+
+/*
+ * chattr's fsflags are unrelated to extended attributes,
+ * but tmpfs has chosen to enable them under the same config option.
+ */
+static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
+{
+ unsigned int i_flags = 0;
+
+ if (fsflags & FS_NOATIME_FL)
+ i_flags |= S_NOATIME;
+ if (fsflags & FS_APPEND_FL)
+ i_flags |= S_APPEND;
+ if (fsflags & FS_IMMUTABLE_FL)
+ i_flags |= S_IMMUTABLE;
+ /*
+ * But FS_NODUMP_FL does not require any action in i_flags.
+ */
+ inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE);
+}
+#else
+static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags)
+{
+}
+#define shmem_initxattrs NULL
+#endif
+
+static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir,
umode_t mode, dev_t dev, unsigned long flags)
{
struct inode *inode;
struct shmem_inode_info *info;
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+ ino_t ino;
- if (shmem_reserve_inode(sb))
+ if (shmem_reserve_inode(sb, &ino))
return NULL;
inode = new_inode(sb);
if (inode) {
- inode->i_ino = get_next_ino();
- inode_init_owner(inode, dir, mode);
+ inode->i_ino = ino;
+ inode_init_owner(&init_user_ns, inode, dir, mode);
inode->i_blocks = 0;
inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
- inode->i_generation = prandom_u32();
+ inode->i_generation = get_random_u32();
info = SHMEM_I(inode);
memset(info, 0, (char *)inode - (char *)info);
spin_lock_init(&info->lock);
atomic_set(&info->stop_eviction, 0);
info->seals = F_SEAL_SEAL;
info->flags = flags & VM_NORESERVE;
+ info->i_crtime = inode->i_mtime;
+ info->fsflags = (dir == NULL) ? 0 :
+ SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED;
+ if (info->fsflags)
+ shmem_set_inode_flags(inode, info->fsflags);
INIT_LIST_HEAD(&info->shrinklist);
INIT_LIST_HEAD(&info->swaplist);
simple_xattrs_init(&info->xattrs);
cache_no_acl(inode);
+ mapping_set_large_folios(inode->i_mapping);
switch (mode & S_IFMT) {
default:
@@ -2298,185 +2384,142 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
return inode;
}
-bool shmem_mapping(struct address_space *mapping)
-{
- return mapping->a_ops == &shmem_aops;
-}
-
-static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- bool zeropage,
- struct page **pagep)
+#ifdef CONFIG_USERFAULTFD
+int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
+ pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ unsigned long src_addr,
+ bool zeropage, bool wp_copy,
+ struct page **pagep)
{
struct inode *inode = file_inode(dst_vma->vm_file);
struct shmem_inode_info *info = SHMEM_I(inode);
struct address_space *mapping = inode->i_mapping;
gfp_t gfp = mapping_gfp_mask(mapping);
pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
- struct mem_cgroup *memcg;
- spinlock_t *ptl;
void *page_kaddr;
- struct page *page;
- pte_t _dst_pte, *dst_pte;
+ struct folio *folio;
int ret;
- pgoff_t offset, max_off;
+ pgoff_t max_off;
- ret = -ENOMEM;
- if (!shmem_inode_acct_block(inode, 1))
- goto out;
+ if (!shmem_inode_acct_block(inode, 1)) {
+ /*
+ * We may have got a page, returned -ENOENT triggering a retry,
+ * and now we find ourselves with -ENOMEM. Release the page, to
+ * avoid a BUG_ON in our caller.
+ */
+ if (unlikely(*pagep)) {
+ put_page(*pagep);
+ *pagep = NULL;
+ }
+ return -ENOMEM;
+ }
if (!*pagep) {
- page = shmem_alloc_page(gfp, info, pgoff);
- if (!page)
+ ret = -ENOMEM;
+ folio = shmem_alloc_folio(gfp, info, pgoff);
+ if (!folio)
goto out_unacct_blocks;
- if (!zeropage) { /* mcopy_atomic */
- page_kaddr = kmap_atomic(page);
+ if (!zeropage) { /* COPY */
+ page_kaddr = kmap_local_folio(folio, 0);
+ /*
+ * The read mmap_lock is held here. Despite the
+			 * mmap_lock being read-recursive, a deadlock is still
+ * possible if a writer has taken a lock. For example:
+ *
+ * process A thread 1 takes read lock on own mmap_lock
+ * process A thread 2 calls mmap, blocks taking write lock
+ * process B thread 1 takes page fault, read lock on own mmap lock
+ * process B thread 2 calls mmap, blocks taking write lock
+ * process A thread 1 blocks taking read lock on process B
+ * process B thread 1 blocks taking read lock on process A
+ *
+ * Disable page faults to prevent potential deadlock
+ * and retry the copy outside the mmap_lock.
+ */
+ pagefault_disable();
ret = copy_from_user(page_kaddr,
(const void __user *)src_addr,
PAGE_SIZE);
- kunmap_atomic(page_kaddr);
+ pagefault_enable();
+ kunmap_local(page_kaddr);
- /* fallback to copy_from_user outside mmap_sem */
+ /* fallback to copy_from_user outside mmap_lock */
if (unlikely(ret)) {
- *pagep = page;
- shmem_inode_unacct_blocks(inode, 1);
+ *pagep = &folio->page;
+ ret = -ENOENT;
/* don't free the page */
- return -ENOENT;
+ goto out_unacct_blocks;
}
- } else { /* mfill_zeropage_atomic */
- clear_highpage(page);
+
+ flush_dcache_folio(folio);
+ } else { /* ZEROPAGE */
+ clear_user_highpage(&folio->page, dst_addr);
}
} else {
- page = *pagep;
+ folio = page_folio(*pagep);
+ VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
*pagep = NULL;
}
- VM_BUG_ON(PageLocked(page) || PageSwapBacked(page));
- __SetPageLocked(page);
- __SetPageSwapBacked(page);
- __SetPageUptodate(page);
+ VM_BUG_ON(folio_test_locked(folio));
+ VM_BUG_ON(folio_test_swapbacked(folio));
+ __folio_set_locked(folio);
+ __folio_set_swapbacked(folio);
+ __folio_mark_uptodate(folio);
ret = -EFAULT;
- offset = linear_page_index(dst_vma, dst_addr);
max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
- if (unlikely(offset >= max_off))
+ if (unlikely(pgoff >= max_off))
goto out_release;
- ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false);
+ ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL,
+ gfp & GFP_RECLAIM_MASK, dst_mm);
if (ret)
goto out_release;
- ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
- gfp & GFP_RECLAIM_MASK);
+ ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr,
+ &folio->page, true, wp_copy);
if (ret)
- goto out_release_uncharge;
-
- mem_cgroup_commit_charge(page, memcg, false, false);
-
- _dst_pte = mk_pte(page, dst_vma->vm_page_prot);
- if (dst_vma->vm_flags & VM_WRITE)
- _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte));
- else {
- /*
- * We don't set the pte dirty if the vma has no
- * VM_WRITE permission, so mark the page dirty or it
- * could be freed from under us. We could do it
- * unconditionally before unlock_page(), but doing it
- * only if VM_WRITE is not set is faster.
- */
- set_page_dirty(page);
- }
-
- dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl);
-
- ret = -EFAULT;
- max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
- if (unlikely(offset >= max_off))
- goto out_release_uncharge_unlock;
-
- ret = -EEXIST;
- if (!pte_none(*dst_pte))
- goto out_release_uncharge_unlock;
+ goto out_delete_from_cache;
- lru_cache_add_anon(page);
-
- spin_lock(&info->lock);
+ spin_lock_irq(&info->lock);
info->alloced++;
inode->i_blocks += BLOCKS_PER_PAGE;
shmem_recalc_inode(inode);
- spin_unlock(&info->lock);
-
- inc_mm_counter(dst_mm, mm_counter_file(page));
- page_add_file_rmap(page, false);
- set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
+ spin_unlock_irq(&info->lock);
- /* No need to invalidate - it was non-present before */
- update_mmu_cache(dst_vma, dst_addr, dst_pte);
- pte_unmap_unlock(dst_pte, ptl);
- unlock_page(page);
- ret = 0;
-out:
- return ret;
-out_release_uncharge_unlock:
- pte_unmap_unlock(dst_pte, ptl);
- ClearPageDirty(page);
- delete_from_page_cache(page);
-out_release_uncharge:
- mem_cgroup_cancel_charge(page, memcg, false);
+ folio_unlock(folio);
+ return 0;
+out_delete_from_cache:
+ filemap_remove_folio(folio);
out_release:
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
+ folio_put(folio);
out_unacct_blocks:
shmem_inode_unacct_blocks(inode, 1);
- goto out;
-}
-
-int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr,
- unsigned long src_addr,
- struct page **pagep)
-{
- return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
- dst_addr, src_addr, false, pagep);
-}
-
-int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm,
- pmd_t *dst_pmd,
- struct vm_area_struct *dst_vma,
- unsigned long dst_addr)
-{
- struct page *page = NULL;
-
- return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma,
- dst_addr, 0, true, &page);
+ return ret;
}
+#endif /* CONFIG_USERFAULTFD */
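
The minor-fault hook in shmem_get_folio_gfp() and shmem_mfill_atomic_pte() are the kernel half of userfaultfd(2): UFFDIO_COPY/UFFDIO_ZEROPAGE resolve missing faults through shmem_mfill_atomic_pte(), while minor faults registered as below are resolved with UFFDIO_CONTINUE. A userspace sketch, assuming a kernel that advertises UFFD_FEATURE_MINOR_SHMEM (error paths collapsed):

	#include <fcntl.h>
	#include <sys/ioctl.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <linux/userfaultfd.h>

	/* Register a shmem-backed range for minor faults; each fault then
	 * reaches handle_userfault(vmf, VM_UFFD_MINOR) as above. */
	static int uffd_register_minor(void *addr, size_t len)
	{
		int uffd = syscall(__NR_userfaultfd, O_CLOEXEC);
		struct uffdio_api api = {
			.api = UFFD_API,
			.features = UFFD_FEATURE_MINOR_SHMEM,
		};
		struct uffdio_register reg = {
			.range = { .start = (unsigned long)addr, .len = len },
			.mode = UFFDIO_REGISTER_MODE_MINOR,
		};

		if (uffd < 0 || ioctl(uffd, UFFDIO_API, &api) ||
		    ioctl(uffd, UFFDIO_REGISTER, &reg))
			return -1;
		return uffd;
	}
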
#ifdef CONFIG_TMPFS
static const struct inode_operations shmem_symlink_inode_operations;
static const struct inode_operations shmem_short_symlink_operations;
-#ifdef CONFIG_TMPFS_XATTR
-static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
-#else
-#define shmem_initxattrs NULL
-#endif
-
static int
shmem_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
struct inode *inode = mapping->host;
struct shmem_inode_info *info = SHMEM_I(inode);
pgoff_t index = pos >> PAGE_SHIFT;
+ struct folio *folio;
+ int ret = 0;
- /* i_mutex is held by caller */
+ /* i_rwsem is held by caller */
if (unlikely(info->seals & (F_SEAL_GROW |
F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) {
if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))
@@ -2485,7 +2528,20 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
return -EPERM;
}
- return shmem_getpage(inode, index, pagep, SGP_WRITE);
+ ret = shmem_get_folio(inode, index, &folio, SGP_WRITE);
+
+ if (ret)
+ return ret;
+
+ *pagep = folio_file_page(folio, index);
+ if (PageHWPoison(*pagep)) {
+ folio_unlock(folio);
+ folio_put(folio);
+ *pagep = NULL;
+ return -EIO;
+ }
+
+ return 0;
}
static int
@@ -2531,23 +2587,15 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
struct address_space *mapping = inode->i_mapping;
pgoff_t index;
unsigned long offset;
- enum sgp_type sgp = SGP_READ;
int error = 0;
ssize_t retval = 0;
loff_t *ppos = &iocb->ki_pos;
- /*
- * Might this read be for a stacking filesystem? Then when reading
- * holes of a sparse file, we actually need to allocate those pages,
- * and even mark them dirty, so it cannot exceed the max_blocks limit.
- */
- if (!iter_is_iovec(to))
- sgp = SGP_CACHE;
-
index = *ppos >> PAGE_SHIFT;
offset = *ppos & ~PAGE_MASK;
for (;;) {
+ struct folio *folio = NULL;
struct page *page = NULL;
pgoff_t end_index;
unsigned long nr, ret;
@@ -2562,21 +2610,26 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
break;
}
- error = shmem_getpage(inode, index, &page, sgp);
+ error = shmem_get_folio(inode, index, &folio, SGP_READ);
if (error) {
if (error == -EINVAL)
error = 0;
break;
}
- if (page) {
- if (sgp == SGP_CACHE)
- set_page_dirty(page);
- unlock_page(page);
+ if (folio) {
+ folio_unlock(folio);
+
+ page = folio_file_page(folio, index);
+ if (PageHWPoison(page)) {
+ folio_put(folio);
+ error = -EIO;
+ break;
+ }
}
/*
* We must evaluate after, since reads (unlike writes)
- * are called without i_mutex protection against truncate
+ * are called without i_rwsem protection against truncate
*/
nr = PAGE_SIZE;
i_size = i_size_read(inode);
@@ -2584,14 +2637,14 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (index == end_index) {
nr = i_size & ~PAGE_MASK;
if (nr <= offset) {
- if (page)
- put_page(page);
+ if (folio)
+ folio_put(folio);
break;
}
}
nr -= offset;
- if (page) {
+ if (folio) {
/*
* If users can be writing to this page using arbitrary
* virtual addresses, take care about potential aliasing
@@ -2603,23 +2656,35 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
* Mark the page accessed if we read the beginning.
*/
if (!offset)
- mark_page_accessed(page);
+ folio_mark_accessed(folio);
+ /*
+ * Ok, we have the page, and it's up-to-date, so
+ * now we can copy it to user space...
+ */
+ ret = copy_page_to_iter(page, offset, nr, to);
+ folio_put(folio);
+
+ } else if (user_backed_iter(to)) {
+ /*
+ * Copy to user tends to be so well optimized, but
+ * clear_user() not so much, that it is noticeably
+ * faster to copy the zero page instead of clearing.
+ */
+ ret = copy_page_to_iter(ZERO_PAGE(0), offset, nr, to);
} else {
- page = ZERO_PAGE(0);
- get_page(page);
+ /*
+ * But submitting the same page twice in a row to
+ * splice() - or others? - can result in confusion:
+ * so don't attempt that optimization on pipes etc.
+ */
+ ret = iov_iter_zero(nr, to);
}
- /*
- * Ok, we have the page, and it's up-to-date, so
- * now we can copy it to user space...
- */
- ret = copy_page_to_iter(page, offset, nr, to);
retval += ret;
offset += ret;
index += offset >> PAGE_SHIFT;
offset &= ~PAGE_MASK;
- put_page(page);
if (!iov_iter_count(to))
break;
if (ret < nr) {
@@ -2634,86 +2699,20 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
return retval ? retval : error;
}
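
The hole-read split above is visible from userspace: an iovec-backed read(2) of a hole now copies ZERO_PAGE(0) instead of clearing the user buffer. A sketch, assuming /dev/shm is a tmpfs mount and the file name is hypothetical:

	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		char buf[4096];
		int fd = open("/dev/shm/sparse-demo", O_RDWR | O_CREAT, 0600);

		if (fd < 0)
			return 1;
		ftruncate(fd, 1 << 20);	/* 1MiB of holes, no pages allocated */
		/* SGP_READ finds no folio; user_backed_iter() is true for
		 * read(2)/pread(2), so the ZERO_PAGE copy path runs. */
		pread(fd, buf, sizeof(buf), 0);
		close(fd);
		return 0;
	}
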
-/*
- * llseek SEEK_DATA or SEEK_HOLE through the page cache.
- */
-static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
- pgoff_t index, pgoff_t end, int whence)
-{
- struct page *page;
- struct pagevec pvec;
- pgoff_t indices[PAGEVEC_SIZE];
- bool done = false;
- int i;
-
- pagevec_init(&pvec);
- pvec.nr = 1; /* start small: we may be there already */
- while (!done) {
- pvec.nr = find_get_entries(mapping, index,
- pvec.nr, pvec.pages, indices);
- if (!pvec.nr) {
- if (whence == SEEK_DATA)
- index = end;
- break;
- }
- for (i = 0; i < pvec.nr; i++, index++) {
- if (index < indices[i]) {
- if (whence == SEEK_HOLE) {
- done = true;
- break;
- }
- index = indices[i];
- }
- page = pvec.pages[i];
- if (page && !xa_is_value(page)) {
- if (!PageUptodate(page))
- page = NULL;
- }
- if (index >= end ||
- (page && whence == SEEK_DATA) ||
- (!page && whence == SEEK_HOLE)) {
- done = true;
- break;
- }
- }
- pagevec_remove_exceptionals(&pvec);
- pagevec_release(&pvec);
- pvec.nr = PAGEVEC_SIZE;
- cond_resched();
- }
- return index;
-}
-
static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
{
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
- pgoff_t start, end;
- loff_t new_offset;
if (whence != SEEK_DATA && whence != SEEK_HOLE)
return generic_file_llseek_size(file, offset, whence,
MAX_LFS_FILESIZE, i_size_read(inode));
- inode_lock(inode);
- /* We're holding i_mutex so we can access i_size directly */
-
- if (offset < 0 || offset >= inode->i_size)
- offset = -ENXIO;
- else {
- start = offset >> PAGE_SHIFT;
- end = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT;
- new_offset = shmem_seek_hole_data(mapping, start, end, whence);
- new_offset <<= PAGE_SHIFT;
- if (new_offset > offset) {
- if (new_offset < inode->i_size)
- offset = new_offset;
- else if (whence == SEEK_DATA)
- offset = -ENXIO;
- else
- offset = inode->i_size;
- }
- }
+ if (offset < 0)
+ return -ENXIO;
+ inode_lock(inode);
+ /* We're holding i_rwsem so we can access i_size directly */
+ offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence);
if (offset >= 0)
offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
inode_unlock(inode);
@@ -2727,7 +2726,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_falloc shmem_falloc;
- pgoff_t start, index, end;
+ pgoff_t start, index, end, undo_fallocend;
int error;
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
@@ -2741,7 +2740,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
- /* protected by i_mutex */
+ /* protected by i_rwsem */
if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
error = -EPERM;
goto out;
@@ -2796,8 +2795,17 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
inode->i_private = &shmem_falloc;
spin_unlock(&inode->i_lock);
- for (index = start; index < end; index++) {
- struct page *page;
+ /*
+ * info->fallocend is only relevant when huge pages might be
+ * involved: to prevent split_huge_page() freeing fallocated
+ * pages when FALLOC_FL_KEEP_SIZE committed beyond i_size.
+ */
+ undo_fallocend = info->fallocend;
+ if (info->fallocend < end)
+ info->fallocend = end;
+
+ for (index = start; index < end; ) {
+ struct folio *folio;
/*
* Good, the fallocate(2) manpage permits EINTR: we may have
@@ -2808,9 +2816,11 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
error = -ENOMEM;
else
- error = shmem_getpage(inode, index, &page, SGP_FALLOC);
+ error = shmem_get_folio(inode, index, &folio,
+ SGP_FALLOC);
if (error) {
- /* Remove the !PageUptodate pages we added */
+ info->fallocend = undo_fallocend;
+ /* Remove the !uptodate folios we added */
if (index > start) {
shmem_undo_range(inode,
(loff_t)start << PAGE_SHIFT,
@@ -2820,34 +2830,45 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
}
/*
+ * Here is a more important optimization than it appears:
+ * a second SGP_FALLOC on the same large folio will clear it,
+ * making it uptodate and un-undoable if we fail later.
+ */
+ index = folio_next_index(folio);
+ /* Beware 32-bit wraparound */
+ if (!index)
+ index--;
+
+ /*
* Inform shmem_writepage() how far we have reached.
* No need for lock or barrier: we have the page lock.
*/
- shmem_falloc.next++;
- if (!PageUptodate(page))
- shmem_falloc.nr_falloced++;
+ if (!folio_test_uptodate(folio))
+ shmem_falloc.nr_falloced += index - shmem_falloc.next;
+ shmem_falloc.next = index;
/*
- * If !PageUptodate, leave it that way so that freeable pages
+ * If !uptodate, leave it that way so that freeable folios
* can be recognized if we need to rollback on error later.
- * But set_page_dirty so that memory pressure will swap rather
- * than free the pages we are allocating (and SGP_CACHE pages
+ * But mark it dirty so that memory pressure will swap rather
+ * than free the folios we are allocating (and SGP_CACHE folios
* might still be clean: we now need to mark those dirty too).
*/
- set_page_dirty(page);
- unlock_page(page);
- put_page(page);
+ folio_mark_dirty(folio);
+ folio_unlock(folio);
+ folio_put(folio);
cond_resched();
}
if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size)
i_size_write(inode, offset + len);
- inode->i_ctime = current_time(inode);
undone:
spin_lock(&inode->i_lock);
inode->i_private = NULL;
spin_unlock(&inode->i_lock);
out:
+ if (!error)
+ file_modified(file);
inode_unlock(inode);
return error;
}
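
For reference, the two modes shmem_fallocate() accepts correspond to the usual fallocate(2) calls; a sketch, again assuming /dev/shm is tmpfs and the path is hypothetical:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/dev/shm/prealloc-demo", O_RDWR | O_CREAT, 0600);

		if (fd < 0)
			return 1;
		/* Preallocate 8MiB beyond i_size; info->fallocend records the
		 * extent so huge-page splitting won't free these folios. */
		fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 8 << 20);
		/* Hole-punch (which requires KEEP_SIZE) takes the
		 * shmem_falloc waitq path above. */
		fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
			  0, 1 << 20);
		close(fd);
		return 0;
	}
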
@@ -2870,6 +2891,9 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_ffree = sbinfo->free_inodes;
}
/* else leave those fields 0 like simple_statfs */
+
+ buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b);
+
return 0;
}
@@ -2877,7 +2901,8 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf)
* File creation. Allocate an inode, and we're done..
*/
static int
-shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
+shmem_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, dev_t dev)
{
struct inode *inode;
int error = -ENOSPC;
@@ -2896,6 +2921,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
error = 0;
dir->i_size += BOGO_DIRENT_SIZE;
dir->i_ctime = dir->i_mtime = current_time(dir);
+ inode_inc_iversion(dir);
d_instantiate(dentry, inode);
dget(dentry); /* Extra count - pin the dentry in core */
}
@@ -2906,7 +2932,8 @@ out_iput:
}
static int
-shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
+shmem_tmpfile(struct user_namespace *mnt_userns, struct inode *dir,
+ struct file *file, umode_t mode)
{
struct inode *inode;
int error = -ENOSPC;
@@ -2921,28 +2948,30 @@ shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
error = simple_acl_create(dir, inode);
if (error)
goto out_iput;
- d_tmpfile(dentry, inode);
+ d_tmpfile(file, inode);
}
- return error;
+ return finish_open_simple(file, error);
out_iput:
iput(inode);
return error;
}
-static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int shmem_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
int error;
- if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0)))
+ if ((error = shmem_mknod(&init_user_ns, dir, dentry,
+ mode | S_IFDIR, 0)))
return error;
inc_nlink(dir);
return 0;
}
-static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode,
- bool excl)
+static int shmem_create(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, bool excl)
{
- return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
+ return shmem_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0);
}
/*
@@ -2961,13 +2990,14 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
* first link must skip that, to get the accounting right.
*/
if (inode->i_nlink) {
- ret = shmem_reserve_inode(inode->i_sb);
+ ret = shmem_reserve_inode(inode->i_sb, NULL);
if (ret)
goto out;
}
dir->i_size += BOGO_DIRENT_SIZE;
inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
+ inode_inc_iversion(dir);
inc_nlink(inode);
ihold(inode); /* New dentry reference */
dget(dentry); /* Extra pinning count for the created dentry */
@@ -2985,6 +3015,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry)
dir->i_size -= BOGO_DIRENT_SIZE;
inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
+ inode_inc_iversion(dir);
drop_nlink(inode);
dput(dentry); /* Undo the count from "create" - this does all the work */
return 0;
@@ -3000,29 +3031,8 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
return shmem_unlink(dir, dentry);
}
-static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
-{
- bool old_is_dir = d_is_dir(old_dentry);
- bool new_is_dir = d_is_dir(new_dentry);
-
- if (old_dir != new_dir && old_is_dir != new_is_dir) {
- if (old_is_dir) {
- drop_nlink(old_dir);
- inc_nlink(new_dir);
- } else {
- drop_nlink(new_dir);
- inc_nlink(old_dir);
- }
- }
- old_dir->i_ctime = old_dir->i_mtime =
- new_dir->i_ctime = new_dir->i_mtime =
- d_inode(old_dentry)->i_ctime =
- d_inode(new_dentry)->i_ctime = current_time(old_dir);
-
- return 0;
-}
-
-static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry)
+static int shmem_whiteout(struct user_namespace *mnt_userns,
+ struct inode *old_dir, struct dentry *old_dentry)
{
struct dentry *whiteout;
int error;
@@ -3031,7 +3041,7 @@ static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry)
if (!whiteout)
return -ENOMEM;
- error = shmem_mknod(old_dir, whiteout,
+ error = shmem_mknod(&init_user_ns, old_dir, whiteout,
S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
dput(whiteout);
if (error)
@@ -3054,7 +3064,10 @@ static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry)
 * it exists so that the VFS layer correctly frees it when it
* gets overwritten.
*/
-static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags)
+static int shmem_rename2(struct user_namespace *mnt_userns,
+ struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry,
+ unsigned int flags)
{
struct inode *inode = d_inode(old_dentry);
int they_are_dirs = S_ISDIR(inode->i_mode);
@@ -3063,7 +3076,7 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc
return -EINVAL;
if (flags & RENAME_EXCHANGE)
- return shmem_exchange(old_dir, old_dentry, new_dir, new_dentry);
+ return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry);
if (!simple_empty(new_dentry))
return -ENOTEMPTY;
@@ -3071,7 +3084,7 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc
if (flags & RENAME_WHITEOUT) {
int error;
- error = shmem_whiteout(old_dir, old_dentry);
+ error = shmem_whiteout(&init_user_ns, old_dir, old_dentry);
if (error)
return error;
}
@@ -3092,15 +3105,18 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc
old_dir->i_ctime = old_dir->i_mtime =
new_dir->i_ctime = new_dir->i_mtime =
inode->i_ctime = current_time(old_dir);
+ inode_inc_iversion(old_dir);
+ inode_inc_iversion(new_dir);
return 0;
}
-static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
+static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, const char *symname)
{
int error;
int len;
struct inode *inode;
- struct page *page;
+ struct folio *folio;
len = strlen(symname) + 1;
if (len > PAGE_SIZE)
@@ -3113,12 +3129,9 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
error = security_inode_init_security(inode, dir, &dentry->d_name,
shmem_initxattrs, NULL);
- if (error) {
- if (error != -EOPNOTSUPP) {
- iput(inode);
- return error;
- }
- error = 0;
+ if (error && error != -EOPNOTSUPP) {
+ iput(inode);
+ return error;
}
inode->i_size = len-1;
@@ -3131,21 +3144,22 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
inode->i_op = &shmem_short_symlink_operations;
} else {
inode_nohighmem(inode);
- error = shmem_getpage(inode, 0, &page, SGP_WRITE);
+ error = shmem_get_folio(inode, 0, &folio, SGP_WRITE);
if (error) {
iput(inode);
return error;
}
inode->i_mapping->a_ops = &shmem_aops;
inode->i_op = &shmem_symlink_inode_operations;
- memcpy(page_address(page), symname, len);
- SetPageUptodate(page);
- set_page_dirty(page);
- unlock_page(page);
- put_page(page);
+ memcpy(folio_address(folio), symname, len);
+ folio_mark_uptodate(folio);
+ folio_mark_dirty(folio);
+ folio_unlock(folio);
+ folio_put(folio);
}
dir->i_size += BOGO_DIRENT_SIZE;
dir->i_ctime = dir->i_mtime = current_time(dir);
+ inode_inc_iversion(dir);
d_instantiate(dentry, inode);
dget(dentry);
return 0;
@@ -3153,35 +3167,74 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
static void shmem_put_link(void *arg)
{
- mark_page_accessed(arg);
- put_page(arg);
+ folio_mark_accessed(arg);
+ folio_put(arg);
}
static const char *shmem_get_link(struct dentry *dentry,
struct inode *inode,
struct delayed_call *done)
{
- struct page *page = NULL;
+ struct folio *folio = NULL;
int error;
+
if (!dentry) {
- page = find_get_page(inode->i_mapping, 0);
- if (!page)
+ folio = filemap_get_folio(inode->i_mapping, 0);
+ if (!folio)
return ERR_PTR(-ECHILD);
- if (!PageUptodate(page)) {
- put_page(page);
+ if (PageHWPoison(folio_page(folio, 0)) ||
+ !folio_test_uptodate(folio)) {
+ folio_put(folio);
return ERR_PTR(-ECHILD);
}
} else {
- error = shmem_getpage(inode, 0, &page, SGP_READ);
+ error = shmem_get_folio(inode, 0, &folio, SGP_READ);
if (error)
return ERR_PTR(error);
- unlock_page(page);
+ if (!folio)
+ return ERR_PTR(-ECHILD);
+ if (PageHWPoison(folio_page(folio, 0))) {
+ folio_unlock(folio);
+ folio_put(folio);
+ return ERR_PTR(-ECHILD);
+ }
+ folio_unlock(folio);
}
- set_delayed_call(done, shmem_put_link, page);
- return page_address(page);
+ set_delayed_call(done, shmem_put_link, folio);
+ return folio_address(folio);
}
#ifdef CONFIG_TMPFS_XATTR
+
+static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+{
+ struct shmem_inode_info *info = SHMEM_I(d_inode(dentry));
+
+ fileattr_fill_flags(fa, info->fsflags & SHMEM_FL_USER_VISIBLE);
+
+ return 0;
+}
+
+static int shmem_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa)
+{
+ struct inode *inode = d_inode(dentry);
+ struct shmem_inode_info *info = SHMEM_I(inode);
+
+ if (fileattr_has_fsx(fa))
+ return -EOPNOTSUPP;
+ if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE)
+ return -EOPNOTSUPP;
+
+ info->fsflags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) |
+ (fa->flags & SHMEM_FL_USER_MODIFIABLE);
+
+ shmem_set_inode_flags(inode, info->fsflags);
+ inode->i_ctime = current_time(inode);
+ inode_inc_iversion(inode);
+ return 0;
+}
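
These handlers back chattr(1)/lsattr(1) via the generic FS_IOC_GETFLAGS/FS_IOC_SETFLAGS ioctls. A userspace sketch against a tmpfs file (path hypothetical):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>

	int main(void)
	{
		int fd = open("/dev/shm/attr-demo", O_RDWR | O_CREAT, 0600);
		int flags = 0;

		if (fd < 0 || ioctl(fd, FS_IOC_GETFLAGS, &flags))
			return 1;
		flags |= FS_NOATIME_FL;	/* shmem_set_inode_flags() maps this
					 * to S_NOATIME on the inode */
		if (ioctl(fd, FS_IOC_SETFLAGS, &flags))
			perror("FS_IOC_SETFLAGS");
		return 0;
	}
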
+
/*
* Superblocks without xattr inode operations may get some security.* xattr
* support from the LSM "for free". As soon as we have any other xattrs
@@ -3210,7 +3263,7 @@ static int shmem_initxattrs(struct inode *inode,
new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
GFP_KERNEL);
if (!new_xattr->name) {
- kfree(new_xattr);
+ kvfree(new_xattr);
return -ENOMEM;
}
@@ -3236,14 +3289,21 @@ static int shmem_xattr_handler_get(const struct xattr_handler *handler,
}
static int shmem_xattr_handler_set(const struct xattr_handler *handler,
+ struct user_namespace *mnt_userns,
struct dentry *unused, struct inode *inode,
const char *name, const void *value,
size_t size, int flags)
{
struct shmem_inode_info *info = SHMEM_I(inode);
+ int err;
name = xattr_full_name(handler, name);
- return simple_xattr_set(&info->xattrs, name, value, size, flags);
+ err = simple_xattr_set(&info->xattrs, name, value, size, flags, NULL);
+ if (!err) {
+ inode->i_ctime = current_time(inode);
+ inode_inc_iversion(inode);
+ }
+ return err;
}
static const struct xattr_handler shmem_security_xattr_handler = {
@@ -3276,6 +3336,7 @@ static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size)
#endif /* CONFIG_TMPFS_XATTR */
static const struct inode_operations shmem_short_symlink_operations = {
+ .getattr = shmem_getattr,
.get_link = simple_get_link,
#ifdef CONFIG_TMPFS_XATTR
.listxattr = shmem_listxattr,
@@ -3283,6 +3344,7 @@ static const struct inode_operations shmem_short_symlink_operations = {
};
static const struct inode_operations shmem_symlink_inode_operations = {
+ .getattr = shmem_getattr,
.get_link = shmem_get_link,
#ifdef CONFIG_TMPFS_XATTR
.listxattr = shmem_listxattr,
@@ -3379,6 +3441,8 @@ enum shmem_param {
Opt_nr_inodes,
Opt_size,
Opt_uid,
+ Opt_inode32,
+ Opt_inode64,
};
static const struct constant_table shmem_param_enums_huge[] = {
@@ -3398,6 +3462,8 @@ const struct fs_parameter_spec shmem_fs_parameters[] = {
fsparam_string("nr_inodes", Opt_nr_inodes),
fsparam_string("size", Opt_size),
fsparam_u32 ("uid", Opt_uid),
+ fsparam_flag ("inode32", Opt_inode32),
+ fsparam_flag ("inode64", Opt_inode64),
{}
};
@@ -3429,7 +3495,7 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
break;
case Opt_nr_blocks:
ctx->blocks = memparse(param->string, &rest);
- if (*rest)
+ if (*rest || ctx->blocks > S64_MAX)
goto bad_value;
ctx->seen |= SHMEM_SEEN_BLOCKS;
break;
@@ -3455,7 +3521,7 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
case Opt_huge:
ctx->huge = result.uint_32;
if (ctx->huge != SHMEM_HUGE_NEVER &&
- !(IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
+ !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
has_transparent_hugepage()))
goto unsupported_parameter;
ctx->seen |= SHMEM_SEEN_HUGE;
@@ -3469,6 +3535,18 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
break;
}
goto unsupported_parameter;
+ case Opt_inode32:
+ ctx->full_inums = false;
+ ctx->seen |= SHMEM_SEEN_INUMS;
+ break;
+ case Opt_inode64:
+ if (sizeof(ino_t) < 8) {
+ return invalfc(fc,
+ "Cannot use inode64 with <64bit inums in kernel\n");
+ }
+ ctx->full_inums = true;
+ ctx->seen |= SHMEM_SEEN_INUMS;
+ break;
}
return 0;
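
inode32/inode64 are ordinary mount options, so opting in looks like the sketch below (mount point hypothetical):

	#include <sys/mount.h>

	/* Request full 64-bit inode numbers on a fresh tmpfs instance. */
	int main(void)
	{
		return mount("tmpfs", "/mnt/t64", "tmpfs", 0,
			     "size=1G,inode64");
	}
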
@@ -3506,7 +3584,7 @@ static int shmem_parse_options(struct fs_context *fc, void *data)
}
}
if (*this_char) {
- char *value = strchr(this_char,'=');
+ char *value = strchr(this_char, '=');
size_t len = 0;
int err;
@@ -3534,10 +3612,12 @@ static int shmem_reconfigure(struct fs_context *fc)
struct shmem_options *ctx = fc->fs_private;
struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb);
unsigned long inodes;
+ struct mempolicy *mpol = NULL;
const char *err;
- spin_lock(&sbinfo->stat_lock);
+ raw_spin_lock(&sbinfo->stat_lock);
inodes = sbinfo->max_inodes - sbinfo->free_inodes;
+
if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
if (!sbinfo->max_blocks) {
err = "Cannot retroactively limit size";
@@ -3560,8 +3640,16 @@ static int shmem_reconfigure(struct fs_context *fc)
}
}
+ if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums &&
+ sbinfo->next_ino > UINT_MAX) {
+ err = "Current inum too high to switch to 32-bit inums";
+ goto out;
+ }
+
if (ctx->seen & SHMEM_SEEN_HUGE)
sbinfo->huge = ctx->huge;
+ if (ctx->seen & SHMEM_SEEN_INUMS)
+ sbinfo->full_inums = ctx->full_inums;
if (ctx->seen & SHMEM_SEEN_BLOCKS)
sbinfo->max_blocks = ctx->blocks;
if (ctx->seen & SHMEM_SEEN_INODES) {
@@ -3573,14 +3661,15 @@ static int shmem_reconfigure(struct fs_context *fc)
* Preserve previous mempolicy unless mpol remount option was specified.
*/
if (ctx->mpol) {
- mpol_put(sbinfo->mpol);
+ mpol = sbinfo->mpol;
sbinfo->mpol = ctx->mpol; /* transfers initial ref */
ctx->mpol = NULL;
}
- spin_unlock(&sbinfo->stat_lock);
+ raw_spin_unlock(&sbinfo->stat_lock);
+ mpol_put(mpol);
return 0;
out:
- spin_unlock(&sbinfo->stat_lock);
+ raw_spin_unlock(&sbinfo->stat_lock);
return invalfc(fc, "%s", err);
}
@@ -3601,7 +3690,30 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
seq_printf(seq, ",gid=%u",
from_kgid_munged(&init_user_ns, sbinfo->gid));
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+
+ /*
+ * Showing inode{64,32} might be useful even if it's the system default,
+ * since then people don't have to resort to checking both here and
+ * /proc/config.gz to confirm 64-bit inums were successfully applied
+ * (which may not even exist if IKCONFIG_PROC isn't enabled).
+ *
+ * We hide it when inode64 isn't the default and we are using 32-bit
+ * inodes, since that probably just means the feature isn't even under
+ * consideration.
+ *
+ * As such:
+ *
+ * +-----------------+-----------------+
+ * | TMPFS_INODE64=y | TMPFS_INODE64=n |
+ * +------------------+-----------------+-----------------+
+ * | full_inums=true | show | show |
+ * | full_inums=false | show | hide |
+ * +------------------+-----------------+-----------------+
+ *
+ */
+ if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums)
+ seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32));
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
if (sbinfo->huge)
seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge));
@@ -3616,6 +3728,7 @@ static void shmem_put_super(struct super_block *sb)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+ free_percpu(sbinfo->ino_batch);
percpu_counter_destroy(&sbinfo->used_blocks);
mpol_put(sbinfo->mpol);
kfree(sbinfo);
@@ -3627,7 +3740,6 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
struct shmem_options *ctx = fc->fs_private;
struct inode *inode;
struct shmem_sb_info *sbinfo;
- int err = -ENOMEM;
/* Round up to L1_CACHE_BYTES to resist false sharing */
sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info),
@@ -3648,24 +3760,32 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
ctx->blocks = shmem_default_max_blocks();
if (!(ctx->seen & SHMEM_SEEN_INODES))
ctx->inodes = shmem_default_max_inodes();
+ if (!(ctx->seen & SHMEM_SEEN_INUMS))
+ ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
} else {
sb->s_flags |= SB_NOUSER;
}
sb->s_export_op = &shmem_export_ops;
- sb->s_flags |= SB_NOSEC;
+ sb->s_flags |= SB_NOSEC | SB_I_VERSION;
#else
sb->s_flags |= SB_NOUSER;
#endif
sbinfo->max_blocks = ctx->blocks;
sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes;
+ if (sb->s_flags & SB_KERNMOUNT) {
+ sbinfo->ino_batch = alloc_percpu(ino_t);
+ if (!sbinfo->ino_batch)
+ goto failed;
+ }
sbinfo->uid = ctx->uid;
sbinfo->gid = ctx->gid;
+ sbinfo->full_inums = ctx->full_inums;
sbinfo->mode = ctx->mode;
sbinfo->huge = ctx->huge;
sbinfo->mpol = ctx->mpol;
ctx->mpol = NULL;
- spin_lock_init(&sbinfo->stat_lock);
+ raw_spin_lock_init(&sbinfo->stat_lock);
if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL))
goto failed;
spin_lock_init(&sbinfo->shrinklist_lock);
@@ -3697,7 +3817,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
failed:
shmem_put_super(sb);
- return err;
+ return -ENOMEM;
}
static int shmem_get_tree(struct fs_context *fc)
@@ -3730,7 +3850,7 @@ static struct kmem_cache *shmem_inode_cachep;
static struct inode *shmem_alloc_inode(struct super_block *sb)
{
struct shmem_inode_info *info;
- info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
+ info = alloc_inode_sb(sb, shmem_inode_cachep, GFP_KERNEL);
if (!info)
return NULL;
return &info->vfs_inode;
@@ -3767,18 +3887,26 @@ static void shmem_destroy_inodecache(void)
kmem_cache_destroy(shmem_inode_cachep);
}
-static const struct address_space_operations shmem_aops = {
+/* Keep the page in page cache instead of truncating it */
+static int shmem_error_remove_page(struct address_space *mapping,
+ struct page *page)
+{
+ return 0;
+}
+
+const struct address_space_operations shmem_aops = {
.writepage = shmem_writepage,
- .set_page_dirty = __set_page_dirty_no_writeback,
+ .dirty_folio = noop_dirty_folio,
#ifdef CONFIG_TMPFS
.write_begin = shmem_write_begin,
.write_end = shmem_write_end,
#endif
#ifdef CONFIG_MIGRATION
- .migratepage = migrate_page,
+ .migrate_folio = migrate_folio,
#endif
- .error_remove_page = generic_error_remove_page,
+ .error_remove_page = shmem_error_remove_page,
};
+EXPORT_SYMBOL(shmem_aops);
static const struct file_operations shmem_file_operations = {
.mmap = shmem_mmap,
@@ -3800,11 +3928,14 @@ static const struct inode_operations shmem_inode_operations = {
#ifdef CONFIG_TMPFS_XATTR
.listxattr = shmem_listxattr,
.set_acl = simple_set_acl,
+ .fileattr_get = shmem_fileattr_get,
+ .fileattr_set = shmem_fileattr_set,
#endif
};
static const struct inode_operations shmem_dir_inode_operations = {
#ifdef CONFIG_TMPFS
+ .getattr = shmem_getattr,
.create = shmem_create,
.lookup = simple_lookup,
.link = shmem_link,
@@ -3818,6 +3949,8 @@ static const struct inode_operations shmem_dir_inode_operations = {
#endif
#ifdef CONFIG_TMPFS_XATTR
.listxattr = shmem_listxattr,
+ .fileattr_get = shmem_fileattr_get,
+ .fileattr_set = shmem_fileattr_set,
#endif
#ifdef CONFIG_TMPFS_POSIX_ACL
.setattr = shmem_setattr,
@@ -3826,6 +3959,7 @@ static const struct inode_operations shmem_dir_inode_operations = {
};
static const struct inode_operations shmem_special_inode_operations = {
+ .getattr = shmem_getattr,
#ifdef CONFIG_TMPFS_XATTR
.listxattr = shmem_listxattr,
#endif
@@ -3846,7 +3980,7 @@ static const struct super_operations shmem_ops = {
.evict_inode = shmem_evict_inode,
.drop_inode = generic_delete_inode,
.put_super = shmem_put_super,
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
.nr_cached_objects = shmem_unused_huge_count,
.free_cached_objects = shmem_unused_huge_scan,
#endif
@@ -3889,7 +4023,7 @@ static struct file_system_type shmem_fs_type = {
.fs_flags = FS_USERNS_MOUNT,
};
-int __init shmem_init(void)
+void __init shmem_init(void)
{
int error;
@@ -3908,25 +4042,24 @@ int __init shmem_init(void)
goto out1;
}
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY)
SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge;
else
- shmem_huge = 0; /* just in case it was patched */
+ shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */
#endif
- return 0;
+ return;
out1:
unregister_filesystem(&shmem_fs_type);
out2:
shmem_destroy_inodecache();
shm_mnt = ERR_PTR(error);
- return error;
}
-#if defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && defined(CONFIG_SYSFS)
+#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
static ssize_t shmem_enabled_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+ struct kobj_attribute *attr, char *buf)
{
static const int values[] = {
SHMEM_HUGE_ALWAYS,
@@ -3936,16 +4069,19 @@ static ssize_t shmem_enabled_show(struct kobject *kobj,
SHMEM_HUGE_DENY,
SHMEM_HUGE_FORCE,
};
- int i, count;
-
- for (i = 0, count = 0; i < ARRAY_SIZE(values); i++) {
- const char *fmt = shmem_huge == values[i] ? "[%s] " : "%s ";
+ int len = 0;
+ int i;
- count += sprintf(buf + count, fmt,
- shmem_format_huge(values[i]));
+ for (i = 0; i < ARRAY_SIZE(values); i++) {
+ len += sysfs_emit_at(buf, len,
+ shmem_huge == values[i] ? "%s[%s]" : "%s%s",
+ i ? " " : "",
+ shmem_format_huge(values[i]));
}
- buf[count - 1] = '\n';
- return count;
+
+ len += sysfs_emit_at(buf, len, "\n");
+
+ return len;
}
static ssize_t shmem_enabled_store(struct kobject *kobj,
@@ -3974,46 +4110,8 @@ static ssize_t shmem_enabled_store(struct kobject *kobj,
return count;
}
-struct kobj_attribute shmem_enabled_attr =
- __ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store);
-#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE && CONFIG_SYSFS */
-
-#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE
-bool shmem_huge_enabled(struct vm_area_struct *vma)
-{
- struct inode *inode = file_inode(vma->vm_file);
- struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
- loff_t i_size;
- pgoff_t off;
-
- if ((vma->vm_flags & VM_NOHUGEPAGE) ||
- test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
- return false;
- if (shmem_huge == SHMEM_HUGE_FORCE)
- return true;
- if (shmem_huge == SHMEM_HUGE_DENY)
- return false;
- switch (sbinfo->huge) {
- case SHMEM_HUGE_NEVER:
- return false;
- case SHMEM_HUGE_ALWAYS:
- return true;
- case SHMEM_HUGE_WITHIN_SIZE:
- off = round_up(vma->vm_pgoff, HPAGE_PMD_NR);
- i_size = round_up(i_size_read(inode), PAGE_SIZE);
- if (i_size >= HPAGE_PMD_SIZE &&
- i_size >> PAGE_SHIFT >= off)
- return true;
- /* fall through */
- case SHMEM_HUGE_ADVISE:
- /* TODO: implement fadvise() hints */
- return (vma->vm_flags & VM_HUGEPAGE);
- default:
- VM_BUG_ON(1);
- return false;
- }
-}
-#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */
+struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */
#else /* !CONFIG_SHMEM */
@@ -4034,23 +4132,20 @@ static struct file_system_type shmem_fs_type = {
.fs_flags = FS_USERNS_MOUNT,
};
-int __init shmem_init(void)
+void __init shmem_init(void)
{
BUG_ON(register_filesystem(&shmem_fs_type) != 0);
shm_mnt = kern_mount(&shmem_fs_type);
BUG_ON(IS_ERR(shm_mnt));
-
- return 0;
}
-int shmem_unuse(unsigned int type, bool frontswap,
- unsigned long *fs_pages_to_unuse)
+int shmem_unuse(unsigned int type)
{
return 0;
}
-int shmem_lock(struct file *file, int lock, struct user_struct *user)
+int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
{
return 0;
}
@@ -4160,7 +4255,7 @@ EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);
/**
* shmem_zero_setup - setup a shared anonymous mapping
- * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
+ * @vma: the vma to be mmapped is prepared by do_mmap
*/
int shmem_zero_setup(struct vm_area_struct *vma)
{
@@ -4168,7 +4263,7 @@ int shmem_zero_setup(struct vm_area_struct *vma)
loff_t size = vma->vm_end - vma->vm_start;
/*
- * Cloning a new file under mmap_sem leads to a lock ordering conflict
+ * Cloning a new file under mmap_lock leads to a lock ordering conflict
* between XFS directory reading and selinux: since this file is only
* accessible to the user through its mapping, use S_PRIVATE flag to
* bypass file security, in the same way as shmem_kernel_file_setup().
@@ -4182,12 +4277,6 @@ int shmem_zero_setup(struct vm_area_struct *vma)
vma->vm_file = file;
vma->vm_ops = &shmem_vm_ops;
- if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
- ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
- (vma->vm_end & HPAGE_PMD_MASK)) {
- khugepaged_enter(vma, vma->vm_flags);
- }
-
return 0;
}
@@ -4199,7 +4288,7 @@ int shmem_zero_setup(struct vm_area_struct *vma)
*
* This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
* with any new page allocations done using the specified allocation flags.
- * But read_cache_page_gfp() uses the ->readpage() method: which does not
+ * But read_cache_page_gfp() uses the ->read_folio() method: which does not
* suit tmpfs, since it may have pages in swapcache, and needs to find those
* for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
*
@@ -4211,16 +4300,23 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
{
#ifdef CONFIG_SHMEM
struct inode *inode = mapping->host;
+ struct folio *folio;
struct page *page;
int error;
- BUG_ON(mapping->a_ops != &shmem_aops);
- error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE,
+ BUG_ON(!shmem_mapping(mapping));
+ error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE,
gfp, NULL, NULL, NULL);
if (error)
- page = ERR_PTR(error);
- else
- unlock_page(page);
+ return ERR_PTR(error);
+
+ folio_unlock(folio);
+ page = folio_file_page(folio, index);
+ if (PageHWPoison(page)) {
+ folio_put(folio);
+ return ERR_PTR(-EIO);
+ }
+
return page;
#else
/*