Diffstat
 -rw-r--r--  mm/shmem.c | 2162
 1 file changed, 1129 insertions(+), 1033 deletions(-)
diff --git a/mm/shmem.c b/mm/shmem.c index aad3ba74b0e9..c1d8b8a1aa3b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -28,18 +28,18 @@ #include <linux/ramfs.h> #include <linux/pagemap.h> #include <linux/file.h> +#include <linux/fileattr.h> #include <linux/mm.h> #include <linux/random.h> #include <linux/sched/signal.h> #include <linux/export.h> #include <linux/swap.h> #include <linux/uio.h> -#include <linux/khugepaged.h> #include <linux/hugetlb.h> -#include <linux/frontswap.h> #include <linux/fs_parser.h> - -#include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */ +#include <linux/swapfile.h> +#include <linux/iversion.h> +#include "swap.h" static struct vfsmount *shm_mnt; @@ -60,7 +60,6 @@ static struct vfsmount *shm_mnt; #include <linux/backing-dev.h> #include <linux/shmem_fs.h> #include <linux/writeback.h> -#include <linux/blkdev.h> #include <linux/pagevec.h> #include <linux/percpu_counter.h> #include <linux/falloc.h> @@ -82,7 +81,6 @@ static struct vfsmount *shm_mnt; #include <linux/uuid.h> #include <linux/uaccess.h> -#include <asm/pgtable.h> #include "internal.h" @@ -97,7 +95,7 @@ static struct vfsmount *shm_mnt; /* * shmem_fallocate communicates with shmem_fault or shmem_writepage via - * inode->i_private (with i_mutex making sure that it has only one user at + * inode->i_private (with i_rwsem making sure that it has only one user at * a time): we would prefer not to enlarge the shmem inode just for that. */ struct shmem_falloc { @@ -115,11 +113,13 @@ struct shmem_options { kuid_t uid; kgid_t gid; umode_t mode; + bool full_inums; int huge; int seen; #define SHMEM_SEEN_BLOCKS 1 #define SHMEM_SEEN_INODES 2 #define SHMEM_SEEN_HUGE 4 +#define SHMEM_SEEN_INUMS 8 }; #ifdef CONFIG_TMPFS @@ -136,24 +136,10 @@ static unsigned long shmem_default_max_inodes(void) } #endif -static bool shmem_should_replace_page(struct page *page, gfp_t gfp); -static int shmem_replace_page(struct page **pagep, gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index); -static int shmem_swapin_page(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp, +static int shmem_swapin_folio(struct inode *inode, pgoff_t index, + struct folio **foliop, enum sgp_type sgp, gfp_t gfp, struct vm_area_struct *vma, vm_fault_t *fault_type); -static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp, - gfp_t gfp, struct vm_area_struct *vma, - struct vm_fault *vmf, vm_fault_t *fault_type); - -int shmem_getpage(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp) -{ - return shmem_getpage_gfp(inode, index, pagep, sgp, - mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL); -} static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) { @@ -194,7 +180,7 @@ static inline int shmem_reacct_size(unsigned long flags, /* * ... whereas tmpfs objects are accounted incrementally as * pages are allocated, in order to allow large sparse files. - * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM, + * shmem_get_folio reports shmem_acct_block failure as -ENOSPC not -ENOMEM, * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM. 
*/ static inline int shmem_acct_block(unsigned long flags, long pages) @@ -245,7 +231,7 @@ static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages) } static const struct super_operations shmem_ops; -static const struct address_space_operations shmem_aops; +const struct address_space_operations shmem_aops; static const struct file_operations shmem_file_operations; static const struct inode_operations shmem_inode_operations; static const struct inode_operations shmem_dir_inode_operations; @@ -261,18 +247,79 @@ bool vma_is_shmem(struct vm_area_struct *vma) static LIST_HEAD(shmem_swaplist); static DEFINE_MUTEX(shmem_swaplist_mutex); -static int shmem_reserve_inode(struct super_block *sb) +/* + * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and + * produces a novel ino for the newly allocated inode. + * + * It may also be called when making a hard link to permit the space needed by + * each dentry. However, in that case, no new inode number is needed since that + * internally draws from another pool of inode numbers (currently global + * get_next_ino()). This case is indicated by passing NULL as inop. + */ +#define SHMEM_INO_BATCH 1024 +static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); - if (sbinfo->max_inodes) { - spin_lock(&sbinfo->stat_lock); - if (!sbinfo->free_inodes) { - spin_unlock(&sbinfo->stat_lock); - return -ENOSPC; + ino_t ino; + + if (!(sb->s_flags & SB_KERNMOUNT)) { + raw_spin_lock(&sbinfo->stat_lock); + if (sbinfo->max_inodes) { + if (!sbinfo->free_inodes) { + raw_spin_unlock(&sbinfo->stat_lock); + return -ENOSPC; + } + sbinfo->free_inodes--; + } + if (inop) { + ino = sbinfo->next_ino++; + if (unlikely(is_zero_ino(ino))) + ino = sbinfo->next_ino++; + if (unlikely(!sbinfo->full_inums && + ino > UINT_MAX)) { + /* + * Emulate get_next_ino uint wraparound for + * compatibility + */ + if (IS_ENABLED(CONFIG_64BIT)) + pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n", + __func__, MINOR(sb->s_dev)); + sbinfo->next_ino = 1; + ino = sbinfo->next_ino++; + } + *inop = ino; } - sbinfo->free_inodes--; - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); + } else if (inop) { + /* + * __shmem_file_setup, one of our callers, is lock-free: it + * doesn't hold stat_lock in shmem_reserve_inode since + * max_inodes is always 0, and is called from potentially + * unknown contexts. As such, use a per-cpu batched allocator + * which doesn't require the per-sb stat_lock unless we are at + * the batch boundary. + * + * We don't need to worry about inode{32,64} since SB_KERNMOUNT + * shmem mounts are not exposed to userspace, so we don't need + * to worry about things like glibc compatibility. 
+ */ + ino_t *next_ino; + + next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu()); + ino = *next_ino; + if (unlikely(ino % SHMEM_INO_BATCH == 0)) { + raw_spin_lock(&sbinfo->stat_lock); + ino = sbinfo->next_ino; + sbinfo->next_ino += SHMEM_INO_BATCH; + raw_spin_unlock(&sbinfo->stat_lock); + if (unlikely(is_zero_ino(ino))) + ino++; + } + *inop = ino; + *next_ino = ++ino; + put_cpu(); } + return 0; } @@ -280,9 +327,9 @@ static void shmem_free_inode(struct super_block *sb) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); if (sbinfo->max_inodes) { - spin_lock(&sbinfo->stat_lock); + raw_spin_lock(&sbinfo->stat_lock); sbinfo->free_inodes++; - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); } } @@ -336,7 +383,7 @@ void shmem_uncharge(struct inode *inode, long pages) struct shmem_inode_info *info = SHMEM_I(inode); unsigned long flags; - /* nrpages adjustment done by __delete_from_page_cache() or caller */ + /* nrpages adjustment done by __filemap_remove_folio() or caller */ spin_lock_irqsave(&info->lock, flags); info->alloced -= pages; @@ -410,10 +457,45 @@ static bool shmem_confirm_swap(struct address_space *mapping, #define SHMEM_HUGE_DENY (-1) #define SHMEM_HUGE_FORCE (-2) -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE +#ifdef CONFIG_TRANSPARENT_HUGEPAGE /* ifdef here to avoid bloating shmem.o when not necessary */ -static int shmem_huge __read_mostly; +static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER; + +bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode, + pgoff_t index, bool shmem_huge_force) +{ + loff_t i_size; + + if (!S_ISREG(inode->i_mode)) + return false; + if (vma && ((vma->vm_flags & VM_NOHUGEPAGE) || + test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))) + return false; + if (shmem_huge_force) + return true; + if (shmem_huge == SHMEM_HUGE_FORCE) + return true; + if (shmem_huge == SHMEM_HUGE_DENY) + return false; + + switch (SHMEM_SB(inode->i_sb)->huge) { + case SHMEM_HUGE_ALWAYS: + return true; + case SHMEM_HUGE_WITHIN_SIZE: + index = round_up(index + 1, HPAGE_PMD_NR); + i_size = round_up(i_size_read(inode), PAGE_SIZE); + if (i_size >> PAGE_SHIFT >= index) + return true; + fallthrough; + case SHMEM_HUGE_ADVISE: + if (vma && (vma->vm_flags & VM_HUGEPAGE)) + return true; + fallthrough; + default: + return false; + } +} #if defined(CONFIG_SYSFS) static int shmem_parse_huge(const char *str) @@ -464,9 +546,9 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, LIST_HEAD(to_remove); struct inode *inode; struct shmem_inode_info *info; - struct page *page; + struct folio *folio; unsigned long batch = sc ? 
sc->nr_to_scan : 128; - int removed = 0, split = 0; + int split = 0; if (list_empty(&sbinfo->shrinklist)) return SHRINK_STOP; @@ -481,7 +563,6 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, /* inode is about to be evicted */ if (!inode) { list_del_init(&info->shrinklist); - removed++; goto next; } @@ -489,12 +570,12 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, if (round_up(inode->i_size, PAGE_SIZE) == round_up(inode->i_size, HPAGE_PMD_SIZE)) { list_move(&info->shrinklist, &to_remove); - removed++; goto next; } list_move(&info->shrinklist, &list); next: + sbinfo->shrinklist_len--; if (!--batch) break; } @@ -509,57 +590,64 @@ next: list_for_each_safe(pos, next, &list) { int ret; + pgoff_t index; info = list_entry(pos, struct shmem_inode_info, shrinklist); inode = &info->vfs_inode; if (nr_to_split && split >= nr_to_split) - goto leave; + goto move_back; - page = find_get_page(inode->i_mapping, - (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT); - if (!page) + index = (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT; + folio = filemap_get_folio(inode->i_mapping, index); + if (!folio) goto drop; /* No huge page at the end of the file: nothing to split */ - if (!PageTransHuge(page)) { - put_page(page); + if (!folio_test_large(folio)) { + folio_put(folio); goto drop; } /* - * Leave the inode on the list if we failed to lock - * the page at this time. + * Move the inode on the list back to shrinklist if we failed + * to lock the page at this time. * * Waiting for the lock may lead to deadlock in the * reclaim path. */ - if (!trylock_page(page)) { - put_page(page); - goto leave; + if (!folio_trylock(folio)) { + folio_put(folio); + goto move_back; } - ret = split_huge_page(page); - unlock_page(page); - put_page(page); + ret = split_folio(folio); + folio_unlock(folio); + folio_put(folio); - /* If split failed leave the inode on the list */ + /* If split failed move the inode on the list back to shrinklist */ if (ret) - goto leave; + goto move_back; split++; drop: list_del_init(&info->shrinklist); - removed++; -leave: + goto put; +move_back: + /* + * Make sure the inode is either on the global list or deleted + * from any local list before iput() since it could be deleted + * in another thread once we put the inode (then the local list + * is corrupted). 
+ */ + spin_lock(&sbinfo->shrinklist_lock); + list_move(&info->shrinklist, &sbinfo->shrinklist); + sbinfo->shrinklist_len++; + spin_unlock(&sbinfo->shrinklist_lock); +put: iput(inode); } - spin_lock(&sbinfo->shrinklist_lock); - list_splice_tail(&list, &sbinfo->shrinklist); - sbinfo->shrinklist_len -= removed; - spin_unlock(&sbinfo->shrinklist_lock); - return split; } @@ -580,100 +668,109 @@ static long shmem_unused_huge_count(struct super_block *sb, struct shmem_sb_info *sbinfo = SHMEM_SB(sb); return READ_ONCE(sbinfo->shrinklist_len); } -#else /* !CONFIG_TRANSPARENT_HUGE_PAGECACHE */ +#else /* !CONFIG_TRANSPARENT_HUGEPAGE */ #define shmem_huge SHMEM_HUGE_DENY -static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, - struct shrink_control *sc, unsigned long nr_to_split) +bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode, + pgoff_t index, bool shmem_huge_force) { - return 0; + return false; } -#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */ -static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo) +static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, + struct shrink_control *sc, unsigned long nr_to_split) { - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && - (shmem_huge == SHMEM_HUGE_FORCE || sbinfo->huge) && - shmem_huge != SHMEM_HUGE_DENY) - return true; - return false; + return 0; } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* - * Like add_to_page_cache_locked, but error if expected item has gone. + * Like filemap_add_folio, but error if expected item has gone. */ -static int shmem_add_to_page_cache(struct page *page, +static int shmem_add_to_page_cache(struct folio *folio, struct address_space *mapping, - pgoff_t index, void *expected, gfp_t gfp) + pgoff_t index, void *expected, gfp_t gfp, + struct mm_struct *charge_mm) { - XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page)); - unsigned long i = 0; - unsigned long nr = compound_nr(page); + XA_STATE_ORDER(xas, &mapping->i_pages, index, folio_order(folio)); + long nr = folio_nr_pages(folio); + int error; - VM_BUG_ON_PAGE(PageTail(page), page); - VM_BUG_ON_PAGE(index != round_down(index, nr), page); - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(!PageSwapBacked(page), page); - VM_BUG_ON(expected && PageTransHuge(page)); + VM_BUG_ON_FOLIO(index != round_down(index, nr), folio); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio); + VM_BUG_ON(expected && folio_test_large(folio)); - page_ref_add(page, nr); - page->mapping = mapping; - page->index = index; + folio_ref_add(folio, nr); + folio->mapping = mapping; + folio->index = index; + + if (!folio_test_swapcache(folio)) { + error = mem_cgroup_charge(folio, charge_mm, gfp); + if (error) { + if (folio_test_pmd_mappable(folio)) { + count_vm_event(THP_FILE_FALLBACK); + count_vm_event(THP_FILE_FALLBACK_CHARGE); + } + goto error; + } + } + folio_throttle_swaprate(folio, gfp); do { - void *entry; xas_lock_irq(&xas); - entry = xas_find_conflict(&xas); - if (entry != expected) + if (expected != xas_find_conflict(&xas)) { xas_set_err(&xas, -EEXIST); - xas_create_range(&xas); - if (xas_error(&xas)) goto unlock; -next: - xas_store(&xas, page); - if (++i < nr) { - xas_next(&xas); - goto next; } - if (PageTransHuge(page)) { + if (expected && xas_find_conflict(&xas)) { + xas_set_err(&xas, -EEXIST); + goto unlock; + } + xas_store(&xas, folio); + if (xas_error(&xas)) + goto unlock; + if (folio_test_pmd_mappable(folio)) { count_vm_event(THP_FILE_ALLOC); 
- __inc_node_page_state(page, NR_SHMEM_THPS); + __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, nr); } mapping->nrpages += nr; - __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); - __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr); + __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr); + __lruvec_stat_mod_folio(folio, NR_SHMEM, nr); unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp)); if (xas_error(&xas)) { - page->mapping = NULL; - page_ref_sub(page, nr); - return xas_error(&xas); + error = xas_error(&xas); + goto error; } return 0; +error: + folio->mapping = NULL; + folio_ref_sub(folio, nr); + return error; } /* - * Like delete_from_page_cache, but substitutes swap for page. + * Like delete_from_page_cache, but substitutes swap for @folio. */ -static void shmem_delete_from_page_cache(struct page *page, void *radswap) +static void shmem_delete_from_page_cache(struct folio *folio, void *radswap) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = folio->mapping; + long nr = folio_nr_pages(folio); int error; - VM_BUG_ON_PAGE(PageCompound(page), page); - xa_lock_irq(&mapping->i_pages); - error = shmem_replace_entry(mapping, page->index, page, radswap); - page->mapping = NULL; - mapping->nrpages--; - __dec_node_page_state(page, NR_FILE_PAGES); - __dec_node_page_state(page, NR_SHMEM); + error = shmem_replace_entry(mapping, folio->index, folio, radswap); + folio->mapping = NULL; + mapping->nrpages -= nr; + __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr); + __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr); xa_unlock_irq(&mapping->i_pages); - put_page(page); + folio_put(folio); BUG_ON(error); } @@ -696,7 +793,7 @@ static int shmem_free_swap(struct address_space *mapping, * Determine (in bytes) how many of the shmem object's pages mapped by the * given offsets are swapped out. * - * This is safe to call without i_mutex or the i_pages lock thanks to RCU, + * This is safe to call without i_rwsem or the i_pages lock thanks to RCU, * as long as the inode doesn't go away and racy results are not a problem. */ unsigned long shmem_partial_swap_usage(struct address_space *mapping, @@ -728,7 +825,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, * Determine (in bytes) how many of the shmem object's pages mapped by the * given vma is swapped out. * - * This is safe to call without i_mutex or the i_pages lock thanks to RCU, + * This is safe to call without i_rwsem or the i_pages lock thanks to RCU, * as long as the inode doesn't go away and racy results are not a problem. */ unsigned long shmem_swap_usage(struct vm_area_struct *vma) @@ -753,9 +850,8 @@ unsigned long shmem_swap_usage(struct vm_area_struct *vma) return swapped << PAGE_SHIFT; /* Here comes the more involved part */ - return shmem_partial_swap_usage(mapping, - linear_page_index(vma, vma->vm_start), - linear_page_index(vma, vma->vm_end)); + return shmem_partial_swap_usage(mapping, vma->vm_pgoff, + vma->vm_pgoff + vma_pages(vma)); } /* @@ -763,31 +859,42 @@ unsigned long shmem_swap_usage(struct vm_area_struct *vma) */ void shmem_unlock_mapping(struct address_space *mapping) { - struct pagevec pvec; - pgoff_t indices[PAGEVEC_SIZE]; + struct folio_batch fbatch; pgoff_t index = 0; - pagevec_init(&pvec); + folio_batch_init(&fbatch); /* * Minor point, but we might as well stop if someone else SHM_LOCKs it. 
*/ - while (!mapping_unevictable(mapping)) { - /* - * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it - * has finished, if it hits a row of PAGEVEC_SIZE swap entries. - */ - pvec.nr = find_get_entries(mapping, index, - PAGEVEC_SIZE, pvec.pages, indices); - if (!pvec.nr) - break; - index = indices[pvec.nr - 1] + 1; - pagevec_remove_exceptionals(&pvec); - check_move_unevictable_pages(&pvec); - pagevec_release(&pvec); + while (!mapping_unevictable(mapping) && + filemap_get_folios(mapping, &index, ~0UL, &fbatch)) { + check_move_unevictable_folios(&fbatch); + folio_batch_release(&fbatch); cond_resched(); } } +static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index) +{ + struct folio *folio; + + /* + * At first avoid shmem_get_folio(,,,SGP_READ): that fails + * beyond i_size, and reports fallocated pages as holes. + */ + folio = __filemap_get_folio(inode->i_mapping, index, + FGP_ENTRY | FGP_LOCK, 0); + if (!xa_is_value(folio)) + return folio; + /* + * But read a page back from swap if any of it is within i_size + * (although in some cases this is just a waste of time). + */ + folio = NULL; + shmem_get_folio(inode, index, &folio, SGP_READ); + return folio; +} + /* * Remove range of pages and swap entries from page cache, and free them. * If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate. @@ -799,10 +906,10 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t start = (lstart + PAGE_SIZE - 1) >> PAGE_SHIFT; pgoff_t end = (lend + 1) >> PAGE_SHIFT; - unsigned int partial_start = lstart & (PAGE_SIZE - 1); - unsigned int partial_end = (lend + 1) & (PAGE_SIZE - 1); - struct pagevec pvec; + struct folio_batch fbatch; pgoff_t indices[PAGEVEC_SIZE]; + struct folio *folio; + bool same_folio; long nr_swaps_freed = 0; pgoff_t index; int i; @@ -810,104 +917,68 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (lend == -1) end = -1; /* unsigned, so actually very big */ - pagevec_init(&pvec); + if (info->fallocend > start && info->fallocend <= end && !unfalloc) + info->fallocend = start; + + folio_batch_init(&fbatch); index = start; - while (index < end) { - pvec.nr = find_get_entries(mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE), - pvec.pages, indices); - if (!pvec.nr) - break; - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; + while (index < end && find_lock_entries(mapping, index, end - 1, + &fbatch, indices)) { + for (i = 0; i < folio_batch_count(&fbatch); i++) { + folio = fbatch.folios[i]; index = indices[i]; - if (index >= end) - break; - if (xa_is_value(page)) { + if (xa_is_value(folio)) { if (unfalloc) continue; nr_swaps_freed += !shmem_free_swap(mapping, - index, page); + index, folio); continue; } + index += folio_nr_pages(folio) - 1; - VM_BUG_ON_PAGE(page_to_pgoff(page) != index, page); - - if (!trylock_page(page)) - continue; - - if (PageTransTail(page)) { - /* Middle of THP: zero out the page */ - clear_highpage(page); - unlock_page(page); - continue; - } else if (PageTransHuge(page)) { - if (index == round_down(end, HPAGE_PMD_NR)) { - /* - * Range ends in the middle of THP: - * zero out the page - */ - clear_highpage(page); - unlock_page(page); - continue; - } - index += HPAGE_PMD_NR - 1; - i += HPAGE_PMD_NR - 1; - } - - if (!unfalloc || !PageUptodate(page)) { - VM_BUG_ON_PAGE(PageTail(page), page); - if (page_mapping(page) == mapping) { - VM_BUG_ON_PAGE(PageWriteback(page), 
page); - truncate_inode_page(mapping, page); - } - } - unlock_page(page); + if (!unfalloc || !folio_test_uptodate(folio)) + truncate_inode_folio(mapping, folio); + folio_unlock(folio); } - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); + folio_batch_remove_exceptionals(&fbatch); + folio_batch_release(&fbatch); cond_resched(); index++; } - if (partial_start) { - struct page *page = NULL; - shmem_getpage(inode, start - 1, &page, SGP_READ); - if (page) { - unsigned int top = PAGE_SIZE; - if (start > end) { - top = partial_end; - partial_end = 0; - } - zero_user_segment(page, partial_start, top); - set_page_dirty(page); - unlock_page(page); - put_page(page); + same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT); + folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT); + if (folio) { + same_folio = lend < folio_pos(folio) + folio_size(folio); + folio_mark_dirty(folio); + if (!truncate_inode_partial_folio(folio, lstart, lend)) { + start = folio->index + folio_nr_pages(folio); + if (same_folio) + end = folio->index; } + folio_unlock(folio); + folio_put(folio); + folio = NULL; } - if (partial_end) { - struct page *page = NULL; - shmem_getpage(inode, end, &page, SGP_READ); - if (page) { - zero_user_segment(page, 0, partial_end); - set_page_dirty(page); - unlock_page(page); - put_page(page); - } + + if (!same_folio) + folio = shmem_get_partial_folio(inode, lend >> PAGE_SHIFT); + if (folio) { + folio_mark_dirty(folio); + if (!truncate_inode_partial_folio(folio, lstart, lend)) + end = folio->index; + folio_unlock(folio); + folio_put(folio); } - if (start >= end) - return; index = start; while (index < end) { cond_resched(); - pvec.nr = find_get_entries(mapping, index, - min(end - index, (pgoff_t)PAGEVEC_SIZE), - pvec.pages, indices); - if (!pvec.nr) { + if (!find_get_entries(mapping, index, end - 1, &fbatch, + indices)) { /* If all gone or hole-punch or unfalloc, we're done */ if (index == start || end != -1) break; @@ -915,17 +986,14 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, index = start; continue; } - for (i = 0; i < pagevec_count(&pvec); i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < folio_batch_count(&fbatch); i++) { + folio = fbatch.folios[i]; index = indices[i]; - if (index >= end) - break; - - if (xa_is_value(page)) { + if (xa_is_value(folio)) { if (unfalloc) continue; - if (shmem_free_swap(mapping, index, page)) { + if (shmem_free_swap(mapping, index, folio)) { /* Swap was replaced by page: retry */ index--; break; @@ -934,50 +1002,24 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, continue; } - lock_page(page); + folio_lock(folio); - if (PageTransTail(page)) { - /* Middle of THP: zero out the page */ - clear_highpage(page); - unlock_page(page); - /* - * Partial thp truncate due 'start' in middle - * of THP: don't need to look on these pages - * again on !pvec.nr restart. 
- */ - if (index != round_down(end, HPAGE_PMD_NR)) - start++; - continue; - } else if (PageTransHuge(page)) { - if (index == round_down(end, HPAGE_PMD_NR)) { - /* - * Range ends in the middle of THP: - * zero out the page - */ - clear_highpage(page); - unlock_page(page); - continue; - } - index += HPAGE_PMD_NR - 1; - i += HPAGE_PMD_NR - 1; - } - - if (!unfalloc || !PageUptodate(page)) { - VM_BUG_ON_PAGE(PageTail(page), page); - if (page_mapping(page) == mapping) { - VM_BUG_ON_PAGE(PageWriteback(page), page); - truncate_inode_page(mapping, page); - } else { + if (!unfalloc || !folio_test_uptodate(folio)) { + if (folio_mapping(folio) != mapping) { /* Page was replaced by swap: retry */ - unlock_page(page); + folio_unlock(folio); index--; break; } + VM_BUG_ON_FOLIO(folio_test_writeback(folio), + folio); + truncate_inode_folio(mapping, folio); } - unlock_page(page); + index = folio->index + folio_nr_pages(folio) - 1; + folio_unlock(folio); } - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); + folio_batch_remove_exceptionals(&fbatch); + folio_batch_release(&fbatch); index++; } @@ -991,37 +1033,55 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) { shmem_undo_range(inode, lstart, lend, false); inode->i_ctime = inode->i_mtime = current_time(inode); + inode_inc_iversion(inode); } EXPORT_SYMBOL_GPL(shmem_truncate_range); -static int shmem_getattr(const struct path *path, struct kstat *stat, +static int shmem_getattr(struct user_namespace *mnt_userns, + const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = path->dentry->d_inode; struct shmem_inode_info *info = SHMEM_I(inode); - struct shmem_sb_info *sb_info = SHMEM_SB(inode->i_sb); if (info->alloced - info->swapped != inode->i_mapping->nrpages) { spin_lock_irq(&info->lock); shmem_recalc_inode(inode); spin_unlock_irq(&info->lock); } - generic_fillattr(inode, stat); - - if (is_huge_enabled(sb_info)) + if (info->fsflags & FS_APPEND_FL) + stat->attributes |= STATX_ATTR_APPEND; + if (info->fsflags & FS_IMMUTABLE_FL) + stat->attributes |= STATX_ATTR_IMMUTABLE; + if (info->fsflags & FS_NODUMP_FL) + stat->attributes |= STATX_ATTR_NODUMP; + stat->attributes_mask |= (STATX_ATTR_APPEND | + STATX_ATTR_IMMUTABLE | + STATX_ATTR_NODUMP); + generic_fillattr(&init_user_ns, inode, stat); + + if (shmem_is_huge(NULL, inode, 0, false)) stat->blksize = HPAGE_PMD_SIZE; + if (request_mask & STATX_BTIME) { + stat->result_mask |= STATX_BTIME; + stat->btime.tv_sec = info->i_crtime.tv_sec; + stat->btime.tv_nsec = info->i_crtime.tv_nsec; + } + return 0; } -static int shmem_setattr(struct dentry *dentry, struct iattr *attr) +static int shmem_setattr(struct user_namespace *mnt_userns, + struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct shmem_inode_info *info = SHMEM_I(inode); - struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); int error; + bool update_mtime = false; + bool update_ctime = true; - error = setattr_prepare(dentry, attr); + error = setattr_prepare(&init_user_ns, dentry, attr); if (error) return error; @@ -1029,7 +1089,7 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) loff_t oldsize = inode->i_size; loff_t newsize = attr->ia_size; - /* protected by i_mutex */ + /* protected by i_rwsem */ if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) || (newsize > oldsize && (info->seals & F_SEAL_GROW))) return -EPERM; @@ -1040,7 +1100,9 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) if 
(error) return error; i_size_write(inode, newsize); - inode->i_ctime = inode->i_mtime = current_time(inode); + update_mtime = true; + } else { + update_ctime = false; } if (newsize <= oldsize) { loff_t holebegin = round_up(newsize, PAGE_SIZE); @@ -1054,30 +1116,18 @@ static int shmem_setattr(struct dentry *dentry, struct iattr *attr) if (oldsize > holebegin) unmap_mapping_range(inode->i_mapping, holebegin, 0, 1); - - /* - * Part of the huge page can be beyond i_size: subject - * to shrink under memory pressure. - */ - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) { - spin_lock(&sbinfo->shrinklist_lock); - /* - * _careful to defend against unlocked access to - * ->shrink_list in shmem_unused_huge_shrink() - */ - if (list_empty_careful(&info->shrinklist)) { - list_add_tail(&info->shrinklist, - &sbinfo->shrinklist); - sbinfo->shrinklist_len++; - } - spin_unlock(&sbinfo->shrinklist_lock); - } } } - setattr_copy(inode, attr); + setattr_copy(&init_user_ns, inode, attr); if (attr->ia_valid & ATTR_MODE) - error = posix_acl_chmod(inode, inode->i_mode); + error = posix_acl_chmod(&init_user_ns, inode, inode->i_mode); + if (!error && update_ctime) { + inode->i_ctime = current_time(inode); + if (update_mtime) + inode->i_mtime = inode->i_ctime; + inode_inc_iversion(inode); + } return error; } @@ -1086,9 +1136,10 @@ static void shmem_evict_inode(struct inode *inode) struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); - if (inode->i_mapping->a_ops == &shmem_aops) { + if (shmem_mapping(inode->i_mapping)) { shmem_unacct_size(info->flags, inode->i_size); inode->i_size = 0; + mapping_set_exiting(inode->i_mapping); shmem_truncate_range(inode, 0, (loff_t)-1); if (!list_empty(&info->shrinklist)) { spin_lock(&sbinfo->shrinklist_lock); @@ -1116,75 +1167,68 @@ static void shmem_evict_inode(struct inode *inode) clear_inode(inode); } -extern struct swap_info_struct *swap_info[]; - static int shmem_find_swap_entries(struct address_space *mapping, - pgoff_t start, unsigned int nr_entries, - struct page **entries, pgoff_t *indices, - unsigned int type, bool frontswap) + pgoff_t start, struct folio_batch *fbatch, + pgoff_t *indices, unsigned int type) { XA_STATE(xas, &mapping->i_pages, start); - struct page *page; + struct folio *folio; swp_entry_t entry; - unsigned int ret = 0; - - if (!nr_entries) - return 0; rcu_read_lock(); - xas_for_each(&xas, page, ULONG_MAX) { - if (xas_retry(&xas, page)) + xas_for_each(&xas, folio, ULONG_MAX) { + if (xas_retry(&xas, folio)) continue; - if (!xa_is_value(page)) + if (!xa_is_value(folio)) continue; - entry = radix_to_swp_entry(page); + entry = radix_to_swp_entry(folio); + /* + * swapin error entries can be found in the mapping. But they're + * deliberately ignored here as we've done everything we can do. + */ if (swp_type(entry) != type) continue; - if (frontswap && - !frontswap_test(swap_info[type], swp_offset(entry))) - continue; - indices[ret] = xas.xa_index; - entries[ret] = page; + indices[folio_batch_count(fbatch)] = xas.xa_index; + if (!folio_batch_add(fbatch, folio)) + break; if (need_resched()) { xas_pause(&xas); cond_resched_rcu(); } - if (++ret == nr_entries) - break; } rcu_read_unlock(); - return ret; + return xas.xa_index; } /* * Move the swapped pages for an inode to page cache. Returns the count * of pages swapped in, or the error in case of failure. 
*/ -static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec, - pgoff_t *indices) +static int shmem_unuse_swap_entries(struct inode *inode, + struct folio_batch *fbatch, pgoff_t *indices) { int i = 0; int ret = 0; int error = 0; struct address_space *mapping = inode->i_mapping; - for (i = 0; i < pvec.nr; i++) { - struct page *page = pvec.pages[i]; + for (i = 0; i < folio_batch_count(fbatch); i++) { + struct folio *folio = fbatch->folios[i]; - if (!xa_is_value(page)) + if (!xa_is_value(folio)) continue; - error = shmem_swapin_page(inode, indices[i], - &page, SGP_CACHE, + error = shmem_swapin_folio(inode, indices[i], + &folio, SGP_CACHE, mapping_gfp_mask(mapping), NULL, NULL); if (error == 0) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); ret++; } if (error == -ENOMEM) @@ -1197,44 +1241,27 @@ static int shmem_unuse_swap_entries(struct inode *inode, struct pagevec pvec, /* * If swap found in inode, free it and move page from swapcache to filecache. */ -static int shmem_unuse_inode(struct inode *inode, unsigned int type, - bool frontswap, unsigned long *fs_pages_to_unuse) +static int shmem_unuse_inode(struct inode *inode, unsigned int type) { struct address_space *mapping = inode->i_mapping; pgoff_t start = 0; - struct pagevec pvec; + struct folio_batch fbatch; pgoff_t indices[PAGEVEC_SIZE]; - bool frontswap_partial = (frontswap && *fs_pages_to_unuse > 0); int ret = 0; - pagevec_init(&pvec); do { - unsigned int nr_entries = PAGEVEC_SIZE; - - if (frontswap_partial && *fs_pages_to_unuse < PAGEVEC_SIZE) - nr_entries = *fs_pages_to_unuse; - - pvec.nr = shmem_find_swap_entries(mapping, start, nr_entries, - pvec.pages, indices, - type, frontswap); - if (pvec.nr == 0) { + folio_batch_init(&fbatch); + shmem_find_swap_entries(mapping, start, &fbatch, indices, type); + if (folio_batch_count(&fbatch) == 0) { ret = 0; break; } - ret = shmem_unuse_swap_entries(inode, pvec, indices); + ret = shmem_unuse_swap_entries(inode, &fbatch, indices); if (ret < 0) break; - if (frontswap_partial) { - *fs_pages_to_unuse -= ret; - if (*fs_pages_to_unuse == 0) { - ret = FRONTSWAP_PAGES_UNUSED; - break; - } - } - - start = indices[pvec.nr - 1]; + start = indices[folio_batch_count(&fbatch) - 1]; } while (true); return ret; @@ -1245,8 +1272,7 @@ static int shmem_unuse_inode(struct inode *inode, unsigned int type, * device 'type' back into memory, so the swap device can be * unused. 
*/ -int shmem_unuse(unsigned int type, bool frontswap, - unsigned long *fs_pages_to_unuse) +int shmem_unuse(unsigned int type) { struct shmem_inode_info *info, *next; int error = 0; @@ -1269,8 +1295,7 @@ int shmem_unuse(unsigned int type, bool frontswap, atomic_inc(&info->stop_eviction); mutex_unlock(&shmem_swaplist_mutex); - error = shmem_unuse_inode(&info->vfs_inode, type, frontswap, - fs_pages_to_unuse); + error = shmem_unuse_inode(&info->vfs_inode, type); cond_resched(); mutex_lock(&shmem_swaplist_mutex); @@ -1292,16 +1317,30 @@ int shmem_unuse(unsigned int type, bool frontswap, */ static int shmem_writepage(struct page *page, struct writeback_control *wbc) { + struct folio *folio = page_folio(page); struct shmem_inode_info *info; struct address_space *mapping; struct inode *inode; swp_entry_t swap; pgoff_t index; - VM_BUG_ON_PAGE(PageCompound(page), page); - BUG_ON(!PageLocked(page)); - mapping = page->mapping; - index = page->index; + /* + * If /sys/kernel/mm/transparent_hugepage/shmem_enabled is "always" or + * "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages, + * and its shmem_writeback() needs them to be split when swapping. + */ + if (folio_test_large(folio)) { + /* Ensure the subpages are still dirty */ + folio_test_set_dirty(folio); + if (split_huge_page(page) < 0) + goto redirty; + folio = page_folio(page); + folio_clear_dirty(folio); + } + + BUG_ON(!folio_test_locked(folio)); + mapping = folio->mapping; + index = folio->index; inode = mapping->host; info = SHMEM_I(inode); if (info->flags & VM_LOCKED) @@ -1324,15 +1363,15 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) /* * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC * value into swapfile.c, the only way we can correctly account for a - * fallocated page arriving here is now to initialize it and write it. + * fallocated folio arriving here is now to initialize it and write it. * - * That's okay for a page already fallocated earlier, but if we have + * That's okay for a folio already fallocated earlier, but if we have * not yet completed the fallocation, then (a) we want to keep track - * of this page in case we have to undo it, and (b) it may not be a + * of this folio in case we have to undo it, and (b) it may not be a * good idea to continue anyway, once we're pushing into swap. So - * reactivate the page, and let shmem_fallocate() quit when too many. + * reactivate the folio, and let shmem_fallocate() quit when too many. */ - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { if (inode->i_private) { struct shmem_falloc *shmem_falloc; spin_lock(&inode->i_lock); @@ -1348,18 +1387,18 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) if (shmem_falloc) goto redirty; } - clear_highpage(page); - flush_dcache_page(page); - SetPageUptodate(page); + folio_zero_range(folio, 0, folio_size(folio)); + flush_dcache_folio(folio); + folio_mark_uptodate(folio); } - swap = get_swap_page(page); + swap = folio_alloc_swap(folio); if (!swap.val) goto redirty; /* * Add inode to shmem_unuse()'s list of swapped-out inodes, - * if it's not already there. Do it now before the page is + * if it's not already there. Do it now before the folio is * moved to swap cache, when its pagelock no longer protects * the inode from eviction. 
But don't unlock the mutex until * we've incremented swapped, because shmem_unuse_inode() will @@ -1369,29 +1408,30 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) if (list_empty(&info->swaplist)) list_add(&info->swaplist, &shmem_swaplist); - if (add_to_swap_cache(page, swap, - __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN) == 0) { + if (add_to_swap_cache(folio, swap, + __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN, + NULL) == 0) { spin_lock_irq(&info->lock); shmem_recalc_inode(inode); info->swapped++; spin_unlock_irq(&info->lock); swap_shmem_alloc(swap); - shmem_delete_from_page_cache(page, swp_to_radix_entry(swap)); + shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap)); mutex_unlock(&shmem_swaplist_mutex); - BUG_ON(page_mapped(page)); - swap_writepage(page, wbc); + BUG_ON(folio_mapped(folio)); + swap_writepage(&folio->page, wbc); return 0; } mutex_unlock(&shmem_swaplist_mutex); - put_swap_page(page, swap); + put_swap_folio(folio, swap); redirty: - set_page_dirty(page); + folio_mark_dirty(folio); if (wbc->for_reclaim) - return AOP_WRITEPAGE_ACTIVATE; /* Return with page locked */ - unlock_page(page); + return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */ + folio_unlock(folio); return 0; } @@ -1412,10 +1452,10 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) { struct mempolicy *mpol = NULL; if (sbinfo->mpol) { - spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */ + raw_spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */ mpol = sbinfo->mpol; mpol_get(mpol); - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); } return mpol; } @@ -1448,32 +1488,55 @@ static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma) mpol_cond_put(vma->vm_policy); } -static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, +static struct folio *shmem_swapin(swp_entry_t swap, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { struct vm_area_struct pvma; struct page *page; - struct vm_fault vmf; + struct vm_fault vmf = { + .vma = &pvma, + }; shmem_pseudo_vma_init(&pvma, info, index); - vmf.vma = &pvma; - vmf.address = 0; page = swap_cluster_readahead(swap, gfp, &vmf); shmem_pseudo_vma_destroy(&pvma); - return page; + if (!page) + return NULL; + return page_folio(page); +} + +/* + * Make sure huge_gfp is always more limited than limit_gfp. + * Some of the flags set permissions, while others set limitations. + */ +static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) +{ + gfp_t allowflags = __GFP_IO | __GFP_FS | __GFP_RECLAIM; + gfp_t denyflags = __GFP_NOWARN | __GFP_NORETRY; + gfp_t zoneflags = limit_gfp & GFP_ZONEMASK; + gfp_t result = huge_gfp & ~(allowflags | GFP_ZONEMASK); + + /* Allow allocations only from the originally specified zones. */ + result |= zoneflags; + + /* + * Minimize the result gfp by taking the union with the deny flags, + * and the intersection of the allow flags. 
+ */ + result |= (limit_gfp & denyflags); + result |= (huge_gfp & limit_gfp) & allowflags; + + return result; } -static struct page *shmem_alloc_hugepage(gfp_t gfp, +static struct folio *shmem_alloc_hugefolio(gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { struct vm_area_struct pvma; struct address_space *mapping = info->vfs_inode.i_mapping; pgoff_t hindex; - struct page *page; - - if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) - return NULL; + struct folio *folio; hindex = round_down(index, HPAGE_PMD_NR); if (xa_find(&mapping->i_pages, &hindex, hindex + HPAGE_PMD_NR - 1, @@ -1481,37 +1544,35 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp, return NULL; shmem_pseudo_vma_init(&pvma, info, hindex); - page = alloc_pages_vma(gfp | __GFP_COMP | __GFP_NORETRY | __GFP_NOWARN, - HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), true); + folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, &pvma, 0, true); shmem_pseudo_vma_destroy(&pvma); - if (page) - prep_transhuge_page(page); - return page; + if (!folio) + count_vm_event(THP_FILE_FALLBACK); + return folio; } -static struct page *shmem_alloc_page(gfp_t gfp, +static struct folio *shmem_alloc_folio(gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { struct vm_area_struct pvma; - struct page *page; + struct folio *folio; shmem_pseudo_vma_init(&pvma, info, index); - page = alloc_page_vma(gfp, &pvma, 0); + folio = vma_alloc_folio(gfp, 0, &pvma, 0, false); shmem_pseudo_vma_destroy(&pvma); - return page; + return folio; } -static struct page *shmem_alloc_and_acct_page(gfp_t gfp, - struct inode *inode, +static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode, pgoff_t index, bool huge) { struct shmem_inode_info *info = SHMEM_I(inode); - struct page *page; + struct folio *folio; int nr; int err = -ENOSPC; - if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) huge = false; nr = huge ? HPAGE_PMD_NR : 1; @@ -1519,13 +1580,13 @@ static struct page *shmem_alloc_and_acct_page(gfp_t gfp, goto failed; if (huge) - page = shmem_alloc_hugepage(gfp, info, index); + folio = shmem_alloc_hugefolio(gfp, info, index); else - page = shmem_alloc_page(gfp, info, index); - if (page) { - __SetPageLocked(page); - __SetPageSwapBacked(page); - return page; + folio = shmem_alloc_folio(gfp, info, index); + if (folio) { + __folio_set_locked(folio); + __folio_set_swapbacked(folio); + return folio; } err = -ENOMEM; @@ -1536,7 +1597,7 @@ failed: /* * When a page is moved from swapcache to shmem filecache (either by the - * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of + * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of * shmem_unuse_inode()), it may have been read in earlier from swap, in * ignorance of the mapping it belongs to. If that mapping has special * constraints (like the gma500 GEM driver, which requires RAM below 4GB), @@ -1546,53 +1607,57 @@ failed: * NUMA mempolicy, and applied also to anonymous pages in do_swap_page(); * but for now it is a simple matter of zone. 
*/ -static bool shmem_should_replace_page(struct page *page, gfp_t gfp) +static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp) { - return page_zonenum(page) > gfp_zone(gfp); + return folio_zonenum(folio) > gfp_zone(gfp); } -static int shmem_replace_page(struct page **pagep, gfp_t gfp, +static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { - struct page *oldpage, *newpage; + struct folio *old, *new; struct address_space *swap_mapping; swp_entry_t entry; pgoff_t swap_index; int error; - oldpage = *pagep; - entry.val = page_private(oldpage); + old = *foliop; + entry = folio_swap_entry(old); swap_index = swp_offset(entry); - swap_mapping = page_mapping(oldpage); + swap_mapping = swap_address_space(entry); /* * We have arrived here because our zones are constrained, so don't * limit chance of success by further cpuset and node constraints. */ gfp &= ~GFP_CONSTRAINT_MASK; - newpage = shmem_alloc_page(gfp, info, index); - if (!newpage) + VM_BUG_ON_FOLIO(folio_test_large(old), old); + new = shmem_alloc_folio(gfp, info, index); + if (!new) return -ENOMEM; - get_page(newpage); - copy_highpage(newpage, oldpage); - flush_dcache_page(newpage); + folio_get(new); + folio_copy(new, old); + flush_dcache_folio(new); - __SetPageLocked(newpage); - __SetPageSwapBacked(newpage); - SetPageUptodate(newpage); - set_page_private(newpage, entry.val); - SetPageSwapCache(newpage); + __folio_set_locked(new); + __folio_set_swapbacked(new); + folio_mark_uptodate(new); + folio_set_swap_entry(new, entry); + folio_set_swapcache(new); /* * Our caller will very soon move newpage out of swapcache, but it's * a nice clean interface for us to replace oldpage by newpage there. */ xa_lock_irq(&swap_mapping->i_pages); - error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage); + error = shmem_replace_entry(swap_mapping, swap_index, old, new); if (!error) { - __inc_node_page_state(newpage, NR_FILE_PAGES); - __dec_node_page_state(oldpage, NR_FILE_PAGES); + mem_cgroup_migrate(old, new); + __lruvec_stat_mod_folio(new, NR_FILE_PAGES, 1); + __lruvec_stat_mod_folio(new, NR_SHMEM, 1); + __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -1); + __lruvec_stat_mod_folio(old, NR_SHMEM, -1); } xa_unlock_irq(&swap_mapping->i_pages); @@ -1602,48 +1667,78 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, * both PageSwapCache and page_private after getting page lock; * but be defensive. Reverse old to newpage for clear and free. 
*/ - oldpage = newpage; + old = new; } else { - mem_cgroup_migrate(oldpage, newpage); - lru_cache_add_anon(newpage); - *pagep = newpage; + folio_add_lru(new); + *foliop = new; } - ClearPageSwapCache(oldpage); - set_page_private(oldpage, 0); + folio_clear_swapcache(old); + old->private = NULL; - unlock_page(oldpage); - put_page(oldpage); - put_page(oldpage); + folio_unlock(old); + folio_put_refs(old, 2); return error; } +static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index, + struct folio *folio, swp_entry_t swap) +{ + struct address_space *mapping = inode->i_mapping; + struct shmem_inode_info *info = SHMEM_I(inode); + swp_entry_t swapin_error; + void *old; + + swapin_error = make_swapin_error_entry(&folio->page); + old = xa_cmpxchg_irq(&mapping->i_pages, index, + swp_to_radix_entry(swap), + swp_to_radix_entry(swapin_error), 0); + if (old != swp_to_radix_entry(swap)) + return; + + folio_wait_writeback(folio); + delete_from_swap_cache(folio); + spin_lock_irq(&info->lock); + /* + * Don't treat swapin error folio as alloced. Otherwise inode->i_blocks won't + * be 0 when inode is released and thus trigger WARN_ON(inode->i_blocks) in + * shmem_evict_inode. + */ + info->alloced--; + info->swapped--; + shmem_recalc_inode(inode); + spin_unlock_irq(&info->lock); + swap_free(swap); +} + /* - * Swap in the page pointed to by *pagep. - * Caller has to make sure that *pagep contains a valid swapped page. - * Returns 0 and the page in pagep if success. On failure, returns the - * the error code and NULL in *pagep. + * Swap in the folio pointed to by *foliop. + * Caller has to make sure that *foliop contains a valid swapped folio. + * Returns 0 and the folio in foliop if success. On failure, returns the + * error code and NULL in *foliop. */ -static int shmem_swapin_page(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp, +static int shmem_swapin_folio(struct inode *inode, pgoff_t index, + struct folio **foliop, enum sgp_type sgp, gfp_t gfp, struct vm_area_struct *vma, vm_fault_t *fault_type) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); - struct mm_struct *charge_mm = vma ? vma->vm_mm : current->mm; - struct mem_cgroup *memcg; - struct page *page; + struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL; + struct folio *folio = NULL; swp_entry_t swap; int error; - VM_BUG_ON(!*pagep || !xa_is_value(*pagep)); - swap = radix_to_swp_entry(*pagep); - *pagep = NULL; + VM_BUG_ON(!*foliop || !xa_is_value(*foliop)); + swap = radix_to_swp_entry(*foliop); + *foliop = NULL; + + if (is_swapin_error_entry(swap)) + return -EIO; /* Look it up and read it in.. */ - page = lookup_swap_cache(swap, NULL, 0); - if (!page) { + folio = swap_cache_get_folio(swap, NULL, 0); + if (!folio) { /* Or update major stats only when swapin succeeds?? 
*/ if (fault_type) { *fault_type |= VM_FAULT_MAJOR; @@ -1651,114 +1746,101 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index, count_memcg_event_mm(charge_mm, PGMAJFAULT); } /* Here we actually start the io */ - page = shmem_swapin(swap, gfp, info, index); - if (!page) { + folio = shmem_swapin(swap, gfp, info, index); + if (!folio) { error = -ENOMEM; goto failed; } } - /* We have to do this with page locked to prevent races */ - lock_page(page); - if (!PageSwapCache(page) || page_private(page) != swap.val || + /* We have to do this with folio locked to prevent races */ + folio_lock(folio); + if (!folio_test_swapcache(folio) || + folio_swap_entry(folio).val != swap.val || !shmem_confirm_swap(mapping, index, swap)) { error = -EEXIST; goto unlock; } - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { error = -EIO; goto failed; } - wait_on_page_writeback(page); + folio_wait_writeback(folio); + + /* + * Some architectures may have to restore extra metadata to the + * folio after reading from swap. + */ + arch_swap_restore(swap, folio); - if (shmem_should_replace_page(page, gfp)) { - error = shmem_replace_page(&page, gfp, info, index); + if (shmem_should_replace_folio(folio, gfp)) { + error = shmem_replace_folio(&folio, gfp, info, index); if (error) goto failed; } - error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, - false); - if (!error) { - error = shmem_add_to_page_cache(page, mapping, index, - swp_to_radix_entry(swap), gfp); - /* - * We already confirmed swap under page lock, and make - * no memory allocation here, so usually no possibility - * of error; but free_swap_and_cache() only trylocks a - * page, so it is just possible that the entry has been - * truncated or holepunched since swap was confirmed. - * shmem_undo_range() will have done some of the - * unaccounting, now delete_from_swap_cache() will do - * the rest. - */ - if (error) { - mem_cgroup_cancel_charge(page, memcg, false); - delete_from_swap_cache(page); - } - } + error = shmem_add_to_page_cache(folio, mapping, index, + swp_to_radix_entry(swap), gfp, + charge_mm); if (error) goto failed; - mem_cgroup_commit_charge(page, memcg, true, false); - spin_lock_irq(&info->lock); info->swapped--; shmem_recalc_inode(inode); spin_unlock_irq(&info->lock); if (sgp == SGP_WRITE) - mark_page_accessed(page); + folio_mark_accessed(folio); - delete_from_swap_cache(page); - set_page_dirty(page); + delete_from_swap_cache(folio); + folio_mark_dirty(folio); swap_free(swap); - *pagep = page; + *foliop = folio; return 0; failed: if (!shmem_confirm_swap(mapping, index, swap)) error = -EEXIST; + if (error == -EIO) + shmem_set_folio_swapin_error(inode, index, folio, swap); unlock: - if (page) { - unlock_page(page); - put_page(page); + if (folio) { + folio_unlock(folio); + folio_put(folio); } return error; } /* - * shmem_getpage_gfp - find page in cache, or get from swap, or allocate + * shmem_get_folio_gfp - find page in cache, or get from swap, or allocate * * If we allocate a new one we do not mark it dirty. That's up to the * vm. If we swap it in we mark it dirty since we also free the swap * entry since a page cannot live in both the swap and page cache. * - * vmf and fault_type are only supplied by shmem_fault: + * vma, vmf, and fault_type are only supplied by shmem_fault: * otherwise they are NULL. 
*/ -static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp, gfp_t gfp, - struct vm_area_struct *vma, struct vm_fault *vmf, - vm_fault_t *fault_type) +static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, + struct folio **foliop, enum sgp_type sgp, gfp_t gfp, + struct vm_area_struct *vma, struct vm_fault *vmf, + vm_fault_t *fault_type) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo; struct mm_struct *charge_mm; - struct mem_cgroup *memcg; - struct page *page; - enum sgp_type sgp_huge = sgp; + struct folio *folio; pgoff_t hindex = index; + gfp_t huge_gfp; int error; int once = 0; int alloced = 0; if (index > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) return -EFBIG; - if (sgp == SGP_NOHUGE || sgp == SGP_HUGE) - sgp = SGP_CACHE; repeat: if (sgp <= SGP_CACHE && ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { @@ -1766,38 +1848,53 @@ repeat: } sbinfo = SHMEM_SB(inode->i_sb); - charge_mm = vma ? vma->vm_mm : current->mm; + charge_mm = vma ? vma->vm_mm : NULL; - page = find_lock_entry(mapping, index); - if (xa_is_value(page)) { - error = shmem_swapin_page(inode, index, &page, + folio = __filemap_get_folio(mapping, index, FGP_ENTRY | FGP_LOCK, 0); + if (folio && vma && userfaultfd_minor(vma)) { + if (!xa_is_value(folio)) { + folio_unlock(folio); + folio_put(folio); + } + *fault_type = handle_userfault(vmf, VM_UFFD_MINOR); + return 0; + } + + if (xa_is_value(folio)) { + error = shmem_swapin_folio(inode, index, &folio, sgp, gfp, vma, fault_type); if (error == -EEXIST) goto repeat; - *pagep = page; + *foliop = folio; return error; } - if (page && sgp == SGP_WRITE) - mark_page_accessed(page); - - /* fallocated page? */ - if (page && !PageUptodate(page)) { + if (folio) { + hindex = folio->index; + if (sgp == SGP_WRITE) + folio_mark_accessed(folio); + if (folio_test_uptodate(folio)) + goto out; + /* fallocated folio */ if (sgp != SGP_READ) goto clear; - unlock_page(page); - put_page(page); - page = NULL; + folio_unlock(folio); + folio_put(folio); } - if (page || sgp == SGP_READ) { - *pagep = page; + + /* + * SGP_READ: succeed on hole, with NULL folio, letting caller zero. + * SGP_NOALLOC: fail on hole, with NULL folio, letting caller fail. + */ + *foliop = NULL; + if (sgp == SGP_READ) return 0; - } + if (sgp == SGP_NOALLOC) + return -ENOENT; /* - * Fast cache lookup did not find it: - * bring it back from swap or allocate. + * Fast cache lookup and swap lookup did not find it: allocate. 
*/ if (vma && userfaultfd_missing(vma)) { @@ -1805,48 +1902,25 @@ repeat: return 0; } - /* shmem_symlink() */ - if (mapping->a_ops != &shmem_aops) - goto alloc_nohuge; - if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE) + if (!shmem_is_huge(vma, inode, index, false)) goto alloc_nohuge; - if (shmem_huge == SHMEM_HUGE_FORCE) - goto alloc_huge; - switch (sbinfo->huge) { - loff_t i_size; - pgoff_t off; - case SHMEM_HUGE_NEVER: - goto alloc_nohuge; - case SHMEM_HUGE_WITHIN_SIZE: - off = round_up(index, HPAGE_PMD_NR); - i_size = round_up(i_size_read(inode), PAGE_SIZE); - if (i_size >= HPAGE_PMD_SIZE && - i_size >> PAGE_SHIFT >= off) - goto alloc_huge; - /* fallthrough */ - case SHMEM_HUGE_ADVISE: - if (sgp_huge == SGP_HUGE) - goto alloc_huge; - /* TODO: implement fadvise() hints */ - goto alloc_nohuge; - } -alloc_huge: - page = shmem_alloc_and_acct_page(gfp, inode, index, true); - if (IS_ERR(page)) { + huge_gfp = vma_thp_gfp_mask(vma); + huge_gfp = limit_gfp_mask(huge_gfp, gfp); + folio = shmem_alloc_and_acct_folio(huge_gfp, inode, index, true); + if (IS_ERR(folio)) { alloc_nohuge: - page = shmem_alloc_and_acct_page(gfp, inode, - index, false); + folio = shmem_alloc_and_acct_folio(gfp, inode, index, false); } - if (IS_ERR(page)) { + if (IS_ERR(folio)) { int retry = 5; - error = PTR_ERR(page); - page = NULL; + error = PTR_ERR(folio); + folio = NULL; if (error != -ENOSPC) goto unlock; /* - * Try to reclaim some space by splitting a huge page + * Try to reclaim some space by splitting a large folio * beyond i_size on the filesystem. */ while (retry--) { @@ -1861,41 +1935,30 @@ alloc_nohuge: goto unlock; } - if (PageTransHuge(page)) - hindex = round_down(index, HPAGE_PMD_NR); - else - hindex = index; + hindex = round_down(index, folio_nr_pages(folio)); if (sgp == SGP_WRITE) - __SetPageReferenced(page); + __folio_set_referenced(folio); - error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg, - PageTransHuge(page)); + error = shmem_add_to_page_cache(folio, mapping, hindex, + NULL, gfp & GFP_RECLAIM_MASK, + charge_mm); if (error) goto unacct; - error = shmem_add_to_page_cache(page, mapping, hindex, - NULL, gfp & GFP_RECLAIM_MASK); - if (error) { - mem_cgroup_cancel_charge(page, memcg, - PageTransHuge(page)); - goto unacct; - } - mem_cgroup_commit_charge(page, memcg, false, - PageTransHuge(page)); - lru_cache_add_anon(page); + folio_add_lru(folio); spin_lock_irq(&info->lock); - info->alloced += compound_nr(page); - inode->i_blocks += BLOCKS_PER_PAGE << compound_order(page); + info->alloced += folio_nr_pages(folio); + inode->i_blocks += (blkcnt_t)BLOCKS_PER_PAGE << folio_order(folio); shmem_recalc_inode(inode); spin_unlock_irq(&info->lock); alloced = true; - if (PageTransHuge(page) && + if (folio_test_pmd_mappable(folio) && DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < - hindex + HPAGE_PMD_NR - 1) { + folio_next_index(folio) - 1) { /* - * Part of the huge page is beyond i_size: subject + * Part of the large folio is beyond i_size: subject * to shrink under memory pressure. */ spin_lock(&sbinfo->shrinklist_lock); @@ -1912,33 +1975,31 @@ alloc_nohuge: } /* - * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. + * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio. 
*/ if (sgp == SGP_FALLOC) sgp = SGP_WRITE; clear: /* - * Let SGP_WRITE caller clear ends if write does not fill page; - * but SGP_FALLOC on a page fallocated earlier must initialize + * Let SGP_WRITE caller clear ends if write does not fill folio; + * but SGP_FALLOC on a folio fallocated earlier must initialize * it now, lest undo on failure cancel our earlier guarantee. */ - if (sgp != SGP_WRITE && !PageUptodate(page)) { - struct page *head = compound_head(page); - int i; + if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) { + long i, n = folio_nr_pages(folio); - for (i = 0; i < compound_nr(head); i++) { - clear_highpage(head + i); - flush_dcache_page(head + i); - } - SetPageUptodate(head); + for (i = 0; i < n; i++) + clear_highpage(folio_page(folio, i)); + flush_dcache_folio(folio); + folio_mark_uptodate(folio); } /* Perhaps the file has been truncated since we checked */ if (sgp <= SGP_CACHE && ((loff_t)index << PAGE_SHIFT) >= i_size_read(inode)) { if (alloced) { - ClearPageDirty(page); - delete_from_page_cache(page); + folio_clear_dirty(folio); + filemap_remove_folio(folio); spin_lock_irq(&info->lock); shmem_recalc_inode(inode); spin_unlock_irq(&info->lock); @@ -1946,24 +2007,25 @@ clear: error = -EINVAL; goto unlock; } - *pagep = page + index - hindex; +out: + *foliop = folio; return 0; /* * Error recovery. */ unacct: - shmem_inode_unacct_blocks(inode, compound_nr(page)); + shmem_inode_unacct_blocks(inode, folio_nr_pages(folio)); - if (PageTransHuge(page)) { - unlock_page(page); - put_page(page); + if (folio_test_large(folio)) { + folio_unlock(folio); + folio_put(folio); goto alloc_nohuge; } unlock: - if (page) { - unlock_page(page); - put_page(page); + if (folio) { + folio_unlock(folio); + folio_put(folio); } if (error == -ENOSPC && !once++) { spin_lock_irq(&info->lock); @@ -1976,6 +2038,13 @@ unlock: return error; } +int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, + enum sgp_type sgp) +{ + return shmem_get_folio_gfp(inode, index, foliop, sgp, + mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL); +} + /* * This is like autoremove_wake_function, but it removes the wait queue * entry unconditionally - even if something else had already woken the @@ -1993,14 +2062,14 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; struct inode *inode = file_inode(vma->vm_file); gfp_t gfp = mapping_gfp_mask(inode->i_mapping); - enum sgp_type sgp; + struct folio *folio = NULL; int err; vm_fault_t ret = VM_FAULT_LOCKED; /* * Trinity finds that probing a hole which tmpfs is punching can * prevent the hole-punch from ever completing: which in turn - * locks writers out with its hold on i_mutex. So refrain from + * locks writers out with its hold on i_rwsem. So refrain from * faulting pages into the hole while it's being punched. Although * shmem_undo_range() does remove the additions, it may be unable to * keep up, as each new page needs its own unmap_mapping_range() call, @@ -2011,7 +2080,7 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) * we just need to make racing faults a rare case. * * The implementation below would be much simpler if we just used a - * standard mutex or completion: but we cannot take i_mutex in fault, + * standard mutex or completion: but we cannot take i_rwsem in fault, * and bloating every shmem inode for this unlikely case would be sad. 
*/ if (unlikely(inode->i_private)) { @@ -2056,18 +2125,12 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) spin_unlock(&inode->i_lock); } - sgp = SGP_CACHE; - - if ((vma->vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) - sgp = SGP_NOHUGE; - else if (vma->vm_flags & VM_HUGEPAGE) - sgp = SGP_HUGE; - - err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp, + err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE, gfp, vma, vmf, &ret); if (err) return vmf_error(err); + if (folio) + vmf->page = folio_file_page(folio, vmf->pgoff); return ret; } @@ -2089,7 +2152,7 @@ unsigned long shmem_get_unmapped_area(struct file *file, get_area = current->mm->get_unmapped_area; addr = get_area(file, uaddr, len, pgoff, flags); - if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) return addr; if (IS_ERR_VALUE(addr)) return addr; @@ -2178,91 +2241,114 @@ static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, } #endif -int shmem_lock(struct file *file, int lock, struct user_struct *user) +int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) { struct inode *inode = file_inode(file); struct shmem_inode_info *info = SHMEM_I(inode); int retval = -ENOMEM; - spin_lock_irq(&info->lock); + /* + * What serializes the accesses to info->flags? + * ipc_lock_object() when called from shmctl_do_lock(), + * no serialization needed when called from shm_destroy(). + */ if (lock && !(info->flags & VM_LOCKED)) { - if (!user_shm_lock(inode->i_size, user)) + if (!user_shm_lock(inode->i_size, ucounts)) goto out_nomem; info->flags |= VM_LOCKED; mapping_set_unevictable(file->f_mapping); } - if (!lock && (info->flags & VM_LOCKED) && user) { - user_shm_unlock(inode->i_size, user); + if (!lock && (info->flags & VM_LOCKED) && ucounts) { + user_shm_unlock(inode->i_size, ucounts); info->flags &= ~VM_LOCKED; mapping_clear_unevictable(file->f_mapping); } retval = 0; out_nomem: - spin_unlock_irq(&info->lock); return retval; } static int shmem_mmap(struct file *file, struct vm_area_struct *vma) { struct shmem_inode_info *info = SHMEM_I(file_inode(file)); + int ret; - if (info->seals & F_SEAL_FUTURE_WRITE) { - /* - * New PROT_WRITE and MAP_SHARED mmaps are not allowed when - * "future write" seal active. - */ - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) - return -EPERM; + ret = seal_check_future_write(info->seals, vma); + if (ret) + return ret; - /* - * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as - * MAP_SHARED and read-only, take care to not allow mprotect to - * revert protections on such mappings. Do this only for shared - * mappings. For private mappings, don't need to mask - * VM_MAYWRITE as we still want them to be COW-writable. - */ - if (vma->vm_flags & VM_SHARED) - vma->vm_flags &= ~(VM_MAYWRITE); - } + /* arm64 - allow memory tagging on RAM-based files */ + vma->vm_flags |= VM_MTE_ALLOWED; file_accessed(file); vma->vm_ops = &shmem_vm_ops; - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && - ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) < - (vma->vm_end & HPAGE_PMD_MASK)) { - khugepaged_enter(vma, vma->vm_flags); - } return 0; } -static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir, +#ifdef CONFIG_TMPFS_XATTR +static int shmem_initxattrs(struct inode *, const struct xattr *, void *); + +/* + * chattr's fsflags are unrelated to extended attributes, + * but tmpfs has chosen to enable them under the same config option. 
+ */ +static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags) +{ + unsigned int i_flags = 0; + + if (fsflags & FS_NOATIME_FL) + i_flags |= S_NOATIME; + if (fsflags & FS_APPEND_FL) + i_flags |= S_APPEND; + if (fsflags & FS_IMMUTABLE_FL) + i_flags |= S_IMMUTABLE; + /* + * But FS_NODUMP_FL does not require any action in i_flags. + */ + inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE); +} +#else +static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags) +{ +} +#define shmem_initxattrs NULL +#endif + +static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir, umode_t mode, dev_t dev, unsigned long flags) { struct inode *inode; struct shmem_inode_info *info; struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + ino_t ino; - if (shmem_reserve_inode(sb)) + if (shmem_reserve_inode(sb, &ino)) return NULL; inode = new_inode(sb); if (inode) { - inode->i_ino = get_next_ino(); - inode_init_owner(inode, dir, mode); + inode->i_ino = ino; + inode_init_owner(&init_user_ns, inode, dir, mode); inode->i_blocks = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); - inode->i_generation = prandom_u32(); + inode->i_generation = get_random_u32(); info = SHMEM_I(inode); memset(info, 0, (char *)inode - (char *)info); spin_lock_init(&info->lock); atomic_set(&info->stop_eviction, 0); info->seals = F_SEAL_SEAL; info->flags = flags & VM_NORESERVE; + info->i_crtime = inode->i_mtime; + info->fsflags = (dir == NULL) ? 0 : + SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED; + if (info->fsflags) + shmem_set_inode_flags(inode, info->fsflags); INIT_LIST_HEAD(&info->shrinklist); INIT_LIST_HEAD(&info->swaplist); simple_xattrs_init(&info->xattrs); cache_no_acl(inode); + mapping_set_large_folios(inode->i_mapping); switch (mode & S_IFMT) { default: @@ -2298,185 +2384,142 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode return inode; } -bool shmem_mapping(struct address_space *mapping) -{ - return mapping->a_ops == &shmem_aops; -} - -static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, - pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - bool zeropage, - struct page **pagep) +#ifdef CONFIG_USERFAULTFD +int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, + pmd_t *dst_pmd, + struct vm_area_struct *dst_vma, + unsigned long dst_addr, + unsigned long src_addr, + bool zeropage, bool wp_copy, + struct page **pagep) { struct inode *inode = file_inode(dst_vma->vm_file); struct shmem_inode_info *info = SHMEM_I(inode); struct address_space *mapping = inode->i_mapping; gfp_t gfp = mapping_gfp_mask(mapping); pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); - struct mem_cgroup *memcg; - spinlock_t *ptl; void *page_kaddr; - struct page *page; - pte_t _dst_pte, *dst_pte; + struct folio *folio; int ret; - pgoff_t offset, max_off; + pgoff_t max_off; - ret = -ENOMEM; - if (!shmem_inode_acct_block(inode, 1)) - goto out; + if (!shmem_inode_acct_block(inode, 1)) { + /* + * We may have got a page, returned -ENOENT triggering a retry, + * and now we find ourselves with -ENOMEM. Release the page, to + * avoid a BUG_ON in our caller. 
+ */ + if (unlikely(*pagep)) { + put_page(*pagep); + *pagep = NULL; + } + return -ENOMEM; + } if (!*pagep) { - page = shmem_alloc_page(gfp, info, pgoff); - if (!page) + ret = -ENOMEM; + folio = shmem_alloc_folio(gfp, info, pgoff); + if (!folio) goto out_unacct_blocks; - if (!zeropage) { /* mcopy_atomic */ - page_kaddr = kmap_atomic(page); + if (!zeropage) { /* COPY */ + page_kaddr = kmap_local_folio(folio, 0); + /* + * The read mmap_lock is held here. Despite the + * mmap_lock being read recursive a deadlock is still + * possible if a writer has taken a lock. For example: + * + * process A thread 1 takes read lock on own mmap_lock + * process A thread 2 calls mmap, blocks taking write lock + * process B thread 1 takes page fault, read lock on own mmap lock + * process B thread 2 calls mmap, blocks taking write lock + * process A thread 1 blocks taking read lock on process B + * process B thread 1 blocks taking read lock on process A + * + * Disable page faults to prevent potential deadlock + * and retry the copy outside the mmap_lock. + */ + pagefault_disable(); ret = copy_from_user(page_kaddr, (const void __user *)src_addr, PAGE_SIZE); - kunmap_atomic(page_kaddr); + pagefault_enable(); + kunmap_local(page_kaddr); - /* fallback to copy_from_user outside mmap_sem */ + /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { - *pagep = page; - shmem_inode_unacct_blocks(inode, 1); + *pagep = &folio->page; + ret = -ENOENT; /* don't free the page */ - return -ENOENT; + goto out_unacct_blocks; } - } else { /* mfill_zeropage_atomic */ - clear_highpage(page); + + flush_dcache_folio(folio); + } else { /* ZEROPAGE */ + clear_user_highpage(&folio->page, dst_addr); } } else { - page = *pagep; + folio = page_folio(*pagep); + VM_BUG_ON_FOLIO(folio_test_large(folio), folio); *pagep = NULL; } - VM_BUG_ON(PageLocked(page) || PageSwapBacked(page)); - __SetPageLocked(page); - __SetPageSwapBacked(page); - __SetPageUptodate(page); + VM_BUG_ON(folio_test_locked(folio)); + VM_BUG_ON(folio_test_swapbacked(folio)); + __folio_set_locked(folio); + __folio_set_swapbacked(folio); + __folio_mark_uptodate(folio); ret = -EFAULT; - offset = linear_page_index(dst_vma, dst_addr); max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); - if (unlikely(offset >= max_off)) + if (unlikely(pgoff >= max_off)) goto out_release; - ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false); + ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, + gfp & GFP_RECLAIM_MASK, dst_mm); if (ret) goto out_release; - ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL, - gfp & GFP_RECLAIM_MASK); + ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr, + &folio->page, true, wp_copy); if (ret) - goto out_release_uncharge; - - mem_cgroup_commit_charge(page, memcg, false, false); - - _dst_pte = mk_pte(page, dst_vma->vm_page_prot); - if (dst_vma->vm_flags & VM_WRITE) - _dst_pte = pte_mkwrite(pte_mkdirty(_dst_pte)); - else { - /* - * We don't set the pte dirty if the vma has no - * VM_WRITE permission, so mark the page dirty or it - * could be freed from under us. We could do it - * unconditionally before unlock_page(), but doing it - * only if VM_WRITE is not set is faster. 
- */ - set_page_dirty(page); - } - - dst_pte = pte_offset_map_lock(dst_mm, dst_pmd, dst_addr, &ptl); - - ret = -EFAULT; - max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); - if (unlikely(offset >= max_off)) - goto out_release_uncharge_unlock; - - ret = -EEXIST; - if (!pte_none(*dst_pte)) - goto out_release_uncharge_unlock; + goto out_delete_from_cache; - lru_cache_add_anon(page); - - spin_lock(&info->lock); + spin_lock_irq(&info->lock); info->alloced++; inode->i_blocks += BLOCKS_PER_PAGE; shmem_recalc_inode(inode); - spin_unlock(&info->lock); - - inc_mm_counter(dst_mm, mm_counter_file(page)); - page_add_file_rmap(page, false); - set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte); + spin_unlock_irq(&info->lock); - /* No need to invalidate - it was non-present before */ - update_mmu_cache(dst_vma, dst_addr, dst_pte); - pte_unmap_unlock(dst_pte, ptl); - unlock_page(page); - ret = 0; -out: - return ret; -out_release_uncharge_unlock: - pte_unmap_unlock(dst_pte, ptl); - ClearPageDirty(page); - delete_from_page_cache(page); -out_release_uncharge: - mem_cgroup_cancel_charge(page, memcg, false); + folio_unlock(folio); + return 0; +out_delete_from_cache: + filemap_remove_folio(folio); out_release: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); out_unacct_blocks: shmem_inode_unacct_blocks(inode, 1); - goto out; -} - -int shmem_mcopy_atomic_pte(struct mm_struct *dst_mm, - pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr, - unsigned long src_addr, - struct page **pagep) -{ - return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, - dst_addr, src_addr, false, pagep); -} - -int shmem_mfill_zeropage_pte(struct mm_struct *dst_mm, - pmd_t *dst_pmd, - struct vm_area_struct *dst_vma, - unsigned long dst_addr) -{ - struct page *page = NULL; - - return shmem_mfill_atomic_pte(dst_mm, dst_pmd, dst_vma, - dst_addr, 0, true, &page); + return ret; } +#endif /* CONFIG_USERFAULTFD */ #ifdef CONFIG_TMPFS static const struct inode_operations shmem_symlink_inode_operations; static const struct inode_operations shmem_short_symlink_operations; -#ifdef CONFIG_TMPFS_XATTR -static int shmem_initxattrs(struct inode *, const struct xattr *, void *); -#else -#define shmem_initxattrs NULL -#endif - static int shmem_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t index = pos >> PAGE_SHIFT; + struct folio *folio; + int ret = 0; - /* i_mutex is held by caller */ + /* i_rwsem is held by caller */ if (unlikely(info->seals & (F_SEAL_GROW | F_SEAL_WRITE | F_SEAL_FUTURE_WRITE))) { if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) @@ -2485,7 +2528,20 @@ shmem_write_begin(struct file *file, struct address_space *mapping, return -EPERM; } - return shmem_getpage(inode, index, pagep, SGP_WRITE); + ret = shmem_get_folio(inode, index, &folio, SGP_WRITE); + + if (ret) + return ret; + + *pagep = folio_file_page(folio, index); + if (PageHWPoison(*pagep)) { + folio_unlock(folio); + folio_put(folio); + *pagep = NULL; + return -EIO; + } + + return 0; } static int @@ -2531,23 +2587,15 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) struct address_space *mapping = inode->i_mapping; pgoff_t index; unsigned long offset; - enum sgp_type sgp = SGP_READ; int error = 0; ssize_t retval = 0; loff_t *ppos = &iocb->ki_pos; - /* - * Might this read be 
for a stacking filesystem? Then when reading - * holes of a sparse file, we actually need to allocate those pages, - * and even mark them dirty, so it cannot exceed the max_blocks limit. - */ - if (!iter_is_iovec(to)) - sgp = SGP_CACHE; - index = *ppos >> PAGE_SHIFT; offset = *ppos & ~PAGE_MASK; for (;;) { + struct folio *folio = NULL; struct page *page = NULL; pgoff_t end_index; unsigned long nr, ret; @@ -2562,21 +2610,26 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) break; } - error = shmem_getpage(inode, index, &page, sgp); + error = shmem_get_folio(inode, index, &folio, SGP_READ); if (error) { if (error == -EINVAL) error = 0; break; } - if (page) { - if (sgp == SGP_CACHE) - set_page_dirty(page); - unlock_page(page); + if (folio) { + folio_unlock(folio); + + page = folio_file_page(folio, index); + if (PageHWPoison(page)) { + folio_put(folio); + error = -EIO; + break; + } } /* * We must evaluate after, since reads (unlike writes) - * are called without i_mutex protection against truncate + * are called without i_rwsem protection against truncate */ nr = PAGE_SIZE; i_size = i_size_read(inode); @@ -2584,14 +2637,14 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (index == end_index) { nr = i_size & ~PAGE_MASK; if (nr <= offset) { - if (page) - put_page(page); + if (folio) + folio_put(folio); break; } } nr -= offset; - if (page) { + if (folio) { /* * If users can be writing to this page using arbitrary * virtual addresses, take care about potential aliasing @@ -2603,23 +2656,35 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) * Mark the page accessed if we read the beginning. */ if (!offset) - mark_page_accessed(page); + folio_mark_accessed(folio); + /* + * Ok, we have the page, and it's up-to-date, so + * now we can copy it to user space... + */ + ret = copy_page_to_iter(page, offset, nr, to); + folio_put(folio); + + } else if (user_backed_iter(to)) { + /* + * Copy to user tends to be so well optimized, but + * clear_user() not so much, that it is noticeably + * faster to copy the zero page instead of clearing. + */ + ret = copy_page_to_iter(ZERO_PAGE(0), offset, nr, to); } else { - page = ZERO_PAGE(0); - get_page(page); + /* + * But submitting the same page twice in a row to + * splice() - or others? - can result in confusion: + * so don't attempt that optimization on pipes etc. + */ + ret = iov_iter_zero(nr, to); } - /* - * Ok, we have the page, and it's up-to-date, so - * now we can copy it to user space... - */ - ret = copy_page_to_iter(page, offset, nr, to); retval += ret; offset += ret; index += offset >> PAGE_SHIFT; offset &= ~PAGE_MASK; - put_page(page); if (!iov_iter_count(to)) break; if (ret < nr) { @@ -2634,86 +2699,20 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) return retval ? retval : error; } -/* - * llseek SEEK_DATA or SEEK_HOLE through the page cache. 
- */ -static pgoff_t shmem_seek_hole_data(struct address_space *mapping, - pgoff_t index, pgoff_t end, int whence) -{ - struct page *page; - struct pagevec pvec; - pgoff_t indices[PAGEVEC_SIZE]; - bool done = false; - int i; - - pagevec_init(&pvec); - pvec.nr = 1; /* start small: we may be there already */ - while (!done) { - pvec.nr = find_get_entries(mapping, index, - pvec.nr, pvec.pages, indices); - if (!pvec.nr) { - if (whence == SEEK_DATA) - index = end; - break; - } - for (i = 0; i < pvec.nr; i++, index++) { - if (index < indices[i]) { - if (whence == SEEK_HOLE) { - done = true; - break; - } - index = indices[i]; - } - page = pvec.pages[i]; - if (page && !xa_is_value(page)) { - if (!PageUptodate(page)) - page = NULL; - } - if (index >= end || - (page && whence == SEEK_DATA) || - (!page && whence == SEEK_HOLE)) { - done = true; - break; - } - } - pagevec_remove_exceptionals(&pvec); - pagevec_release(&pvec); - pvec.nr = PAGEVEC_SIZE; - cond_resched(); - } - return index; -} - static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) { struct address_space *mapping = file->f_mapping; struct inode *inode = mapping->host; - pgoff_t start, end; - loff_t new_offset; if (whence != SEEK_DATA && whence != SEEK_HOLE) return generic_file_llseek_size(file, offset, whence, MAX_LFS_FILESIZE, i_size_read(inode)); - inode_lock(inode); - /* We're holding i_mutex so we can access i_size directly */ - - if (offset < 0 || offset >= inode->i_size) - offset = -ENXIO; - else { - start = offset >> PAGE_SHIFT; - end = (inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; - new_offset = shmem_seek_hole_data(mapping, start, end, whence); - new_offset <<= PAGE_SHIFT; - if (new_offset > offset) { - if (new_offset < inode->i_size) - offset = new_offset; - else if (whence == SEEK_DATA) - offset = -ENXIO; - else - offset = inode->i_size; - } - } + if (offset < 0) + return -ENXIO; + inode_lock(inode); + /* We're holding i_rwsem so we can access i_size directly */ + offset = mapping_seek_hole_data(mapping, offset, inode->i_size, whence); if (offset >= 0) offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE); inode_unlock(inode); @@ -2727,7 +2726,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_falloc shmem_falloc; - pgoff_t start, index, end; + pgoff_t start, index, end, undo_fallocend; int error; if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) @@ -2741,7 +2740,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq); - /* protected by i_mutex */ + /* protected by i_rwsem */ if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { error = -EPERM; goto out; @@ -2796,8 +2795,17 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, inode->i_private = &shmem_falloc; spin_unlock(&inode->i_lock); - for (index = start; index < end; index++) { - struct page *page; + /* + * info->fallocend is only relevant when huge pages might be + * involved: to prevent split_huge_page() freeing fallocated + * pages when FALLOC_FL_KEEP_SIZE committed beyond i_size. 
+ */ + undo_fallocend = info->fallocend; + if (info->fallocend < end) + info->fallocend = end; + + for (index = start; index < end; ) { + struct folio *folio; /* * Good, the fallocate(2) manpage permits EINTR: we may have @@ -2808,9 +2816,11 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) error = -ENOMEM; else - error = shmem_getpage(inode, index, &page, SGP_FALLOC); + error = shmem_get_folio(inode, index, &folio, + SGP_FALLOC); if (error) { - /* Remove the !PageUptodate pages we added */ + info->fallocend = undo_fallocend; + /* Remove the !uptodate folios we added */ if (index > start) { shmem_undo_range(inode, (loff_t)start << PAGE_SHIFT, @@ -2820,34 +2830,45 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, } /* + * Here is a more important optimization than it appears: + * a second SGP_FALLOC on the same large folio will clear it, + * making it uptodate and un-undoable if we fail later. + */ + index = folio_next_index(folio); + /* Beware 32-bit wraparound */ + if (!index) + index--; + + /* * Inform shmem_writepage() how far we have reached. * No need for lock or barrier: we have the page lock. */ - shmem_falloc.next++; - if (!PageUptodate(page)) - shmem_falloc.nr_falloced++; + if (!folio_test_uptodate(folio)) + shmem_falloc.nr_falloced += index - shmem_falloc.next; + shmem_falloc.next = index; /* - * If !PageUptodate, leave it that way so that freeable pages + * If !uptodate, leave it that way so that freeable folios * can be recognized if we need to rollback on error later. - * But set_page_dirty so that memory pressure will swap rather - * than free the pages we are allocating (and SGP_CACHE pages + * But mark it dirty so that memory pressure will swap rather + * than free the folios we are allocating (and SGP_CACHE folios * might still be clean: we now need to mark those dirty too). */ - set_page_dirty(page); - unlock_page(page); - put_page(page); + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); cond_resched(); } if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) i_size_write(inode, offset + len); - inode->i_ctime = current_time(inode); undone: spin_lock(&inode->i_lock); inode->i_private = NULL; spin_unlock(&inode->i_lock); out: + if (!error) + file_modified(file); inode_unlock(inode); return error; } @@ -2870,6 +2891,9 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_ffree = sbinfo->free_inodes; } /* else leave those fields 0 like simple_statfs */ + + buf->f_fsid = uuid_to_fsid(dentry->d_sb->s_uuid.b); + return 0; } @@ -2877,7 +2901,8 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) * File creation. Allocate an inode, and we're done.. 
*/ static int -shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) +shmem_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t dev) { struct inode *inode; int error = -ENOSPC; @@ -2896,6 +2921,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) error = 0; dir->i_size += BOGO_DIRENT_SIZE; dir->i_ctime = dir->i_mtime = current_time(dir); + inode_inc_iversion(dir); d_instantiate(dentry, inode); dget(dentry); /* Extra count - pin the dentry in core */ } @@ -2906,7 +2932,8 @@ out_iput: } static int -shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +shmem_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, + struct file *file, umode_t mode) { struct inode *inode; int error = -ENOSPC; @@ -2921,28 +2948,30 @@ shmem_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) error = simple_acl_create(dir, inode); if (error) goto out_iput; - d_tmpfile(dentry, inode); + d_tmpfile(file, inode); } - return error; + return finish_open_simple(file, error); out_iput: iput(inode); return error; } -static int shmem_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +static int shmem_mkdir(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode) { int error; - if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0))) + if ((error = shmem_mknod(&init_user_ns, dir, dentry, + mode | S_IFDIR, 0))) return error; inc_nlink(dir); return 0; } -static int shmem_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int shmem_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) { - return shmem_mknod(dir, dentry, mode | S_IFREG, 0); + return shmem_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0); } /* @@ -2961,13 +2990,14 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr * first link must skip that, to get the accounting right. 
*/ if (inode->i_nlink) { - ret = shmem_reserve_inode(inode->i_sb); + ret = shmem_reserve_inode(inode->i_sb, NULL); if (ret) goto out; } dir->i_size += BOGO_DIRENT_SIZE; inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); + inode_inc_iversion(dir); inc_nlink(inode); ihold(inode); /* New dentry reference */ dget(dentry); /* Extra pinning count for the created dentry */ @@ -2985,6 +3015,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry) dir->i_size -= BOGO_DIRENT_SIZE; inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); + inode_inc_iversion(dir); drop_nlink(inode); dput(dentry); /* Undo the count from "create" - this does all the work */ return 0; @@ -3000,29 +3031,8 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry) return shmem_unlink(dir, dentry); } -static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) -{ - bool old_is_dir = d_is_dir(old_dentry); - bool new_is_dir = d_is_dir(new_dentry); - - if (old_dir != new_dir && old_is_dir != new_is_dir) { - if (old_is_dir) { - drop_nlink(old_dir); - inc_nlink(new_dir); - } else { - drop_nlink(new_dir); - inc_nlink(old_dir); - } - } - old_dir->i_ctime = old_dir->i_mtime = - new_dir->i_ctime = new_dir->i_mtime = - d_inode(old_dentry)->i_ctime = - d_inode(new_dentry)->i_ctime = current_time(old_dir); - - return 0; -} - -static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry) +static int shmem_whiteout(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry) { struct dentry *whiteout; int error; @@ -3031,7 +3041,7 @@ static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry) if (!whiteout) return -ENOMEM; - error = shmem_mknod(old_dir, whiteout, + error = shmem_mknod(&init_user_ns, old_dir, whiteout, S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); dput(whiteout); if (error) @@ -3054,7 +3064,10 @@ static int shmem_whiteout(struct inode *old_dir, struct dentry *old_dentry) * it exists so that the VFS layer correctly free's it when it * gets overwritten. 
*/ -static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) +static int shmem_rename2(struct user_namespace *mnt_userns, + struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { struct inode *inode = d_inode(old_dentry); int they_are_dirs = S_ISDIR(inode->i_mode); @@ -3063,7 +3076,7 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc return -EINVAL; if (flags & RENAME_EXCHANGE) - return shmem_exchange(old_dir, old_dentry, new_dir, new_dentry); + return simple_rename_exchange(old_dir, old_dentry, new_dir, new_dentry); if (!simple_empty(new_dentry)) return -ENOTEMPTY; @@ -3071,7 +3084,7 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc if (flags & RENAME_WHITEOUT) { int error; - error = shmem_whiteout(old_dir, old_dentry); + error = shmem_whiteout(&init_user_ns, old_dir, old_dentry); if (error) return error; } @@ -3092,15 +3105,18 @@ static int shmem_rename2(struct inode *old_dir, struct dentry *old_dentry, struc old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime = new_dir->i_mtime = inode->i_ctime = current_time(old_dir); + inode_inc_iversion(old_dir); + inode_inc_iversion(new_dir); return 0; } -static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname) +static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, const char *symname) { int error; int len; struct inode *inode; - struct page *page; + struct folio *folio; len = strlen(symname) + 1; if (len > PAGE_SIZE) @@ -3113,12 +3129,9 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s error = security_inode_init_security(inode, dir, &dentry->d_name, shmem_initxattrs, NULL); - if (error) { - if (error != -EOPNOTSUPP) { - iput(inode); - return error; - } - error = 0; + if (error && error != -EOPNOTSUPP) { + iput(inode); + return error; } inode->i_size = len-1; @@ -3131,21 +3144,22 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s inode->i_op = &shmem_short_symlink_operations; } else { inode_nohighmem(inode); - error = shmem_getpage(inode, 0, &page, SGP_WRITE); + error = shmem_get_folio(inode, 0, &folio, SGP_WRITE); if (error) { iput(inode); return error; } inode->i_mapping->a_ops = &shmem_aops; inode->i_op = &shmem_symlink_inode_operations; - memcpy(page_address(page), symname, len); - SetPageUptodate(page); - set_page_dirty(page); - unlock_page(page); - put_page(page); + memcpy(folio_address(folio), symname, len); + folio_mark_uptodate(folio); + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); } dir->i_size += BOGO_DIRENT_SIZE; dir->i_ctime = dir->i_mtime = current_time(dir); + inode_inc_iversion(dir); d_instantiate(dentry, inode); dget(dentry); return 0; @@ -3153,35 +3167,74 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s static void shmem_put_link(void *arg) { - mark_page_accessed(arg); - put_page(arg); + folio_mark_accessed(arg); + folio_put(arg); } static const char *shmem_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { - struct page *page = NULL; + struct folio *folio = NULL; int error; + if (!dentry) { - page = find_get_page(inode->i_mapping, 0); - if (!page) + folio = filemap_get_folio(inode->i_mapping, 0); + if (!folio) return ERR_PTR(-ECHILD); - if (!PageUptodate(page)) { - 
put_page(page); + if (PageHWPoison(folio_page(folio, 0)) || + !folio_test_uptodate(folio)) { + folio_put(folio); return ERR_PTR(-ECHILD); } } else { - error = shmem_getpage(inode, 0, &page, SGP_READ); + error = shmem_get_folio(inode, 0, &folio, SGP_READ); if (error) return ERR_PTR(error); - unlock_page(page); + if (!folio) + return ERR_PTR(-ECHILD); + if (PageHWPoison(folio_page(folio, 0))) { + folio_unlock(folio); + folio_put(folio); + return ERR_PTR(-ECHILD); + } + folio_unlock(folio); } - set_delayed_call(done, shmem_put_link, page); - return page_address(page); + set_delayed_call(done, shmem_put_link, folio); + return folio_address(folio); } #ifdef CONFIG_TMPFS_XATTR + +static int shmem_fileattr_get(struct dentry *dentry, struct fileattr *fa) +{ + struct shmem_inode_info *info = SHMEM_I(d_inode(dentry)); + + fileattr_fill_flags(fa, info->fsflags & SHMEM_FL_USER_VISIBLE); + + return 0; +} + +static int shmem_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa) +{ + struct inode *inode = d_inode(dentry); + struct shmem_inode_info *info = SHMEM_I(inode); + + if (fileattr_has_fsx(fa)) + return -EOPNOTSUPP; + if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE) + return -EOPNOTSUPP; + + info->fsflags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) | + (fa->flags & SHMEM_FL_USER_MODIFIABLE); + + shmem_set_inode_flags(inode, info->fsflags); + inode->i_ctime = current_time(inode); + inode_inc_iversion(inode); + return 0; +} + /* * Superblocks without xattr inode operations may get some security.* xattr * support from the LSM "for free". As soon as we have any other xattrs @@ -3210,7 +3263,7 @@ static int shmem_initxattrs(struct inode *inode, new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len, GFP_KERNEL); if (!new_xattr->name) { - kfree(new_xattr); + kvfree(new_xattr); return -ENOMEM; } @@ -3236,14 +3289,21 @@ static int shmem_xattr_handler_get(const struct xattr_handler *handler, } static int shmem_xattr_handler_set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, struct dentry *unused, struct inode *inode, const char *name, const void *value, size_t size, int flags) { struct shmem_inode_info *info = SHMEM_I(inode); + int err; name = xattr_full_name(handler, name); - return simple_xattr_set(&info->xattrs, name, value, size, flags); + err = simple_xattr_set(&info->xattrs, name, value, size, flags, NULL); + if (!err) { + inode->i_ctime = current_time(inode); + inode_inc_iversion(inode); + } + return err; } static const struct xattr_handler shmem_security_xattr_handler = { @@ -3276,6 +3336,7 @@ static ssize_t shmem_listxattr(struct dentry *dentry, char *buffer, size_t size) #endif /* CONFIG_TMPFS_XATTR */ static const struct inode_operations shmem_short_symlink_operations = { + .getattr = shmem_getattr, .get_link = simple_get_link, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, @@ -3283,6 +3344,7 @@ static const struct inode_operations shmem_short_symlink_operations = { }; static const struct inode_operations shmem_symlink_inode_operations = { + .getattr = shmem_getattr, .get_link = shmem_get_link, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, @@ -3379,6 +3441,8 @@ enum shmem_param { Opt_nr_inodes, Opt_size, Opt_uid, + Opt_inode32, + Opt_inode64, }; static const struct constant_table shmem_param_enums_huge[] = { @@ -3398,6 +3462,8 @@ const struct fs_parameter_spec shmem_fs_parameters[] = { fsparam_string("nr_inodes", Opt_nr_inodes), fsparam_string("size", Opt_size), fsparam_u32 ("uid", Opt_uid), + fsparam_flag 
("inode32", Opt_inode32), + fsparam_flag ("inode64", Opt_inode64), {} }; @@ -3429,7 +3495,7 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) break; case Opt_nr_blocks: ctx->blocks = memparse(param->string, &rest); - if (*rest) + if (*rest || ctx->blocks > S64_MAX) goto bad_value; ctx->seen |= SHMEM_SEEN_BLOCKS; break; @@ -3455,7 +3521,7 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) case Opt_huge: ctx->huge = result.uint_32; if (ctx->huge != SHMEM_HUGE_NEVER && - !(IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && + !(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && has_transparent_hugepage())) goto unsupported_parameter; ctx->seen |= SHMEM_SEEN_HUGE; @@ -3469,6 +3535,18 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) break; } goto unsupported_parameter; + case Opt_inode32: + ctx->full_inums = false; + ctx->seen |= SHMEM_SEEN_INUMS; + break; + case Opt_inode64: + if (sizeof(ino_t) < 8) { + return invalfc(fc, + "Cannot use inode64 with <64bit inums in kernel\n"); + } + ctx->full_inums = true; + ctx->seen |= SHMEM_SEEN_INUMS; + break; } return 0; @@ -3506,7 +3584,7 @@ static int shmem_parse_options(struct fs_context *fc, void *data) } } if (*this_char) { - char *value = strchr(this_char,'='); + char *value = strchr(this_char, '='); size_t len = 0; int err; @@ -3534,10 +3612,12 @@ static int shmem_reconfigure(struct fs_context *fc) struct shmem_options *ctx = fc->fs_private; struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb); unsigned long inodes; + struct mempolicy *mpol = NULL; const char *err; - spin_lock(&sbinfo->stat_lock); + raw_spin_lock(&sbinfo->stat_lock); inodes = sbinfo->max_inodes - sbinfo->free_inodes; + if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) { if (!sbinfo->max_blocks) { err = "Cannot retroactively limit size"; @@ -3560,8 +3640,16 @@ static int shmem_reconfigure(struct fs_context *fc) } } + if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums && + sbinfo->next_ino > UINT_MAX) { + err = "Current inum too high to switch to 32-bit inums"; + goto out; + } + if (ctx->seen & SHMEM_SEEN_HUGE) sbinfo->huge = ctx->huge; + if (ctx->seen & SHMEM_SEEN_INUMS) + sbinfo->full_inums = ctx->full_inums; if (ctx->seen & SHMEM_SEEN_BLOCKS) sbinfo->max_blocks = ctx->blocks; if (ctx->seen & SHMEM_SEEN_INODES) { @@ -3573,14 +3661,15 @@ static int shmem_reconfigure(struct fs_context *fc) * Preserve previous mempolicy unless mpol remount option was specified. */ if (ctx->mpol) { - mpol_put(sbinfo->mpol); + mpol = sbinfo->mpol; sbinfo->mpol = ctx->mpol; /* transfers initial ref */ ctx->mpol = NULL; } - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); + mpol_put(mpol); return 0; out: - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); return invalfc(fc, "%s", err); } @@ -3601,7 +3690,30 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID)) seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, sbinfo->gid)); -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE + + /* + * Showing inode{64,32} might be useful even if it's the system default, + * since then people don't have to resort to checking both here and + * /proc/config.gz to confirm 64-bit inums were successfully applied + * (which may not even exist if IKCONFIG_PROC isn't enabled). 
+ * + * We hide it when inode64 isn't the default and we are using 32-bit + * inodes, since that probably just means the feature isn't even under + * consideration. + * + * As such: + * + * +-----------------+-----------------+ + * | TMPFS_INODE64=y | TMPFS_INODE64=n | + * +------------------+-----------------+-----------------+ + * | full_inums=true | show | show | + * | full_inums=false | show | hide | + * +------------------+-----------------+-----------------+ + * + */ + if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums) + seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32)); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE /* Rightly or wrongly, show huge mount option unmasked by shmem_huge */ if (sbinfo->huge) seq_printf(seq, ",huge=%s", shmem_format_huge(sbinfo->huge)); @@ -3616,6 +3728,7 @@ static void shmem_put_super(struct super_block *sb) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + free_percpu(sbinfo->ino_batch); percpu_counter_destroy(&sbinfo->used_blocks); mpol_put(sbinfo->mpol); kfree(sbinfo); @@ -3627,7 +3740,6 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) struct shmem_options *ctx = fc->fs_private; struct inode *inode; struct shmem_sb_info *sbinfo; - int err = -ENOMEM; /* Round up to L1_CACHE_BYTES to resist false sharing */ sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info), @@ -3648,24 +3760,32 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) ctx->blocks = shmem_default_max_blocks(); if (!(ctx->seen & SHMEM_SEEN_INODES)) ctx->inodes = shmem_default_max_inodes(); + if (!(ctx->seen & SHMEM_SEEN_INUMS)) + ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64); } else { sb->s_flags |= SB_NOUSER; } sb->s_export_op = &shmem_export_ops; - sb->s_flags |= SB_NOSEC; + sb->s_flags |= SB_NOSEC | SB_I_VERSION; #else sb->s_flags |= SB_NOUSER; #endif sbinfo->max_blocks = ctx->blocks; sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes; + if (sb->s_flags & SB_KERNMOUNT) { + sbinfo->ino_batch = alloc_percpu(ino_t); + if (!sbinfo->ino_batch) + goto failed; + } sbinfo->uid = ctx->uid; sbinfo->gid = ctx->gid; + sbinfo->full_inums = ctx->full_inums; sbinfo->mode = ctx->mode; sbinfo->huge = ctx->huge; sbinfo->mpol = ctx->mpol; ctx->mpol = NULL; - spin_lock_init(&sbinfo->stat_lock); + raw_spin_lock_init(&sbinfo->stat_lock); if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) goto failed; spin_lock_init(&sbinfo->shrinklist_lock); @@ -3697,7 +3817,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) failed: shmem_put_super(sb); - return err; + return -ENOMEM; } static int shmem_get_tree(struct fs_context *fc) @@ -3730,7 +3850,7 @@ static struct kmem_cache *shmem_inode_cachep; static struct inode *shmem_alloc_inode(struct super_block *sb) { struct shmem_inode_info *info; - info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL); + info = alloc_inode_sb(sb, shmem_inode_cachep, GFP_KERNEL); if (!info) return NULL; return &info->vfs_inode; @@ -3767,18 +3887,26 @@ static void shmem_destroy_inodecache(void) kmem_cache_destroy(shmem_inode_cachep); } -static const struct address_space_operations shmem_aops = { +/* Keep the page in page cache instead of truncating it */ +static int shmem_error_remove_page(struct address_space *mapping, + struct page *page) +{ + return 0; +} + +const struct address_space_operations shmem_aops = { .writepage = shmem_writepage, - .set_page_dirty = __set_page_dirty_no_writeback, + .dirty_folio = noop_dirty_folio, #ifdef CONFIG_TMPFS .write_begin = shmem_write_begin, 
.write_end = shmem_write_end, #endif #ifdef CONFIG_MIGRATION - .migratepage = migrate_page, + .migrate_folio = migrate_folio, #endif - .error_remove_page = generic_error_remove_page, + .error_remove_page = shmem_error_remove_page, }; +EXPORT_SYMBOL(shmem_aops); static const struct file_operations shmem_file_operations = { .mmap = shmem_mmap, @@ -3800,11 +3928,14 @@ static const struct inode_operations shmem_inode_operations = { #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, .set_acl = simple_set_acl, + .fileattr_get = shmem_fileattr_get, + .fileattr_set = shmem_fileattr_set, #endif }; static const struct inode_operations shmem_dir_inode_operations = { #ifdef CONFIG_TMPFS + .getattr = shmem_getattr, .create = shmem_create, .lookup = simple_lookup, .link = shmem_link, @@ -3818,6 +3949,8 @@ static const struct inode_operations shmem_dir_inode_operations = { #endif #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, + .fileattr_get = shmem_fileattr_get, + .fileattr_set = shmem_fileattr_set, #endif #ifdef CONFIG_TMPFS_POSIX_ACL .setattr = shmem_setattr, @@ -3826,6 +3959,7 @@ static const struct inode_operations shmem_dir_inode_operations = { }; static const struct inode_operations shmem_special_inode_operations = { + .getattr = shmem_getattr, #ifdef CONFIG_TMPFS_XATTR .listxattr = shmem_listxattr, #endif @@ -3846,7 +3980,7 @@ static const struct super_operations shmem_ops = { .evict_inode = shmem_evict_inode, .drop_inode = generic_delete_inode, .put_super = shmem_put_super, -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE +#ifdef CONFIG_TRANSPARENT_HUGEPAGE .nr_cached_objects = shmem_unused_huge_count, .free_cached_objects = shmem_unused_huge_scan, #endif @@ -3889,7 +4023,7 @@ static struct file_system_type shmem_fs_type = { .fs_flags = FS_USERNS_MOUNT, }; -int __init shmem_init(void) +void __init shmem_init(void) { int error; @@ -3908,25 +4042,24 @@ int __init shmem_init(void) goto out1; } -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE +#ifdef CONFIG_TRANSPARENT_HUGEPAGE if (has_transparent_hugepage() && shmem_huge > SHMEM_HUGE_DENY) SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; else - shmem_huge = 0; /* just in case it was patched */ + shmem_huge = SHMEM_HUGE_NEVER; /* just in case it was patched */ #endif - return 0; + return; out1: unregister_filesystem(&shmem_fs_type); out2: shmem_destroy_inodecache(); shm_mnt = ERR_PTR(error); - return error; } -#if defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && defined(CONFIG_SYSFS) +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS) static ssize_t shmem_enabled_show(struct kobject *kobj, - struct kobj_attribute *attr, char *buf) + struct kobj_attribute *attr, char *buf) { static const int values[] = { SHMEM_HUGE_ALWAYS, @@ -3936,16 +4069,19 @@ static ssize_t shmem_enabled_show(struct kobject *kobj, SHMEM_HUGE_DENY, SHMEM_HUGE_FORCE, }; - int i, count; - - for (i = 0, count = 0; i < ARRAY_SIZE(values); i++) { - const char *fmt = shmem_huge == values[i] ? "[%s] " : "%s "; + int len = 0; + int i; - count += sprintf(buf + count, fmt, - shmem_format_huge(values[i])); + for (i = 0; i < ARRAY_SIZE(values); i++) { + len += sysfs_emit_at(buf, len, + shmem_huge == values[i] ? "%s[%s]" : "%s%s", + i ? 
" " : "", + shmem_format_huge(values[i])); } - buf[count - 1] = '\n'; - return count; + + len += sysfs_emit_at(buf, len, "\n"); + + return len; } static ssize_t shmem_enabled_store(struct kobject *kobj, @@ -3974,46 +4110,8 @@ static ssize_t shmem_enabled_store(struct kobject *kobj, return count; } -struct kobj_attribute shmem_enabled_attr = - __ATTR(shmem_enabled, 0644, shmem_enabled_show, shmem_enabled_store); -#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE && CONFIG_SYSFS */ - -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE -bool shmem_huge_enabled(struct vm_area_struct *vma) -{ - struct inode *inode = file_inode(vma->vm_file); - struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); - loff_t i_size; - pgoff_t off; - - if ((vma->vm_flags & VM_NOHUGEPAGE) || - test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) - return false; - if (shmem_huge == SHMEM_HUGE_FORCE) - return true; - if (shmem_huge == SHMEM_HUGE_DENY) - return false; - switch (sbinfo->huge) { - case SHMEM_HUGE_NEVER: - return false; - case SHMEM_HUGE_ALWAYS: - return true; - case SHMEM_HUGE_WITHIN_SIZE: - off = round_up(vma->vm_pgoff, HPAGE_PMD_NR); - i_size = round_up(i_size_read(inode), PAGE_SIZE); - if (i_size >= HPAGE_PMD_SIZE && - i_size >> PAGE_SHIFT >= off) - return true; - /* fall through */ - case SHMEM_HUGE_ADVISE: - /* TODO: implement fadvise() hints */ - return (vma->vm_flags & VM_HUGEPAGE); - default: - VM_BUG_ON(1); - return false; - } -} -#endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */ +struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */ #else /* !CONFIG_SHMEM */ @@ -4034,23 +4132,20 @@ static struct file_system_type shmem_fs_type = { .fs_flags = FS_USERNS_MOUNT, }; -int __init shmem_init(void) +void __init shmem_init(void) { BUG_ON(register_filesystem(&shmem_fs_type) != 0); shm_mnt = kern_mount(&shmem_fs_type); BUG_ON(IS_ERR(shm_mnt)); - - return 0; } -int shmem_unuse(unsigned int type, bool frontswap, - unsigned long *fs_pages_to_unuse) +int shmem_unuse(unsigned int type) { return 0; } -int shmem_lock(struct file *file, int lock, struct user_struct *user) +int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) { return 0; } @@ -4160,7 +4255,7 @@ EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt); /** * shmem_zero_setup - setup a shared anonymous mapping - * @vma: the vma to be mmapped is prepared by do_mmap_pgoff + * @vma: the vma to be mmapped is prepared by do_mmap */ int shmem_zero_setup(struct vm_area_struct *vma) { @@ -4168,7 +4263,7 @@ int shmem_zero_setup(struct vm_area_struct *vma) loff_t size = vma->vm_end - vma->vm_start; /* - * Cloning a new file under mmap_sem leads to a lock ordering conflict + * Cloning a new file under mmap_lock leads to a lock ordering conflict * between XFS directory reading and selinux: since this file is only * accessible to the user through its mapping, use S_PRIVATE flag to * bypass file security, in the same way as shmem_kernel_file_setup(). @@ -4182,12 +4277,6 @@ int shmem_zero_setup(struct vm_area_struct *vma) vma->vm_file = file; vma->vm_ops = &shmem_vm_ops; - if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && - ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) < - (vma->vm_end & HPAGE_PMD_MASK)) { - khugepaged_enter(vma, vma->vm_flags); - } - return 0; } @@ -4199,7 +4288,7 @@ int shmem_zero_setup(struct vm_area_struct *vma) * * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)", * with any new page allocations done using the specified allocation flags. 
- * But read_cache_page_gfp() uses the ->readpage() method: which does not + * But read_cache_page_gfp() uses the ->read_folio() method: which does not * suit tmpfs, since it may have pages in swapcache, and needs to find those * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. * @@ -4211,16 +4300,23 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, { #ifdef CONFIG_SHMEM struct inode *inode = mapping->host; + struct folio *folio; struct page *page; int error; - BUG_ON(mapping->a_ops != &shmem_aops); - error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, + BUG_ON(!shmem_mapping(mapping)); + error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE, gfp, NULL, NULL, NULL); if (error) - page = ERR_PTR(error); - else - unlock_page(page); + return ERR_PTR(error); + + folio_unlock(folio); + page = folio_file_page(folio, index); + if (PageHWPoison(page)) { + folio_put(folio); + return ERR_PTR(-EIO); + } + return page; #else /* |