aboutsummaryrefslogtreecommitdiffstats
path: root/fs/hugetlbfs/inode.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/hugetlbfs/inode.c')
-rw-r--r--fs/hugetlbfs/inode.c733
1 files changed, 460 insertions, 273 deletions
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index aff8642f0c2e..df7772335dc0 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -11,7 +11,6 @@
#include <linux/thread_info.h>
#include <asm/current.h>
-#include <linux/sched/signal.h> /* remove ASAP */
#include <linux/falloc.h>
#include <linux/fs.h>
#include <linux/mount.h>
@@ -38,8 +37,8 @@
#include <linux/uio.h>
#include <linux/uaccess.h>
+#include <linux/sched/mm.h>
-static const struct super_operations hugetlbfs_ops;
static const struct address_space_operations hugetlbfs_aops;
const struct file_operations hugetlbfs_file_operations;
static const struct inode_operations hugetlbfs_dir_inode_operations;
@@ -76,7 +75,7 @@ enum hugetlb_param {
static const struct fs_parameter_spec hugetlb_fs_parameters[] = {
fsparam_u32 ("gid", Opt_gid),
fsparam_string("min_size", Opt_min_size),
- fsparam_u32 ("mode", Opt_mode),
+ fsparam_u32oct("mode", Opt_mode),
fsparam_string("nr_inodes", Opt_nr_inodes),
fsparam_string("pagesize", Opt_pagesize),
fsparam_string("size", Opt_size),
@@ -107,16 +106,6 @@ static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma)
}
#endif
-static void huge_pagevec_release(struct pagevec *pvec)
-{
- int i;
-
- for (i = 0; i < pagevec_count(pvec); ++i)
- put_page(pvec->pages[i]);
-
- pagevec_reinit(pvec);
-}
-
/*
* Mask used when checking the page offset value passed in via system
* calls. This value will be converted to a loff_t which is signed.
@@ -130,6 +119,7 @@ static void huge_pagevec_release(struct pagevec *pvec)
static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct inode *inode = file_inode(file);
+ struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
loff_t len, vma_len;
int ret;
struct hstate *h = hstate_file(file);
@@ -139,12 +129,16 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
* already been checked by prepare_hugepage_range. If you add
* any error returns here, do so after setting VM_HUGETLB, so
* is_vm_hugetlb_page tests below unmap_region go the right
- * way when do_mmap_pgoff unwinds (may be important on powerpc
+ * way when do_mmap unwinds (may be important on powerpc
* and ia64).
*/
vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND;
vma->vm_ops = &hugetlb_vm_ops;
+ ret = seal_check_future_write(info->seals, vma);
+ if (ret)
+ return ret;
+
/*
* page based offset in vm_pgoff could be sufficiently large to
* overflow a loff_t when converted to byte offset. This can
@@ -170,7 +164,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
file_accessed(file);
ret = -ENOMEM;
- if (hugetlb_reserve_pages(inode,
+ if (!hugetlb_reserve_pages(inode,
vma->vm_pgoff >> huge_page_order(h),
len >> huge_page_shift(h), vma,
vma->vm_flags))
@@ -186,18 +180,66 @@ out:
}
/*
- * Called under down_write(mmap_sem).
+ * Called under mmap_write_lock(mm).
*/
-#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
static unsigned long
-hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags)
{
+ struct hstate *h = hstate_file(file);
+ struct vm_unmapped_area_info info;
+
+ info.flags = 0;
+ info.length = len;
+ info.low_limit = current->mm->mmap_base;
+ info.high_limit = arch_get_mmap_end(addr, len, flags);
+ info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+ info.align_offset = 0;
+ return vm_unmapped_area(&info);
+}
+
+static unsigned long
+hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+ struct hstate *h = hstate_file(file);
+ struct vm_unmapped_area_info info;
+
+ info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+ info.length = len;
+ info.low_limit = max(PAGE_SIZE, mmap_min_addr);
+ info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
+ info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+ info.align_offset = 0;
+ addr = vm_unmapped_area(&info);
+
+ /*
+ * A failed mmap() very likely causes application failure,
+ * so fall back to the bottom-up function here. This scenario
+ * can happen with large stack limits and large mmap()
+ * allocations.
+ */
+ if (unlikely(offset_in_page(addr))) {
+ VM_BUG_ON(addr != -ENOMEM);
+ info.flags = 0;
+ info.low_limit = current->mm->mmap_base;
+ info.high_limit = arch_get_mmap_end(addr, len, flags);
+ addr = vm_unmapped_area(&info);
+ }
+
+ return addr;
+}
+
+unsigned long
+generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
+{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
struct hstate *h = hstate_file(file);
- struct vm_unmapped_area_info info;
+ const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags);
if (len & ~huge_page_mask(h))
return -EINVAL;
@@ -213,54 +255,36 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
if (addr) {
addr = ALIGN(addr, huge_page_size(h));
vma = find_vma(mm, addr);
- if (TASK_SIZE - len >= addr &&
+ if (mmap_end - len >= addr &&
(!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
- info.flags = 0;
- info.length = len;
- info.low_limit = TASK_UNMAPPED_BASE;
- info.high_limit = TASK_SIZE;
- info.align_mask = PAGE_MASK & ~huge_page_mask(h);
- info.align_offset = 0;
- return vm_unmapped_area(&info);
+ /*
+ * Use mm->get_unmapped_area value as a hint to use topdown routine.
+ * If architectures have special needs, they should define their own
+ * version of hugetlb_get_unmapped_area.
+ */
+ if (mm->get_unmapped_area == arch_get_unmapped_area_topdown)
+ return hugetlb_get_unmapped_area_topdown(file, addr, len,
+ pgoff, flags);
+ return hugetlb_get_unmapped_area_bottomup(file, addr, len,
+ pgoff, flags);
}
-#endif
-static size_t
-hugetlbfs_read_actor(struct page *page, unsigned long offset,
- struct iov_iter *to, unsigned long size)
+#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA
+static unsigned long
+hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
+ unsigned long len, unsigned long pgoff,
+ unsigned long flags)
{
- size_t copied = 0;
- int i, chunksize;
-
- /* Find which 4k chunk and offset with in that chunk */
- i = offset >> PAGE_SHIFT;
- offset = offset & ~PAGE_MASK;
-
- while (size) {
- size_t n;
- chunksize = PAGE_SIZE;
- if (offset)
- chunksize -= offset;
- if (chunksize > size)
- chunksize = size;
- n = copy_page_to_iter(&page[i], offset, chunksize, to);
- copied += n;
- if (n != chunksize)
- return copied;
- offset = 0;
- size -= chunksize;
- i++;
- }
- return copied;
+ return generic_hugetlb_get_unmapped_area(file, addr, len, pgoff, flags);
}
+#endif
/*
* Support for read() - Find the page attached to f_mapping and copy out the
- * data. Its *very* similar to do_generic_mapping_read(), we can't use that
- * since it has PAGE_SIZE assumptions.
+ * data. This provides functionality similar to filemap_read().
*/
static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
@@ -304,10 +328,16 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
} else {
unlock_page(page);
+ if (PageHWPoison(page)) {
+ put_page(page);
+ retval = -EIO;
+ break;
+ }
+
/*
* We have the page, copy it to user space buffer.
*/
- copied = hugetlbfs_read_actor(page, offset, to, nr);
+ copied = copy_page_to_iter(page, offset, nr, to);
put_page(page);
}
offset += copied;
@@ -326,7 +356,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to)
static int hugetlbfs_write_begin(struct file *file,
struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
return -EINVAL;
@@ -340,49 +370,227 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping,
return -EINVAL;
}
-static void remove_huge_page(struct page *page)
+static void hugetlb_delete_from_page_cache(struct page *page)
{
ClearPageDirty(page);
ClearPageUptodate(page);
delete_from_page_cache(page);
}
+/*
+ * Called with i_mmap_rwsem held for inode based vma maps. This makes
+ * sure vma (and vm_mm) will not go away. We also hold the hugetlb fault
+ * mutex for the page in the mapping. So, we can not race with page being
+ * faulted into the vma.
+ */
+static bool hugetlb_vma_maps_page(struct vm_area_struct *vma,
+ unsigned long addr, struct page *page)
+{
+ pte_t *ptep, pte;
+
+ ptep = huge_pte_offset(vma->vm_mm, addr,
+ huge_page_size(hstate_vma(vma)));
+
+ if (!ptep)
+ return false;
+
+ pte = huge_ptep_get(ptep);
+ if (huge_pte_none(pte) || !pte_present(pte))
+ return false;
+
+ if (pte_page(pte) == page)
+ return true;
+
+ return false;
+}
+
+/*
+ * Can vma_offset_start/vma_offset_end overflow on 32-bit arches?
+ * No, because the interval tree returns us only those vmas
+ * which overlap the truncated area starting at pgoff,
+ * and no vma on a 32-bit arch can span beyond the 4GB.
+ */
+static unsigned long vma_offset_start(struct vm_area_struct *vma, pgoff_t start)
+{
+ if (vma->vm_pgoff < start)
+ return (start - vma->vm_pgoff) << PAGE_SHIFT;
+ else
+ return 0;
+}
+
+static unsigned long vma_offset_end(struct vm_area_struct *vma, pgoff_t end)
+{
+ unsigned long t_end;
+
+ if (!end)
+ return vma->vm_end;
+
+ t_end = ((end - vma->vm_pgoff) << PAGE_SHIFT) + vma->vm_start;
+ if (t_end > vma->vm_end)
+ t_end = vma->vm_end;
+ return t_end;
+}
+
+/*
+ * Called with hugetlb fault mutex held. Therefore, no more mappings to
+ * this folio can be created while executing the routine.
+ */
+static void hugetlb_unmap_file_folio(struct hstate *h,
+ struct address_space *mapping,
+ struct folio *folio, pgoff_t index)
+{
+ struct rb_root_cached *root = &mapping->i_mmap;
+ struct hugetlb_vma_lock *vma_lock;
+ struct page *page = &folio->page;
+ struct vm_area_struct *vma;
+ unsigned long v_start;
+ unsigned long v_end;
+ pgoff_t start, end;
+
+ start = index * pages_per_huge_page(h);
+ end = (index + 1) * pages_per_huge_page(h);
+
+ i_mmap_lock_write(mapping);
+retry:
+ vma_lock = NULL;
+ vma_interval_tree_foreach(vma, root, start, end - 1) {
+ v_start = vma_offset_start(vma, start);
+ v_end = vma_offset_end(vma, end);
+
+ if (!hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page))
+ continue;
+
+ if (!hugetlb_vma_trylock_write(vma)) {
+ vma_lock = vma->vm_private_data;
+ /*
+ * If we can not get vma lock, we need to drop
+ * immap_sema and take locks in order. First,
+ * take a ref on the vma_lock structure so that
+ * we can be guaranteed it will not go away when
+ * dropping immap_sema.
+ */
+ kref_get(&vma_lock->refs);
+ break;
+ }
+
+ unmap_hugepage_range(vma, vma->vm_start + v_start, v_end,
+ NULL, ZAP_FLAG_DROP_MARKER);
+ hugetlb_vma_unlock_write(vma);
+ }
+
+ i_mmap_unlock_write(mapping);
+
+ if (vma_lock) {
+ /*
+ * Wait on vma_lock. We know it is still valid as we have
+ * a reference. We must 'open code' vma locking as we do
+ * not know if vma_lock is still attached to vma.
+ */
+ down_write(&vma_lock->rw_sema);
+ i_mmap_lock_write(mapping);
+
+ vma = vma_lock->vma;
+ if (!vma) {
+ /*
+ * If lock is no longer attached to vma, then just
+ * unlock, drop our reference and retry looking for
+ * other vmas.
+ */
+ up_write(&vma_lock->rw_sema);
+ kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
+ goto retry;
+ }
+
+ /*
+ * vma_lock is still attached to vma. Check to see if vma
+ * still maps page and if so, unmap.
+ */
+ v_start = vma_offset_start(vma, start);
+ v_end = vma_offset_end(vma, end);
+ if (hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page))
+ unmap_hugepage_range(vma, vma->vm_start + v_start,
+ v_end, NULL,
+ ZAP_FLAG_DROP_MARKER);
+
+ kref_put(&vma_lock->refs, hugetlb_vma_lock_release);
+ hugetlb_vma_unlock_write(vma);
+
+ goto retry;
+ }
+}
+
static void
-hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
+hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end,
+ zap_flags_t zap_flags)
{
struct vm_area_struct *vma;
/*
- * end == 0 indicates that the entire range after
- * start should be unmapped.
+ * end == 0 indicates that the entire range after start should be
+ * unmapped. Note, end is exclusive, whereas the interval tree takes
+ * an inclusive "last".
*/
- vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
- unsigned long v_offset;
+ vma_interval_tree_foreach(vma, root, start, end ? end - 1 : ULONG_MAX) {
+ unsigned long v_start;
unsigned long v_end;
+ if (!hugetlb_vma_trylock_write(vma))
+ continue;
+
+ v_start = vma_offset_start(vma, start);
+ v_end = vma_offset_end(vma, end);
+
+ unmap_hugepage_range(vma, vma->vm_start + v_start, v_end,
+ NULL, zap_flags);
+
/*
- * Can the expression below overflow on 32-bit arches?
- * No, because the interval tree returns us only those vmas
- * which overlap the truncated area starting at pgoff,
- * and no vma on a 32-bit arch can span beyond the 4GB.
+ * Note that vma lock only exists for shared/non-private
+ * vmas. Therefore, lock is not held when calling
+ * unmap_hugepage_range for private vmas.
*/
- if (vma->vm_pgoff < start)
- v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
- else
- v_offset = 0;
-
- if (!end)
- v_end = vma->vm_end;
- else {
- v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT)
- + vma->vm_start;
- if (v_end > vma->vm_end)
- v_end = vma->vm_end;
- }
+ hugetlb_vma_unlock_write(vma);
+ }
+}
- unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end,
- NULL);
+/*
+ * Called with hugetlb fault mutex held.
+ * Returns true if page was actually removed, false otherwise.
+ */
+static bool remove_inode_single_folio(struct hstate *h, struct inode *inode,
+ struct address_space *mapping,
+ struct folio *folio, pgoff_t index,
+ bool truncate_op)
+{
+ bool ret = false;
+
+ /*
+ * If folio is mapped, it was faulted in after being
+ * unmapped in caller. Unmap (again) while holding
+ * the fault mutex. The mutex will prevent faults
+ * until we finish removing the folio.
+ */
+ if (unlikely(folio_mapped(folio)))
+ hugetlb_unmap_file_folio(h, mapping, folio, index);
+
+ folio_lock(folio);
+ /*
+ * We must remove the folio from page cache before removing
+ * the region/ reserve map (hugetlb_unreserve_pages). In
+ * rare out of memory conditions, removal of the region/reserve
+ * map could fail. Correspondingly, the subpool and global
+ * reserve usage count can need to be adjusted.
+ */
+ VM_BUG_ON(HPageRestoreReserve(&folio->page));
+ hugetlb_delete_from_page_cache(&folio->page);
+ ret = true;
+ if (!truncate_op) {
+ if (unlikely(hugetlb_unreserve_pages(inode, index,
+ index + 1, 1)))
+ hugetlb_fix_reserve_counts(inode);
}
+
+ folio_unlock(folio);
+ return ret;
}
/*
@@ -391,16 +599,15 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
*
* truncation is indicated by end of range being LLONG_MAX
* In this case, we first scan the range and release found pages.
- * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
- * maps and global counts. Page faults can not race with truncation
- * in this routine. hugetlb_no_page() prevents page faults in the
- * truncated range. It checks i_size before allocation, and again after
- * with the page table lock for the page held. The same lock must be
- * acquired to unmap a page.
+ * After releasing pages, hugetlb_unreserve_pages cleans up region/reserve
+ * maps and global counts. Page faults can race with truncation.
+ * During faults, hugetlb_no_page() checks i_size before page allocation,
+ * and again after obtaining page table lock. It will 'back out'
+ * allocations in the truncated range.
* hole punch is indicated if end is not LLONG_MAX
* In the hole punch case we scan the range and release found pages.
- * Only when releasing a page is the associated region/reserv map
- * deleted. The region/reserv map for ranges without associated
+ * Only when releasing a page is the associated region/reserve map
+ * deleted. The region/reserve map for ranges without associated
* pages are not modified. Page faults can race with hole punch.
* This is indicated if we find a mapped page.
* Note: If the passed end of range value is beyond the end of file, but
@@ -413,73 +620,32 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
struct address_space *mapping = &inode->i_data;
const pgoff_t start = lstart >> huge_page_shift(h);
const pgoff_t end = lend >> huge_page_shift(h);
- struct vm_area_struct pseudo_vma;
- struct pagevec pvec;
+ struct folio_batch fbatch;
pgoff_t next, index;
int i, freed = 0;
bool truncate_op = (lend == LLONG_MAX);
- vma_init(&pseudo_vma, current->mm);
- pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED);
- pagevec_init(&pvec);
+ folio_batch_init(&fbatch);
next = start;
- while (next < end) {
- /*
- * When no more pages are found, we are done.
- */
- if (!pagevec_lookup_range(&pvec, mapping, &next, end - 1))
- break;
+ while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) {
+ for (i = 0; i < folio_batch_count(&fbatch); ++i) {
+ struct folio *folio = fbatch.folios[i];
+ u32 hash = 0;
- for (i = 0; i < pagevec_count(&pvec); ++i) {
- struct page *page = pvec.pages[i];
- u32 hash;
-
- index = page->index;
+ index = folio->index;
hash = hugetlb_fault_mutex_hash(mapping, index);
mutex_lock(&hugetlb_fault_mutex_table[hash]);
/*
- * If page is mapped, it was faulted in after being
- * unmapped in caller. Unmap (again) now after taking
- * the fault mutex. The mutex will prevent faults
- * until we finish removing the page.
- *
- * This race can only happen in the hole punch case.
- * Getting here in a truncate operation is a bug.
+ * Remove folio that was part of folio_batch.
*/
- if (unlikely(page_mapped(page))) {
- BUG_ON(truncate_op);
-
- i_mmap_lock_write(mapping);
- hugetlb_vmdelete_list(&mapping->i_mmap,
- index * pages_per_huge_page(h),
- (index + 1) * pages_per_huge_page(h));
- i_mmap_unlock_write(mapping);
- }
+ if (remove_inode_single_folio(h, inode, mapping, folio,
+ index, truncate_op))
+ freed++;
- lock_page(page);
- /*
- * We must free the huge page and remove from page
- * cache (remove_huge_page) BEFORE removing the
- * region/reserve map (hugetlb_unreserve_pages). In
- * rare out of memory conditions, removal of the
- * region/reserve map could fail. Correspondingly,
- * the subpool and global reserve usage count can need
- * to be adjusted.
- */
- VM_BUG_ON(PagePrivate(page));
- remove_huge_page(page);
- freed++;
- if (!truncate_op) {
- if (unlikely(hugetlb_unreserve_pages(inode,
- index, index + 1, 1)))
- hugetlb_fix_reserve_counts(inode);
- }
-
- unlock_page(page);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
}
- huge_pagevec_release(&pvec);
+ folio_batch_release(&fbatch);
cond_resched();
}
@@ -506,7 +672,7 @@ static void hugetlbfs_evict_inode(struct inode *inode)
clear_inode(inode);
}
-static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
+static void hugetlb_vmtruncate(struct inode *inode, loff_t offset)
{
pgoff_t pgoff;
struct address_space *mapping = inode->i_mapping;
@@ -518,47 +684,85 @@ static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
i_size_write(inode, offset);
i_mmap_lock_write(mapping);
if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
- hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0);
+ hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0,
+ ZAP_FLAG_DROP_MARKER);
i_mmap_unlock_write(mapping);
remove_inode_hugepages(inode, offset, LLONG_MAX);
- return 0;
+}
+
+static void hugetlbfs_zero_partial_page(struct hstate *h,
+ struct address_space *mapping,
+ loff_t start,
+ loff_t end)
+{
+ pgoff_t idx = start >> huge_page_shift(h);
+ struct folio *folio;
+
+ folio = filemap_lock_folio(mapping, idx);
+ if (!folio)
+ return;
+
+ start = start & ~huge_page_mask(h);
+ end = end & ~huge_page_mask(h);
+ if (!end)
+ end = huge_page_size(h);
+
+ folio_zero_segment(folio, (size_t)start, (size_t)end);
+
+ folio_unlock(folio);
+ folio_put(folio);
}
static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
{
+ struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
+ struct address_space *mapping = inode->i_mapping;
struct hstate *h = hstate_inode(inode);
loff_t hpage_size = huge_page_size(h);
loff_t hole_start, hole_end;
/*
- * For hole punch round up the beginning offset of the hole and
- * round down the end.
+ * hole_start and hole_end indicate the full pages within the hole.
*/
hole_start = round_up(offset, hpage_size);
hole_end = round_down(offset + len, hpage_size);
- if (hole_end > hole_start) {
- struct address_space *mapping = inode->i_mapping;
- struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
+ inode_lock(inode);
- inode_lock(inode);
+ /* protected by i_rwsem */
+ if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
+ inode_unlock(inode);
+ return -EPERM;
+ }
- /* protected by i_mutex */
- if (info->seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) {
- inode_unlock(inode);
- return -EPERM;
- }
+ i_mmap_lock_write(mapping);
- i_mmap_lock_write(mapping);
+ /* If range starts before first full page, zero partial page. */
+ if (offset < hole_start)
+ hugetlbfs_zero_partial_page(h, mapping,
+ offset, min(offset + len, hole_start));
+
+ /* Unmap users of full pages in the hole. */
+ if (hole_end > hole_start) {
if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))
hugetlb_vmdelete_list(&mapping->i_mmap,
- hole_start >> PAGE_SHIFT,
- hole_end >> PAGE_SHIFT);
- i_mmap_unlock_write(mapping);
- remove_inode_hugepages(inode, hole_start, hole_end);
- inode_unlock(inode);
+ hole_start >> PAGE_SHIFT,
+ hole_end >> PAGE_SHIFT, 0);
}
+ /* If range extends beyond last full page, zero partial page. */
+ if ((offset + len) > hole_end && (offset + len) > hole_start)
+ hugetlbfs_zero_partial_page(h, mapping,
+ hole_end, offset + len);
+
+ i_mmap_unlock_write(mapping);
+
+ /* Remove full pages from the file. */
+ if (hole_end > hole_start)
+ remove_inode_hugepages(inode, hole_start, hole_end);
+
+ inode_unlock(inode);
+
return 0;
}
@@ -619,7 +823,6 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
*/
struct page *page;
unsigned long addr;
- int avoid_reserve = 0;
cond_resched();
@@ -651,8 +854,15 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
continue;
}
- /* Allocate page and add to page cache */
- page = alloc_huge_page(&pseudo_vma, addr, avoid_reserve);
+ /*
+ * Allocate page without setting the avoid_reserve argument.
+ * There certainly are no reserves associated with the
+ * pseudo_vma. However, there could be shared mappings with
+ * reserves for the file at the inode level. If we fallocate
+ * pages in these areas, we need to consume the reserves
+ * to keep reservation accounting consistent.
+ */
+ page = alloc_huge_page(&pseudo_vma, addr, 0);
hugetlb_drop_vma_policy(&pseudo_vma);
if (IS_ERR(page)) {
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
@@ -661,8 +871,9 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
}
clear_huge_page(page, addr, pages_per_huge_page(h));
__SetPageUptodate(page);
- error = huge_add_to_page_cache(page, mapping, index);
+ error = hugetlb_add_to_page_cache(page, mapping, index);
if (unlikely(error)) {
+ restore_reserve_on_error(h, &pseudo_vma, addr, page);
put_page(page);
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
goto out;
@@ -670,9 +881,10 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+ SetHPageMigratable(page);
/*
- * unlock_page because locked by add_to_page_cache()
- * page_put due to reference from alloc_huge_page()
+ * unlock_page because locked by hugetlb_add_to_page_cache()
+ * put_page() due to reference from alloc_huge_page()
*/
unlock_page(page);
put_page(page);
@@ -686,7 +898,8 @@ out:
return error;
}
-static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
+static int hugetlbfs_setattr(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct iattr *attr)
{
struct inode *inode = d_inode(dentry);
struct hstate *h = hstate_inode(inode);
@@ -694,9 +907,7 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
unsigned int ia_valid = attr->ia_valid;
struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
- BUG_ON(!inode);
-
- error = setattr_prepare(dentry, attr);
+ error = setattr_prepare(&init_user_ns, dentry, attr);
if (error)
return error;
@@ -706,16 +917,14 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr)
if (newsize & ~huge_page_mask(h))
return -EINVAL;
- /* protected by i_mutex */
+ /* protected by i_rwsem */
if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) ||
(newsize > oldsize && (info->seals & F_SEAL_GROW)))
return -EPERM;
- error = hugetlb_vmtruncate(inode, newsize);
- if (error)
- return error;
+ hugetlb_vmtruncate(inode, newsize);
}
- setattr_copy(inode, attr);
+ setattr_copy(&init_user_ns, inode, attr);
mark_inode_dirty(inode);
return 0;
}
@@ -771,7 +980,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode);
inode->i_ino = get_next_ino();
- inode_init_owner(inode, dir, mode);
+ inode_init_owner(&init_user_ns, inode, dir, mode);
lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
&hugetlbfs_i_mmap_rwsem_key);
inode->i_mapping->a_ops = &hugetlbfs_aops;
@@ -810,56 +1019,54 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
/*
* File creation. Allocate an inode, and we're done..
*/
-static int do_hugetlbfs_mknod(struct inode *dir,
- struct dentry *dentry,
- umode_t mode,
- dev_t dev,
- bool tmpfile)
+static int hugetlbfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode, dev_t dev)
{
struct inode *inode;
- int error = -ENOSPC;
inode = hugetlbfs_get_inode(dir->i_sb, dir, mode, dev);
- if (inode) {
- dir->i_ctime = dir->i_mtime = current_time(dir);
- if (tmpfile) {
- d_tmpfile(dentry, inode);
- } else {
- d_instantiate(dentry, inode);
- dget(dentry);/* Extra count - pin the dentry in core */
- }
- error = 0;
- }
- return error;
-}
-
-static int hugetlbfs_mknod(struct inode *dir,
- struct dentry *dentry, umode_t mode, dev_t dev)
-{
- return do_hugetlbfs_mknod(dir, dentry, mode, dev, false);
+ if (!inode)
+ return -ENOSPC;
+ dir->i_ctime = dir->i_mtime = current_time(dir);
+ d_instantiate(dentry, inode);
+ dget(dentry);/* Extra count - pin the dentry in core */
+ return 0;
}
-static int hugetlbfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+static int hugetlbfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
+ struct dentry *dentry, umode_t mode)
{
- int retval = hugetlbfs_mknod(dir, dentry, mode | S_IFDIR, 0);
+ int retval = hugetlbfs_mknod(&init_user_ns, dir, dentry,
+ mode | S_IFDIR, 0);
if (!retval)
inc_nlink(dir);
return retval;
}
-static int hugetlbfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl)
+static int hugetlbfs_create(struct user_namespace *mnt_userns,
+ struct inode *dir, struct dentry *dentry,
+ umode_t mode, bool excl)
{
- return hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0);
+ return hugetlbfs_mknod(&init_user_ns, dir, dentry, mode | S_IFREG, 0);
}
-static int hugetlbfs_tmpfile(struct inode *dir,
- struct dentry *dentry, umode_t mode)
+static int hugetlbfs_tmpfile(struct user_namespace *mnt_userns,
+ struct inode *dir, struct file *file,
+ umode_t mode)
{
- return do_hugetlbfs_mknod(dir, dentry, mode | S_IFREG, 0, true);
+ struct inode *inode;
+
+ inode = hugetlbfs_get_inode(dir->i_sb, dir, mode | S_IFREG, 0);
+ if (!inode)
+ return -ENOSPC;
+ dir->i_ctime = dir->i_mtime = current_time(dir);
+ d_tmpfile(file, inode);
+ return finish_open_simple(file, 0);
}
-static int hugetlbfs_symlink(struct inode *dir,
- struct dentry *dentry, const char *symname)
+static int hugetlbfs_symlink(struct user_namespace *mnt_userns,
+ struct inode *dir, struct dentry *dentry,
+ const char *symname)
{
struct inode *inode;
int error = -ENOSPC;
@@ -879,56 +1086,37 @@ static int hugetlbfs_symlink(struct inode *dir,
return error;
}
-/*
- * mark the head page dirty
- */
-static int hugetlbfs_set_page_dirty(struct page *page)
-{
- struct page *head = compound_head(page);
-
- SetPageDirty(head);
- return 0;
-}
-
-static int hugetlbfs_migrate_page(struct address_space *mapping,
- struct page *newpage, struct page *page,
+#ifdef CONFIG_MIGRATION
+static int hugetlbfs_migrate_folio(struct address_space *mapping,
+ struct folio *dst, struct folio *src,
enum migrate_mode mode)
{
int rc;
- rc = migrate_huge_page_move_mapping(mapping, newpage, page);
+ rc = migrate_huge_page_move_mapping(mapping, dst, src);
if (rc != MIGRATEPAGE_SUCCESS)
return rc;
- /*
- * page_private is subpool pointer in hugetlb pages. Transfer to
- * new page. PagePrivate is not associated with page_private for
- * hugetlb pages and can not be set here as only page_huge_active
- * pages can be migrated.
- */
- if (page_private(page)) {
- set_page_private(newpage, page_private(page));
- set_page_private(page, 0);
+ if (hugetlb_page_subpool(&src->page)) {
+ hugetlb_set_page_subpool(&dst->page,
+ hugetlb_page_subpool(&src->page));
+ hugetlb_set_page_subpool(&src->page, NULL);
}
if (mode != MIGRATE_SYNC_NO_COPY)
- migrate_page_copy(newpage, page);
+ folio_migrate_copy(dst, src);
else
- migrate_page_states(newpage, page);
+ folio_migrate_flags(dst, src);
return MIGRATEPAGE_SUCCESS;
}
+#else
+#define hugetlbfs_migrate_folio NULL
+#endif
static int hugetlbfs_error_remove_page(struct address_space *mapping,
struct page *page)
{
- struct inode *inode = mapping->host;
- pgoff_t index = page->index;
-
- remove_huge_page(page);
- if (unlikely(hugetlb_unreserve_pages(inode, index, index + 1, 1)))
- hugetlb_fix_reserve_counts(inode);
-
return 0;
}
@@ -981,17 +1169,17 @@ static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_bsize = huge_page_size(h);
if (sbinfo) {
spin_lock(&sbinfo->stat_lock);
- /* If no limits set, just report 0 for max/free/used
+ /* If no limits set, just report 0 or -1 for max/free/used
* blocks, like simple_statfs() */
if (sbinfo->spool) {
long free_pages;
- spin_lock(&sbinfo->spool->lock);
+ spin_lock_irq(&sbinfo->spool->lock);
buf->f_blocks = sbinfo->spool->max_hpages;
free_pages = sbinfo->spool->max_hpages
- sbinfo->spool->used_hpages;
buf->f_bavail = buf->f_bfree = free_pages;
- spin_unlock(&sbinfo->spool->lock);
+ spin_unlock_irq(&sbinfo->spool->lock);
buf->f_files = sbinfo->max_inodes;
buf->f_ffree = sbinfo->free_inodes;
}
@@ -1049,7 +1237,7 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb)
if (unlikely(!hugetlbfs_dec_free_inodes(sbinfo)))
return NULL;
- p = kmem_cache_alloc(hugetlbfs_inode_cachep, GFP_KERNEL);
+ p = alloc_inode_sb(sb, hugetlbfs_inode_cachep, GFP_KERNEL);
if (unlikely(!p)) {
hugetlbfs_inc_free_inodes(sbinfo);
return NULL;
@@ -1083,8 +1271,8 @@ static void hugetlbfs_destroy_inode(struct inode *inode)
static const struct address_space_operations hugetlbfs_aops = {
.write_begin = hugetlbfs_write_begin,
.write_end = hugetlbfs_write_end,
- .set_page_dirty = hugetlbfs_set_page_dirty,
- .migratepage = hugetlbfs_migrate_page,
+ .dirty_folio = noop_dirty_folio,
+ .migrate_folio = hugetlbfs_migrate_folio,
.error_remove_page = hugetlbfs_error_remove_page,
};
@@ -1208,7 +1396,7 @@ static int hugetlbfs_parse_param(struct fs_context *fc, struct fs_parameter *par
ps = memparse(param->string, &rest);
ctx->hstate = size_to_hstate(ps);
if (!ctx->hstate) {
- pr_err("Unsupported page size %lu MB\n", ps >> 20);
+ pr_err("Unsupported page size %lu MB\n", ps / SZ_1M);
return -EINVAL;
}
return 0;
@@ -1283,8 +1471,8 @@ hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc)
/*
* Allocate and initialize subpool if maximum or minimum size is
- * specified. Any needed reservations (for minimim size) are taken
- * taken when the subpool is created.
+ * specified. Any needed reservations (for minimum size) are taken
+ * when the subpool is created.
*/
if (ctx->max_hpages != -1 || ctx->min_hpages != -1) {
sbinfo->spool = hugepage_new_subpool(ctx->hstate,
@@ -1299,6 +1487,12 @@ hugetlbfs_fill_super(struct super_block *sb, struct fs_context *fc)
sb->s_magic = HUGETLBFS_MAGIC;
sb->s_op = &hugetlbfs_ops;
sb->s_time_gran = 1;
+
+ /*
+ * Due to the special and limited functionality of hugetlbfs, it does
+ * not work well as a stacking filesystem.
+ */
+ sb->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH;
sb->s_root = d_make_root(hugetlbfs_get_root(sb, ctx));
if (!sb->s_root)
goto out_free;
@@ -1372,7 +1566,7 @@ static int get_hstate_idx(int page_size_log)
if (!h)
return -1;
- return h - hstates;
+ return hstate_index(h);
}
/*
@@ -1380,8 +1574,8 @@ static int get_hstate_idx(int page_size_log)
* otherwise hugetlb_reserve_pages reserves one less hugepages than intended.
*/
struct file *hugetlb_file_setup(const char *name, size_t size,
- vm_flags_t acctflag, struct user_struct **user,
- int creat_flags, int page_size_log)
+ vm_flags_t acctflag, int creat_flags,
+ int page_size_log)
{
struct inode *inode;
struct vfsmount *mnt;
@@ -1392,22 +1586,19 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
if (hstate_idx < 0)
return ERR_PTR(-ENODEV);
- *user = NULL;
mnt = hugetlbfs_vfsmount[hstate_idx];
if (!mnt)
return ERR_PTR(-ENOENT);
if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
- *user = current_user();
- if (user_shm_lock(size, *user)) {
- task_lock(current);
- pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n",
+ struct ucounts *ucounts = current_ucounts();
+
+ if (user_shm_lock(size, ucounts)) {
+ pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is obsolete\n",
current->comm, current->pid);
- task_unlock(current);
- } else {
- *user = NULL;
- return ERR_PTR(-EPERM);
+ user_shm_unlock(size, ucounts);
}
+ return ERR_PTR(-EPERM);
}
file = ERR_PTR(-ENOSPC);
@@ -1420,7 +1611,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
inode->i_size = size;
clear_nlink(inode);
- if (hugetlb_reserve_pages(inode, 0,
+ if (!hugetlb_reserve_pages(inode, 0,
size >> huge_page_shift(hstate_inode(inode)), NULL,
acctflag))
file = ERR_PTR(-ENOMEM);
@@ -1432,10 +1623,6 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
iput(inode);
out:
- if (*user) {
- user_shm_unlock(size, *user);
- *user = NULL;
- }
return file;
}
@@ -1454,8 +1641,8 @@ static struct vfsmount *__init mount_one_hugetlbfs(struct hstate *h)
put_fs_context(fc);
}
if (IS_ERR(mnt))
- pr_err("Cannot mount internal hugetlbfs for page size %uK",
- 1U << (h->order + PAGE_SHIFT - 10));
+ pr_err("Cannot mount internal hugetlbfs for page size %luK",
+ huge_page_size(h) / SZ_1K);
return mnt;
}
@@ -1483,7 +1670,7 @@ static int __init init_hugetlbfs_fs(void)
goto out_free;
/* default hstate mount is required */
- mnt = mount_one_hugetlbfs(&hstates[default_hstate_idx]);
+ mnt = mount_one_hugetlbfs(&default_hstate);
if (IS_ERR(mnt)) {
error = PTR_ERR(mnt);
goto out_unreg;