about summary refs log tree commit diff stats
path: root/mm/filemap.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/filemap.c')
-rw-r--r-- mm/filemap.c 149
1 files changed, 88 insertions, 61 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index ad8c39d90bf9..647d72bf23b6 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -72,7 +72,7 @@
* Lock ordering:
*
* ->i_mmap_rwsem (truncate_pagecache)
- * ->private_lock (__free_pte->__set_page_dirty_buffers)
+ * ->private_lock (__free_pte->block_dirty_folio)
* ->swap_lock (exclusive_swap_page, others)
* ->i_pages lock
*
@@ -115,7 +115,7 @@
* ->memcg->move_lock (page_remove_rmap->lock_page_memcg)
* bdi.wb->list_lock (zap_pte_range->set_page_dirty)
* ->inode->i_lock (zap_pte_range->set_page_dirty)
- * ->private_lock (zap_pte_range->__set_page_dirty_buffers)
+ * ->private_lock (zap_pte_range->block_dirty_folio)
*
* ->i_mmap_rwsem
* ->tasklist_lock (memory_failure, collect_procs_ao)
@@ -152,25 +152,25 @@ static void filemap_unaccount_folio(struct address_space *mapping,
VM_BUG_ON_FOLIO(folio_mapped(folio), folio);
if (!IS_ENABLED(CONFIG_DEBUG_VM) && unlikely(folio_mapped(folio))) {
- int mapcount;
-
pr_alert("BUG: Bad page cache in process %s pfn:%05lx\n",
current->comm, folio_pfn(folio));
dump_page(&folio->page, "still mapped when deleted");
dump_stack();
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
- mapcount = page_mapcount(&folio->page);
- if (mapping_exiting(mapping) &&
- folio_ref_count(folio) >= mapcount + 2) {
- /*
- * All vmas have already been torn down, so it's
- * a good bet that actually the folio is unmapped,
- * and we'd prefer not to leak it: if we're wrong,
- * some other bad page check should catch it later.
- */
- page_mapcount_reset(&folio->page);
- folio_ref_sub(folio, mapcount);
+ if (mapping_exiting(mapping) && !folio_test_large(folio)) {
+ int mapcount = page_mapcount(&folio->page);
+
+ if (folio_ref_count(folio) >= mapcount + 2) {
+ /*
+ * All vmas have already been torn down, so it's
+ * a good bet that actually the page is unmapped
+ * and we'd rather not leak it: if we're wrong,
+ * another bad page check should catch it later.
+ */
+ page_mapcount_reset(&folio->page);
+ folio_ref_sub(folio, mapcount);
+ }
}
}
@@ -193,16 +193,20 @@ static void filemap_unaccount_folio(struct address_space *mapping,
/*
* At this point folio must be either written or cleaned by
* truncate. Dirty folio here signals a bug and loss of
- * unwritten data.
+ * unwritten data - on ordinary filesystems.
*
- * This fixes dirty accounting after removing the folio entirely
+ * But it's harmless on in-memory filesystems like tmpfs; and can
+ * occur when a driver which did get_user_pages() sets page dirty
+ * before putting it, while the inode is being finally evicted.
+ *
+ * Below fixes dirty accounting after removing the folio entirely
* but leaves the dirty flag set: it has no effect for truncated
* folio and anyway will be cleared before returning folio to
* buddy allocator.
*/
- if (WARN_ON_ONCE(folio_test_dirty(folio)))
- folio_account_cleaned(folio, mapping,
- inode_to_wb(mapping->host));
+ if (WARN_ON_ONCE(folio_test_dirty(folio) &&
+ mapping_can_writeback(mapping)))
+ folio_account_cleaned(folio, inode_to_wb(mapping->host));
}
/*
@@ -842,26 +846,27 @@ noinline int __filemap_add_folio(struct address_space *mapping,
{
XA_STATE(xas, &mapping->i_pages, index);
int huge = folio_test_hugetlb(folio);
- int error;
bool charged = false;
+ long nr = 1;
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_FOLIO(folio_test_swapbacked(folio), folio);
mapping_set_update(&xas, mapping);
- folio_get(folio);
- folio->mapping = mapping;
- folio->index = index;
-
if (!huge) {
- error = mem_cgroup_charge(folio, NULL, gfp);
+ int error = mem_cgroup_charge(folio, NULL, gfp);
VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio);
if (error)
- goto error;
+ return error;
charged = true;
+ xas_set_order(&xas, index, folio_order(folio));
+ nr = folio_nr_pages(folio);
}
gfp &= GFP_RECLAIM_MASK;
+ folio_ref_add(folio, nr);
+ folio->mapping = mapping;
+ folio->index = xas.xa_index;
do {
unsigned int order = xa_get_order(xas.xa, xas.xa_index);
@@ -885,6 +890,8 @@ noinline int __filemap_add_folio(struct address_space *mapping,
/* entry may have been split before we acquired lock */
order = xa_get_order(xas.xa, xas.xa_index);
if (order > folio_order(folio)) {
+ /* How to handle large swap entries? */
+ BUG_ON(shmem_mapping(mapping));
xas_split(&xas, old, order);
xas_reset(&xas);
}
@@ -894,29 +901,31 @@ noinline int __filemap_add_folio(struct address_space *mapping,
if (xas_error(&xas))
goto unlock;
- mapping->nrpages++;
+ mapping->nrpages += nr;
/* hugetlb pages do not participate in page cache accounting */
- if (!huge)
- __lruvec_stat_add_folio(folio, NR_FILE_PAGES);
+ if (!huge) {
+ __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
+ if (folio_test_pmd_mappable(folio))
+ __lruvec_stat_mod_folio(folio,
+ NR_FILE_THPS, nr);
+ }
unlock:
xas_unlock_irq(&xas);
} while (xas_nomem(&xas, gfp));
- if (xas_error(&xas)) {
- error = xas_error(&xas);
- if (charged)
- mem_cgroup_uncharge(folio);
+ if (xas_error(&xas))
goto error;
- }
trace_mm_filemap_add_to_page_cache(folio);
return 0;
error:
+ if (charged)
+ mem_cgroup_uncharge(folio);
folio->mapping = NULL;
/* Leave page->index set: truncation relies upon it */
- folio_put(folio);
- return error;
+ folio_put_refs(folio, nr);
+ return xas_error(&xas);
}
ALLOW_ERROR_INJECTION(__filemap_add_folio, ERRNO);
@@ -1054,6 +1063,12 @@ void __init pagecache_init(void)
init_waitqueue_head(&folio_wait_table[i]);
page_writeback_init();
+
+ /*
+ * tmpfs uses the ZERO_PAGE for reading holes: it is up-to-date,
+ * and splice's page_cache_pipe_buf_confirm() needs to see that.
+ */
+ SetPageUptodate(ZERO_PAGE(0));
}
/*
@@ -1174,24 +1189,17 @@ static void folio_wake_bit(struct folio *folio, int bit_nr)
}
/*
- * It is possible for other pages to have collided on the waitqueue
- * hash, so in that case check for a page match. That prevents a long-
- * term waiter
+ * It's possible to miss clearing waiters here, when we woke our page
+ * waiters, but the hashed waitqueue has waiters for other pages on it.
+ * That's okay, it's a rare case. The next waker will clear it.
*
- * It is still possible to miss a case here, when we woke page waiters
- * and removed them from the waitqueue, but there are still other
- * page waiters.
+ * Note that, depending on the page pool (buddy, hugetlb, ZONE_DEVICE,
+ * other), the flag may be cleared in the course of freeing the page;
+ * but that is not required for correctness.
*/
- if (!waitqueue_active(q) || !key.page_match) {
+ if (!waitqueue_active(q) || !key.page_match)
folio_clear_waiters(folio);
- /*
- * It's possible to miss clearing Waiters here, when we woke
- * our page waiters, but the hashed waitqueue has waiters for
- * other pages on it.
- *
- * That's okay, it's a rare case. The next waker will clear it.
- */
- }
+
spin_unlock_irqrestore(&q->lock, flags);
}
@@ -2229,8 +2237,9 @@ out:
* @nr_pages: The maximum number of pages
* @pages: Where the resulting pages are placed
*
- * find_get_pages_contig() works exactly like find_get_pages(), except
- * that the returned number of pages are guaranteed to be contiguous.
+ * find_get_pages_contig() works exactly like find_get_pages_range(),
+ * except that the returned number of pages are guaranteed to be
+ * contiguous.
*
* Return: the number of pages which were found.
*/
@@ -2290,9 +2299,9 @@ EXPORT_SYMBOL(find_get_pages_contig);
* @nr_pages: the maximum number of pages
* @pages: where the resulting pages are placed
*
- * Like find_get_pages(), except we only return head pages which are tagged
- * with @tag. @index is updated to the index immediately after the last
- * page we return, ready for the next iteration.
+ * Like find_get_pages_range(), except we only return head pages which are
+ * tagged with @tag. @index is updated to the index immediately after the
+ * last page we return, ready for the next iteration.
*
* Return: the number of pages which were found.
*/
@@ -2452,7 +2461,7 @@ static bool filemap_range_uptodate(struct address_space *mapping,
pos -= folio_pos(folio);
}
- return mapping->a_ops->is_partially_uptodate(&folio->page, pos, count);
+ return mapping->a_ops->is_partially_uptodate(folio, pos, count);
}
static int filemap_update_page(struct kiocb *iocb,
@@ -2844,7 +2853,7 @@ static inline loff_t folio_seek_hole_data(struct xa_state *xas,
offset = offset_in_folio(folio, start) & ~(bsz - 1);
do {
- if (ops->is_partially_uptodate(&folio->page, offset, bsz) ==
+ if (ops->is_partially_uptodate(folio, offset, bsz) ==
seek_data)
break;
start = (start + bsz) & ~(bsz - 1);
@@ -2990,6 +2999,24 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
struct file *fpin = NULL;
unsigned int mmap_miss;
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ /* Use the readahead code, even if readahead is disabled */
+ if (vmf->vma->vm_flags & VM_HUGEPAGE) {
+ fpin = maybe_unlock_mmap_for_io(vmf, fpin);
+ ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1);
+ ra->size = HPAGE_PMD_NR;
+ /*
+ * Fetch two PMD folios, so we get the chance to actually
+ * readahead, unless we've been told not to.
+ */
+ if (!(vmf->vma->vm_flags & VM_RAND_READ))
+ ra->size *= 2;
+ ra->async_size = HPAGE_PMD_NR;
+ page_cache_ra_order(&ractl, ra, HPAGE_PMD_ORDER);
+ return fpin;
+ }
+#endif
+
/* If we don't want any read-ahead, don't bother */
if (vmf->vma->vm_flags & VM_RAND_READ)
return fpin;
@@ -3022,7 +3049,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
ra->size = ra->ra_pages;
ra->async_size = ra->ra_pages / 4;
ractl._index = ra->start;
- do_page_cache_ra(&ractl, ra->size, ra->async_size);
+ page_cache_ra_order(&ractl, ra, 0);
return fpin;
}
@@ -3752,7 +3779,7 @@ again:
* same page as we're writing to, without it being marked
* up-to-date.
*/
- if (unlikely(fault_in_iov_iter_readable(i, bytes))) {
+ if (unlikely(fault_in_iov_iter_readable(i, bytes) == bytes)) {
status = -EFAULT;
break;
}