Diffstat (limited to 'fs/erofs/zdata.c')
-rw-r--r-- | fs/erofs/zdata.c | 1712
1 file changed, 1011 insertions, 701 deletions
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 80e47f07d946..064a166324a7 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1,25 +1,188 @@ // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 2018 HUAWEI, Inc. - * http://www.huawei.com/ - * Created by Gao Xiang <gaoxiang25@huawei.com> + * https://www.huawei.com/ + * Copyright (C) 2022 Alibaba Cloud */ #include "zdata.h" #include "compress.h" #include <linux/prefetch.h> +#include <linux/psi.h> #include <trace/events/erofs.h> /* - * a compressed_pages[] placeholder in order to avoid - * being filled with file pages for in-place decompression. + * since pclustersize is variable for big pcluster feature, introduce slab + * pools implementation for different pcluster sizes. */ -#define PAGE_UNALLOCATED ((void *)0x5F0E4B1D) +struct z_erofs_pcluster_slab { + struct kmem_cache *slab; + unsigned int maxpages; + char name[48]; +}; + +#define _PCLP(n) { .maxpages = n } + +static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = { + _PCLP(1), _PCLP(4), _PCLP(16), _PCLP(64), _PCLP(128), + _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES) +}; + +struct z_erofs_bvec_iter { + struct page *bvpage; + struct z_erofs_bvset *bvset; + unsigned int nr, cur; +}; + +static struct page *z_erofs_bvec_iter_end(struct z_erofs_bvec_iter *iter) +{ + if (iter->bvpage) + kunmap_local(iter->bvset); + return iter->bvpage; +} + +static struct page *z_erofs_bvset_flip(struct z_erofs_bvec_iter *iter) +{ + unsigned long base = (unsigned long)((struct z_erofs_bvset *)0)->bvec; + /* have to access nextpage in advance, otherwise it will be unmapped */ + struct page *nextpage = iter->bvset->nextpage; + struct page *oldpage; + + DBG_BUGON(!nextpage); + oldpage = z_erofs_bvec_iter_end(iter); + iter->bvpage = nextpage; + iter->bvset = kmap_local_page(nextpage); + iter->nr = (PAGE_SIZE - base) / sizeof(struct z_erofs_bvec); + iter->cur = 0; + return oldpage; +} + +static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter, + struct z_erofs_bvset_inline *bvset, + unsigned int bootstrap_nr, + unsigned int cur) +{ + *iter = (struct z_erofs_bvec_iter) { + .nr = bootstrap_nr, + .bvset = (struct z_erofs_bvset *)bvset, + }; + + while (cur > iter->nr) { + cur -= iter->nr; + z_erofs_bvset_flip(iter); + } + iter->cur = cur; +} + +static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter, + struct z_erofs_bvec *bvec, + struct page **candidate_bvpage) +{ + if (iter->cur == iter->nr) { + if (!*candidate_bvpage) + return -EAGAIN; + + DBG_BUGON(iter->bvset->nextpage); + iter->bvset->nextpage = *candidate_bvpage; + z_erofs_bvset_flip(iter); + + iter->bvset->nextpage = NULL; + *candidate_bvpage = NULL; + } + iter->bvset->bvec[iter->cur++] = *bvec; + return 0; +} + +static void z_erofs_bvec_dequeue(struct z_erofs_bvec_iter *iter, + struct z_erofs_bvec *bvec, + struct page **old_bvpage) +{ + if (iter->cur == iter->nr) + *old_bvpage = z_erofs_bvset_flip(iter); + else + *old_bvpage = NULL; + *bvec = iter->bvset->bvec[iter->cur++]; +} + +static void z_erofs_destroy_pcluster_pool(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) { + if (!pcluster_pool[i].slab) + continue; + kmem_cache_destroy(pcluster_pool[i].slab); + pcluster_pool[i].slab = NULL; + } +} + +static int z_erofs_create_pcluster_pool(void) +{ + struct z_erofs_pcluster_slab *pcs; + struct z_erofs_pcluster *a; + unsigned int size; + + for (pcs = pcluster_pool; + pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) { + size = struct_size(a, compressed_bvecs, pcs->maxpages); + + sprintf(pcs->name, 
"erofs_pcluster-%u", pcs->maxpages); + pcs->slab = kmem_cache_create(pcs->name, size, 0, + SLAB_RECLAIM_ACCOUNT, NULL); + if (pcs->slab) + continue; + + z_erofs_destroy_pcluster_pool(); + return -ENOMEM; + } + return 0; +} + +static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) { + struct z_erofs_pcluster_slab *pcs = pcluster_pool + i; + struct z_erofs_pcluster *pcl; + + if (nrpages > pcs->maxpages) + continue; + + pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS); + if (!pcl) + return ERR_PTR(-ENOMEM); + pcl->pclusterpages = nrpages; + return pcl; + } + return ERR_PTR(-EINVAL); +} + +static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl) +{ + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); + int i; + + for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) { + struct z_erofs_pcluster_slab *pcs = pcluster_pool + i; + + if (pclusterpages > pcs->maxpages) + continue; + + kmem_cache_free(pcs->slab, pcl); + return; + } + DBG_BUGON(1); +} /* how to allocate cached pages for a pcluster */ enum z_erofs_cache_alloctype { DONTALLOC, /* don't allocate any cached pages */ - DELAYEDALLOC, /* delayed allocation (at the time of submitting io) */ + /* + * try to use cached I/O if page allocation succeeds or fallback + * to in-place I/O instead to avoid any direct reclaim. + */ + TRYALLOC, }; /* @@ -32,84 +195,62 @@ typedef tagptr1_t compressed_page_t; tagptr_fold(compressed_page_t, page, 1) static struct workqueue_struct *z_erofs_workqueue __read_mostly; -static struct kmem_cache *pcluster_cachep __read_mostly; void z_erofs_exit_zip_subsystem(void) { destroy_workqueue(z_erofs_workqueue); - kmem_cache_destroy(pcluster_cachep); + z_erofs_destroy_pcluster_pool(); } static inline int z_erofs_init_workqueue(void) { const unsigned int onlinecpus = num_possible_cpus(); - const unsigned int flags = WQ_UNBOUND | WQ_HIGHPRI | WQ_CPU_INTENSIVE; /* * no need to spawn too many threads, limiting threads could minimum * scheduling overhead, perhaps per-CPU threads should be better? */ - z_erofs_workqueue = alloc_workqueue("erofs_unzipd", flags, + z_erofs_workqueue = alloc_workqueue("erofs_unzipd", + WQ_UNBOUND | WQ_HIGHPRI, onlinecpus + onlinecpus / 4); return z_erofs_workqueue ? 
0 : -ENOMEM; } -static void z_erofs_pcluster_init_once(void *ptr) -{ - struct z_erofs_pcluster *pcl = ptr; - struct z_erofs_collection *cl = z_erofs_primarycollection(pcl); - unsigned int i; - - mutex_init(&cl->lock); - cl->nr_pages = 0; - cl->vcnt = 0; - for (i = 0; i < Z_EROFS_CLUSTER_MAX_PAGES; ++i) - pcl->compressed_pages[i] = NULL; -} - -static void z_erofs_pcluster_init_always(struct z_erofs_pcluster *pcl) -{ - struct z_erofs_collection *cl = z_erofs_primarycollection(pcl); - - atomic_set(&pcl->obj.refcount, 1); - - DBG_BUGON(cl->nr_pages); - DBG_BUGON(cl->vcnt); -} - int __init z_erofs_init_zip_subsystem(void) { - pcluster_cachep = kmem_cache_create("erofs_compress", - Z_EROFS_WORKGROUP_SIZE, 0, - SLAB_RECLAIM_ACCOUNT, - z_erofs_pcluster_init_once); - if (pcluster_cachep) { - if (!z_erofs_init_workqueue()) - return 0; + int err = z_erofs_create_pcluster_pool(); - kmem_cache_destroy(pcluster_cachep); - } - return -ENOMEM; + if (err) + return err; + err = z_erofs_init_workqueue(); + if (err) + z_erofs_destroy_pcluster_pool(); + return err; } -enum z_erofs_collectmode { - COLLECT_SECONDARY, - COLLECT_PRIMARY, +enum z_erofs_pclustermode { + Z_EROFS_PCLUSTER_INFLIGHT, /* - * The current collection was the tail of an exist chain, in addition - * that the previous processed chained collections are all decided to + * The current pclusters was the tail of an exist chain, in addition + * that the previous processed chained pclusters are all decided to * be hooked up to it. - * A new chain will be created for the remaining collections which are - * not processed yet, therefore different from COLLECT_PRIMARY_FOLLOWED, - * the next collection cannot reuse the whole page safely in - * the following scenario: + * A new chain will be created for the remaining pclusters which are + * not processed yet, so different from Z_EROFS_PCLUSTER_FOLLOWED, + * the next pcluster cannot reuse the whole page safely for inplace I/O + * in the following scenario: * ________________________________________________________________ * | tail (partial) page | head (partial) page | - * | (belongs to the next cl) | (belongs to the current cl) | - * |_______PRIMARY_FOLLOWED_______|________PRIMARY_HOOKED___________| + * | (belongs to the next pcl) | (belongs to the current pcl) | + * |_______PCLUSTER_FOLLOWED______|________PCLUSTER_HOOKED__________| + */ + Z_EROFS_PCLUSTER_HOOKED, + /* + * a weak form of Z_EROFS_PCLUSTER_FOLLOWED, the difference is that it + * could be dispatched into bypass queue later due to uptodated managed + * pages. All related online pages cannot be reused for inplace I/O (or + * bvpage) since it can be directly decoded without I/O submission. */ - COLLECT_PRIMARY_HOOKED, - COLLECT_PRIMARY_FOLLOWED_NOINPLACE, + Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE, /* * The current collection has been linked with the owned chain, and * could also be linked with the remaining collections, which means @@ -120,91 +261,101 @@ enum z_erofs_collectmode { * ________________________________________________________________ * | tail (partial) page | head (partial) page | * | (of the current cl) | (of the previous collection) | - * | PRIMARY_FOLLOWED or | | - * |_____PRIMARY_HOOKED___|____________PRIMARY_FOLLOWED____________| + * | PCLUSTER_FOLLOWED or | | + * |_____PCLUSTER_HOOKED__|___________PCLUSTER_FOLLOWED____________| * * [ (*) the above page can be used as inplace I/O. 
] */ - COLLECT_PRIMARY_FOLLOWED, -}; - -struct z_erofs_collector { - struct z_erofs_pagevec_ctor vector; - - struct z_erofs_pcluster *pcl, *tailpcl; - struct z_erofs_collection *cl; - struct page **compressedpages; - z_erofs_next_pcluster_t owned_head; - - enum z_erofs_collectmode mode; + Z_EROFS_PCLUSTER_FOLLOWED, }; struct z_erofs_decompress_frontend { struct inode *const inode; - - struct z_erofs_collector clt; struct erofs_map_blocks map; + struct z_erofs_bvec_iter biter; + struct page *candidate_bvpage; + struct z_erofs_pcluster *pcl, *tailpcl; + z_erofs_next_pcluster_t owned_head; + enum z_erofs_pclustermode mode; + + bool readahead; /* used for applying cache strategy on the fly */ bool backmost; erofs_off_t headoffset; -}; -#define COLLECTOR_INIT() { \ - .owned_head = Z_EROFS_PCLUSTER_TAIL, \ - .mode = COLLECT_PRIMARY_FOLLOWED } + /* a pointer used to pick up inplace I/O pages */ + unsigned int icur; +}; #define DECOMPRESS_FRONTEND_INIT(__i) { \ - .inode = __i, .clt = COLLECTOR_INIT(), \ - .backmost = true, } - -static struct page *z_pagemap_global[Z_EROFS_VMAP_GLOBAL_PAGES]; -static DEFINE_MUTEX(z_pagemap_global_lock); + .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \ + .mode = Z_EROFS_PCLUSTER_FOLLOWED, .backmost = true } -static void preload_compressed_pages(struct z_erofs_collector *clt, - struct address_space *mc, - enum z_erofs_cache_alloctype type, - struct list_head *pagepool) +static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, + enum z_erofs_cache_alloctype type, + struct page **pagepool) { - const struct z_erofs_pcluster *pcl = clt->pcl; - const unsigned int clusterpages = BIT(pcl->clusterbits); - struct page **pages = clt->compressedpages; - pgoff_t index = pcl->obj.index + (pages - pcl->compressed_pages); + struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode)); + struct z_erofs_pcluster *pcl = fe->pcl; bool standalone = true; + /* + * optimistic allocation without direct reclaim since inplace I/O + * can be used if low memory otherwise. 
+ */ + gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) | + __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; + unsigned int i; - if (clt->mode < COLLECT_PRIMARY_FOLLOWED) + if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED) return; - for (; pages < pcl->compressed_pages + clusterpages; ++pages) { + for (i = 0; i < pcl->pclusterpages; ++i) { struct page *page; compressed_page_t t; + struct page *newpage = NULL; /* the compressed page was loaded before */ - if (READ_ONCE(*pages)) + if (READ_ONCE(pcl->compressed_bvecs[i].page)) continue; - page = find_get_page(mc, index); + page = find_get_page(mc, pcl->obj.index + i); if (page) { t = tag_compressed_page_justfound(page); - } else if (type == DELAYEDALLOC) { - t = tagptr_init(compressed_page_t, PAGE_UNALLOCATED); - } else { /* DONTALLOC */ - if (standalone) - clt->compressedpages = pages; + } else { + /* I/O is needed, no possible to decompress directly */ standalone = false; - continue; + switch (type) { + case TRYALLOC: + newpage = erofs_allocpage(pagepool, gfp); + if (!newpage) + continue; + set_page_private(newpage, + Z_EROFS_PREALLOCATED_PAGE); + t = tag_compressed_page_justfound(newpage); + break; + default: /* DONTALLOC */ + continue; + } } - if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t))) + if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, + tagptr_cast_ptr(t))) continue; if (page) put_page(page); + else if (newpage) + erofs_pagepool_add(pagepool, newpage); } - if (standalone) /* downgrade to PRIMARY_FOLLOWED_NOINPLACE */ - clt->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE; + /* + * don't do inplace I/O if all compressed pages are available in + * managed cache since it can be moved to the bypass queue instead. + */ + if (standalone) + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; } /* called by erofs_shrinker to get rid of all compressed_pages */ @@ -213,16 +364,15 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, { struct z_erofs_pcluster *const pcl = container_of(grp, struct z_erofs_pcluster, obj); - struct address_space *const mapping = MNGD_MAPPING(sbi); - const unsigned int clusterpages = BIT(pcl->clusterbits); int i; + DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); /* * refcount of workgroup is now freezed as 1, * therefore no need to worry about available decompression users. 
*/ - for (i = 0; i < clusterpages; ++i) { - struct page *page = pcl->compressed_pages[i]; + for (i = 0; i < pcl->pclusterpages; ++i) { + struct page *page = pcl->compressed_bvecs[i].page; if (!page) continue; @@ -231,265 +381,215 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, if (!trylock_page(page)) return -EBUSY; - if (page->mapping != mapping) + if (!erofs_page_is_managed(sbi, page)) continue; /* barrier is implied in the following 'unlock_page' */ - WRITE_ONCE(pcl->compressed_pages[i], NULL); - set_page_private(page, 0); - ClearPagePrivate(page); - + WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); + detach_page_private(page); unlock_page(page); - put_page(page); } return 0; } -int erofs_try_to_free_cached_page(struct address_space *mapping, - struct page *page) +int erofs_try_to_free_cached_page(struct page *page) { struct z_erofs_pcluster *const pcl = (void *)page_private(page); - const unsigned int clusterpages = BIT(pcl->clusterbits); - int ret = 0; /* 0 - busy */ - - if (erofs_workgroup_try_to_freeze(&pcl->obj, 1)) { - unsigned int i; + int ret, i; - for (i = 0; i < clusterpages; ++i) { - if (pcl->compressed_pages[i] == page) { - WRITE_ONCE(pcl->compressed_pages[i], NULL); - ret = 1; - break; - } - } - erofs_workgroup_unfreeze(&pcl->obj, 1); + if (!erofs_workgroup_try_to_freeze(&pcl->obj, 1)) + return 0; - if (ret) { - ClearPagePrivate(page); - put_page(page); + ret = 0; + DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); + for (i = 0; i < pcl->pclusterpages; ++i) { + if (pcl->compressed_bvecs[i].page == page) { + WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); + ret = 1; + break; } } + erofs_workgroup_unfreeze(&pcl->obj, 1); + if (ret) + detach_page_private(page); return ret; } -/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */ -static inline bool z_erofs_try_inplace_io(struct z_erofs_collector *clt, - struct page *page) +static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe, + struct z_erofs_bvec *bvec) { - struct z_erofs_pcluster *const pcl = clt->pcl; - const unsigned int clusterpages = BIT(pcl->clusterbits); + struct z_erofs_pcluster *const pcl = fe->pcl; - while (clt->compressedpages < pcl->compressed_pages + clusterpages) { - if (!cmpxchg(clt->compressedpages++, NULL, page)) + while (fe->icur > 0) { + if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page, + NULL, bvec->page)) { + pcl->compressed_bvecs[fe->icur] = *bvec; return true; + } } return false; } -/* callers must be with collection lock held */ -static int z_erofs_attach_page(struct z_erofs_collector *clt, - struct page *page, - enum z_erofs_page_type type) +/* callers must be with pcluster lock held */ +static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe, + struct z_erofs_bvec *bvec, bool exclusive) { int ret; - bool occupied; - /* give priority for inplaceio */ - if (clt->mode >= COLLECT_PRIMARY && - type == Z_EROFS_PAGE_TYPE_EXCLUSIVE && - z_erofs_try_inplace_io(clt, page)) - return 0; - - ret = z_erofs_pagevec_enqueue(&clt->vector, - page, type, &occupied); - clt->cl->vcnt += (unsigned int)ret; - - return ret ? 
0 : -EAGAIN; + if (exclusive) { + /* give priority for inplaceio to use file pages first */ + if (z_erofs_try_inplace_io(fe, bvec)) + return 0; + /* otherwise, check if it can be used as a bvpage */ + if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED && + !fe->candidate_bvpage) + fe->candidate_bvpage = bvec->page; + } + ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage); + fe->pcl->vcnt += (ret >= 0); + return ret; } -static enum z_erofs_collectmode -try_to_claim_pcluster(struct z_erofs_pcluster *pcl, - z_erofs_next_pcluster_t *owned_head) +static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f) { - /* let's claim these following types of pclusters */ -retry: - if (pcl->next == Z_EROFS_PCLUSTER_NIL) { - /* type 1, nil pcluster */ - if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL, - *owned_head) != Z_EROFS_PCLUSTER_NIL) - goto retry; + struct z_erofs_pcluster *pcl = f->pcl; + z_erofs_next_pcluster_t *owned_head = &f->owned_head; + /* type 1, nil pcluster (this pcluster doesn't belong to any chain.) */ + if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL, + *owned_head) == Z_EROFS_PCLUSTER_NIL) { *owned_head = &pcl->next; - /* lucky, I am the followee :) */ - return COLLECT_PRIMARY_FOLLOWED; - } else if (pcl->next == Z_EROFS_PCLUSTER_TAIL) { - /* - * type 2, link to the end of a existing open chain, - * be careful that its submission itself is governed - * by the original owned chain. - */ - if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, - *owned_head) != Z_EROFS_PCLUSTER_TAIL) - goto retry; + /* so we can attach this pcluster to our submission chain. */ + f->mode = Z_EROFS_PCLUSTER_FOLLOWED; + return; + } + + /* + * type 2, link to the end of an existing open chain, be careful + * that its submission is controlled by the original attached chain. 
+ */ + if (*owned_head != &pcl->next && pcl != f->tailpcl && + cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, + *owned_head) == Z_EROFS_PCLUSTER_TAIL) { *owned_head = Z_EROFS_PCLUSTER_TAIL; - return COLLECT_PRIMARY_HOOKED; + f->mode = Z_EROFS_PCLUSTER_HOOKED; + f->tailpcl = NULL; + return; } - return COLLECT_PRIMARY; /* :( better luck next time */ + /* type 3, it belongs to a chain, but it isn't the end of the chain */ + f->mode = Z_EROFS_PCLUSTER_INFLIGHT; } -static int z_erofs_lookup_collection(struct z_erofs_collector *clt, - struct inode *inode, - struct erofs_map_blocks *map) +static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe) { - struct erofs_workgroup *grp; + struct erofs_map_blocks *map = &fe->map; + bool ztailpacking = map->m_flags & EROFS_MAP_META; struct z_erofs_pcluster *pcl; - struct z_erofs_collection *cl; - unsigned int length; - - grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT); - if (!grp) - return -ENOENT; - - pcl = container_of(grp, struct z_erofs_pcluster, obj); - if (clt->owned_head == &pcl->next || pcl == clt->tailpcl) { - DBG_BUGON(1); - erofs_workgroup_put(grp); - return -EFSCORRUPTED; - } + struct erofs_workgroup *grp; + int err; - cl = z_erofs_primarycollection(pcl); - if (cl->pageofs != (map->m_la & ~PAGE_MASK)) { + if (!(map->m_flags & EROFS_MAP_ENCODED)) { DBG_BUGON(1); - erofs_workgroup_put(grp); return -EFSCORRUPTED; } - length = READ_ONCE(pcl->length); - if (length & Z_EROFS_PCLUSTER_FULL_LENGTH) { - if ((map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) > length) { - DBG_BUGON(1); - erofs_workgroup_put(grp); - return -EFSCORRUPTED; - } - } else { - unsigned int llen = map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT; - - if (map->m_flags & EROFS_MAP_FULL_MAPPED) - llen |= Z_EROFS_PCLUSTER_FULL_LENGTH; - - while (llen > length && - length != cmpxchg_relaxed(&pcl->length, length, llen)) { - cpu_relax(); - length = READ_ONCE(pcl->length); - } - } - mutex_lock(&cl->lock); - /* used to check tail merging loop due to corrupted images */ - if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL) - clt->tailpcl = pcl; - clt->mode = try_to_claim_pcluster(pcl, &clt->owned_head); - /* clean tailpcl if the current owned_head is Z_EROFS_PCLUSTER_TAIL */ - if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL) - clt->tailpcl = NULL; - clt->pcl = pcl; - clt->cl = cl; - return 0; -} + /* no available pcluster, let's allocate one */ + pcl = z_erofs_alloc_pcluster(ztailpacking ? 1 : + map->m_plen >> PAGE_SHIFT); + if (IS_ERR(pcl)) + return PTR_ERR(pcl); -static int z_erofs_register_collection(struct z_erofs_collector *clt, - struct inode *inode, - struct erofs_map_blocks *map) -{ - struct z_erofs_pcluster *pcl; - struct z_erofs_collection *cl; - int err; - - /* no available workgroup, let's allocate one */ - pcl = kmem_cache_alloc(pcluster_cachep, GFP_NOFS); - if (!pcl) - return -ENOMEM; - - z_erofs_pcluster_init_always(pcl); - pcl->obj.index = map->m_pa >> PAGE_SHIFT; - - pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) | - (map->m_flags & EROFS_MAP_FULL_MAPPED ? 
- Z_EROFS_PCLUSTER_FULL_LENGTH : 0); - - if (map->m_flags & EROFS_MAP_ZIPPED) - pcl->algorithmformat = Z_EROFS_COMPRESSION_LZ4; - else - pcl->algorithmformat = Z_EROFS_COMPRESSION_SHIFTED; - - pcl->clusterbits = EROFS_I(inode)->z_physical_clusterbits[0]; - pcl->clusterbits -= PAGE_SHIFT; + atomic_set(&pcl->obj.refcount, 1); + pcl->algorithmformat = map->m_algorithmformat; + pcl->length = 0; + pcl->partial = true; /* new pclusters should be claimed as type 1, primary and followed */ - pcl->next = clt->owned_head; - clt->mode = COLLECT_PRIMARY_FOLLOWED; - - cl = z_erofs_primarycollection(pcl); - cl->pageofs = map->m_la & ~PAGE_MASK; + pcl->next = fe->owned_head; + pcl->pageofs_out = map->m_la & ~PAGE_MASK; + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED; /* * lock all primary followed works before visible to others * and mutex_trylock *never* fails for a new pcluster. */ - mutex_trylock(&cl->lock); + mutex_init(&pcl->lock); + DBG_BUGON(!mutex_trylock(&pcl->lock)); - err = erofs_register_workgroup(inode->i_sb, &pcl->obj); - if (err) { - mutex_unlock(&cl->lock); - kmem_cache_free(pcluster_cachep, pcl); - return -EAGAIN; + if (ztailpacking) { + pcl->obj.index = 0; /* which indicates ztailpacking */ + pcl->pageofs_in = erofs_blkoff(map->m_pa); + pcl->tailpacking_size = map->m_plen; + } else { + pcl->obj.index = map->m_pa >> PAGE_SHIFT; + + grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj); + if (IS_ERR(grp)) { + err = PTR_ERR(grp); + goto err_out; + } + + if (grp != &pcl->obj) { + fe->pcl = container_of(grp, + struct z_erofs_pcluster, obj); + err = -EEXIST; + goto err_out; + } } /* used to check tail merging loop due to corrupted images */ - if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL) - clt->tailpcl = pcl; - clt->owned_head = &pcl->next; - clt->pcl = pcl; - clt->cl = cl; + if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL) + fe->tailpcl = pcl; + fe->owned_head = &pcl->next; + fe->pcl = pcl; return 0; + +err_out: + mutex_unlock(&pcl->lock); + z_erofs_free_pcluster(pcl); + return err; } -static int z_erofs_collector_begin(struct z_erofs_collector *clt, - struct inode *inode, - struct erofs_map_blocks *map) +static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe) { + struct erofs_map_blocks *map = &fe->map; + struct erofs_workgroup *grp = NULL; int ret; - DBG_BUGON(clt->cl); + DBG_BUGON(fe->pcl); - /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous collection */ - DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_NIL); - DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); + /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */ + DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL); + DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); - if (!PAGE_ALIGNED(map->m_pa)) { + if (!(map->m_flags & EROFS_MAP_META)) { + grp = erofs_find_workgroup(fe->inode->i_sb, + map->m_pa >> PAGE_SHIFT); + } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) { DBG_BUGON(1); - return -EINVAL; + return -EFSCORRUPTED; } -repeat: - ret = z_erofs_lookup_collection(clt, inode, map); - if (ret == -ENOENT) { - ret = z_erofs_register_collection(clt, inode, map); - - /* someone registered at the same time, give another try */ - if (ret == -EAGAIN) { - cond_resched(); - goto repeat; - } + if (grp) { + fe->pcl = container_of(grp, struct z_erofs_pcluster, obj); + ret = -EEXIST; + } else { + ret = z_erofs_register_pcluster(fe); } - if (ret) - return ret; - - z_erofs_pagevec_ctor_init(&clt->vector, Z_EROFS_NR_INLINE_PAGEVECS, - clt->cl->pagevec, clt->cl->vcnt); + if (ret == -EEXIST) { + 
mutex_lock(&fe->pcl->lock); + /* used to check tail merging loop due to corrupted images */ + if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL) + fe->tailpcl = fe->pcl; - clt->compressedpages = clt->pcl->compressed_pages; - if (clt->mode <= COLLECT_PRIMARY) /* cannot do in-place I/O */ - clt->compressedpages += Z_EROFS_CLUSTER_MAX_PAGES; + z_erofs_try_to_claim_pcluster(fe); + } else if (ret) { + return ret; + } + z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset, + Z_EROFS_INLINE_BVECS, fe->pcl->vcnt); + /* since file-backed online pages are traversed in reverse order */ + fe->icur = z_erofs_pclusterpages(fe->pcl); return 0; } @@ -499,49 +599,41 @@ repeat: */ static void z_erofs_rcu_callback(struct rcu_head *head) { - struct z_erofs_collection *const cl = - container_of(head, struct z_erofs_collection, rcu); - - kmem_cache_free(pcluster_cachep, - container_of(cl, struct z_erofs_pcluster, - primary_collection)); + z_erofs_free_pcluster(container_of(head, + struct z_erofs_pcluster, rcu)); } void erofs_workgroup_free_rcu(struct erofs_workgroup *grp) { struct z_erofs_pcluster *const pcl = container_of(grp, struct z_erofs_pcluster, obj); - struct z_erofs_collection *const cl = z_erofs_primarycollection(pcl); - - call_rcu(&cl->rcu, z_erofs_rcu_callback); -} - -static void z_erofs_collection_put(struct z_erofs_collection *cl) -{ - struct z_erofs_pcluster *const pcl = - container_of(cl, struct z_erofs_pcluster, primary_collection); - erofs_workgroup_put(&pcl->obj); + call_rcu(&pcl->rcu, z_erofs_rcu_callback); } -static bool z_erofs_collector_end(struct z_erofs_collector *clt) +static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe) { - struct z_erofs_collection *cl = clt->cl; + struct z_erofs_pcluster *pcl = fe->pcl; - if (!cl) + if (!pcl) return false; - z_erofs_pagevec_ctor_exit(&clt->vector, false); - mutex_unlock(&cl->lock); + z_erofs_bvec_iter_end(&fe->biter); + mutex_unlock(&pcl->lock); + + if (fe->candidate_bvpage) { + DBG_BUGON(z_erofs_is_shortlived_page(fe->candidate_bvpage)); + fe->candidate_bvpage = NULL; + } /* * if all pending pages are added, don't hold its reference * any longer if the pcluster isn't hosted by ourselves. 
*/ - if (clt->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE) - z_erofs_collection_put(cl); + if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE) + erofs_workgroup_put(&pcl->obj); - clt->cl = NULL; + fe->pcl = NULL; return true; } @@ -559,20 +651,46 @@ static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe, la < fe->headoffset; } +static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos, + struct page *page, unsigned int pageofs, + unsigned int len) +{ + struct inode *packed_inode = EROFS_I_SB(inode)->packed_inode; + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; + u8 *src, *dst; + unsigned int i, cnt; + + pos += EROFS_I(inode)->z_fragmentoff; + for (i = 0; i < len; i += cnt) { + cnt = min_t(unsigned int, len - i, + EROFS_BLKSIZ - erofs_blkoff(pos)); + src = erofs_bread(&buf, packed_inode, + erofs_blknr(pos), EROFS_KMAP); + if (IS_ERR(src)) { + erofs_put_metabuf(&buf); + return PTR_ERR(src); + } + + dst = kmap_local_page(page); + memcpy(dst + pageofs + i, src + erofs_blkoff(pos), cnt); + kunmap_local(dst); + pos += cnt; + } + erofs_put_metabuf(&buf); + return 0; +} + static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, - struct page *page, - struct list_head *pagepool) + struct page *page, struct page **pagepool) { struct inode *const inode = fe->inode; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); struct erofs_map_blocks *const map = &fe->map; - struct z_erofs_collector *const clt = &fe->clt; const loff_t offset = page_offset(page); - bool tight = true; + bool tight = true, exclusive; enum z_erofs_cache_alloctype cache_strategy; - enum z_erofs_page_type page_type; - unsigned int cur, end, spiltted, index; + unsigned int cur, end, spiltted; int err = 0; /* register locked file pages as online pages in pack */ @@ -583,379 +701,456 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, repeat: cur = end - 1; - /* lucky, within the range of the current map_blocks */ - if (offset + cur >= map->m_la && - offset + cur < map->m_la + map->m_llen) { - /* didn't get a valid collection previously (very rare) */ - if (!clt->cl) - goto restart_now; - goto hitted; - } - - /* go ahead the next map_blocks */ - erofs_dbg("%s: [out-of-range] pos %llu", __func__, offset + cur); + if (offset + cur < map->m_la || + offset + cur >= map->m_la + map->m_llen) { + erofs_dbg("out-of-range map @ pos %llu", offset + cur); - if (z_erofs_collector_end(clt)) - fe->backmost = false; - - map->m_la = offset + cur; - map->m_llen = 0; - err = z_erofs_map_blocks_iter(inode, map, 0); - if (err) - goto err_out; + if (z_erofs_collector_end(fe)) + fe->backmost = false; + map->m_la = offset + cur; + map->m_llen = 0; + err = z_erofs_map_blocks_iter(inode, map, 0); + if (err) + goto out; + } else { + if (fe->pcl) + goto hitted; + /* didn't get a valid pcluster previously (very rare) */ + } -restart_now: - if (!(map->m_flags & EROFS_MAP_MAPPED)) + if (!(map->m_flags & EROFS_MAP_MAPPED) || + map->m_flags & EROFS_MAP_FRAGMENT) goto hitted; - err = z_erofs_collector_begin(clt, inode, map); + err = z_erofs_collector_begin(fe); if (err) - goto err_out; + goto out; - /* preload all compressed pages (maybe downgrade role if necessary) */ - if (should_alloc_managed_pages(fe, sbi->cache_strategy, map->m_la)) - cache_strategy = DELAYEDALLOC; - else - cache_strategy = DONTALLOC; + if (z_erofs_is_inline_pcluster(fe->pcl)) { + void *mp; - preload_compressed_pages(clt, MNGD_MAPPING(sbi), - cache_strategy, pagepool); + mp = erofs_read_metabuf(&fe->map.buf, inode->i_sb, + 
erofs_blknr(map->m_pa), EROFS_NO_KMAP); + if (IS_ERR(mp)) { + err = PTR_ERR(mp); + erofs_err(inode->i_sb, + "failed to get inline page, err %d", err); + goto out; + } + get_page(fe->map.buf.page); + WRITE_ONCE(fe->pcl->compressed_bvecs[0].page, + fe->map.buf.page); + fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE; + } else { + /* bind cache first when cached decompression is preferred */ + if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy, + map->m_la)) + cache_strategy = TRYALLOC; + else + cache_strategy = DONTALLOC; + z_erofs_bind_cache(fe, cache_strategy, pagepool); + } hitted: /* * Ensure the current partial page belongs to this submit chain rather * than other concurrent submit chains or the noio(bypass) chain since * those chains are handled asynchronously thus the page cannot be used - * for inplace I/O or pagevec (should be processed in strict order.) + * for inplace I/O or bvpage (should be processed in a strict order.) */ - tight &= (clt->mode >= COLLECT_PRIMARY_HOOKED && - clt->mode != COLLECT_PRIMARY_FOLLOWED_NOINPLACE); + tight &= (fe->mode >= Z_EROFS_PCLUSTER_HOOKED && + fe->mode != Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); cur = end - min_t(unsigned int, offset + end - map->m_la, end); if (!(map->m_flags & EROFS_MAP_MAPPED)) { zero_user_segment(page, cur, end); goto next_part; } + if (map->m_flags & EROFS_MAP_FRAGMENT) { + unsigned int pageofs, skip, len; - /* let's derive page type */ - page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD : - (!spiltted ? Z_EROFS_PAGE_TYPE_EXCLUSIVE : - (tight ? Z_EROFS_PAGE_TYPE_EXCLUSIVE : - Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED)); + if (offset > map->m_la) { + pageofs = 0; + skip = offset - map->m_la; + } else { + pageofs = map->m_la & ~PAGE_MASK; + skip = 0; + } + len = min_t(unsigned int, map->m_llen - skip, end - cur); + err = z_erofs_read_fragment(inode, skip, page, pageofs, len); + if (err) + goto out; + ++spiltted; + tight = false; + goto next_part; + } + exclusive = (!cur && (!spiltted || tight)); if (cur) - tight &= (clt->mode >= COLLECT_PRIMARY_FOLLOWED); + tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED); retry: - err = z_erofs_attach_page(clt, page, page_type); - /* should allocate an additional staging page for pagevec */ - if (err == -EAGAIN) { - struct page *const newpage = - erofs_allocpage(pagepool, GFP_NOFS | __GFP_NOFAIL); - - newpage->mapping = Z_EROFS_MAPPING_STAGING; - err = z_erofs_attach_page(clt, newpage, - Z_EROFS_PAGE_TYPE_EXCLUSIVE); - if (!err) - goto retry; + err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) { + .page = page, + .offset = offset - map->m_la, + .end = end, + }), exclusive); + /* should allocate an additional short-lived page for bvset */ + if (err == -EAGAIN && !fe->candidate_bvpage) { + fe->candidate_bvpage = alloc_page(GFP_NOFS | __GFP_NOFAIL); + set_page_private(fe->candidate_bvpage, + Z_EROFS_SHORTLIVED_PAGE); + goto retry; } - if (err) - goto err_out; - - index = page->index - (map->m_la >> PAGE_SHIFT); - - z_erofs_onlinepage_fixup(page, index, true); + if (err) { + DBG_BUGON(err == -EAGAIN && fe->candidate_bvpage); + goto out; + } + z_erofs_onlinepage_split(page); /* bump up the number of spiltted parts of a page */ ++spiltted; - /* also update nr_pages */ - clt->cl->nr_pages = max_t(pgoff_t, clt->cl->nr_pages, index + 1); + if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) + fe->pcl->multibases = true; + if (fe->pcl->length < offset + end - map->m_la) { + fe->pcl->length = offset + end - map->m_la; + fe->pcl->pageofs_out = map->m_la & ~PAGE_MASK; + } + if ((map->m_flags & 
EROFS_MAP_FULL_MAPPED) && + !(map->m_flags & EROFS_MAP_PARTIAL_REF) && + fe->pcl->length == map->m_llen) + fe->pcl->partial = false; next_part: - /* can be used for verification */ + /* shorten the remaining extent to update progress */ map->m_llen = offset + cur - map->m_la; + map->m_flags &= ~EROFS_MAP_FULL_MAPPED; end = cur; if (end > 0) goto repeat; out: + if (err) + z_erofs_page_mark_eio(page); z_erofs_onlinepage_endio(page); erofs_dbg("%s, finish page: %pK spiltted: %u map->m_llen %llu", __func__, page, spiltted, map->m_llen); return err; - - /* if some error occurred while processing this page */ -err_out: - SetPageError(page); - goto out; } -static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, - bool sync, int bios) +static bool z_erofs_get_sync_decompress_policy(struct erofs_sb_info *sbi, + unsigned int readahead_pages) { - /* wake up the caller thread for sync decompression */ - if (sync) { - unsigned long flags; + /* auto: enable for read_folio, disable for readahead */ + if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) && + !readahead_pages) + return true; - spin_lock_irqsave(&io->u.wait.lock, flags); - if (!atomic_add_return(bios, &io->pending_bios)) - wake_up_locked(&io->u.wait); - spin_unlock_irqrestore(&io->u.wait.lock, flags); - return; - } + if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_FORCE_ON) && + (readahead_pages <= sbi->opt.max_sync_decompress_pages)) + return true; - if (!atomic_add_return(bios, &io->pending_bios)) - queue_work(z_erofs_workqueue, &io->u.work); + return false; } -static void z_erofs_decompressqueue_endio(struct bio *bio) +static bool z_erofs_page_is_invalidated(struct page *page) { - tagptr1_t t = tagptr_init(tagptr1_t, bio->bi_private); - struct z_erofs_decompressqueue *q = tagptr_unfold_ptr(t); - blk_status_t err = bio->bi_status; - struct bio_vec *bvec; - struct bvec_iter_all iter_all; + return !page->mapping && !z_erofs_is_shortlived_page(page); +} - bio_for_each_segment_all(bvec, bio, iter_all) { - struct page *page = bvec->bv_page; +struct z_erofs_decompress_backend { + struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES]; + struct super_block *sb; + struct z_erofs_pcluster *pcl; - DBG_BUGON(PageUptodate(page)); - DBG_BUGON(!page->mapping); + /* pages with the longest decompressed length for deduplication */ + struct page **decompressed_pages; + /* pages to keep the compressed data */ + struct page **compressed_pages; - if (err) - SetPageError(page); + struct list_head decompressed_secondary_bvecs; + struct page **pagepool; + unsigned int onstack_used, nr_pages; +}; - if (erofs_page_is_managed(EROFS_SB(q->sb), page)) { - if (!err) - SetPageUptodate(page); - unlock_page(page); - } - } - z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1); - bio_put(bio); -} +struct z_erofs_bvec_item { + struct z_erofs_bvec bvec; + struct list_head list; +}; -static int z_erofs_decompress_pcluster(struct super_block *sb, - struct z_erofs_pcluster *pcl, - struct list_head *pagepool) +static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be, + struct z_erofs_bvec *bvec) { - struct erofs_sb_info *const sbi = EROFS_SB(sb); - const unsigned int clusterpages = BIT(pcl->clusterbits); - struct z_erofs_pagevec_ctor ctor; - unsigned int i, outputsize, llen, nr_pages; - struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES]; - struct page **pages, **compressed_pages, *page; - - enum z_erofs_page_type page_type; - bool overlapped, partial; - struct z_erofs_collection *cl; - int err; + struct 
z_erofs_bvec_item *item; - might_sleep(); - cl = z_erofs_primarycollection(pcl); - DBG_BUGON(!READ_ONCE(cl->nr_pages)); + if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK)) { + unsigned int pgnr; - mutex_lock(&cl->lock); - nr_pages = cl->nr_pages; - - if (nr_pages <= Z_EROFS_VMAP_ONSTACK_PAGES) { - pages = pages_onstack; - } else if (nr_pages <= Z_EROFS_VMAP_GLOBAL_PAGES && - mutex_trylock(&z_pagemap_global_lock)) { - pages = z_pagemap_global; - } else { - gfp_t gfp_flags = GFP_KERNEL; - - if (nr_pages > Z_EROFS_VMAP_GLOBAL_PAGES) - gfp_flags |= __GFP_NOFAIL; + pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT; + DBG_BUGON(pgnr >= be->nr_pages); + if (!be->decompressed_pages[pgnr]) { + be->decompressed_pages[pgnr] = bvec->page; + return; + } + } - pages = kvmalloc_array(nr_pages, sizeof(struct page *), - gfp_flags); + /* (cold path) one pcluster is requested multiple times */ + item = kmalloc(sizeof(*item), GFP_KERNEL | __GFP_NOFAIL); + item->bvec = *bvec; + list_add(&item->list, &be->decompressed_secondary_bvecs); +} - /* fallback to global pagemap for the lowmem scenario */ - if (!pages) { - mutex_lock(&z_pagemap_global_lock); - pages = z_pagemap_global; +static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be, + int err) +{ + unsigned int off0 = be->pcl->pageofs_out; + struct list_head *p, *n; + + list_for_each_safe(p, n, &be->decompressed_secondary_bvecs) { + struct z_erofs_bvec_item *bvi; + unsigned int end, cur; + void *dst, *src; + + bvi = container_of(p, struct z_erofs_bvec_item, list); + cur = bvi->bvec.offset < 0 ? -bvi->bvec.offset : 0; + end = min_t(unsigned int, be->pcl->length - bvi->bvec.offset, + bvi->bvec.end); + dst = kmap_local_page(bvi->bvec.page); + while (cur < end) { + unsigned int pgnr, scur, len; + + pgnr = (bvi->bvec.offset + cur + off0) >> PAGE_SHIFT; + DBG_BUGON(pgnr >= be->nr_pages); + + scur = bvi->bvec.offset + cur - + ((pgnr << PAGE_SHIFT) - off0); + len = min_t(unsigned int, end - cur, PAGE_SIZE - scur); + if (!be->decompressed_pages[pgnr]) { + err = -EFSCORRUPTED; + cur += len; + continue; + } + src = kmap_local_page(be->decompressed_pages[pgnr]); + memcpy(dst + cur, src + scur, len); + kunmap_local(src); + cur += len; } + kunmap_local(dst); + if (err) + z_erofs_page_mark_eio(bvi->bvec.page); + z_erofs_onlinepage_endio(bvi->bvec.page); + list_del(p); + kfree(bvi); } +} - for (i = 0; i < nr_pages; ++i) - pages[i] = NULL; +static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be) +{ + struct z_erofs_pcluster *pcl = be->pcl; + struct z_erofs_bvec_iter biter; + struct page *old_bvpage; + int i; - err = 0; - z_erofs_pagevec_ctor_init(&ctor, Z_EROFS_NR_INLINE_PAGEVECS, - cl->pagevec, 0); + z_erofs_bvec_iter_begin(&biter, &pcl->bvset, Z_EROFS_INLINE_BVECS, 0); + for (i = 0; i < pcl->vcnt; ++i) { + struct z_erofs_bvec bvec; - for (i = 0; i < cl->vcnt; ++i) { - unsigned int pagenr; + z_erofs_bvec_dequeue(&biter, &bvec, &old_bvpage); - page = z_erofs_pagevec_dequeue(&ctor, &page_type); + if (old_bvpage) + z_erofs_put_shortlivedpage(be->pagepool, old_bvpage); - /* all pages in pagevec ought to be valid */ - DBG_BUGON(!page); - DBG_BUGON(!page->mapping); + DBG_BUGON(z_erofs_page_is_invalidated(bvec.page)); + z_erofs_do_decompressed_bvec(be, &bvec); + } - if (z_erofs_put_stagingpage(pagepool, page)) - continue; + old_bvpage = z_erofs_bvec_iter_end(&biter); + if (old_bvpage) + z_erofs_put_shortlivedpage(be->pagepool, old_bvpage); +} - if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD) - pagenr = 0; - else - pagenr = 
z_erofs_onlinepage_index(page); +static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be, + bool *overlapped) +{ + struct z_erofs_pcluster *pcl = be->pcl; + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); + int i, err = 0; - DBG_BUGON(pagenr >= nr_pages); + *overlapped = false; + for (i = 0; i < pclusterpages; ++i) { + struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i]; + struct page *page = bvec->page; - /* - * currently EROFS doesn't support multiref(dedup), - * so here erroring out one multiref page. - */ - if (pages[pagenr]) { + /* compressed pages ought to be present before decompressing */ + if (!page) { DBG_BUGON(1); - SetPageError(pages[pagenr]); - z_erofs_onlinepage_endio(pages[pagenr]); - err = -EFSCORRUPTED; + continue; } - pages[pagenr] = page; - } - z_erofs_pagevec_ctor_exit(&ctor, true); - - overlapped = false; - compressed_pages = pcl->compressed_pages; - - for (i = 0; i < clusterpages; ++i) { - unsigned int pagenr; - - page = compressed_pages[i]; + be->compressed_pages[i] = page; - /* all compressed pages ought to be valid */ - DBG_BUGON(!page); - DBG_BUGON(!page->mapping); + if (z_erofs_is_inline_pcluster(pcl)) { + if (!PageUptodate(page)) + err = -EIO; + continue; + } - if (!z_erofs_page_is_staging(page)) { - if (erofs_page_is_managed(sbi, page)) { + DBG_BUGON(z_erofs_page_is_invalidated(page)); + if (!z_erofs_is_shortlived_page(page)) { + if (erofs_page_is_managed(EROFS_SB(be->sb), page)) { if (!PageUptodate(page)) err = -EIO; continue; } - - /* - * only if non-head page can be selected - * for inplace decompression - */ - pagenr = z_erofs_onlinepage_index(page); - - DBG_BUGON(pagenr >= nr_pages); - if (pages[pagenr]) { - DBG_BUGON(1); - SetPageError(pages[pagenr]); - z_erofs_onlinepage_endio(pages[pagenr]); - err = -EFSCORRUPTED; - } - pages[pagenr] = page; - - overlapped = true; + z_erofs_do_decompressed_bvec(be, bvec); + *overlapped = true; } + } - /* PG_error needs checking for inplaced and staging pages */ - if (PageError(page)) { - DBG_BUGON(PageUptodate(page)); - err = -EIO; - } + if (err) + return err; + return 0; +} + +static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be, + int err) +{ + struct erofs_sb_info *const sbi = EROFS_SB(be->sb); + struct z_erofs_pcluster *pcl = be->pcl; + unsigned int pclusterpages = z_erofs_pclusterpages(pcl); + unsigned int i, inputsize; + int err2; + struct page *page; + bool overlapped; + + mutex_lock(&pcl->lock); + be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT; + + /* allocate (de)compressed page arrays if cannot be kept on stack */ + be->decompressed_pages = NULL; + be->compressed_pages = NULL; + be->onstack_used = 0; + if (be->nr_pages <= Z_EROFS_ONSTACK_PAGES) { + be->decompressed_pages = be->onstack_pages; + be->onstack_used = be->nr_pages; + memset(be->decompressed_pages, 0, + sizeof(struct page *) * be->nr_pages); } + if (pclusterpages + be->onstack_used <= Z_EROFS_ONSTACK_PAGES) + be->compressed_pages = be->onstack_pages + be->onstack_used; + + if (!be->decompressed_pages) + be->decompressed_pages = + kvcalloc(be->nr_pages, sizeof(struct page *), + GFP_KERNEL | __GFP_NOFAIL); + if (!be->compressed_pages) + be->compressed_pages = + kvcalloc(pclusterpages, sizeof(struct page *), + GFP_KERNEL | __GFP_NOFAIL); + + z_erofs_parse_out_bvecs(be); + err2 = z_erofs_parse_in_bvecs(be, &overlapped); + if (err2) + err = err2; if (err) goto out; - llen = pcl->length >> Z_EROFS_PCLUSTER_LENGTH_BIT; - if (nr_pages << PAGE_SHIFT >= cl->pageofs + llen) { - 
outputsize = llen; - partial = !(pcl->length & Z_EROFS_PCLUSTER_FULL_LENGTH); - } else { - outputsize = (nr_pages << PAGE_SHIFT) - cl->pageofs; - partial = true; - } + if (z_erofs_is_inline_pcluster(pcl)) + inputsize = pcl->tailpacking_size; + else + inputsize = pclusterpages * PAGE_SIZE; err = z_erofs_decompress(&(struct z_erofs_decompress_req) { - .sb = sb, - .in = compressed_pages, - .out = pages, - .pageofs_out = cl->pageofs, - .inputsize = PAGE_SIZE, - .outputsize = outputsize, + .sb = be->sb, + .in = be->compressed_pages, + .out = be->decompressed_pages, + .pageofs_in = pcl->pageofs_in, + .pageofs_out = pcl->pageofs_out, + .inputsize = inputsize, + .outputsize = pcl->length, .alg = pcl->algorithmformat, .inplace_io = overlapped, - .partial_decoding = partial - }, pagepool); + .partial_decoding = pcl->partial, + .fillgaps = pcl->multibases, + }, be->pagepool); out: - /* must handle all compressed pages before endding pages */ - for (i = 0; i < clusterpages; ++i) { - page = compressed_pages[i]; - - if (erofs_page_is_managed(sbi, page)) - continue; + /* must handle all compressed pages before actual file pages */ + if (z_erofs_is_inline_pcluster(pcl)) { + page = pcl->compressed_bvecs[0].page; + WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL); + put_page(page); + } else { + for (i = 0; i < pclusterpages; ++i) { + page = pcl->compressed_bvecs[i].page; - /* recycle all individual staging pages */ - (void)z_erofs_put_stagingpage(pagepool, page); + if (erofs_page_is_managed(sbi, page)) + continue; - WRITE_ONCE(compressed_pages[i], NULL); + /* recycle all individual short-lived pages */ + (void)z_erofs_put_shortlivedpage(be->pagepool, page); + WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL); + } } + if (be->compressed_pages < be->onstack_pages || + be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES) + kvfree(be->compressed_pages); + z_erofs_fill_other_copies(be, err); - for (i = 0; i < nr_pages; ++i) { - page = pages[i]; + for (i = 0; i < be->nr_pages; ++i) { + page = be->decompressed_pages[i]; if (!page) continue; - DBG_BUGON(!page->mapping); + DBG_BUGON(z_erofs_page_is_invalidated(page)); - /* recycle all individual staging pages */ - if (z_erofs_put_stagingpage(pagepool, page)) + /* recycle all individual short-lived pages */ + if (z_erofs_put_shortlivedpage(be->pagepool, page)) continue; - - if (err < 0) - SetPageError(page); - + if (err) + z_erofs_page_mark_eio(page); z_erofs_onlinepage_endio(page); } - if (pages == z_pagemap_global) - mutex_unlock(&z_pagemap_global_lock); - else if (pages != pages_onstack) - kvfree(pages); + if (be->decompressed_pages != be->onstack_pages) + kvfree(be->decompressed_pages); - cl->nr_pages = 0; - cl->vcnt = 0; + pcl->length = 0; + pcl->partial = true; + pcl->multibases = false; + pcl->bvset.nextpage = NULL; + pcl->vcnt = 0; - /* all cl locks MUST be taken before the following line */ + /* pcluster lock MUST be taken before the following line */ WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL); - - /* all cl locks SHOULD be released right now */ - mutex_unlock(&cl->lock); - - z_erofs_collection_put(cl); + mutex_unlock(&pcl->lock); return err; } static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, - struct list_head *pagepool) + struct page **pagepool) { + struct z_erofs_decompress_backend be = { + .sb = io->sb, + .pagepool = pagepool, + .decompressed_secondary_bvecs = + LIST_HEAD_INIT(be.decompressed_secondary_bvecs), + }; z_erofs_next_pcluster_t owned = io->head; while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) { - struct 
z_erofs_pcluster *pcl; - - /* no possible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */ + /* impossible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */ DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL); - - /* no possible that 'owned' equals NULL */ + /* impossible that 'owned' equals Z_EROFS_PCLUSTER_NIL */ DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL); - pcl = container_of(owned, struct z_erofs_pcluster, next); - owned = READ_ONCE(pcl->next); + be.pcl = container_of(owned, struct z_erofs_pcluster, next); + owned = READ_ONCE(be.pcl->next); - z_erofs_decompress_pcluster(io->sb, pcl, pagepool); + z_erofs_decompress_pcluster(&be, io->eio ? -EIO : 0); + erofs_workgroup_put(&be.pcl->obj); } } @@ -963,22 +1158,47 @@ static void z_erofs_decompressqueue_work(struct work_struct *work) { struct z_erofs_decompressqueue *bgq = container_of(work, struct z_erofs_decompressqueue, u.work); - LIST_HEAD(pagepool); + struct page *pagepool = NULL; DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED); z_erofs_decompress_queue(bgq, &pagepool); - put_pages_list(&pagepool); + erofs_release_pages(&pagepool); kvfree(bgq); } +static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, + bool sync, int bios) +{ + struct erofs_sb_info *const sbi = EROFS_SB(io->sb); + + /* wake up the caller thread for sync decompression */ + if (sync) { + if (!atomic_add_return(bios, &io->pending_bios)) + complete(&io->u.done); + return; + } + + if (atomic_add_return(bios, &io->pending_bios)) + return; + /* Use workqueue and sync decompression for atomic contexts only */ + if (in_atomic() || irqs_disabled()) { + queue_work(z_erofs_workqueue, &io->u.work); + /* enable sync decompression for readahead */ + if (sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) + sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON; + return; + } + z_erofs_decompressqueue_work(&io->u.work); +} + static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, unsigned int nr, - struct list_head *pagepool, - struct address_space *mc, - gfp_t gfp) + struct page **pagepool, + struct address_space *mc) { const pgoff_t index = pcl->obj.index; + gfp_t gfp = mapping_gfp_mask(mc); bool tocache = false; struct address_space *mapping; @@ -988,36 +1208,41 @@ static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, int justfound; repeat: - page = READ_ONCE(pcl->compressed_pages[nr]); + page = READ_ONCE(pcl->compressed_bvecs[nr].page); oldpage = page; if (!page) goto out_allocpage; - /* - * the cached page has not been allocated and - * an placeholder is out there, prepare it now. - */ - if (page == PAGE_UNALLOCATED) { - tocache = true; - goto out_allocpage; - } - /* process the target tagged pointer */ t = tagptr_init(compressed_page_t, page); justfound = tagptr_unfold_tags(t); page = tagptr_unfold_ptr(t); + /* + * preallocated cached pages, which is used to avoid direct reclaim + * otherwise, it will go inplace I/O path instead. + */ + if (page->private == Z_EROFS_PREALLOCATED_PAGE) { + WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); + set_page_private(page, 0); + tocache = true; + goto out_tocache; + } mapping = READ_ONCE(page->mapping); /* - * unmanaged (file) pages are all locked solidly, + * file-backed online pages in plcuster are all locked steady, * therefore it is impossible for `mapping' to be NULL. 
*/ if (mapping && mapping != mc) /* ought to be unmanaged pages */ goto out; + /* directly return for shortlived page as well */ + if (z_erofs_is_shortlived_page(page)) + goto out; + lock_page(page); /* only true if page reclaim goes wrong, should never happen */ @@ -1025,14 +1250,13 @@ repeat: /* the page is still in manage cache */ if (page->mapping == mc) { - WRITE_ONCE(pcl->compressed_pages[nr], page); + WRITE_ONCE(pcl->compressed_bvecs[nr].page, page); - ClearPageError(page); if (!PagePrivate(page)) { /* * impossible to be !PagePrivate(page) for * the current restriction as well if - * the page is already in compressed_pages[]. + * the page is already in compressed_bvecs[]. */ DBG_BUGON(!justfound); @@ -1061,25 +1285,22 @@ repeat: put_page(page); out_allocpage: page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL); - if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) { - /* non-LRU / non-movable temporary page is needed */ - page->mapping = Z_EROFS_MAPPING_STAGING; - tocache = false; - } - - if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) { - if (tocache) { - /* since it added to managed cache successfully */ - unlock_page(page); - put_page(page); - } else { - list_add(&page->lru, pagepool); - } + if (oldpage != cmpxchg(&pcl->compressed_bvecs[nr].page, + oldpage, page)) { + erofs_pagepool_add(pagepool, page); cond_resched(); goto repeat; } - set_page_private(page, (unsigned long)pcl); - SetPagePrivate(page); +out_tocache: + if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) { + /* turn into temporary page if fails (1 ref) */ + set_page_private(page, Z_EROFS_SHORTLIVED_PAGE); + goto out; + } + attach_page_private(page, pcl); + /* drop a refcount added by allocpage (then we have 2 refs here) */ + put_page(page); + out: /* the only exit (for tracing and debugging) */ return page; } @@ -1100,8 +1321,9 @@ jobqueue_init(struct super_block *sb, } else { fg_out: q = fgq; - init_waitqueue_head(&fgq->u.wait); + init_completion(&fgq->u.done); atomic_set(&fgq->pending_bios, 0); + q->eio = false; } q->sb = sb; q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED; @@ -1148,20 +1370,50 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, qtail[JQ_BYPASS] = &pcl->next; } -static void z_erofs_submit_queue(struct super_block *sb, - z_erofs_next_pcluster_t owned_head, - struct list_head *pagepool, +static void z_erofs_decompressqueue_endio(struct bio *bio) +{ + tagptr1_t t = tagptr_init(tagptr1_t, bio->bi_private); + struct z_erofs_decompressqueue *q = tagptr_unfold_ptr(t); + blk_status_t err = bio->bi_status; + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bvec, bio, iter_all) { + struct page *page = bvec->bv_page; + + DBG_BUGON(PageUptodate(page)); + DBG_BUGON(z_erofs_page_is_invalidated(page)); + + if (erofs_page_is_managed(EROFS_SB(q->sb), page)) { + if (!err) + SetPageUptodate(page); + unlock_page(page); + } + } + if (err) + q->eio = true; + z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1); + bio_put(bio); +} + +static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, + struct page **pagepool, struct z_erofs_decompressqueue *fgq, bool *force_fg) { - struct erofs_sb_info *const sbi = EROFS_SB(sb); + struct super_block *sb = f->inode->i_sb; + struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb)); z_erofs_next_pcluster_t qtail[NR_JOBQUEUES]; struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; void *bi_private; - /* since bio will be NULL, no need to initialize last_index */ - pgoff_t 
uninitialized_var(last_index); + z_erofs_next_pcluster_t owned_head = f->owned_head; + /* bio is NULL initially, so no need to initialize last_{index,bdev} */ + pgoff_t last_index; + struct block_device *last_bdev; unsigned int nr_bios = 0; struct bio *bio = NULL; + unsigned long pflags; + int memstall = 0; bi_private = jobqueueset_init(sb, q, fgq, force_fg); qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head; @@ -1171,6 +1423,7 @@ static void z_erofs_submit_queue(struct super_block *sb, q[JQ_SUBMIT]->head = owned_head; do { + struct erofs_map_dev mdev; struct z_erofs_pcluster *pcl; pgoff_t cur, end; unsigned int i = 0; @@ -1182,43 +1435,62 @@ static void z_erofs_submit_queue(struct super_block *sb, pcl = container_of(owned_head, struct z_erofs_pcluster, next); - cur = pcl->obj.index; - end = cur + BIT(pcl->clusterbits); - /* close the main owned chain at first */ owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, Z_EROFS_PCLUSTER_TAIL_CLOSED); + if (z_erofs_is_inline_pcluster(pcl)) { + move_to_bypass_jobqueue(pcl, qtail, owned_head); + continue; + } + + /* no device id here, thus it will always succeed */ + mdev = (struct erofs_map_dev) { + .m_pa = blknr_to_addr(pcl->obj.index), + }; + (void)erofs_map_dev(sb, &mdev); + + cur = erofs_blknr(mdev.m_pa); + end = cur + pcl->pclusterpages; do { struct page *page; - int err; page = pickup_page_for_submission(pcl, i++, pagepool, - MNGD_MAPPING(sbi), - GFP_NOFS); + mc); if (!page) continue; - if (bio && cur != last_index + 1) { + if (bio && (cur != last_index + 1 || + last_bdev != mdev.m_bdev)) { submit_bio_retry: submit_bio(bio); + if (memstall) { + psi_memstall_leave(&pflags); + memstall = 0; + } bio = NULL; } - if (!bio) { - bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); + if (unlikely(PageWorkingset(page)) && !memstall) { + psi_memstall_enter(&pflags); + memstall = 1; + } + if (!bio) { + bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS, + REQ_OP_READ, GFP_NOIO); bio->bi_end_io = z_erofs_decompressqueue_endio; - bio_set_dev(bio, sb->s_bdev); + + last_bdev = mdev.m_bdev; bio->bi_iter.bi_sector = (sector_t)cur << LOG_SECTORS_PER_BLOCK; bio->bi_private = bi_private; - bio->bi_opf = REQ_OP_READ; + if (f->readahead) + bio->bi_opf |= REQ_RAHEAD; ++nr_bios; } - err = bio_add_page(bio, page, PAGE_SIZE, 0); - if (err < PAGE_SIZE) + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) goto submit_bio_retry; last_index = cur; @@ -1231,8 +1503,11 @@ submit_bio_retry: move_to_bypass_jobqueue(pcl, qtail, owned_head); } while (owned_head != Z_EROFS_PCLUSTER_TAIL); - if (bio) + if (bio) { submit_bio(bio); + if (memstall) + psi_memstall_leave(&pflags); + } /* * although background is preferred, no one is pending for submission. 
@@ -1245,15 +1520,14 @@ submit_bio_retry:
 	z_erofs_decompress_kickoff(q[JQ_SUBMIT], *force_fg, nr_bios);
 }
 
-static void z_erofs_runqueue(struct super_block *sb,
-			     struct z_erofs_collector *clt,
-			     struct list_head *pagepool, bool force_fg)
+static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f,
+			     struct page **pagepool, bool force_fg)
 {
 	struct z_erofs_decompressqueue io[NR_JOBQUEUES];
 
-	if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
+	if (f->owned_head == Z_EROFS_PCLUSTER_TAIL)
 		return;
-	z_erofs_submit_queue(sb, clt->owned_head, pagepool, io, &force_fg);
+	z_erofs_submit_queue(f, pagepool, io, &force_fg);
 
 	/* handle bypass queue (no i/o pclusters) immediately */
 	z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool);
@@ -1262,82 +1536,123 @@ static void z_erofs_runqueue(struct super_block *sb,
 		return;
 
 	/* wait until all bios are completed */
-	io_wait_event(io[JQ_SUBMIT].u.wait,
-		      !atomic_read(&io[JQ_SUBMIT].pending_bios));
+	wait_for_completion_io(&io[JQ_SUBMIT].u.done);
 
 	/* handle synchronous decompress queue in the caller context */
 	z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool);
 }
 
-static int z_erofs_readpage(struct file *file, struct page *page)
+/*
+ * Since partial uptodate is still unimplemented for now, we have to use
+ * approximate readmore strategies as a start.
+ */
+static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
+				      struct readahead_control *rac,
+				      erofs_off_t end,
+				      struct page **pagepool,
+				      bool backmost)
+{
+	struct inode *inode = f->inode;
+	struct erofs_map_blocks *map = &f->map;
+	erofs_off_t cur;
+	int err;
+
+	if (backmost) {
+		map->m_la = end;
+		err = z_erofs_map_blocks_iter(inode, map,
+					      EROFS_GET_BLOCKS_READMORE);
+		if (err)
+			return;
+
+		/* expend ra for the trailing edge if readahead */
+		if (rac) {
+			loff_t newstart = readahead_pos(rac);
+
+			cur = round_up(map->m_la + map->m_llen, PAGE_SIZE);
+			readahead_expand(rac, newstart, cur - newstart);
+			return;
+		}
+		end = round_up(end, PAGE_SIZE);
+	} else {
+		end = round_up(map->m_la, PAGE_SIZE);
+
+		if (!map->m_llen)
+			return;
+	}
+
+	cur = map->m_la + map->m_llen - 1;
+	while (cur >= end) {
+		pgoff_t index = cur >> PAGE_SHIFT;
+		struct page *page;
+
+		page = erofs_grab_cache_page_nowait(inode->i_mapping, index);
+		if (page) {
+			if (PageUptodate(page)) {
+				unlock_page(page);
+			} else {
+				err = z_erofs_do_read_page(f, page, pagepool);
+				if (err)
+					erofs_err(inode->i_sb,
+						  "readmore error at page %lu @ nid %llu",
+						  index, EROFS_I(inode)->nid);
+			}
+			put_page(page);
+		}
+
+		if (cur < PAGE_SIZE)
+			break;
+		cur = (index << PAGE_SHIFT) - 1;
+	}
+}
+
+static int z_erofs_read_folio(struct file *file, struct folio *folio)
 {
+	struct page *page = &folio->page;
 	struct inode *const inode = page->mapping->host;
+	struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
 	struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
+	struct page *pagepool = NULL;
 	int err;
-	LIST_HEAD(pagepool);
 
 	trace_erofs_readpage(page, false);
-
 	f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT;
 
+	z_erofs_pcluster_readmore(&f, NULL, f.headoffset + PAGE_SIZE - 1,
+				  &pagepool, true);
 	err = z_erofs_do_read_page(&f, page, &pagepool);
-	(void)z_erofs_collector_end(&f.clt);
+	z_erofs_pcluster_readmore(&f, NULL, 0, &pagepool, false);
+
+	(void)z_erofs_collector_end(&f);
 
 	/* if some compressed cluster ready, need submit them anyway */
-	z_erofs_runqueue(inode->i_sb, &f.clt, &pagepool, true);
+	z_erofs_runqueue(&f, &pagepool,
+			 z_erofs_get_sync_decompress_policy(sbi, 0));
 
 	if (err)
 		erofs_err(inode->i_sb, "failed to read, err [%d]", err);
 
-	if (f.map.mpage)
-		put_page(f.map.mpage);
-
-	/* clean up the remaining free pages */
-	put_pages_list(&pagepool);
+	erofs_put_metabuf(&f.map.buf);
+	erofs_release_pages(&pagepool);
 	return err;
 }
 
-static bool should_decompress_synchronously(struct erofs_sb_info *sbi,
-					    unsigned int nr)
+static void z_erofs_readahead(struct readahead_control *rac)
 {
-	return nr <= sbi->max_sync_decompress_pages;
-}
-
-static int z_erofs_readpages(struct file *filp, struct address_space *mapping,
-			     struct list_head *pages, unsigned int nr_pages)
-{
-	struct inode *const inode = mapping->host;
+	struct inode *const inode = rac->mapping->host;
 	struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
-
-	bool sync = should_decompress_synchronously(sbi, nr_pages);
 	struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
-	gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
-	struct page *head = NULL;
-	LIST_HEAD(pagepool);
-
-	trace_erofs_readpages(mapping->host, lru_to_page(pages),
-			      nr_pages, false);
-
-	f.headoffset = (erofs_off_t)lru_to_page(pages)->index << PAGE_SHIFT;
+	struct page *pagepool = NULL, *head = NULL, *page;
+	unsigned int nr_pages;
 
-	for (; nr_pages; --nr_pages) {
-		struct page *page = lru_to_page(pages);
+	f.readahead = true;
+	f.headoffset = readahead_pos(rac);
 
-		prefetchw(&page->flags);
-		list_del(&page->lru);
-
-		/*
-		 * A pure asynchronous readahead is indicated if
-		 * a PG_readahead marked page is hitted at first.
-		 * Let's also do asynchronous decompression for this case.
-		 */
-		sync &= !(PageReadahead(page) && !head);
-
-		if (add_to_page_cache_lru(page, mapping, page->index, gfp)) {
-			list_add(&page->lru, &pagepool);
-			continue;
-		}
+	z_erofs_pcluster_readmore(&f, rac, f.headoffset +
+				  readahead_length(rac) - 1, &pagepool, true);
+	nr_pages = readahead_count(rac);
+	trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false);
 
+	while ((page = readahead_page(rac))) {
 		set_page_private(page, (unsigned long)head);
 		head = page;
 	}
@@ -1356,21 +1671,16 @@ static int z_erofs_readpages(struct file *filp, struct address_space *mapping,
 				  page->index, EROFS_I(inode)->nid);
 		put_page(page);
 	}
+	z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false);
+	(void)z_erofs_collector_end(&f);
 
-	(void)z_erofs_collector_end(&f.clt);
-
-	z_erofs_runqueue(inode->i_sb, &f.clt, &pagepool, sync);
-
-	if (f.map.mpage)
-		put_page(f.map.mpage);
-
-	/* clean up the remaining free pages */
-	put_pages_list(&pagepool);
-	return 0;
+	z_erofs_runqueue(&f, &pagepool,
+			 z_erofs_get_sync_decompress_policy(sbi, nr_pages));
+	erofs_put_metabuf(&f.map.buf);
+	erofs_release_pages(&pagepool);
 }
 
 const struct address_space_operations z_erofs_aops = {
-	.readpage = z_erofs_readpage,
-	.readpages = z_erofs_readpages,
+	.read_folio = z_erofs_read_folio,
+	.readahead = z_erofs_readahead,
 };