Diffstat (limited to 'fs/erofs')
-rw-r--r--  fs/erofs/Kconfig                58
-rw-r--r--  fs/erofs/Makefile                9
-rw-r--r--  fs/erofs/compress.h             88
-rw-r--r--  fs/erofs/data.c                596
-rw-r--r--  fs/erofs/decompressor.c        481
-rw-r--r--  fs/erofs/decompressor_lzma.c   291
-rw-r--r--  fs/erofs/dir.c                  45
-rw-r--r--  fs/erofs/erofs_fs.h            253
-rw-r--r--  fs/erofs/fscache.c             674
-rw-r--r--  fs/erofs/inode.c               237
-rw-r--r--  fs/erofs/internal.h            426
-rw-r--r--  fs/erofs/namei.c                83
-rw-r--r--  fs/erofs/pcpubuf.c             148
-rw-r--r--  fs/erofs/super.c               872
-rw-r--r--  fs/erofs/sysfs.c               277
-rw-r--r--  fs/erofs/tagptr.h                3
-rw-r--r--  fs/erofs/utils.c               141
-rw-r--r--  fs/erofs/xattr.c               160
-rw-r--r--  fs/erofs/xattr.h                15
-rw-r--r--  fs/erofs/zdata.c              1712
-rw-r--r--  fs/erofs/zdata.h               181
-rw-r--r--  fs/erofs/zmap.c                637
-rw-r--r--  fs/erofs/zpvec.h               157
23 files changed, 5176 insertions, 2368 deletions
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
index 74b0aaa7114c..85490370e0ca 100644
--- a/fs/erofs/Kconfig
+++ b/fs/erofs/Kconfig
@@ -3,18 +3,25 @@
config EROFS_FS
tristate "EROFS filesystem support"
depends on BLOCK
+ select FS_IOMAP
select LIBCRC32C
help
- EROFS (Enhanced Read-Only File System) is a lightweight
- read-only file system with modern designs (eg. page-sized
- blocks, inline xattrs/data, etc.) for scenarios which need
- high-performance read-only requirements, e.g. Android OS
- for mobile phones and LIVECDs.
-
- It also provides fixed-sized output compression support,
- which improves storage density, keeps relatively higher
- compression ratios, which is more useful to achieve high
- performance for embedded devices with limited memory.
+ EROFS (Enhanced Read-Only File System) is a lightweight read-only
+ file system with modern designs (e.g. no buffer heads, inline
+ xattrs/data, chunk-based deduplication, multiple devices, etc.) for
+ scenarios which need high-performance read-only solutions, e.g.
+ smartphones with Android OS, LiveCDs and high-density hosts with
+ numerous containers.
+
+ It also provides fixed-sized output compression support in order to
+ improve storage density as well as keep relatively higher compression
+ ratios and implements in-place decompression to reuse the file page
+ for compressed data temporarily with proper strategies, which is
+ quite useful to ensure guaranteed end-to-end runtime decompression
+ performance under extreme memory pressure without extra cost.
+
+ See the documentation at <file:Documentation/filesystems/erofs.rst>
+ for more details.
If unsure, say N.
@@ -76,17 +83,28 @@ config EROFS_FS_ZIP
If you don't want to enable compression feature, say N.
-config EROFS_FS_CLUSTER_PAGE_LIMIT
- int "EROFS Cluster Pages Hard Limit"
+config EROFS_FS_ZIP_LZMA
+ bool "EROFS LZMA compressed data support"
depends on EROFS_FS_ZIP
- range 1 256
- default "1"
+ select XZ_DEC
+ select XZ_DEC_MICROLZMA
help
- Indicates maximum # of pages of a compressed
- physical cluster.
+ Saying Y here includes support for reading EROFS file systems
+ containing LZMA compressed data, specifically called microLZMA. It
+ gives better compression ratios than the LZ4 algorithm, at the
+ expense of more CPU overhead.
+
+ LZMA support is an experimental feature for now and so most file
+ systems will be readable without selecting this option.
+
+ If unsure, say N.
- For example, if files in a image were compressed
- into 8k-unit, hard limit should not be configured
- less than 2. Otherwise, the image will be refused
- to mount on this kernel.
+config EROFS_FS_ONDEMAND
+ bool "EROFS fscache-based on-demand read support"
+ depends on CACHEFILES_ONDEMAND && (EROFS_FS=m && FSCACHE || EROFS_FS=y && FSCACHE=y)
+ default n
+ help
+ This permits EROFS to use fscache-backed data blobs with on-demand
+ read support.
+ If unsure, say N.
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
index 46f2aa4ba46c..99bbc597a3e9 100644
--- a/fs/erofs/Makefile
+++ b/fs/erofs/Makefile
@@ -1,11 +1,8 @@
# SPDX-License-Identifier: GPL-2.0-only
-EROFS_VERSION = "1.0"
-
-ccflags-y += -DEROFS_VERSION=\"$(EROFS_VERSION)\"
-
obj-$(CONFIG_EROFS_FS) += erofs.o
-erofs-objs := super.o inode.o data.o namei.o dir.o utils.o
+erofs-objs := super.o inode.o data.o namei.o dir.o utils.o pcpubuf.o sysfs.o
erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o
-
+erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o
+erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h
index 07d279fd5d67..26fa170090b8 100644
--- a/fs/erofs/compress.h
+++ b/fs/erofs/compress.h
@@ -1,60 +1,98 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2019 HUAWEI, Inc.
- * http://www.huawei.com/
- * Created by Gao Xiang <gaoxiang25@huawei.com>
+ * https://www.huawei.com/
*/
#ifndef __EROFS_FS_COMPRESS_H
#define __EROFS_FS_COMPRESS_H
#include "internal.h"
-enum {
- Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX,
- Z_EROFS_COMPRESSION_RUNTIME_MAX
-};
-
struct z_erofs_decompress_req {
struct super_block *sb;
struct page **in, **out;
- unsigned short pageofs_out;
+ unsigned short pageofs_in, pageofs_out;
unsigned int inputsize, outputsize;
/* indicate the algorithm will be used for decompression */
unsigned int alg;
- bool inplace_io, partial_decoding;
+ bool inplace_io, partial_decoding, fillgaps;
+};
+
+struct z_erofs_decompressor {
+ int (*decompress)(struct z_erofs_decompress_req *rq,
+ struct page **pagepool);
+ char *name;
};
+/* some special page->private (unsigned long, see below) */
+#define Z_EROFS_SHORTLIVED_PAGE (-1UL << 2)
+#define Z_EROFS_PREALLOCATED_PAGE (-2UL << 2)
+
/*
- * - 0x5A110C8D ('sallocated', Z_EROFS_MAPPING_STAGING) -
- * used to mark temporary allocated pages from other
- * file/cached pages and NULL mapping pages.
+ * For all pages in a pcluster, page->private should be one of
+ * Type Last 2bits page->private
+ * short-lived page 00 Z_EROFS_SHORTLIVED_PAGE
+ * preallocated page (tryalloc) 00 Z_EROFS_PREALLOCATED_PAGE
+ * cached/managed page 00 pointer to z_erofs_pcluster
+ * online page (file-backed, 01/10/11 sub-index << 2 | count
+ * some pages can be used for inplace I/O)
+ *
+ * page->mapping should be one of
+ * Type page->mapping
+ * short-lived page NULL
+ * preallocated page NULL
+ * cached/managed page non-NULL or NULL (invalidated/truncated page)
+ * online page non-NULL
+ *
+ * For all managed pages, PG_private should be set with 1 extra refcount,
+ * which is used for page reclaim / migration.
*/
-#define Z_EROFS_MAPPING_STAGING ((void *)0x5A110C8D)
-/* check if a page is marked as staging */
-static inline bool z_erofs_page_is_staging(struct page *page)
+/*
+ * short-lived pages are pages directly from buddy system with specific
+ * page->private (no need to set PagePrivate since these are non-LRU /
+ * non-movable pages and bypass reclaim / migration code).
+ */
+static inline bool z_erofs_is_shortlived_page(struct page *page)
{
- return page->mapping == Z_EROFS_MAPPING_STAGING;
+ if (page->private != Z_EROFS_SHORTLIVED_PAGE)
+ return false;
+
+ DBG_BUGON(page->mapping);
+ return true;
}
-static inline bool z_erofs_put_stagingpage(struct list_head *pagepool,
- struct page *page)
+static inline bool z_erofs_put_shortlivedpage(struct page **pagepool,
+ struct page *page)
{
- if (!z_erofs_page_is_staging(page))
+ if (!z_erofs_is_shortlived_page(page))
return false;
- /* staging pages should not be used by others at the same time */
- if (page_ref_count(page) > 1)
+ /* short-lived pages should not be used by others at the same time */
+ if (page_ref_count(page) > 1) {
put_page(page);
- else
- list_add(&page->lru, pagepool);
+ } else {
+ /* follow the pcluster rule above. */
+ erofs_pagepool_add(pagepool, page);
+ }
return true;
}
+#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping)
+static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi,
+ struct page *page)
+{
+ return page->mapping == MNGD_MAPPING(sbi);
+}
+
+int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf,
+ unsigned int padbufsize);
int z_erofs_decompress(struct z_erofs_decompress_req *rq,
- struct list_head *pagepool);
+ struct page **pagepool);
+/* prototypes for specific algorithms */
+int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pagepool);
#endif
-
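
The page->private convention documented in the compress.h comment above can be exercised in isolation. Below is a self-contained userspace sketch (plain C, not kernel code; the sample values are invented) showing how the low two bits of page->private distinguish the page classes:

/*
 * Illustrative sketch: classify a page by the page->private convention
 * documented above.  Sample values are made up for demonstration.
 */
#include <stdio.h>

#define Z_EROFS_SHORTLIVED_PAGE		(-1UL << 2)
#define Z_EROFS_PREALLOCATED_PAGE	(-2UL << 2)

static const char *classify(unsigned long priv)
{
	if (priv == Z_EROFS_SHORTLIVED_PAGE)
		return "short-lived page";
	if (priv == Z_EROFS_PREALLOCATED_PAGE)
		return "preallocated page";
	if (!(priv & 3))	/* last 2 bits 00: pointer to z_erofs_pcluster */
		return "cached/managed page";
	return "online page (sub-index << 2 | count)";
}

int main(void)
{
	unsigned long samples[] = {
		Z_EROFS_SHORTLIVED_PAGE,
		Z_EROFS_PREALLOCATED_PAGE,
		(5UL << 2) | 1,		/* online page: sub-index 5, count 1 */
	};
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("%#lx -> %s\n", samples[i], classify(samples[i]));
	return 0;
}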
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
index fc3a8d8064f8..fe8ac0e163f7 100644
--- a/fs/erofs/data.c
+++ b/fs/erofs/data.c
@@ -1,75 +1,96 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2017-2018 HUAWEI, Inc.
- * http://www.huawei.com/
- * Created by Gao Xiang <gaoxiang25@huawei.com>
+ * https://www.huawei.com/
+ * Copyright (C) 2021, Alibaba Cloud
*/
#include "internal.h"
#include <linux/prefetch.h>
-
+#include <linux/sched/mm.h>
+#include <linux/dax.h>
#include <trace/events/erofs.h>
-static void erofs_readendio(struct bio *bio)
+void erofs_unmap_metabuf(struct erofs_buf *buf)
{
- struct bio_vec *bvec;
- blk_status_t err = bio->bi_status;
- struct bvec_iter_all iter_all;
-
- bio_for_each_segment_all(bvec, bio, iter_all) {
- struct page *page = bvec->bv_page;
-
- /* page is already locked */
- DBG_BUGON(PageUptodate(page));
+ if (buf->kmap_type == EROFS_KMAP)
+ kunmap(buf->page);
+ else if (buf->kmap_type == EROFS_KMAP_ATOMIC)
+ kunmap_atomic(buf->base);
+ buf->base = NULL;
+ buf->kmap_type = EROFS_NO_KMAP;
+}
- if (err)
- SetPageError(page);
- else
- SetPageUptodate(page);
+void erofs_put_metabuf(struct erofs_buf *buf)
+{
+ if (!buf->page)
+ return;
+ erofs_unmap_metabuf(buf);
+ put_page(buf->page);
+ buf->page = NULL;
+}
- unlock_page(page);
- /* page could be reclaimed now */
+void *erofs_bread(struct erofs_buf *buf, struct inode *inode,
+ erofs_blk_t blkaddr, enum erofs_kmap_type type)
+{
+ struct address_space *const mapping = inode->i_mapping;
+ erofs_off_t offset = blknr_to_addr(blkaddr);
+ pgoff_t index = offset >> PAGE_SHIFT;
+ struct page *page = buf->page;
+ struct folio *folio;
+ unsigned int nofs_flag;
+
+ if (!page || page->index != index) {
+ erofs_put_metabuf(buf);
+
+ nofs_flag = memalloc_nofs_save();
+ folio = read_cache_folio(mapping, index, NULL, NULL);
+ memalloc_nofs_restore(nofs_flag);
+ if (IS_ERR(folio))
+ return folio;
+
+ /* should already be PageUptodate, no need to lock page */
+ page = folio_file_page(folio, index);
+ buf->page = page;
}
- bio_put(bio);
+ if (buf->kmap_type == EROFS_NO_KMAP) {
+ if (type == EROFS_KMAP)
+ buf->base = kmap(page);
+ else if (type == EROFS_KMAP_ATOMIC)
+ buf->base = kmap_atomic(page);
+ buf->kmap_type = type;
+ } else if (buf->kmap_type != type) {
+ DBG_BUGON(1);
+ return ERR_PTR(-EFAULT);
+ }
+ if (type == EROFS_NO_KMAP)
+ return NULL;
+ return buf->base + (offset & ~PAGE_MASK);
}
-struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr)
+void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
+ erofs_blk_t blkaddr, enum erofs_kmap_type type)
{
- struct address_space *const mapping = sb->s_bdev->bd_inode->i_mapping;
- struct page *page;
-
- page = read_cache_page_gfp(mapping, blkaddr,
- mapping_gfp_constraint(mapping, ~__GFP_FS));
- /* should already be PageUptodate */
- if (!IS_ERR(page))
- lock_page(page);
- return page;
+ if (erofs_is_fscache_mode(sb))
+ return erofs_bread(buf, EROFS_SB(sb)->s_fscache->inode,
+ blkaddr, type);
+
+ return erofs_bread(buf, sb->s_bdev->bd_inode, blkaddr, type);
}
static int erofs_map_blocks_flatmode(struct inode *inode,
struct erofs_map_blocks *map,
int flags)
{
- int err = 0;
erofs_blk_t nblocks, lastblk;
u64 offset = map->m_la;
struct erofs_inode *vi = EROFS_I(inode);
bool tailendpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE);
- trace_erofs_map_blocks_flatmode_enter(inode, map, flags);
-
- nblocks = DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
+ nblocks = DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ);
lastblk = nblocks - tailendpacking;
- if (offset >= inode->i_size) {
- /* leave out-of-bound access unmapped */
- map->m_flags = 0;
- map->m_plen = 0;
- goto out;
- }
-
/* there is no hole in flatmode */
map->m_flags = EROFS_MAP_MAPPED;
-
if (offset < blknr_to_addr(lastblk)) {
map->m_pa = blknr_to_addr(vi->raw_blkaddr) + map->m_la;
map->m_plen = blknr_to_addr(lastblk) - offset;
@@ -81,284 +102,345 @@ static int erofs_map_blocks_flatmode(struct inode *inode,
vi->xattr_isize + erofs_blkoff(map->m_la);
map->m_plen = inode->i_size - offset;
- /* inline data should be located in one meta block */
- if (erofs_blkoff(map->m_pa) + map->m_plen > PAGE_SIZE) {
+ /* inline data should be located in the same meta block */
+ if (erofs_blkoff(map->m_pa) + map->m_plen > EROFS_BLKSIZ) {
erofs_err(inode->i_sb,
"inline data cross block boundary @ nid %llu",
vi->nid);
DBG_BUGON(1);
- err = -EFSCORRUPTED;
- goto err_out;
+ return -EFSCORRUPTED;
}
-
map->m_flags |= EROFS_MAP_META;
} else {
erofs_err(inode->i_sb,
"internal error @ nid: %llu (size %llu), m_la 0x%llx",
vi->nid, inode->i_size, map->m_la);
DBG_BUGON(1);
- err = -EIO;
- goto err_out;
+ return -EIO;
}
-
-out:
- map->m_llen = map->m_plen;
-
-err_out:
- trace_erofs_map_blocks_flatmode_exit(inode, map, flags, 0);
- return err;
+ return 0;
}
int erofs_map_blocks(struct inode *inode,
struct erofs_map_blocks *map, int flags)
{
- if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) {
- int err = z_erofs_map_blocks_iter(inode, map, flags);
-
- if (map->mpage) {
- put_page(map->mpage);
- map->mpage = NULL;
- }
- return err;
- }
- return erofs_map_blocks_flatmode(inode, map, flags);
-}
-
-static inline struct bio *erofs_read_raw_page(struct bio *bio,
- struct address_space *mapping,
- struct page *page,
- erofs_off_t *last_block,
- unsigned int nblocks,
- bool ra)
-{
- struct inode *const inode = mapping->host;
- struct super_block *const sb = inode->i_sb;
- erofs_off_t current_block = (erofs_off_t)page->index;
- int err;
-
- DBG_BUGON(!nblocks);
+ struct super_block *sb = inode->i_sb;
+ struct erofs_inode *vi = EROFS_I(inode);
+ struct erofs_inode_chunk_index *idx;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ u64 chunknr;
+ unsigned int unit;
+ erofs_off_t pos;
+ void *kaddr;
+ int err = 0;
- if (PageUptodate(page)) {
- err = 0;
- goto has_updated;
+ trace_erofs_map_blocks_enter(inode, map, flags);
+ map->m_deviceid = 0;
+ if (map->m_la >= inode->i_size) {
+ /* leave out-of-bound access unmapped */
+ map->m_flags = 0;
+ map->m_plen = 0;
+ goto out;
}
- /* note that for readpage case, bio also equals to NULL */
- if (bio &&
- /* not continuous */
- *last_block + 1 != current_block) {
-submit_bio_retry:
- submit_bio(bio);
- bio = NULL;
+ if (vi->datalayout != EROFS_INODE_CHUNK_BASED) {
+ err = erofs_map_blocks_flatmode(inode, map, flags);
+ goto out;
}
- if (!bio) {
- struct erofs_map_blocks map = {
- .m_la = blknr_to_addr(current_block),
- };
- erofs_blk_t blknr;
- unsigned int blkoff;
-
- err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
- if (err)
- goto err_out;
+ if (vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)
+ unit = sizeof(*idx); /* chunk index */
+ else
+ unit = EROFS_BLOCK_MAP_ENTRY_SIZE; /* block map */
- /* zero out the holed page */
- if (!(map.m_flags & EROFS_MAP_MAPPED)) {
- zero_user_segment(page, 0, PAGE_SIZE);
- SetPageUptodate(page);
+ chunknr = map->m_la >> vi->chunkbits;
+ pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize +
+ vi->xattr_isize, unit) + unit * chunknr;
- /* imply err = 0, see erofs_map_blocks */
- goto has_updated;
+ kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), EROFS_KMAP);
+ if (IS_ERR(kaddr)) {
+ err = PTR_ERR(kaddr);
+ goto out;
+ }
+ map->m_la = chunknr << vi->chunkbits;
+ map->m_plen = min_t(erofs_off_t, 1UL << vi->chunkbits,
+ roundup(inode->i_size - map->m_la, EROFS_BLKSIZ));
+
+ /* handle block map */
+ if (!(vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)) {
+ __le32 *blkaddr = kaddr + erofs_blkoff(pos);
+
+ if (le32_to_cpu(*blkaddr) == EROFS_NULL_ADDR) {
+ map->m_flags = 0;
+ } else {
+ map->m_pa = blknr_to_addr(le32_to_cpu(*blkaddr));
+ map->m_flags = EROFS_MAP_MAPPED;
}
+ goto out_unlock;
+ }
+ /* parse chunk indexes */
+ idx = kaddr + erofs_blkoff(pos);
+ switch (le32_to_cpu(idx->blkaddr)) {
+ case EROFS_NULL_ADDR:
+ map->m_flags = 0;
+ break;
+ default:
+ map->m_deviceid = le16_to_cpu(idx->device_id) &
+ EROFS_SB(sb)->device_id_mask;
+ map->m_pa = blknr_to_addr(le32_to_cpu(idx->blkaddr));
+ map->m_flags = EROFS_MAP_MAPPED;
+ break;
+ }
+out_unlock:
+ erofs_put_metabuf(&buf);
+out:
+ if (!err)
+ map->m_llen = map->m_plen;
+ trace_erofs_map_blocks_exit(inode, map, flags, 0);
+ return err;
+}
- /* for RAW access mode, m_plen must be equal to m_llen */
- DBG_BUGON(map.m_plen != map.m_llen);
-
- blknr = erofs_blknr(map.m_pa);
- blkoff = erofs_blkoff(map.m_pa);
-
- /* deal with inline page */
- if (map.m_flags & EROFS_MAP_META) {
- void *vsrc, *vto;
- struct page *ipage;
-
- DBG_BUGON(map.m_plen > PAGE_SIZE);
-
- ipage = erofs_get_meta_page(inode->i_sb, blknr);
-
- if (IS_ERR(ipage)) {
- err = PTR_ERR(ipage);
- goto err_out;
+int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map)
+{
+ struct erofs_dev_context *devs = EROFS_SB(sb)->devs;
+ struct erofs_device_info *dif;
+ int id;
+
+ /* primary device by default */
+ map->m_bdev = sb->s_bdev;
+ map->m_daxdev = EROFS_SB(sb)->dax_dev;
+ map->m_dax_part_off = EROFS_SB(sb)->dax_part_off;
+ map->m_fscache = EROFS_SB(sb)->s_fscache;
+
+ if (map->m_deviceid) {
+ down_read(&devs->rwsem);
+ dif = idr_find(&devs->tree, map->m_deviceid - 1);
+ if (!dif) {
+ up_read(&devs->rwsem);
+ return -ENODEV;
+ }
+ map->m_bdev = dif->bdev;
+ map->m_daxdev = dif->dax_dev;
+ map->m_dax_part_off = dif->dax_part_off;
+ map->m_fscache = dif->fscache;
+ up_read(&devs->rwsem);
+ } else if (devs->extra_devices) {
+ down_read(&devs->rwsem);
+ idr_for_each_entry(&devs->tree, dif, id) {
+ erofs_off_t startoff, length;
+
+ if (!dif->mapped_blkaddr)
+ continue;
+ startoff = blknr_to_addr(dif->mapped_blkaddr);
+ length = blknr_to_addr(dif->blocks);
+
+ if (map->m_pa >= startoff &&
+ map->m_pa < startoff + length) {
+ map->m_pa -= startoff;
+ map->m_bdev = dif->bdev;
+ map->m_daxdev = dif->dax_dev;
+ map->m_dax_part_off = dif->dax_part_off;
+ map->m_fscache = dif->fscache;
+ break;
}
-
- vsrc = kmap_atomic(ipage);
- vto = kmap_atomic(page);
- memcpy(vto, vsrc + blkoff, map.m_plen);
- memset(vto + map.m_plen, 0, PAGE_SIZE - map.m_plen);
- kunmap_atomic(vto);
- kunmap_atomic(vsrc);
- flush_dcache_page(page);
-
- SetPageUptodate(page);
- /* TODO: could we unlock the page earlier? */
- unlock_page(ipage);
- put_page(ipage);
-
- /* imply err = 0, see erofs_map_blocks */
- goto has_updated;
}
+ up_read(&devs->rwsem);
+ }
+ return 0;
+}
- /* pa must be block-aligned for raw reading */
- DBG_BUGON(erofs_blkoff(map.m_pa));
+static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+ unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
+{
+ int ret;
+ struct erofs_map_blocks map;
+ struct erofs_map_dev mdev;
- /* max # of continuous pages */
- if (nblocks > DIV_ROUND_UP(map.m_plen, PAGE_SIZE))
- nblocks = DIV_ROUND_UP(map.m_plen, PAGE_SIZE);
- if (nblocks > BIO_MAX_PAGES)
- nblocks = BIO_MAX_PAGES;
+ map.m_la = offset;
+ map.m_llen = length;
- bio = bio_alloc(GFP_NOIO, nblocks);
+ ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
+ if (ret < 0)
+ return ret;
- bio->bi_end_io = erofs_readendio;
- bio_set_dev(bio, sb->s_bdev);
- bio->bi_iter.bi_sector = (sector_t)blknr <<
- LOG_SECTORS_PER_BLOCK;
- bio->bi_opf = REQ_OP_READ;
+ mdev = (struct erofs_map_dev) {
+ .m_deviceid = map.m_deviceid,
+ .m_pa = map.m_pa,
+ };
+ ret = erofs_map_dev(inode->i_sb, &mdev);
+ if (ret)
+ return ret;
+
+ iomap->offset = map.m_la;
+ if (flags & IOMAP_DAX)
+ iomap->dax_dev = mdev.m_daxdev;
+ else
+ iomap->bdev = mdev.m_bdev;
+ iomap->length = map.m_llen;
+ iomap->flags = 0;
+ iomap->private = NULL;
+
+ if (!(map.m_flags & EROFS_MAP_MAPPED)) {
+ iomap->type = IOMAP_HOLE;
+ iomap->addr = IOMAP_NULL_ADDR;
+ if (!iomap->length)
+ iomap->length = length;
+ return 0;
}
- err = bio_add_page(bio, page, PAGE_SIZE, 0);
- /* out of the extent or bio is full */
- if (err < PAGE_SIZE)
- goto submit_bio_retry;
+ if (map.m_flags & EROFS_MAP_META) {
+ void *ptr;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+
+ iomap->type = IOMAP_INLINE;
+ ptr = erofs_read_metabuf(&buf, inode->i_sb,
+ erofs_blknr(mdev.m_pa), EROFS_KMAP);
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);
+ iomap->inline_data = ptr + erofs_blkoff(mdev.m_pa);
+ iomap->private = buf.base;
+ } else {
+ iomap->type = IOMAP_MAPPED;
+ iomap->addr = mdev.m_pa;
+ if (flags & IOMAP_DAX)
+ iomap->addr += mdev.m_dax_part_off;
+ }
+ return 0;
+}
- *last_block = current_block;
+static int erofs_iomap_end(struct inode *inode, loff_t pos, loff_t length,
+ ssize_t written, unsigned int flags, struct iomap *iomap)
+{
+ void *ptr = iomap->private;
- /* shift in advance in case of it followed by too many gaps */
- if (bio->bi_iter.bi_size >= bio->bi_max_vecs * PAGE_SIZE) {
- /* err should reassign to 0 after submitting */
- err = 0;
- goto submit_bio_out;
+ if (ptr) {
+ struct erofs_buf buf = {
+ .page = kmap_to_page(ptr),
+ .base = ptr,
+ .kmap_type = EROFS_KMAP,
+ };
+
+ DBG_BUGON(iomap->type != IOMAP_INLINE);
+ erofs_put_metabuf(&buf);
+ } else {
+ DBG_BUGON(iomap->type == IOMAP_INLINE);
}
+ return written;
+}
- return bio;
+static const struct iomap_ops erofs_iomap_ops = {
+ .iomap_begin = erofs_iomap_begin,
+ .iomap_end = erofs_iomap_end,
+};
-err_out:
- /* for sync reading, set page error immediately */
- if (!ra) {
- SetPageError(page);
- ClearPageUptodate(page);
+int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len)
+{
+ if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) {
+#ifdef CONFIG_EROFS_FS_ZIP
+ return iomap_fiemap(inode, fieinfo, start, len,
+ &z_erofs_iomap_report_ops);
+#else
+ return -EOPNOTSUPP;
+#endif
}
-has_updated:
- unlock_page(page);
-
- /* if updated manually, continuous pages has a gap */
- if (bio)
-submit_bio_out:
- submit_bio(bio);
- return err ? ERR_PTR(err) : NULL;
+ return iomap_fiemap(inode, fieinfo, start, len, &erofs_iomap_ops);
}
/*
* since we don't have write or truncate flows, no inode
* locking needs to be held at the moment.
*/
-static int erofs_raw_access_readpage(struct file *file, struct page *page)
+static int erofs_read_folio(struct file *file, struct folio *folio)
{
- erofs_off_t last_block;
- struct bio *bio;
-
- trace_erofs_readpage(page, true);
-
- bio = erofs_read_raw_page(NULL, page->mapping,
- page, &last_block, 1, false);
-
- if (IS_ERR(bio))
- return PTR_ERR(bio);
-
- DBG_BUGON(bio); /* since we have only one bio -- must be NULL */
- return 0;
+ return iomap_read_folio(folio, &erofs_iomap_ops);
}
-static int erofs_raw_access_readpages(struct file *filp,
- struct address_space *mapping,
- struct list_head *pages,
- unsigned int nr_pages)
+static void erofs_readahead(struct readahead_control *rac)
{
- erofs_off_t last_block;
- struct bio *bio = NULL;
- gfp_t gfp = readahead_gfp_mask(mapping);
- struct page *page = list_last_entry(pages, struct page, lru);
-
- trace_erofs_readpages(mapping->host, page, nr_pages, true);
-
- for (; nr_pages; --nr_pages) {
- page = list_entry(pages->prev, struct page, lru);
-
- prefetchw(&page->flags);
- list_del(&page->lru);
-
- if (!add_to_page_cache_lru(page, mapping, page->index, gfp)) {
- bio = erofs_read_raw_page(bio, mapping, page,
- &last_block, nr_pages, true);
-
- /* all the page errors are ignored when readahead */
- if (IS_ERR(bio)) {
- pr_err("%s, readahead error at page %lu of nid %llu\n",
- __func__, page->index,
- EROFS_I(mapping->host)->nid);
-
- bio = NULL;
- }
- }
-
- /* pages could still be locked */
- put_page(page);
- }
- DBG_BUGON(!list_empty(pages));
-
- /* the rare case (end in gaps) */
- if (bio)
- submit_bio(bio);
- return 0;
+ return iomap_readahead(rac, &erofs_iomap_ops);
}
-static int erofs_get_block(struct inode *inode, sector_t iblock,
- struct buffer_head *bh, int create)
+static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
{
- struct erofs_map_blocks map = {
- .m_la = iblock << 9,
- };
- int err;
-
- err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
- if (err)
- return err;
-
- if (map.m_flags & EROFS_MAP_MAPPED)
- bh->b_blocknr = erofs_blknr(map.m_pa);
-
- return err;
+ return iomap_bmap(mapping, block, &erofs_iomap_ops);
}
-static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
+static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
- struct inode *inode = mapping->host;
+ struct inode *inode = file_inode(iocb->ki_filp);
+
+ /* no need to take the (shared) inode lock since it's a ro filesystem */
+ if (!iov_iter_count(to))
+ return 0;
+
+#ifdef CONFIG_FS_DAX
+ if (IS_DAX(inode))
+ return dax_iomap_rw(iocb, to, &erofs_iomap_ops);
+#endif
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ struct block_device *bdev = inode->i_sb->s_bdev;
+ unsigned int blksize_mask;
+
+ if (bdev)
+ blksize_mask = bdev_logical_block_size(bdev) - 1;
+ else
+ blksize_mask = (1 << inode->i_blkbits) - 1;
- if (EROFS_I(inode)->datalayout == EROFS_INODE_FLAT_INLINE) {
- erofs_blk_t blks = i_size_read(inode) >> LOG_BLOCK_SIZE;
+ if ((iocb->ki_pos | iov_iter_count(to) |
+ iov_iter_alignment(to)) & blksize_mask)
+ return -EINVAL;
- if (block >> LOG_SECTORS_PER_BLOCK >= blks)
- return 0;
+ return iomap_dio_rw(iocb, to, &erofs_iomap_ops,
+ NULL, 0, NULL, 0);
}
-
- return generic_block_bmap(mapping, block, erofs_get_block);
+ return filemap_read(iocb, to, 0);
}
/* for uncompressed (aligned) files and raw access for other files */
const struct address_space_operations erofs_raw_access_aops = {
- .readpage = erofs_raw_access_readpage,
- .readpages = erofs_raw_access_readpages,
+ .read_folio = erofs_read_folio,
+ .readahead = erofs_readahead,
.bmap = erofs_bmap,
+ .direct_IO = noop_direct_IO,
};
+#ifdef CONFIG_FS_DAX
+static vm_fault_t erofs_dax_huge_fault(struct vm_fault *vmf,
+ enum page_entry_size pe_size)
+{
+ return dax_iomap_fault(vmf, pe_size, NULL, NULL, &erofs_iomap_ops);
+}
+
+static vm_fault_t erofs_dax_fault(struct vm_fault *vmf)
+{
+ return erofs_dax_huge_fault(vmf, PE_SIZE_PTE);
+}
+
+static const struct vm_operations_struct erofs_dax_vm_ops = {
+ .fault = erofs_dax_fault,
+ .huge_fault = erofs_dax_huge_fault,
+};
+
+static int erofs_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ if (!IS_DAX(file_inode(file)))
+ return generic_file_readonly_mmap(file, vma);
+
+ if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
+ return -EINVAL;
+
+ vma->vm_ops = &erofs_dax_vm_ops;
+ vma->vm_flags |= VM_HUGEPAGE;
+ return 0;
+}
+#else
+#define erofs_file_mmap generic_file_readonly_mmap
+#endif
+
+const struct file_operations erofs_file_fops = {
+ .llseek = generic_file_llseek,
+ .read_iter = erofs_file_read_iter,
+ .mmap = erofs_file_mmap,
+ .splice_read = generic_file_splice_read,
+};
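
As erofs_map_blocks() above shows, the new erofs_buf helpers replace erofs_get_meta_page() with a read/use/put pattern. A hedged sketch of that pattern follows; the helper names and EROFS_KMAP are those introduced in the hunks above, while the surrounding function is hypothetical:

/*
 * Hypothetical caller sketch: read one metadata block, use it while mapped,
 * then release it.
 */
static int example_read_meta(struct super_block *sb, erofs_blk_t blkaddr,
			     unsigned int blkoff)
{
	struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
	void *kaddr;

	kaddr = erofs_read_metabuf(&buf, sb, blkaddr, EROFS_KMAP);
	if (IS_ERR(kaddr))
		return PTR_ERR(kaddr);

	/* ... parse on-disk data at kaddr + blkoff while mapped ... */

	erofs_put_metabuf(&buf);	/* unmaps and drops the page reference */
	return 0;
}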
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 5779a15c2cd6..51b7ac7166d9 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -1,8 +1,7 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2019 HUAWEI, Inc.
- * http://www.huawei.com/
- * Created by Gao Xiang <gaoxiang25@huawei.com>
+ * https://www.huawei.com/
*/
#include "compress.h"
#include <linux/module.h>
@@ -17,53 +16,93 @@
#define LZ4_DECOMPRESS_INPLACE_MARGIN(srcsize) (((srcsize) >> 8) + 32)
#endif
-struct z_erofs_decompressor {
- /*
- * if destpages have sparsed pages, fill them with bounce pages.
- * it also check whether destpages indicate continuous physical memory.
- */
- int (*prepare_destpages)(struct z_erofs_decompress_req *rq,
- struct list_head *pagepool);
- int (*decompress)(struct z_erofs_decompress_req *rq, u8 *out);
- char *name;
+struct z_erofs_lz4_decompress_ctx {
+ struct z_erofs_decompress_req *rq;
+ /* # of encoded, decoded pages */
+ unsigned int inpages, outpages;
+ /* decoded block total length (used for in-place decompression) */
+ unsigned int oend;
};
-static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
- struct list_head *pagepool)
+int z_erofs_load_lz4_config(struct super_block *sb,
+ struct erofs_super_block *dsb,
+ struct z_erofs_lz4_cfgs *lz4, int size)
{
- const unsigned int nr =
- PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ u16 distance;
+
+ if (lz4) {
+ if (size < sizeof(struct z_erofs_lz4_cfgs)) {
+ erofs_err(sb, "invalid lz4 cfgs, size=%u", size);
+ return -EINVAL;
+ }
+ distance = le16_to_cpu(lz4->max_distance);
+
+ sbi->lz4.max_pclusterblks = le16_to_cpu(lz4->max_pclusterblks);
+ if (!sbi->lz4.max_pclusterblks) {
+ sbi->lz4.max_pclusterblks = 1; /* reserved case */
+ } else if (sbi->lz4.max_pclusterblks >
+ Z_EROFS_PCLUSTER_MAX_SIZE / EROFS_BLKSIZ) {
+ erofs_err(sb, "too large lz4 pclusterblks %u",
+ sbi->lz4.max_pclusterblks);
+ return -EINVAL;
+ }
+ } else {
+ distance = le16_to_cpu(dsb->u1.lz4_max_distance);
+ sbi->lz4.max_pclusterblks = 1;
+ }
+
+ sbi->lz4.max_distance_pages = distance ?
+ DIV_ROUND_UP(distance, PAGE_SIZE) + 1 :
+ LZ4_MAX_DISTANCE_PAGES;
+ return erofs_pcpubuf_growsize(sbi->lz4.max_pclusterblks);
+}
+
+/*
+ * Fill all gaps with bounce pages if it's a sparse page list. Also check if
+ * all physical pages are consecutive, which can be seen for moderate CR.
+ */
+static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx,
+ struct page **pagepool)
+{
+ struct z_erofs_decompress_req *rq = ctx->rq;
struct page *availables[LZ4_MAX_DISTANCE_PAGES] = { NULL };
unsigned long bounced[DIV_ROUND_UP(LZ4_MAX_DISTANCE_PAGES,
BITS_PER_LONG)] = { 0 };
+ unsigned int lz4_max_distance_pages =
+ EROFS_SB(rq->sb)->lz4.max_distance_pages;
void *kaddr = NULL;
unsigned int i, j, top;
top = 0;
- for (i = j = 0; i < nr; ++i, ++j) {
+ for (i = j = 0; i < ctx->outpages; ++i, ++j) {
struct page *const page = rq->out[i];
struct page *victim;
- if (j >= LZ4_MAX_DISTANCE_PAGES)
+ if (j >= lz4_max_distance_pages)
j = 0;
/* 'valid' bounced can only be tested after a complete round */
- if (test_bit(j, bounced)) {
- DBG_BUGON(i < LZ4_MAX_DISTANCE_PAGES);
- DBG_BUGON(top >= LZ4_MAX_DISTANCE_PAGES);
- availables[top++] = rq->out[i - LZ4_MAX_DISTANCE_PAGES];
+ if (!rq->fillgaps && test_bit(j, bounced)) {
+ DBG_BUGON(i < lz4_max_distance_pages);
+ DBG_BUGON(top >= lz4_max_distance_pages);
+ availables[top++] = rq->out[i - lz4_max_distance_pages];
}
if (page) {
__clear_bit(j, bounced);
- if (kaddr) {
- if (kaddr + PAGE_SIZE == page_address(page))
+ if (!PageHighMem(page)) {
+ if (!i) {
+ kaddr = page_address(page);
+ continue;
+ }
+ if (kaddr &&
+ kaddr + PAGE_SIZE == page_address(page)) {
kaddr += PAGE_SIZE;
- else
- kaddr = NULL;
- } else if (!i) {
- kaddr = page_address(page);
+ continue;
+ }
}
+ kaddr = NULL;
continue;
}
kaddr = NULL;
@@ -73,264 +112,280 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
victim = availables[--top];
get_page(victim);
} else {
- victim = erofs_allocpage(pagepool, GFP_KERNEL);
- if (!victim)
- return -ENOMEM;
- victim->mapping = Z_EROFS_MAPPING_STAGING;
+ victim = erofs_allocpage(pagepool,
+ GFP_KERNEL | __GFP_NOFAIL);
+ set_page_private(victim, Z_EROFS_SHORTLIVED_PAGE);
}
rq->out[i] = victim;
}
return kaddr ? 1 : 0;
}
-static void *generic_copy_inplace_data(struct z_erofs_decompress_req *rq,
- u8 *src, unsigned int pageofs_in)
+static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx,
+ void *inpage, unsigned int *inputmargin, int *maptype,
+ bool may_inplace)
{
- /*
- * if in-place decompression is ongoing, those decompressed
- * pages should be copied in order to avoid being overlapped.
- */
- struct page **in = rq->in;
- u8 *const tmp = erofs_get_pcpubuf(0);
- u8 *tmpp = tmp;
- unsigned int inlen = rq->inputsize - pageofs_in;
- unsigned int count = min_t(uint, inlen, PAGE_SIZE - pageofs_in);
-
- while (tmpp < tmp + inlen) {
- if (!src)
- src = kmap_atomic(*in);
- memcpy(tmpp, src + pageofs_in, count);
- kunmap_atomic(src);
- src = NULL;
- tmpp += count;
- pageofs_in = 0;
- count = PAGE_SIZE;
+ struct z_erofs_decompress_req *rq = ctx->rq;
+ unsigned int omargin, total, i, j;
+ struct page **in;
+ void *src, *tmp;
+
+ if (rq->inplace_io) {
+ omargin = PAGE_ALIGN(ctx->oend) - ctx->oend;
+ if (rq->partial_decoding || !may_inplace ||
+ omargin < LZ4_DECOMPRESS_INPLACE_MARGIN(rq->inputsize))
+ goto docopy;
+
+ for (i = 0; i < ctx->inpages; ++i) {
+ DBG_BUGON(rq->in[i] == NULL);
+ for (j = 0; j < ctx->outpages - ctx->inpages + i; ++j)
+ if (rq->out[j] == rq->in[i])
+ goto docopy;
+ }
+ }
+
+ if (ctx->inpages <= 1) {
+ *maptype = 0;
+ return inpage;
+ }
+ kunmap_atomic(inpage);
+ might_sleep();
+ src = erofs_vm_map_ram(rq->in, ctx->inpages);
+ if (!src)
+ return ERR_PTR(-ENOMEM);
+ *maptype = 1;
+ return src;
+
+docopy:
+ /* Or copy compressed data which can be overlapped to per-CPU buffer */
+ in = rq->in;
+ src = erofs_get_pcpubuf(ctx->inpages);
+ if (!src) {
+ DBG_BUGON(1);
+ kunmap_atomic(inpage);
+ return ERR_PTR(-EFAULT);
+ }
+
+ tmp = src;
+ total = rq->inputsize;
+ while (total) {
+ unsigned int page_copycnt =
+ min_t(unsigned int, total, PAGE_SIZE - *inputmargin);
+
+ if (!inpage)
+ inpage = kmap_atomic(*in);
+ memcpy(tmp, inpage + *inputmargin, page_copycnt);
+ kunmap_atomic(inpage);
+ inpage = NULL;
+ tmp += page_copycnt;
+ total -= page_copycnt;
++in;
+ *inputmargin = 0;
}
- return tmp;
+ *maptype = 2;
+ return src;
}
-static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
+/*
+ * Get the exact inputsize with zero_padding feature.
+ * - For LZ4, it should work if zero_padding feature is on (5.3+);
+ * - For MicroLZMA, it'd be enabled all the time.
+ */
+int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf,
+ unsigned int padbufsize)
{
- unsigned int inputmargin, inlen;
- u8 *src;
- bool copied, support_0padding;
- int ret;
-
- if (rq->inputsize > PAGE_SIZE)
- return -EOPNOTSUPP;
+ const char *padend;
- src = kmap_atomic(*rq->in);
- inputmargin = 0;
- support_0padding = false;
+ padend = memchr_inv(padbuf, 0, padbufsize);
+ if (!padend)
+ return -EFSCORRUPTED;
+ rq->inputsize -= padend - padbuf;
+ rq->pageofs_in += padend - padbuf;
+ return 0;
+}
- /* decompression inplace is only safe when 0padding is enabled */
- if (EROFS_SB(rq->sb)->feature_incompat &
- EROFS_FEATURE_INCOMPAT_LZ4_0PADDING) {
- support_0padding = true;
+static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx,
+ u8 *out)
+{
+ struct z_erofs_decompress_req *rq = ctx->rq;
+ bool support_0padding = false, may_inplace = false;
+ unsigned int inputmargin;
+ u8 *headpage, *src;
+ int ret, maptype;
- while (!src[inputmargin & ~PAGE_MASK])
- if (!(++inputmargin & ~PAGE_MASK))
- break;
+ DBG_BUGON(*rq->in == NULL);
+ headpage = kmap_atomic(*rq->in);
- if (inputmargin >= rq->inputsize) {
- kunmap_atomic(src);
- return -EIO;
+ /* LZ4 decompression inplace is only safe if zero_padding is enabled */
+ if (erofs_sb_has_zero_padding(EROFS_SB(rq->sb))) {
+ support_0padding = true;
+ ret = z_erofs_fixup_insize(rq, headpage + rq->pageofs_in,
+ min_t(unsigned int, rq->inputsize,
+ EROFS_BLKSIZ - rq->pageofs_in));
+ if (ret) {
+ kunmap_atomic(headpage);
+ return ret;
}
+ may_inplace = !((rq->pageofs_in + rq->inputsize) &
+ (EROFS_BLKSIZ - 1));
}
- copied = false;
- inlen = rq->inputsize - inputmargin;
- if (rq->inplace_io) {
- const uint oend = (rq->pageofs_out +
- rq->outputsize) & ~PAGE_MASK;
- const uint nr = PAGE_ALIGN(rq->pageofs_out +
- rq->outputsize) >> PAGE_SHIFT;
-
- if (rq->partial_decoding || !support_0padding ||
- rq->out[nr - 1] != rq->in[0] ||
- rq->inputsize - oend <
- LZ4_DECOMPRESS_INPLACE_MARGIN(inlen)) {
- src = generic_copy_inplace_data(rq, src, inputmargin);
- inputmargin = 0;
- copied = true;
- }
- }
+ inputmargin = rq->pageofs_in;
+ src = z_erofs_lz4_handle_overlap(ctx, headpage, &inputmargin,
+ &maptype, may_inplace);
+ if (IS_ERR(src))
+ return PTR_ERR(src);
+
+ /* legacy format could compress extra data in a pcluster. */
+ if (rq->partial_decoding || !support_0padding)
+ ret = LZ4_decompress_safe_partial(src + inputmargin, out,
+ rq->inputsize, rq->outputsize, rq->outputsize);
+ else
+ ret = LZ4_decompress_safe(src + inputmargin, out,
+ rq->inputsize, rq->outputsize);
+
+ if (ret != rq->outputsize) {
+ erofs_err(rq->sb, "failed to decompress %d in[%u, %u] out[%u]",
+ ret, rq->inputsize, inputmargin, rq->outputsize);
- ret = LZ4_decompress_safe_partial(src + inputmargin, out,
- inlen, rq->outputsize,
- rq->outputsize);
- if (ret < 0) {
- erofs_err(rq->sb, "failed to decompress, in[%u, %u] out[%u]",
- inlen, inputmargin, rq->outputsize);
- WARN_ON(1);
print_hex_dump(KERN_DEBUG, "[ in]: ", DUMP_PREFIX_OFFSET,
- 16, 1, src + inputmargin, inlen, true);
+ 16, 1, src + inputmargin, rq->inputsize, true);
print_hex_dump(KERN_DEBUG, "[out]: ", DUMP_PREFIX_OFFSET,
16, 1, out, rq->outputsize, true);
+
+ if (ret >= 0)
+ memset(out + ret, 0, rq->outputsize - ret);
ret = -EIO;
+ } else {
+ ret = 0;
}
- if (copied)
+ if (maptype == 0) {
+ kunmap_atomic(headpage);
+ } else if (maptype == 1) {
+ vm_unmap_ram(src, ctx->inpages);
+ } else if (maptype == 2) {
erofs_put_pcpubuf(src);
- else
- kunmap_atomic(src);
- return ret;
-}
-
-static struct z_erofs_decompressor decompressors[] = {
- [Z_EROFS_COMPRESSION_SHIFTED] = {
- .name = "shifted"
- },
- [Z_EROFS_COMPRESSION_LZ4] = {
- .prepare_destpages = z_erofs_lz4_prepare_destpages,
- .decompress = z_erofs_lz4_decompress,
- .name = "lz4"
- },
-};
-
-static void copy_from_pcpubuf(struct page **out, const char *dst,
- unsigned short pageofs_out,
- unsigned int outputsize)
-{
- const char *end = dst + outputsize;
- const unsigned int righthalf = PAGE_SIZE - pageofs_out;
- const char *cur = dst - pageofs_out;
-
- while (cur < end) {
- struct page *const page = *out++;
-
- if (page) {
- char *buf = kmap_atomic(page);
-
- if (cur >= dst) {
- memcpy(buf, cur, min_t(uint, PAGE_SIZE,
- end - cur));
- } else {
- memcpy(buf + pageofs_out, cur + pageofs_out,
- min_t(uint, righthalf, end - cur));
- }
- kunmap_atomic(buf);
- }
- cur += PAGE_SIZE;
+ } else {
+ DBG_BUGON(1);
+ return -EFAULT;
}
+ return ret;
}
-static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq,
- struct list_head *pagepool)
+static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pagepool)
{
- const unsigned int nrpages_out =
- PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
- const struct z_erofs_decompressor *alg = decompressors + rq->alg;
+ struct z_erofs_lz4_decompress_ctx ctx;
unsigned int dst_maptype;
void *dst;
- int ret, i;
+ int ret;
- if (nrpages_out == 1 && !rq->inplace_io) {
+ ctx.rq = rq;
+ ctx.oend = rq->pageofs_out + rq->outputsize;
+ ctx.outpages = PAGE_ALIGN(ctx.oend) >> PAGE_SHIFT;
+ ctx.inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT;
+
+ /* one optimized fast path only for non bigpcluster cases yet */
+ if (ctx.inpages == 1 && ctx.outpages == 1 && !rq->inplace_io) {
DBG_BUGON(!*rq->out);
dst = kmap_atomic(*rq->out);
dst_maptype = 0;
goto dstmap_out;
}
- /*
- * For the case of small output size (especially much less
- * than PAGE_SIZE), memcpy the decompressed data rather than
- * compressed data is preferred.
- */
- if (rq->outputsize <= PAGE_SIZE * 7 / 8) {
- dst = erofs_get_pcpubuf(0);
- if (IS_ERR(dst))
- return PTR_ERR(dst);
-
- rq->inplace_io = false;
- ret = alg->decompress(rq, dst);
- if (!ret)
- copy_from_pcpubuf(rq->out, dst, rq->pageofs_out,
- rq->outputsize);
-
- erofs_put_pcpubuf(dst);
- return ret;
- }
-
- ret = alg->prepare_destpages(rq, pagepool);
+ /* general decoding path which can be used for all cases */
+ ret = z_erofs_lz4_prepare_dstpages(&ctx, pagepool);
if (ret < 0) {
return ret;
- } else if (ret) {
+ } else if (ret > 0) {
dst = page_address(*rq->out);
dst_maptype = 1;
- goto dstmap_out;
+ } else {
+ dst = erofs_vm_map_ram(rq->out, ctx.outpages);
+ if (!dst)
+ return -ENOMEM;
+ dst_maptype = 2;
}
- i = 0;
- while (1) {
- dst = vm_map_ram(rq->out, nrpages_out, -1, PAGE_KERNEL);
-
- /* retry two more times (totally 3 times) */
- if (dst || ++i >= 3)
- break;
- vm_unmap_aliases();
- }
-
- if (!dst)
- return -ENOMEM;
-
- dst_maptype = 2;
-
dstmap_out:
- ret = alg->decompress(rq, dst + rq->pageofs_out);
-
+ ret = z_erofs_lz4_decompress_mem(&ctx, dst + rq->pageofs_out);
if (!dst_maptype)
kunmap_atomic(dst);
else if (dst_maptype == 2)
- vm_unmap_ram(dst, nrpages_out);
+ vm_unmap_ram(dst, ctx.outpages);
return ret;
}
-static int z_erofs_shifted_transform(const struct z_erofs_decompress_req *rq,
- struct list_head *pagepool)
+static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq,
+ struct page **pagepool)
{
- const unsigned int nrpages_out =
+ const unsigned int inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT;
+ const unsigned int outpages =
PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
- const unsigned int righthalf = PAGE_SIZE - rq->pageofs_out;
+ const unsigned int righthalf = min_t(unsigned int, rq->outputsize,
+ PAGE_SIZE - rq->pageofs_out);
+ const unsigned int lefthalf = rq->outputsize - righthalf;
+ const unsigned int interlaced_offset =
+ rq->alg == Z_EROFS_COMPRESSION_SHIFTED ? 0 : rq->pageofs_out;
unsigned char *src, *dst;
- if (nrpages_out > 2) {
+ if (outpages > 2 && rq->alg == Z_EROFS_COMPRESSION_SHIFTED) {
DBG_BUGON(1);
- return -EIO;
+ return -EFSCORRUPTED;
}
if (rq->out[0] == *rq->in) {
- DBG_BUGON(nrpages_out != 1);
+ DBG_BUGON(rq->pageofs_out);
return 0;
}
- src = kmap_atomic(*rq->in);
+ src = kmap_local_page(rq->in[inpages - 1]) + rq->pageofs_in;
if (rq->out[0]) {
- dst = kmap_atomic(rq->out[0]);
- memcpy(dst + rq->pageofs_out, src, righthalf);
- kunmap_atomic(dst);
+ dst = kmap_local_page(rq->out[0]);
+ memcpy(dst + rq->pageofs_out, src + interlaced_offset,
+ righthalf);
+ kunmap_local(dst);
}
- if (nrpages_out == 2) {
- DBG_BUGON(!rq->out[1]);
- if (rq->out[1] == *rq->in) {
- memmove(src, src + righthalf, rq->pageofs_out);
- } else {
- dst = kmap_atomic(rq->out[1]);
- memcpy(dst, src + righthalf, rq->pageofs_out);
- kunmap_atomic(dst);
+ if (outpages > inpages) {
+ DBG_BUGON(!rq->out[outpages - 1]);
+ if (rq->out[outpages - 1] != rq->in[inpages - 1]) {
+ dst = kmap_local_page(rq->out[outpages - 1]);
+ memcpy(dst, interlaced_offset ? src :
+ (src + righthalf), lefthalf);
+ kunmap_local(dst);
+ } else if (!interlaced_offset) {
+ memmove(src, src + righthalf, lefthalf);
}
}
- kunmap_atomic(src);
+ kunmap_local(src);
return 0;
}
+static struct z_erofs_decompressor decompressors[] = {
+ [Z_EROFS_COMPRESSION_SHIFTED] = {
+ .decompress = z_erofs_transform_plain,
+ .name = "shifted"
+ },
+ [Z_EROFS_COMPRESSION_INTERLACED] = {
+ .decompress = z_erofs_transform_plain,
+ .name = "interlaced"
+ },
+ [Z_EROFS_COMPRESSION_LZ4] = {
+ .decompress = z_erofs_lz4_decompress,
+ .name = "lz4"
+ },
+#ifdef CONFIG_EROFS_FS_ZIP_LZMA
+ [Z_EROFS_COMPRESSION_LZMA] = {
+ .decompress = z_erofs_lzma_decompress,
+ .name = "lzma"
+ },
+#endif
+};
+
int z_erofs_decompress(struct z_erofs_decompress_req *rq,
- struct list_head *pagepool)
+ struct page **pagepool)
{
- if (rq->alg == Z_EROFS_COMPRESSION_SHIFTED)
- return z_erofs_shifted_transform(rq, pagepool);
- return z_erofs_decompress_generic(rq, pagepool);
+ return decompressors[rq->alg].decompress(rq, pagepool);
}
-
diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c
new file mode 100644
index 000000000000..091fd5adf818
--- /dev/null
+++ b/fs/erofs/decompressor_lzma.c
@@ -0,0 +1,291 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+#include <linux/xz.h>
+#include <linux/module.h>
+#include "compress.h"
+
+struct z_erofs_lzma {
+ struct z_erofs_lzma *next;
+ struct xz_dec_microlzma *state;
+ struct xz_buf buf;
+ u8 bounce[PAGE_SIZE];
+};
+
+/* considering the LZMA performance, no need to use a lockless list for now */
+static DEFINE_SPINLOCK(z_erofs_lzma_lock);
+static unsigned int z_erofs_lzma_max_dictsize;
+static unsigned int z_erofs_lzma_nstrms, z_erofs_lzma_avail_strms;
+static struct z_erofs_lzma *z_erofs_lzma_head;
+static DECLARE_WAIT_QUEUE_HEAD(z_erofs_lzma_wq);
+
+module_param_named(lzma_streams, z_erofs_lzma_nstrms, uint, 0444);
+
+void z_erofs_lzma_exit(void)
+{
+ /* there should be no running fs instance */
+ while (z_erofs_lzma_avail_strms) {
+ struct z_erofs_lzma *strm;
+
+ spin_lock(&z_erofs_lzma_lock);
+ strm = z_erofs_lzma_head;
+ if (!strm) {
+ spin_unlock(&z_erofs_lzma_lock);
+ DBG_BUGON(1);
+ return;
+ }
+ z_erofs_lzma_head = NULL;
+ spin_unlock(&z_erofs_lzma_lock);
+
+ while (strm) {
+ struct z_erofs_lzma *n = strm->next;
+
+ if (strm->state)
+ xz_dec_microlzma_end(strm->state);
+ kfree(strm);
+ --z_erofs_lzma_avail_strms;
+ strm = n;
+ }
+ }
+}
+
+int z_erofs_lzma_init(void)
+{
+ unsigned int i;
+
+ /* by default, use # of possible CPUs instead */
+ if (!z_erofs_lzma_nstrms)
+ z_erofs_lzma_nstrms = num_possible_cpus();
+
+ for (i = 0; i < z_erofs_lzma_nstrms; ++i) {
+ struct z_erofs_lzma *strm = kzalloc(sizeof(*strm), GFP_KERNEL);
+
+ if (!strm) {
+ z_erofs_lzma_exit();
+ return -ENOMEM;
+ }
+ spin_lock(&z_erofs_lzma_lock);
+ strm->next = z_erofs_lzma_head;
+ z_erofs_lzma_head = strm;
+ spin_unlock(&z_erofs_lzma_lock);
+ ++z_erofs_lzma_avail_strms;
+ }
+ return 0;
+}
+
+int z_erofs_load_lzma_config(struct super_block *sb,
+ struct erofs_super_block *dsb,
+ struct z_erofs_lzma_cfgs *lzma, int size)
+{
+ static DEFINE_MUTEX(lzma_resize_mutex);
+ unsigned int dict_size, i;
+ struct z_erofs_lzma *strm, *head = NULL;
+ int err;
+
+ if (!lzma || size < sizeof(struct z_erofs_lzma_cfgs)) {
+ erofs_err(sb, "invalid lzma cfgs, size=%u", size);
+ return -EINVAL;
+ }
+ if (lzma->format) {
+ erofs_err(sb, "unidentified lzma format %x, please check kernel version",
+ le16_to_cpu(lzma->format));
+ return -EINVAL;
+ }
+ dict_size = le32_to_cpu(lzma->dict_size);
+ if (dict_size > Z_EROFS_LZMA_MAX_DICT_SIZE || dict_size < 4096) {
+ erofs_err(sb, "unsupported lzma dictionary size %u",
+ dict_size);
+ return -EINVAL;
+ }
+
+ erofs_info(sb, "EXPERIMENTAL MicroLZMA in use. Use at your own risk!");
+
+ /* in case 2 z_erofs_load_lzma_config() race to avoid deadlock */
+ mutex_lock(&lzma_resize_mutex);
+
+ if (z_erofs_lzma_max_dictsize >= dict_size) {
+ mutex_unlock(&lzma_resize_mutex);
+ return 0;
+ }
+
+ /* 1. collect/isolate all streams for the following check */
+ for (i = 0; i < z_erofs_lzma_avail_strms; ++i) {
+ struct z_erofs_lzma *last;
+
+again:
+ spin_lock(&z_erofs_lzma_lock);
+ strm = z_erofs_lzma_head;
+ if (!strm) {
+ spin_unlock(&z_erofs_lzma_lock);
+ wait_event(z_erofs_lzma_wq,
+ READ_ONCE(z_erofs_lzma_head));
+ goto again;
+ }
+ z_erofs_lzma_head = NULL;
+ spin_unlock(&z_erofs_lzma_lock);
+
+ for (last = strm; last->next; last = last->next)
+ ++i;
+ last->next = head;
+ head = strm;
+ }
+
+ err = 0;
+ /* 2. walk each isolated stream and grow max dict_size if needed */
+ for (strm = head; strm; strm = strm->next) {
+ if (strm->state)
+ xz_dec_microlzma_end(strm->state);
+ strm->state = xz_dec_microlzma_alloc(XZ_PREALLOC, dict_size);
+ if (!strm->state)
+ err = -ENOMEM;
+ }
+
+ /* 3. push back all to the global list and update max dict_size */
+ spin_lock(&z_erofs_lzma_lock);
+ DBG_BUGON(z_erofs_lzma_head);
+ z_erofs_lzma_head = head;
+ spin_unlock(&z_erofs_lzma_lock);
+ wake_up_all(&z_erofs_lzma_wq);
+
+ z_erofs_lzma_max_dictsize = dict_size;
+ mutex_unlock(&lzma_resize_mutex);
+ return err;
+}
+
+int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq,
+ struct page **pagepool)
+{
+ const unsigned int nrpages_out =
+ PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
+ const unsigned int nrpages_in =
+ PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT;
+ unsigned int inlen, outlen, pageofs;
+ struct z_erofs_lzma *strm;
+ u8 *kin;
+ bool bounced = false;
+ int no, ni, j, err = 0;
+
+ /* 1. get the exact LZMA compressed size */
+ kin = kmap(*rq->in);
+ err = z_erofs_fixup_insize(rq, kin + rq->pageofs_in,
+ min_t(unsigned int, rq->inputsize,
+ EROFS_BLKSIZ - rq->pageofs_in));
+ if (err) {
+ kunmap(*rq->in);
+ return err;
+ }
+
+ /* 2. get an available lzma context */
+again:
+ spin_lock(&z_erofs_lzma_lock);
+ strm = z_erofs_lzma_head;
+ if (!strm) {
+ spin_unlock(&z_erofs_lzma_lock);
+ wait_event(z_erofs_lzma_wq, READ_ONCE(z_erofs_lzma_head));
+ goto again;
+ }
+ z_erofs_lzma_head = strm->next;
+ spin_unlock(&z_erofs_lzma_lock);
+
+ /* 3. multi-call decompress */
+ inlen = rq->inputsize;
+ outlen = rq->outputsize;
+ xz_dec_microlzma_reset(strm->state, inlen, outlen,
+ !rq->partial_decoding);
+ pageofs = rq->pageofs_out;
+ strm->buf.in = kin + rq->pageofs_in;
+ strm->buf.in_pos = 0;
+ strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE - rq->pageofs_in);
+ inlen -= strm->buf.in_size;
+ strm->buf.out = NULL;
+ strm->buf.out_pos = 0;
+ strm->buf.out_size = 0;
+
+ for (ni = 0, no = -1;;) {
+ enum xz_ret xz_err;
+
+ if (strm->buf.out_pos == strm->buf.out_size) {
+ if (strm->buf.out) {
+ kunmap(rq->out[no]);
+ strm->buf.out = NULL;
+ }
+
+ if (++no >= nrpages_out || !outlen) {
+ erofs_err(rq->sb, "decompressed buf out of bound");
+ err = -EFSCORRUPTED;
+ break;
+ }
+ strm->buf.out_pos = 0;
+ strm->buf.out_size = min_t(u32, outlen,
+ PAGE_SIZE - pageofs);
+ outlen -= strm->buf.out_size;
+ if (!rq->out[no] && rq->fillgaps) /* deduped */
+ rq->out[no] = erofs_allocpage(pagepool,
+ GFP_KERNEL | __GFP_NOFAIL);
+ if (rq->out[no])
+ strm->buf.out = kmap(rq->out[no]) + pageofs;
+ pageofs = 0;
+ } else if (strm->buf.in_pos == strm->buf.in_size) {
+ kunmap(rq->in[ni]);
+
+ if (++ni >= nrpages_in || !inlen) {
+ erofs_err(rq->sb, "compressed buf out of bound");
+ err = -EFSCORRUPTED;
+ break;
+ }
+ strm->buf.in_pos = 0;
+ strm->buf.in_size = min_t(u32, inlen, PAGE_SIZE);
+ inlen -= strm->buf.in_size;
+ kin = kmap(rq->in[ni]);
+ strm->buf.in = kin;
+ bounced = false;
+ }
+
+ /*
+ * Handle overlapping: use the bounce buffer if the compressed
+ * data is still being processed; otherwise, use short-lived pages
+ * from the on-stack pagepool, which are shared within the same
+ * request.
+ */
+ if (!bounced && rq->out[no] == rq->in[ni]) {
+ memcpy(strm->bounce, strm->buf.in, strm->buf.in_size);
+ strm->buf.in = strm->bounce;
+ bounced = true;
+ }
+ for (j = ni + 1; j < nrpages_in; ++j) {
+ struct page *tmppage;
+
+ if (rq->out[no] != rq->in[j])
+ continue;
+
+ DBG_BUGON(erofs_page_is_managed(EROFS_SB(rq->sb),
+ rq->in[j]));
+ tmppage = erofs_allocpage(pagepool,
+ GFP_KERNEL | __GFP_NOFAIL);
+ set_page_private(tmppage, Z_EROFS_SHORTLIVED_PAGE);
+ copy_highpage(tmppage, rq->in[j]);
+ rq->in[j] = tmppage;
+ }
+ xz_err = xz_dec_microlzma_run(strm->state, &strm->buf);
+ DBG_BUGON(strm->buf.out_pos > strm->buf.out_size);
+ DBG_BUGON(strm->buf.in_pos > strm->buf.in_size);
+
+ if (xz_err != XZ_OK) {
+ if (xz_err == XZ_STREAM_END && !outlen)
+ break;
+ erofs_err(rq->sb, "failed to decompress %d in[%u] out[%u]",
+ xz_err, rq->inputsize, rq->outputsize);
+ err = -EFSCORRUPTED;
+ break;
+ }
+ }
+ if (no < nrpages_out && strm->buf.out)
+ kunmap(rq->out[no]);
+ if (ni < nrpages_in)
+ kunmap(rq->in[ni]);
+ /* 4. push back LZMA stream context to the global list */
+ spin_lock(&z_erofs_lzma_lock);
+ strm->next = z_erofs_lzma_head;
+ z_erofs_lzma_head = strm;
+ spin_unlock(&z_erofs_lzma_lock);
+ wake_up(&z_erofs_lzma_wq);
+ return err;
+}
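
The decompressor above drives the MicroLZMA decoder in multi-call mode, refilling the xz_buf page by page. A condensed sketch of that call cycle, using only the xz_dec_microlzma_* calls already present in this file (buffer refills and error handling elided):

	/* one-time setup (done in z_erofs_load_lzma_config above) */
	strm->state = xz_dec_microlzma_alloc(XZ_PREALLOC, dict_size);

	/* per-request cycle (done in z_erofs_lzma_decompress above) */
	xz_dec_microlzma_reset(strm->state, inlen, outlen,
			       !rq->partial_decoding);
	do {
		/* refill strm->buf.in / strm->buf.out page by page here */
		xz_err = xz_dec_microlzma_run(strm->state, &strm->buf);
	} while (xz_err == XZ_OK);
	/* XZ_STREAM_END with all output produced means success */

	/* teardown (done in z_erofs_lzma_exit above) */
	xz_dec_microlzma_end(strm->state);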
diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c
index d28c623dfef9..ecf28f66b97d 100644
--- a/fs/erofs/dir.c
+++ b/fs/erofs/dir.c
@@ -1,8 +1,8 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2017-2018 HUAWEI, Inc.
- * http://www.huawei.com/
- * Created by Gao Xiang <gaoxiang25@huawei.com>
+ * https://www.huawei.com/
+ * Copyright (C) 2022, Alibaba Cloud
*/
#include "internal.h"
@@ -22,10 +22,9 @@ static void debug_one_dentry(unsigned char d_type, const char *de_name,
}
static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx,
- void *dentry_blk, unsigned int *ofs,
+ void *dentry_blk, struct erofs_dirent *de,
unsigned int nameoff, unsigned int maxsize)
{
- struct erofs_dirent *de = dentry_blk + *ofs;
const struct erofs_dirent *end = dentry_blk + nameoff;
while (de < end) {
@@ -59,16 +58,15 @@ static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx,
/* stopped by some reason */
return 1;
++de;
- *ofs += sizeof(struct erofs_dirent);
+ ctx->pos += sizeof(struct erofs_dirent);
}
- *ofs = maxsize;
return 0;
}
static int erofs_readdir(struct file *f, struct dir_context *ctx)
{
struct inode *dir = file_inode(f);
- struct address_space *mapping = dir->i_mapping;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
const size_t dirsize = i_size_read(dir);
unsigned int i = ctx->pos / EROFS_BLKSIZ;
unsigned int ofs = ctx->pos % EROFS_BLKSIZ;
@@ -76,61 +74,51 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx)
bool initial = true;
while (ctx->pos < dirsize) {
- struct page *dentry_page;
struct erofs_dirent *de;
unsigned int nameoff, maxsize;
- dentry_page = read_mapping_page(mapping, i, NULL);
- if (dentry_page == ERR_PTR(-ENOMEM)) {
- err = -ENOMEM;
- break;
- } else if (IS_ERR(dentry_page)) {
+ de = erofs_bread(&buf, dir, i, EROFS_KMAP);
+ if (IS_ERR(de)) {
erofs_err(dir->i_sb,
"fail to readdir of logical block %u of nid %llu",
i, EROFS_I(dir)->nid);
- err = -EFSCORRUPTED;
+ err = PTR_ERR(de);
break;
}
- de = (struct erofs_dirent *)kmap(dentry_page);
-
nameoff = le16_to_cpu(de->nameoff);
-
if (nameoff < sizeof(struct erofs_dirent) ||
- nameoff >= PAGE_SIZE) {
+ nameoff >= EROFS_BLKSIZ) {
erofs_err(dir->i_sb,
"invalid de[0].nameoff %u @ nid %llu",
nameoff, EROFS_I(dir)->nid);
err = -EFSCORRUPTED;
- goto skip_this;
+ break;
}
maxsize = min_t(unsigned int,
- dirsize - ctx->pos + ofs, PAGE_SIZE);
+ dirsize - ctx->pos + ofs, EROFS_BLKSIZ);
/* search dirents at the arbitrary position */
if (initial) {
initial = false;
ofs = roundup(ofs, sizeof(struct erofs_dirent));
+ ctx->pos = blknr_to_addr(i) + ofs;
if (ofs >= nameoff)
goto skip_this;
}
- err = erofs_fill_dentries(dir, ctx, de, &ofs,
+ err = erofs_fill_dentries(dir, ctx, de, (void *)de + ofs,
nameoff, maxsize);
-skip_this:
- kunmap(dentry_page);
-
- put_page(dentry_page);
-
- ctx->pos = blknr_to_addr(i) + ofs;
-
if (err)
break;
+skip_this:
+ ctx->pos = blknr_to_addr(i) + maxsize;
++i;
ofs = 0;
}
+ erofs_put_metabuf(&buf);
return err < 0 ? err : 0;
}
@@ -139,4 +127,3 @@ const struct file_operations erofs_dir_fops = {
.read = generic_read_dir,
.iterate_shared = erofs_readdir,
};
-
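
After the conversion above, erofs_readdir() keeps ctx->pos in byte units of the directory and derives the logical block and intra-block offset from it, as in the hunks above. A small self-contained sketch of that position arithmetic (plain C, illustrative only; the 4KiB block size and 12-byte on-disk dirent size are assumptions matching the code above):

#include <stdio.h>

#define EROFS_BLKSIZ		4096u	/* assumption: block size == PAGE_SIZE */
#define EROFS_DIRENT_SIZE	12u	/* assumption: sizeof(struct erofs_dirent) */

int main(void)
{
	unsigned long long pos = 8200;	/* e.g. resuming a partially-read directory */
	unsigned int i = pos / EROFS_BLKSIZ;	/* logical block to read via erofs_bread() */
	unsigned int ofs = pos % EROFS_BLKSIZ;	/* offset of the next dirent inside it */

	/* dirent offsets are rounded up to the dirent size before use */
	ofs = (ofs + EROFS_DIRENT_SIZE - 1) / EROFS_DIRENT_SIZE * EROFS_DIRENT_SIZE;

	printf("block %u, offset %u, pos resumes at %llu\n",
	       i, ofs, (unsigned long long)i * EROFS_BLKSIZ + ofs);
	return 0;
}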
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index 385fa49c7749..dbcd24371002 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -3,8 +3,8 @@
* EROFS (Enhanced ROM File System) on-disk format definition
*
* Copyright (C) 2017-2018 HUAWEI, Inc.
- * http://www.huawei.com/
- * Created by Gao Xiang <gaoxiang25@huawei.com>
+ * https://www.huawei.com/
+ * Copyright (C) 2021, Alibaba Cloud
*/
#ifndef __EROFS_FS_H
#define __EROFS_FS_H
@@ -12,53 +12,94 @@
#define EROFS_SUPER_OFFSET 1024
#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001
+#define EROFS_FEATURE_COMPAT_MTIME 0x00000002
/*
* Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should
* be incompatible with this kernel version.
*/
-#define EROFS_FEATURE_INCOMPAT_LZ4_0PADDING 0x00000001
-#define EROFS_ALL_FEATURE_INCOMPAT EROFS_FEATURE_INCOMPAT_LZ4_0PADDING
+#define EROFS_FEATURE_INCOMPAT_ZERO_PADDING 0x00000001
+#define EROFS_FEATURE_INCOMPAT_COMPR_CFGS 0x00000002
+#define EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER 0x00000002
+#define EROFS_FEATURE_INCOMPAT_CHUNKED_FILE 0x00000004
+#define EROFS_FEATURE_INCOMPAT_DEVICE_TABLE 0x00000008
+#define EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 0x00000008
+#define EROFS_FEATURE_INCOMPAT_ZTAILPACKING 0x00000010
+#define EROFS_FEATURE_INCOMPAT_FRAGMENTS 0x00000020
+#define EROFS_FEATURE_INCOMPAT_DEDUPE 0x00000020
+#define EROFS_ALL_FEATURE_INCOMPAT \
+ (EROFS_FEATURE_INCOMPAT_ZERO_PADDING | \
+ EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \
+ EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \
+ EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \
+ EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \
+ EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 | \
+ EROFS_FEATURE_INCOMPAT_ZTAILPACKING | \
+ EROFS_FEATURE_INCOMPAT_FRAGMENTS | \
+ EROFS_FEATURE_INCOMPAT_DEDUPE)
+
+#define EROFS_SB_EXTSLOT_SIZE 16
+
+struct erofs_deviceslot {
+ u8 tag[64]; /* digest(sha256), etc. */
+ __le32 blocks; /* total fs blocks of this device */
+ __le32 mapped_blkaddr; /* map starting at mapped_blkaddr */
+ u8 reserved[56];
+};
+#define EROFS_DEVT_SLOT_SIZE sizeof(struct erofs_deviceslot)
-/* 128-byte erofs on-disk super block */
+/* erofs on-disk super block (currently 128 bytes) */
struct erofs_super_block {
__le32 magic; /* file system magic number */
__le32 checksum; /* crc32c(super_block) */
__le32 feature_compat;
__u8 blkszbits; /* support block_size == PAGE_SIZE only */
- __u8 reserved;
+ __u8 sb_extslots; /* superblock size = 128 + sb_extslots * 16 */
__le16 root_nid; /* nid of root directory */
__le64 inos; /* total valid ino # (== f_files - f_favail) */
- __le64 build_time; /* inode v1 time derivation */
- __le32 build_time_nsec; /* inode v1 time derivation in nano scale */
+ __le64 build_time; /* compact inode time derivation */
+ __le32 build_time_nsec; /* compact inode time derivation in ns scale */
__le32 blocks; /* used for statfs */
__le32 meta_blkaddr; /* start block address of metadata area */
__le32 xattr_blkaddr; /* start block address of shared xattr area */
__u8 uuid[16]; /* 128-bit uuid for volume */
__u8 volume_name[16]; /* volume name */
__le32 feature_incompat;
- __u8 reserved2[44];
+ union {
+ /* bitmap for available compression algorithms */
+ __le16 available_compr_algs;
+ /* customized sliding window size instead of 64k by default */
+ __le16 lz4_max_distance;
+ } __packed u1;
+ __le16 extra_devices; /* # of devices besides the primary device */
+ __le16 devt_slotoff; /* startoff = devt_slotoff * devt_slotsize */
+ __u8 reserved[6];
+ __le64 packed_nid; /* nid of the special packed inode */
+ __u8 reserved2[24];
};
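
For reference, the layout arithmetic implied by the two structures above is simple enough to spell out. The sketch below is an illustrative userspace-style aid, not part of the patch; the constants follow the sb_extslots and devt_slotoff comments, and the example_* names are made up.

#include <stdint.h>

#define EROFS_SB_EXTSLOT_SIZE	16
#define EROFS_DEVT_SLOT_SIZE	128	/* sizeof(struct erofs_deviceslot) */

/* total superblock size = 128 + sb_extslots * 16, located at byte 1024 */
static inline uint32_t example_sb_size(uint8_t sb_extslots)
{
	return 128 + (uint32_t)sb_extslots * EROFS_SB_EXTSLOT_SIZE;
}

/* byte position of device slot @i: startoff = devt_slotoff * devt_slotsize */
static inline uint64_t example_devt_slot_pos(uint16_t devt_slotoff, unsigned int i)
{
	return ((uint64_t)devt_slotoff + i) * EROFS_DEVT_SLOT_SIZE;
}
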
/*
* erofs inode datalayout (i_format in on-disk inode):
- * 0 - inode plain without inline data A:
+ * 0 - uncompressed flat inode without tail-packing inline data:
* inode, [xattrs], ... | ... | no-holed data
- * 1 - inode VLE compression B (legacy):
- * inode, [xattrs], extents ... | ...
- * 2 - inode plain with inline data C:
- * inode, [xattrs], last_inline_data, ... | ... | no-holed data
- * 3 - inode compression D:
+ * 1 - compressed inode with non-compact indexes:
+ * inode, [xattrs], [map_header], extents ... | ...
+ * 2 - uncompressed flat inode with tail-packing inline data:
+ * inode, [xattrs], tailpacking data, ... | ... | no-holed data
+ * 3 - compressed inode with compact indexes:
* inode, [xattrs], map_header, extents ... | ...
- * 4~7 - reserved
+ * 4 - chunk-based inode with (optional) multi-device support:
+ * inode, [xattrs], chunk indexes ... | ...
+ * 5~7 - reserved
*/
enum {
EROFS_INODE_FLAT_PLAIN = 0,
EROFS_INODE_FLAT_COMPRESSION_LEGACY = 1,
EROFS_INODE_FLAT_INLINE = 2,
EROFS_INODE_FLAT_COMPRESSION = 3,
+ EROFS_INODE_CHUNK_BASED = 4,
EROFS_INODE_DATALAYOUT_MAX
};
@@ -68,13 +109,29 @@ static inline bool erofs_inode_is_data_compressed(unsigned int datamode)
datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY;
}
-/* bit definitions of inode i_advise */
+/* bit definitions of inode i_format */
#define EROFS_I_VERSION_BITS 1
#define EROFS_I_DATALAYOUT_BITS 3
#define EROFS_I_VERSION_BIT 0
#define EROFS_I_DATALAYOUT_BIT 1
+#define EROFS_I_ALL \
+ ((1 << (EROFS_I_DATALAYOUT_BIT + EROFS_I_DATALAYOUT_BITS)) - 1)
+
+/* indicate chunk blkbits, thus 'chunksize = blocksize << chunk blkbits' */
+#define EROFS_CHUNK_FORMAT_BLKBITS_MASK 0x001F
+/* with chunk indexes or just a 4-byte blkaddr array */
+#define EROFS_CHUNK_FORMAT_INDEXES 0x0020
+
+#define EROFS_CHUNK_FORMAT_ALL \
+ (EROFS_CHUNK_FORMAT_BLKBITS_MASK | EROFS_CHUNK_FORMAT_INDEXES)
+
+struct erofs_inode_chunk_info {
+ __le16 format; /* chunk blkbits, etc. */
+ __le16 reserved;
+};
+
/* 32-byte reduced form of an ondisk inode */
struct erofs_inode_compact {
__le16 i_format; /* inode format hints */
@@ -86,12 +143,16 @@ struct erofs_inode_compact {
__le32 i_size;
__le32 i_reserved;
union {
- /* file total compressed blocks for data mapping 1 */
+ /* total compressed blocks for compressed inodes */
__le32 compressed_blocks;
+ /* block address for uncompressed flat inodes */
__le32 raw_blkaddr;
/* for device files, used to indicate old/new device # */
__le32 rdev;
+
+ /* for chunk-based files, it contains the summary info */
+ struct erofs_inode_chunk_info c;
} i_u;
__le32 i_ino; /* only used for 32-bit stat compatibility */
__le16 i_uid;
@@ -99,9 +160,9 @@ struct erofs_inode_compact {
__le32 i_reserved2;
};
-/* 32 bytes on-disk inode */
+/* 32-byte on-disk inode */
#define EROFS_INODE_LAYOUT_COMPACT 0
-/* 64 bytes on-disk inode */
+/* 64-byte on-disk inode */
#define EROFS_INODE_LAYOUT_EXTENDED 1
/* 64-byte complete form of an ondisk inode */
@@ -114,12 +175,16 @@ struct erofs_inode_extended {
__le16 i_reserved;
__le64 i_size;
union {
- /* file total compressed blocks for data mapping 1 */
+ /* total compressed blocks for compressed inodes */
__le32 compressed_blocks;
+ /* block address for uncompressed flat inodes */
__le32 raw_blkaddr;
/* for device files, used to indicate old/new device # */
__le32 rdev;
+
+ /* for chunk-based files, it contains the summary info */
+ struct erofs_inode_chunk_info c;
} i_u;
/* only used for 32-bit stat compatibility */
@@ -127,8 +192,8 @@ struct erofs_inode_extended {
__le32 i_uid;
__le32 i_gid;
- __le64 i_ctime;
- __le32 i_ctime_nsec;
+ __le64 i_mtime;
+ __le32 i_mtime_nsec;
__le32 i_nlink;
__u8 i_reserved2[16];
};
@@ -152,7 +217,7 @@ struct erofs_xattr_ibody_header {
__le32 h_reserved;
__u8 h_shared_count;
__u8 h_reserved2[7];
- __le32 h_shared_xattrs[0]; /* shared xattr id array */
+ __le32 h_shared_xattrs[]; /* shared xattr id array */
};
/* Name indexes */
@@ -169,7 +234,7 @@ struct erofs_xattr_entry {
__u8 e_name_index; /* attribute name index */
__le16 e_value_size; /* size of attribute value */
/* followed by e_name and e_value */
- char e_name[0]; /* attribute name */
+ char e_name[]; /* attribute name */
};
static inline unsigned int erofs_xattr_ibody_size(__le16 i_xattr_icount)
@@ -189,23 +254,74 @@ static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e)
e->e_name_len + le16_to_cpu(e->e_value_size));
}
+/* represent a zeroed chunk (hole) */
+#define EROFS_NULL_ADDR -1
+
+/* 4-byte block address array */
+#define EROFS_BLOCK_MAP_ENTRY_SIZE sizeof(__le32)
+
+/* 8-byte inode chunk indexes */
+struct erofs_inode_chunk_index {
+ __le16 advise; /* always 0, don't care for now */
+ __le16 device_id; /* back-end storage id (with bits masked) */
+ __le32 blkaddr; /* start block address of this inode chunk */
+};
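
To make the chunk-based layout above concrete: the chunk size derives from the low bits of the format word, and each chunk is described either by a 4-byte block address or by the 8-byte index above. A hedged userspace-style sketch follows (example_* helpers are made up; the kernel additionally aligns the start of the chunk map to the entry size).

#include <stdint.h>

#define EROFS_CHUNK_FORMAT_BLKBITS_MASK	0x001F
#define EROFS_CHUNK_FORMAT_INDEXES	0x0020

/* chunksize = blocksize << chunk blkbits; blkszbits is 12 for 4KiB blocks */
static inline uint64_t example_chunk_size(uint16_t format, unsigned int blkszbits)
{
	return 1ULL << (blkszbits + (format & EROFS_CHUNK_FORMAT_BLKBITS_MASK));
}

/* offset of the entry covering logical byte @la within the inode's chunk map */
static inline uint64_t example_chunk_entry_off(uint16_t format,
					       unsigned int blkszbits, uint64_t la)
{
	uint64_t chunknr = la >> (blkszbits +
				  (format & EROFS_CHUNK_FORMAT_BLKBITS_MASK));
	unsigned int unit = (format & EROFS_CHUNK_FORMAT_INDEXES) ?
			    8 /* struct erofs_inode_chunk_index */ :
			    4 /* __le32 block address */;

	return chunknr * unit;
}
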
+
+/* maximum supported size of a physical compression cluster */
+#define Z_EROFS_PCLUSTER_MAX_SIZE (1024 * 1024)
+
/* available compression algorithm types (for h_algorithmtype) */
enum {
- Z_EROFS_COMPRESSION_LZ4 = 0,
+ Z_EROFS_COMPRESSION_LZ4 = 0,
+ Z_EROFS_COMPRESSION_LZMA = 1,
Z_EROFS_COMPRESSION_MAX
};
+#define Z_EROFS_ALL_COMPR_ALGS ((1 << Z_EROFS_COMPRESSION_MAX) - 1)
+
+/* 14 bytes (+ length field = 16 bytes) */
+struct z_erofs_lz4_cfgs {
+ __le16 max_distance;
+ __le16 max_pclusterblks;
+ u8 reserved[10];
+} __packed;
+
+/* 14 bytes (+ length field = 16 bytes) */
+struct z_erofs_lzma_cfgs {
+ __le32 dict_size;
+ __le16 format;
+ u8 reserved[8];
+} __packed;
+
+#define Z_EROFS_LZMA_MAX_DICT_SIZE (8 * Z_EROFS_PCLUSTER_MAX_SIZE)
/*
* bit 0 : COMPACTED_2B indexes (0 - off; 1 - on)
* e.g. for 4k logical cluster size, 4B if compacted 2B is off;
* (4B) + 2B + (4B) if compacted 2B is on.
+ * bit 1 : HEAD1 big pcluster (0 - off; 1 - on)
+ * bit 2 : HEAD2 big pcluster (0 - off; 1 - on)
+ * bit 3 : tailpacking inline pcluster (0 - off; 1 - on)
+ * bit 4 : interlaced plain pcluster (0 - off; 1 - on)
+ * bit 5 : fragment pcluster (0 - off; 1 - on)
*/
-#define Z_EROFS_ADVISE_COMPACTED_2B_BIT 0
-
-#define Z_EROFS_ADVISE_COMPACTED_2B (1 << Z_EROFS_ADVISE_COMPACTED_2B_BIT)
-
+#define Z_EROFS_ADVISE_COMPACTED_2B 0x0001
+#define Z_EROFS_ADVISE_BIG_PCLUSTER_1 0x0002
+#define Z_EROFS_ADVISE_BIG_PCLUSTER_2 0x0004
+#define Z_EROFS_ADVISE_INLINE_PCLUSTER 0x0008
+#define Z_EROFS_ADVISE_INTERLACED_PCLUSTER 0x0010
+#define Z_EROFS_ADVISE_FRAGMENT_PCLUSTER 0x0020
+
+#define Z_EROFS_FRAGMENT_INODE_BIT 7
struct z_erofs_map_header {
- __le32 h_reserved1;
+ union {
+ /* fragment data offset in the packed inode */
+ __le32 h_fragmentoff;
+ struct {
+ __le16 h_reserved1;
+ /* indicates the encoded size of tailpacking data */
+ __le16 h_idata_size;
+ };
+ };
__le16 h_advise;
/*
* bit 0-3 : algorithm type of head 1 (logical cluster type 01);
@@ -214,9 +330,8 @@ struct z_erofs_map_header {
__u8 h_algorithmtype;
/*
* bit 0-2 : logical cluster bits - 12, e.g. 0 for 4096;
- * bit 3-4 : (physical - logical) cluster bits of head 1:
- * For example, if logical clustersize = 4096, 1 for 8192.
- * bit 5-7 : (physical - logical) cluster bits of head 2.
+ * bit 3-6 : reserved;
+ * bit 7 : move the whole file into packed inode or not.
*/
__u8 h_clusterbits;
};
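
The packed byte fields of struct z_erofs_map_header above decode as sketched below; the helper names are invented for illustration, and only the bit layout documented in the comments (plus the symmetric upper nibble of h_algorithmtype for HEAD2) is assumed.

#include <stdint.h>

/* bit 0-3: algorithm type of HEAD1; bit 4-7: algorithm type of HEAD2 */
static inline unsigned int example_head1_alg(uint8_t h_algorithmtype)
{
	return h_algorithmtype & 0xf;
}

static inline unsigned int example_head2_alg(uint8_t h_algorithmtype)
{
	return h_algorithmtype >> 4;
}

/* bit 0-2 of h_clusterbits store "logical cluster bits - 12" (0 -> 4096) */
static inline unsigned int example_lclusterbits(uint8_t h_clusterbits)
{
	return 12 + (h_clusterbits & 7);
}

/* bit 7 (Z_EROFS_FRAGMENT_INODE_BIT): whole file lives in the packed inode */
static inline int example_is_fragment_inode(uint8_t h_clusterbits)
{
	return !!(h_clusterbits & 0x80);
}
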
@@ -224,54 +339,62 @@ struct z_erofs_map_header {
#define Z_EROFS_VLE_LEGACY_HEADER_PADDING 8
/*
- * Fixed-sized output compression ondisk Logical Extent cluster type:
- * 0 - literal (uncompressed) cluster
- * 1 - compressed cluster (for the head logical cluster)
- * 2 - compressed cluster (for the other logical clusters)
+ * Fixed-sized output compression on-disk logical cluster type:
+ * 0 - literal (uncompressed) lcluster
+ * 1,3 - compressed lcluster (for HEAD lclusters)
+ * 2 - compressed lcluster (for NONHEAD lclusters)
*
* In detail,
- * 0 - literal (uncompressed) cluster,
+ * 0 - literal (uncompressed) lcluster,
* di_advise = 0
- * di_clusterofs = the literal data offset of the cluster
- * di_blkaddr = the blkaddr of the literal cluster
+ * di_clusterofs = the literal data offset of the lcluster
+ * di_blkaddr = the blkaddr of the literal pcluster
*
- * 1 - compressed cluster (for the head logical cluster)
- * di_advise = 1
- * di_clusterofs = the decompressed data offset of the cluster
- * di_blkaddr = the blkaddr of the compressed cluster
+ * 1,3 - compressed lcluster (for HEAD lclusters)
+ * di_advise = 1 or 3
+ * di_clusterofs = the decompressed data offset of the lcluster
+ * di_blkaddr = the blkaddr of the compressed pcluster
*
- * 2 - compressed cluster (for the other logical clusters)
+ * 2 - compressed lcluster (for NONHEAD lclusters)
* di_advise = 2
* di_clusterofs =
- * the decompressed data offset in its own head cluster
- * di_u.delta[0] = distance to its corresponding head cluster
- * di_u.delta[1] = distance to its corresponding tail cluster
- * (di_advise could be 0, 1 or 2)
+ * the decompressed data offset in its own HEAD lcluster
+ * di_u.delta[0] = distance to this HEAD lcluster
+ * di_u.delta[1] = distance to the next HEAD lcluster
*/
enum {
Z_EROFS_VLE_CLUSTER_TYPE_PLAIN = 0,
- Z_EROFS_VLE_CLUSTER_TYPE_HEAD = 1,
+ Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 = 1,
Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD = 2,
- Z_EROFS_VLE_CLUSTER_TYPE_RESERVED = 3,
+ Z_EROFS_VLE_CLUSTER_TYPE_HEAD2 = 3,
Z_EROFS_VLE_CLUSTER_TYPE_MAX
};
#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS 2
#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT 0
+/* (noncompact only, HEAD) This pcluster refers to partial decompressed data */
+#define Z_EROFS_VLE_DI_PARTIAL_REF (1 << 15)
+
+/*
+ * D0_CBLKCNT will be marked _only_ at the 1st non-head lcluster to store the
+ * compressed block count of a compressed extent (in logical clusters, aka.
+ * block count of a pcluster).
+ */
+#define Z_EROFS_VLE_DI_D0_CBLKCNT (1 << 11)
+
struct z_erofs_vle_decompressed_index {
__le16 di_advise;
- /* where to decompress in the head cluster */
+ /* where to decompress in the head lcluster */
__le16 di_clusterofs;
union {
- /* for the head cluster */
+ /* for the HEAD lclusters */
__le32 blkaddr;
/*
- * for the rest clusters
- * eg. for 4k page-sized cluster, maximum 4K*64k = 256M)
- * [0] - pointing to the head cluster
- * [1] - pointing to the tail cluster
+ * for the NONHEAD lclusters
+ * [0] - distance to its HEAD lcluster
+ * [1] - distance to the next HEAD lcluster
*/
__le16 delta[2];
} di_u;
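
A similarly hedged sketch of how the index above is interpreted: the lcluster type sits in the low bits of di_advise, and for NONHEAD lclusters delta[0] either points back to the HEAD lcluster or, when the D0_CBLKCNT bit defined earlier is set on the first NONHEAD entry, carries the pcluster block count instead. The example_* names are made up.

#include <stdint.h>

#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS	2
#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT		0
#define Z_EROFS_VLE_DI_D0_CBLKCNT		(1 << 11)

/* 0 PLAIN, 1 HEAD1, 2 NONHEAD, 3 HEAD2 (see the enum above) */
static inline unsigned int example_lcluster_type(uint16_t di_advise)
{
	return (di_advise >> Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT) &
	       ((1 << Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) - 1);
}

/* true if delta[0] of a NONHEAD entry holds the pcluster block count */
static inline int example_delta0_is_cblkcnt(uint16_t delta0)
{
	return !!(delta0 & Z_EROFS_VLE_DI_D0_CBLKCNT);
}
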
@@ -300,18 +423,30 @@ struct erofs_dirent {
/* check the EROFS on-disk layout strictly at compile time */
static inline void erofs_check_ondisk_layout_definitions(void)
{
+ const __le64 fmh = *(__le64 *)&(struct z_erofs_map_header) {
+ .h_clusterbits = 1 << Z_EROFS_FRAGMENT_INODE_BIT
+ };
+
BUILD_BUG_ON(sizeof(struct erofs_super_block) != 128);
BUILD_BUG_ON(sizeof(struct erofs_inode_compact) != 32);
BUILD_BUG_ON(sizeof(struct erofs_inode_extended) != 64);
BUILD_BUG_ON(sizeof(struct erofs_xattr_ibody_header) != 12);
BUILD_BUG_ON(sizeof(struct erofs_xattr_entry) != 4);
+ BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_info) != 4);
+ BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_index) != 8);
BUILD_BUG_ON(sizeof(struct z_erofs_map_header) != 8);
BUILD_BUG_ON(sizeof(struct z_erofs_vle_decompressed_index) != 8);
BUILD_BUG_ON(sizeof(struct erofs_dirent) != 12);
+ /* keep in sync between 2 index structures for better extendibility */
+ BUILD_BUG_ON(sizeof(struct erofs_inode_chunk_index) !=
+ sizeof(struct z_erofs_vle_decompressed_index));
+ BUILD_BUG_ON(sizeof(struct erofs_deviceslot) != 128);
BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) <
Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1);
+ /* exclude old compiler versions like gcc 7.5.0 */
+ BUILD_BUG_ON(__builtin_constant_p(fmh) ?
+ fmh != cpu_to_le64(1ULL << 63) : 0);
}
#endif
-
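
A note on the fmh check above: it builds a map header with only bit 7 of h_clusterbits set (the fragment-inode flag) and asserts, whenever the compiler can constant-fold the compound literal, that the 8 bytes read back as an __le64 equal cpu_to_le64(1ULL << 63), i.e. that h_clusterbits really is the last byte of the 8-byte header with no padding in between. A rough userspace equivalent, valid on a little-endian host only:

#include <assert.h>
#include <stdint.h>
#include <string.h>

struct example_map_header {		/* mirrors struct z_erofs_map_header */
	uint32_t h_fragmentoff;		/* the 4-byte union */
	uint16_t h_advise;
	uint8_t  h_algorithmtype;
	uint8_t  h_clusterbits;		/* expected to be the last byte */
};

static void example_check_fragment_bit(void)
{
	struct example_map_header h = { .h_clusterbits = 1u << 7 };
	uint64_t v;

	memcpy(&v, &h, sizeof(v));
	assert(sizeof(h) == 8);
	assert(v == 1ULL << 63);	/* little-endian hosts only */
}
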
diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c
new file mode 100644
index 000000000000..fe05bc51f9f2
--- /dev/null
+++ b/fs/erofs/fscache.c
@@ -0,0 +1,674 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2022, Alibaba Cloud
+ * Copyright (C) 2022, Bytedance Inc. All rights reserved.
+ */
+#include <linux/fscache.h>
+#include "internal.h"
+
+static DEFINE_MUTEX(erofs_domain_list_lock);
+static DEFINE_MUTEX(erofs_domain_cookies_lock);
+static LIST_HEAD(erofs_domain_list);
+static struct vfsmount *erofs_pseudo_mnt;
+
+static struct netfs_io_request *erofs_fscache_alloc_request(struct address_space *mapping,
+ loff_t start, size_t len)
+{
+ struct netfs_io_request *rreq;
+
+ rreq = kzalloc(sizeof(struct netfs_io_request), GFP_KERNEL);
+ if (!rreq)
+ return ERR_PTR(-ENOMEM);
+
+ rreq->start = start;
+ rreq->len = len;
+ rreq->mapping = mapping;
+ rreq->inode = mapping->host;
+ INIT_LIST_HEAD(&rreq->subrequests);
+ refcount_set(&rreq->ref, 1);
+ return rreq;
+}
+
+static void erofs_fscache_put_request(struct netfs_io_request *rreq)
+{
+ if (!refcount_dec_and_test(&rreq->ref))
+ return;
+ if (rreq->cache_resources.ops)
+ rreq->cache_resources.ops->end_operation(&rreq->cache_resources);
+ kfree(rreq);
+}
+
+static void erofs_fscache_put_subrequest(struct netfs_io_subrequest *subreq)
+{
+ if (!refcount_dec_and_test(&subreq->ref))
+ return;
+ erofs_fscache_put_request(subreq->rreq);
+ kfree(subreq);
+}
+
+static void erofs_fscache_clear_subrequests(struct netfs_io_request *rreq)
+{
+ struct netfs_io_subrequest *subreq;
+
+ while (!list_empty(&rreq->subrequests)) {
+ subreq = list_first_entry(&rreq->subrequests,
+ struct netfs_io_subrequest, rreq_link);
+ list_del(&subreq->rreq_link);
+ erofs_fscache_put_subrequest(subreq);
+ }
+}
+
+static void erofs_fscache_rreq_unlock_folios(struct netfs_io_request *rreq)
+{
+ struct netfs_io_subrequest *subreq;
+ struct folio *folio;
+ unsigned int iopos = 0;
+ pgoff_t start_page = rreq->start / PAGE_SIZE;
+ pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1;
+ bool subreq_failed = false;
+
+ XA_STATE(xas, &rreq->mapping->i_pages, start_page);
+
+ subreq = list_first_entry(&rreq->subrequests,
+ struct netfs_io_subrequest, rreq_link);
+ subreq_failed = (subreq->error < 0);
+
+ rcu_read_lock();
+ xas_for_each(&xas, folio, last_page) {
+ unsigned int pgpos =
+ (folio_index(folio) - start_page) * PAGE_SIZE;
+ unsigned int pgend = pgpos + folio_size(folio);
+ bool pg_failed = false;
+
+ for (;;) {
+ if (!subreq) {
+ pg_failed = true;
+ break;
+ }
+
+ pg_failed |= subreq_failed;
+ if (pgend < iopos + subreq->len)
+ break;
+
+ iopos += subreq->len;
+ if (!list_is_last(&subreq->rreq_link,
+ &rreq->subrequests)) {
+ subreq = list_next_entry(subreq, rreq_link);
+ subreq_failed = (subreq->error < 0);
+ } else {
+ subreq = NULL;
+ subreq_failed = false;
+ }
+ if (pgend == iopos)
+ break;
+ }
+
+ if (!pg_failed)
+ folio_mark_uptodate(folio);
+
+ folio_unlock(folio);
+ }
+ rcu_read_unlock();
+}
+
+static void erofs_fscache_rreq_complete(struct netfs_io_request *rreq)
+{
+ erofs_fscache_rreq_unlock_folios(rreq);
+ erofs_fscache_clear_subrequests(rreq);
+ erofs_fscache_put_request(rreq);
+}
+
+static void erofc_fscache_subreq_complete(void *priv,
+ ssize_t transferred_or_error, bool was_async)
+{
+ struct netfs_io_subrequest *subreq = priv;
+ struct netfs_io_request *rreq = subreq->rreq;
+
+ if (IS_ERR_VALUE(transferred_or_error))
+ subreq->error = transferred_or_error;
+
+ if (atomic_dec_and_test(&rreq->nr_outstanding))
+ erofs_fscache_rreq_complete(rreq);
+
+ erofs_fscache_put_subrequest(subreq);
+}
+
+/*
+ * Read data from fscache and fill it into the page cache described by @rreq,
+ * whose start and length shall both be aligned with PAGE_SIZE. @pstart
+ * describes the starting physical address in the cache file.
+ */
+static int erofs_fscache_read_folios_async(struct fscache_cookie *cookie,
+ struct netfs_io_request *rreq, loff_t pstart)
+{
+ enum netfs_io_source source;
+ struct super_block *sb = rreq->mapping->host->i_sb;
+ struct netfs_io_subrequest *subreq;
+ struct netfs_cache_resources *cres = &rreq->cache_resources;
+ struct iov_iter iter;
+ loff_t start = rreq->start;
+ size_t len = rreq->len;
+ size_t done = 0;
+ int ret;
+
+ atomic_set(&rreq->nr_outstanding, 1);
+
+ ret = fscache_begin_read_operation(cres, cookie);
+ if (ret)
+ goto out;
+
+ while (done < len) {
+ subreq = kzalloc(sizeof(struct netfs_io_subrequest),
+ GFP_KERNEL);
+ if (subreq) {
+ INIT_LIST_HEAD(&subreq->rreq_link);
+ refcount_set(&subreq->ref, 2);
+ subreq->rreq = rreq;
+ refcount_inc(&rreq->ref);
+ } else {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ subreq->start = pstart + done;
+ subreq->len = len - done;
+ subreq->flags = 1 << NETFS_SREQ_ONDEMAND;
+
+ list_add_tail(&subreq->rreq_link, &rreq->subrequests);
+
+ source = cres->ops->prepare_read(subreq, LLONG_MAX);
+ if (WARN_ON(subreq->len == 0))
+ source = NETFS_INVALID_READ;
+ if (source != NETFS_READ_FROM_CACHE) {
+ erofs_err(sb, "failed to fscache prepare_read (source %d)",
+ source);
+ ret = -EIO;
+ subreq->error = ret;
+ erofs_fscache_put_subrequest(subreq);
+ goto out;
+ }
+
+ atomic_inc(&rreq->nr_outstanding);
+
+ iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages,
+ start + done, subreq->len);
+
+ ret = fscache_read(cres, subreq->start, &iter,
+ NETFS_READ_HOLE_FAIL,
+ erofc_fscache_subreq_complete, subreq);
+ if (ret == -EIOCBQUEUED)
+ ret = 0;
+ if (ret) {
+ erofs_err(sb, "failed to fscache_read (ret %d)", ret);
+ goto out;
+ }
+
+ done += subreq->len;
+ }
+out:
+ if (atomic_dec_and_test(&rreq->nr_outstanding))
+ erofs_fscache_rreq_complete(rreq);
+
+ return ret;
+}
+
+static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio)
+{
+ int ret;
+ struct super_block *sb = folio_mapping(folio)->host->i_sb;
+ struct netfs_io_request *rreq;
+ struct erofs_map_dev mdev = {
+ .m_deviceid = 0,
+ .m_pa = folio_pos(folio),
+ };
+
+ ret = erofs_map_dev(sb, &mdev);
+ if (ret)
+ goto out;
+
+ rreq = erofs_fscache_alloc_request(folio_mapping(folio),
+ folio_pos(folio), folio_size(folio));
+ if (IS_ERR(rreq)) {
+ ret = PTR_ERR(rreq);
+ goto out;
+ }
+
+ return erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
+ rreq, mdev.m_pa);
+out:
+ folio_unlock(folio);
+ return ret;
+}
+
+/*
+ * Read into page cache in the range described by (@pos, @len).
+ *
+ * On return, the caller is responsible for page unlocking if the output @unlock
+ * is true, or the callee will take this responsibility through the
+ * netfs_io_request interface.
+ *
+ * The return value is the number of bytes successfully handled, or a negative
+ * error code on failure. The only exception is that the length of the range,
+ * instead of the error code, is returned on failure after the netfs_io_request
+ * is allocated, so that .readahead() can advance rac accordingly.
+ */
+static int erofs_fscache_data_read(struct address_space *mapping,
+ loff_t pos, size_t len, bool *unlock)
+{
+ struct inode *inode = mapping->host;
+ struct super_block *sb = inode->i_sb;
+ struct netfs_io_request *rreq;
+ struct erofs_map_blocks map;
+ struct erofs_map_dev mdev;
+ struct iov_iter iter;
+ size_t count;
+ int ret;
+
+ *unlock = true;
+
+ map.m_la = pos;
+ ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
+ if (ret)
+ return ret;
+
+ if (map.m_flags & EROFS_MAP_META) {
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ erofs_blk_t blknr;
+ size_t offset, size;
+ void *src;
+
+ /* For tail packing layout, the offset may be non-zero. */
+ offset = erofs_blkoff(map.m_pa);
+ blknr = erofs_blknr(map.m_pa);
+ size = map.m_llen;
+
+ src = erofs_read_metabuf(&buf, sb, blknr, EROFS_KMAP);
+ if (IS_ERR(src))
+ return PTR_ERR(src);
+
+ iov_iter_xarray(&iter, READ, &mapping->i_pages, pos, PAGE_SIZE);
+ if (copy_to_iter(src + offset, size, &iter) != size)
+ return -EFAULT;
+ iov_iter_zero(PAGE_SIZE - size, &iter);
+ erofs_put_metabuf(&buf);
+ return PAGE_SIZE;
+ }
+
+ count = min_t(size_t, map.m_llen - (pos - map.m_la), len);
+ DBG_BUGON(!count || count % PAGE_SIZE);
+
+ if (!(map.m_flags & EROFS_MAP_MAPPED)) {
+ iov_iter_xarray(&iter, READ, &mapping->i_pages, pos, count);
+ iov_iter_zero(count, &iter);
+ return count;
+ }
+
+ mdev = (struct erofs_map_dev) {
+ .m_deviceid = map.m_deviceid,
+ .m_pa = map.m_pa,
+ };
+ ret = erofs_map_dev(sb, &mdev);
+ if (ret)
+ return ret;
+
+ rreq = erofs_fscache_alloc_request(mapping, pos, count);
+ if (IS_ERR(rreq))
+ return PTR_ERR(rreq);
+
+ *unlock = false;
+ erofs_fscache_read_folios_async(mdev.m_fscache->cookie,
+ rreq, mdev.m_pa + (pos - map.m_la));
+ return count;
+}
+
+static int erofs_fscache_read_folio(struct file *file, struct folio *folio)
+{
+ bool unlock;
+ int ret;
+
+ DBG_BUGON(folio_size(folio) != EROFS_BLKSIZ);
+
+ ret = erofs_fscache_data_read(folio_mapping(folio), folio_pos(folio),
+ folio_size(folio), &unlock);
+ if (unlock) {
+ if (ret > 0)
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
+ }
+ return ret < 0 ? ret : 0;
+}
+
+static void erofs_fscache_readahead(struct readahead_control *rac)
+{
+ struct folio *folio;
+ size_t len, done = 0;
+ loff_t start, pos;
+ bool unlock;
+ int ret, size;
+
+ if (!readahead_count(rac))
+ return;
+
+ start = readahead_pos(rac);
+ len = readahead_length(rac);
+
+ do {
+ pos = start + done;
+ ret = erofs_fscache_data_read(rac->mapping, pos,
+ len - done, &unlock);
+ if (ret <= 0)
+ return;
+
+ size = ret;
+ while (size) {
+ folio = readahead_folio(rac);
+ size -= folio_size(folio);
+ if (unlock) {
+ folio_mark_uptodate(folio);
+ folio_unlock(folio);
+ }
+ }
+ } while ((done += ret) < len);
+}
+
+static const struct address_space_operations erofs_fscache_meta_aops = {
+ .read_folio = erofs_fscache_meta_read_folio,
+};
+
+const struct address_space_operations erofs_fscache_access_aops = {
+ .read_folio = erofs_fscache_read_folio,
+ .readahead = erofs_fscache_readahead,
+};
+
+static void erofs_fscache_domain_put(struct erofs_domain *domain)
+{
+ if (!domain)
+ return;
+ mutex_lock(&erofs_domain_list_lock);
+ if (refcount_dec_and_test(&domain->ref)) {
+ list_del(&domain->list);
+ if (list_empty(&erofs_domain_list)) {
+ kern_unmount(erofs_pseudo_mnt);
+ erofs_pseudo_mnt = NULL;
+ }
+ mutex_unlock(&erofs_domain_list_lock);
+ fscache_relinquish_volume(domain->volume, NULL, false);
+ kfree(domain->domain_id);
+ kfree(domain);
+ return;
+ }
+ mutex_unlock(&erofs_domain_list_lock);
+}
+
+static int erofs_fscache_register_volume(struct super_block *sb)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ char *domain_id = sbi->opt.domain_id;
+ struct fscache_volume *volume;
+ char *name;
+ int ret = 0;
+
+ name = kasprintf(GFP_KERNEL, "erofs,%s",
+ domain_id ? domain_id : sbi->opt.fsid);
+ if (!name)
+ return -ENOMEM;
+
+ volume = fscache_acquire_volume(name, NULL, NULL, 0);
+ if (IS_ERR_OR_NULL(volume)) {
+ erofs_err(sb, "failed to register volume for %s", name);
+ ret = volume ? PTR_ERR(volume) : -EOPNOTSUPP;
+ volume = NULL;
+ }
+
+ sbi->volume = volume;
+ kfree(name);
+ return ret;
+}
+
+static int erofs_fscache_init_domain(struct super_block *sb)
+{
+ int err;
+ struct erofs_domain *domain;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+
+ domain = kzalloc(sizeof(struct erofs_domain), GFP_KERNEL);
+ if (!domain)
+ return -ENOMEM;
+
+ domain->domain_id = kstrdup(sbi->opt.domain_id, GFP_KERNEL);
+ if (!domain->domain_id) {
+ kfree(domain);
+ return -ENOMEM;
+ }
+
+ err = erofs_fscache_register_volume(sb);
+ if (err)
+ goto out;
+
+ if (!erofs_pseudo_mnt) {
+ erofs_pseudo_mnt = kern_mount(&erofs_fs_type);
+ if (IS_ERR(erofs_pseudo_mnt)) {
+ err = PTR_ERR(erofs_pseudo_mnt);
+ goto out;
+ }
+ }
+
+ domain->volume = sbi->volume;
+ refcount_set(&domain->ref, 1);
+ list_add(&domain->list, &erofs_domain_list);
+ sbi->domain = domain;
+ return 0;
+out:
+ kfree(domain->domain_id);
+ kfree(domain);
+ return err;
+}
+
+static int erofs_fscache_register_domain(struct super_block *sb)
+{
+ int err;
+ struct erofs_domain *domain;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+
+ mutex_lock(&erofs_domain_list_lock);
+ list_for_each_entry(domain, &erofs_domain_list, list) {
+ if (!strcmp(domain->domain_id, sbi->opt.domain_id)) {
+ sbi->domain = domain;
+ sbi->volume = domain->volume;
+ refcount_inc(&domain->ref);
+ mutex_unlock(&erofs_domain_list_lock);
+ return 0;
+ }
+ }
+ err = erofs_fscache_init_domain(sb);
+ mutex_unlock(&erofs_domain_list_lock);
+ return err;
+}
+
+static
+struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb,
+ char *name, bool need_inode)
+{
+ struct fscache_volume *volume = EROFS_SB(sb)->volume;
+ struct erofs_fscache *ctx;
+ struct fscache_cookie *cookie;
+ int ret;
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return ERR_PTR(-ENOMEM);
+
+ cookie = fscache_acquire_cookie(volume, FSCACHE_ADV_WANT_CACHE_SIZE,
+ name, strlen(name), NULL, 0, 0);
+ if (!cookie) {
+ erofs_err(sb, "failed to get cookie for %s", name);
+ ret = -EINVAL;
+ goto err;
+ }
+
+ fscache_use_cookie(cookie, false);
+ ctx->cookie = cookie;
+
+ if (need_inode) {
+ struct inode *const inode = new_inode(sb);
+
+ if (!inode) {
+ erofs_err(sb, "failed to get anon inode for %s", name);
+ ret = -ENOMEM;
+ goto err_cookie;
+ }
+
+ set_nlink(inode, 1);
+ inode->i_size = OFFSET_MAX;
+ inode->i_mapping->a_ops = &erofs_fscache_meta_aops;
+ mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
+
+ ctx->inode = inode;
+ }
+
+ return ctx;
+
+err_cookie:
+ fscache_unuse_cookie(ctx->cookie, NULL, NULL);
+ fscache_relinquish_cookie(ctx->cookie, false);
+err:
+ kfree(ctx);
+ return ERR_PTR(ret);
+}
+
+static void erofs_fscache_relinquish_cookie(struct erofs_fscache *ctx)
+{
+ fscache_unuse_cookie(ctx->cookie, NULL, NULL);
+ fscache_relinquish_cookie(ctx->cookie, false);
+ iput(ctx->inode);
+ kfree(ctx->name);
+ kfree(ctx);
+}
+
+static
+struct erofs_fscache *erofs_fscache_domain_init_cookie(struct super_block *sb,
+ char *name, bool need_inode)
+{
+ int err;
+ struct inode *inode;
+ struct erofs_fscache *ctx;
+ struct erofs_domain *domain = EROFS_SB(sb)->domain;
+
+ ctx = erofs_fscache_acquire_cookie(sb, name, need_inode);
+ if (IS_ERR(ctx))
+ return ctx;
+
+ ctx->name = kstrdup(name, GFP_KERNEL);
+ if (!ctx->name) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ inode = new_inode(erofs_pseudo_mnt->mnt_sb);
+ if (!inode) {
+ err = -ENOMEM;
+ goto out;
+ }
+
+ ctx->domain = domain;
+ ctx->anon_inode = inode;
+ inode->i_private = ctx;
+ refcount_inc(&domain->ref);
+ return ctx;
+out:
+ erofs_fscache_relinquish_cookie(ctx);
+ return ERR_PTR(err);
+}
+
+static
+struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb,
+ char *name, bool need_inode)
+{
+ struct inode *inode;
+ struct erofs_fscache *ctx;
+ struct erofs_domain *domain = EROFS_SB(sb)->domain;
+ struct super_block *psb = erofs_pseudo_mnt->mnt_sb;
+
+ mutex_lock(&erofs_domain_cookies_lock);
+ spin_lock(&psb->s_inode_list_lock);
+ list_for_each_entry(inode, &psb->s_inodes, i_sb_list) {
+ ctx = inode->i_private;
+ if (!ctx || ctx->domain != domain || strcmp(ctx->name, name))
+ continue;
+ igrab(inode);
+ spin_unlock(&psb->s_inode_list_lock);
+ mutex_unlock(&erofs_domain_cookies_lock);
+ return ctx;
+ }
+ spin_unlock(&psb->s_inode_list_lock);
+ ctx = erofs_fscache_domain_init_cookie(sb, name, need_inode);
+ mutex_unlock(&erofs_domain_cookies_lock);
+ return ctx;
+}
+
+struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
+ char *name, bool need_inode)
+{
+ if (EROFS_SB(sb)->opt.domain_id)
+ return erofs_domain_register_cookie(sb, name, need_inode);
+ return erofs_fscache_acquire_cookie(sb, name, need_inode);
+}
+
+void erofs_fscache_unregister_cookie(struct erofs_fscache *ctx)
+{
+ bool drop;
+ struct erofs_domain *domain;
+
+ if (!ctx)
+ return;
+ domain = ctx->domain;
+ if (domain) {
+ mutex_lock(&erofs_domain_cookies_lock);
+ drop = atomic_read(&ctx->anon_inode->i_count) == 1;
+ iput(ctx->anon_inode);
+ mutex_unlock(&erofs_domain_cookies_lock);
+ if (!drop)
+ return;
+ }
+
+ erofs_fscache_relinquish_cookie(ctx);
+ erofs_fscache_domain_put(domain);
+}
+
+int erofs_fscache_register_fs(struct super_block *sb)
+{
+ int ret;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ struct erofs_fscache *fscache;
+
+ if (sbi->opt.domain_id)
+ ret = erofs_fscache_register_domain(sb);
+ else
+ ret = erofs_fscache_register_volume(sb);
+ if (ret)
+ return ret;
+
+ /* acquired domain/volume will be relinquished in kill_sb() on error */
+ fscache = erofs_fscache_register_cookie(sb, sbi->opt.fsid, true);
+ if (IS_ERR(fscache))
+ return PTR_ERR(fscache);
+
+ sbi->s_fscache = fscache;
+ return 0;
+}
+
+void erofs_fscache_unregister_fs(struct super_block *sb)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+
+ erofs_fscache_unregister_cookie(sbi->s_fscache);
+
+ if (sbi->domain)
+ erofs_fscache_domain_put(sbi->domain);
+ else
+ fscache_relinquish_volume(sbi->volume, NULL, false);
+
+ sbi->s_fscache = NULL;
+ sbi->volume = NULL;
+ sbi->domain = NULL;
+}
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
index 3350ab65d892..ad2a82f2eb4c 100644
--- a/fs/erofs/inode.c
+++ b/fs/erofs/inode.c
@@ -1,38 +1,87 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2017-2018 HUAWEI, Inc.
- * http://www.huawei.com/
- * Created by Gao Xiang <gaoxiang25@huawei.com>
+ * https://www.huawei.com/
+ * Copyright (C) 2021, Alibaba Cloud
*/
#include "xattr.h"
#include <trace/events/erofs.h>
-/* no locking */
-static int erofs_read_inode(struct inode *inode, void *data)
+static void *erofs_read_inode(struct erofs_buf *buf,
+ struct inode *inode, unsigned int *ofs)
{
+ struct super_block *sb = inode->i_sb;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
struct erofs_inode *vi = EROFS_I(inode);
- struct erofs_inode_compact *dic = data;
- struct erofs_inode_extended *die;
+ const erofs_off_t inode_loc = iloc(sbi, vi->nid);
- const unsigned int ifmt = le16_to_cpu(dic->i_format);
- struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb);
- erofs_blk_t nblks = 0;
+ erofs_blk_t blkaddr, nblks = 0;
+ void *kaddr;
+ struct erofs_inode_compact *dic;
+ struct erofs_inode_extended *die, *copied = NULL;
+ unsigned int ifmt;
+ int err;
- vi->datalayout = erofs_inode_datalayout(ifmt);
+ blkaddr = erofs_blknr(inode_loc);
+ *ofs = erofs_blkoff(inode_loc);
+
+ erofs_dbg("%s, reading inode nid %llu at %u of blkaddr %u",
+ __func__, vi->nid, *ofs, blkaddr);
+ kaddr = erofs_read_metabuf(buf, sb, blkaddr, EROFS_KMAP);
+ if (IS_ERR(kaddr)) {
+ erofs_err(sb, "failed to get inode (nid: %llu) page, err %ld",
+ vi->nid, PTR_ERR(kaddr));
+ return kaddr;
+ }
+
+ dic = kaddr + *ofs;
+ ifmt = le16_to_cpu(dic->i_format);
+
+ if (ifmt & ~EROFS_I_ALL) {
+ erofs_err(inode->i_sb, "unsupported i_format %u of nid %llu",
+ ifmt, vi->nid);
+ err = -EOPNOTSUPP;
+ goto err_out;
+ }
+
+ vi->datalayout = erofs_inode_datalayout(ifmt);
if (vi->datalayout >= EROFS_INODE_DATALAYOUT_MAX) {
erofs_err(inode->i_sb, "unsupported datalayout %u of nid %llu",
vi->datalayout, vi->nid);
- DBG_BUGON(1);
- return -EOPNOTSUPP;
+ err = -EOPNOTSUPP;
+ goto err_out;
}
switch (erofs_inode_version(ifmt)) {
case EROFS_INODE_LAYOUT_EXTENDED:
- die = data;
-
vi->inode_isize = sizeof(struct erofs_inode_extended);
+ /* check if the extended inode crosses the block boundary */
+ if (*ofs + vi->inode_isize <= EROFS_BLKSIZ) {
+ *ofs += vi->inode_isize;
+ die = (struct erofs_inode_extended *)dic;
+ } else {
+ const unsigned int gotten = EROFS_BLKSIZ - *ofs;
+
+ copied = kmalloc(vi->inode_isize, GFP_NOFS);
+ if (!copied) {
+ err = -ENOMEM;
+ goto err_out;
+ }
+ memcpy(copied, dic, gotten);
+ kaddr = erofs_read_metabuf(buf, sb, blkaddr + 1,
+ EROFS_KMAP);
+ if (IS_ERR(kaddr)) {
+ erofs_err(sb, "failed to get inode payload block (nid: %llu), err %ld",
+ vi->nid, PTR_ERR(kaddr));
+ kfree(copied);
+ return kaddr;
+ }
+ *ofs = vi->inode_isize - gotten;
+ memcpy((u8 *)copied + gotten, kaddr, *ofs);
+ die = copied;
+ }
vi->xattr_isize = erofs_xattr_ibody_size(die->i_xattr_icount);
inode->i_mode = le16_to_cpu(die->i_mode);
@@ -58,20 +107,24 @@ static int erofs_read_inode(struct inode *inode, void *data)
i_gid_write(inode, le32_to_cpu(die->i_gid));
set_nlink(inode, le32_to_cpu(die->i_nlink));
- /* ns timestamp */
- inode->i_mtime.tv_sec = inode->i_ctime.tv_sec =
- le64_to_cpu(die->i_ctime);
- inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec =
- le32_to_cpu(die->i_ctime_nsec);
+ /* extended inode has its own timestamp */
+ inode->i_ctime.tv_sec = le64_to_cpu(die->i_mtime);
+ inode->i_ctime.tv_nsec = le32_to_cpu(die->i_mtime_nsec);
inode->i_size = le64_to_cpu(die->i_size);
/* total blocks for compressed files */
if (erofs_inode_is_data_compressed(vi->datalayout))
nblks = le32_to_cpu(die->i_u.compressed_blocks);
+ else if (vi->datalayout == EROFS_INODE_CHUNK_BASED)
+ /* fill chunked inode summary info */
+ vi->chunkformat = le16_to_cpu(die->i_u.c.format);
+ kfree(copied);
+ copied = NULL;
break;
case EROFS_INODE_LAYOUT_COMPACT:
vi->inode_isize = sizeof(struct erofs_inode_compact);
+ *ofs += vi->inode_isize;
vi->xattr_isize = erofs_xattr_ibody_size(dic->i_xattr_icount);
inode->i_mode = le16_to_cpu(dic->i_mode);
@@ -97,39 +150,63 @@ static int erofs_read_inode(struct inode *inode, void *data)
i_gid_write(inode, le16_to_cpu(dic->i_gid));
set_nlink(inode, le16_to_cpu(dic->i_nlink));
- /* use build time to derive all file time */
- inode->i_mtime.tv_sec = inode->i_ctime.tv_sec =
- sbi->build_time;
- inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec =
- sbi->build_time_nsec;
+ /* use build time for compact inodes */
+ inode->i_ctime.tv_sec = sbi->build_time;
+ inode->i_ctime.tv_nsec = sbi->build_time_nsec;
inode->i_size = le32_to_cpu(dic->i_size);
if (erofs_inode_is_data_compressed(vi->datalayout))
nblks = le32_to_cpu(dic->i_u.compressed_blocks);
+ else if (vi->datalayout == EROFS_INODE_CHUNK_BASED)
+ vi->chunkformat = le16_to_cpu(dic->i_u.c.format);
break;
default:
erofs_err(inode->i_sb,
"unsupported on-disk inode version %u of nid %llu",
erofs_inode_version(ifmt), vi->nid);
- DBG_BUGON(1);
- return -EOPNOTSUPP;
+ err = -EOPNOTSUPP;
+ goto err_out;
}
+ if (vi->datalayout == EROFS_INODE_CHUNK_BASED) {
+ if (vi->chunkformat & ~EROFS_CHUNK_FORMAT_ALL) {
+ erofs_err(inode->i_sb,
+ "unsupported chunk format %x of nid %llu",
+ vi->chunkformat, vi->nid);
+ err = -EOPNOTSUPP;
+ goto err_out;
+ }
+ vi->chunkbits = LOG_BLOCK_SIZE +
+ (vi->chunkformat & EROFS_CHUNK_FORMAT_BLKBITS_MASK);
+ }
+ inode->i_mtime.tv_sec = inode->i_ctime.tv_sec;
+ inode->i_atime.tv_sec = inode->i_ctime.tv_sec;
+ inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec;
+ inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec;
+
+ inode->i_flags &= ~S_DAX;
+ if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) &&
+ vi->datalayout == EROFS_INODE_FLAT_PLAIN)
+ inode->i_flags |= S_DAX;
if (!nblks)
/* measure inode.i_blocks as generic filesystems */
inode->i_blocks = roundup(inode->i_size, EROFS_BLKSIZ) >> 9;
else
inode->i_blocks = nblks << LOG_SECTORS_PER_BLOCK;
- return 0;
+ return kaddr;
bogusimode:
erofs_err(inode->i_sb, "bogus i_mode (%o) @ nid %llu",
inode->i_mode, vi->nid);
+ err = -EFSCORRUPTED;
+err_out:
DBG_BUGON(1);
- return -EFSCORRUPTED;
+ kfree(copied);
+ erofs_put_metabuf(buf);
+ return ERR_PTR(err);
}
-static int erofs_fill_symlink(struct inode *inode, void *data,
+static int erofs_fill_symlink(struct inode *inode, void *kaddr,
unsigned int m_pofs)
{
struct erofs_inode *vi = EROFS_I(inode);
@@ -137,7 +214,7 @@ static int erofs_fill_symlink(struct inode *inode, void *data,
/* if it cannot be handled with fast symlink scheme */
if (vi->datalayout != EROFS_INODE_FLAT_INLINE ||
- inode->i_size >= PAGE_SIZE) {
+ inode->i_size >= EROFS_BLKSIZ || inode->i_size < 0) {
inode->i_op = &erofs_symlink_iops;
return 0;
}
@@ -146,9 +223,9 @@ static int erofs_fill_symlink(struct inode *inode, void *data,
if (!lnk)
return -ENOMEM;
- m_pofs += vi->inode_isize + vi->xattr_isize;
- /* inline symlink data shouldn't cross page boundary as well */
- if (m_pofs + inode->i_size > PAGE_SIZE) {
+ m_pofs += vi->xattr_isize;
+ /* inline symlink data shouldn't cross block boundary */
+ if (m_pofs + inode->i_size > EROFS_BLKSIZ) {
kfree(lnk);
erofs_err(inode->i_sb,
"inline data cross block boundary @ nid %llu",
@@ -156,8 +233,7 @@ static int erofs_fill_symlink(struct inode *inode, void *data,
DBG_BUGON(1);
return -EFSCORRUPTED;
}
-
- memcpy(lnk, data + m_pofs, inode->i_size);
+ memcpy(lnk, kaddr + m_pofs, inode->i_size);
lnk[inode->i_size] = '\0';
inode->i_link = lnk;
@@ -165,52 +241,36 @@ static int erofs_fill_symlink(struct inode *inode, void *data,
return 0;
}
-static int erofs_fill_inode(struct inode *inode, int isdir)
+static int erofs_fill_inode(struct inode *inode)
{
- struct super_block *sb = inode->i_sb;
struct erofs_inode *vi = EROFS_I(inode);
- struct page *page;
- void *data;
- int err;
- erofs_blk_t blkaddr;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ void *kaddr;
unsigned int ofs;
- erofs_off_t inode_loc;
-
- trace_erofs_fill_inode(inode, isdir);
- inode_loc = iloc(EROFS_SB(sb), vi->nid);
- blkaddr = erofs_blknr(inode_loc);
- ofs = erofs_blkoff(inode_loc);
-
- erofs_dbg("%s, reading inode nid %llu at %u of blkaddr %u",
- __func__, vi->nid, ofs, blkaddr);
+ int err = 0;
- page = erofs_get_meta_page(sb, blkaddr);
+ trace_erofs_fill_inode(inode);
- if (IS_ERR(page)) {
- erofs_err(sb, "failed to get inode (nid: %llu) page, err %ld",
- vi->nid, PTR_ERR(page));
- return PTR_ERR(page);
- }
-
- DBG_BUGON(!PageUptodate(page));
- data = page_address(page);
-
- err = erofs_read_inode(inode, data + ofs);
- if (err)
- goto out_unlock;
+ /* read inode base data from disk */
+ kaddr = erofs_read_inode(&buf, inode, &ofs);
+ if (IS_ERR(kaddr))
+ return PTR_ERR(kaddr);
/* setup the new inode */
switch (inode->i_mode & S_IFMT) {
case S_IFREG:
inode->i_op = &erofs_generic_iops;
- inode->i_fop = &generic_ro_fops;
+ if (erofs_inode_is_data_compressed(vi->datalayout))
+ inode->i_fop = &generic_ro_fops;
+ else
+ inode->i_fop = &erofs_file_fops;
break;
case S_IFDIR:
inode->i_op = &erofs_dir_iops;
inode->i_fop = &erofs_dir_fops;
break;
case S_IFLNK:
- err = erofs_fill_symlink(inode, data, ofs);
+ err = erofs_fill_symlink(inode, kaddr, ofs);
if (err)
goto out_unlock;
inode_nohighmem(inode);
@@ -228,14 +288,20 @@ static int erofs_fill_inode(struct inode *inode, int isdir)
}
if (erofs_inode_is_data_compressed(vi->datalayout)) {
- err = z_erofs_fill_inode(inode);
+ if (!erofs_is_fscache_mode(inode->i_sb))
+ err = z_erofs_fill_inode(inode);
+ else
+ err = -EOPNOTSUPP;
goto out_unlock;
}
inode->i_mapping->a_ops = &erofs_raw_access_aops;
+#ifdef CONFIG_EROFS_FS_ONDEMAND
+ if (erofs_is_fscache_mode(inode->i_sb))
+ inode->i_mapping->a_ops = &erofs_fscache_access_aops;
+#endif
out_unlock:
- unlock_page(page);
- put_page(page);
+ erofs_put_metabuf(&buf);
return err;
}
@@ -258,21 +324,13 @@ static int erofs_iget_set_actor(struct inode *inode, void *opaque)
return 0;
}
-static inline struct inode *erofs_iget_locked(struct super_block *sb,
- erofs_nid_t nid)
+struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid)
{
const unsigned long hashval = erofs_inode_hash(nid);
+ struct inode *inode;
- return iget5_locked(sb, hashval, erofs_ilookup_test_actor,
+ inode = iget5_locked(sb, hashval, erofs_ilookup_test_actor,
erofs_iget_set_actor, &nid);
-}
-
-struct inode *erofs_iget(struct super_block *sb,
- erofs_nid_t nid,
- bool isdir)
-{
- struct inode *inode = erofs_iget_locked(sb, nid);
-
if (!inode)
return ERR_PTR(-ENOMEM);
@@ -282,10 +340,10 @@ struct inode *erofs_iget(struct super_block *sb,
vi->nid = nid;
- err = erofs_fill_inode(inode, isdir);
- if (!err)
+ err = erofs_fill_inode(inode);
+ if (!err) {
unlock_new_inode(inode);
- else {
+ } else {
iget_failed(inode);
inode = ERR_PTR(err);
}
@@ -293,8 +351,9 @@ struct inode *erofs_iget(struct super_block *sb,
return inode;
}
-int erofs_getattr(const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned int query_flags)
+int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path,
+ struct kstat *stat, u32 request_mask,
+ unsigned int query_flags)
{
struct inode *const inode = d_inode(path->dentry);
@@ -305,33 +364,27 @@ int erofs_getattr(const struct path *path, struct kstat *stat,
stat->attributes_mask |= (STATX_ATTR_COMPRESSED |
STATX_ATTR_IMMUTABLE);
- generic_fillattr(inode, stat);
+ generic_fillattr(mnt_userns, inode, stat);
return 0;
}
const struct inode_operations erofs_generic_iops = {
.getattr = erofs_getattr,
-#ifdef CONFIG_EROFS_FS_XATTR
.listxattr = erofs_listxattr,
-#endif
.get_acl = erofs_get_acl,
+ .fiemap = erofs_fiemap,
};
const struct inode_operations erofs_symlink_iops = {
.get_link = page_get_link,
.getattr = erofs_getattr,
-#ifdef CONFIG_EROFS_FS_XATTR
.listxattr = erofs_listxattr,
-#endif
.get_acl = erofs_get_acl,
};
const struct inode_operations erofs_fast_symlink_iops = {
.get_link = simple_get_link,
.getattr = erofs_getattr,
-#ifdef CONFIG_EROFS_FS_XATTR
.listxattr = erofs_listxattr,
-#endif
.get_acl = erofs_get_acl,
};
-
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index c4c6dcdc89ad..1701df48c446 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -1,8 +1,8 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2017-2018 HUAWEI, Inc.
- * http://www.huawei.com/
- * Created by Gao Xiang <gaoxiang25@huawei.com>
+ * https://www.huawei.com/
+ * Copyright (C) 2021, Alibaba Cloud
*/
#ifndef __EROFS_INTERNAL_H
#define __EROFS_INTERNAL_H
@@ -16,6 +16,7 @@
#include <linux/magic.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
+#include <linux/iomap.h>
#include "erofs_fs.h"
/* redefine pr_fmt "erofs: " */
@@ -46,35 +47,108 @@ typedef u64 erofs_off_t;
/* data type for filesystem-wide blocks number */
typedef u32 erofs_blk_t;
+struct erofs_device_info {
+ char *path;
+ struct erofs_fscache *fscache;
+ struct block_device *bdev;
+ struct dax_device *dax_dev;
+ u64 dax_part_off;
+
+ u32 blocks;
+ u32 mapped_blkaddr;
+};
+
+enum {
+ EROFS_SYNC_DECOMPRESS_AUTO,
+ EROFS_SYNC_DECOMPRESS_FORCE_ON,
+ EROFS_SYNC_DECOMPRESS_FORCE_OFF
+};
+
+struct erofs_mount_opts {
+#ifdef CONFIG_EROFS_FS_ZIP
+ /* current strategy of how to use managed cache */
+ unsigned char cache_strategy;
+ /* strategy of sync decompression (0 - auto, 1 - force on, 2 - force off) */
+ unsigned int sync_decompress;
+
+ /* threshold for decompression synchronously */
+ unsigned int max_sync_decompress_pages;
+#endif
+ unsigned int mount_opt;
+ char *fsid;
+ char *domain_id;
+};
+
+struct erofs_dev_context {
+ struct idr tree;
+ struct rw_semaphore rwsem;
+
+ unsigned int extra_devices;
+};
+
+struct erofs_fs_context {
+ struct erofs_mount_opts opt;
+ struct erofs_dev_context *devs;
+};
+
+/* all filesystem-wide lz4 configurations */
+struct erofs_sb_lz4_info {
+ /* # of pages needed for EROFS lz4 rolling decompression */
+ u16 max_distance_pages;
+ /* maximum possible blocks for pclusters in the filesystem */
+ u16 max_pclusterblks;
+};
+
+struct erofs_domain {
+ refcount_t ref;
+ struct list_head list;
+ struct fscache_volume *volume;
+ char *domain_id;
+};
+
+struct erofs_fscache {
+ struct fscache_cookie *cookie;
+ struct inode *inode;
+ struct inode *anon_inode;
+ struct erofs_domain *domain;
+ char *name;
+};
+
struct erofs_sb_info {
+ struct erofs_mount_opts opt; /* options */
#ifdef CONFIG_EROFS_FS_ZIP
/* list for all registered superblocks, mainly for shrinker */
struct list_head list;
struct mutex umount_mutex;
- /* the dedicated workstation for compression */
- struct radix_tree_root workstn_tree;
-
- /* threshold for decompression synchronously */
- unsigned int max_sync_decompress_pages;
+ /* managed XArray arranged in physical block number */
+ struct xarray managed_pslots;
unsigned int shrinker_run_no;
-
- /* current strategy of how to use managed cache */
- unsigned char cache_strategy;
+ u16 available_compr_algs;
/* pseudo inode to manage cached pages */
struct inode *managed_cache;
+
+ struct erofs_sb_lz4_info lz4;
+ struct inode *packed_inode;
#endif /* CONFIG_EROFS_FS_ZIP */
- u32 blocks;
+ struct erofs_dev_context *devs;
+ struct dax_device *dax_dev;
+ u64 dax_part_off;
+ u64 total_blocks;
+ u32 primarydevice_blocks;
+
u32 meta_blkaddr;
#ifdef CONFIG_EROFS_FS_XATTR
u32 xattr_blkaddr;
#endif
+ u16 device_id_mask; /* valid bits of device id to be used */
/* inode slot unit size in bit shift */
unsigned char islotbits;
+ u32 sb_size; /* total superblock size */
u32 build_time_nsec;
u64 build_time;
@@ -88,7 +162,14 @@ struct erofs_sb_info {
u32 feature_compat;
u32 feature_incompat;
- unsigned int mount_opt;
+ /* sysfs support */
+ struct kobject s_kobj; /* /sys/fs/erofs/<devname> */
+ struct completion s_kobj_unregister;
+
+ /* fscache support */
+ struct fscache_volume *volume;
+ struct erofs_fscache *s_fscache;
+ struct erofs_domain *domain;
};
#define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info)
@@ -97,12 +178,18 @@ struct erofs_sb_info {
/* Mount flags set via mount options or defaults */
#define EROFS_MOUNT_XATTR_USER 0x00000010
#define EROFS_MOUNT_POSIX_ACL 0x00000020
+#define EROFS_MOUNT_DAX_ALWAYS 0x00000040
+#define EROFS_MOUNT_DAX_NEVER 0x00000080
-#define clear_opt(sbi, option) ((sbi)->mount_opt &= ~EROFS_MOUNT_##option)
-#define set_opt(sbi, option) ((sbi)->mount_opt |= EROFS_MOUNT_##option)
-#define test_opt(sbi, option) ((sbi)->mount_opt & EROFS_MOUNT_##option)
+#define clear_opt(opt, option) ((opt)->mount_opt &= ~EROFS_MOUNT_##option)
+#define set_opt(opt, option) ((opt)->mount_opt |= EROFS_MOUNT_##option)
+#define test_opt(opt, option) ((opt)->mount_opt & EROFS_MOUNT_##option)
+
+static inline bool erofs_is_fscache_mode(struct super_block *sb)
+{
+ return IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && !sb->s_bdev;
+}
-#ifdef CONFIG_EROFS_FS_ZIP
enum {
EROFS_ZIP_CACHE_DISABLED,
EROFS_ZIP_CACHE_READAHEAD,
@@ -120,7 +207,6 @@ struct erofs_workgroup {
atomic_t refcount;
};
-#if defined(CONFIG_SMP)
static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp,
int val)
{
@@ -149,41 +235,6 @@ static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
return atomic_cond_read_relaxed(&grp->refcount,
VAL != EROFS_LOCKED_MAGIC);
}
-#else
-static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp,
- int val)
-{
- preempt_disable();
- /* no need to spin on UP platforms, let's just disable preemption. */
- if (val != atomic_read(&grp->refcount)) {
- preempt_enable();
- return false;
- }
- return true;
-}
-
-static inline void erofs_workgroup_unfreeze(struct erofs_workgroup *grp,
- int orig_val)
-{
- preempt_enable();
-}
-
-static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
-{
- int v = atomic_read(&grp->refcount);
-
- /* workgroup is never freezed on uniprocessor systems */
- DBG_BUGON(v == EROFS_LOCKED_MAGIC);
- return v;
-}
-#endif /* !CONFIG_SMP */
-
-/* hard limit of pages per compressed cluster */
-#define Z_EROFS_CLUSTER_MAX_PAGES (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT)
-#define EROFS_PCPUBUF_NR_PAGES Z_EROFS_CLUSTER_MAX_PAGES
-#else
-#define EROFS_PCPUBUF_NR_PAGES 0
-#endif /* !CONFIG_EROFS_FS_ZIP */
/* we strictly follow PAGE_SIZE and no buffer head yet */
#define LOG_BLOCK_SIZE PAGE_SHIFT
@@ -200,6 +251,19 @@ static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
#error erofs cannot be used in this platform
#endif
+enum erofs_kmap_type {
+ EROFS_NO_KMAP, /* don't map the buffer */
+ EROFS_KMAP, /* use kmap() to map the buffer */
+ EROFS_KMAP_ATOMIC, /* use kmap_atomic() to map the buffer */
+};
+
+struct erofs_buf {
+ struct page *page;
+ void *base;
+ enum erofs_kmap_type kmap_type;
+};
+#define __EROFS_BUF_INITIALIZER ((struct erofs_buf){ .page = NULL })
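
struct erofs_buf and these kmap types back the metabuf helpers declared further down in this header (erofs_read_metabuf()/erofs_put_metabuf()), which this series uses in place of the old erofs_get_meta_page(). A minimal sketch of the call pattern, mirroring the converted callers such as erofs_read_inode() in the inode.c hunk above; the function name here is made up.

/* illustrative only, not part of the patch */
static int example_read_meta_block(struct super_block *sb, erofs_blk_t blkaddr)
{
	struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
	void *kaddr;

	kaddr = erofs_read_metabuf(&buf, sb, blkaddr, EROFS_KMAP);
	if (IS_ERR(kaddr))
		return PTR_ERR(kaddr);

	/* ... parse on-disk metadata at kaddr ... */

	erofs_put_metabuf(&buf);	/* unmap and drop the cached page */
	return 0;
}
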
+
#define ROOT_NID(sb) ((sb)->root_nid)
#define erofs_blknr(addr) ((addr) / EROFS_BLKSIZ)
@@ -211,6 +275,23 @@ static inline erofs_off_t iloc(struct erofs_sb_info *sbi, erofs_nid_t nid)
return blknr_to_addr(sbi->meta_blkaddr) + (nid << sbi->islotbits);
}
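
As a worked example of iloc() above: with 4KiB blocks, islotbits is 5 (32-byte inode slots), so for meta_blkaddr = 1 and nid = 36 the inode sits at 1 * 4096 + 36 * 32 = 5248 bytes into the image. The concrete values are illustrative; only the formula comes from the code above.

#include <stdint.h>

/* iloc(): blknr_to_addr(meta_blkaddr) + (nid << islotbits) */
static inline uint64_t example_iloc(uint32_t meta_blkaddr, uint64_t nid,
				    unsigned int blkszbits /* 12 */,
				    unsigned int islotbits /* 5 */)
{
	return ((uint64_t)meta_blkaddr << blkszbits) + (nid << islotbits);
}
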
+#define EROFS_FEATURE_FUNCS(name, compat, feature) \
+static inline bool erofs_sb_has_##name(struct erofs_sb_info *sbi) \
+{ \
+ return sbi->feature_##compat & EROFS_FEATURE_##feature; \
+}
+
+EROFS_FEATURE_FUNCS(zero_padding, incompat, INCOMPAT_ZERO_PADDING)
+EROFS_FEATURE_FUNCS(compr_cfgs, incompat, INCOMPAT_COMPR_CFGS)
+EROFS_FEATURE_FUNCS(big_pcluster, incompat, INCOMPAT_BIG_PCLUSTER)
+EROFS_FEATURE_FUNCS(chunked_file, incompat, INCOMPAT_CHUNKED_FILE)
+EROFS_FEATURE_FUNCS(device_table, incompat, INCOMPAT_DEVICE_TABLE)
+EROFS_FEATURE_FUNCS(compr_head2, incompat, INCOMPAT_COMPR_HEAD2)
+EROFS_FEATURE_FUNCS(ztailpacking, incompat, INCOMPAT_ZTAILPACKING)
+EROFS_FEATURE_FUNCS(fragments, incompat, INCOMPAT_FRAGMENTS)
+EROFS_FEATURE_FUNCS(dedupe, incompat, INCOMPAT_DEDUPE)
+EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM)
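
Each EROFS_FEATURE_FUNCS() line above expands to a one-line predicate against the superblock feature words. The expansion below is exact, while the example caller only illustrates how such predicates are typically consumed; the fallback value is assumed, not taken from this hunk.

/* EROFS_FEATURE_FUNCS(big_pcluster, incompat, INCOMPAT_BIG_PCLUSTER) becomes: */
static inline bool erofs_sb_has_big_pcluster(struct erofs_sb_info *sbi)
{
	return sbi->feature_incompat & EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER;
}

/* illustrative caller: cap the pcluster size when the feature bit is absent */
static inline unsigned int example_max_pclusterblks(struct erofs_sb_info *sbi)
{
	return erofs_sb_has_big_pcluster(sbi) ? sbi->lz4.max_pclusterblks : 1;
}
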
+
/* atomic flag definitions */
#define EROFS_I_EA_INITED_BIT 0
#define EROFS_I_Z_INITED_BIT 1
@@ -234,12 +315,23 @@ struct erofs_inode {
union {
erofs_blk_t raw_blkaddr;
+ struct {
+ unsigned short chunkformat;
+ unsigned char chunkbits;
+ };
#ifdef CONFIG_EROFS_FS_ZIP
struct {
unsigned short z_advise;
unsigned char z_algorithmtype[2];
unsigned char z_logical_clusterbits;
- unsigned char z_physical_clusterbits[2];
+ unsigned long z_tailextent_headlcn;
+ union {
+ struct {
+ erofs_off_t z_idataoff;
+ unsigned short z_idata_size;
+ };
+ erofs_off_t z_fragmentoff;
+ };
};
#endif /* CONFIG_EROFS_FS_ZIP */
};
@@ -276,63 +368,77 @@ static inline unsigned int erofs_inode_datalayout(unsigned int value)
EROFS_I_DATALAYOUT_BITS);
}
+/*
+ * Unlike grab_cache_page_nowait(), memory reclaim is never triggered when
+ * allocating new pages.
+ */
+static inline
+struct page *erofs_grab_cache_page_nowait(struct address_space *mapping,
+ pgoff_t index)
+{
+ return pagecache_get_page(mapping, index,
+ FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT,
+ readahead_gfp_mask(mapping) & ~__GFP_RECLAIM);
+}
+
extern const struct super_operations erofs_sops;
+extern struct file_system_type erofs_fs_type;
extern const struct address_space_operations erofs_raw_access_aops;
extern const struct address_space_operations z_erofs_aops;
-/*
- * Logical to physical block mapping, used by erofs_map_blocks()
- *
- * Different with other file systems, it is used for 2 access modes:
- *
- * 1) RAW access mode:
- *
- * Users pass a valid (m_lblk, m_lofs -- usually 0) pair,
- * and get the valid m_pblk, m_pofs and the longest m_len(in bytes).
- *
- * Note that m_lblk in the RAW access mode refers to the number of
- * the compressed ondisk block rather than the uncompressed
- * in-memory block for the compressed file.
- *
- * m_pofs equals to m_lofs except for the inline data page.
- *
- * 2) Normal access mode:
- *
- * If the inode is not compressed, it has no difference with
- * the RAW access mode. However, if the inode is compressed,
- * users should pass a valid (m_lblk, m_lofs) pair, and get
- * the needed m_pblk, m_pofs, m_len to get the compressed data
- * and the updated m_lblk, m_lofs which indicates the start
- * of the corresponding uncompressed data in the file.
- */
enum {
- BH_Zipped = BH_PrivateStart,
+ BH_Encoded = BH_PrivateStart,
BH_FullMapped,
+ BH_Fragment,
+ BH_Partialref,
};
/* Has a disk mapping */
#define EROFS_MAP_MAPPED (1 << BH_Mapped)
/* Located in metadata (could be copied from bd_inode) */
#define EROFS_MAP_META (1 << BH_Meta)
-/* The extent has been compressed */
-#define EROFS_MAP_ZIPPED (1 << BH_Zipped)
+/* The extent is encoded */
+#define EROFS_MAP_ENCODED (1 << BH_Encoded)
/* The length of extent is full */
#define EROFS_MAP_FULL_MAPPED (1 << BH_FullMapped)
+/* Located in the special packed inode */
+#define EROFS_MAP_FRAGMENT (1 << BH_Fragment)
+/* The extent refers to partial decompressed data */
+#define EROFS_MAP_PARTIAL_REF (1 << BH_Partialref)
struct erofs_map_blocks {
+ struct erofs_buf buf;
+
erofs_off_t m_pa, m_la;
u64 m_plen, m_llen;
+ unsigned short m_deviceid;
+ char m_algorithmformat;
unsigned int m_flags;
-
- struct page *mpage;
};
-/* Flags used by erofs_map_blocks() */
+/* Flags used by erofs_map_blocks_flatmode() */
#define EROFS_GET_BLOCKS_RAW 0x0001
+/*
+ * Used to get the exact decompressed length, e.g. fiemap (consider lookback
+ * approach instead if possible since it's more metadata lightweight.)
+ */
+#define EROFS_GET_BLOCKS_FIEMAP 0x0002
+/* Used to map the whole extent if non-negligible data is requested for LZMA */
+#define EROFS_GET_BLOCKS_READMORE 0x0004
+/* Used to map tail extent for tailpacking inline or fragment pcluster */
+#define EROFS_GET_BLOCKS_FINDTAIL 0x0008
+
+enum {
+ Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX,
+ Z_EROFS_COMPRESSION_INTERLACED,
+ Z_EROFS_COMPRESSION_RUNTIME_MAX
+};
/* zmap.c */
+extern const struct iomap_ops z_erofs_iomap_report_ops;
+
#ifdef CONFIG_EROFS_FS_ZIP
int z_erofs_fill_inode(struct inode *inode);
int z_erofs_map_blocks_iter(struct inode *inode,
@@ -348,10 +454,29 @@ static inline int z_erofs_map_blocks_iter(struct inode *inode,
}
#endif /* !CONFIG_EROFS_FS_ZIP */
-/* data.c */
-struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr);
+struct erofs_map_dev {
+ struct erofs_fscache *m_fscache;
+ struct block_device *m_bdev;
+ struct dax_device *m_daxdev;
+ u64 m_dax_part_off;
+
+ erofs_off_t m_pa;
+ unsigned int m_deviceid;
+};
-int erofs_map_blocks(struct inode *, struct erofs_map_blocks *, int);
+/* data.c */
+extern const struct file_operations erofs_file_fops;
+void erofs_unmap_metabuf(struct erofs_buf *buf);
+void erofs_put_metabuf(struct erofs_buf *buf);
+void *erofs_bread(struct erofs_buf *buf, struct inode *inode,
+ erofs_blk_t blkaddr, enum erofs_kmap_type type);
+void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb,
+ erofs_blk_t blkaddr, enum erofs_kmap_type type);
+int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev);
+int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
+ u64 start, u64 len);
+int erofs_map_blocks(struct inode *inode,
+ struct erofs_map_blocks *map, int flags);
/* inode.c */
static inline unsigned long erofs_inode_hash(erofs_nid_t nid)
@@ -367,43 +492,64 @@ extern const struct inode_operations erofs_generic_iops;
extern const struct inode_operations erofs_symlink_iops;
extern const struct inode_operations erofs_fast_symlink_iops;
-struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid, bool dir);
-int erofs_getattr(const struct path *path, struct kstat *stat,
- u32 request_mask, unsigned int query_flags);
+struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid);
+int erofs_getattr(struct user_namespace *mnt_userns, const struct path *path,
+ struct kstat *stat, u32 request_mask,
+ unsigned int query_flags);
/* namei.c */
extern const struct inode_operations erofs_dir_iops;
-int erofs_namei(struct inode *dir, struct qstr *name,
+int erofs_namei(struct inode *dir, const struct qstr *name,
erofs_nid_t *nid, unsigned int *d_type);
/* dir.c */
extern const struct file_operations erofs_dir_fops;
-/* utils.c / zdata.c */
-struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp);
-
-#if (EROFS_PCPUBUF_NR_PAGES > 0)
-void *erofs_get_pcpubuf(unsigned int pagenr);
-#define erofs_put_pcpubuf(buf) do { \
- (void)&(buf); \
- preempt_enable(); \
-} while (0)
-#else
-static inline void *erofs_get_pcpubuf(unsigned int pagenr)
+static inline void *erofs_vm_map_ram(struct page **pages, unsigned int count)
{
- return ERR_PTR(-EOPNOTSUPP);
+ int retried = 0;
+
+ while (1) {
+ void *p = vm_map_ram(pages, count, -1);
+
+ /* retry two more times (totally 3 times) */
+ if (p || ++retried >= 3)
+ return p;
+ vm_unmap_aliases();
+ }
+ return NULL;
}
-#define erofs_put_pcpubuf(buf) do {} while (0)
-#endif
+/* pcpubuf.c */
+void *erofs_get_pcpubuf(unsigned int requiredpages);
+void erofs_put_pcpubuf(void *ptr);
+int erofs_pcpubuf_growsize(unsigned int nrpages);
+void erofs_pcpubuf_init(void);
+void erofs_pcpubuf_exit(void);
+
+/* sysfs.c */
+int erofs_register_sysfs(struct super_block *sb);
+void erofs_unregister_sysfs(struct super_block *sb);
+int __init erofs_init_sysfs(void);
+void erofs_exit_sysfs(void);
+
+/* utils.c / zdata.c */
+struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp);
+static inline void erofs_pagepool_add(struct page **pagepool,
+ struct page *page)
+{
+ set_page_private(page, (unsigned long)*pagepool);
+ *pagepool = page;
+}
+void erofs_release_pages(struct page **pagepool);
#ifdef CONFIG_EROFS_FS_ZIP
int erofs_workgroup_put(struct erofs_workgroup *grp);
struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
pgoff_t index);
-int erofs_register_workgroup(struct super_block *sb,
- struct erofs_workgroup *grp);
+struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
+ struct erofs_workgroup *grp);
void erofs_workgroup_free_rcu(struct erofs_workgroup *grp);
void erofs_shrinker_register(struct super_block *sb);
void erofs_shrinker_unregister(struct super_block *sb);
@@ -413,8 +559,10 @@ int __init z_erofs_init_zip_subsystem(void);
void z_erofs_exit_zip_subsystem(void);
int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
struct erofs_workgroup *egrp);
-int erofs_try_to_free_cached_page(struct address_space *mapping,
- struct page *page);
+int erofs_try_to_free_cached_page(struct page *page);
+int z_erofs_load_lz4_config(struct super_block *sb,
+ struct erofs_super_block *dsb,
+ struct z_erofs_lz4_cfgs *lz4, int len);
#else
static inline void erofs_shrinker_register(struct super_block *sb) {}
static inline void erofs_shrinker_unregister(struct super_block *sb) {}
@@ -422,9 +570,67 @@ static inline int erofs_init_shrinker(void) { return 0; }
static inline void erofs_exit_shrinker(void) {}
static inline int z_erofs_init_zip_subsystem(void) { return 0; }
static inline void z_erofs_exit_zip_subsystem(void) {}
+static inline int z_erofs_load_lz4_config(struct super_block *sb,
+ struct erofs_super_block *dsb,
+ struct z_erofs_lz4_cfgs *lz4, int len)
+{
+ if (lz4 || dsb->u1.lz4_max_distance) {
+ erofs_err(sb, "lz4 algorithm isn't enabled");
+ return -EINVAL;
+ }
+ return 0;
+}
+#endif /* !CONFIG_EROFS_FS_ZIP */
+
+#ifdef CONFIG_EROFS_FS_ZIP_LZMA
+int z_erofs_lzma_init(void);
+void z_erofs_lzma_exit(void);
+int z_erofs_load_lzma_config(struct super_block *sb,
+ struct erofs_super_block *dsb,
+ struct z_erofs_lzma_cfgs *lzma, int size);
+#else
+static inline int z_erofs_lzma_init(void) { return 0; }
+static inline void z_erofs_lzma_exit(void) {}
+static inline int z_erofs_load_lzma_config(struct super_block *sb,
+ struct erofs_super_block *dsb,
+ struct z_erofs_lzma_cfgs *lzma, int size)
+{
+ if (lzma) {
+ erofs_err(sb, "lzma algorithm isn't enabled");
+ return -EINVAL;
+ }
+ return 0;
+}
#endif /* !CONFIG_EROFS_FS_ZIP_LZMA */
+/* fscache.c */
+#ifdef CONFIG_EROFS_FS_ONDEMAND
+int erofs_fscache_register_fs(struct super_block *sb);
+void erofs_fscache_unregister_fs(struct super_block *sb);
+
+struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
+ char *name, bool need_inode);
+void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache);
+
+extern const struct address_space_operations erofs_fscache_access_aops;
+#else
+static inline int erofs_fscache_register_fs(struct super_block *sb)
+{
+ return -EOPNOTSUPP;
+}
+static inline void erofs_fscache_unregister_fs(struct super_block *sb) {}
+
+static inline
+struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb,
+ char *name, bool need_inode)
+{
+ return ERR_PTR(-EOPNOTSUPP);
+}
+
+static inline void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache)
+{
+}
+#endif
+
#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
#endif /* __EROFS_INTERNAL_H */
-
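The buffer helpers declared in internal.h above (erofs_bread, erofs_read_metabuf, erofs_put_metabuf) replace the old erofs_get_meta_page() interface: a caller owns a struct erofs_buf, maps as many metadata blocks through it as needed, and releases the mapping plus the page reference in a single call. A minimal sketch of that calling convention, mirroring the converted callers later in this diff (the function name is illustrative only):

static int example_parse_meta_block(struct super_block *sb, erofs_blk_t blkaddr)
{
	struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
	void *ptr;

	/* map one metadata block; buf remembers the backing page/mapping */
	ptr = erofs_read_metabuf(&buf, sb, blkaddr, EROFS_KMAP);
	if (IS_ERR(ptr))
		return PTR_ERR(ptr);

	/* ... parse on-disk structures inside this block via ptr ... */

	/* unmap and drop the page reference in one step */
	erofs_put_metabuf(&buf);
	return 0;
}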
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c
index 3abbecbf73de..0dc34721080c 100644
--- a/fs/erofs/namei.c
+++ b/fs/erofs/namei.c
@@ -1,8 +1,8 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2017-2018 HUAWEI, Inc.
- * http://www.huawei.com/
- * Created by Gao Xiang <gaoxiang25@huawei.com>
+ * https://www.huawei.com/
+ * Copyright (C) 2022, Alibaba Cloud
*/
#include "xattr.h"
@@ -87,14 +87,14 @@ static struct erofs_dirent *find_target_dirent(struct erofs_qstr *name,
return ERR_PTR(-ENOENT);
}
-static struct page *find_target_block_classic(struct inode *dir,
- struct erofs_qstr *name,
- int *_ndirents)
+static void *find_target_block_classic(struct erofs_buf *target,
+ struct inode *dir,
+ struct erofs_qstr *name,
+ int *_ndirents)
{
unsigned int startprfx, endprfx;
int head, back;
- struct address_space *const mapping = dir->i_mapping;
- struct page *candidate = ERR_PTR(-ENOENT);
+ void *candidate = ERR_PTR(-ENOENT);
startprfx = endprfx = 0;
head = 0;
@@ -102,10 +102,11 @@ static struct page *find_target_block_classic(struct inode *dir,
while (head <= back) {
const int mid = head + (back - head) / 2;
- struct page *page = read_mapping_page(mapping, mid, NULL);
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ struct erofs_dirent *de;
- if (!IS_ERR(page)) {
- struct erofs_dirent *de = kmap_atomic(page);
+ de = erofs_bread(&buf, dir, mid, EROFS_KMAP);
+ if (!IS_ERR(de)) {
const int nameoff = nameoff_from_disk(de->nameoff,
EROFS_BLKSIZ);
const int ndirents = nameoff / sizeof(*de);
@@ -114,13 +115,12 @@ static struct page *find_target_block_classic(struct inode *dir,
struct erofs_qstr dname;
if (!ndirents) {
- kunmap_atomic(de);
- put_page(page);
+ erofs_put_metabuf(&buf);
erofs_err(dir->i_sb,
"corrupted dir block %d @ nid %llu",
mid, EROFS_I(dir)->nid);
DBG_BUGON(1);
- page = ERR_PTR(-EFSCORRUPTED);
+ de = ERR_PTR(-EFSCORRUPTED);
goto out;
}
@@ -136,7 +136,6 @@ static struct page *find_target_block_classic(struct inode *dir,
/* string comparison without already matched prefix */
diff = erofs_dirnamecmp(name, &dname, &matched);
- kunmap_atomic(de);
if (!diff) {
*_ndirents = 0;
@@ -146,11 +145,12 @@ static struct page *find_target_block_classic(struct inode *dir,
startprfx = matched;
if (!IS_ERR(candidate))
- put_page(candidate);
- candidate = page;
+ erofs_put_metabuf(target);
+ *target = buf;
+ candidate = de;
*_ndirents = ndirents;
} else {
- put_page(page);
+ erofs_put_metabuf(&buf);
back = mid - 1;
endprfx = matched;
@@ -159,19 +159,17 @@ static struct page *find_target_block_classic(struct inode *dir,
}
out: /* free if the candidate is valid */
if (!IS_ERR(candidate))
- put_page(candidate);
- return page;
+ erofs_put_metabuf(target);
+ return de;
}
return candidate;
}
-int erofs_namei(struct inode *dir,
- struct qstr *name,
- erofs_nid_t *nid, unsigned int *d_type)
+int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid,
+ unsigned int *d_type)
{
int ndirents;
- struct page *page;
- void *data;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct erofs_dirent *de;
struct erofs_qstr qn;
@@ -182,32 +180,23 @@ int erofs_namei(struct inode *dir,
qn.end = name->name + name->len;
ndirents = 0;
- page = find_target_block_classic(dir, &qn, &ndirents);
- if (IS_ERR(page))
- return PTR_ERR(page);
+ de = find_target_block_classic(&buf, dir, &qn, &ndirents);
+ if (IS_ERR(de))
+ return PTR_ERR(de);
- data = kmap_atomic(page);
- /* the target page has been mapped */
if (ndirents)
- de = find_target_dirent(&qn, data, EROFS_BLKSIZ, ndirents);
- else
- de = (struct erofs_dirent *)data;
+ de = find_target_dirent(&qn, (u8 *)de, EROFS_BLKSIZ, ndirents);
if (!IS_ERR(de)) {
*nid = le64_to_cpu(de->nid);
*d_type = de->file_type;
}
-
- kunmap_atomic(data);
- put_page(page);
-
+ erofs_put_metabuf(&buf);
return PTR_ERR_OR_ZERO(de);
}
-/* NOTE: i_mutex is already held by vfs */
-static struct dentry *erofs_lookup(struct inode *dir,
- struct dentry *dentry,
+static struct dentry *erofs_lookup(struct inode *dir, struct dentry *dentry,
unsigned int flags)
{
int err;
@@ -215,17 +204,11 @@ static struct dentry *erofs_lookup(struct inode *dir,
unsigned int d_type;
struct inode *inode;
- DBG_BUGON(!d_really_is_negative(dentry));
- /* dentry must be unhashed in lookup, no need to worry about */
- DBG_BUGON(!d_unhashed(dentry));
-
trace_erofs_lookup(dir, dentry, flags);
- /* file name exceeds fs limit */
if (dentry->d_name.len > EROFS_NAME_LEN)
return ERR_PTR(-ENAMETOOLONG);
- /* false uninitialized warnings on gcc 4.8.x */
err = erofs_namei(dir, &dentry->d_name, &nid, &d_type);
if (err == -ENOENT) {
@@ -234,9 +217,9 @@ static struct dentry *erofs_lookup(struct inode *dir,
} else if (err) {
inode = ERR_PTR(err);
} else {
- erofs_dbg("%s, %s (nid %llu) found, d_type %u", __func__,
- dentry->d_name.name, nid, d_type);
- inode = erofs_iget(dir->i_sb, nid, d_type == FT_DIR);
+ erofs_dbg("%s, %pd (nid %llu) found, d_type %u", __func__,
+ dentry, nid, d_type);
+ inode = erofs_iget(dir->i_sb, nid);
}
return d_splice_alias(inode, dentry);
}
@@ -244,9 +227,7 @@ static struct dentry *erofs_lookup(struct inode *dir,
const struct inode_operations erofs_dir_iops = {
.lookup = erofs_lookup,
.getattr = erofs_getattr,
-#ifdef CONFIG_EROFS_FS_XATTR
.listxattr = erofs_listxattr,
-#endif
.get_acl = erofs_get_acl,
+ .fiemap = erofs_fiemap,
};
-
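The lookup above binary-searches the directory blocks and, within the chosen block, derives the dirent count from the first name offset, since names are packed right after the dirent array. A small sketch of that invariant, as used by find_target_block_classic() (the helper name and the extra bounds check are illustrative, not part of the patch):

static int example_count_dirents(struct erofs_dirent *de)
{
	/* the offset of the first name doubles as the dirent count */
	unsigned int nameoff = le16_to_cpu(de->nameoff);

	if (nameoff < sizeof(*de) || nameoff >= EROFS_BLKSIZ)
		return -EFSCORRUPTED;	/* malformed directory block */
	return nameoff / sizeof(*de);
}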
diff --git a/fs/erofs/pcpubuf.c b/fs/erofs/pcpubuf.c
new file mode 100644
index 000000000000..a2efd833d1b6
--- /dev/null
+++ b/fs/erofs/pcpubuf.c
@@ -0,0 +1,148 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) Gao Xiang <xiang@kernel.org>
+ *
+ * For low-latency decompression algorithms (e.g. lz4), reserve consecutive
+ * per-CPU virtual memory (in pages) in advance to store such inplace I/O
+ * data if inplace decompression fails (e.g. due to an unmet inplace
+ * margin).
+ */
+#include "internal.h"
+
+struct erofs_pcpubuf {
+ raw_spinlock_t lock;
+ void *ptr;
+ struct page **pages;
+ unsigned int nrpages;
+};
+
+static DEFINE_PER_CPU(struct erofs_pcpubuf, erofs_pcb);
+
+void *erofs_get_pcpubuf(unsigned int requiredpages)
+ __acquires(pcb->lock)
+{
+ struct erofs_pcpubuf *pcb = &get_cpu_var(erofs_pcb);
+
+ raw_spin_lock(&pcb->lock);
+ /* check if the per-CPU buffer is too small */
+ if (requiredpages > pcb->nrpages) {
+ raw_spin_unlock(&pcb->lock);
+ put_cpu_var(erofs_pcb);
+ /* (for sparse checker) pretend pcb->lock is still taken */
+ __acquire(pcb->lock);
+ return NULL;
+ }
+ return pcb->ptr;
+}
+
+void erofs_put_pcpubuf(void *ptr) __releases(pcb->lock)
+{
+ struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, smp_processor_id());
+
+ DBG_BUGON(pcb->ptr != ptr);
+ raw_spin_unlock(&pcb->lock);
+ put_cpu_var(erofs_pcb);
+}
+
+/* the next step: support per-CPU page buffers hotplug */
+int erofs_pcpubuf_growsize(unsigned int nrpages)
+{
+ static DEFINE_MUTEX(pcb_resize_mutex);
+ static unsigned int pcb_nrpages;
+ struct page *pagepool = NULL;
+ int delta, cpu, ret, i;
+
+ mutex_lock(&pcb_resize_mutex);
+ delta = nrpages - pcb_nrpages;
+ ret = 0;
+ /* avoid shrinking pcpubuf, since it's unclear how many fses rely on it */
+ if (delta <= 0)
+ goto out;
+
+ for_each_possible_cpu(cpu) {
+ struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
+ struct page **pages, **oldpages;
+ void *ptr, *old_ptr;
+
+ pages = kmalloc_array(nrpages, sizeof(*pages), GFP_KERNEL);
+ if (!pages) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ for (i = 0; i < nrpages; ++i) {
+ pages[i] = erofs_allocpage(&pagepool, GFP_KERNEL);
+ if (!pages[i]) {
+ ret = -ENOMEM;
+ oldpages = pages;
+ goto free_pagearray;
+ }
+ }
+ ptr = vmap(pages, nrpages, VM_MAP, PAGE_KERNEL);
+ if (!ptr) {
+ ret = -ENOMEM;
+ oldpages = pages;
+ goto free_pagearray;
+ }
+ raw_spin_lock(&pcb->lock);
+ old_ptr = pcb->ptr;
+ pcb->ptr = ptr;
+ oldpages = pcb->pages;
+ pcb->pages = pages;
+ i = pcb->nrpages;
+ pcb->nrpages = nrpages;
+ raw_spin_unlock(&pcb->lock);
+
+ if (!oldpages) {
+ DBG_BUGON(old_ptr);
+ continue;
+ }
+
+ if (old_ptr)
+ vunmap(old_ptr);
+free_pagearray:
+ while (i)
+ erofs_pagepool_add(&pagepool, oldpages[--i]);
+ kfree(oldpages);
+ if (ret)
+ break;
+ }
+ pcb_nrpages = nrpages;
+ erofs_release_pages(&pagepool);
+out:
+ mutex_unlock(&pcb_resize_mutex);
+ return ret;
+}
+
+void erofs_pcpubuf_init(void)
+{
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
+
+ raw_spin_lock_init(&pcb->lock);
+ }
+}
+
+void erofs_pcpubuf_exit(void)
+{
+ int cpu, i;
+
+ for_each_possible_cpu(cpu) {
+ struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu);
+
+ if (pcb->ptr) {
+ vunmap(pcb->ptr);
+ pcb->ptr = NULL;
+ }
+ if (!pcb->pages)
+ continue;
+
+ for (i = 0; i < pcb->nrpages; ++i)
+ if (pcb->pages[i])
+ put_page(pcb->pages[i]);
+ kfree(pcb->pages);
+ pcb->pages = NULL;
+ }
+}
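The real users of this buffer sit in the decompression path, outside this diff; the contract implied by the annotations above is that erofs_pcpubuf_growsize() runs early (e.g. while parsing compression configs) so the per-CPU area is large enough, and each user then brackets its access with get/put while staying on the same CPU. A hedged usage sketch (function name and error code are illustrative):

static int example_use_scratch(unsigned int nrpages)
{
	/* takes pcb->lock and pins the current CPU on success */
	void *scratch = erofs_get_pcpubuf(nrpages);

	if (!scratch)	/* buffer too small: the caller has to fall back */
		return -ENOMEM;

	/* ... use up to nrpages * PAGE_SIZE bytes of scratch here ... */

	/* drops pcb->lock and unpins the CPU */
	erofs_put_pcpubuf(scratch);
	return 0;
}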
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
index 057e6d7b5b7f..2cf96ce1c32e 100644
--- a/fs/erofs/super.c
+++ b/fs/erofs/super.c
@@ -1,8 +1,8 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2017-2018 HUAWEI, Inc.
- * http://www.huawei.com/
- * Created by Gao Xiang <gaoxiang25@huawei.com>
+ * https://www.huawei.com/
+ * Copyright (C) 2021, Alibaba Cloud
*/
#include <linux/module.h>
#include <linux/buffer_head.h>
@@ -10,6 +10,10 @@
#include <linux/parser.h>
#include <linux/seq_file.h>
#include <linux/crc32c.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
+#include <linux/dax.h>
+#include <linux/exportfs.h>
#include "xattr.h"
#define CREATE_TRACE_POINTS
@@ -81,7 +85,7 @@ static void erofs_inode_init_once(void *ptr)
static struct inode *erofs_alloc_inode(struct super_block *sb)
{
struct erofs_inode *vi =
- kmem_cache_alloc(erofs_inode_cachep, GFP_KERNEL);
+ alloc_inode_sb(sb, erofs_inode_cachep, GFP_KERNEL);
if (!vi)
return NULL;
@@ -120,24 +124,223 @@ static bool check_layout_compatibility(struct super_block *sb,
return true;
}
+#ifdef CONFIG_EROFS_FS_ZIP
+/* read variable-sized metadata; the offset will be aligned to 4 bytes */
+static void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf,
+ erofs_off_t *offset, int *lengthp)
+{
+ u8 *buffer, *ptr;
+ int len, i, cnt;
+
+ *offset = round_up(*offset, 4);
+ ptr = erofs_read_metabuf(buf, sb, erofs_blknr(*offset), EROFS_KMAP);
+ if (IS_ERR(ptr))
+ return ptr;
+
+ len = le16_to_cpu(*(__le16 *)&ptr[erofs_blkoff(*offset)]);
+ if (!len)
+ len = U16_MAX + 1;
+ buffer = kmalloc(len, GFP_KERNEL);
+ if (!buffer)
+ return ERR_PTR(-ENOMEM);
+ *offset += sizeof(__le16);
+ *lengthp = len;
+
+ for (i = 0; i < len; i += cnt) {
+ cnt = min(EROFS_BLKSIZ - (int)erofs_blkoff(*offset), len - i);
+ ptr = erofs_read_metabuf(buf, sb, erofs_blknr(*offset),
+ EROFS_KMAP);
+ if (IS_ERR(ptr)) {
+ kfree(buffer);
+ return ptr;
+ }
+ memcpy(buffer + i, ptr + erofs_blkoff(*offset), cnt);
+ *offset += cnt;
+ }
+ return buffer;
+}
+
+static int erofs_load_compr_cfgs(struct super_block *sb,
+ struct erofs_super_block *dsb)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ unsigned int algs, alg;
+ erofs_off_t offset;
+ int size, ret = 0;
+
+ sbi->available_compr_algs = le16_to_cpu(dsb->u1.available_compr_algs);
+ if (sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS) {
+ erofs_err(sb, "try to load compressed fs with unsupported algorithms %x",
+ sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS);
+ return -EINVAL;
+ }
+
+ offset = EROFS_SUPER_OFFSET + sbi->sb_size;
+ alg = 0;
+ for (algs = sbi->available_compr_algs; algs; algs >>= 1, ++alg) {
+ void *data;
+
+ if (!(algs & 1))
+ continue;
+
+ data = erofs_read_metadata(sb, &buf, &offset, &size);
+ if (IS_ERR(data)) {
+ ret = PTR_ERR(data);
+ break;
+ }
+
+ switch (alg) {
+ case Z_EROFS_COMPRESSION_LZ4:
+ ret = z_erofs_load_lz4_config(sb, dsb, data, size);
+ break;
+ case Z_EROFS_COMPRESSION_LZMA:
+ ret = z_erofs_load_lzma_config(sb, dsb, data, size);
+ break;
+ default:
+ DBG_BUGON(1);
+ ret = -EFAULT;
+ }
+ kfree(data);
+ if (ret)
+ break;
+ }
+ erofs_put_metabuf(&buf);
+ return ret;
+}
+#else
+static int erofs_load_compr_cfgs(struct super_block *sb,
+ struct erofs_super_block *dsb)
+{
+ if (dsb->u1.available_compr_algs) {
+ erofs_err(sb, "try to load compressed fs when compression is disabled");
+ return -EINVAL;
+ }
+ return 0;
+}
+#endif
+
+static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb,
+ struct erofs_device_info *dif, erofs_off_t *pos)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ struct erofs_fscache *fscache;
+ struct erofs_deviceslot *dis;
+ struct block_device *bdev;
+ void *ptr;
+
+ ptr = erofs_read_metabuf(buf, sb, erofs_blknr(*pos), EROFS_KMAP);
+ if (IS_ERR(ptr))
+ return PTR_ERR(ptr);
+ dis = ptr + erofs_blkoff(*pos);
+
+ if (!dif->path) {
+ if (!dis->tag[0]) {
+ erofs_err(sb, "empty device tag @ pos %llu", *pos);
+ return -EINVAL;
+ }
+ dif->path = kmemdup_nul(dis->tag, sizeof(dis->tag), GFP_KERNEL);
+ if (!dif->path)
+ return -ENOMEM;
+ }
+
+ if (erofs_is_fscache_mode(sb)) {
+ fscache = erofs_fscache_register_cookie(sb, dif->path, false);
+ if (IS_ERR(fscache))
+ return PTR_ERR(fscache);
+ dif->fscache = fscache;
+ } else {
+ bdev = blkdev_get_by_path(dif->path, FMODE_READ | FMODE_EXCL,
+ sb->s_type);
+ if (IS_ERR(bdev))
+ return PTR_ERR(bdev);
+ dif->bdev = bdev;
+ dif->dax_dev = fs_dax_get_by_bdev(bdev, &dif->dax_part_off,
+ NULL, NULL);
+ }
+
+ dif->blocks = le32_to_cpu(dis->blocks);
+ dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr);
+ sbi->total_blocks += dif->blocks;
+ *pos += EROFS_DEVT_SLOT_SIZE;
+ return 0;
+}
+
+static int erofs_scan_devices(struct super_block *sb,
+ struct erofs_super_block *dsb)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ unsigned int ondisk_extradevs;
+ erofs_off_t pos;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ struct erofs_device_info *dif;
+ int id, err = 0;
+
+ sbi->total_blocks = sbi->primarydevice_blocks;
+ if (!erofs_sb_has_device_table(sbi))
+ ondisk_extradevs = 0;
+ else
+ ondisk_extradevs = le16_to_cpu(dsb->extra_devices);
+
+ if (sbi->devs->extra_devices &&
+ ondisk_extradevs != sbi->devs->extra_devices) {
+ erofs_err(sb, "extra devices don't match (ondisk %u, given %u)",
+ ondisk_extradevs, sbi->devs->extra_devices);
+ return -EINVAL;
+ }
+ if (!ondisk_extradevs)
+ return 0;
+
+ sbi->device_id_mask = roundup_pow_of_two(ondisk_extradevs + 1) - 1;
+ pos = le16_to_cpu(dsb->devt_slotoff) * EROFS_DEVT_SLOT_SIZE;
+ down_read(&sbi->devs->rwsem);
+ if (sbi->devs->extra_devices) {
+ idr_for_each_entry(&sbi->devs->tree, dif, id) {
+ err = erofs_init_device(&buf, sb, dif, &pos);
+ if (err)
+ break;
+ }
+ } else {
+ for (id = 0; id < ondisk_extradevs; id++) {
+ dif = kzalloc(sizeof(*dif), GFP_KERNEL);
+ if (!dif) {
+ err = -ENOMEM;
+ break;
+ }
+
+ err = idr_alloc(&sbi->devs->tree, dif, 0, 0, GFP_KERNEL);
+ if (err < 0) {
+ kfree(dif);
+ break;
+ }
+ ++sbi->devs->extra_devices;
+
+ err = erofs_init_device(&buf, sb, dif, &pos);
+ if (err)
+ break;
+ }
+ }
+ up_read(&sbi->devs->rwsem);
+ erofs_put_metabuf(&buf);
+ return err;
+}
+
static int erofs_read_superblock(struct super_block *sb)
{
struct erofs_sb_info *sbi;
- struct page *page;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
struct erofs_super_block *dsb;
unsigned int blkszbits;
void *data;
int ret;
- page = read_mapping_page(sb->s_bdev->bd_inode->i_mapping, 0, NULL);
- if (IS_ERR(page)) {
+ data = erofs_read_metabuf(&buf, sb, 0, EROFS_KMAP);
+ if (IS_ERR(data)) {
erofs_err(sb, "cannot read erofs superblock");
- return PTR_ERR(page);
+ return PTR_ERR(data);
}
sbi = EROFS_SB(sb);
-
- data = kmap(page);
dsb = (struct erofs_super_block *)(data + EROFS_SUPER_OFFSET);
ret = -EINVAL;
@@ -147,30 +350,48 @@ static int erofs_read_superblock(struct super_block *sb)
}
sbi->feature_compat = le32_to_cpu(dsb->feature_compat);
- if (sbi->feature_compat & EROFS_FEATURE_COMPAT_SB_CHKSUM) {
+ if (erofs_sb_has_sb_chksum(sbi)) {
ret = erofs_superblock_csum_verify(sb, data);
if (ret)
goto out;
}
+ ret = -EINVAL;
blkszbits = dsb->blkszbits;
/* 9(512 bytes) + LOG_SECTORS_PER_BLOCK == LOG_BLOCK_SIZE */
if (blkszbits != LOG_BLOCK_SIZE) {
- erofs_err(sb, "blksize %u isn't supported on this platform",
- 1 << blkszbits);
+ erofs_err(sb, "blkszbits %u isn't supported on this platform",
+ blkszbits);
goto out;
}
if (!check_layout_compatibility(sb, dsb))
goto out;
- sbi->blocks = le32_to_cpu(dsb->blocks);
+ sbi->sb_size = 128 + dsb->sb_extslots * EROFS_SB_EXTSLOT_SIZE;
+ if (sbi->sb_size > EROFS_BLKSIZ) {
+ erofs_err(sb, "invalid sb_extslots %u (more than a fs block)",
+ sbi->sb_size);
+ goto out;
+ }
+ sbi->primarydevice_blocks = le32_to_cpu(dsb->blocks);
sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr);
#ifdef CONFIG_EROFS_FS_XATTR
sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr);
#endif
sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact));
sbi->root_nid = le16_to_cpu(dsb->root_nid);
+#ifdef CONFIG_EROFS_FS_ZIP
+ sbi->packed_inode = NULL;
+ if (erofs_sb_has_fragments(sbi) && dsb->packed_nid) {
+ sbi->packed_inode =
+ erofs_iget(sb, le64_to_cpu(dsb->packed_nid));
+ if (IS_ERR(sbi->packed_inode)) {
+ ret = PTR_ERR(sbi->packed_inode);
+ goto out;
+ }
+ }
+#endif
sbi->inos = le64_to_cpu(dsb->inos);
sbi->build_time = le64_to_cpu(dsb->build_time);
@@ -185,139 +406,199 @@ static int erofs_read_superblock(struct super_block *sb)
ret = -EFSCORRUPTED;
goto out;
}
- ret = 0;
-out:
- kunmap(page);
- put_page(page);
- return ret;
-}
-
-#ifdef CONFIG_EROFS_FS_ZIP
-static int erofs_build_cache_strategy(struct super_block *sb,
- substring_t *args)
-{
- struct erofs_sb_info *sbi = EROFS_SB(sb);
- const char *cs = match_strdup(args);
- int err = 0;
- if (!cs) {
- erofs_err(sb, "Not enough memory to store cache strategy");
- return -ENOMEM;
- }
+ /* parse on-disk compression configurations */
+ if (erofs_sb_has_compr_cfgs(sbi))
+ ret = erofs_load_compr_cfgs(sb, dsb);
+ else
+ ret = z_erofs_load_lz4_config(sb, dsb, NULL, 0);
+ if (ret < 0)
+ goto out;
- if (!strcmp(cs, "disabled")) {
- sbi->cache_strategy = EROFS_ZIP_CACHE_DISABLED;
- } else if (!strcmp(cs, "readahead")) {
- sbi->cache_strategy = EROFS_ZIP_CACHE_READAHEAD;
- } else if (!strcmp(cs, "readaround")) {
- sbi->cache_strategy = EROFS_ZIP_CACHE_READAROUND;
- } else {
- erofs_err(sb, "Unrecognized cache strategy \"%s\"", cs);
- err = -EINVAL;
- }
- kfree(cs);
- return err;
-}
-#else
-static int erofs_build_cache_strategy(struct super_block *sb,
- substring_t *args)
-{
- erofs_info(sb, "EROFS compression is disabled, so cache strategy is ignored");
- return 0;
+ /* handle multiple devices */
+ ret = erofs_scan_devices(sb, dsb);
+
+ if (erofs_sb_has_ztailpacking(sbi))
+ erofs_info(sb, "EXPERIMENTAL compressed inline data feature in use. Use at your own risk!");
+ if (erofs_is_fscache_mode(sb))
+ erofs_info(sb, "EXPERIMENTAL fscache-based on-demand read feature in use. Use at your own risk!");
+ if (erofs_sb_has_fragments(sbi))
+ erofs_info(sb, "EXPERIMENTAL compressed fragments feature in use. Use at your own risk!");
+ if (erofs_sb_has_dedupe(sbi))
+ erofs_info(sb, "EXPERIMENTAL global deduplication feature in use. Use at your own risk!");
+out:
+ erofs_put_metabuf(&buf);
+ return ret;
}
-#endif
/* set up default EROFS parameters */
-static void erofs_default_options(struct erofs_sb_info *sbi)
+static void erofs_default_options(struct erofs_fs_context *ctx)
{
#ifdef CONFIG_EROFS_FS_ZIP
- sbi->cache_strategy = EROFS_ZIP_CACHE_READAROUND;
- sbi->max_sync_decompress_pages = 3;
+ ctx->opt.cache_strategy = EROFS_ZIP_CACHE_READAROUND;
+ ctx->opt.max_sync_decompress_pages = 3;
+ ctx->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_AUTO;
#endif
#ifdef CONFIG_EROFS_FS_XATTR
- set_opt(sbi, XATTR_USER);
+ set_opt(&ctx->opt, XATTR_USER);
#endif
#ifdef CONFIG_EROFS_FS_POSIX_ACL
- set_opt(sbi, POSIX_ACL);
+ set_opt(&ctx->opt, POSIX_ACL);
#endif
}
enum {
Opt_user_xattr,
- Opt_nouser_xattr,
Opt_acl,
- Opt_noacl,
Opt_cache_strategy,
+ Opt_dax,
+ Opt_dax_enum,
+ Opt_device,
+ Opt_fsid,
+ Opt_domain_id,
Opt_err
};
-static match_table_t erofs_tokens = {
- {Opt_user_xattr, "user_xattr"},
- {Opt_nouser_xattr, "nouser_xattr"},
- {Opt_acl, "acl"},
- {Opt_noacl, "noacl"},
- {Opt_cache_strategy, "cache_strategy=%s"},
- {Opt_err, NULL}
+static const struct constant_table erofs_param_cache_strategy[] = {
+ {"disabled", EROFS_ZIP_CACHE_DISABLED},
+ {"readahead", EROFS_ZIP_CACHE_READAHEAD},
+ {"readaround", EROFS_ZIP_CACHE_READAROUND},
+ {}
};
-static int erofs_parse_options(struct super_block *sb, char *options)
-{
- substring_t args[MAX_OPT_ARGS];
- char *p;
- int err;
+static const struct constant_table erofs_dax_param_enums[] = {
+ {"always", EROFS_MOUNT_DAX_ALWAYS},
+ {"never", EROFS_MOUNT_DAX_NEVER},
+ {}
+};
- if (!options)
- return 0;
+static const struct fs_parameter_spec erofs_fs_parameters[] = {
+ fsparam_flag_no("user_xattr", Opt_user_xattr),
+ fsparam_flag_no("acl", Opt_acl),
+ fsparam_enum("cache_strategy", Opt_cache_strategy,
+ erofs_param_cache_strategy),
+ fsparam_flag("dax", Opt_dax),
+ fsparam_enum("dax", Opt_dax_enum, erofs_dax_param_enums),
+ fsparam_string("device", Opt_device),
+ fsparam_string("fsid", Opt_fsid),
+ fsparam_string("domain_id", Opt_domain_id),
+ {}
+};
- while ((p = strsep(&options, ","))) {
- int token;
+static bool erofs_fc_set_dax_mode(struct fs_context *fc, unsigned int mode)
+{
+#ifdef CONFIG_FS_DAX
+ struct erofs_fs_context *ctx = fc->fs_private;
+
+ switch (mode) {
+ case EROFS_MOUNT_DAX_ALWAYS:
+ warnfc(fc, "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
+ set_opt(&ctx->opt, DAX_ALWAYS);
+ clear_opt(&ctx->opt, DAX_NEVER);
+ return true;
+ case EROFS_MOUNT_DAX_NEVER:
+ set_opt(&ctx->opt, DAX_NEVER);
+ clear_opt(&ctx->opt, DAX_ALWAYS);
+ return true;
+ default:
+ DBG_BUGON(1);
+ return false;
+ }
+#else
+ errorfc(fc, "dax options not supported");
+ return false;
+#endif
+}
- if (!*p)
- continue;
+static int erofs_fc_parse_param(struct fs_context *fc,
+ struct fs_parameter *param)
+{
+ struct erofs_fs_context *ctx = fc->fs_private;
+ struct fs_parse_result result;
+ struct erofs_device_info *dif;
+ int opt, ret;
- args[0].to = args[0].from = NULL;
- token = match_token(p, erofs_tokens, args);
+ opt = fs_parse(fc, erofs_fs_parameters, param, &result);
+ if (opt < 0)
+ return opt;
- switch (token) {
+ switch (opt) {
+ case Opt_user_xattr:
#ifdef CONFIG_EROFS_FS_XATTR
- case Opt_user_xattr:
- set_opt(EROFS_SB(sb), XATTR_USER);
- break;
- case Opt_nouser_xattr:
- clear_opt(EROFS_SB(sb), XATTR_USER);
- break;
+ if (result.boolean)
+ set_opt(&ctx->opt, XATTR_USER);
+ else
+ clear_opt(&ctx->opt, XATTR_USER);
#else
- case Opt_user_xattr:
- erofs_info(sb, "user_xattr options not supported");
- break;
- case Opt_nouser_xattr:
- erofs_info(sb, "nouser_xattr options not supported");
- break;
+ errorfc(fc, "{,no}user_xattr options not supported");
#endif
+ break;
+ case Opt_acl:
#ifdef CONFIG_EROFS_FS_POSIX_ACL
- case Opt_acl:
- set_opt(EROFS_SB(sb), POSIX_ACL);
- break;
- case Opt_noacl:
- clear_opt(EROFS_SB(sb), POSIX_ACL);
- break;
+ if (result.boolean)
+ set_opt(&ctx->opt, POSIX_ACL);
+ else
+ clear_opt(&ctx->opt, POSIX_ACL);
#else
- case Opt_acl:
- erofs_info(sb, "acl options not supported");
- break;
- case Opt_noacl:
- erofs_info(sb, "noacl options not supported");
- break;
+ errorfc(fc, "{,no}acl options not supported");
#endif
- case Opt_cache_strategy:
- err = erofs_build_cache_strategy(sb, args);
- if (err)
- return err;
- break;
- default:
- erofs_err(sb, "Unrecognized mount option \"%s\" or missing value", p);
+ break;
+ case Opt_cache_strategy:
+#ifdef CONFIG_EROFS_FS_ZIP
+ ctx->opt.cache_strategy = result.uint_32;
+#else
+ errorfc(fc, "compression not supported, cache_strategy ignored");
+#endif
+ break;
+ case Opt_dax:
+ if (!erofs_fc_set_dax_mode(fc, EROFS_MOUNT_DAX_ALWAYS))
return -EINVAL;
+ break;
+ case Opt_dax_enum:
+ if (!erofs_fc_set_dax_mode(fc, result.uint_32))
+ return -EINVAL;
+ break;
+ case Opt_device:
+ dif = kzalloc(sizeof(*dif), GFP_KERNEL);
+ if (!dif)
+ return -ENOMEM;
+ dif->path = kstrdup(param->string, GFP_KERNEL);
+ if (!dif->path) {
+ kfree(dif);
+ return -ENOMEM;
+ }
+ down_write(&ctx->devs->rwsem);
+ ret = idr_alloc(&ctx->devs->tree, dif, 0, 0, GFP_KERNEL);
+ up_write(&ctx->devs->rwsem);
+ if (ret < 0) {
+ kfree(dif->path);
+ kfree(dif);
+ return ret;
}
+ ++ctx->devs->extra_devices;
+ break;
+ case Opt_fsid:
+#ifdef CONFIG_EROFS_FS_ONDEMAND
+ kfree(ctx->opt.fsid);
+ ctx->opt.fsid = kstrdup(param->string, GFP_KERNEL);
+ if (!ctx->opt.fsid)
+ return -ENOMEM;
+#else
+ errorfc(fc, "fsid option not supported");
+#endif
+ break;
+ case Opt_domain_id:
+#ifdef CONFIG_EROFS_FS_ONDEMAND
+ kfree(ctx->opt.domain_id);
+ ctx->opt.domain_id = kstrdup(param->string, GFP_KERNEL);
+ if (!ctx->opt.domain_id)
+ return -ENOMEM;
+#else
+ errorfc(fc, "domain_id option not supported");
+#endif
+ break;
+ default:
+ return -ENOPARAM;
}
return 0;
}
@@ -325,39 +606,43 @@ static int erofs_parse_options(struct super_block *sb, char *options)
#ifdef CONFIG_EROFS_FS_ZIP
static const struct address_space_operations managed_cache_aops;
-static int erofs_managed_cache_releasepage(struct page *page, gfp_t gfp_mask)
+static bool erofs_managed_cache_release_folio(struct folio *folio, gfp_t gfp)
{
- int ret = 1; /* 0 - busy */
- struct address_space *const mapping = page->mapping;
+ bool ret = true;
+ struct address_space *const mapping = folio->mapping;
- DBG_BUGON(!PageLocked(page));
+ DBG_BUGON(!folio_test_locked(folio));
DBG_BUGON(mapping->a_ops != &managed_cache_aops);
- if (PagePrivate(page))
- ret = erofs_try_to_free_cached_page(mapping, page);
+ if (folio_test_private(folio))
+ ret = erofs_try_to_free_cached_page(&folio->page);
return ret;
}
-static void erofs_managed_cache_invalidatepage(struct page *page,
- unsigned int offset,
- unsigned int length)
+/*
+ * It will be called only on inode eviction. In case there are still some
+ * decompression requests in progress, wait and reschedule for a bit here.
+ * We could introduce extra locking instead, but it seems unnecessary.
+ */
+static void erofs_managed_cache_invalidate_folio(struct folio *folio,
+ size_t offset, size_t length)
{
- const unsigned int stop = length + offset;
+ const size_t stop = length + offset;
- DBG_BUGON(!PageLocked(page));
+ DBG_BUGON(!folio_test_locked(folio));
/* Check for potential overflow in debug mode */
- DBG_BUGON(stop > PAGE_SIZE || stop < length);
+ DBG_BUGON(stop > folio_size(folio) || stop < length);
- if (offset == 0 && stop == PAGE_SIZE)
- while (!erofs_managed_cache_releasepage(page, GFP_NOFS))
+ if (offset == 0 && stop == folio_size(folio))
+ while (!erofs_managed_cache_release_folio(folio, GFP_NOFS))
cond_resched();
}
static const struct address_space_operations managed_cache_aops = {
- .releasepage = erofs_managed_cache_releasepage,
- .invalidatepage = erofs_managed_cache_invalidatepage,
+ .release_folio = erofs_managed_cache_release_folio,
+ .invalidate_folio = erofs_managed_cache_invalidate_folio,
};
static int erofs_init_managed_cache(struct super_block *sb)
@@ -372,8 +657,7 @@ static int erofs_init_managed_cache(struct super_block *sb)
inode->i_size = OFFSET_MAX;
inode->i_mapping->a_ops = &managed_cache_aops;
- mapping_set_gfp_mask(inode->i_mapping,
- GFP_NOFS | __GFP_HIGHMEM | __GFP_MOVABLE);
+ mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
sbi->managed_cache = inode;
return 0;
}
@@ -381,55 +665,124 @@ static int erofs_init_managed_cache(struct super_block *sb)
static int erofs_init_managed_cache(struct super_block *sb) { return 0; }
#endif
-static int erofs_fill_super(struct super_block *sb, void *data, int silent)
+static struct inode *erofs_nfs_get_inode(struct super_block *sb,
+ u64 ino, u32 generation)
+{
+ return erofs_iget(sb, ino);
+}
+
+static struct dentry *erofs_fh_to_dentry(struct super_block *sb,
+ struct fid *fid, int fh_len, int fh_type)
+{
+ return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+ erofs_nfs_get_inode);
+}
+
+static struct dentry *erofs_fh_to_parent(struct super_block *sb,
+ struct fid *fid, int fh_len, int fh_type)
+{
+ return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+ erofs_nfs_get_inode);
+}
+
+static struct dentry *erofs_get_parent(struct dentry *child)
+{
+ erofs_nid_t nid;
+ unsigned int d_type;
+ int err;
+
+ err = erofs_namei(d_inode(child), &dotdot_name, &nid, &d_type);
+ if (err)
+ return ERR_PTR(err);
+ return d_obtain_alias(erofs_iget(child->d_sb, nid));
+}
+
+static const struct export_operations erofs_export_ops = {
+ .fh_to_dentry = erofs_fh_to_dentry,
+ .fh_to_parent = erofs_fh_to_parent,
+ .get_parent = erofs_get_parent,
+};
+
+static int erofs_fc_fill_pseudo_super(struct super_block *sb, struct fs_context *fc)
+{
+ static const struct tree_descr empty_descr = {""};
+
+ return simple_fill_super(sb, EROFS_SUPER_MAGIC, &empty_descr);
+}
+
+static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc)
{
struct inode *inode;
struct erofs_sb_info *sbi;
+ struct erofs_fs_context *ctx = fc->fs_private;
int err;
sb->s_magic = EROFS_SUPER_MAGIC;
-
- if (!sb_set_blocksize(sb, EROFS_BLKSIZ)) {
- erofs_err(sb, "failed to set erofs blksize");
- return -EINVAL;
- }
+ sb->s_flags |= SB_RDONLY | SB_NOATIME;
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+ sb->s_op = &erofs_sops;
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
return -ENOMEM;
sb->s_fs_info = sbi;
+ sbi->opt = ctx->opt;
+ ctx->opt.fsid = NULL;
+ ctx->opt.domain_id = NULL;
+ sbi->devs = ctx->devs;
+ ctx->devs = NULL;
+
+ if (erofs_is_fscache_mode(sb)) {
+ sb->s_blocksize = EROFS_BLKSIZ;
+ sb->s_blocksize_bits = LOG_BLOCK_SIZE;
+
+ err = erofs_fscache_register_fs(sb);
+ if (err)
+ return err;
+
+ err = super_setup_bdi(sb);
+ if (err)
+ return err;
+ } else {
+ if (!sb_set_blocksize(sb, EROFS_BLKSIZ)) {
+ erofs_err(sb, "failed to set erofs blksize");
+ return -EINVAL;
+ }
+
+ sbi->dax_dev = fs_dax_get_by_bdev(sb->s_bdev,
+ &sbi->dax_part_off,
+ NULL, NULL);
+ }
+
err = erofs_read_superblock(sb);
if (err)
return err;
- sb->s_flags |= SB_RDONLY | SB_NOATIME;
- sb->s_maxbytes = MAX_LFS_FILESIZE;
- sb->s_time_gran = 1;
+ if (test_opt(&sbi->opt, DAX_ALWAYS)) {
+ BUILD_BUG_ON(EROFS_BLKSIZ != PAGE_SIZE);
- sb->s_op = &erofs_sops;
+ if (!sbi->dax_dev) {
+ errorfc(fc, "DAX unsupported by block device. Turning off DAX.");
+ clear_opt(&sbi->opt, DAX_ALWAYS);
+ }
+ }
-#ifdef CONFIG_EROFS_FS_XATTR
+ sb->s_time_gran = 1;
sb->s_xattr = erofs_xattr_handlers;
-#endif
- /* set erofs default mount options */
- erofs_default_options(sbi);
-
- err = erofs_parse_options(sb, data);
- if (err)
- return err;
+ sb->s_export_op = &erofs_export_ops;
- if (test_opt(sbi, POSIX_ACL))
+ if (test_opt(&sbi->opt, POSIX_ACL))
sb->s_flags |= SB_POSIXACL;
else
sb->s_flags &= ~SB_POSIXACL;
#ifdef CONFIG_EROFS_FS_ZIP
- INIT_RADIX_TREE(&sbi->workstn_tree, GFP_ATOMIC);
+ xa_init(&sbi->managed_pslots);
#endif
/* get the root inode */
- inode = erofs_iget(sb, ROOT_NID(sbi), true);
+ inode = erofs_iget(sb, ROOT_NID(sbi));
if (IS_ERR(inode))
return PTR_ERR(inode);
@@ -450,15 +803,117 @@ static int erofs_fill_super(struct super_block *sb, void *data, int silent)
if (err)
return err;
- erofs_info(sb, "mounted with opts: %s, root inode @ nid %llu.",
- (char *)data, ROOT_NID(sbi));
+ err = erofs_register_sysfs(sb);
+ if (err)
+ return err;
+
+ erofs_info(sb, "mounted with root inode @ nid %llu.", ROOT_NID(sbi));
return 0;
}
-static struct dentry *erofs_mount(struct file_system_type *fs_type, int flags,
- const char *dev_name, void *data)
+static int erofs_fc_anon_get_tree(struct fs_context *fc)
{
- return mount_bdev(fs_type, flags, dev_name, data, erofs_fill_super);
+ return get_tree_nodev(fc, erofs_fc_fill_pseudo_super);
+}
+
+static int erofs_fc_get_tree(struct fs_context *fc)
+{
+ struct erofs_fs_context *ctx = fc->fs_private;
+
+ if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && ctx->opt.fsid)
+ return get_tree_nodev(fc, erofs_fc_fill_super);
+
+ return get_tree_bdev(fc, erofs_fc_fill_super);
+}
+
+static int erofs_fc_reconfigure(struct fs_context *fc)
+{
+ struct super_block *sb = fc->root->d_sb;
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ struct erofs_fs_context *ctx = fc->fs_private;
+
+ DBG_BUGON(!sb_rdonly(sb));
+
+ if (test_opt(&ctx->opt, POSIX_ACL))
+ fc->sb_flags |= SB_POSIXACL;
+ else
+ fc->sb_flags &= ~SB_POSIXACL;
+
+ sbi->opt = ctx->opt;
+
+ fc->sb_flags |= SB_RDONLY;
+ return 0;
+}
+
+static int erofs_release_device_info(int id, void *ptr, void *data)
+{
+ struct erofs_device_info *dif = ptr;
+
+ fs_put_dax(dif->dax_dev, NULL);
+ if (dif->bdev)
+ blkdev_put(dif->bdev, FMODE_READ | FMODE_EXCL);
+ erofs_fscache_unregister_cookie(dif->fscache);
+ dif->fscache = NULL;
+ kfree(dif->path);
+ kfree(dif);
+ return 0;
+}
+
+static void erofs_free_dev_context(struct erofs_dev_context *devs)
+{
+ if (!devs)
+ return;
+ idr_for_each(&devs->tree, &erofs_release_device_info, NULL);
+ idr_destroy(&devs->tree);
+ kfree(devs);
+}
+
+static void erofs_fc_free(struct fs_context *fc)
+{
+ struct erofs_fs_context *ctx = fc->fs_private;
+
+ erofs_free_dev_context(ctx->devs);
+ kfree(ctx->opt.fsid);
+ kfree(ctx->opt.domain_id);
+ kfree(ctx);
+}
+
+static const struct fs_context_operations erofs_context_ops = {
+ .parse_param = erofs_fc_parse_param,
+ .get_tree = erofs_fc_get_tree,
+ .reconfigure = erofs_fc_reconfigure,
+ .free = erofs_fc_free,
+};
+
+static const struct fs_context_operations erofs_anon_context_ops = {
+ .get_tree = erofs_fc_anon_get_tree,
+};
+
+static int erofs_init_fs_context(struct fs_context *fc)
+{
+ struct erofs_fs_context *ctx;
+
+ /* pseudo mount for anon inodes */
+ if (fc->sb_flags & SB_KERNMOUNT) {
+ fc->ops = &erofs_anon_context_ops;
+ return 0;
+ }
+
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+ ctx->devs = kzalloc(sizeof(struct erofs_dev_context), GFP_KERNEL);
+ if (!ctx->devs) {
+ kfree(ctx);
+ return -ENOMEM;
+ }
+ fc->fs_private = ctx;
+
+ idr_init(&ctx->devs->tree);
+ init_rwsem(&ctx->devs->rwsem);
+ erofs_default_options(ctx);
+ fc->ops = &erofs_context_ops;
+ return 0;
}
/*
@@ -471,11 +926,26 @@ static void erofs_kill_sb(struct super_block *sb)
WARN_ON(sb->s_magic != EROFS_SUPER_MAGIC);
- kill_block_super(sb);
+ /* pseudo mount for anon inodes */
+ if (sb->s_flags & SB_KERNMOUNT) {
+ kill_anon_super(sb);
+ return;
+ }
+
+ if (erofs_is_fscache_mode(sb))
+ kill_anon_super(sb);
+ else
+ kill_block_super(sb);
sbi = EROFS_SB(sb);
if (!sbi)
return;
+
+ erofs_free_dev_context(sbi->devs);
+ fs_put_dax(sbi->dax_dev, NULL);
+ erofs_fscache_unregister_fs(sb);
+ kfree(sbi->opt.fsid);
+ kfree(sbi->opt.domain_id);
kfree(sbi);
sb->s_fs_info = NULL;
}
@@ -487,19 +957,23 @@ static void erofs_put_super(struct super_block *sb)
DBG_BUGON(!sbi);
+ erofs_unregister_sysfs(sb);
erofs_shrinker_unregister(sb);
#ifdef CONFIG_EROFS_FS_ZIP
iput(sbi->managed_cache);
sbi->managed_cache = NULL;
+ iput(sbi->packed_inode);
+ sbi->packed_inode = NULL;
#endif
+ erofs_fscache_unregister_fs(sb);
}
-static struct file_system_type erofs_fs_type = {
+struct file_system_type erofs_fs_type = {
.owner = THIS_MODULE,
.name = "erofs",
- .mount = erofs_mount,
+ .init_fs_context = erofs_init_fs_context,
.kill_sb = erofs_kill_sb,
- .fs_flags = FS_REQUIRES_DEV,
+ .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP,
};
MODULE_ALIAS_FS("erofs");
@@ -522,10 +996,19 @@ static int __init erofs_module_init(void)
if (err)
goto shrinker_err;
+ err = z_erofs_lzma_init();
+ if (err)
+ goto lzma_err;
+
+ erofs_pcpubuf_init();
err = z_erofs_init_zip_subsystem();
if (err)
goto zip_err;
+ err = erofs_init_sysfs();
+ if (err)
+ goto sysfs_err;
+
err = register_filesystem(&erofs_fs_type);
if (err)
goto fs_err;
@@ -533,8 +1016,12 @@ static int __init erofs_module_init(void)
return 0;
fs_err:
+ erofs_exit_sysfs();
+sysfs_err:
z_erofs_exit_zip_subsystem();
zip_err:
+ z_erofs_lzma_exit();
+lzma_err:
erofs_exit_shrinker();
shrinker_err:
kmem_cache_destroy(erofs_inode_cachep);
@@ -545,12 +1032,16 @@ icache_err:
static void __exit erofs_module_exit(void)
{
unregister_filesystem(&erofs_fs_type);
- z_erofs_exit_zip_subsystem();
- erofs_exit_shrinker();
- /* Ensure all RCU free inodes are safe before cache is destroyed. */
+ /* Ensure all RCU free inodes / pclusters are safe to be destroyed. */
rcu_barrier();
+
+ erofs_exit_sysfs();
+ z_erofs_exit_zip_subsystem();
+ z_erofs_lzma_exit();
+ erofs_exit_shrinker();
kmem_cache_destroy(erofs_inode_cachep);
+ erofs_pcpubuf_exit();
}
/* get filesystem statistics */
@@ -558,11 +1049,14 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
struct erofs_sb_info *sbi = EROFS_SB(sb);
- u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
+ u64 id = 0;
+
+ if (!erofs_is_fscache_mode(sb))
+ id = huge_encode_dev(sb->s_bdev->bd_dev);
buf->f_type = sb->s_magic;
buf->f_bsize = EROFS_BLKSIZ;
- buf->f_blocks = sbi->blocks;
+ buf->f_blocks = sbi->total_blocks;
buf->f_bfree = buf->f_bavail = 0;
buf->f_files = ULLONG_MAX;
@@ -570,69 +1064,54 @@ static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_namelen = EROFS_NAME_LEN;
- buf->f_fsid.val[0] = (u32)id;
- buf->f_fsid.val[1] = (u32)(id >> 32);
+ buf->f_fsid = u64_to_fsid(id);
return 0;
}
static int erofs_show_options(struct seq_file *seq, struct dentry *root)
{
- struct erofs_sb_info *sbi __maybe_unused = EROFS_SB(root->d_sb);
+ struct erofs_sb_info *sbi = EROFS_SB(root->d_sb);
+ struct erofs_mount_opts *opt = &sbi->opt;
#ifdef CONFIG_EROFS_FS_XATTR
- if (test_opt(sbi, XATTR_USER))
+ if (test_opt(opt, XATTR_USER))
seq_puts(seq, ",user_xattr");
else
seq_puts(seq, ",nouser_xattr");
#endif
#ifdef CONFIG_EROFS_FS_POSIX_ACL
- if (test_opt(sbi, POSIX_ACL))
+ if (test_opt(opt, POSIX_ACL))
seq_puts(seq, ",acl");
else
seq_puts(seq, ",noacl");
#endif
#ifdef CONFIG_EROFS_FS_ZIP
- if (sbi->cache_strategy == EROFS_ZIP_CACHE_DISABLED) {
+ if (opt->cache_strategy == EROFS_ZIP_CACHE_DISABLED)
seq_puts(seq, ",cache_strategy=disabled");
- } else if (sbi->cache_strategy == EROFS_ZIP_CACHE_READAHEAD) {
+ else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAHEAD)
seq_puts(seq, ",cache_strategy=readahead");
- } else if (sbi->cache_strategy == EROFS_ZIP_CACHE_READAROUND) {
+ else if (opt->cache_strategy == EROFS_ZIP_CACHE_READAROUND)
seq_puts(seq, ",cache_strategy=readaround");
- }
+#endif
+ if (test_opt(opt, DAX_ALWAYS))
+ seq_puts(seq, ",dax=always");
+ if (test_opt(opt, DAX_NEVER))
+ seq_puts(seq, ",dax=never");
+#ifdef CONFIG_EROFS_FS_ONDEMAND
+ if (opt->fsid)
+ seq_printf(seq, ",fsid=%s", opt->fsid);
+ if (opt->domain_id)
+ seq_printf(seq, ",domain_id=%s", opt->domain_id);
#endif
return 0;
}
-static int erofs_remount(struct super_block *sb, int *flags, char *data)
-{
- struct erofs_sb_info *sbi = EROFS_SB(sb);
- unsigned int org_mnt_opt = sbi->mount_opt;
- int err;
-
- DBG_BUGON(!sb_rdonly(sb));
- err = erofs_parse_options(sb, data);
- if (err)
- goto out;
-
- if (test_opt(sbi, POSIX_ACL))
- sb->s_flags |= SB_POSIXACL;
- else
- sb->s_flags &= ~SB_POSIXACL;
-
- *flags |= SB_RDONLY;
- return 0;
-out:
- sbi->mount_opt = org_mnt_opt;
- return err;
-}
-
const struct super_operations erofs_sops = {
.put_super = erofs_put_super,
.alloc_inode = erofs_alloc_inode,
.free_inode = erofs_free_inode,
.statfs = erofs_statfs,
.show_options = erofs_show_options,
- .remount_fs = erofs_remount,
};
module_init(erofs_module_init);
@@ -641,4 +1120,3 @@ module_exit(erofs_module_exit);
MODULE_DESCRIPTION("Enhanced ROM File System");
MODULE_AUTHOR("Gao Xiang, Chao Yu, Miao Xie, CONSUMER BG, HUAWEI Inc.");
MODULE_LICENSE("GPL");
-
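erofs_read_metadata() above walks variable-sized records stored after the extended superblock: each record is 4-byte aligned and starts with a __le16 payload length, where a length of 0 stands for U16_MAX + 1 bytes. The on-disk shape it decodes is roughly the following (an illustrative declaration, not copied from erofs_fs.h):

struct example_compr_cfg_record {
	__le16 length;		/* payload size in bytes; 0 means U16_MAX + 1 */
	u8 payload[];		/* e.g. struct z_erofs_lz4_cfgs for LZ4 */
} __packed;

erofs_load_compr_cfgs() then iterates the set bits of available_compr_algs in ascending order and feeds each payload to the matching z_erofs_load_*_config() helper.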
diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c
new file mode 100644
index 000000000000..783bb7b21b51
--- /dev/null
+++ b/fs/erofs/sysfs.c
@@ -0,0 +1,277 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C), 2008-2021, OPPO Mobile Comm Corp., Ltd.
+ * https://www.oppo.com/
+ */
+#include <linux/sysfs.h>
+#include <linux/kobject.h>
+
+#include "internal.h"
+
+enum {
+ attr_feature,
+ attr_pointer_ui,
+ attr_pointer_bool,
+};
+
+enum {
+ struct_erofs_sb_info,
+ struct_erofs_mount_opts,
+};
+
+struct erofs_attr {
+ struct attribute attr;
+ short attr_id;
+ int struct_type, offset;
+};
+
+#define EROFS_ATTR(_name, _mode, _id) \
+static struct erofs_attr erofs_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = _mode }, \
+ .attr_id = attr_##_id, \
+}
+#define EROFS_ATTR_FUNC(_name, _mode) EROFS_ATTR(_name, _mode, _name)
+#define EROFS_ATTR_FEATURE(_name) EROFS_ATTR(_name, 0444, feature)
+
+#define EROFS_ATTR_OFFSET(_name, _mode, _id, _struct) \
+static struct erofs_attr erofs_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = _mode }, \
+ .attr_id = attr_##_id, \
+ .struct_type = struct_##_struct, \
+ .offset = offsetof(struct _struct, _name),\
+}
+
+#define EROFS_ATTR_RW(_name, _id, _struct) \
+ EROFS_ATTR_OFFSET(_name, 0644, _id, _struct)
+
+#define EROFS_RO_ATTR(_name, _id, _struct) \
+ EROFS_ATTR_OFFSET(_name, 0444, _id, _struct)
+
+#define EROFS_ATTR_RW_UI(_name, _struct) \
+ EROFS_ATTR_RW(_name, pointer_ui, _struct)
+
+#define EROFS_ATTR_RW_BOOL(_name, _struct) \
+ EROFS_ATTR_RW(_name, pointer_bool, _struct)
+
+#define ATTR_LIST(name) (&erofs_attr_##name.attr)
+
+#ifdef CONFIG_EROFS_FS_ZIP
+EROFS_ATTR_RW_UI(sync_decompress, erofs_mount_opts);
+#endif
+
+static struct attribute *erofs_attrs[] = {
+#ifdef CONFIG_EROFS_FS_ZIP
+ ATTR_LIST(sync_decompress),
+#endif
+ NULL,
+};
+ATTRIBUTE_GROUPS(erofs);
+
+/* Features this copy of erofs supports */
+EROFS_ATTR_FEATURE(zero_padding);
+EROFS_ATTR_FEATURE(compr_cfgs);
+EROFS_ATTR_FEATURE(big_pcluster);
+EROFS_ATTR_FEATURE(chunked_file);
+EROFS_ATTR_FEATURE(device_table);
+EROFS_ATTR_FEATURE(compr_head2);
+EROFS_ATTR_FEATURE(sb_chksum);
+EROFS_ATTR_FEATURE(ztailpacking);
+EROFS_ATTR_FEATURE(fragments);
+EROFS_ATTR_FEATURE(dedupe);
+
+static struct attribute *erofs_feat_attrs[] = {
+ ATTR_LIST(zero_padding),
+ ATTR_LIST(compr_cfgs),
+ ATTR_LIST(big_pcluster),
+ ATTR_LIST(chunked_file),
+ ATTR_LIST(device_table),
+ ATTR_LIST(compr_head2),
+ ATTR_LIST(sb_chksum),
+ ATTR_LIST(ztailpacking),
+ ATTR_LIST(fragments),
+ ATTR_LIST(dedupe),
+ NULL,
+};
+ATTRIBUTE_GROUPS(erofs_feat);
+
+static unsigned char *__struct_ptr(struct erofs_sb_info *sbi,
+ int struct_type, int offset)
+{
+ if (struct_type == struct_erofs_sb_info)
+ return (unsigned char *)sbi + offset;
+ if (struct_type == struct_erofs_mount_opts)
+ return (unsigned char *)&sbi->opt + offset;
+ return NULL;
+}
+
+static ssize_t erofs_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct erofs_sb_info *sbi = container_of(kobj, struct erofs_sb_info,
+ s_kobj);
+ struct erofs_attr *a = container_of(attr, struct erofs_attr, attr);
+ unsigned char *ptr = __struct_ptr(sbi, a->struct_type, a->offset);
+
+ switch (a->attr_id) {
+ case attr_feature:
+ return sysfs_emit(buf, "supported\n");
+ case attr_pointer_ui:
+ if (!ptr)
+ return 0;
+ return sysfs_emit(buf, "%u\n", *(unsigned int *)ptr);
+ case attr_pointer_bool:
+ if (!ptr)
+ return 0;
+ return sysfs_emit(buf, "%d\n", *(bool *)ptr);
+ }
+ return 0;
+}
+
+static ssize_t erofs_attr_store(struct kobject *kobj, struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct erofs_sb_info *sbi = container_of(kobj, struct erofs_sb_info,
+ s_kobj);
+ struct erofs_attr *a = container_of(attr, struct erofs_attr, attr);
+ unsigned char *ptr = __struct_ptr(sbi, a->struct_type, a->offset);
+ unsigned long t;
+ int ret;
+
+ switch (a->attr_id) {
+ case attr_pointer_ui:
+ if (!ptr)
+ return 0;
+ ret = kstrtoul(skip_spaces(buf), 0, &t);
+ if (ret)
+ return ret;
+ if (t != (unsigned int)t)
+ return -ERANGE;
+#ifdef CONFIG_EROFS_FS_ZIP
+ if (!strcmp(a->attr.name, "sync_decompress") &&
+ (t > EROFS_SYNC_DECOMPRESS_FORCE_OFF))
+ return -EINVAL;
+#endif
+ *(unsigned int *)ptr = t;
+ return len;
+ case attr_pointer_bool:
+ if (!ptr)
+ return 0;
+ ret = kstrtoul(skip_spaces(buf), 0, &t);
+ if (ret)
+ return ret;
+ if (t != 0 && t != 1)
+ return -EINVAL;
+ *(bool *)ptr = !!t;
+ return len;
+ }
+ return 0;
+}
+
+static void erofs_sb_release(struct kobject *kobj)
+{
+ struct erofs_sb_info *sbi = container_of(kobj, struct erofs_sb_info,
+ s_kobj);
+ complete(&sbi->s_kobj_unregister);
+}
+
+static const struct sysfs_ops erofs_attr_ops = {
+ .show = erofs_attr_show,
+ .store = erofs_attr_store,
+};
+
+static struct kobj_type erofs_sb_ktype = {
+ .default_groups = erofs_groups,
+ .sysfs_ops = &erofs_attr_ops,
+ .release = erofs_sb_release,
+};
+
+static struct kobj_type erofs_ktype = {
+ .sysfs_ops = &erofs_attr_ops,
+};
+
+static struct kset erofs_root = {
+ .kobj = {.ktype = &erofs_ktype},
+};
+
+static struct kobj_type erofs_feat_ktype = {
+ .default_groups = erofs_feat_groups,
+ .sysfs_ops = &erofs_attr_ops,
+};
+
+static struct kobject erofs_feat = {
+ .kset = &erofs_root,
+};
+
+int erofs_register_sysfs(struct super_block *sb)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+ char *name;
+ char *str = NULL;
+ int err;
+
+ if (erofs_is_fscache_mode(sb)) {
+ if (sbi->opt.domain_id) {
+ str = kasprintf(GFP_KERNEL, "%s,%s", sbi->opt.domain_id,
+ sbi->opt.fsid);
+ if (!str)
+ return -ENOMEM;
+ name = str;
+ } else {
+ name = sbi->opt.fsid;
+ }
+ } else {
+ name = sb->s_id;
+ }
+ sbi->s_kobj.kset = &erofs_root;
+ init_completion(&sbi->s_kobj_unregister);
+ err = kobject_init_and_add(&sbi->s_kobj, &erofs_sb_ktype, NULL, "%s", name);
+ kfree(str);
+ if (err)
+ goto put_sb_kobj;
+ return 0;
+
+put_sb_kobj:
+ kobject_put(&sbi->s_kobj);
+ wait_for_completion(&sbi->s_kobj_unregister);
+ return err;
+}
+
+void erofs_unregister_sysfs(struct super_block *sb)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(sb);
+
+ if (sbi->s_kobj.state_in_sysfs) {
+ kobject_del(&sbi->s_kobj);
+ kobject_put(&sbi->s_kobj);
+ wait_for_completion(&sbi->s_kobj_unregister);
+ }
+}
+
+int __init erofs_init_sysfs(void)
+{
+ int ret;
+
+ kobject_set_name(&erofs_root.kobj, "erofs");
+ erofs_root.kobj.parent = fs_kobj;
+ ret = kset_register(&erofs_root);
+ if (ret)
+ goto root_err;
+
+ ret = kobject_init_and_add(&erofs_feat, &erofs_feat_ktype,
+ NULL, "features");
+ if (ret)
+ goto feat_err;
+ return ret;
+
+feat_err:
+ kobject_put(&erofs_feat);
+ kset_unregister(&erofs_root);
+root_err:
+ return ret;
+}
+
+void erofs_exit_sysfs(void)
+{
+ kobject_put(&erofs_feat);
+ kset_unregister(&erofs_root);
+}
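For reference, the attribute macros above expand one line into a fully populated descriptor; EROFS_ATTR_RW_UI(sync_decompress, erofs_mount_opts), for instance, goes through EROFS_ATTR_RW and EROFS_ATTR_OFFSET and ends up as roughly:

static struct erofs_attr erofs_attr_sync_decompress = {
	.attr		= { .name = "sync_decompress", .mode = 0644 },
	.attr_id	= attr_pointer_ui,
	.struct_type	= struct_erofs_mount_opts,
	.offset		= offsetof(struct erofs_mount_opts, sync_decompress),
};

erofs_attr_show() and erofs_attr_store() then use struct_type and offset through __struct_ptr() to read or update the underlying unsigned int inside sbi->opt.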
diff --git a/fs/erofs/tagptr.h b/fs/erofs/tagptr.h
index a72897c86744..64ceb7270b5c 100644
--- a/fs/erofs/tagptr.h
+++ b/fs/erofs/tagptr.h
@@ -1,8 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* A tagged pointer implementation
- *
- * Copyright (C) 2018 Gao Xiang <gaoxiang25@huawei.com>
*/
#ifndef __EROFS_FS_TAGPTR_H
#define __EROFS_FS_TAGPTR_H
@@ -107,4 +105,3 @@ tagptr_init(o, cmpxchg(&ptptr->v, o.v, n.v)); })
*ptptr; })
#endif /* __EROFS_FS_TAGPTR_H */
-
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
index fddc5059c930..46627cb69abe 100644
--- a/fs/erofs/utils.c
+++ b/fs/erofs/utils.c
@@ -1,45 +1,38 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2018 HUAWEI, Inc.
- * http://www.huawei.com/
- * Created by Gao Xiang <gaoxiang25@huawei.com>
+ * https://www.huawei.com/
*/
#include "internal.h"
#include <linux/pagevec.h>
-struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
+struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp)
{
- struct page *page;
+ struct page *page = *pagepool;
- if (!list_empty(pool)) {
- page = lru_to_page(pool);
+ if (page) {
DBG_BUGON(page_ref_count(page) != 1);
- list_del(&page->lru);
+ *pagepool = (struct page *)page_private(page);
} else {
page = alloc_page(gfp);
}
return page;
}
-#if (EROFS_PCPUBUF_NR_PAGES > 0)
-static struct {
- u8 data[PAGE_SIZE * EROFS_PCPUBUF_NR_PAGES];
-} ____cacheline_aligned_in_smp erofs_pcpubuf[NR_CPUS];
-
-void *erofs_get_pcpubuf(unsigned int pagenr)
+void erofs_release_pages(struct page **pagepool)
{
- preempt_disable();
- return &erofs_pcpubuf[smp_processor_id()].data[pagenr * PAGE_SIZE];
+ while (*pagepool) {
+ struct page *page = *pagepool;
+
+ *pagepool = (struct page *)page_private(page);
+ put_page(page);
+ }
}
-#endif
#ifdef CONFIG_EROFS_FS_ZIP
/* global shrink count (for all mounted EROFS instances) */
static atomic_long_t erofs_global_shrink_cnt;
-#define __erofs_workgroup_get(grp) atomic_inc(&(grp)->refcount)
-#define __erofs_workgroup_put(grp) atomic_dec(&(grp)->refcount)
-
static int erofs_workgroup_get(struct erofs_workgroup *grp)
{
int o;
@@ -66,7 +59,7 @@ struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
repeat:
rcu_read_lock();
- grp = radix_tree_lookup(&sbi->workstn_tree, index);
+ grp = xa_load(&sbi->managed_pslots, index);
if (grp) {
if (erofs_workgroup_get(grp)) {
/* prefer to relax rcu read side */
@@ -80,43 +73,37 @@ repeat:
return grp;
}
-int erofs_register_workgroup(struct super_block *sb,
- struct erofs_workgroup *grp)
+struct erofs_workgroup *erofs_insert_workgroup(struct super_block *sb,
+ struct erofs_workgroup *grp)
{
- struct erofs_sb_info *sbi;
- int err;
-
- /* grp shouldn't be broken or used before */
- if (atomic_read(&grp->refcount) != 1) {
- DBG_BUGON(1);
- return -EINVAL;
- }
-
- err = radix_tree_preload(GFP_NOFS);
- if (err)
- return err;
-
- sbi = EROFS_SB(sb);
- xa_lock(&sbi->workstn_tree);
+ struct erofs_sb_info *const sbi = EROFS_SB(sb);
+ struct erofs_workgroup *pre;
/*
- * Bump up reference count before making this workgroup
- * visible to other users in order to avoid potential UAF
- * without serialized by workstn_lock.
+ * Bump up the reference count before making this visible
+ * to others in the XArray in order to avoid potential
+ * UAF without being serialized by xa_lock.
*/
- __erofs_workgroup_get(grp);
+ atomic_inc(&grp->refcount);
- err = radix_tree_insert(&sbi->workstn_tree, grp->index, grp);
- if (err)
- /*
- * it's safe to decrease since the workgroup isn't visible
- * and refcount >= 2 (cannot be freezed).
- */
- __erofs_workgroup_put(grp);
-
- xa_unlock(&sbi->workstn_tree);
- radix_tree_preload_end();
- return err;
+repeat:
+ xa_lock(&sbi->managed_pslots);
+ pre = __xa_cmpxchg(&sbi->managed_pslots, grp->index,
+ NULL, grp, GFP_NOFS);
+ if (pre) {
+ if (xa_is_err(pre)) {
+ pre = ERR_PTR(xa_err(pre));
+ } else if (erofs_workgroup_get(pre)) {
+ /* try to legitimize the current in-tree one */
+ xa_unlock(&sbi->managed_pslots);
+ cond_resched();
+ goto repeat;
+ }
+ atomic_dec(&grp->refcount);
+ grp = pre;
+ }
+ xa_unlock(&sbi->managed_pslots);
+ return grp;
}
static void __erofs_workgroup_free(struct erofs_workgroup *grp)
@@ -136,12 +123,6 @@ int erofs_workgroup_put(struct erofs_workgroup *grp)
return count;
}
-static void erofs_workgroup_unfreeze_final(struct erofs_workgroup *grp)
-{
- erofs_workgroup_unfreeze(grp, 0);
- __erofs_workgroup_free(grp);
-}
-
static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
struct erofs_workgroup *grp)
{
@@ -155,7 +136,7 @@ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
/*
* Note that all cached pages should be unattached
- * before deleted from the radix tree. Otherwise some
+ * before being deleted from the XArray. Otherwise some
* cached pages could be still attached to the orphan
* old workgroup when the new one is available in the tree.
*/
@@ -169,47 +150,34 @@ static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
* however in order to avoid some race conditions, add a
* DBG_BUGON to observe this in advance.
*/
- DBG_BUGON(radix_tree_delete(&sbi->workstn_tree, grp->index) != grp);
+ DBG_BUGON(__xa_erase(&sbi->managed_pslots, grp->index) != grp);
- /*
- * If managed cache is on, last refcount should indicate
- * the related workstation.
- */
- erofs_workgroup_unfreeze_final(grp);
+ /* last refcount should be connected with its managed pslot. */
+ erofs_workgroup_unfreeze(grp, 0);
+ __erofs_workgroup_free(grp);
return true;
}
static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
unsigned long nr_shrink)
{
- pgoff_t first_index = 0;
- void *batch[PAGEVEC_SIZE];
+ struct erofs_workgroup *grp;
unsigned int freed = 0;
+ unsigned long index;
- int i, found;
-repeat:
- xa_lock(&sbi->workstn_tree);
-
- found = radix_tree_gang_lookup(&sbi->workstn_tree,
- batch, first_index, PAGEVEC_SIZE);
-
- for (i = 0; i < found; ++i) {
- struct erofs_workgroup *grp = batch[i];
-
- first_index = grp->index + 1;
-
+ xa_lock(&sbi->managed_pslots);
+ xa_for_each(&sbi->managed_pslots, index, grp) {
/* try to shrink each valid workgroup */
if (!erofs_try_to_release_workgroup(sbi, grp))
continue;
+ xa_unlock(&sbi->managed_pslots);
++freed;
if (!--nr_shrink)
- break;
+ return freed;
+ xa_lock(&sbi->managed_pslots);
}
- xa_unlock(&sbi->workstn_tree);
-
- if (i && nr_shrink)
- goto repeat;
+ xa_unlock(&sbi->managed_pslots);
return freed;
}
@@ -286,7 +254,7 @@ static unsigned long erofs_shrink_scan(struct shrinker *shrink,
spin_unlock(&erofs_sb_list_lock);
sbi->shrinker_run_no = run_no;
- freed += erofs_shrink_workstation(sbi, nr);
+ freed += erofs_shrink_workstation(sbi, nr - freed);
spin_lock(&erofs_sb_list_lock);
/* Get the next list element before we move this one */
@@ -314,7 +282,7 @@ static struct shrinker erofs_shrinker_info = {
int __init erofs_init_shrinker(void)
{
- return register_shrinker(&erofs_shrinker_info);
+ return register_shrinker(&erofs_shrinker_info, "erofs-shrinker");
}
void erofs_exit_shrinker(void)
@@ -322,4 +290,3 @@ void erofs_exit_shrinker(void)
unregister_shrinker(&erofs_shrinker_info);
}
#endif /* !CONFIG_EROFS_FS_ZIP */
-
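The page pool that replaces the old list-based one is a singly-linked chain threaded through page_private(): erofs_allocpage() pops from it (falling back to alloc_page()), erofs_pagepool_add() pushes a page back, and erofs_release_pages() drains whatever is left. A minimal illustrative cycle (function name hypothetical):

static void example_pagepool_cycle(void)
{
	struct page *pagepool = NULL;	/* empty pool */
	struct page *page;

	/* nothing cached yet, so this falls through to alloc_page() */
	page = erofs_allocpage(&pagepool, GFP_KERNEL);
	if (!page)
		return;

	/* done with it for now: stash it in the pool instead of freeing */
	erofs_pagepool_add(&pagepool, page);

	/* a later request is served from the pool without allocating */
	page = erofs_allocpage(&pagepool, GFP_KERNEL);
	erofs_pagepool_add(&pagepool, page);

	/* drop everything still chained in the pool */
	erofs_release_pages(&pagepool);
}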
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
index b766c3ee5fa8..8106bcb5a38d 100644
--- a/fs/erofs/xattr.c
+++ b/fs/erofs/xattr.c
@@ -1,41 +1,21 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2017-2018 HUAWEI, Inc.
- * http://www.huawei.com/
- * Created by Gao Xiang <gaoxiang25@huawei.com>
+ * https://www.huawei.com/
+ * Copyright (C) 2021-2022, Alibaba Cloud
*/
#include <linux/security.h>
#include "xattr.h"
struct xattr_iter {
struct super_block *sb;
- struct page *page;
+ struct erofs_buf buf;
void *kaddr;
erofs_blk_t blkaddr;
unsigned int ofs;
};
-static inline void xattr_iter_end(struct xattr_iter *it, bool atomic)
-{
- /* the only user of kunmap() is 'init_inode_xattrs' */
- if (!atomic)
- kunmap(it->page);
- else
- kunmap_atomic(it->kaddr);
-
- unlock_page(it->page);
- put_page(it->page);
-}
-
-static inline void xattr_iter_end_final(struct xattr_iter *it)
-{
- if (!it->page)
- return;
-
- xattr_iter_end(it, true);
-}
-
static int init_inode_xattrs(struct inode *inode)
{
struct erofs_inode *const vi = EROFS_I(inode);
@@ -44,12 +24,17 @@ static int init_inode_xattrs(struct inode *inode)
struct erofs_xattr_ibody_header *ih;
struct super_block *sb;
struct erofs_sb_info *sbi;
- bool atomic_map;
int ret = 0;
/* in most cases, xattrs of this inode have already been initialized. */
- if (test_bit(EROFS_I_EA_INITED_BIT, &vi->flags))
+ if (test_bit(EROFS_I_EA_INITED_BIT, &vi->flags)) {
+ /*
+ * paired with smp_mb() at the end of the function to ensure
+ * fields will only be observed after the bit is set.
+ */
+ smp_mb();
return 0;
+ }
if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_XATTR_BIT, TASK_KILLABLE))
return -ERESTARTSYS;
@@ -86,26 +71,23 @@ static int init_inode_xattrs(struct inode *inode)
sb = inode->i_sb;
sbi = EROFS_SB(sb);
+ it.buf = __EROFS_BUF_INITIALIZER;
it.blkaddr = erofs_blknr(iloc(sbi, vi->nid) + vi->inode_isize);
it.ofs = erofs_blkoff(iloc(sbi, vi->nid) + vi->inode_isize);
- it.page = erofs_get_meta_page(sb, it.blkaddr);
- if (IS_ERR(it.page)) {
- ret = PTR_ERR(it.page);
+ /* read in shared xattr array (non-atomic, see kmalloc below) */
+ it.kaddr = erofs_read_metabuf(&it.buf, sb, it.blkaddr, EROFS_KMAP);
+ if (IS_ERR(it.kaddr)) {
+ ret = PTR_ERR(it.kaddr);
goto out_unlock;
}
- /* read in shared xattr array (non-atomic, see kmalloc below) */
- it.kaddr = kmap(it.page);
- atomic_map = false;
-
ih = (struct erofs_xattr_ibody_header *)(it.kaddr + it.ofs);
-
vi->xattr_shared_count = ih->h_shared_count;
vi->xattr_shared_xattrs = kmalloc_array(vi->xattr_shared_count,
sizeof(uint), GFP_KERNEL);
if (!vi->xattr_shared_xattrs) {
- xattr_iter_end(&it, atomic_map);
+ erofs_put_metabuf(&it.buf);
ret = -ENOMEM;
goto out_unlock;
}
@@ -117,26 +99,25 @@ static int init_inode_xattrs(struct inode *inode)
if (it.ofs >= EROFS_BLKSIZ) {
/* cannot be unaligned */
DBG_BUGON(it.ofs != EROFS_BLKSIZ);
- xattr_iter_end(&it, atomic_map);
- it.page = erofs_get_meta_page(sb, ++it.blkaddr);
- if (IS_ERR(it.page)) {
+ it.kaddr = erofs_read_metabuf(&it.buf, sb, ++it.blkaddr,
+ EROFS_KMAP);
+ if (IS_ERR(it.kaddr)) {
kfree(vi->xattr_shared_xattrs);
vi->xattr_shared_xattrs = NULL;
- ret = PTR_ERR(it.page);
+ ret = PTR_ERR(it.kaddr);
goto out_unlock;
}
-
- it.kaddr = kmap_atomic(it.page);
- atomic_map = true;
it.ofs = 0;
}
vi->xattr_shared_xattrs[i] =
le32_to_cpu(*(__le32 *)(it.kaddr + it.ofs));
it.ofs += sizeof(__le32);
}
- xattr_iter_end(&it, atomic_map);
+ erofs_put_metabuf(&it.buf);
+ /* paired with smp_mb() at the beginning of the function. */
+ smp_mb();
set_bit(EROFS_I_EA_INITED_BIT, &vi->flags);
out_unlock:
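The init_inode_xattrs() hunk above pairs an smp_mb() before set_bit(EROFS_I_EA_INITED_BIT) with an smp_mb() after the fast-path test_bit(), so a reader that observes the bit also observes the fields initialized before it. The sketch below shows the same publish/observe pairing using C11 release/acquire ordering (slightly stronger than the exact kernel barriers, but portable); struct xattr_state and the function names are illustrative only.

    #include <stdatomic.h>
    #include <stdbool.h>

    struct xattr_state {
        unsigned int shared_count;      /* fields filled by the slow path */
        unsigned int *shared_ids;
        atomic_bool inited;             /* analogue of EROFS_I_EA_INITED_BIT */
    };

    /* slow path: fill the fields first, then publish them via the flag */
    static void publish(struct xattr_state *st, unsigned int *ids, unsigned int n)
    {
        st->shared_ids = ids;
        st->shared_count = n;
        /* release: everything above is visible before the flag flips */
        atomic_store_explicit(&st->inited, true, memory_order_release);
    }

    /* fast path: if the flag is observed, the fields above are valid too */
    static bool already_inited(struct xattr_state *st)
    {
        /* acquire: pairs with the release store in publish() */
        return atomic_load_explicit(&st->inited, memory_order_acquire);
    }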
@@ -165,19 +146,11 @@ static inline int xattr_iter_fixup(struct xattr_iter *it)
if (it->ofs < EROFS_BLKSIZ)
return 0;
- xattr_iter_end(it, true);
-
it->blkaddr += erofs_blknr(it->ofs);
-
- it->page = erofs_get_meta_page(it->sb, it->blkaddr);
- if (IS_ERR(it->page)) {
- int err = PTR_ERR(it->page);
-
- it->page = NULL;
- return err;
- }
-
- it->kaddr = kmap_atomic(it->page);
+ it->kaddr = erofs_read_metabuf(&it->buf, it->sb, it->blkaddr,
+ EROFS_KMAP_ATOMIC);
+ if (IS_ERR(it->kaddr))
+ return PTR_ERR(it->kaddr);
it->ofs = erofs_blkoff(it->ofs);
return 0;
}
@@ -200,11 +173,10 @@ static int inline_xattr_iter_begin(struct xattr_iter *it,
it->blkaddr = erofs_blknr(iloc(sbi, vi->nid) + inline_xattr_ofs);
it->ofs = erofs_blkoff(iloc(sbi, vi->nid) + inline_xattr_ofs);
- it->page = erofs_get_meta_page(inode->i_sb, it->blkaddr);
- if (IS_ERR(it->page))
- return PTR_ERR(it->page);
-
- it->kaddr = kmap_atomic(it->page);
+ it->kaddr = erofs_read_metabuf(&it->buf, inode->i_sb, it->blkaddr,
+ EROFS_KMAP_ATOMIC);
+ if (IS_ERR(it->kaddr))
+ return PTR_ERR(it->kaddr);
return vi->xattr_isize - xattr_header_sz;
}
@@ -265,7 +237,7 @@ static int xattr_foreach(struct xattr_iter *it,
it->ofs = 0;
}
- slice = min_t(unsigned int, PAGE_SIZE - it->ofs,
+ slice = min_t(unsigned int, EROFS_BLKSIZ - it->ofs,
entry.e_name_len - processed);
/* handle name */
@@ -300,7 +272,7 @@ static int xattr_foreach(struct xattr_iter *it,
it->ofs = 0;
}
- slice = min_t(unsigned int, PAGE_SIZE - it->ofs,
+ slice = min_t(unsigned int, EROFS_BLKSIZ - it->ofs,
value_sz - processed);
op->value(it, processed, it->kaddr + it->ofs, slice);
it->ofs += slice;
@@ -379,8 +351,6 @@ static int inline_getxattr(struct inode *inode, struct getxattr_iter *it)
if (ret != -ENOATTR)
break;
}
- xattr_iter_end_final(&it->it);
-
return ret ? ret : it->buffer_size;
}
@@ -397,32 +367,22 @@ static int shared_getxattr(struct inode *inode, struct getxattr_iter *it)
xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]);
it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]);
-
- if (!i || blkaddr != it->it.blkaddr) {
- if (i)
- xattr_iter_end(&it->it, true);
-
- it->it.page = erofs_get_meta_page(sb, blkaddr);
- if (IS_ERR(it->it.page))
- return PTR_ERR(it->it.page);
-
- it->it.kaddr = kmap_atomic(it->it.page);
- it->it.blkaddr = blkaddr;
- }
+ it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, blkaddr,
+ EROFS_KMAP_ATOMIC);
+ if (IS_ERR(it->it.kaddr))
+ return PTR_ERR(it->it.kaddr);
+ it->it.blkaddr = blkaddr;
ret = xattr_foreach(&it->it, &find_xattr_handlers, NULL);
if (ret != -ENOATTR)
break;
}
- if (vi->xattr_shared_count)
- xattr_iter_end_final(&it->it);
-
return ret ? ret : it->buffer_size;
}
static bool erofs_xattr_user_list(struct dentry *dentry)
{
- return test_opt(EROFS_SB(dentry->d_sb), XATTR_USER);
+ return test_opt(&EROFS_SB(dentry->d_sb)->opt, XATTR_USER);
}
static bool erofs_xattr_trusted_list(struct dentry *dentry)
@@ -445,10 +405,11 @@ int erofs_getxattr(struct inode *inode, int index,
return ret;
it.index = index;
-
it.name.len = strlen(name);
if (it.name.len > EROFS_NAME_LEN)
return -ERANGE;
+
+ it.it.buf = __EROFS_BUF_INITIALIZER;
it.name.name = name;
it.buffer = buffer;
@@ -458,6 +419,7 @@ int erofs_getxattr(struct inode *inode, int index,
ret = inline_getxattr(inode, &it);
if (ret == -ENOATTR)
ret = shared_getxattr(inode, &it);
+ erofs_put_metabuf(&it.it.buf);
return ret;
}
@@ -469,12 +431,10 @@ static int erofs_xattr_generic_get(const struct xattr_handler *handler,
switch (handler->flags) {
case EROFS_XATTR_INDEX_USER:
- if (!test_opt(sbi, XATTR_USER))
+ if (!test_opt(&sbi->opt, XATTR_USER))
return -EOPNOTSUPP;
break;
case EROFS_XATTR_INDEX_TRUSTED:
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
break;
case EROFS_XATTR_INDEX_SECURITY:
break;
@@ -602,7 +562,6 @@ static int inline_listxattr(struct listxattr_iter *it)
if (ret)
break;
}
- xattr_iter_end_final(&it->it);
return ret ? ret : it->buffer_ofs;
}
@@ -620,25 +579,16 @@ static int shared_listxattr(struct listxattr_iter *it)
xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]);
it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]);
- if (!i || blkaddr != it->it.blkaddr) {
- if (i)
- xattr_iter_end(&it->it, true);
-
- it->it.page = erofs_get_meta_page(sb, blkaddr);
- if (IS_ERR(it->it.page))
- return PTR_ERR(it->it.page);
-
- it->it.kaddr = kmap_atomic(it->it.page);
- it->it.blkaddr = blkaddr;
- }
+ it->it.kaddr = erofs_read_metabuf(&it->it.buf, sb, blkaddr,
+ EROFS_KMAP_ATOMIC);
+ if (IS_ERR(it->it.kaddr))
+ return PTR_ERR(it->it.kaddr);
+ it->it.blkaddr = blkaddr;
ret = xattr_foreach(&it->it, &list_xattr_handlers, NULL);
if (ret)
break;
}
- if (vi->xattr_shared_count)
- xattr_iter_end_final(&it->it);
-
return ret ? ret : it->buffer_ofs;
}
@@ -654,6 +604,7 @@ ssize_t erofs_listxattr(struct dentry *dentry,
if (ret)
return ret;
+ it.it.buf = __EROFS_BUF_INITIALIZER;
it.dentry = dentry;
it.buffer = buffer;
it.buffer_size = buffer_size;
@@ -662,18 +613,22 @@ ssize_t erofs_listxattr(struct dentry *dentry,
it.it.sb = dentry->d_sb;
ret = inline_listxattr(&it);
- if (ret < 0 && ret != -ENOATTR)
- return ret;
- return shared_listxattr(&it);
+ if (ret >= 0 || ret == -ENOATTR)
+ ret = shared_listxattr(&it);
+ erofs_put_metabuf(&it.it.buf);
+ return ret;
}
#ifdef CONFIG_EROFS_FS_POSIX_ACL
-struct posix_acl *erofs_get_acl(struct inode *inode, int type)
+struct posix_acl *erofs_get_acl(struct inode *inode, int type, bool rcu)
{
struct posix_acl *acl;
int prefix, rc;
char *value = NULL;
+ if (rcu)
+ return ERR_PTR(-ECHILD);
+
switch (type) {
case ACL_TYPE_ACCESS:
prefix = EROFS_XATTR_INDEX_POSIX_ACL_ACCESS;
@@ -703,4 +658,3 @@ struct posix_acl *erofs_get_acl(struct inode *inode, int type)
return acl;
}
#endif
-
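Throughout the xattr.c conversion above, erofs_get_meta_page()/kmap()/kunmap() pairs are replaced by erofs_read_metabuf()/erofs_put_metabuf() on a struct erofs_buf, so iterators simply re-request each block and release the buffer once at the end; the buffer is expected to hold on to the last mapping so repeated reads of the same block stay cheap. A simplified userspace analogue of such a one-slot block buffer is sketched below, assuming plain POSIX pread(); struct metabuf and its helpers are illustrative, not the kernel API.

    #include <stdlib.h>
    #include <sys/types.h>
    #include <unistd.h>

    #define BLKSIZ 4096

    struct metabuf {
        void *data;             /* cached copy of one block, or NULL */
        long blkaddr;           /* which block is cached, -1 if none */
    };

    #define METABUF_INIT { .data = NULL, .blkaddr = -1 }

    /* return the requested block, rereading only when the block changes */
    static void *metabuf_read(struct metabuf *buf, int fd, long blkaddr)
    {
        if (buf->blkaddr == blkaddr)
            return buf->data;   /* fast path: same block as last time */

        if (!buf->data) {
            buf->data = malloc(BLKSIZ);
            if (!buf->data)
                return NULL;
        }
        if (pread(fd, buf->data, BLKSIZ, (off_t)blkaddr * BLKSIZ) != BLKSIZ)
            return NULL;
        buf->blkaddr = blkaddr;
        return buf->data;
    }

    /* drop the cached block once the whole iteration is done */
    static void metabuf_put(struct metabuf *buf)
    {
        free(buf->data);
        buf->data = NULL;
        buf->blkaddr = -1;
    }

With a helper like this, the old `if (!i || blkaddr != it->it.blkaddr)` guards removed from shared_getxattr() and shared_listxattr() become unnecessary, which is exactly what the hunks above do.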
diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h
index 50966f1c676e..0a43c9ee9f8f 100644
--- a/fs/erofs/xattr.h
+++ b/fs/erofs/xattr.h
@@ -1,8 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2017-2018 HUAWEI, Inc.
- * http://www.huawei.com/
- * Created by Gao Xiang <gaoxiang25@huawei.com>
+ * https://www.huawei.com/
*/
#ifndef __EROFS_XATTR_H
#define __EROFS_XATTR_H
@@ -40,9 +39,7 @@ static inline unsigned int xattrblock_offset(struct erofs_sb_info *sbi,
#ifdef CONFIG_EROFS_FS_XATTR
extern const struct xattr_handler erofs_xattr_user_handler;
extern const struct xattr_handler erofs_xattr_trusted_handler;
-#ifdef CONFIG_EROFS_FS_SECURITY
extern const struct xattr_handler erofs_xattr_security_handler;
-#endif
static inline const struct xattr_handler *erofs_xattr_handler(unsigned int idx)
{
@@ -76,18 +73,14 @@ static inline int erofs_getxattr(struct inode *inode, int index,
return -EOPNOTSUPP;
}
-static inline ssize_t erofs_listxattr(struct dentry *dentry,
- char *buffer, size_t buffer_size)
-{
- return -EOPNOTSUPP;
-}
+#define erofs_listxattr (NULL)
+#define erofs_xattr_handlers (NULL)
#endif /* !CONFIG_EROFS_FS_XATTR */
#ifdef CONFIG_EROFS_FS_POSIX_ACL
-struct posix_acl *erofs_get_acl(struct inode *inode, int type);
+struct posix_acl *erofs_get_acl(struct inode *inode, int type, bool rcu);
#else
#define erofs_get_acl (NULL)
#endif
#endif
-
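The xattr.h hunk replaces the erofs_listxattr() stub with `#define erofs_listxattr (NULL)`, leaning on the usual kernel convention that a NULL method in an operations table means the operation is simply not provided. A tiny userspace sketch of that convention follows; the names are hypothetical and HAVE_XATTR_SUPPORT stands in for the real CONFIG_EROFS_FS_XATTR switch.

    #include <errno.h>
    #include <stddef.h>
    #include <sys/types.h>

    struct fs_ops {
        ssize_t (*listxattr)(const char *path, char *buf, size_t size);
    };

    #ifdef HAVE_XATTR_SUPPORT
    static ssize_t my_listxattr(const char *path, char *buf, size_t size)
    {
        (void)path; (void)buf; (void)size;
        return 0;                  /* a real implementation would go here */
    }
    #define my_listxattr_op my_listxattr
    #else
    /* compiled-out feature: leave the slot NULL rather than a stub */
    #define my_listxattr_op NULL
    #endif

    static const struct fs_ops ops = {
        .listxattr = my_listxattr_op,
    };

    static ssize_t do_listxattr(const char *path, char *buf, size_t size)
    {
        if (!ops.listxattr)        /* dispatcher checks for NULL first */
            return -EOPNOTSUPP;    /* kernel-style negative errno */
        return ops.listxattr(path, buf, size);
    }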
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
index 80e47f07d946..064a166324a7 100644
--- a/fs/erofs/zdata.c
+++ b/fs/erofs/zdata.c
@@ -1,25 +1,188 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2018 HUAWEI, Inc.
- * http://www.huawei.com/
- * Created by Gao Xiang <gaoxiang25@huawei.com>
+ * https://www.huawei.com/
+ * Copyright (C) 2022 Alibaba Cloud
*/
#include "zdata.h"
#include "compress.h"
#include <linux/prefetch.h>
+#include <linux/psi.h>
#include <trace/events/erofs.h>
/*
- * a compressed_pages[] placeholder in order to avoid
- * being filled with file pages for in-place decompression.
+ * since pclustersize is variable for the big pcluster feature, introduce
+ * slab pools for the different pcluster sizes.
*/
-#define PAGE_UNALLOCATED ((void *)0x5F0E4B1D)
+struct z_erofs_pcluster_slab {
+ struct kmem_cache *slab;
+ unsigned int maxpages;
+ char name[48];
+};
+
+#define _PCLP(n) { .maxpages = n }
+
+static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = {
+ _PCLP(1), _PCLP(4), _PCLP(16), _PCLP(64), _PCLP(128),
+ _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES)
+};
+
+struct z_erofs_bvec_iter {
+ struct page *bvpage;
+ struct z_erofs_bvset *bvset;
+ unsigned int nr, cur;
+};
+
+static struct page *z_erofs_bvec_iter_end(struct z_erofs_bvec_iter *iter)
+{
+ if (iter->bvpage)
+ kunmap_local(iter->bvset);
+ return iter->bvpage;
+}
+
+static struct page *z_erofs_bvset_flip(struct z_erofs_bvec_iter *iter)
+{
+ unsigned long base = (unsigned long)((struct z_erofs_bvset *)0)->bvec;
+ /* have to access nextpage in advance, otherwise it will be unmapped */
+ struct page *nextpage = iter->bvset->nextpage;
+ struct page *oldpage;
+
+ DBG_BUGON(!nextpage);
+ oldpage = z_erofs_bvec_iter_end(iter);
+ iter->bvpage = nextpage;
+ iter->bvset = kmap_local_page(nextpage);
+ iter->nr = (PAGE_SIZE - base) / sizeof(struct z_erofs_bvec);
+ iter->cur = 0;
+ return oldpage;
+}
+
+static void z_erofs_bvec_iter_begin(struct z_erofs_bvec_iter *iter,
+ struct z_erofs_bvset_inline *bvset,
+ unsigned int bootstrap_nr,
+ unsigned int cur)
+{
+ *iter = (struct z_erofs_bvec_iter) {
+ .nr = bootstrap_nr,
+ .bvset = (struct z_erofs_bvset *)bvset,
+ };
+
+ while (cur > iter->nr) {
+ cur -= iter->nr;
+ z_erofs_bvset_flip(iter);
+ }
+ iter->cur = cur;
+}
+
+static int z_erofs_bvec_enqueue(struct z_erofs_bvec_iter *iter,
+ struct z_erofs_bvec *bvec,
+ struct page **candidate_bvpage)
+{
+ if (iter->cur == iter->nr) {
+ if (!*candidate_bvpage)
+ return -EAGAIN;
+
+ DBG_BUGON(iter->bvset->nextpage);
+ iter->bvset->nextpage = *candidate_bvpage;
+ z_erofs_bvset_flip(iter);
+
+ iter->bvset->nextpage = NULL;
+ *candidate_bvpage = NULL;
+ }
+ iter->bvset->bvec[iter->cur++] = *bvec;
+ return 0;
+}
+
+static void z_erofs_bvec_dequeue(struct z_erofs_bvec_iter *iter,
+ struct z_erofs_bvec *bvec,
+ struct page **old_bvpage)
+{
+ if (iter->cur == iter->nr)
+ *old_bvpage = z_erofs_bvset_flip(iter);
+ else
+ *old_bvpage = NULL;
+ *bvec = iter->bvset->bvec[iter->cur++];
+}
+
+static void z_erofs_destroy_pcluster_pool(void)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
+ if (!pcluster_pool[i].slab)
+ continue;
+ kmem_cache_destroy(pcluster_pool[i].slab);
+ pcluster_pool[i].slab = NULL;
+ }
+}
+
+static int z_erofs_create_pcluster_pool(void)
+{
+ struct z_erofs_pcluster_slab *pcs;
+ struct z_erofs_pcluster *a;
+ unsigned int size;
+
+ for (pcs = pcluster_pool;
+ pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) {
+ size = struct_size(a, compressed_bvecs, pcs->maxpages);
+
+ sprintf(pcs->name, "erofs_pcluster-%u", pcs->maxpages);
+ pcs->slab = kmem_cache_create(pcs->name, size, 0,
+ SLAB_RECLAIM_ACCOUNT, NULL);
+ if (pcs->slab)
+ continue;
+
+ z_erofs_destroy_pcluster_pool();
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
+ struct z_erofs_pcluster_slab *pcs = pcluster_pool + i;
+ struct z_erofs_pcluster *pcl;
+
+ if (nrpages > pcs->maxpages)
+ continue;
+
+ pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS);
+ if (!pcl)
+ return ERR_PTR(-ENOMEM);
+ pcl->pclusterpages = nrpages;
+ return pcl;
+ }
+ return ERR_PTR(-EINVAL);
+}
+
+static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl)
+{
+ unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) {
+ struct z_erofs_pcluster_slab *pcs = pcluster_pool + i;
+
+ if (pclusterpages > pcs->maxpages)
+ continue;
+
+ kmem_cache_free(pcs->slab, pcl);
+ return;
+ }
+ DBG_BUGON(1);
+}
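z_erofs_alloc_pcluster() above walks pcluster_pool[] in ascending order and allocates from the first slab whose maxpages covers the request, and z_erofs_free_pcluster() walks the same table so an object always returns to the cache it came from. The sketch below shows the same size-class selection in plain C, with calloc() standing in for the per-class kmem_cache and illustrative names.

    #include <stddef.h>
    #include <stdlib.h>

    /* mirror of the ascending maxpages table used above */
    static const size_t class_max_items[] = { 1, 4, 16, 64, 128 };

    struct pcluster {
        unsigned int nr_items;
        void *items[];          /* flexible array, like compressed_bvecs[] */
    };

    /* allocate from the smallest class that can hold @nr_items */
    static struct pcluster *pcluster_alloc(unsigned int nr_items)
    {
        size_t i;

        for (i = 0; i < sizeof(class_max_items) / sizeof(*class_max_items); i++) {
            struct pcluster *p;

            if (nr_items > class_max_items[i])
                continue;
            /* the real code allocates from the per-class kmem_cache here */
            p = calloc(1, sizeof(*p) + class_max_items[i] * sizeof(void *));
            if (p)
                p->nr_items = nr_items;
            return p;
        }
        return NULL;            /* larger than the biggest class: reject */
    }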
/* how to allocate cached pages for a pcluster */
enum z_erofs_cache_alloctype {
DONTALLOC, /* don't allocate any cached pages */
- DELAYEDALLOC, /* delayed allocation (at the time of submitting io) */
+ /*
+ * try to use cached I/O if page allocation succeeds or fallback
+ * to in-place I/O instead to avoid any direct reclaim.
+ */
+ TRYALLOC,
};
/*
@@ -32,84 +195,62 @@ typedef tagptr1_t compressed_page_t;
tagptr_fold(compressed_page_t, page, 1)
static struct workqueue_struct *z_erofs_workqueue __read_mostly;
-static struct kmem_cache *pcluster_cachep __read_mostly;
void z_erofs_exit_zip_subsystem(void)
{
destroy_workqueue(z_erofs_workqueue);
- kmem_cache_destroy(pcluster_cachep);
+ z_erofs_destroy_pcluster_pool();
}
static inline int z_erofs_init_workqueue(void)
{
const unsigned int onlinecpus = num_possible_cpus();
- const unsigned int flags = WQ_UNBOUND | WQ_HIGHPRI | WQ_CPU_INTENSIVE;
/*
* no need to spawn too many threads; limiting threads can minimize
* scheduling overhead, perhaps per-CPU threads would be better?
*/
- z_erofs_workqueue = alloc_workqueue("erofs_unzipd", flags,
+ z_erofs_workqueue = alloc_workqueue("erofs_unzipd",
+ WQ_UNBOUND | WQ_HIGHPRI,
onlinecpus + onlinecpus / 4);
return z_erofs_workqueue ? 0 : -ENOMEM;
}
-static void z_erofs_pcluster_init_once(void *ptr)
-{
- struct z_erofs_pcluster *pcl = ptr;
- struct z_erofs_collection *cl = z_erofs_primarycollection(pcl);
- unsigned int i;
-
- mutex_init(&cl->lock);
- cl->nr_pages = 0;
- cl->vcnt = 0;
- for (i = 0; i < Z_EROFS_CLUSTER_MAX_PAGES; ++i)
- pcl->compressed_pages[i] = NULL;
-}
-
-static void z_erofs_pcluster_init_always(struct z_erofs_pcluster *pcl)
-{
- struct z_erofs_collection *cl = z_erofs_primarycollection(pcl);
-
- atomic_set(&pcl->obj.refcount, 1);
-
- DBG_BUGON(cl->nr_pages);
- DBG_BUGON(cl->vcnt);
-}
-
int __init z_erofs_init_zip_subsystem(void)
{
- pcluster_cachep = kmem_cache_create("erofs_compress",
- Z_EROFS_WORKGROUP_SIZE, 0,
- SLAB_RECLAIM_ACCOUNT,
- z_erofs_pcluster_init_once);
- if (pcluster_cachep) {
- if (!z_erofs_init_workqueue())
- return 0;
+ int err = z_erofs_create_pcluster_pool();
- kmem_cache_destroy(pcluster_cachep);
- }
- return -ENOMEM;
+ if (err)
+ return err;
+ err = z_erofs_init_workqueue();
+ if (err)
+ z_erofs_destroy_pcluster_pool();
+ return err;
}
-enum z_erofs_collectmode {
- COLLECT_SECONDARY,
- COLLECT_PRIMARY,
+enum z_erofs_pclustermode {
+ Z_EROFS_PCLUSTER_INFLIGHT,
/*
- * The current collection was the tail of an exist chain, in addition
- * that the previous processed chained collections are all decided to
+ * The current pcluster was the tail of an existing chain, and the
+ * previously processed chained pclusters have all been decided to
* be hooked up to it.
- * A new chain will be created for the remaining collections which are
- * not processed yet, therefore different from COLLECT_PRIMARY_FOLLOWED,
- * the next collection cannot reuse the whole page safely in
- * the following scenario:
+ * A new chain will be created for the remaining pclusters which are
+ * not processed yet, so unlike Z_EROFS_PCLUSTER_FOLLOWED, the next
+ * pcluster cannot safely reuse the whole page for inplace I/O in the
+ * following scenario:
* ________________________________________________________________
* | tail (partial) page | head (partial) page |
- * | (belongs to the next cl) | (belongs to the current cl) |
- * |_______PRIMARY_FOLLOWED_______|________PRIMARY_HOOKED___________|
+ * | (belongs to the next pcl) | (belongs to the current pcl) |
+ * |_______PCLUSTER_FOLLOWED______|________PCLUSTER_HOOKED__________|
+ */
+ Z_EROFS_PCLUSTER_HOOKED,
+ /*
+ * a weak form of Z_EROFS_PCLUSTER_FOLLOWED; the difference is that it
+ * could be dispatched into the bypass queue later due to up-to-date
+ * managed pages. None of the related online pages can be reused for
+ * inplace I/O (or bvpage) since it can be decoded without I/O submission.
*/
- COLLECT_PRIMARY_HOOKED,
- COLLECT_PRIMARY_FOLLOWED_NOINPLACE,
+ Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE,
/*
* The current collection has been linked with the owned chain, and
* could also be linked with the remaining collections, which means
@@ -120,91 +261,101 @@ enum z_erofs_collectmode {
* ________________________________________________________________
* | tail (partial) page | head (partial) page |
* | (of the current cl) | (of the previous collection) |
- * | PRIMARY_FOLLOWED or | |
- * |_____PRIMARY_HOOKED___|____________PRIMARY_FOLLOWED____________|
+ * | PCLUSTER_FOLLOWED or | |
+ * |_____PCLUSTER_HOOKED__|___________PCLUSTER_FOLLOWED____________|
*
* [ (*) the above page can be used as inplace I/O. ]
*/
- COLLECT_PRIMARY_FOLLOWED,
-};
-
-struct z_erofs_collector {
- struct z_erofs_pagevec_ctor vector;
-
- struct z_erofs_pcluster *pcl, *tailpcl;
- struct z_erofs_collection *cl;
- struct page **compressedpages;
- z_erofs_next_pcluster_t owned_head;
-
- enum z_erofs_collectmode mode;
+ Z_EROFS_PCLUSTER_FOLLOWED,
};
struct z_erofs_decompress_frontend {
struct inode *const inode;
-
- struct z_erofs_collector clt;
struct erofs_map_blocks map;
+ struct z_erofs_bvec_iter biter;
+ struct page *candidate_bvpage;
+ struct z_erofs_pcluster *pcl, *tailpcl;
+ z_erofs_next_pcluster_t owned_head;
+ enum z_erofs_pclustermode mode;
+
+ bool readahead;
/* used for applying cache strategy on the fly */
bool backmost;
erofs_off_t headoffset;
-};
-#define COLLECTOR_INIT() { \
- .owned_head = Z_EROFS_PCLUSTER_TAIL, \
- .mode = COLLECT_PRIMARY_FOLLOWED }
+ /* a cursor used to pick up inplace I/O pages in reverse order */
+ unsigned int icur;
+};
#define DECOMPRESS_FRONTEND_INIT(__i) { \
- .inode = __i, .clt = COLLECTOR_INIT(), \
- .backmost = true, }
-
-static struct page *z_pagemap_global[Z_EROFS_VMAP_GLOBAL_PAGES];
-static DEFINE_MUTEX(z_pagemap_global_lock);
+ .inode = __i, .owned_head = Z_EROFS_PCLUSTER_TAIL, \
+ .mode = Z_EROFS_PCLUSTER_FOLLOWED, .backmost = true }
-static void preload_compressed_pages(struct z_erofs_collector *clt,
- struct address_space *mc,
- enum z_erofs_cache_alloctype type,
- struct list_head *pagepool)
+static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe,
+ enum z_erofs_cache_alloctype type,
+ struct page **pagepool)
{
- const struct z_erofs_pcluster *pcl = clt->pcl;
- const unsigned int clusterpages = BIT(pcl->clusterbits);
- struct page **pages = clt->compressedpages;
- pgoff_t index = pcl->obj.index + (pages - pcl->compressed_pages);
+ struct address_space *mc = MNGD_MAPPING(EROFS_I_SB(fe->inode));
+ struct z_erofs_pcluster *pcl = fe->pcl;
bool standalone = true;
+ /*
+ * optimistic allocation without direct reclaim since inplace I/O
+ * can be used as a fallback under memory pressure.
+ */
+ gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) |
+ __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
+ unsigned int i;
- if (clt->mode < COLLECT_PRIMARY_FOLLOWED)
+ if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED)
return;
- for (; pages < pcl->compressed_pages + clusterpages; ++pages) {
+ for (i = 0; i < pcl->pclusterpages; ++i) {
struct page *page;
compressed_page_t t;
+ struct page *newpage = NULL;
/* the compressed page was loaded before */
- if (READ_ONCE(*pages))
+ if (READ_ONCE(pcl->compressed_bvecs[i].page))
continue;
- page = find_get_page(mc, index);
+ page = find_get_page(mc, pcl->obj.index + i);
if (page) {
t = tag_compressed_page_justfound(page);
- } else if (type == DELAYEDALLOC) {
- t = tagptr_init(compressed_page_t, PAGE_UNALLOCATED);
- } else { /* DONTALLOC */
- if (standalone)
- clt->compressedpages = pages;
+ } else {
+ /* I/O is needed, not possible to decompress directly */
standalone = false;
- continue;
+ switch (type) {
+ case TRYALLOC:
+ newpage = erofs_allocpage(pagepool, gfp);
+ if (!newpage)
+ continue;
+ set_page_private(newpage,
+ Z_EROFS_PREALLOCATED_PAGE);
+ t = tag_compressed_page_justfound(newpage);
+ break;
+ default: /* DONTALLOC */
+ continue;
+ }
}
- if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t)))
+ if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL,
+ tagptr_cast_ptr(t)))
continue;
if (page)
put_page(page);
+ else if (newpage)
+ erofs_pagepool_add(pagepool, newpage);
}
- if (standalone) /* downgrade to PRIMARY_FOLLOWED_NOINPLACE */
- clt->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE;
+ /*
+ * don't do inplace I/O if all compressed pages are available in the
+ * managed cache, since the pcluster can be moved to the bypass queue.
+ */
+ if (standalone)
+ fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
}
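z_erofs_bind_cache() above claims each compressed_bvecs[] slot with cmpxchg_relaxed() so a slot is filled at most once even if another path races to provide a page; the loser keeps ownership of its page and recycles it. A compact C11 sketch of that claim-if-still-empty step follows; struct slot, claim_slot() and bind_pages() are illustrative names, not kernel interfaces.

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stddef.h>

    struct slot {
        void *_Atomic page;     /* NULL until some path provides a page */
    };

    /*
     * Try to install @page into @s; returns true if this caller won the
     * race. On failure the caller still owns @page and must recycle it.
     */
    static bool claim_slot(struct slot *s, void *page)
    {
        void *expected = NULL;

        return atomic_compare_exchange_strong_explicit(&s->page, &expected,
                page, memory_order_relaxed, memory_order_relaxed);
    }

    static void bind_pages(struct slot *slots, void **pages, size_t n,
                           void (*recycle)(void *page))
    {
        for (size_t i = 0; i < n; i++) {
            /* skip slots that were already provided by someone else */
            if (atomic_load_explicit(&slots[i].page, memory_order_relaxed))
                continue;
            if (!claim_slot(&slots[i], pages[i]))
                recycle(pages[i]);      /* lost the race */
        }
    }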
/* called by erofs_shrinker to get rid of all compressed_pages */
@@ -213,16 +364,15 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
{
struct z_erofs_pcluster *const pcl =
container_of(grp, struct z_erofs_pcluster, obj);
- struct address_space *const mapping = MNGD_MAPPING(sbi);
- const unsigned int clusterpages = BIT(pcl->clusterbits);
int i;
+ DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
/*
* refcount of the workgroup is now frozen at 1,
* therefore no need to worry about available decompression users.
*/
- for (i = 0; i < clusterpages; ++i) {
- struct page *page = pcl->compressed_pages[i];
+ for (i = 0; i < pcl->pclusterpages; ++i) {
+ struct page *page = pcl->compressed_bvecs[i].page;
if (!page)
continue;
@@ -231,265 +381,215 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
if (!trylock_page(page))
return -EBUSY;
- if (page->mapping != mapping)
+ if (!erofs_page_is_managed(sbi, page))
continue;
/* barrier is implied in the following 'unlock_page' */
- WRITE_ONCE(pcl->compressed_pages[i], NULL);
- set_page_private(page, 0);
- ClearPagePrivate(page);
-
+ WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
+ detach_page_private(page);
unlock_page(page);
- put_page(page);
}
return 0;
}
-int erofs_try_to_free_cached_page(struct address_space *mapping,
- struct page *page)
+int erofs_try_to_free_cached_page(struct page *page)
{
struct z_erofs_pcluster *const pcl = (void *)page_private(page);
- const unsigned int clusterpages = BIT(pcl->clusterbits);
- int ret = 0; /* 0 - busy */
-
- if (erofs_workgroup_try_to_freeze(&pcl->obj, 1)) {
- unsigned int i;
+ int ret, i;
- for (i = 0; i < clusterpages; ++i) {
- if (pcl->compressed_pages[i] == page) {
- WRITE_ONCE(pcl->compressed_pages[i], NULL);
- ret = 1;
- break;
- }
- }
- erofs_workgroup_unfreeze(&pcl->obj, 1);
+ if (!erofs_workgroup_try_to_freeze(&pcl->obj, 1))
+ return 0;
- if (ret) {
- ClearPagePrivate(page);
- put_page(page);
+ ret = 0;
+ DBG_BUGON(z_erofs_is_inline_pcluster(pcl));
+ for (i = 0; i < pcl->pclusterpages; ++i) {
+ if (pcl->compressed_bvecs[i].page == page) {
+ WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
+ ret = 1;
+ break;
}
}
+ erofs_workgroup_unfreeze(&pcl->obj, 1);
+ if (ret)
+ detach_page_private(page);
return ret;
}
-/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */
-static inline bool z_erofs_try_inplace_io(struct z_erofs_collector *clt,
- struct page *page)
+static bool z_erofs_try_inplace_io(struct z_erofs_decompress_frontend *fe,
+ struct z_erofs_bvec *bvec)
{
- struct z_erofs_pcluster *const pcl = clt->pcl;
- const unsigned int clusterpages = BIT(pcl->clusterbits);
+ struct z_erofs_pcluster *const pcl = fe->pcl;
- while (clt->compressedpages < pcl->compressed_pages + clusterpages) {
- if (!cmpxchg(clt->compressedpages++, NULL, page))
+ while (fe->icur > 0) {
+ if (!cmpxchg(&pcl->compressed_bvecs[--fe->icur].page,
+ NULL, bvec->page)) {
+ pcl->compressed_bvecs[fe->icur] = *bvec;
return true;
+ }
}
return false;
}
-/* callers must be with collection lock held */
-static int z_erofs_attach_page(struct z_erofs_collector *clt,
- struct page *page,
- enum z_erofs_page_type type)
+/* callers must be with pcluster lock held */
+static int z_erofs_attach_page(struct z_erofs_decompress_frontend *fe,
+ struct z_erofs_bvec *bvec, bool exclusive)
{
int ret;
- bool occupied;
- /* give priority for inplaceio */
- if (clt->mode >= COLLECT_PRIMARY &&
- type == Z_EROFS_PAGE_TYPE_EXCLUSIVE &&
- z_erofs_try_inplace_io(clt, page))
- return 0;
-
- ret = z_erofs_pagevec_enqueue(&clt->vector,
- page, type, &occupied);
- clt->cl->vcnt += (unsigned int)ret;
-
- return ret ? 0 : -EAGAIN;
+ if (exclusive) {
+ /* give priority to inplace I/O to use file pages first */
+ if (z_erofs_try_inplace_io(fe, bvec))
+ return 0;
+ /* otherwise, check if it can be used as a bvpage */
+ if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED &&
+ !fe->candidate_bvpage)
+ fe->candidate_bvpage = bvec->page;
+ }
+ ret = z_erofs_bvec_enqueue(&fe->biter, bvec, &fe->candidate_bvpage);
+ fe->pcl->vcnt += (ret >= 0);
+ return ret;
}
-static enum z_erofs_collectmode
-try_to_claim_pcluster(struct z_erofs_pcluster *pcl,
- z_erofs_next_pcluster_t *owned_head)
+static void z_erofs_try_to_claim_pcluster(struct z_erofs_decompress_frontend *f)
{
- /* let's claim these following types of pclusters */
-retry:
- if (pcl->next == Z_EROFS_PCLUSTER_NIL) {
- /* type 1, nil pcluster */
- if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL,
- *owned_head) != Z_EROFS_PCLUSTER_NIL)
- goto retry;
+ struct z_erofs_pcluster *pcl = f->pcl;
+ z_erofs_next_pcluster_t *owned_head = &f->owned_head;
+ /* type 1, nil pcluster (this pcluster doesn't belong to any chain.) */
+ if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL,
+ *owned_head) == Z_EROFS_PCLUSTER_NIL) {
*owned_head = &pcl->next;
- /* lucky, I am the followee :) */
- return COLLECT_PRIMARY_FOLLOWED;
- } else if (pcl->next == Z_EROFS_PCLUSTER_TAIL) {
- /*
- * type 2, link to the end of a existing open chain,
- * be careful that its submission itself is governed
- * by the original owned chain.
- */
- if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
- *owned_head) != Z_EROFS_PCLUSTER_TAIL)
- goto retry;
+ /* so we can attach this pcluster to our submission chain. */
+ f->mode = Z_EROFS_PCLUSTER_FOLLOWED;
+ return;
+ }
+
+ /*
+ * type 2, link to the end of an existing open chain, be careful
+ * that its submission is controlled by the original attached chain.
+ */
+ if (*owned_head != &pcl->next && pcl != f->tailpcl &&
+ cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
+ *owned_head) == Z_EROFS_PCLUSTER_TAIL) {
*owned_head = Z_EROFS_PCLUSTER_TAIL;
- return COLLECT_PRIMARY_HOOKED;
+ f->mode = Z_EROFS_PCLUSTER_HOOKED;
+ f->tailpcl = NULL;
+ return;
}
- return COLLECT_PRIMARY; /* :( better luck next time */
+ /* type 3, it belongs to a chain, but it isn't the end of the chain */
+ f->mode = Z_EROFS_PCLUSTER_INFLIGHT;
}
-static int z_erofs_lookup_collection(struct z_erofs_collector *clt,
- struct inode *inode,
- struct erofs_map_blocks *map)
+static int z_erofs_register_pcluster(struct z_erofs_decompress_frontend *fe)
{
- struct erofs_workgroup *grp;
+ struct erofs_map_blocks *map = &fe->map;
+ bool ztailpacking = map->m_flags & EROFS_MAP_META;
struct z_erofs_pcluster *pcl;
- struct z_erofs_collection *cl;
- unsigned int length;
-
- grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT);
- if (!grp)
- return -ENOENT;
-
- pcl = container_of(grp, struct z_erofs_pcluster, obj);
- if (clt->owned_head == &pcl->next || pcl == clt->tailpcl) {
- DBG_BUGON(1);
- erofs_workgroup_put(grp);
- return -EFSCORRUPTED;
- }
+ struct erofs_workgroup *grp;
+ int err;
- cl = z_erofs_primarycollection(pcl);
- if (cl->pageofs != (map->m_la & ~PAGE_MASK)) {
+ if (!(map->m_flags & EROFS_MAP_ENCODED)) {
DBG_BUGON(1);
- erofs_workgroup_put(grp);
return -EFSCORRUPTED;
}
- length = READ_ONCE(pcl->length);
- if (length & Z_EROFS_PCLUSTER_FULL_LENGTH) {
- if ((map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) > length) {
- DBG_BUGON(1);
- erofs_workgroup_put(grp);
- return -EFSCORRUPTED;
- }
- } else {
- unsigned int llen = map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT;
-
- if (map->m_flags & EROFS_MAP_FULL_MAPPED)
- llen |= Z_EROFS_PCLUSTER_FULL_LENGTH;
-
- while (llen > length &&
- length != cmpxchg_relaxed(&pcl->length, length, llen)) {
- cpu_relax();
- length = READ_ONCE(pcl->length);
- }
- }
- mutex_lock(&cl->lock);
- /* used to check tail merging loop due to corrupted images */
- if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
- clt->tailpcl = pcl;
- clt->mode = try_to_claim_pcluster(pcl, &clt->owned_head);
- /* clean tailpcl if the current owned_head is Z_EROFS_PCLUSTER_TAIL */
- if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
- clt->tailpcl = NULL;
- clt->pcl = pcl;
- clt->cl = cl;
- return 0;
-}
+ /* no available pcluster, let's allocate one */
+ pcl = z_erofs_alloc_pcluster(ztailpacking ? 1 :
+ map->m_plen >> PAGE_SHIFT);
+ if (IS_ERR(pcl))
+ return PTR_ERR(pcl);
-static int z_erofs_register_collection(struct z_erofs_collector *clt,
- struct inode *inode,
- struct erofs_map_blocks *map)
-{
- struct z_erofs_pcluster *pcl;
- struct z_erofs_collection *cl;
- int err;
-
- /* no available workgroup, let's allocate one */
- pcl = kmem_cache_alloc(pcluster_cachep, GFP_NOFS);
- if (!pcl)
- return -ENOMEM;
-
- z_erofs_pcluster_init_always(pcl);
- pcl->obj.index = map->m_pa >> PAGE_SHIFT;
-
- pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) |
- (map->m_flags & EROFS_MAP_FULL_MAPPED ?
- Z_EROFS_PCLUSTER_FULL_LENGTH : 0);
-
- if (map->m_flags & EROFS_MAP_ZIPPED)
- pcl->algorithmformat = Z_EROFS_COMPRESSION_LZ4;
- else
- pcl->algorithmformat = Z_EROFS_COMPRESSION_SHIFTED;
-
- pcl->clusterbits = EROFS_I(inode)->z_physical_clusterbits[0];
- pcl->clusterbits -= PAGE_SHIFT;
+ atomic_set(&pcl->obj.refcount, 1);
+ pcl->algorithmformat = map->m_algorithmformat;
+ pcl->length = 0;
+ pcl->partial = true;
/* new pclusters should be claimed as type 1, primary and followed */
- pcl->next = clt->owned_head;
- clt->mode = COLLECT_PRIMARY_FOLLOWED;
-
- cl = z_erofs_primarycollection(pcl);
- cl->pageofs = map->m_la & ~PAGE_MASK;
+ pcl->next = fe->owned_head;
+ pcl->pageofs_out = map->m_la & ~PAGE_MASK;
+ fe->mode = Z_EROFS_PCLUSTER_FOLLOWED;
/*
* lock all primary followed works before they become visible to others
* and mutex_trylock *never* fails for a new pcluster.
*/
- mutex_trylock(&cl->lock);
+ mutex_init(&pcl->lock);
+ DBG_BUGON(!mutex_trylock(&pcl->lock));
- err = erofs_register_workgroup(inode->i_sb, &pcl->obj);
- if (err) {
- mutex_unlock(&cl->lock);
- kmem_cache_free(pcluster_cachep, pcl);
- return -EAGAIN;
+ if (ztailpacking) {
+ pcl->obj.index = 0; /* which indicates ztailpacking */
+ pcl->pageofs_in = erofs_blkoff(map->m_pa);
+ pcl->tailpacking_size = map->m_plen;
+ } else {
+ pcl->obj.index = map->m_pa >> PAGE_SHIFT;
+
+ grp = erofs_insert_workgroup(fe->inode->i_sb, &pcl->obj);
+ if (IS_ERR(grp)) {
+ err = PTR_ERR(grp);
+ goto err_out;
+ }
+
+ if (grp != &pcl->obj) {
+ fe->pcl = container_of(grp,
+ struct z_erofs_pcluster, obj);
+ err = -EEXIST;
+ goto err_out;
+ }
}
/* used to check tail merging loop due to corrupted images */
- if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
- clt->tailpcl = pcl;
- clt->owned_head = &pcl->next;
- clt->pcl = pcl;
- clt->cl = cl;
+ if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
+ fe->tailpcl = pcl;
+ fe->owned_head = &pcl->next;
+ fe->pcl = pcl;
return 0;
+
+err_out:
+ mutex_unlock(&pcl->lock);
+ z_erofs_free_pcluster(pcl);
+ return err;
}
-static int z_erofs_collector_begin(struct z_erofs_collector *clt,
- struct inode *inode,
- struct erofs_map_blocks *map)
+static int z_erofs_collector_begin(struct z_erofs_decompress_frontend *fe)
{
+ struct erofs_map_blocks *map = &fe->map;
+ struct erofs_workgroup *grp = NULL;
int ret;
- DBG_BUGON(clt->cl);
+ DBG_BUGON(fe->pcl);
- /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous collection */
- DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_NIL);
- DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
+ /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous pcluster */
+ DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_NIL);
+ DBG_BUGON(fe->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
- if (!PAGE_ALIGNED(map->m_pa)) {
+ if (!(map->m_flags & EROFS_MAP_META)) {
+ grp = erofs_find_workgroup(fe->inode->i_sb,
+ map->m_pa >> PAGE_SHIFT);
+ } else if ((map->m_pa & ~PAGE_MASK) + map->m_plen > PAGE_SIZE) {
DBG_BUGON(1);
- return -EINVAL;
+ return -EFSCORRUPTED;
}
-repeat:
- ret = z_erofs_lookup_collection(clt, inode, map);
- if (ret == -ENOENT) {
- ret = z_erofs_register_collection(clt, inode, map);
-
- /* someone registered at the same time, give another try */
- if (ret == -EAGAIN) {
- cond_resched();
- goto repeat;
- }
+ if (grp) {
+ fe->pcl = container_of(grp, struct z_erofs_pcluster, obj);
+ ret = -EEXIST;
+ } else {
+ ret = z_erofs_register_pcluster(fe);
}
- if (ret)
- return ret;
-
- z_erofs_pagevec_ctor_init(&clt->vector, Z_EROFS_NR_INLINE_PAGEVECS,
- clt->cl->pagevec, clt->cl->vcnt);
+ if (ret == -EEXIST) {
+ mutex_lock(&fe->pcl->lock);
+ /* used to check tail merging loop due to corrupted images */
+ if (fe->owned_head == Z_EROFS_PCLUSTER_TAIL)
+ fe->tailpcl = fe->pcl;
- clt->compressedpages = clt->pcl->compressed_pages;
- if (clt->mode <= COLLECT_PRIMARY) /* cannot do in-place I/O */
- clt->compressedpages += Z_EROFS_CLUSTER_MAX_PAGES;
+ z_erofs_try_to_claim_pcluster(fe);
+ } else if (ret) {
+ return ret;
+ }
+ z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset,
+ Z_EROFS_INLINE_BVECS, fe->pcl->vcnt);
+ /* since file-backed online pages are traversed in reverse order */
+ fe->icur = z_erofs_pclusterpages(fe->pcl);
return 0;
}
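z_erofs_collector_begin() above first looks up an existing pcluster and only registers a new one on a miss; if registration itself loses a race, it reports -EEXIST and hands back the winning pcluster, which the caller then uses as if the lookup had hit. The userspace sketch below mirrors that lookup / register / accept-EEXIST flow under a single lock; the toy table and helper names are assumptions, not the kernel data structures.

    #include <errno.h>
    #include <pthread.h>
    #include <stdlib.h>

    struct pcluster {
        unsigned long index;
    };

    static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct pcluster *tree[256];  /* toy index -> pcluster table */

    static struct pcluster *find_pcluster(unsigned long index)
    {
        struct pcluster *p;

        pthread_mutex_lock(&tree_lock);
        p = tree[index % 256];
        pthread_mutex_unlock(&tree_lock);
        return p;
    }

    /* 0 and *out = the new pcluster, or -EEXIST and *out = the winner */
    static int register_pcluster(unsigned long index, struct pcluster **out)
    {
        struct pcluster *p = calloc(1, sizeof(*p));

        if (!p)
            return -ENOMEM;
        p->index = index;

        pthread_mutex_lock(&tree_lock);
        if (tree[index % 256]) {
            *out = tree[index % 256];   /* someone beat us to it */
            pthread_mutex_unlock(&tree_lock);
            free(p);
            return -EEXIST;
        }
        tree[index % 256] = p;
        pthread_mutex_unlock(&tree_lock);
        *out = p;
        return 0;
    }

    static int collector_begin(unsigned long index, struct pcluster **out)
    {
        int ret;

        *out = find_pcluster(index);
        if (*out)
            return 0;                   /* reuse the existing pcluster */
        ret = register_pcluster(index, out);
        if (ret == -EEXIST)
            return 0;                   /* lost the race, use the winner */
        return ret;
    }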
@@ -499,49 +599,41 @@ repeat:
*/
static void z_erofs_rcu_callback(struct rcu_head *head)
{
- struct z_erofs_collection *const cl =
- container_of(head, struct z_erofs_collection, rcu);
-
- kmem_cache_free(pcluster_cachep,
- container_of(cl, struct z_erofs_pcluster,
- primary_collection));
+ z_erofs_free_pcluster(container_of(head,
+ struct z_erofs_pcluster, rcu));
}
void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
{
struct z_erofs_pcluster *const pcl =
container_of(grp, struct z_erofs_pcluster, obj);
- struct z_erofs_collection *const cl = z_erofs_primarycollection(pcl);
-
- call_rcu(&cl->rcu, z_erofs_rcu_callback);
-}
-
-static void z_erofs_collection_put(struct z_erofs_collection *cl)
-{
- struct z_erofs_pcluster *const pcl =
- container_of(cl, struct z_erofs_pcluster, primary_collection);
- erofs_workgroup_put(&pcl->obj);
+ call_rcu(&pcl->rcu, z_erofs_rcu_callback);
}
-static bool z_erofs_collector_end(struct z_erofs_collector *clt)
+static bool z_erofs_collector_end(struct z_erofs_decompress_frontend *fe)
{
- struct z_erofs_collection *cl = clt->cl;
+ struct z_erofs_pcluster *pcl = fe->pcl;
- if (!cl)
+ if (!pcl)
return false;
- z_erofs_pagevec_ctor_exit(&clt->vector, false);
- mutex_unlock(&cl->lock);
+ z_erofs_bvec_iter_end(&fe->biter);
+ mutex_unlock(&pcl->lock);
+
+ if (fe->candidate_bvpage) {
+ DBG_BUGON(z_erofs_is_shortlived_page(fe->candidate_bvpage));
+ fe->candidate_bvpage = NULL;
+ }
/*
* once all pending pages are added, don't hold the pcluster's
* reference any longer unless it is hosted by ourselves.
*/
- if (clt->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE)
- z_erofs_collection_put(cl);
+ if (fe->mode < Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE)
+ erofs_workgroup_put(&pcl->obj);
- clt->cl = NULL;
+ fe->pcl = NULL;
return true;
}
@@ -559,20 +651,46 @@ static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe,
la < fe->headoffset;
}
+static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos,
+ struct page *page, unsigned int pageofs,
+ unsigned int len)
+{
+ struct inode *packed_inode = EROFS_I_SB(inode)->packed_inode;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+ u8 *src, *dst;
+ unsigned int i, cnt;
+
+ pos += EROFS_I(inode)->z_fragmentoff;
+ for (i = 0; i < len; i += cnt) {
+ cnt = min_t(unsigned int, len - i,
+ EROFS_BLKSIZ - erofs_blkoff(pos));
+ src = erofs_bread(&buf, packed_inode,
+ erofs_blknr(pos), EROFS_KMAP);
+ if (IS_ERR(src)) {
+ erofs_put_metabuf(&buf);
+ return PTR_ERR(src);
+ }
+
+ dst = kmap_local_page(page);
+ memcpy(dst + pageofs + i, src + erofs_blkoff(pos), cnt);
+ kunmap_local(dst);
+ pos += cnt;
+ }
+ erofs_put_metabuf(&buf);
+ return 0;
+}
+
static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
- struct page *page,
- struct list_head *pagepool)
+ struct page *page, struct page **pagepool)
{
struct inode *const inode = fe->inode;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
struct erofs_map_blocks *const map = &fe->map;
- struct z_erofs_collector *const clt = &fe->clt;
const loff_t offset = page_offset(page);
- bool tight = true;
+ bool tight = true, exclusive;
enum z_erofs_cache_alloctype cache_strategy;
- enum z_erofs_page_type page_type;
- unsigned int cur, end, spiltted, index;
+ unsigned int cur, end, spiltted;
int err = 0;
/* register locked file pages as online pages in pack */
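z_erofs_read_fragment(), added in the hunk above, copies a range that may straddle filesystem blocks by clamping every step to the bytes remaining in the current block before advancing. The standalone sketch below reproduces that chunking arithmetic over an ordinary file using POSIX pread(); BLKSIZ, blkoff() and copy_chunked() are illustrative stand-ins, not the kernel helpers.

    #include <stdint.h>
    #include <string.h>
    #include <sys/types.h>
    #include <unistd.h>

    #define BLKSIZ 4096U

    static unsigned int blkoff(uint64_t pos)
    {
        return pos & (BLKSIZ - 1);
    }

    /* copy @len bytes at @pos into @dst, one block-bounded chunk at a time */
    static int copy_chunked(int fd, uint64_t pos, void *dst, unsigned int len)
    {
        unsigned int i, cnt;
        char blk[BLKSIZ];

        for (i = 0; i < len; i += cnt) {
            /* never let a single step cross a block boundary */
            cnt = len - i;
            if (cnt > BLKSIZ - blkoff(pos))
                cnt = BLKSIZ - blkoff(pos);

            if (pread(fd, blk, BLKSIZ, pos - blkoff(pos)) <
                (ssize_t)(blkoff(pos) + cnt))
                return -1;
            memcpy((char *)dst + i, blk + blkoff(pos), cnt);
            pos += cnt;
        }
        return 0;
    }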
@@ -583,379 +701,456 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
repeat:
cur = end - 1;
- /* lucky, within the range of the current map_blocks */
- if (offset + cur >= map->m_la &&
- offset + cur < map->m_la + map->m_llen) {
- /* didn't get a valid collection previously (very rare) */
- if (!clt->cl)
- goto restart_now;
- goto hitted;
- }
-
- /* go ahead the next map_blocks */
- erofs_dbg("%s: [out-of-range] pos %llu", __func__, offset + cur);
+ if (offset + cur < map->m_la ||
+ offset + cur >= map->m_la + map->m_llen) {
+ erofs_dbg("out-of-range map @ pos %llu", offset + cur);
- if (z_erofs_collector_end(clt))
- fe->backmost = false;
-
- map->m_la = offset + cur;
- map->m_llen = 0;
- err = z_erofs_map_blocks_iter(inode, map, 0);
- if (err)
- goto err_out;
+ if (z_erofs_collector_end(fe))
+ fe->backmost = false;
+ map->m_la = offset + cur;
+ map->m_llen = 0;
+ err = z_erofs_map_blocks_iter(inode, map, 0);
+ if (err)
+ goto out;
+ } else {
+ if (fe->pcl)
+ goto hitted;
+ /* didn't get a valid pcluster previously (very rare) */
+ }
-restart_now:
- if (!(map->m_flags & EROFS_MAP_MAPPED))
+ if (!(map->m_flags & EROFS_MAP_MAPPED) ||
+ map->m_flags & EROFS_MAP_FRAGMENT)
goto hitted;
- err = z_erofs_collector_begin(clt, inode, map);
+ err = z_erofs_collector_begin(fe);
if (err)
- goto err_out;
+ goto out;
- /* preload all compressed pages (maybe downgrade role if necessary) */
- if (should_alloc_managed_pages(fe, sbi->cache_strategy, map->m_la))
- cache_strategy = DELAYEDALLOC;
- else
- cache_strategy = DONTALLOC;
+ if (z_erofs_is_inline_pcluster(fe->pcl)) {
+ void *mp;
- preload_compressed_pages(clt, MNGD_MAPPING(sbi),
- cache_strategy, pagepool);
+ mp = erofs_read_metabuf(&fe->map.buf, inode->i_sb,
+ erofs_blknr(map->m_pa), EROFS_NO_KMAP);
+ if (IS_ERR(mp)) {
+ err = PTR_ERR(mp);
+ erofs_err(inode->i_sb,
+ "failed to get inline page, err %d", err);
+ goto out;
+ }
+ get_page(fe->map.buf.page);
+ WRITE_ONCE(fe->pcl->compressed_bvecs[0].page,
+ fe->map.buf.page);
+ fe->mode = Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE;
+ } else {
+ /* bind cache first when cached decompression is preferred */
+ if (should_alloc_managed_pages(fe, sbi->opt.cache_strategy,
+ map->m_la))
+ cache_strategy = TRYALLOC;
+ else
+ cache_strategy = DONTALLOC;
+ z_erofs_bind_cache(fe, cache_strategy, pagepool);
+ }
hitted:
/*
* Ensure the current partial page belongs to this submit chain rather
* than other concurrent submit chains or the noio (bypass) chain, since
* those chains are handled asynchronously and thus the page cannot be used
- * for inplace I/O or pagevec (should be processed in strict order.)
+ * for inplace I/O or bvpage (should be processed in a strict order.)
*/
- tight &= (clt->mode >= COLLECT_PRIMARY_HOOKED &&
- clt->mode != COLLECT_PRIMARY_FOLLOWED_NOINPLACE);
+ tight &= (fe->mode >= Z_EROFS_PCLUSTER_HOOKED &&
+ fe->mode != Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE);
cur = end - min_t(unsigned int, offset + end - map->m_la, end);
if (!(map->m_flags & EROFS_MAP_MAPPED)) {
zero_user_segment(page, cur, end);
goto next_part;
}
+ if (map->m_flags & EROFS_MAP_FRAGMENT) {
+ unsigned int pageofs, skip, len;
- /* let's derive page type */
- page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD :
- (!spiltted ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
- (tight ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
- Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED));
+ if (offset > map->m_la) {
+ pageofs = 0;
+ skip = offset - map->m_la;
+ } else {
+ pageofs = map->m_la & ~PAGE_MASK;
+ skip = 0;
+ }
+ len = min_t(unsigned int, map->m_llen - skip, end - cur);
+ err = z_erofs_read_fragment(inode, skip, page, pageofs, len);
+ if (err)
+ goto out;
+ ++spiltted;
+ tight = false;
+ goto next_part;
+ }
+ exclusive = (!cur && (!spiltted || tight));
if (cur)
- tight &= (clt->mode >= COLLECT_PRIMARY_FOLLOWED);
+ tight &= (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED);
retry:
- err = z_erofs_attach_page(clt, page, page_type);
- /* should allocate an additional staging page for pagevec */
- if (err == -EAGAIN) {
- struct page *const newpage =
- erofs_allocpage(pagepool, GFP_NOFS | __GFP_NOFAIL);
-
- newpage->mapping = Z_EROFS_MAPPING_STAGING;
- err = z_erofs_attach_page(clt, newpage,
- Z_EROFS_PAGE_TYPE_EXCLUSIVE);
- if (!err)
- goto retry;
+ err = z_erofs_attach_page(fe, &((struct z_erofs_bvec) {
+ .page = page,
+ .offset = offset - map->m_la,
+ .end = end,
+ }), exclusive);
+ /* should allocate an additional short-lived page for bvset */
+ if (err == -EAGAIN && !fe->candidate_bvpage) {
+ fe->candidate_bvpage = alloc_page(GFP_NOFS | __GFP_NOFAIL);
+ set_page_private(fe->candidate_bvpage,
+ Z_EROFS_SHORTLIVED_PAGE);
+ goto retry;
}
- if (err)
- goto err_out;
-
- index = page->index - (map->m_la >> PAGE_SHIFT);
-
- z_erofs_onlinepage_fixup(page, index, true);
+ if (err) {
+ DBG_BUGON(err == -EAGAIN && fe->candidate_bvpage);
+ goto out;
+ }
+ z_erofs_onlinepage_split(page);
/* bump up the number of split parts of a page */
++spiltted;
- /* also update nr_pages */
- clt->cl->nr_pages = max_t(pgoff_t, clt->cl->nr_pages, index + 1);
+ if (fe->pcl->pageofs_out != (map->m_la & ~PAGE_MASK))
+ fe->pcl->multibases = true;
+ if (fe->pcl->length < offset + end - map->m_la) {
+ fe->pcl->length = offset + end - map->m_la;
+ fe->pcl->pageofs_out = map->m_la & ~PAGE_MASK;
+ }
+ if ((map->m_flags & EROFS_MAP_FULL_MAPPED) &&
+ !(map->m_flags & EROFS_MAP_PARTIAL_REF) &&
+ fe->pcl->length == map->m_llen)
+ fe->pcl->partial = false;
next_part:
- /* can be used for verification */
+ /* shorten the remaining extent to update progress */
map->m_llen = offset + cur - map->m_la;
+ map->m_flags &= ~EROFS_MAP_FULL_MAPPED;
end = cur;
if (end > 0)
goto repeat;
out:
+ if (err)
+ z_erofs_page_mark_eio(page);
z_erofs_onlinepage_endio(page);
erofs_dbg("%s, finish page: %pK spiltted: %u map->m_llen %llu",
__func__, page, spiltted, map->m_llen);
return err;
-
- /* if some error occurred while processing this page */
-err_out:
- SetPageError(page);
- goto out;
}
-static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
- bool sync, int bios)
+static bool z_erofs_get_sync_decompress_policy(struct erofs_sb_info *sbi,
+ unsigned int readahead_pages)
{
- /* wake up the caller thread for sync decompression */
- if (sync) {
- unsigned long flags;
+ /* auto: enable for read_folio, disable for readahead */
+ if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) &&
+ !readahead_pages)
+ return true;
- spin_lock_irqsave(&io->u.wait.lock, flags);
- if (!atomic_add_return(bios, &io->pending_bios))
- wake_up_locked(&io->u.wait);
- spin_unlock_irqrestore(&io->u.wait.lock, flags);
- return;
- }
+ if ((sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_FORCE_ON) &&
+ (readahead_pages <= sbi->opt.max_sync_decompress_pages))
+ return true;
- if (!atomic_add_return(bios, &io->pending_bios))
- queue_work(z_erofs_workqueue, &io->u.work);
+ return false;
}
-static void z_erofs_decompressqueue_endio(struct bio *bio)
+static bool z_erofs_page_is_invalidated(struct page *page)
{
- tagptr1_t t = tagptr_init(tagptr1_t, bio->bi_private);
- struct z_erofs_decompressqueue *q = tagptr_unfold_ptr(t);
- blk_status_t err = bio->bi_status;
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
+ return !page->mapping && !z_erofs_is_shortlived_page(page);
+}
- bio_for_each_segment_all(bvec, bio, iter_all) {
- struct page *page = bvec->bv_page;
+struct z_erofs_decompress_backend {
+ struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES];
+ struct super_block *sb;
+ struct z_erofs_pcluster *pcl;
- DBG_BUGON(PageUptodate(page));
- DBG_BUGON(!page->mapping);
+ /* pages with the longest decompressed length for deduplication */
+ struct page **decompressed_pages;
+ /* pages to keep the compressed data */
+ struct page **compressed_pages;
- if (err)
- SetPageError(page);
+ struct list_head decompressed_secondary_bvecs;
+ struct page **pagepool;
+ unsigned int onstack_used, nr_pages;
+};
- if (erofs_page_is_managed(EROFS_SB(q->sb), page)) {
- if (!err)
- SetPageUptodate(page);
- unlock_page(page);
- }
- }
- z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1);
- bio_put(bio);
-}
+struct z_erofs_bvec_item {
+ struct z_erofs_bvec bvec;
+ struct list_head list;
+};
-static int z_erofs_decompress_pcluster(struct super_block *sb,
- struct z_erofs_pcluster *pcl,
- struct list_head *pagepool)
+static void z_erofs_do_decompressed_bvec(struct z_erofs_decompress_backend *be,
+ struct z_erofs_bvec *bvec)
{
- struct erofs_sb_info *const sbi = EROFS_SB(sb);
- const unsigned int clusterpages = BIT(pcl->clusterbits);
- struct z_erofs_pagevec_ctor ctor;
- unsigned int i, outputsize, llen, nr_pages;
- struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES];
- struct page **pages, **compressed_pages, *page;
-
- enum z_erofs_page_type page_type;
- bool overlapped, partial;
- struct z_erofs_collection *cl;
- int err;
+ struct z_erofs_bvec_item *item;
- might_sleep();
- cl = z_erofs_primarycollection(pcl);
- DBG_BUGON(!READ_ONCE(cl->nr_pages));
+ if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK)) {
+ unsigned int pgnr;
- mutex_lock(&cl->lock);
- nr_pages = cl->nr_pages;
-
- if (nr_pages <= Z_EROFS_VMAP_ONSTACK_PAGES) {
- pages = pages_onstack;
- } else if (nr_pages <= Z_EROFS_VMAP_GLOBAL_PAGES &&
- mutex_trylock(&z_pagemap_global_lock)) {
- pages = z_pagemap_global;
- } else {
- gfp_t gfp_flags = GFP_KERNEL;
-
- if (nr_pages > Z_EROFS_VMAP_GLOBAL_PAGES)
- gfp_flags |= __GFP_NOFAIL;
+ pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT;
+ DBG_BUGON(pgnr >= be->nr_pages);
+ if (!be->decompressed_pages[pgnr]) {
+ be->decompressed_pages[pgnr] = bvec->page;
+ return;
+ }
+ }
- pages = kvmalloc_array(nr_pages, sizeof(struct page *),
- gfp_flags);
+ /* (cold path) one pcluster is requested multiple times */
+ item = kmalloc(sizeof(*item), GFP_KERNEL | __GFP_NOFAIL);
+ item->bvec = *bvec;
+ list_add(&item->list, &be->decompressed_secondary_bvecs);
+}
- /* fallback to global pagemap for the lowmem scenario */
- if (!pages) {
- mutex_lock(&z_pagemap_global_lock);
- pages = z_pagemap_global;
+static void z_erofs_fill_other_copies(struct z_erofs_decompress_backend *be,
+ int err)
+{
+ unsigned int off0 = be->pcl->pageofs_out;
+ struct list_head *p, *n;
+
+ list_for_each_safe(p, n, &be->decompressed_secondary_bvecs) {
+ struct z_erofs_bvec_item *bvi;
+ unsigned int end, cur;
+ void *dst, *src;
+
+ bvi = container_of(p, struct z_erofs_bvec_item, list);
+ cur = bvi->bvec.offset < 0 ? -bvi->bvec.offset : 0;
+ end = min_t(unsigned int, be->pcl->length - bvi->bvec.offset,
+ bvi->bvec.end);
+ dst = kmap_local_page(bvi->bvec.page);
+ while (cur < end) {
+ unsigned int pgnr, scur, len;
+
+ pgnr = (bvi->bvec.offset + cur + off0) >> PAGE_SHIFT;
+ DBG_BUGON(pgnr >= be->nr_pages);
+
+ scur = bvi->bvec.offset + cur -
+ ((pgnr << PAGE_SHIFT) - off0);
+ len = min_t(unsigned int, end - cur, PAGE_SIZE - scur);
+ if (!be->decompressed_pages[pgnr]) {
+ err = -EFSCORRUPTED;
+ cur += len;
+ continue;
+ }
+ src = kmap_local_page(be->decompressed_pages[pgnr]);
+ memcpy(dst + cur, src + scur, len);
+ kunmap_local(src);
+ cur += len;
}
+ kunmap_local(dst);
+ if (err)
+ z_erofs_page_mark_eio(bvi->bvec.page);
+ z_erofs_onlinepage_endio(bvi->bvec.page);
+ list_del(p);
+ kfree(bvi);
}
+}
- for (i = 0; i < nr_pages; ++i)
- pages[i] = NULL;
+static void z_erofs_parse_out_bvecs(struct z_erofs_decompress_backend *be)
+{
+ struct z_erofs_pcluster *pcl = be->pcl;
+ struct z_erofs_bvec_iter biter;
+ struct page *old_bvpage;
+ int i;
- err = 0;
- z_erofs_pagevec_ctor_init(&ctor, Z_EROFS_NR_INLINE_PAGEVECS,
- cl->pagevec, 0);
+ z_erofs_bvec_iter_begin(&biter, &pcl->bvset, Z_EROFS_INLINE_BVECS, 0);
+ for (i = 0; i < pcl->vcnt; ++i) {
+ struct z_erofs_bvec bvec;
- for (i = 0; i < cl->vcnt; ++i) {
- unsigned int pagenr;
+ z_erofs_bvec_dequeue(&biter, &bvec, &old_bvpage);
- page = z_erofs_pagevec_dequeue(&ctor, &page_type);
+ if (old_bvpage)
+ z_erofs_put_shortlivedpage(be->pagepool, old_bvpage);
- /* all pages in pagevec ought to be valid */
- DBG_BUGON(!page);
- DBG_BUGON(!page->mapping);
+ DBG_BUGON(z_erofs_page_is_invalidated(bvec.page));
+ z_erofs_do_decompressed_bvec(be, &bvec);
+ }
- if (z_erofs_put_stagingpage(pagepool, page))
- continue;
+ old_bvpage = z_erofs_bvec_iter_end(&biter);
+ if (old_bvpage)
+ z_erofs_put_shortlivedpage(be->pagepool, old_bvpage);
+}
- if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD)
- pagenr = 0;
- else
- pagenr = z_erofs_onlinepage_index(page);
+static int z_erofs_parse_in_bvecs(struct z_erofs_decompress_backend *be,
+ bool *overlapped)
+{
+ struct z_erofs_pcluster *pcl = be->pcl;
+ unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
+ int i, err = 0;
- DBG_BUGON(pagenr >= nr_pages);
+ *overlapped = false;
+ for (i = 0; i < pclusterpages; ++i) {
+ struct z_erofs_bvec *bvec = &pcl->compressed_bvecs[i];
+ struct page *page = bvec->page;
- /*
- * currently EROFS doesn't support multiref(dedup),
- * so here erroring out one multiref page.
- */
- if (pages[pagenr]) {
+ /* compressed pages ought to be present before decompressing */
+ if (!page) {
DBG_BUGON(1);
- SetPageError(pages[pagenr]);
- z_erofs_onlinepage_endio(pages[pagenr]);
- err = -EFSCORRUPTED;
+ continue;
}
- pages[pagenr] = page;
- }
- z_erofs_pagevec_ctor_exit(&ctor, true);
-
- overlapped = false;
- compressed_pages = pcl->compressed_pages;
-
- for (i = 0; i < clusterpages; ++i) {
- unsigned int pagenr;
-
- page = compressed_pages[i];
+ be->compressed_pages[i] = page;
- /* all compressed pages ought to be valid */
- DBG_BUGON(!page);
- DBG_BUGON(!page->mapping);
+ if (z_erofs_is_inline_pcluster(pcl)) {
+ if (!PageUptodate(page))
+ err = -EIO;
+ continue;
+ }
- if (!z_erofs_page_is_staging(page)) {
- if (erofs_page_is_managed(sbi, page)) {
+ DBG_BUGON(z_erofs_page_is_invalidated(page));
+ if (!z_erofs_is_shortlived_page(page)) {
+ if (erofs_page_is_managed(EROFS_SB(be->sb), page)) {
if (!PageUptodate(page))
err = -EIO;
continue;
}
-
- /*
- * only if non-head page can be selected
- * for inplace decompression
- */
- pagenr = z_erofs_onlinepage_index(page);
-
- DBG_BUGON(pagenr >= nr_pages);
- if (pages[pagenr]) {
- DBG_BUGON(1);
- SetPageError(pages[pagenr]);
- z_erofs_onlinepage_endio(pages[pagenr]);
- err = -EFSCORRUPTED;
- }
- pages[pagenr] = page;
-
- overlapped = true;
+ z_erofs_do_decompressed_bvec(be, bvec);
+ *overlapped = true;
}
+ }
- /* PG_error needs checking for inplaced and staging pages */
- if (PageError(page)) {
- DBG_BUGON(PageUptodate(page));
- err = -EIO;
- }
+ if (err)
+ return err;
+ return 0;
+}
+
+static int z_erofs_decompress_pcluster(struct z_erofs_decompress_backend *be,
+ int err)
+{
+ struct erofs_sb_info *const sbi = EROFS_SB(be->sb);
+ struct z_erofs_pcluster *pcl = be->pcl;
+ unsigned int pclusterpages = z_erofs_pclusterpages(pcl);
+ unsigned int i, inputsize;
+ int err2;
+ struct page *page;
+ bool overlapped;
+
+ mutex_lock(&pcl->lock);
+ be->nr_pages = PAGE_ALIGN(pcl->length + pcl->pageofs_out) >> PAGE_SHIFT;
+
+ /* allocate (de)compressed page arrays if they cannot be kept on stack */
+ be->decompressed_pages = NULL;
+ be->compressed_pages = NULL;
+ be->onstack_used = 0;
+ if (be->nr_pages <= Z_EROFS_ONSTACK_PAGES) {
+ be->decompressed_pages = be->onstack_pages;
+ be->onstack_used = be->nr_pages;
+ memset(be->decompressed_pages, 0,
+ sizeof(struct page *) * be->nr_pages);
}
+ if (pclusterpages + be->onstack_used <= Z_EROFS_ONSTACK_PAGES)
+ be->compressed_pages = be->onstack_pages + be->onstack_used;
+
+ if (!be->decompressed_pages)
+ be->decompressed_pages =
+ kvcalloc(be->nr_pages, sizeof(struct page *),
+ GFP_KERNEL | __GFP_NOFAIL);
+ if (!be->compressed_pages)
+ be->compressed_pages =
+ kvcalloc(pclusterpages, sizeof(struct page *),
+ GFP_KERNEL | __GFP_NOFAIL);
+
+ z_erofs_parse_out_bvecs(be);
+ err2 = z_erofs_parse_in_bvecs(be, &overlapped);
+ if (err2)
+ err = err2;
if (err)
goto out;
- llen = pcl->length >> Z_EROFS_PCLUSTER_LENGTH_BIT;
- if (nr_pages << PAGE_SHIFT >= cl->pageofs + llen) {
- outputsize = llen;
- partial = !(pcl->length & Z_EROFS_PCLUSTER_FULL_LENGTH);
- } else {
- outputsize = (nr_pages << PAGE_SHIFT) - cl->pageofs;
- partial = true;
- }
+ if (z_erofs_is_inline_pcluster(pcl))
+ inputsize = pcl->tailpacking_size;
+ else
+ inputsize = pclusterpages * PAGE_SIZE;
err = z_erofs_decompress(&(struct z_erofs_decompress_req) {
- .sb = sb,
- .in = compressed_pages,
- .out = pages,
- .pageofs_out = cl->pageofs,
- .inputsize = PAGE_SIZE,
- .outputsize = outputsize,
+ .sb = be->sb,
+ .in = be->compressed_pages,
+ .out = be->decompressed_pages,
+ .pageofs_in = pcl->pageofs_in,
+ .pageofs_out = pcl->pageofs_out,
+ .inputsize = inputsize,
+ .outputsize = pcl->length,
.alg = pcl->algorithmformat,
.inplace_io = overlapped,
- .partial_decoding = partial
- }, pagepool);
+ .partial_decoding = pcl->partial,
+ .fillgaps = pcl->multibases,
+ }, be->pagepool);
out:
- /* must handle all compressed pages before endding pages */
- for (i = 0; i < clusterpages; ++i) {
- page = compressed_pages[i];
-
- if (erofs_page_is_managed(sbi, page))
- continue;
+ /* must handle all compressed pages before actual file pages */
+ if (z_erofs_is_inline_pcluster(pcl)) {
+ page = pcl->compressed_bvecs[0].page;
+ WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL);
+ put_page(page);
+ } else {
+ for (i = 0; i < pclusterpages; ++i) {
+ page = pcl->compressed_bvecs[i].page;
- /* recycle all individual staging pages */
- (void)z_erofs_put_stagingpage(pagepool, page);
+ if (erofs_page_is_managed(sbi, page))
+ continue;
- WRITE_ONCE(compressed_pages[i], NULL);
+ /* recycle all individual short-lived pages */
+ (void)z_erofs_put_shortlivedpage(be->pagepool, page);
+ WRITE_ONCE(pcl->compressed_bvecs[i].page, NULL);
+ }
}
+ if (be->compressed_pages < be->onstack_pages ||
+ be->compressed_pages >= be->onstack_pages + Z_EROFS_ONSTACK_PAGES)
+ kvfree(be->compressed_pages);
+ z_erofs_fill_other_copies(be, err);
- for (i = 0; i < nr_pages; ++i) {
- page = pages[i];
+ for (i = 0; i < be->nr_pages; ++i) {
+ page = be->decompressed_pages[i];
if (!page)
continue;
- DBG_BUGON(!page->mapping);
+ DBG_BUGON(z_erofs_page_is_invalidated(page));
- /* recycle all individual staging pages */
- if (z_erofs_put_stagingpage(pagepool, page))
+ /* recycle all individual short-lived pages */
+ if (z_erofs_put_shortlivedpage(be->pagepool, page))
continue;
-
- if (err < 0)
- SetPageError(page);
-
+ if (err)
+ z_erofs_page_mark_eio(page);
z_erofs_onlinepage_endio(page);
}
- if (pages == z_pagemap_global)
- mutex_unlock(&z_pagemap_global_lock);
- else if (pages != pages_onstack)
- kvfree(pages);
+ if (be->decompressed_pages != be->onstack_pages)
+ kvfree(be->decompressed_pages);
- cl->nr_pages = 0;
- cl->vcnt = 0;
+ pcl->length = 0;
+ pcl->partial = true;
+ pcl->multibases = false;
+ pcl->bvset.nextpage = NULL;
+ pcl->vcnt = 0;
- /* all cl locks MUST be taken before the following line */
+ /* pcluster lock MUST be taken before the following line */
WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL);
-
- /* all cl locks SHOULD be released right now */
- mutex_unlock(&cl->lock);
-
- z_erofs_collection_put(cl);
+ mutex_unlock(&pcl->lock);
return err;
}
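
A minimal userspace sketch (not kernel code; the demo_* names and sample counts are made up) of the page-array placement policy used above: when the decompressed and compressed page arrays fit, both share one on-stack buffer, otherwise the code falls back to heap allocation (kvcalloc() in the kernel).

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define DEMO_ONSTACK_PAGES 32	/* mirrors Z_EROFS_ONSTACK_PAGES */

int main(void)
{
	void *onstack[DEMO_ONSTACK_PAGES];
	unsigned int nr_out = 5, nr_in = 4;	/* decompressed / compressed page counts */
	unsigned int onstack_used = 0;
	void **out = NULL, **in = NULL;
	bool out_onstack = false, in_onstack = false;

	if (nr_out <= DEMO_ONSTACK_PAGES) {	/* decompressed array fits on stack */
		out = onstack;
		onstack_used = nr_out;
		out_onstack = true;
	}
	if (nr_in + onstack_used <= DEMO_ONSTACK_PAGES) {	/* compressed array fits too */
		in = onstack + onstack_used;
		in_onstack = true;
	}
	if (!out)
		out = calloc(nr_out, sizeof(*out));	/* heap fallback */
	if (!in)
		in = calloc(nr_in, sizeof(*in));

	printf("decompressed array: %s, compressed array: %s\n",
	       out_onstack ? "on stack" : "heap",
	       in_onstack ? "on stack" : "heap");

	if (!out_onstack)
		free(out);
	if (!in_onstack)
		free(in);
	return 0;
}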
static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io,
- struct list_head *pagepool)
+ struct page **pagepool)
{
+ struct z_erofs_decompress_backend be = {
+ .sb = io->sb,
+ .pagepool = pagepool,
+ .decompressed_secondary_bvecs =
+ LIST_HEAD_INIT(be.decompressed_secondary_bvecs),
+ };
z_erofs_next_pcluster_t owned = io->head;
while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) {
- struct z_erofs_pcluster *pcl;
-
- /* no possible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */
+ /* impossible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */
DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL);
-
- /* no possible that 'owned' equals NULL */
+ /* impossible that 'owned' equals Z_EROFS_PCLUSTER_NIL */
DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL);
- pcl = container_of(owned, struct z_erofs_pcluster, next);
- owned = READ_ONCE(pcl->next);
+ be.pcl = container_of(owned, struct z_erofs_pcluster, next);
+ owned = READ_ONCE(be.pcl->next);
- z_erofs_decompress_pcluster(io->sb, pcl, pagepool);
+ z_erofs_decompress_pcluster(&be, io->eio ? -EIO : 0);
+ erofs_workgroup_put(&be.pcl->obj);
}
}
@@ -963,22 +1158,47 @@ static void z_erofs_decompressqueue_work(struct work_struct *work)
{
struct z_erofs_decompressqueue *bgq =
container_of(work, struct z_erofs_decompressqueue, u.work);
- LIST_HEAD(pagepool);
+ struct page *pagepool = NULL;
DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
z_erofs_decompress_queue(bgq, &pagepool);
- put_pages_list(&pagepool);
+ erofs_release_pages(&pagepool);
kvfree(bgq);
}
+static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io,
+ bool sync, int bios)
+{
+ struct erofs_sb_info *const sbi = EROFS_SB(io->sb);
+
+ /* wake up the caller thread for sync decompression */
+ if (sync) {
+ if (!atomic_add_return(bios, &io->pending_bios))
+ complete(&io->u.done);
+ return;
+ }
+
+ if (atomic_add_return(bios, &io->pending_bios))
+ return;
+ /* Use workqueue and sync decompression for atomic contexts only */
+ if (in_atomic() || irqs_disabled()) {
+ queue_work(z_erofs_workqueue, &io->u.work);
+ /* enable sync decompression for readahead */
+ if (sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO)
+ sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON;
+ return;
+ }
+ z_erofs_decompressqueue_work(&io->u.work);
+}
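
A tiny userspace model (C11 atomics; illustrative only, demo_* names are hypothetical) of the pending_bios accounting used by z_erofs_decompress_kickoff(): the submitter adds the number of issued bios, each completion adds -1, and whichever caller drives the counter to zero kicks the actual decompression.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct demo_queue {
	atomic_int pending_bios;
	bool done;
};

/* submitter passes +nr_bios, each bio completion passes -1 */
static void demo_kickoff(struct demo_queue *q, int bios)
{
	if (atomic_fetch_add(&q->pending_bios, bios) + bios == 0) {
		q->done = true;	/* complete(&io->u.done) or queue_work() here */
		printf("all bios done, kick decompression\n");
	}
}

int main(void)
{
	struct demo_queue q = { .done = false };

	atomic_init(&q.pending_bios, 0);
	demo_kickoff(&q, 3);	/* submission: three bios issued */
	demo_kickoff(&q, -1);	/* endio #1 */
	demo_kickoff(&q, -1);	/* endio #2 */
	demo_kickoff(&q, -1);	/* endio #3 brings the counter to zero */
	return !q.done;
}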
+
static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
unsigned int nr,
- struct list_head *pagepool,
- struct address_space *mc,
- gfp_t gfp)
+ struct page **pagepool,
+ struct address_space *mc)
{
const pgoff_t index = pcl->obj.index;
+ gfp_t gfp = mapping_gfp_mask(mc);
bool tocache = false;
struct address_space *mapping;
@@ -988,36 +1208,41 @@ static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
int justfound;
repeat:
- page = READ_ONCE(pcl->compressed_pages[nr]);
+ page = READ_ONCE(pcl->compressed_bvecs[nr].page);
oldpage = page;
if (!page)
goto out_allocpage;
- /*
- * the cached page has not been allocated and
- * an placeholder is out there, prepare it now.
- */
- if (page == PAGE_UNALLOCATED) {
- tocache = true;
- goto out_allocpage;
- }
-
/* process the target tagged pointer */
t = tagptr_init(compressed_page_t, page);
justfound = tagptr_unfold_tags(t);
page = tagptr_unfold_ptr(t);
+ /*
+	 * preallocated cached pages, used here to avoid direct reclaim;
+	 * otherwise, it would fall back to the inplace I/O path instead.
+ */
+ if (page->private == Z_EROFS_PREALLOCATED_PAGE) {
+ WRITE_ONCE(pcl->compressed_bvecs[nr].page, page);
+ set_page_private(page, 0);
+ tocache = true;
+ goto out_tocache;
+ }
mapping = READ_ONCE(page->mapping);
/*
- * unmanaged (file) pages are all locked solidly,
+	 * file-backed online pages in the pcluster are all locked steadily,
* therefore it is impossible for `mapping' to be NULL.
*/
if (mapping && mapping != mc)
/* ought to be unmanaged pages */
goto out;
+ /* directly return for shortlived page as well */
+ if (z_erofs_is_shortlived_page(page))
+ goto out;
+
lock_page(page);
/* only true if page reclaim goes wrong, should never happen */
@@ -1025,14 +1250,13 @@ repeat:
/* the page is still in manage cache */
if (page->mapping == mc) {
- WRITE_ONCE(pcl->compressed_pages[nr], page);
+ WRITE_ONCE(pcl->compressed_bvecs[nr].page, page);
- ClearPageError(page);
if (!PagePrivate(page)) {
/*
* impossible to be !PagePrivate(page) for
* the current restriction as well if
- * the page is already in compressed_pages[].
+ * the page is already in compressed_bvecs[].
*/
DBG_BUGON(!justfound);
@@ -1061,25 +1285,22 @@ repeat:
put_page(page);
out_allocpage:
page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL);
- if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) {
- /* non-LRU / non-movable temporary page is needed */
- page->mapping = Z_EROFS_MAPPING_STAGING;
- tocache = false;
- }
-
- if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) {
- if (tocache) {
- /* since it added to managed cache successfully */
- unlock_page(page);
- put_page(page);
- } else {
- list_add(&page->lru, pagepool);
- }
+ if (oldpage != cmpxchg(&pcl->compressed_bvecs[nr].page,
+ oldpage, page)) {
+ erofs_pagepool_add(pagepool, page);
cond_resched();
goto repeat;
}
- set_page_private(page, (unsigned long)pcl);
- SetPagePrivate(page);
+out_tocache:
+ if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) {
+		/* turn into a temporary page if it fails (1 ref) */
+ set_page_private(page, Z_EROFS_SHORTLIVED_PAGE);
+ goto out;
+ }
+ attach_page_private(page, pcl);
+ /* drop a refcount added by allocpage (then we have 2 refs here) */
+ put_page(page);
+
out: /* the only exit (for tracing and debugging) */
return page;
}
@@ -1100,8 +1321,9 @@ jobqueue_init(struct super_block *sb,
} else {
fg_out:
q = fgq;
- init_waitqueue_head(&fgq->u.wait);
+ init_completion(&fgq->u.done);
atomic_set(&fgq->pending_bios, 0);
+ q->eio = false;
}
q->sb = sb;
q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
@@ -1148,20 +1370,50 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
qtail[JQ_BYPASS] = &pcl->next;
}
-static void z_erofs_submit_queue(struct super_block *sb,
- z_erofs_next_pcluster_t owned_head,
- struct list_head *pagepool,
+static void z_erofs_decompressqueue_endio(struct bio *bio)
+{
+ tagptr1_t t = tagptr_init(tagptr1_t, bio->bi_private);
+ struct z_erofs_decompressqueue *q = tagptr_unfold_ptr(t);
+ blk_status_t err = bio->bi_status;
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
+
+ bio_for_each_segment_all(bvec, bio, iter_all) {
+ struct page *page = bvec->bv_page;
+
+ DBG_BUGON(PageUptodate(page));
+ DBG_BUGON(z_erofs_page_is_invalidated(page));
+
+ if (erofs_page_is_managed(EROFS_SB(q->sb), page)) {
+ if (!err)
+ SetPageUptodate(page);
+ unlock_page(page);
+ }
+ }
+ if (err)
+ q->eio = true;
+ z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1);
+ bio_put(bio);
+}
+
+static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f,
+ struct page **pagepool,
struct z_erofs_decompressqueue *fgq,
bool *force_fg)
{
- struct erofs_sb_info *const sbi = EROFS_SB(sb);
+ struct super_block *sb = f->inode->i_sb;
+ struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb));
z_erofs_next_pcluster_t qtail[NR_JOBQUEUES];
struct z_erofs_decompressqueue *q[NR_JOBQUEUES];
void *bi_private;
- /* since bio will be NULL, no need to initialize last_index */
- pgoff_t uninitialized_var(last_index);
+ z_erofs_next_pcluster_t owned_head = f->owned_head;
+ /* bio is NULL initially, so no need to initialize last_{index,bdev} */
+ pgoff_t last_index;
+ struct block_device *last_bdev;
unsigned int nr_bios = 0;
struct bio *bio = NULL;
+ unsigned long pflags;
+ int memstall = 0;
bi_private = jobqueueset_init(sb, q, fgq, force_fg);
qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head;
@@ -1171,6 +1423,7 @@ static void z_erofs_submit_queue(struct super_block *sb,
q[JQ_SUBMIT]->head = owned_head;
do {
+ struct erofs_map_dev mdev;
struct z_erofs_pcluster *pcl;
pgoff_t cur, end;
unsigned int i = 0;
@@ -1182,43 +1435,62 @@ static void z_erofs_submit_queue(struct super_block *sb,
pcl = container_of(owned_head, struct z_erofs_pcluster, next);
- cur = pcl->obj.index;
- end = cur + BIT(pcl->clusterbits);
-
/* close the main owned chain at first */
owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
Z_EROFS_PCLUSTER_TAIL_CLOSED);
+ if (z_erofs_is_inline_pcluster(pcl)) {
+ move_to_bypass_jobqueue(pcl, qtail, owned_head);
+ continue;
+ }
+
+ /* no device id here, thus it will always succeed */
+ mdev = (struct erofs_map_dev) {
+ .m_pa = blknr_to_addr(pcl->obj.index),
+ };
+ (void)erofs_map_dev(sb, &mdev);
+
+ cur = erofs_blknr(mdev.m_pa);
+ end = cur + pcl->pclusterpages;
do {
struct page *page;
- int err;
page = pickup_page_for_submission(pcl, i++, pagepool,
- MNGD_MAPPING(sbi),
- GFP_NOFS);
+ mc);
if (!page)
continue;
- if (bio && cur != last_index + 1) {
+ if (bio && (cur != last_index + 1 ||
+ last_bdev != mdev.m_bdev)) {
submit_bio_retry:
submit_bio(bio);
+ if (memstall) {
+ psi_memstall_leave(&pflags);
+ memstall = 0;
+ }
bio = NULL;
}
- if (!bio) {
- bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES);
+ if (unlikely(PageWorkingset(page)) && !memstall) {
+ psi_memstall_enter(&pflags);
+ memstall = 1;
+ }
+ if (!bio) {
+ bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS,
+ REQ_OP_READ, GFP_NOIO);
bio->bi_end_io = z_erofs_decompressqueue_endio;
- bio_set_dev(bio, sb->s_bdev);
+
+ last_bdev = mdev.m_bdev;
bio->bi_iter.bi_sector = (sector_t)cur <<
LOG_SECTORS_PER_BLOCK;
bio->bi_private = bi_private;
- bio->bi_opf = REQ_OP_READ;
+ if (f->readahead)
+ bio->bi_opf |= REQ_RAHEAD;
++nr_bios;
}
- err = bio_add_page(bio, page, PAGE_SIZE, 0);
- if (err < PAGE_SIZE)
+ if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE)
goto submit_bio_retry;
last_index = cur;
@@ -1231,8 +1503,11 @@ submit_bio_retry:
move_to_bypass_jobqueue(pcl, qtail, owned_head);
} while (owned_head != Z_EROFS_PCLUSTER_TAIL);
- if (bio)
+ if (bio) {
submit_bio(bio);
+ if (memstall)
+ psi_memstall_leave(&pflags);
+ }
/*
* although background is preferred, no one is pending for submission.
@@ -1245,15 +1520,14 @@ submit_bio_retry:
z_erofs_decompress_kickoff(q[JQ_SUBMIT], *force_fg, nr_bios);
}
-static void z_erofs_runqueue(struct super_block *sb,
- struct z_erofs_collector *clt,
- struct list_head *pagepool, bool force_fg)
+static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f,
+ struct page **pagepool, bool force_fg)
{
struct z_erofs_decompressqueue io[NR_JOBQUEUES];
- if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
+ if (f->owned_head == Z_EROFS_PCLUSTER_TAIL)
return;
- z_erofs_submit_queue(sb, clt->owned_head, pagepool, io, &force_fg);
+ z_erofs_submit_queue(f, pagepool, io, &force_fg);
/* handle bypass queue (no i/o pclusters) immediately */
z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool);
@@ -1262,82 +1536,123 @@ static void z_erofs_runqueue(struct super_block *sb,
return;
/* wait until all bios are completed */
- io_wait_event(io[JQ_SUBMIT].u.wait,
- !atomic_read(&io[JQ_SUBMIT].pending_bios));
+ wait_for_completion_io(&io[JQ_SUBMIT].u.done);
/* handle synchronous decompress queue in the caller context */
z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool);
}
-static int z_erofs_readpage(struct file *file, struct page *page)
+/*
+ * Since partial uptodate is still unimplemented for now, we have to use
+ * approximate readmore strategies as a start.
+ */
+static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f,
+ struct readahead_control *rac,
+ erofs_off_t end,
+ struct page **pagepool,
+ bool backmost)
+{
+ struct inode *inode = f->inode;
+ struct erofs_map_blocks *map = &f->map;
+ erofs_off_t cur;
+ int err;
+
+ if (backmost) {
+ map->m_la = end;
+ err = z_erofs_map_blocks_iter(inode, map,
+ EROFS_GET_BLOCKS_READMORE);
+ if (err)
+ return;
+
+		/* expand ra for the trailing edge if readahead */
+ if (rac) {
+ loff_t newstart = readahead_pos(rac);
+
+ cur = round_up(map->m_la + map->m_llen, PAGE_SIZE);
+ readahead_expand(rac, newstart, cur - newstart);
+ return;
+ }
+ end = round_up(end, PAGE_SIZE);
+ } else {
+ end = round_up(map->m_la, PAGE_SIZE);
+
+ if (!map->m_llen)
+ return;
+ }
+
+ cur = map->m_la + map->m_llen - 1;
+ while (cur >= end) {
+ pgoff_t index = cur >> PAGE_SHIFT;
+ struct page *page;
+
+ page = erofs_grab_cache_page_nowait(inode->i_mapping, index);
+ if (page) {
+ if (PageUptodate(page)) {
+ unlock_page(page);
+ } else {
+ err = z_erofs_do_read_page(f, page, pagepool);
+ if (err)
+ erofs_err(inode->i_sb,
+ "readmore error at page %lu @ nid %llu",
+ index, EROFS_I(inode)->nid);
+ }
+ put_page(page);
+ }
+
+ if (cur < PAGE_SIZE)
+ break;
+ cur = (index << PAGE_SHIFT) - 1;
+ }
+}
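
A back-of-envelope userspace model (illustrative only; 4KiB pages and the sample extent values are assumptions) of the page-index walk in the loop above: starting from the last byte of the mapped extent, visit page indexes backwards until reaching `end'.

#include <stdio.h>

#define DEMO_PAGE_SHIFT 12
#define DEMO_PAGE_SIZE  (1UL << DEMO_PAGE_SHIFT)

int main(void)
{
	unsigned long m_la = 5000, m_llen = 20000;	/* an example decoded extent */
	unsigned long end = (m_la + DEMO_PAGE_SIZE - 1) & ~(DEMO_PAGE_SIZE - 1);
	unsigned long cur = m_la + m_llen - 1;

	while (cur >= end) {
		unsigned long index = cur >> DEMO_PAGE_SHIFT;

		printf("visit page index %lu\n", index);	/* grab + read the cached page */
		if (cur < DEMO_PAGE_SIZE)
			break;
		cur = (index << DEMO_PAGE_SHIFT) - 1;
	}
	return 0;
}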
+
+static int z_erofs_read_folio(struct file *file, struct folio *folio)
{
+ struct page *page = &folio->page;
struct inode *const inode = page->mapping->host;
+ struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
+ struct page *pagepool = NULL;
int err;
- LIST_HEAD(pagepool);
trace_erofs_readpage(page, false);
-
f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT;
+ z_erofs_pcluster_readmore(&f, NULL, f.headoffset + PAGE_SIZE - 1,
+ &pagepool, true);
err = z_erofs_do_read_page(&f, page, &pagepool);
- (void)z_erofs_collector_end(&f.clt);
+ z_erofs_pcluster_readmore(&f, NULL, 0, &pagepool, false);
+
+ (void)z_erofs_collector_end(&f);
/* if some compressed cluster ready, need submit them anyway */
- z_erofs_runqueue(inode->i_sb, &f.clt, &pagepool, true);
+ z_erofs_runqueue(&f, &pagepool,
+ z_erofs_get_sync_decompress_policy(sbi, 0));
if (err)
erofs_err(inode->i_sb, "failed to read, err [%d]", err);
- if (f.map.mpage)
- put_page(f.map.mpage);
-
- /* clean up the remaining free pages */
- put_pages_list(&pagepool);
+ erofs_put_metabuf(&f.map.buf);
+ erofs_release_pages(&pagepool);
return err;
}
-static bool should_decompress_synchronously(struct erofs_sb_info *sbi,
- unsigned int nr)
+static void z_erofs_readahead(struct readahead_control *rac)
{
- return nr <= sbi->max_sync_decompress_pages;
-}
-
-static int z_erofs_readpages(struct file *filp, struct address_space *mapping,
- struct list_head *pages, unsigned int nr_pages)
-{
- struct inode *const inode = mapping->host;
+ struct inode *const inode = rac->mapping->host;
struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
-
- bool sync = should_decompress_synchronously(sbi, nr_pages);
struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
- gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
- struct page *head = NULL;
- LIST_HEAD(pagepool);
-
- trace_erofs_readpages(mapping->host, lru_to_page(pages),
- nr_pages, false);
-
- f.headoffset = (erofs_off_t)lru_to_page(pages)->index << PAGE_SHIFT;
+ struct page *pagepool = NULL, *head = NULL, *page;
+ unsigned int nr_pages;
- for (; nr_pages; --nr_pages) {
- struct page *page = lru_to_page(pages);
+ f.readahead = true;
+ f.headoffset = readahead_pos(rac);
- prefetchw(&page->flags);
- list_del(&page->lru);
-
- /*
- * A pure asynchronous readahead is indicated if
- * a PG_readahead marked page is hitted at first.
- * Let's also do asynchronous decompression for this case.
- */
- sync &= !(PageReadahead(page) && !head);
-
- if (add_to_page_cache_lru(page, mapping, page->index, gfp)) {
- list_add(&page->lru, &pagepool);
- continue;
- }
+ z_erofs_pcluster_readmore(&f, rac, f.headoffset +
+ readahead_length(rac) - 1, &pagepool, true);
+ nr_pages = readahead_count(rac);
+ trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false);
+ while ((page = readahead_page(rac))) {
set_page_private(page, (unsigned long)head);
head = page;
}
@@ -1356,21 +1671,16 @@ static int z_erofs_readpages(struct file *filp, struct address_space *mapping,
page->index, EROFS_I(inode)->nid);
put_page(page);
}
+ z_erofs_pcluster_readmore(&f, rac, 0, &pagepool, false);
+ (void)z_erofs_collector_end(&f);
- (void)z_erofs_collector_end(&f.clt);
-
- z_erofs_runqueue(inode->i_sb, &f.clt, &pagepool, sync);
-
- if (f.map.mpage)
- put_page(f.map.mpage);
-
- /* clean up the remaining free pages */
- put_pages_list(&pagepool);
- return 0;
+ z_erofs_runqueue(&f, &pagepool,
+ z_erofs_get_sync_decompress_policy(sbi, nr_pages));
+ erofs_put_metabuf(&f.map.buf);
+ erofs_release_pages(&pagepool);
}
const struct address_space_operations z_erofs_aops = {
- .readpage = z_erofs_readpage,
- .readpages = z_erofs_readpages,
+ .read_folio = z_erofs_read_folio,
+ .readahead = z_erofs_readahead,
};
-
diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h
index 7824f5563a55..d98c95212985 100644
--- a/fs/erofs/zdata.h
+++ b/fs/erofs/zdata.h
@@ -1,16 +1,37 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2018 HUAWEI, Inc.
- * http://www.huawei.com/
- * Created by Gao Xiang <gaoxiang25@huawei.com>
+ * https://www.huawei.com/
*/
#ifndef __EROFS_FS_ZDATA_H
#define __EROFS_FS_ZDATA_H
#include "internal.h"
-#include "zpvec.h"
+#include "tagptr.h"
-#define Z_EROFS_NR_INLINE_PAGEVECS 3
+#define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE)
+#define Z_EROFS_INLINE_BVECS 2
+
+/*
+ * let's leave a type here in case of introducing
+ * another tagged pointer later.
+ */
+typedef void *z_erofs_next_pcluster_t;
+
+struct z_erofs_bvec {
+ struct page *page;
+ int offset;
+ unsigned int end;
+};
+
+#define __Z_EROFS_BVSET(name, total) \
+struct name { \
+ /* point to the next page which contains the following bvecs */ \
+ struct page *nextpage; \
+ struct z_erofs_bvec bvec[total]; \
+}
+__Z_EROFS_BVSET(z_erofs_bvset,);
+__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS);
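
An illustrative userspace expansion (not kernel code; demo_* names are made up) of what the __Z_EROFS_BVSET() helper above generates: the inline variant embeds Z_EROFS_INLINE_BVECS bvecs plus a `nextpage' link so that longer bvec chains can spill into additional pages.

#include <stdio.h>

struct demo_bvec {
	void *page;
	int offset;
	unsigned int end;
};

struct demo_bvset_inline {
	void *nextpage;			/* next page holding more bvecs */
	struct demo_bvec bvec[2];	/* Z_EROFS_INLINE_BVECS entries */
};

int main(void)
{
	printf("inline bvset size: %zu bytes\n", sizeof(struct demo_bvset_inline));
	return 0;
}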
/*
* Structure fields follow one of the following exclusion rules.
@@ -18,61 +39,58 @@
* I: Modifiable by initialization/destruction paths and read-only
* for everyone else;
*
- * L: Field should be protected by pageset lock;
+ * L: Field should be protected by the pcluster lock;
*
* A: Field should be accessed / updated in atomic for parallelized code.
*/
-struct z_erofs_collection {
+struct z_erofs_pcluster {
+ struct erofs_workgroup obj;
struct mutex lock;
- /* I: page offset of start position of decompression */
- unsigned short pageofs;
+ /* A: point to next chained pcluster or TAILs */
+ z_erofs_next_pcluster_t next;
- /* L: maximum relative page index in pagevec[] */
- unsigned short nr_pages;
+ /* L: the maximum decompression size of this round */
+ unsigned int length;
- /* L: total number of pages in pagevec[] */
+ /* L: total number of bvecs */
unsigned int vcnt;
+ /* I: page offset of start position of decompression */
+ unsigned short pageofs_out;
+
+ /* I: page offset of inline compressed data */
+ unsigned short pageofs_in;
+
union {
- /* L: inline a certain number of pagevecs for bootstrap */
- erofs_vtptr_t pagevec[Z_EROFS_NR_INLINE_PAGEVECS];
+ /* L: inline a certain number of bvec for bootstrap */
+ struct z_erofs_bvset_inline bvset;
/* I: can be used to free the pcluster by RCU. */
struct rcu_head rcu;
};
-};
-
-#define Z_EROFS_PCLUSTER_FULL_LENGTH 0x00000001
-#define Z_EROFS_PCLUSTER_LENGTH_BIT 1
-/*
- * let's leave a type here in case of introducing
- * another tagged pointer later.
- */
-typedef void *z_erofs_next_pcluster_t;
+ union {
+ /* I: physical cluster size in pages */
+ unsigned short pclusterpages;
-struct z_erofs_pcluster {
- struct erofs_workgroup obj;
- struct z_erofs_collection primary_collection;
+ /* I: tailpacking inline compressed size */
+ unsigned short tailpacking_size;
+ };
- /* A: point to next chained pcluster or TAILs */
- z_erofs_next_pcluster_t next;
+ /* I: compression algorithm format */
+ unsigned char algorithmformat;
- /* A: compressed pages (including multi-usage pages) */
- struct page *compressed_pages[Z_EROFS_CLUSTER_MAX_PAGES];
+ /* L: whether partial decompression or not */
+ bool partial;
- /* A: lower limit of decompressed length and if full length or not */
- unsigned int length;
+ /* L: indicate several pageofs_outs or not */
+ bool multibases;
- /* I: compression algorithm format */
- unsigned char algorithmformat;
- /* I: bit shift of physical cluster size */
- unsigned char clusterbits;
+ /* A: compressed bvecs (can be cached or inplaced pages) */
+ struct z_erofs_bvec compressed_bvecs[];
};
-#define z_erofs_primarycollection(pcluster) (&(pcluster)->primary_collection)
-
/* let's avoid the valid 32-bit kernel addresses */
 /* the chained workgroup hasn't submitted io (still open) */
@@ -82,58 +100,42 @@ struct z_erofs_pcluster {
#define Z_EROFS_PCLUSTER_NIL (NULL)
-#define Z_EROFS_WORKGROUP_SIZE sizeof(struct z_erofs_pcluster)
-
struct z_erofs_decompressqueue {
struct super_block *sb;
atomic_t pending_bios;
z_erofs_next_pcluster_t head;
union {
- wait_queue_head_t wait;
+ struct completion done;
struct work_struct work;
} u;
+
+ bool eio;
};
-#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping)
-static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi,
- struct page *page)
+static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl)
{
- return page->mapping == MNGD_MAPPING(sbi);
+ return !pcl->obj.index;
}
-#define Z_EROFS_ONLINEPAGE_COUNT_BITS 2
-#define Z_EROFS_ONLINEPAGE_COUNT_MASK ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1)
-#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT (Z_EROFS_ONLINEPAGE_COUNT_BITS)
+static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl)
+{
+ if (z_erofs_is_inline_pcluster(pcl))
+ return 1;
+ return pcl->pclusterpages;
+}
/*
- * waiters (aka. ongoing_packs): # to unlock the page
- * sub-index: 0 - for partial page, >= 1 full page sub-index
+ * bit 30: I/O error occurred on this page
+ * bit 0 - 29: remaining parts to complete this page
*/
-typedef atomic_t z_erofs_onlinepage_t;
-
-/* type punning */
-union z_erofs_onlinepage_converter {
- z_erofs_onlinepage_t *o;
- unsigned long *v;
-};
-
-static inline unsigned int z_erofs_onlinepage_index(struct page *page)
-{
- union z_erofs_onlinepage_converter u;
-
- DBG_BUGON(!PagePrivate(page));
- u.v = &page_private(page);
-
- return atomic_read(u.o) >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
-}
+#define Z_EROFS_PAGE_EIO (1 << 30)
static inline void z_erofs_onlinepage_init(struct page *page)
{
union {
- z_erofs_onlinepage_t o;
+ atomic_t o;
unsigned long v;
- /* keep from being unlocked in advance */
} u = { .o = ATOMIC_INIT(1) };
set_page_private(page, u.v);
@@ -141,49 +143,36 @@ static inline void z_erofs_onlinepage_init(struct page *page)
SetPagePrivate(page);
}
-static inline void z_erofs_onlinepage_fixup(struct page *page,
- uintptr_t index, bool down)
+static inline void z_erofs_onlinepage_split(struct page *page)
{
- unsigned long *p, o, v, id;
-repeat:
- p = &page_private(page);
- o = READ_ONCE(*p);
-
- id = o >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
- if (id) {
- if (!index)
- return;
+ atomic_inc((atomic_t *)&page->private);
+}
- DBG_BUGON(id != index);
- }
+static inline void z_erofs_page_mark_eio(struct page *page)
+{
+ int orig;
- v = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) |
- ((o & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned int)down);
- if (cmpxchg(p, o, v) != o)
- goto repeat;
+ do {
+ orig = atomic_read((atomic_t *)&page->private);
+ } while (atomic_cmpxchg((atomic_t *)&page->private, orig,
+ orig | Z_EROFS_PAGE_EIO) != orig);
}
static inline void z_erofs_onlinepage_endio(struct page *page)
{
- union z_erofs_onlinepage_converter u;
unsigned int v;
DBG_BUGON(!PagePrivate(page));
- u.v = &page_private(page);
-
- v = atomic_dec_return(u.o);
- if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) {
+ v = atomic_dec_return((atomic_t *)&page->private);
+ if (!(v & ~Z_EROFS_PAGE_EIO)) {
+ set_page_private(page, 0);
ClearPagePrivate(page);
- if (!PageError(page))
+ if (!(v & Z_EROFS_PAGE_EIO))
SetPageUptodate(page);
unlock_page(page);
}
- erofs_dbg("%s, page %p value %x", __func__, page, atomic_read(u.o));
}
-#define Z_EROFS_VMAP_ONSTACK_PAGES \
- min_t(unsigned int, THREAD_SIZE / 8 / sizeof(struct page *), 96U)
-#define Z_EROFS_VMAP_GLOBAL_PAGES 2048
+#define Z_EROFS_ONSTACK_PAGES 32
#endif
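
A small userspace model (not kernel code; demo_* names are hypothetical) of the "online page" counter described above: bit 30 records an I/O error, the low bits count the remaining sub-parts of the page, and the last part to finish reports the page as uptodate unless an error was recorded.

#include <stdatomic.h>
#include <stdio.h>

#define DEMO_PAGE_EIO	(1 << 30)

static atomic_uint demo_private;

static void demo_init(void)     { atomic_store(&demo_private, 1); }
static void demo_split(void)    { atomic_fetch_add(&demo_private, 1); }
static void demo_mark_eio(void) { atomic_fetch_or(&demo_private, DEMO_PAGE_EIO); }

static void demo_endio(void)
{
	unsigned int v = atomic_fetch_sub(&demo_private, 1) - 1;

	if (!(v & ~DEMO_PAGE_EIO))	/* no remaining parts */
		printf("%s\n", (v & DEMO_PAGE_EIO) ? "error" : "uptodate");
}

int main(void)
{
	demo_init();		/* one part outstanding */
	demo_split();		/* a second pcluster also feeds this page */
	demo_endio();		/* first part done, one left */
	demo_mark_eio();	/* pretend the second part failed */
	demo_endio();		/* last part completes: prints "error" */
	return 0;
}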
-
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 736db3a4cdef..0bb66927e3d0 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -1,27 +1,30 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2018-2019 HUAWEI, Inc.
- * http://www.huawei.com/
- * Created by Gao Xiang <gaoxiang25@huawei.com>
+ * https://www.huawei.com/
*/
#include "internal.h"
#include <asm/unaligned.h>
#include <trace/events/erofs.h>
+static int z_erofs_do_map_blocks(struct inode *inode,
+ struct erofs_map_blocks *map,
+ int flags);
+
int z_erofs_fill_inode(struct inode *inode)
{
struct erofs_inode *const vi = EROFS_I(inode);
+ struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb);
- if (vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) {
+ if (!erofs_sb_has_big_pcluster(sbi) &&
+ !erofs_sb_has_ztailpacking(sbi) && !erofs_sb_has_fragments(sbi) &&
+ vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) {
vi->z_advise = 0;
vi->z_algorithmtype[0] = 0;
vi->z_algorithmtype[1] = 0;
vi->z_logical_clusterbits = LOG_BLOCK_SIZE;
- vi->z_physical_clusterbits[0] = vi->z_logical_clusterbits;
- vi->z_physical_clusterbits[1] = vi->z_logical_clusterbits;
set_bit(EROFS_I_Z_INITED_BIT, &vi->flags);
}
-
inode->i_mapping->a_ops = &z_erofs_aops;
return 0;
}
@@ -30,14 +33,20 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
{
struct erofs_inode *const vi = EROFS_I(inode);
struct super_block *const sb = inode->i_sb;
- int err;
+ int err, headnr;
erofs_off_t pos;
- struct page *page;
+ struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
void *kaddr;
struct z_erofs_map_header *h;
- if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags))
+ if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) {
+ /*
+ * paired with smp_mb() at the end of the function to ensure
+ * fields will only be observed after the bit is set.
+ */
+ smp_mb();
return 0;
+ }
if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_Z_BIT, TASK_KILLABLE))
return -ERESTARTSYS;
@@ -46,48 +55,95 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags))
goto out_unlock;
- DBG_BUGON(vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY);
-
pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize +
vi->xattr_isize, 8);
- page = erofs_get_meta_page(sb, erofs_blknr(pos));
- if (IS_ERR(page)) {
- err = PTR_ERR(page);
+ kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), EROFS_KMAP);
+ if (IS_ERR(kaddr)) {
+ err = PTR_ERR(kaddr);
goto out_unlock;
}
- kaddr = kmap_atomic(page);
-
h = kaddr + erofs_blkoff(pos);
+ /*
+ * if the highest bit of the 8-byte map header is set, the whole file
+	 * is stored in the packed inode. The remaining bits keep z_fragmentoff.
+ */
+ if (h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT) {
+ vi->z_advise = Z_EROFS_ADVISE_FRAGMENT_PCLUSTER;
+ vi->z_fragmentoff = le64_to_cpu(*(__le64 *)h) ^ (1ULL << 63);
+ vi->z_tailextent_headlcn = 0;
+ goto done;
+ }
vi->z_advise = le16_to_cpu(h->h_advise);
vi->z_algorithmtype[0] = h->h_algorithmtype & 15;
vi->z_algorithmtype[1] = h->h_algorithmtype >> 4;
- if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX) {
- erofs_err(sb, "unknown compression format %u for nid %llu, please upgrade kernel",
- vi->z_algorithmtype[0], vi->nid);
+ headnr = 0;
+ if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX ||
+ vi->z_algorithmtype[++headnr] >= Z_EROFS_COMPRESSION_MAX) {
+ erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel",
+ headnr + 1, vi->z_algorithmtype[headnr], vi->nid);
err = -EOPNOTSUPP;
- goto unmap_done;
+ goto out_put_metabuf;
}
vi->z_logical_clusterbits = LOG_BLOCK_SIZE + (h->h_clusterbits & 7);
- vi->z_physical_clusterbits[0] = vi->z_logical_clusterbits +
- ((h->h_clusterbits >> 3) & 3);
+ if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) &&
+ vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 |
+ Z_EROFS_ADVISE_BIG_PCLUSTER_2)) {
+ erofs_err(sb, "per-inode big pcluster without sb feature for nid %llu",
+ vi->nid);
+ err = -EFSCORRUPTED;
+ goto out_put_metabuf;
+ }
+ if (vi->datalayout == EROFS_INODE_FLAT_COMPRESSION &&
+ !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1) ^
+ !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2)) {
+ erofs_err(sb, "big pcluster head1/2 of compact indexes should be consistent for nid %llu",
+ vi->nid);
+ err = -EFSCORRUPTED;
+ goto out_put_metabuf;
+ }
- if (vi->z_physical_clusterbits[0] != LOG_BLOCK_SIZE) {
- erofs_err(sb, "unsupported physical clusterbits %u for nid %llu, please upgrade kernel",
- vi->z_physical_clusterbits[0], vi->nid);
- err = -EOPNOTSUPP;
- goto unmap_done;
+ if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) {
+ struct erofs_map_blocks map = {
+ .buf = __EROFS_BUF_INITIALIZER
+ };
+
+ vi->z_idata_size = le16_to_cpu(h->h_idata_size);
+ err = z_erofs_do_map_blocks(inode, &map,
+ EROFS_GET_BLOCKS_FINDTAIL);
+ erofs_put_metabuf(&map.buf);
+
+ if (!map.m_plen ||
+ erofs_blkoff(map.m_pa) + map.m_plen > EROFS_BLKSIZ) {
+ erofs_err(sb, "invalid tail-packing pclustersize %llu",
+ map.m_plen);
+ err = -EFSCORRUPTED;
+ }
+ if (err < 0)
+ goto out_put_metabuf;
}
- vi->z_physical_clusterbits[1] = vi->z_logical_clusterbits +
- ((h->h_clusterbits >> 5) & 7);
+ if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER &&
+ !(h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT)) {
+ struct erofs_map_blocks map = {
+ .buf = __EROFS_BUF_INITIALIZER
+ };
+
+ vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff);
+ err = z_erofs_do_map_blocks(inode, &map,
+ EROFS_GET_BLOCKS_FINDTAIL);
+ erofs_put_metabuf(&map.buf);
+ if (err < 0)
+ goto out_put_metabuf;
+ }
+done:
+ /* paired with smp_mb() at the beginning of the function */
+ smp_mb();
set_bit(EROFS_I_Z_INITED_BIT, &vi->flags);
-unmap_done:
- kunmap_atomic(kaddr);
- unlock_page(page);
- put_page(page);
+out_put_metabuf:
+ erofs_put_metabuf(&buf);
out_unlock:
clear_and_wake_up_bit(EROFS_I_BL_Z_BIT, &vi->flags);
return err;
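
A tiny userspace illustration (not kernel code; the sample header value is made up and endianness conversion is skipped) of the fragment map-header encoding checked above: when the top bit of the 8-byte header is set, the remaining 63 bits carry z_fragmentoff.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* example on-disk header value, already in host order for brevity */
	uint64_t h = (1ULL << 63) | 0x1234abcdULL;

	if (h >> 63)
		printf("fragment inode, z_fragmentoff = %#llx\n",
		       (unsigned long long)(h ^ (1ULL << 63)));
	else
		printf("regular z_erofs_map_header\n");
	return 0;
}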
@@ -100,44 +156,14 @@ struct z_erofs_maprecorder {
unsigned long lcn;
/* compression extent information gathered */
- u8 type;
+ u8 type, headtype;
u16 clusterofs;
u16 delta[2];
- erofs_blk_t pblk;
+ erofs_blk_t pblk, compressedblks;
+ erofs_off_t nextpackoff;
+ bool partialref;
};
-static int z_erofs_reload_indexes(struct z_erofs_maprecorder *m,
- erofs_blk_t eblk)
-{
- struct super_block *const sb = m->inode->i_sb;
- struct erofs_map_blocks *const map = m->map;
- struct page *mpage = map->mpage;
-
- if (mpage) {
- if (mpage->index == eblk) {
- if (!m->kaddr)
- m->kaddr = kmap_atomic(mpage);
- return 0;
- }
-
- if (m->kaddr) {
- kunmap_atomic(m->kaddr);
- m->kaddr = NULL;
- }
- put_page(mpage);
- }
-
- mpage = erofs_get_meta_page(sb, eblk);
- if (IS_ERR(mpage)) {
- map->mpage = NULL;
- return PTR_ERR(mpage);
- }
- m->kaddr = kmap_atomic(mpage);
- unlock_page(mpage);
- map->mpage = mpage;
- return 0;
-}
-
static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
unsigned long lcn)
{
@@ -150,12 +176,13 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
lcn * sizeof(struct z_erofs_vle_decompressed_index);
struct z_erofs_vle_decompressed_index *di;
unsigned int advise, type;
- int err;
- err = z_erofs_reload_indexes(m, erofs_blknr(pos));
- if (err)
- return err;
+ m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb,
+ erofs_blknr(pos), EROFS_KMAP_ATOMIC);
+ if (IS_ERR(m->kaddr))
+ return PTR_ERR(m->kaddr);
+ m->nextpackoff = pos + sizeof(struct z_erofs_vle_decompressed_index);
m->lcn = lcn;
di = m->kaddr + erofs_blkoff(pos);
@@ -166,10 +193,23 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
m->clusterofs = 1 << vi->z_logical_clusterbits;
m->delta[0] = le16_to_cpu(di->di_u.delta[0]);
+ if (m->delta[0] & Z_EROFS_VLE_DI_D0_CBLKCNT) {
+ if (!(vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 |
+ Z_EROFS_ADVISE_BIG_PCLUSTER_2))) {
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+ m->compressedblks = m->delta[0] &
+ ~Z_EROFS_VLE_DI_D0_CBLKCNT;
+ m->delta[0] = 1;
+ }
m->delta[1] = le16_to_cpu(di->di_u.delta[1]);
break;
case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
- case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
+ if (advise & Z_EROFS_VLE_DI_PARTIAL_REF)
+ m->partialref = true;
m->clusterofs = le16_to_cpu(di->di_clusterofs);
m->pblk = le32_to_cpu(di->di_u.blkaddr);
break;
@@ -192,16 +232,42 @@ static unsigned int decode_compactedbits(unsigned int lobits,
return lo;
}
+static int get_compacted_la_distance(unsigned int lclusterbits,
+ unsigned int encodebits,
+ unsigned int vcnt, u8 *in, int i)
+{
+ const unsigned int lomask = (1 << lclusterbits) - 1;
+ unsigned int lo, d1 = 0;
+ u8 type;
+
+ DBG_BUGON(i >= vcnt);
+
+ do {
+ lo = decode_compactedbits(lclusterbits, lomask,
+ in, encodebits * i, &type);
+
+ if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD)
+ return d1;
+ ++d1;
+ } while (++i < vcnt);
+
+ /* vcnt - 1 (Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) item */
+ if (!(lo & Z_EROFS_VLE_DI_D0_CBLKCNT))
+ d1 += lo - 1;
+ return d1;
+}
+
static int unpack_compacted_index(struct z_erofs_maprecorder *m,
unsigned int amortizedshift,
- unsigned int eofs)
+ erofs_off_t pos, bool lookahead)
{
struct erofs_inode *const vi = EROFS_I(m->inode);
const unsigned int lclusterbits = vi->z_logical_clusterbits;
const unsigned int lomask = (1 << lclusterbits) - 1;
- unsigned int vcnt, base, lo, encodebits, nblk;
+ unsigned int vcnt, base, lo, encodebits, nblk, eofs;
int i;
u8 *in, type;
+ bool big_pcluster;
if (1 << amortizedshift == 4)
vcnt = 2;
@@ -210,7 +276,12 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
else
return -EOPNOTSUPP;
+	/* note that it doesn't equal round_up(..) */
+ m->nextpackoff = round_down(pos, vcnt << amortizedshift) +
+ (vcnt << amortizedshift);
+ big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1;
encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt;
+ eofs = erofs_blkoff(pos);
base = round_down(eofs, vcnt << amortizedshift);
in = m->kaddr + base;
@@ -221,7 +292,20 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
m->type = type;
if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) {
m->clusterofs = 1 << lclusterbits;
- if (i + 1 != vcnt) {
+
+ /* figure out lookahead_distance: delta[1] if needed */
+ if (lookahead)
+ m->delta[1] = get_compacted_la_distance(lclusterbits,
+ encodebits, vcnt, in, i);
+ if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) {
+ if (!big_pcluster) {
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+ m->compressedblks = lo & ~Z_EROFS_VLE_DI_D0_CBLKCNT;
+ m->delta[0] = 1;
+ return 0;
+ } else if (i + 1 != (int)vcnt) {
m->delta[0] = lo;
return 0;
}
@@ -234,22 +318,48 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
in, encodebits * (i - 1), &type);
if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD)
lo = 0;
+ else if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT)
+ lo = 1;
m->delta[0] = lo + 1;
return 0;
}
m->clusterofs = lo;
m->delta[0] = 0;
 	/* figure out blkaddr (pblk) for HEAD lclusters */
- nblk = 1;
- while (i > 0) {
- --i;
- lo = decode_compactedbits(lclusterbits, lomask,
- in, encodebits * i, &type);
- if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD)
- i -= lo;
-
- if (i >= 0)
+ if (!big_pcluster) {
+ nblk = 1;
+ while (i > 0) {
+ --i;
+ lo = decode_compactedbits(lclusterbits, lomask,
+ in, encodebits * i, &type);
+ if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD)
+ i -= lo;
+
+ if (i >= 0)
+ ++nblk;
+ }
+ } else {
+ nblk = 0;
+ while (i > 0) {
+ --i;
+ lo = decode_compactedbits(lclusterbits, lomask,
+ in, encodebits * i, &type);
+ if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) {
+ if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) {
+ --i;
+ nblk += lo & ~Z_EROFS_VLE_DI_D0_CBLKCNT;
+ continue;
+ }
+ /* bigpcluster shouldn't have plain d0 == 1 */
+ if (lo <= 1) {
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+ i -= lo - 2;
+ continue;
+ }
++nblk;
+ }
}
in += (vcnt << amortizedshift) - sizeof(__le32);
m->pblk = le32_to_cpu(*(__le32 *)in) + nblk;
@@ -257,7 +367,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
}
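
A worked example (plain userspace C, illustrative only) of the per-lcluster bit width computed above as encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt: compacted-4B packs 2 lclusters per 8-byte unit (16 bits each), while compacted-2B packs 16 lclusters per 32-byte unit (14 bits each), both sharing a 4-byte blkaddr at the tail.

#include <stdio.h>

static unsigned int encodebits(unsigned int vcnt, unsigned int amortizedshift)
{
	/* 4 == sizeof(__le32): the shared blkaddr at the tail of each pack */
	return ((vcnt << amortizedshift) - 4u) * 8 / vcnt;
}

int main(void)
{
	printf("compacted-4B: %u bits per lcluster\n", encodebits(2, 2));
	printf("compacted-2B: %u bits per lcluster\n", encodebits(16, 1));
	return 0;
}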
static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m,
- unsigned long lcn)
+ unsigned long lcn, bool lookahead)
{
struct inode *const inode = m->inode;
struct erofs_inode *const vi = EROFS_I(inode);
@@ -269,7 +379,6 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m,
unsigned int compacted_4b_initial, compacted_2b;
unsigned int amortizedshift;
erofs_off_t pos;
- int err;
if (lclusterbits != 12)
return -EOPNOTSUPP;
@@ -283,7 +392,8 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m,
if (compacted_4b_initial == 32 / 4)
compacted_4b_initial = 0;
- if (vi->z_advise & Z_EROFS_ADVISE_COMPACTED_2B)
+ if ((vi->z_advise & Z_EROFS_ADVISE_COMPACTED_2B) &&
+ compacted_4b_initial < totalidx)
compacted_2b = rounddown(totalidx - compacted_4b_initial, 16);
else
compacted_2b = 0;
@@ -305,14 +415,15 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m,
amortizedshift = 2;
out:
pos += lcn * (1 << amortizedshift);
- err = z_erofs_reload_indexes(m, erofs_blknr(pos));
- if (err)
- return err;
- return unpack_compacted_index(m, amortizedshift, erofs_blkoff(pos));
+ m->kaddr = erofs_read_metabuf(&m->map->buf, inode->i_sb,
+ erofs_blknr(pos), EROFS_KMAP_ATOMIC);
+ if (IS_ERR(m->kaddr))
+ return PTR_ERR(m->kaddr);
+ return unpack_compacted_index(m, amortizedshift, pos, lookahead);
}
static int z_erofs_load_cluster_from_disk(struct z_erofs_maprecorder *m,
- unsigned int lcn)
+ unsigned int lcn, bool lookahead)
{
const unsigned int datamode = EROFS_I(m->inode)->datalayout;
@@ -320,7 +431,7 @@ static int z_erofs_load_cluster_from_disk(struct z_erofs_maprecorder *m,
return legacy_load_cluster_from_disk(m, lcn);
if (datamode == EROFS_INODE_FLAT_COMPRESSION)
- return compacted_load_cluster_from_disk(m, lcn);
+ return compacted_load_cluster_from_disk(m, lcn, lookahead);
return -EINVAL;
}
@@ -329,97 +440,212 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m,
unsigned int lookback_distance)
{
struct erofs_inode *const vi = EROFS_I(m->inode);
+ const unsigned int lclusterbits = vi->z_logical_clusterbits;
+
+ while (m->lcn >= lookback_distance) {
+ unsigned long lcn = m->lcn - lookback_distance;
+ int err;
+
+ /* load extent head logical cluster if needed */
+ err = z_erofs_load_cluster_from_disk(m, lcn, false);
+ if (err)
+ return err;
+
+ switch (m->type) {
+ case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+ if (!m->delta[0]) {
+ erofs_err(m->inode->i_sb,
+ "invalid lookback distance 0 @ nid %llu",
+ vi->nid);
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+ }
+ lookback_distance = m->delta[0];
+ continue;
+ case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
+ m->headtype = m->type;
+ m->map->m_la = (lcn << lclusterbits) | m->clusterofs;
+ return 0;
+ default:
+ erofs_err(m->inode->i_sb,
+ "unknown type %u @ lcn %lu of nid %llu",
+ m->type, lcn, vi->nid);
+ DBG_BUGON(1);
+ return -EOPNOTSUPP;
+ }
+ }
+
+ erofs_err(m->inode->i_sb, "bogus lookback distance @ nid %llu",
+ vi->nid);
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
+}
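
A minimal userspace sketch (illustrative only; demo_* names and the sample layout are assumptions) of the lookback walk above: starting from a NONHEAD lcluster, keep stepping back by delta[0] until a PLAIN/HEAD lcluster is found, which owns the start of the extent.

#include <stdio.h>

struct demo_lcluster {
	int is_head;		/* PLAIN/HEAD1/HEAD2 vs NONHEAD */
	unsigned int delta0;	/* lookback distance if NONHEAD */
};

static int demo_lookback(const struct demo_lcluster *lc, unsigned long lcn,
			 unsigned int dist)
{
	while (lcn >= dist) {
		lcn -= dist;
		if (lc[lcn].is_head)
			return (int)lcn;	/* found the extent head */
		if (!lc[lcn].delta0)
			return -1;		/* corrupted: distance 0 */
		dist = lc[lcn].delta0;
	}
	return -1;				/* bogus lookback distance */
}

int main(void)
{
	/* lcn 0 is the HEAD; lcn 1..3 are NONHEAD with lookback distances 1..3 */
	const struct demo_lcluster lc[] = {
		{ 1, 0 }, { 0, 1 }, { 0, 2 }, { 0, 3 },
	};

	printf("head lcn = %d\n", demo_lookback(lc, 3, lc[3].delta0));
	return 0;
}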
+
+static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
+ unsigned int initial_lcn)
+{
+ struct erofs_inode *const vi = EROFS_I(m->inode);
struct erofs_map_blocks *const map = m->map;
const unsigned int lclusterbits = vi->z_logical_clusterbits;
- unsigned long lcn = m->lcn;
+ unsigned long lcn;
int err;
- if (lcn < lookback_distance) {
- erofs_err(m->inode->i_sb,
- "bogus lookback distance @ nid %llu", vi->nid);
- DBG_BUGON(1);
- return -EFSCORRUPTED;
+ DBG_BUGON(m->type != Z_EROFS_VLE_CLUSTER_TYPE_PLAIN &&
+ m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 &&
+ m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD2);
+ DBG_BUGON(m->type != m->headtype);
+
+ if (m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN ||
+ ((m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD1) &&
+ !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) ||
+ ((m->headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) &&
+ !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2))) {
+ map->m_plen = 1ULL << lclusterbits;
+ return 0;
}
+ lcn = m->lcn + 1;
+ if (m->compressedblks)
+ goto out;
- /* load extent head logical cluster if needed */
- lcn -= lookback_distance;
- err = z_erofs_load_cluster_from_disk(m, lcn);
+ err = z_erofs_load_cluster_from_disk(m, lcn, false);
if (err)
return err;
+ /*
+	 * If the 1st NONHEAD lcluster has already been handled initially
+	 * without valid compressedblks, it at least mustn't be a CBLKCNT
+	 * lcluster; otherwise, an internal implementation error is detected.
+ *
+ * The following code can also handle it properly anyway, but let's
+ * BUG_ON in the debugging mode only for developers to notice that.
+ */
+ DBG_BUGON(lcn == initial_lcn &&
+ m->type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD);
+
switch (m->type) {
- case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
- if (!m->delta[0]) {
- erofs_err(m->inode->i_sb,
- "invalid lookback distance 0 @ nid %llu",
- vi->nid);
- DBG_BUGON(1);
- return -EFSCORRUPTED;
- }
- return z_erofs_extent_lookback(m, m->delta[0]);
case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
- map->m_flags &= ~EROFS_MAP_ZIPPED;
- /* fallthrough */
- case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
- map->m_la = (lcn << lclusterbits) | m->clusterofs;
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
+ /*
+ * if the 1st NONHEAD lcluster is actually PLAIN or HEAD type
+ * rather than CBLKCNT, it's a 1 lcluster-sized pcluster.
+ */
+ m->compressedblks = 1 << (lclusterbits - LOG_BLOCK_SIZE);
break;
+ case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
+ if (m->delta[0] != 1)
+ goto err_bonus_cblkcnt;
+ if (m->compressedblks)
+ break;
+ fallthrough;
default:
erofs_err(m->inode->i_sb,
- "unknown type %u @ lcn %lu of nid %llu",
- m->type, lcn, vi->nid);
+			  "cannot find CBLKCNT @ lcn %lu of nid %llu",
+ lcn, vi->nid);
DBG_BUGON(1);
- return -EOPNOTSUPP;
+ return -EFSCORRUPTED;
}
+out:
+ map->m_plen = (u64)m->compressedblks << LOG_BLOCK_SIZE;
return 0;
+err_bonus_cblkcnt:
+ erofs_err(m->inode->i_sb,
+ "bogus CBLKCNT @ lcn %lu of nid %llu",
+ lcn, vi->nid);
+ DBG_BUGON(1);
+ return -EFSCORRUPTED;
}
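
A quick arithmetic check (illustrative only; the block-size shift and CBLKCNT value are assumptions) for the m_plen computation above: with 4KiB blocks (LOG_BLOCK_SIZE == 12), a CBLKCNT of 4 yields a 16KiB physical cluster.

#include <stdio.h>

int main(void)
{
	unsigned int log_block_size = 12;	/* assumed 4KiB EROFS block size */
	unsigned long long compressedblks = 4;	/* from the CBLKCNT lcluster */

	printf("m_plen = %llu bytes\n", compressedblks << log_block_size);
	return 0;
}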
-int z_erofs_map_blocks_iter(struct inode *inode,
- struct erofs_map_blocks *map,
- int flags)
+static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m)
+{
+ struct inode *inode = m->inode;
+ struct erofs_inode *vi = EROFS_I(inode);
+ struct erofs_map_blocks *map = m->map;
+ unsigned int lclusterbits = vi->z_logical_clusterbits;
+ u64 lcn = m->lcn, headlcn = map->m_la >> lclusterbits;
+ int err;
+
+ do {
+ /* handle the last EOF pcluster (no next HEAD lcluster) */
+ if ((lcn << lclusterbits) >= inode->i_size) {
+ map->m_llen = inode->i_size - map->m_la;
+ return 0;
+ }
+
+ err = z_erofs_load_cluster_from_disk(m, lcn, true);
+ if (err)
+ return err;
+
+ if (m->type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) {
+ DBG_BUGON(!m->delta[1] &&
+ m->clusterofs != 1 << lclusterbits);
+ } else if (m->type == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN ||
+ m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD1 ||
+ m->type == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) {
+ /* go on until the next HEAD lcluster */
+ if (lcn != headlcn)
+ break;
+ m->delta[1] = 1;
+ } else {
+ erofs_err(inode->i_sb, "unknown type %u @ lcn %llu of nid %llu",
+ m->type, lcn, vi->nid);
+ DBG_BUGON(1);
+ return -EOPNOTSUPP;
+ }
+ lcn += m->delta[1];
+ } while (m->delta[1]);
+
+ map->m_llen = (lcn << lclusterbits) + m->clusterofs - map->m_la;
+ return 0;
+}
+
+static int z_erofs_do_map_blocks(struct inode *inode,
+ struct erofs_map_blocks *map,
+ int flags)
{
struct erofs_inode *const vi = EROFS_I(inode);
+ bool ztailpacking = vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER;
+ bool fragment = vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER;
struct z_erofs_maprecorder m = {
.inode = inode,
.map = map,
};
int err = 0;
unsigned int lclusterbits, endoff;
+ unsigned long initial_lcn;
unsigned long long ofs, end;
- trace_z_erofs_map_blocks_iter_enter(inode, map, flags);
-
- /* when trying to read beyond EOF, leave it unmapped */
- if (map->m_la >= inode->i_size) {
- map->m_llen = map->m_la + 1 - inode->i_size;
- map->m_la = inode->i_size;
- map->m_flags = 0;
- goto out;
- }
-
- err = z_erofs_fill_inode_lazy(inode);
- if (err)
- goto out;
-
lclusterbits = vi->z_logical_clusterbits;
- ofs = map->m_la;
- m.lcn = ofs >> lclusterbits;
+ ofs = flags & EROFS_GET_BLOCKS_FINDTAIL ? inode->i_size - 1 : map->m_la;
+ initial_lcn = ofs >> lclusterbits;
endoff = ofs & ((1 << lclusterbits) - 1);
- err = z_erofs_load_cluster_from_disk(&m, m.lcn);
+ err = z_erofs_load_cluster_from_disk(&m, initial_lcn, false);
if (err)
goto unmap_out;
- map->m_flags = EROFS_MAP_ZIPPED; /* by default, compressed */
+ if (ztailpacking && (flags & EROFS_GET_BLOCKS_FINDTAIL))
+ vi->z_idataoff = m.nextpackoff;
+
+ map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_ENCODED;
end = (m.lcn + 1ULL) << lclusterbits;
switch (m.type) {
case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
- if (endoff >= m.clusterofs)
- map->m_flags &= ~EROFS_MAP_ZIPPED;
- /* fallthrough */
- case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD1:
+ case Z_EROFS_VLE_CLUSTER_TYPE_HEAD2:
if (endoff >= m.clusterofs) {
+ m.headtype = m.type;
map->m_la = (m.lcn << lclusterbits) | m.clusterofs;
+ /*
+ * For ztailpacking files, in order to inline data more
+ * effectively, special EOF lclusters are now supported
+ * which can have three parts at most.
+ */
+ if (ztailpacking && end > inode->i_size)
+ end = inode->i_size;
break;
}
/* m.lcn should be >= 1 if endoff < m.clusterofs */
@@ -433,9 +659,9 @@ int z_erofs_map_blocks_iter(struct inode *inode,
end = (m.lcn << lclusterbits) | m.clusterofs;
map->m_flags |= EROFS_MAP_FULL_MAPPED;
m.delta[0] = 1;
- /* fallthrough */
+ fallthrough;
case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
- /* get the correspoinding first chunk */
+ /* get the corresponding first chunk */
err = z_erofs_extent_lookback(&m, m.delta[0]);
if (err)
goto unmap_out;
@@ -447,21 +673,93 @@ int z_erofs_map_blocks_iter(struct inode *inode,
err = -EOPNOTSUPP;
goto unmap_out;
}
-
+ if (m.partialref)
+ map->m_flags |= EROFS_MAP_PARTIAL_REF;
map->m_llen = end - map->m_la;
- map->m_plen = 1 << lclusterbits;
- map->m_pa = blknr_to_addr(m.pblk);
- map->m_flags |= EROFS_MAP_MAPPED;
+ if (flags & EROFS_GET_BLOCKS_FINDTAIL) {
+ vi->z_tailextent_headlcn = m.lcn;
+ /* for non-compact indexes, fragmentoff is 64 bits */
+ if (fragment &&
+ vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY)
+ vi->z_fragmentoff |= (u64)m.pblk << 32;
+ }
+ if (ztailpacking && m.lcn == vi->z_tailextent_headlcn) {
+ map->m_flags |= EROFS_MAP_META;
+ map->m_pa = vi->z_idataoff;
+ map->m_plen = vi->z_idata_size;
+ } else if (fragment && m.lcn == vi->z_tailextent_headlcn) {
+ map->m_flags |= EROFS_MAP_FRAGMENT;
+ } else {
+ map->m_pa = blknr_to_addr(m.pblk);
+ err = z_erofs_get_extent_compressedlen(&m, initial_lcn);
+ if (err)
+ goto out;
+ }
+
+ if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_PLAIN) {
+ if (vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER)
+ map->m_algorithmformat =
+ Z_EROFS_COMPRESSION_INTERLACED;
+ else
+ map->m_algorithmformat =
+ Z_EROFS_COMPRESSION_SHIFTED;
+ } else if (m.headtype == Z_EROFS_VLE_CLUSTER_TYPE_HEAD2) {
+ map->m_algorithmformat = vi->z_algorithmtype[1];
+ } else {
+ map->m_algorithmformat = vi->z_algorithmtype[0];
+ }
+
+ if ((flags & EROFS_GET_BLOCKS_FIEMAP) ||
+ ((flags & EROFS_GET_BLOCKS_READMORE) &&
+ map->m_algorithmformat == Z_EROFS_COMPRESSION_LZMA &&
+ map->m_llen >= EROFS_BLKSIZ)) {
+ err = z_erofs_get_extent_decompressedlen(&m);
+ if (!err)
+ map->m_flags |= EROFS_MAP_FULL_MAPPED;
+ }
unmap_out:
- if (m.kaddr)
- kunmap_atomic(m.kaddr);
+ erofs_unmap_metabuf(&m.map->buf);
out:
erofs_dbg("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags 0%o",
__func__, map->m_la, map->m_pa,
map->m_llen, map->m_plen, map->m_flags);
+ return err;
+}
+
+int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
+ int flags)
+{
+ struct erofs_inode *const vi = EROFS_I(inode);
+ int err = 0;
+
+ trace_z_erofs_map_blocks_iter_enter(inode, map, flags);
+
+ /* when trying to read beyond EOF, leave it unmapped */
+ if (map->m_la >= inode->i_size) {
+ map->m_llen = map->m_la + 1 - inode->i_size;
+ map->m_la = inode->i_size;
+ map->m_flags = 0;
+ goto out;
+ }
+
+ err = z_erofs_fill_inode_lazy(inode);
+ if (err)
+ goto out;
+
+ if ((vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) &&
+ !vi->z_tailextent_headlcn) {
+ map->m_la = 0;
+ map->m_llen = inode->i_size;
+ map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_FULL_MAPPED |
+ EROFS_MAP_FRAGMENT;
+ goto out;
+ }
+
+ err = z_erofs_do_map_blocks(inode, map, flags);
+out:
trace_z_erofs_map_blocks_iter_exit(inode, map, flags, err);
/* aggressively BUG_ON iff CONFIG_EROFS_FS_DEBUG is on */
@@ -469,3 +767,40 @@ out:
return err;
}
+static int z_erofs_iomap_begin_report(struct inode *inode, loff_t offset,
+ loff_t length, unsigned int flags,
+ struct iomap *iomap, struct iomap *srcmap)
+{
+ int ret;
+ struct erofs_map_blocks map = { .m_la = offset };
+
+ ret = z_erofs_map_blocks_iter(inode, &map, EROFS_GET_BLOCKS_FIEMAP);
+ erofs_put_metabuf(&map.buf);
+ if (ret < 0)
+ return ret;
+
+ iomap->bdev = inode->i_sb->s_bdev;
+ iomap->offset = map.m_la;
+ iomap->length = map.m_llen;
+ if (map.m_flags & EROFS_MAP_MAPPED) {
+ iomap->type = IOMAP_MAPPED;
+ iomap->addr = map.m_flags & EROFS_MAP_FRAGMENT ?
+ IOMAP_NULL_ADDR : map.m_pa;
+ } else {
+ iomap->type = IOMAP_HOLE;
+ iomap->addr = IOMAP_NULL_ADDR;
+ /*
+		 * There is no strict rule on how to describe extents past EOF,
+		 * yet we need to do it as below; otherwise, iomap itself will
+		 * get into an endless loop on post-EOF.
+ */
+ if (iomap->offset >= inode->i_size)
+ iomap->length = length + map.m_la - offset;
+ }
+ iomap->flags = 0;
+ return 0;
+}
+
+const struct iomap_ops z_erofs_iomap_report_ops = {
+ .iomap_begin = z_erofs_iomap_begin_report,
+};
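
On the userspace side, report ops like the one above are what ultimately back FIEMAP queries on compressed inodes. A generic (not erofs-specific) query sketch, assuming a Linux host with <linux/fiemap.h>; extent counts and error handling are kept minimal for brevity.

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>
#include <linux/fiemap.h>

int main(int argc, char **argv)
{
	struct fiemap *fm;
	unsigned int i;
	int fd;

	if (argc != 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0)
		return 1;

	fm = calloc(1, sizeof(*fm) + 16 * sizeof(struct fiemap_extent));
	if (!fm)
		return 1;
	fm->fm_length = ~0ULL;		/* query the whole file */
	fm->fm_extent_count = 16;
	if (ioctl(fd, FS_IOC_FIEMAP, fm) < 0)
		return 1;

	for (i = 0; i < fm->fm_mapped_extents; i++)
		printf("logical %llu physical %llu length %llu flags %#x\n",
		       (unsigned long long)fm->fm_extents[i].fe_logical,
		       (unsigned long long)fm->fm_extents[i].fe_physical,
		       (unsigned long long)fm->fm_extents[i].fe_length,
		       fm->fm_extents[i].fe_flags);
	free(fm);
	close(fd);
	return 0;
}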
diff --git a/fs/erofs/zpvec.h b/fs/erofs/zpvec.h
deleted file mode 100644
index 58556903aa94..000000000000
--- a/fs/erofs/zpvec.h
+++ /dev/null
@@ -1,157 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0-only */
-/*
- * Copyright (C) 2018 HUAWEI, Inc.
- * http://www.huawei.com/
- * Created by Gao Xiang <gaoxiang25@huawei.com>
- */
-#ifndef __EROFS_FS_ZPVEC_H
-#define __EROFS_FS_ZPVEC_H
-
-#include "tagptr.h"
-
-/* page type in pagevec for decompress subsystem */
-enum z_erofs_page_type {
- /* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */
- Z_EROFS_PAGE_TYPE_EXCLUSIVE,
-
- Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED,
-
- Z_EROFS_VLE_PAGE_TYPE_HEAD,
- Z_EROFS_VLE_PAGE_TYPE_MAX
-};
-
-extern void __compiletime_error("Z_EROFS_PAGE_TYPE_EXCLUSIVE != 0")
- __bad_page_type_exclusive(void);
-
-/* pagevec tagged pointer */
-typedef tagptr2_t erofs_vtptr_t;
-
-/* pagevec collector */
-struct z_erofs_pagevec_ctor {
- struct page *curr, *next;
- erofs_vtptr_t *pages;
-
- unsigned int nr, index;
-};
-
-static inline void z_erofs_pagevec_ctor_exit(struct z_erofs_pagevec_ctor *ctor,
- bool atomic)
-{
- if (!ctor->curr)
- return;
-
- if (atomic)
- kunmap_atomic(ctor->pages);
- else
- kunmap(ctor->curr);
-}
-
-static inline struct page *
-z_erofs_pagevec_ctor_next_page(struct z_erofs_pagevec_ctor *ctor,
- unsigned int nr)
-{
- unsigned int index;
-
- /* keep away from occupied pages */
- if (ctor->next)
- return ctor->next;
-
- for (index = 0; index < nr; ++index) {
- const erofs_vtptr_t t = ctor->pages[index];
- const unsigned int tags = tagptr_unfold_tags(t);
-
- if (tags == Z_EROFS_PAGE_TYPE_EXCLUSIVE)
- return tagptr_unfold_ptr(t);
- }
- DBG_BUGON(nr >= ctor->nr);
- return NULL;
-}
-
-static inline void
-z_erofs_pagevec_ctor_pagedown(struct z_erofs_pagevec_ctor *ctor,
- bool atomic)
-{
- struct page *next = z_erofs_pagevec_ctor_next_page(ctor, ctor->nr);
-
- z_erofs_pagevec_ctor_exit(ctor, atomic);
-
- ctor->curr = next;
- ctor->next = NULL;
- ctor->pages = atomic ?
- kmap_atomic(ctor->curr) : kmap(ctor->curr);
-
- ctor->nr = PAGE_SIZE / sizeof(struct page *);
- ctor->index = 0;
-}
-
-static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor,
- unsigned int nr,
- erofs_vtptr_t *pages,
- unsigned int i)
-{
- ctor->nr = nr;
- ctor->curr = ctor->next = NULL;
- ctor->pages = pages;
-
- if (i >= nr) {
- i -= nr;
- z_erofs_pagevec_ctor_pagedown(ctor, false);
- while (i > ctor->nr) {
- i -= ctor->nr;
- z_erofs_pagevec_ctor_pagedown(ctor, false);
- }
- }
- ctor->next = z_erofs_pagevec_ctor_next_page(ctor, i);
- ctor->index = i;
-}
-
-static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor,
- struct page *page,
- enum z_erofs_page_type type,
- bool *occupied)
-{
- *occupied = false;
- if (!ctor->next && type)
- if (ctor->index + 1 == ctor->nr)
- return false;
-
- if (ctor->index >= ctor->nr)
- z_erofs_pagevec_ctor_pagedown(ctor, false);
-
- /* exclusive page type must be 0 */
- if (Z_EROFS_PAGE_TYPE_EXCLUSIVE != (uintptr_t)NULL)
- __bad_page_type_exclusive();
-
- /* should remind that collector->next never equal to 1, 2 */
- if (type == (uintptr_t)ctor->next) {
- ctor->next = page;
- *occupied = true;
- }
- ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, page, type);
- return true;
-}
-
-static inline struct page *
-z_erofs_pagevec_dequeue(struct z_erofs_pagevec_ctor *ctor,
- enum z_erofs_page_type *type)
-{
- erofs_vtptr_t t;
-
- if (ctor->index >= ctor->nr) {
- DBG_BUGON(!ctor->next);
- z_erofs_pagevec_ctor_pagedown(ctor, true);
- }
-
- t = ctor->pages[ctor->index];
-
- *type = tagptr_unfold_tags(t);
-
- /* should remind that collector->next never equal to 1, 2 */
- if (*type == (uintptr_t)ctor->next)
- ctor->next = tagptr_unfold_ptr(t);
-
- ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, NULL, 0);
- return tagptr_unfold_ptr(t);
-}
-#endif
-