aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation/filesystems
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation/filesystems')
-rw-r--r--Documentation/filesystems/00-INDEX153
-rw-r--r--Documentation/filesystems/ceph.txt5
-rw-r--r--Documentation/filesystems/dax.txt2
-rw-r--r--Documentation/filesystems/ext2.txt2
-rw-r--r--Documentation/filesystems/ext4/about.rst (renamed from Documentation/filesystems/ext4/ondisk/about.rst)0
-rw-r--r--Documentation/filesystems/ext4/allocators.rst (renamed from Documentation/filesystems/ext4/ondisk/allocators.rst)0
-rw-r--r--Documentation/filesystems/ext4/attributes.rst (renamed from Documentation/filesystems/ext4/ondisk/attributes.rst)8
-rw-r--r--Documentation/filesystems/ext4/bigalloc.rst (renamed from Documentation/filesystems/ext4/ondisk/bigalloc.rst)0
-rw-r--r--Documentation/filesystems/ext4/bitmaps.rst (renamed from Documentation/filesystems/ext4/ondisk/bitmaps.rst)0
-rw-r--r--Documentation/filesystems/ext4/blockgroup.rst (renamed from Documentation/filesystems/ext4/ondisk/blockgroup.rst)0
-rw-r--r--Documentation/filesystems/ext4/blockmap.rst (renamed from Documentation/filesystems/ext4/ondisk/blockmap.rst)0
-rw-r--r--Documentation/filesystems/ext4/blocks.rst (renamed from Documentation/filesystems/ext4/ondisk/blocks.rst)0
-rw-r--r--Documentation/filesystems/ext4/checksums.rst (renamed from Documentation/filesystems/ext4/ondisk/checksums.rst)2
-rw-r--r--Documentation/filesystems/ext4/directory.rst (renamed from Documentation/filesystems/ext4/ondisk/directory.rst)18
-rw-r--r--Documentation/filesystems/ext4/dynamic.rst (renamed from Documentation/filesystems/ext4/ondisk/dynamic.rst)0
-rw-r--r--Documentation/filesystems/ext4/eainode.rst (renamed from Documentation/filesystems/ext4/ondisk/eainode.rst)0
-rw-r--r--Documentation/filesystems/ext4/ext4.rst613
-rw-r--r--Documentation/filesystems/ext4/globals.rst (renamed from Documentation/filesystems/ext4/ondisk/globals.rst)0
-rw-r--r--Documentation/filesystems/ext4/group_descr.rst (renamed from Documentation/filesystems/ext4/ondisk/group_descr.rst)4
-rw-r--r--Documentation/filesystems/ext4/ifork.rst (renamed from Documentation/filesystems/ext4/ondisk/ifork.rst)8
-rw-r--r--Documentation/filesystems/ext4/index.rst19
-rw-r--r--Documentation/filesystems/ext4/inlinedata.rst (renamed from Documentation/filesystems/ext4/ondisk/inlinedata.rst)0
-rw-r--r--Documentation/filesystems/ext4/inodes.rst (renamed from Documentation/filesystems/ext4/ondisk/inodes.rst)19
-rw-r--r--Documentation/filesystems/ext4/journal.rst (renamed from Documentation/filesystems/ext4/ondisk/journal.rst)32
-rw-r--r--Documentation/filesystems/ext4/mmp.rst (renamed from Documentation/filesystems/ext4/ondisk/mmp.rst)2
-rw-r--r--Documentation/filesystems/ext4/ondisk/index.rst9
-rw-r--r--Documentation/filesystems/ext4/overview.rst (renamed from Documentation/filesystems/ext4/ondisk/overview.rst)0
-rw-r--r--Documentation/filesystems/ext4/special_inodes.rst (renamed from Documentation/filesystems/ext4/ondisk/special_inodes.rst)2
-rw-r--r--Documentation/filesystems/ext4/super.rst (renamed from Documentation/filesystems/ext4/ondisk/super.rst)24
-rw-r--r--Documentation/filesystems/f2fs.txt8
-rw-r--r--Documentation/filesystems/fscrypt.rst10
-rw-r--r--Documentation/filesystems/nfs/00-INDEX26
-rw-r--r--Documentation/filesystems/nfs/rpc-cache.txt6
-rw-r--r--Documentation/filesystems/overlayfs.txt6
-rw-r--r--Documentation/filesystems/pohmelfs/design_notes.txt72
-rw-r--r--Documentation/filesystems/pohmelfs/info.txt99
-rw-r--r--Documentation/filesystems/pohmelfs/network_protocol.txt227
-rw-r--r--Documentation/filesystems/porting11
-rw-r--r--Documentation/filesystems/proc.txt4
-rw-r--r--Documentation/filesystems/vfs.txt21
40 files changed, 125 insertions, 1287 deletions
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
deleted file mode 100644
index 0937bade1099..000000000000
--- a/Documentation/filesystems/00-INDEX
+++ /dev/null
@@ -1,153 +0,0 @@
-00-INDEX
- - this file (info on some of the filesystems supported by linux).
-Locking
- - info on locking rules as they pertain to Linux VFS.
-9p.txt
- - 9p (v9fs) is an implementation of the Plan 9 remote fs protocol.
-adfs.txt
- - info and mount options for the Acorn Advanced Disc Filing System.
-afs.txt
- - info and examples for the distributed AFS (Andrew File System) fs.
-affs.txt
- - info and mount options for the Amiga Fast File System.
-autofs-mount-control.txt
- - info on device control operations for autofs module.
-automount-support.txt
- - information about filesystem automount support.
-befs.txt
- - information about the BeOS filesystem for Linux.
-bfs.txt
- - info for the SCO UnixWare Boot Filesystem (BFS).
-btrfs.txt
- - info for the BTRFS filesystem.
-caching/
- - directory containing filesystem cache documentation.
-ceph.txt
- - info for the Ceph Distributed File System.
-cifs/
- - directory containing CIFS filesystem documentation and example code.
-coda.txt
- - description of the CODA filesystem.
-configfs/
- - directory containing configfs documentation and example code.
-cramfs.txt
- - info on the cram filesystem for small storage (ROMs etc).
-dax.txt
- - info on avoiding the page cache for files stored on CPU-addressable
- storage devices.
-debugfs.txt
- - info on the debugfs filesystem.
-devpts.txt
- - info on the devpts filesystem.
-directory-locking
- - info about the locking scheme used for directory operations.
-dlmfs.txt
- - info on the userspace interface to the OCFS2 DLM.
-dnotify.txt
- - info about directory notification in Linux.
-dnotify_test.c
- - example program for dnotify.
-ecryptfs.txt
- - docs on eCryptfs: stacked cryptographic filesystem for Linux.
-efivarfs.txt
- - info for the efivarfs filesystem.
-exofs.txt
- - info, usage, mount options, design about EXOFS.
-ext2.txt
- - info, mount options and specifications for the Ext2 filesystem.
-ext3.txt
- - info, mount options and specifications for the Ext3 filesystem.
-ext4.txt
- - info, mount options and specifications for the Ext4 filesystem.
-f2fs.txt
- - info and mount options for the F2FS filesystem.
-fiemap.txt
- - info on fiemap ioctl.
-files.txt
- - info on file management in the Linux kernel.
-fuse.txt
- - info on the Filesystem in User SpacE including mount options.
-gfs2-glocks.txt
- - info on the Global File System 2 - Glock internal locking rules.
-gfs2-uevents.txt
- - info on the Global File System 2 - uevents.
-gfs2.txt
- - info on the Global File System 2.
-hfs.txt
- - info on the Macintosh HFS Filesystem for Linux.
-hfsplus.txt
- - info on the Macintosh HFSPlus Filesystem for Linux.
-hpfs.txt
- - info and mount options for the OS/2 HPFS.
-inotify.txt
- - info on the powerful yet simple file change notification system.
-isofs.txt
- - info and mount options for the ISO 9660 (CDROM) filesystem.
-jfs.txt
- - info and mount options for the JFS filesystem.
-locks.txt
- - info on file locking implementations, flock() vs. fcntl(), etc.
-mandatory-locking.txt
- - info on the Linux implementation of Sys V mandatory file locking.
-nfs/
- - nfs-related documentation.
-nilfs2.txt
- - info and mount options for the NILFS2 filesystem.
-ntfs.txt
- - info and mount options for the NTFS filesystem (Windows NT).
-ocfs2.txt
- - info and mount options for the OCFS2 clustered filesystem.
-omfs.txt
- - info on the Optimized MPEG FileSystem.
-path-lookup.txt
- - info on path walking and name lookup locking.
-pohmelfs/
- - directory containing pohmelfs filesystem documentation.
-porting
- - various information on filesystem porting.
-proc.txt
- - info on Linux's /proc filesystem.
-qnx6.txt
- - info on the QNX6 filesystem.
-quota.txt
- - info on Quota subsystem.
-ramfs-rootfs-initramfs.txt
- - info on the 'in memory' filesystems ramfs, rootfs and initramfs.
-relay.txt
- - info on relay, for efficient streaming from kernel to user space.
-romfs.txt
- - description of the ROMFS filesystem.
-seq_file.txt
- - how to use the seq_file API.
-sharedsubtree.txt
- - a description of shared subtrees for namespaces.
-spufs.txt
- - info and mount options for the SPU filesystem used on Cell.
-squashfs.txt
- - info on the squashfs filesystem.
-sysfs-pci.txt
- - info on accessing PCI device resources through sysfs.
-sysfs-tagging.txt
- - info on sysfs tagging to avoid duplicates.
-sysfs.txt
- - info on sysfs, a ram-based filesystem for exporting kernel objects.
-sysv-fs.txt
- - info on the SystemV/V7/Xenix/Coherent filesystem.
-tmpfs.txt
- - info on tmpfs, a filesystem that holds all files in virtual memory.
-ubifs.txt
- - info on the Unsorted Block Images FileSystem.
-udf.txt
- - info and mount options for the UDF filesystem.
-ufs.txt
- - info on the ufs filesystem.
-vfat.txt
- - info on using the VFAT filesystem used in Windows NT and Windows 95.
-vfs.txt
- - overview of the Virtual File System.
-xfs-delayed-logging-design.txt
- - info on the XFS Delayed Logging Design.
-xfs-self-describing-metadata.txt
- - info on XFS Self Describing Metadata.
-xfs.txt
- - info and mount options for the XFS filesystem.
diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt
index 8bf62240e10d..1177052701e1 100644
--- a/Documentation/filesystems/ceph.txt
+++ b/Documentation/filesystems/ceph.txt
@@ -151,6 +151,11 @@ Mount Options
Report overall filesystem usage in statfs instead of using the root
directory quota.
+ nocopyfrom
+ Don't use the RADOS 'copy-from' operation to perform remote object
+ copies. Currently, it's only used in copy_file_range, which will revert
+ to the default VFS implementation if this option is used.
+
More Information
================
diff --git a/Documentation/filesystems/dax.txt b/Documentation/filesystems/dax.txt
index 70cb68bed2e8..bc393e0a22b8 100644
--- a/Documentation/filesystems/dax.txt
+++ b/Documentation/filesystems/dax.txt
@@ -75,7 +75,7 @@ exposure of uninitialized data through mmap.
These filesystems may be used for inspiration:
- ext2: see Documentation/filesystems/ext2.txt
-- ext4: see Documentation/filesystems/ext4.txt
+- ext4: see Documentation/filesystems/ext4/ext4.rst
- xfs: see Documentation/filesystems/xfs.txt
diff --git a/Documentation/filesystems/ext2.txt b/Documentation/filesystems/ext2.txt
index 81c0becab225..a45c9fc0747b 100644
--- a/Documentation/filesystems/ext2.txt
+++ b/Documentation/filesystems/ext2.txt
@@ -358,7 +358,7 @@ and are copied into the filesystem. If a transaction is incomplete at
the time of the crash, then there is no guarantee of consistency for
the blocks in that transaction so they are discarded (which means any
filesystem changes they represent are also lost).
-Check Documentation/filesystems/ext4.txt if you want to read more about
+Check Documentation/filesystems/ext4/ext4.rst if you want to read more about
ext4 and journaling.
References
diff --git a/Documentation/filesystems/ext4/ondisk/about.rst b/Documentation/filesystems/ext4/about.rst
index 0aadba052264..0aadba052264 100644
--- a/Documentation/filesystems/ext4/ondisk/about.rst
+++ b/Documentation/filesystems/ext4/about.rst
diff --git a/Documentation/filesystems/ext4/ondisk/allocators.rst b/Documentation/filesystems/ext4/allocators.rst
index 7aa85152ace3..7aa85152ace3 100644
--- a/Documentation/filesystems/ext4/ondisk/allocators.rst
+++ b/Documentation/filesystems/ext4/allocators.rst
diff --git a/Documentation/filesystems/ext4/ondisk/attributes.rst b/Documentation/filesystems/ext4/attributes.rst
index 0b01b67b81fe..54386a010a8d 100644
--- a/Documentation/filesystems/ext4/ondisk/attributes.rst
+++ b/Documentation/filesystems/ext4/attributes.rst
@@ -30,7 +30,7 @@ Extended attributes, when stored after the inode, have a header
``ext4_xattr_ibody_header`` that is 4 bytes long:
.. list-table::
- :widths: 1 1 1 77
+ :widths: 8 8 24 40
:header-rows: 1
* - Offset
@@ -47,7 +47,7 @@ The beginning of an extended attribute block is in
``struct ext4_xattr_header``, which is 32 bytes long:
.. list-table::
- :widths: 1 1 1 77
+ :widths: 8 8 24 40
:header-rows: 1
* - Offset
@@ -92,7 +92,7 @@ entries must be stored in sorted order. The sort order is
Attributes stored inside an inode do not need be stored in sorted order.
.. list-table::
- :widths: 1 1 1 77
+ :widths: 8 8 24 40
:header-rows: 1
* - Offset
@@ -157,7 +157,7 @@ attribute name index field is set, and matching string is removed from
the key name. Here is a map of name index values to key prefixes:
.. list-table::
- :widths: 1 79
+ :widths: 16 64
:header-rows: 1
* - Name Index
diff --git a/Documentation/filesystems/ext4/ondisk/bigalloc.rst b/Documentation/filesystems/ext4/bigalloc.rst
index c6d88557553c..c6d88557553c 100644
--- a/Documentation/filesystems/ext4/ondisk/bigalloc.rst
+++ b/Documentation/filesystems/ext4/bigalloc.rst
diff --git a/Documentation/filesystems/ext4/ondisk/bitmaps.rst b/Documentation/filesystems/ext4/bitmaps.rst
index c7546dbc197a..c7546dbc197a 100644
--- a/Documentation/filesystems/ext4/ondisk/bitmaps.rst
+++ b/Documentation/filesystems/ext4/bitmaps.rst
diff --git a/Documentation/filesystems/ext4/ondisk/blockgroup.rst b/Documentation/filesystems/ext4/blockgroup.rst
index baf888e4c06a..baf888e4c06a 100644
--- a/Documentation/filesystems/ext4/ondisk/blockgroup.rst
+++ b/Documentation/filesystems/ext4/blockgroup.rst
diff --git a/Documentation/filesystems/ext4/ondisk/blockmap.rst b/Documentation/filesystems/ext4/blockmap.rst
index 30e25750d88a..30e25750d88a 100644
--- a/Documentation/filesystems/ext4/ondisk/blockmap.rst
+++ b/Documentation/filesystems/ext4/blockmap.rst
diff --git a/Documentation/filesystems/ext4/ondisk/blocks.rst b/Documentation/filesystems/ext4/blocks.rst
index 73d4dc0f7bda..73d4dc0f7bda 100644
--- a/Documentation/filesystems/ext4/ondisk/blocks.rst
+++ b/Documentation/filesystems/ext4/blocks.rst
diff --git a/Documentation/filesystems/ext4/ondisk/checksums.rst b/Documentation/filesystems/ext4/checksums.rst
index 9d6a793b2e03..5519e253810d 100644
--- a/Documentation/filesystems/ext4/ondisk/checksums.rst
+++ b/Documentation/filesystems/ext4/checksums.rst
@@ -28,7 +28,7 @@ of checksum. The checksum function is whatever the superblock describes
(crc32c as of October 2013) unless noted otherwise.
.. list-table::
- :widths: 1 1 4
+ :widths: 20 8 50
:header-rows: 1
* - Metadata
diff --git a/Documentation/filesystems/ext4/ondisk/directory.rst b/Documentation/filesystems/ext4/directory.rst
index 8fcba68c2884..614034e24669 100644
--- a/Documentation/filesystems/ext4/ondisk/directory.rst
+++ b/Documentation/filesystems/ext4/directory.rst
@@ -34,7 +34,7 @@ is at most 263 bytes long, though on disk you'll need to reference
``dirent.rec_len`` to know for sure.
.. list-table::
- :widths: 1 1 1 77
+ :widths: 8 8 24 40
:header-rows: 1
* - Offset
@@ -66,7 +66,7 @@ tree traversal. This format is ``ext4_dir_entry_2``, which is at most
``dirent.rec_len`` to know for sure.
.. list-table::
- :widths: 1 1 1 77
+ :widths: 8 8 24 40
:header-rows: 1
* - Offset
@@ -99,7 +99,7 @@ tree traversal. This format is ``ext4_dir_entry_2``, which is at most
The directory file type is one of the following values:
.. list-table::
- :widths: 1 79
+ :widths: 16 64
:header-rows: 1
* - Value
@@ -130,7 +130,7 @@ in the place where the name normally goes. The structure is
``struct ext4_dir_entry_tail``:
.. list-table::
- :widths: 1 1 1 77
+ :widths: 8 8 24 40
:header-rows: 1
* - Offset
@@ -212,7 +212,7 @@ The root of the htree is in ``struct dx_root``, which is the full length
of a data block:
.. list-table::
- :widths: 1 1 1 77
+ :widths: 8 8 24 40
:header-rows: 1
* - Offset
@@ -305,7 +305,7 @@ of a data block:
The directory hash is one of the following values:
.. list-table::
- :widths: 1 79
+ :widths: 16 64
:header-rows: 1
* - Value
@@ -327,7 +327,7 @@ Interior nodes of an htree are recorded as ``struct dx_node``, which is
also the full length of a data block:
.. list-table::
- :widths: 1 1 1 77
+ :widths: 8 8 24 40
:header-rows: 1
* - Offset
@@ -375,7 +375,7 @@ The hash maps that exist in both ``struct dx_root`` and
long:
.. list-table::
- :widths: 1 1 1 77
+ :widths: 8 8 24 40
:header-rows: 1
* - Offset
@@ -405,7 +405,7 @@ directory index (which will ensure that there's space for the checksum.
The dx\_tail structure is 8 bytes long and looks like this:
.. list-table::
- :widths: 1 1 1 77
+ :widths: 8 8 24 40
:header-rows: 1
* - Offset
diff --git a/Documentation/filesystems/ext4/ondisk/dynamic.rst b/Documentation/filesystems/ext4/dynamic.rst
index bb0c84333341..bb0c84333341 100644
--- a/Documentation/filesystems/ext4/ondisk/dynamic.rst
+++ b/Documentation/filesystems/ext4/dynamic.rst
diff --git a/Documentation/filesystems/ext4/ondisk/eainode.rst b/Documentation/filesystems/ext4/eainode.rst
index ecc0d01a0a72..ecc0d01a0a72 100644
--- a/Documentation/filesystems/ext4/ondisk/eainode.rst
+++ b/Documentation/filesystems/ext4/eainode.rst
diff --git a/Documentation/filesystems/ext4/ext4.rst b/Documentation/filesystems/ext4/ext4.rst
deleted file mode 100644
index 9d4368d591fa..000000000000
--- a/Documentation/filesystems/ext4/ext4.rst
+++ /dev/null
@@ -1,613 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-========================
-General Information
-========================
-
-Ext4 is an advanced level of the ext3 filesystem which incorporates
-scalability and reliability enhancements for supporting large filesystems
-(64 bit) in keeping with increasing disk capacities and state-of-the-art
-feature requirements.
-
-Mailing list: linux-ext4@vger.kernel.org
-Web site: http://ext4.wiki.kernel.org
-
-
-Quick usage instructions
-========================
-
-Note: More extensive information for getting started with ext4 can be
-found at the ext4 wiki site at the URL:
-http://ext4.wiki.kernel.org/index.php/Ext4_Howto
-
- - The latest version of e2fsprogs can be found at:
-
- https://www.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/
-
- or
-
- http://sourceforge.net/project/showfiles.php?group_id=2406
-
- or grab the latest git repository from:
-
- https://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git
-
- - Create a new filesystem using the ext4 filesystem type:
-
- # mke2fs -t ext4 /dev/hda1
-
- Or to configure an existing ext3 filesystem to support extents:
-
- # tune2fs -O extents /dev/hda1
-
- If the filesystem was created with 128 byte inodes, it can be
- converted to use 256 byte for greater efficiency via:
-
- # tune2fs -I 256 /dev/hda1
-
- - Mounting:
-
- # mount -t ext4 /dev/hda1 /wherever
-
- - When comparing performance with other filesystems, it's always
- important to try multiple workloads; very often a subtle change in a
- workload parameter can completely change the ranking of which
- filesystems do well compared to others. When comparing versus ext3,
- note that ext4 enables write barriers by default, while ext3 does
- not enable write barriers by default. So it is useful to use
- explicitly specify whether barriers are enabled or not when via the
- '-o barriers=[0|1]' mount option for both ext3 and ext4 filesystems
- for a fair comparison. When tuning ext3 for best benchmark numbers,
- it is often worthwhile to try changing the data journaling mode; '-o
- data=writeback' can be faster for some workloads. (Note however that
- running mounted with data=writeback can potentially leave stale data
- exposed in recently written files in case of an unclean shutdown,
- which could be a security exposure in some situations.) Configuring
- the filesystem with a large journal can also be helpful for
- metadata-intensive workloads.
-
-Features
-========
-
-Currently Available
--------------------
-
-* ability to use filesystems > 16TB (e2fsprogs support not available yet)
-* extent format reduces metadata overhead (RAM, IO for access, transactions)
-* extent format more robust in face of on-disk corruption due to magics,
-* internal redundancy in tree
-* improved file allocation (multi-block alloc)
-* lift 32000 subdirectory limit imposed by i_links_count[1]
-* nsec timestamps for mtime, atime, ctime, create time
-* inode version field on disk (NFSv4, Lustre)
-* reduced e2fsck time via uninit_bg feature
-* journal checksumming for robustness, performance
-* persistent file preallocation (e.g for streaming media, databases)
-* ability to pack bitmaps and inode tables into larger virtual groups via the
- flex_bg feature
-* large file support
-* inode allocation using large virtual block groups via flex_bg
-* delayed allocation
-* large block (up to pagesize) support
-* efficient new ordered mode in JBD2 and ext4 (avoid using buffer head to force
- the ordering)
-
-[1] Filesystems with a block size of 1k may see a limit imposed by the
-directory hash tree having a maximum depth of two.
-
-Options
-=======
-
-When mounting an ext4 filesystem, the following option are accepted:
-(*) == default
-
-======================= =======================================================
-Mount Option Description
-======================= =======================================================
-ro Mount filesystem read only. Note that ext4 will
- replay the journal (and thus write to the
- partition) even when mounted "read only". The
- mount options "ro,noload" can be used to prevent
- writes to the filesystem.
-
-journal_checksum Enable checksumming of the journal transactions.
- This will allow the recovery code in e2fsck and the
- kernel to detect corruption in the kernel. It is a
- compatible change and will be ignored by older kernels.
-
-journal_async_commit Commit block can be written to disk without waiting
- for descriptor blocks. If enabled older kernels cannot
- mount the device. This will enable 'journal_checksum'
- internally.
-
-journal_path=path
-journal_dev=devnum When the external journal device's major/minor numbers
- have changed, these options allow the user to specify
- the new journal location. The journal device is
- identified through either its new major/minor numbers
- encoded in devnum, or via a path to the device.
-
-norecovery Don't load the journal on mounting. Note that
-noload if the filesystem was not unmounted cleanly,
- skipping the journal replay will lead to the
- filesystem containing inconsistencies that can
- lead to any number of problems.
-
-data=journal All data are committed into the journal prior to being
- written into the main file system. Enabling
- this mode will disable delayed allocation and
- O_DIRECT support.
-
-data=ordered (*) All data are forced directly out to the main file
- system prior to its metadata being committed to the
- journal.
-
-data=writeback Data ordering is not preserved, data may be written
- into the main file system after its metadata has been
- committed to the journal.
-
-commit=nrsec (*) Ext4 can be told to sync all its data and metadata
- every 'nrsec' seconds. The default value is 5 seconds.
- This means that if you lose your power, you will lose
- as much as the latest 5 seconds of work (your
- filesystem will not be damaged though, thanks to the
- journaling). This default value (or any low value)
- will hurt performance, but it's good for data-safety.
- Setting it to 0 will have the same effect as leaving
- it at the default (5 seconds).
- Setting it to very large values will improve
- performance.
-
-barrier=<0|1(*)> This enables/disables the use of write barriers in
-barrier(*) the jbd code. barrier=0 disables, barrier=1 enables.
-nobarrier This also requires an IO stack which can support
- barriers, and if jbd gets an error on a barrier
- write, it will disable again with a warning.
- Write barriers enforce proper on-disk ordering
- of journal commits, making volatile disk write caches
- safe to use, at some performance penalty. If
- your disks are battery-backed in one way or another,
- disabling barriers may safely improve performance.
- The mount options "barrier" and "nobarrier" can
- also be used to enable or disable barriers, for
- consistency with other ext4 mount options.
-
-inode_readahead_blks=n This tuning parameter controls the maximum
- number of inode table blocks that ext4's inode
- table readahead algorithm will pre-read into
- the buffer cache. The default value is 32 blocks.
-
-nouser_xattr Disables Extended User Attributes. See the
- attr(5) manual page for more information about
- extended attributes.
-
-noacl This option disables POSIX Access Control List
- support. If ACL support is enabled in the kernel
- configuration (CONFIG_EXT4_FS_POSIX_ACL), ACL is
- enabled by default on mount. See the acl(5) manual
- page for more information about acl.
-
-bsddf (*) Make 'df' act like BSD.
-minixdf Make 'df' act like Minix.
-
-debug Extra debugging information is sent to syslog.
-
-abort Simulate the effects of calling ext4_abort() for
- debugging purposes. This is normally used while
- remounting a filesystem which is already mounted.
-
-errors=remount-ro Remount the filesystem read-only on an error.
-errors=continue Keep going on a filesystem error.
-errors=panic Panic and halt the machine if an error occurs.
- (These mount options override the errors behavior
- specified in the superblock, which can be configured
- using tune2fs)
-
-data_err=ignore(*) Just print an error message if an error occurs
- in a file data buffer in ordered mode.
-data_err=abort Abort the journal if an error occurs in a file
- data buffer in ordered mode.
-
-grpid New objects have the group ID of their parent.
-bsdgroups
-
-nogrpid (*) New objects have the group ID of their creator.
-sysvgroups
-
-resgid=n The group ID which may use the reserved blocks.
-
-resuid=n The user ID which may use the reserved blocks.
-
-sb=n Use alternate superblock at this location.
-
-quota These options are ignored by the filesystem. They
-noquota are used only by quota tools to recognize volumes
-grpquota where quota should be turned on. See documentation
-usrquota in the quota-tools package for more details
- (http://sourceforge.net/projects/linuxquota).
-
-jqfmt=<quota type> These options tell filesystem details about quota
-usrjquota=<file> so that quota information can be properly updated
-grpjquota=<file> during journal replay. They replace the above
- quota options. See documentation in the quota-tools
- package for more details
- (http://sourceforge.net/projects/linuxquota).
-
-stripe=n Number of filesystem blocks that mballoc will try
- to use for allocation size and alignment. For RAID5/6
- systems this should be the number of data
- disks * RAID chunk size in file system blocks.
-
-delalloc (*) Defer block allocation until just before ext4
- writes out the block(s) in question. This
- allows ext4 to better allocation decisions
- more efficiently.
-nodelalloc Disable delayed allocation. Blocks are allocated
- when the data is copied from userspace to the
- page cache, either via the write(2) system call
- or when an mmap'ed page which was previously
- unallocated is written for the first time.
-
-max_batch_time=usec Maximum amount of time ext4 should wait for
- additional filesystem operations to be batch
- together with a synchronous write operation.
- Since a synchronous write operation is going to
- force a commit and then a wait for the I/O
- complete, it doesn't cost much, and can be a
- huge throughput win, we wait for a small amount
- of time to see if any other transactions can
- piggyback on the synchronous write. The
- algorithm used is designed to automatically tune
- for the speed of the disk, by measuring the
- amount of time (on average) that it takes to
- finish committing a transaction. Call this time
- the "commit time". If the time that the
- transaction has been running is less than the
- commit time, ext4 will try sleeping for the
- commit time to see if other operations will join
- the transaction. The commit time is capped by
- the max_batch_time, which defaults to 15000us
- (15ms). This optimization can be turned off
- entirely by setting max_batch_time to 0.
-
-min_batch_time=usec This parameter sets the commit time (as
- described above) to be at least min_batch_time.
- It defaults to zero microseconds. Increasing
- this parameter may improve the throughput of
- multi-threaded, synchronous workloads on very
- fast disks, at the cost of increasing latency.
-
-journal_ioprio=prio The I/O priority (from 0 to 7, where 0 is the
- highest priority) which should be used for I/O
- operations submitted by kjournald2 during a
- commit operation. This defaults to 3, which is
- a slightly higher priority than the default I/O
- priority.
-
-auto_da_alloc(*) Many broken applications don't use fsync() when
-noauto_da_alloc replacing existing files via patterns such as
- fd = open("foo.new")/write(fd,..)/close(fd)/
- rename("foo.new", "foo"), or worse yet,
- fd = open("foo", O_TRUNC)/write(fd,..)/close(fd).
- If auto_da_alloc is enabled, ext4 will detect
- the replace-via-rename and replace-via-truncate
- patterns and force that any delayed allocation
- blocks are allocated such that at the next
- journal commit, in the default data=ordered
- mode, the data blocks of the new file are forced
- to disk before the rename() operation is
- committed. This provides roughly the same level
- of guarantees as ext3, and avoids the
- "zero-length" problem that can happen when a
- system crashes before the delayed allocation
- blocks are forced to disk.
-
-noinit_itable Do not initialize any uninitialized inode table
- blocks in the background. This feature may be
- used by installation CD's so that the install
- process can complete as quickly as possible; the
- inode table initialization process would then be
- deferred until the next time the file system
- is unmounted.
-
-init_itable=n The lazy itable init code will wait n times the
- number of milliseconds it took to zero out the
- previous block group's inode table. This
- minimizes the impact on the system performance
- while file system's inode table is being initialized.
-
-discard Controls whether ext4 should issue discard/TRIM
-nodiscard(*) commands to the underlying block device when
- blocks are freed. This is useful for SSD devices
- and sparse/thinly-provisioned LUNs, but it is off
- by default until sufficient testing has been done.
-
-nouid32 Disables 32-bit UIDs and GIDs. This is for
- interoperability with older kernels which only
- store and expect 16-bit values.
-
-block_validity(*) These options enable or disable the in-kernel
-noblock_validity facility for tracking filesystem metadata blocks
- within internal data structures. This allows multi-
- block allocator and other routines to notice
- bugs or corrupted allocation bitmaps which cause
- blocks to be allocated which overlap with
- filesystem metadata blocks.
-
-dioread_lock Controls whether or not ext4 should use the DIO read
-dioread_nolock locking. If the dioread_nolock option is specified
- ext4 will allocate uninitialized extent before buffer
- write and convert the extent to initialized after IO
- completes. This approach allows ext4 code to avoid
- using inode mutex, which improves scalability on high
- speed storages. However this does not work with
- data journaling and dioread_nolock option will be
- ignored with kernel warning. Note that dioread_nolock
- code path is only used for extent-based files.
- Because of the restrictions this options comprises
- it is off by default (e.g. dioread_lock).
-
-max_dir_size_kb=n This limits the size of directories so that any
- attempt to expand them beyond the specified
- limit in kilobytes will cause an ENOSPC error.
- This is useful in memory constrained
- environments, where a very large directory can
- cause severe performance problems or even
- provoke the Out Of Memory killer. (For example,
- if there is only 512mb memory available, a 176mb
- directory may seriously cramp the system's style.)
-
-i_version Enable 64-bit inode version support. This option is
- off by default.
-
-dax Use direct access (no page cache). See
- Documentation/filesystems/dax.txt. Note that
- this option is incompatible with data=journal.
-======================= =======================================================
-
-Data Mode
-=========
-There are 3 different data modes:
-
-* writeback mode
-
- In data=writeback mode, ext4 does not journal data at all. This mode provides
- a similar level of journaling as that of XFS, JFS, and ReiserFS in its default
- mode - metadata journaling. A crash+recovery can cause incorrect data to
- appear in files which were written shortly before the crash. This mode will
- typically provide the best ext4 performance.
-
-* ordered mode
-
- In data=ordered mode, ext4 only officially journals metadata, but it logically
- groups metadata information related to data changes with the data blocks into
- a single unit called a transaction. When it's time to write the new metadata
- out to disk, the associated data blocks are written first. In general, this
- mode performs slightly slower than writeback but significantly faster than
- journal mode.
-
-* journal mode
-
- data=journal mode provides full data and metadata journaling. All new data is
- written to the journal first, and then to its final location. In the event of
- a crash, the journal can be replayed, bringing both data and metadata into a
- consistent state. This mode is the slowest except when data needs to be read
- from and written to disk at the same time where it outperforms all others
- modes. Enabling this mode will disable delayed allocation and O_DIRECT
- support.
-
-/proc entries
-=============
-
-Information about mounted ext4 file systems can be found in
-/proc/fs/ext4. Each mounted filesystem will have a directory in
-/proc/fs/ext4 based on its device name (i.e., /proc/fs/ext4/hdc or
-/proc/fs/ext4/dm-0). The files in each per-device directory are shown
-in table below.
-
-Files in /proc/fs/ext4/<devname>
-
-================ =======
- File Content
-================ =======
- mb_groups details of multiblock allocator buddy cache of free blocks
-================ =======
-
-/sys entries
-============
-
-Information about mounted ext4 file systems can be found in
-/sys/fs/ext4. Each mounted filesystem will have a directory in
-/sys/fs/ext4 based on its device name (i.e., /sys/fs/ext4/hdc or
-/sys/fs/ext4/dm-0). The files in each per-device directory are shown
-in table below.
-
-Files in /sys/fs/ext4/<devname>:
-
-(see also Documentation/ABI/testing/sysfs-fs-ext4)
-
-============================= =================================================
-File Content
-============================= =================================================
- delayed_allocation_blocks This file is read-only and shows the number of
- blocks that are dirty in the page cache, but
- which do not have their location in the
- filesystem allocated yet.
-
-inode_goal Tuning parameter which (if non-zero) controls
- the goal inode used by the inode allocator in
- preference to all other allocation heuristics.
- This is intended for debugging use only, and
- should be 0 on production systems.
-
-inode_readahead_blks Tuning parameter which controls the maximum
- number of inode table blocks that ext4's inode
- table readahead algorithm will pre-read into
- the buffer cache
-
-lifetime_write_kbytes This file is read-only and shows the number of
- kilobytes of data that have been written to this
- filesystem since it was created.
-
- max_writeback_mb_bump The maximum number of megabytes the writeback
- code will try to write out before move on to
- another inode.
-
- mb_group_prealloc The multiblock allocator will round up allocation
- requests to a multiple of this tuning parameter if
- the stripe size is not set in the ext4 superblock
-
- mb_max_to_scan The maximum number of extents the multiblock
- allocator will search to find the best extent
-
- mb_min_to_scan The minimum number of extents the multiblock
- allocator will search to find the best extent
-
- mb_order2_req Tuning parameter which controls the minimum size
- for requests (as a power of 2) where the buddy
- cache is used
-
- mb_stats Controls whether the multiblock allocator should
- collect statistics, which are shown during the
- unmount. 1 means to collect statistics, 0 means
- not to collect statistics
-
- mb_stream_req Files which have fewer blocks than this tunable
- parameter will have their blocks allocated out
- of a block group specific preallocation pool, so
- that small files are packed closely together.
- Each large file will have its blocks allocated
- out of its own unique preallocation pool.
-
- session_write_kbytes This file is read-only and shows the number of
- kilobytes of data that have been written to this
- filesystem since it was mounted.
-
- reserved_clusters This is RW file and contains number of reserved
- clusters in the file system which will be used
- in the specific situations to avoid costly
- zeroout, unexpected ENOSPC, or possible data
- loss. The default is 2% or 4096 clusters,
- whichever is smaller and this can be changed
- however it can never exceed number of clusters
- in the file system. If there is not enough space
- for the reserved space when mounting the file
- mount will _not_ fail.
-============================= =================================================
-
-Ioctls
-======
-
-There is some Ext4 specific functionality which can be accessed by applications
-through the system call interfaces. The list of all Ext4 specific ioctls are
-shown in the table below.
-
-Table of Ext4 specific ioctls
-
-============================= =================================================
-Ioctl Description
-============================= =================================================
- EXT4_IOC_GETFLAGS Get additional attributes associated with inode.
- The ioctl argument is an integer bitfield, with
- bit values described in ext4.h. This ioctl is an
- alias for FS_IOC_GETFLAGS.
-
- EXT4_IOC_SETFLAGS Set additional attributes associated with inode.
- The ioctl argument is an integer bitfield, with
- bit values described in ext4.h. This ioctl is an
- alias for FS_IOC_SETFLAGS.
-
- EXT4_IOC_GETVERSION
- EXT4_IOC_GETVERSION_OLD
- Get the inode i_generation number stored for
- each inode. The i_generation number is normally
- changed only when new inode is created and it is
- particularly useful for network filesystems. The
- '_OLD' version of this ioctl is an alias for
- FS_IOC_GETVERSION.
-
- EXT4_IOC_SETVERSION
- EXT4_IOC_SETVERSION_OLD
- Set the inode i_generation number stored for
- each inode. The '_OLD' version of this ioctl
- is an alias for FS_IOC_SETVERSION.
-
- EXT4_IOC_GROUP_EXTEND This ioctl has the same purpose as the resize
- mount option. It allows to resize filesystem
- to the end of the last existing block group,
- further resize has to be done with resize2fs,
- either online, or offline. The argument points
- to the unsigned logn number representing the
- filesystem new block count.
-
- EXT4_IOC_MOVE_EXT Move the block extents from orig_fd (the one
- this ioctl is pointing to) to the donor_fd (the
- one specified in move_extent structure passed
- as an argument to this ioctl). Then, exchange
- inode metadata between orig_fd and donor_fd.
- This is especially useful for online
- defragmentation, because the allocator has the
- opportunity to allocate moved blocks better,
- ideally into one contiguous extent.
-
- EXT4_IOC_GROUP_ADD Add a new group descriptor to an existing or
- new group descriptor block. The new group
- descriptor is described by ext4_new_group_input
- structure, which is passed as an argument to
- this ioctl. This is especially useful in
- conjunction with EXT4_IOC_GROUP_EXTEND,
- which allows online resize of the filesystem
- to the end of the last existing block group.
- Those two ioctls combined is used in userspace
- online resize tool (e.g. resize2fs).
-
- EXT4_IOC_MIGRATE This ioctl operates on the filesystem itself.
- It converts (migrates) ext3 indirect block mapped
- inode to ext4 extent mapped inode by walking
- through indirect block mapping of the original
- inode and converting contiguous block ranges
- into ext4 extents of the temporary inode. Then,
- inodes are swapped. This ioctl might help, when
- migrating from ext3 to ext4 filesystem, however
- suggestion is to create fresh ext4 filesystem
- and copy data from the backup. Note, that
- filesystem has to support extents for this ioctl
- to work.
-
- EXT4_IOC_ALLOC_DA_BLKS Force all of the delay allocated blocks to be
- allocated to preserve application-expected ext3
- behaviour. Note that this will also start
- triggering a write of the data blocks, but this
- behaviour may change in the future as it is
- not necessary and has been done this way only
- for sake of simplicity.
-
- EXT4_IOC_RESIZE_FS Resize the filesystem to a new size. The number
- of blocks of resized filesystem is passed in via
- 64 bit integer argument. The kernel allocates
- bitmaps and inode table, the userspace tool thus
- just passes the new number of blocks.
-
- EXT4_IOC_SWAP_BOOT Swap i_blocks and associated attributes
- (like i_blocks, i_size, i_flags, ...) from
- the specified inode with inode
- EXT4_BOOT_LOADER_INO (#5). This is typically
- used to store a boot loader in a secure part of
- the filesystem, where it can't be changed by a
- normal user by accident.
- The data blocks of the previous boot loader
- will be associated with the given inode.
-============================= =================================================
-
-References
-==========
-
-kernel source: <file:fs/ext4/>
- <file:fs/jbd2/>
-
-programs: http://e2fsprogs.sourceforge.net/
-
-useful links: http://fedoraproject.org/wiki/ext3-devel
- http://www.bullopensource.org/ext4/
- http://ext4.wiki.kernel.org/index.php/Main_Page
- http://fedoraproject.org/wiki/Features/Ext4
diff --git a/Documentation/filesystems/ext4/ondisk/globals.rst b/Documentation/filesystems/ext4/globals.rst
index 368bf7662b96..368bf7662b96 100644
--- a/Documentation/filesystems/ext4/ondisk/globals.rst
+++ b/Documentation/filesystems/ext4/globals.rst
diff --git a/Documentation/filesystems/ext4/ondisk/group_descr.rst b/Documentation/filesystems/ext4/group_descr.rst
index 759827e5d2cf..0f783ed88592 100644
--- a/Documentation/filesystems/ext4/ondisk/group_descr.rst
+++ b/Documentation/filesystems/ext4/group_descr.rst
@@ -43,7 +43,7 @@ entire bitmap.
The block group descriptor is laid out in ``struct ext4_group_desc``.
.. list-table::
- :widths: 1 1 1 77
+ :widths: 8 8 24 40
:header-rows: 1
* - Offset
@@ -157,7 +157,7 @@ The block group descriptor is laid out in ``struct ext4_group_desc``.
Block group flags can be any combination of the following:
.. list-table::
- :widths: 1 79
+ :widths: 16 64
:header-rows: 1
* - Value
diff --git a/Documentation/filesystems/ext4/ondisk/ifork.rst b/Documentation/filesystems/ext4/ifork.rst
index 5dbe3b2b121a..b9816d5a896b 100644
--- a/Documentation/filesystems/ext4/ondisk/ifork.rst
+++ b/Documentation/filesystems/ext4/ifork.rst
@@ -68,7 +68,7 @@ The extent tree header is recorded in ``struct ext4_extent_header``,
which is 12 bytes long:
.. list-table::
- :widths: 1 1 1 77
+ :widths: 8 8 24 40
:header-rows: 1
* - Offset
@@ -104,7 +104,7 @@ Internal nodes of the extent tree, also known as index nodes, are
recorded as ``struct ext4_extent_idx``, and are 12 bytes long:
.. list-table::
- :widths: 1 1 1 77
+ :widths: 8 8 24 40
:header-rows: 1
* - Offset
@@ -134,7 +134,7 @@ Leaf nodes of the extent tree are recorded as ``struct ext4_extent``,
and are also 12 bytes long:
.. list-table::
- :widths: 1 1 1 77
+ :widths: 8 8 24 40
:header-rows: 1
* - Offset
@@ -174,7 +174,7 @@ including) the checksum itself.
``struct ext4_extent_tail`` is 4 bytes long:
.. list-table::
- :widths: 1 1 1 77
+ :widths: 8 8 24 40
:header-rows: 1
* - Offset
diff --git a/Documentation/filesystems/ext4/index.rst b/Documentation/filesystems/ext4/index.rst
index 71121605558c..3be3e54d480d 100644
--- a/Documentation/filesystems/ext4/index.rst
+++ b/Documentation/filesystems/ext4/index.rst
@@ -1,17 +1,14 @@
.. SPDX-License-Identifier: GPL-2.0
-===============
-ext4 Filesystem
-===============
-
-General usage and on-disk artifacts writen by ext4. More documentation may
-be ported from the wiki as time permits. This should be considered the
-canonical source of information as the details here have been reviewed by
-the ext4 community.
+===================================
+ext4 Data Structures and Algorithms
+===================================
.. toctree::
- :maxdepth: 5
+ :maxdepth: 6
:numbered:
- ext4
- ondisk/index
+ about.rst
+ overview.rst
+ globals.rst
+ dynamic.rst
diff --git a/Documentation/filesystems/ext4/ondisk/inlinedata.rst b/Documentation/filesystems/ext4/inlinedata.rst
index d1075178ce0b..d1075178ce0b 100644
--- a/Documentation/filesystems/ext4/ondisk/inlinedata.rst
+++ b/Documentation/filesystems/ext4/inlinedata.rst
diff --git a/Documentation/filesystems/ext4/ondisk/inodes.rst b/Documentation/filesystems/ext4/inodes.rst
index 655ce898f3f5..6bd35e506b6f 100644
--- a/Documentation/filesystems/ext4/ondisk/inodes.rst
+++ b/Documentation/filesystems/ext4/inodes.rst
@@ -29,8 +29,9 @@ and the inode structure itself.
The inode table entry is laid out in ``struct ext4_inode``.
.. list-table::
- :widths: 1 1 1 77
+ :widths: 8 8 24 40
:header-rows: 1
+ :class: longtable
* - Offset
- Size
@@ -176,7 +177,7 @@ The inode table entry is laid out in ``struct ext4_inode``.
The ``i_mode`` value is a combination of the following flags:
.. list-table::
- :widths: 1 79
+ :widths: 16 64
:header-rows: 1
* - Value
@@ -227,7 +228,7 @@ The ``i_mode`` value is a combination of the following flags:
The ``i_flags`` field is a combination of these values:
.. list-table::
- :widths: 1 79
+ :widths: 16 64
:header-rows: 1
* - Value
@@ -314,7 +315,7 @@ The ``osd1`` field has multiple meanings depending on the creator:
Linux:
.. list-table::
- :widths: 1 1 1 77
+ :widths: 8 8 24 40
:header-rows: 1
* - Offset
@@ -331,7 +332,7 @@ Linux:
Hurd:
.. list-table::
- :widths: 1 1 1 77
+ :widths: 8 8 24 40
:header-rows: 1
* - Offset
@@ -346,7 +347,7 @@ Hurd:
Masix:
.. list-table::
- :widths: 1 1 1 77
+ :widths: 8 8 24 40
:header-rows: 1
* - Offset
@@ -365,7 +366,7 @@ The ``osd2`` field has multiple meanings depending on the filesystem creator:
Linux:
.. list-table::
- :widths: 1 1 1 77
+ :widths: 8 8 24 40
:header-rows: 1
* - Offset
@@ -402,7 +403,7 @@ Linux:
Hurd:
.. list-table::
- :widths: 1 1 1 77
+ :widths: 8 8 24 40
:header-rows: 1
* - Offset
@@ -433,7 +434,7 @@ Hurd:
Masix:
.. list-table::
- :widths: 1 1 1 77
+ :widths: 8 8 24 40
:header-rows: 1
* - Offset
diff --git a/Documentation/filesystems/ext4/ondisk/journal.rst b/Documentation/filesystems/ext4/journal.rst
index e7031af86876..ea613ee701f5 100644
--- a/Documentation/filesystems/ext4/ondisk/journal.rst
+++ b/Documentation/filesystems/ext4/journal.rst
@@ -48,7 +48,7 @@ Layout
Generally speaking, the journal has this format:
.. list-table::
- :widths: 1 1 78
+ :widths: 16 48 16
:header-rows: 1
* - Superblock
@@ -76,7 +76,7 @@ The journal superblock will be in the next full block after the
superblock.
.. list-table::
- :widths: 1 1 1 1 76
+ :widths: 12 12 12 32 12
:header-rows: 1
* - 1024 bytes of padding
@@ -98,7 +98,7 @@ Every block in the journal starts with a common 12-byte header
``struct journal_header_s``:
.. list-table::
- :widths: 1 1 1 77
+ :widths: 8 8 24 40
:header-rows: 1
* - Offset
@@ -124,7 +124,7 @@ Every block in the journal starts with a common 12-byte header
The journal block type can be any one of:
.. list-table::
- :widths: 1 79
+ :widths: 16 64
:header-rows: 1
* - Value
@@ -154,7 +154,7 @@ The journal superblock is recorded as ``struct journal_superblock_s``,
which is 1024 bytes long:
.. list-table::
- :widths: 1 1 1 77
+ :widths: 8 8 24 40
:header-rows: 1
* - Offset
@@ -264,7 +264,7 @@ which is 1024 bytes long:
The journal compat features are any combination of the following:
.. list-table::
- :widths: 1 79
+ :widths: 16 64
:header-rows: 1
* - Value
@@ -278,7 +278,7 @@ The journal compat features are any combination of the following:
The journal incompat features are any combination of the following:
.. list-table::
- :widths: 1 79
+ :widths: 16 64
:header-rows: 1
* - Value
@@ -306,7 +306,7 @@ Journal checksum type codes are one of the following. crc32 or crc32c are the
most likely choices.
.. list-table::
- :widths: 1 79
+ :widths: 16 64
:header-rows: 1
* - Value
@@ -330,7 +330,7 @@ described by a data structure, but here is the block structure anyway.
Descriptor blocks consume at least 36 bytes, but use a full block:
.. list-table::
- :widths: 1 1 1 77
+ :widths: 8 8 24 40
:header-rows: 1
* - Offset
@@ -355,7 +355,7 @@ defined as ``struct journal_block_tag3_s``, which looks like the
following. The size is 16 or 32 bytes.
.. list-table::
- :widths: 1 1 1 77
+ :widths: 8 8 24 40
:header-rows: 1
* - Offset
@@ -400,7 +400,7 @@ following. The size is 16 or 32 bytes.
The journal tag flags are any combination of the following:
.. list-table::
- :widths: 1 79
+ :widths: 16 64
:header-rows: 1
* - Value
@@ -421,7 +421,7 @@ is defined as ``struct journal_block_tag_s``, which looks like the
following. The size is 8, 12, 24, or 28 bytes:
.. list-table::
- :widths: 1 1 1 77
+ :widths: 8 8 24 40
:header-rows: 1
* - Offset
@@ -471,7 +471,7 @@ JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the block is a
``struct jbd2_journal_block_tail``, which looks like this:
.. list-table::
- :widths: 1 1 1 77
+ :widths: 8 8 24 40
:header-rows: 1
* - Offset
@@ -513,7 +513,7 @@ Revocation blocks are described in
length, but use a full block:
.. list-table::
- :widths: 1 1 1 77
+ :widths: 8 8 24 40
:header-rows: 1
* - Offset
@@ -543,7 +543,7 @@ JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the revocation
block is a ``struct jbd2_journal_revoke_tail``, which has this format:
.. list-table::
- :widths: 1 1 1 77
+ :widths: 8 8 24 40
:header-rows: 1
* - Offset
@@ -567,7 +567,7 @@ The commit block is described by ``struct commit_header``, which is 32
bytes long (but uses a full block):
.. list-table::
- :widths: 1 1 1 77
+ :widths: 8 8 24 40
:header-rows: 1
* - Offset
diff --git a/Documentation/filesystems/ext4/ondisk/mmp.rst b/Documentation/filesystems/ext4/mmp.rst
index b7d7a3137f80..25660981d93c 100644
--- a/Documentation/filesystems/ext4/ondisk/mmp.rst
+++ b/Documentation/filesystems/ext4/mmp.rst
@@ -32,7 +32,7 @@ The checksum is calculated against the FS UUID and the MMP structure.
The MMP structure (``struct mmp_struct``) is as follows:
.. list-table::
- :widths: 1 1 1 77
+ :widths: 8 12 20 40
:header-rows: 1
* - Offset
diff --git a/Documentation/filesystems/ext4/ondisk/index.rst b/Documentation/filesystems/ext4/ondisk/index.rst
deleted file mode 100644
index f7d082c3a435..000000000000
--- a/Documentation/filesystems/ext4/ondisk/index.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-.. SPDX-License-Identifier: GPL-2.0
-
-==============================
-Data Structures and Algorithms
-==============================
-.. include:: about.rst
-.. include:: overview.rst
-.. include:: globals.rst
-.. include:: dynamic.rst
diff --git a/Documentation/filesystems/ext4/ondisk/overview.rst b/Documentation/filesystems/ext4/overview.rst
index cbab18baba12..cbab18baba12 100644
--- a/Documentation/filesystems/ext4/ondisk/overview.rst
+++ b/Documentation/filesystems/ext4/overview.rst
diff --git a/Documentation/filesystems/ext4/ondisk/special_inodes.rst b/Documentation/filesystems/ext4/special_inodes.rst
index a82f70c9baeb..9061aabba827 100644
--- a/Documentation/filesystems/ext4/ondisk/special_inodes.rst
+++ b/Documentation/filesystems/ext4/special_inodes.rst
@@ -6,7 +6,7 @@ Special inodes
ext4 reserves some inode for special features, as follows:
.. list-table::
- :widths: 1 79
+ :widths: 6 70
:header-rows: 1
* - inode Number
diff --git a/Documentation/filesystems/ext4/ondisk/super.rst b/Documentation/filesystems/ext4/super.rst
index 5f81dd87e0b9..04ff079a2acf 100644
--- a/Documentation/filesystems/ext4/ondisk/super.rst
+++ b/Documentation/filesystems/ext4/super.rst
@@ -19,7 +19,7 @@ The ext4 superblock is laid out as follows in
``struct ext4_super_block``:
.. list-table::
- :widths: 1 1 1 77
+ :widths: 8 8 24 40
:header-rows: 1
* - Offset
@@ -483,7 +483,7 @@ The ext4 superblock is laid out as follows in
The superblock state is some combination of the following:
.. list-table::
- :widths: 1 79
+ :widths: 8 72
:header-rows: 1
* - Value
@@ -500,7 +500,7 @@ The superblock state is some combination of the following:
The superblock error policy is one of the following:
.. list-table::
- :widths: 1 79
+ :widths: 8 72
:header-rows: 1
* - Value
@@ -517,7 +517,7 @@ The superblock error policy is one of the following:
The filesystem creator is one of the following:
.. list-table::
- :widths: 1 79
+ :widths: 8 72
:header-rows: 1
* - Value
@@ -538,7 +538,7 @@ The filesystem creator is one of the following:
The superblock revision is one of the following:
.. list-table::
- :widths: 1 79
+ :widths: 8 72
:header-rows: 1
* - Value
@@ -556,7 +556,7 @@ The superblock compatible features field is a combination of any of the
following:
.. list-table::
- :widths: 1 79
+ :widths: 16 64
:header-rows: 1
* - Value
@@ -595,7 +595,7 @@ The superblock incompatible features field is a combination of any of the
following:
.. list-table::
- :widths: 1 79
+ :widths: 16 64
:header-rows: 1
* - Value
@@ -647,7 +647,7 @@ The superblock read-only compatible features field is a combination of any of
the following:
.. list-table::
- :widths: 1 79
+ :widths: 16 64
:header-rows: 1
* - Value
@@ -702,7 +702,7 @@ the following:
The ``s_def_hash_version`` field is one of the following:
.. list-table::
- :widths: 1 79
+ :widths: 8 72
:header-rows: 1
* - Value
@@ -725,7 +725,7 @@ The ``s_def_hash_version`` field is one of the following:
The ``s_default_mount_opts`` field is any combination of the following:
.. list-table::
- :widths: 1 79
+ :widths: 8 72
:header-rows: 1
* - Value
@@ -767,7 +767,7 @@ The ``s_default_mount_opts`` field is any combination of the following:
The ``s_flags`` field is any combination of the following:
.. list-table::
- :widths: 1 79
+ :widths: 8 72
:header-rows: 1
* - Value
@@ -784,7 +784,7 @@ The ``s_flags`` field is any combination of the following:
The ``s_encrypt_algos`` list can contain any of the following:
.. list-table::
- :widths: 1 79
+ :widths: 8 72
:header-rows: 1
* - Value
diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt
index e5edd29687b5..e46c2147ddf8 100644
--- a/Documentation/filesystems/f2fs.txt
+++ b/Documentation/filesystems/f2fs.txt
@@ -172,9 +172,10 @@ fault_type=%d Support configuring fault injection type, should be
FAULT_DIR_DEPTH 0x000000100
FAULT_EVICT_INODE 0x000000200
FAULT_TRUNCATE 0x000000400
- FAULT_IO 0x000000800
+ FAULT_READ_IO 0x000000800
FAULT_CHECKPOINT 0x000001000
FAULT_DISCARD 0x000002000
+ FAULT_WRITE_IO 0x000004000
mode=%s Control block allocation mode which supports "adaptive"
and "lfs". In "lfs" mode, there should be no random
writes towards main area.
@@ -211,6 +212,11 @@ fsync_mode=%s Control the policy of fsync. Currently supports "posix",
non-atomic files likewise "nobarrier" mount option.
test_dummy_encryption Enable dummy encryption, which provides a fake fscrypt
context. The fake fscrypt context is used by xfstests.
+checkpoint=%s Set to "disable" to turn off checkpointing. Set to "enable"
+ to reenable checkpointing. Is enabled by default. While
+ disabled, any unmounting or unexpected shutdowns will cause
+ the filesystem contents to appear as they did when the
+ filesystem was mounted with that option.
================================================================================
DEBUGFS ENTRIES
diff --git a/Documentation/filesystems/fscrypt.rst b/Documentation/filesystems/fscrypt.rst
index 48b424de85bb..cfbc18f0d9c9 100644
--- a/Documentation/filesystems/fscrypt.rst
+++ b/Documentation/filesystems/fscrypt.rst
@@ -191,21 +191,11 @@ Currently, the following pairs of encryption modes are supported:
- AES-256-XTS for contents and AES-256-CTS-CBC for filenames
- AES-128-CBC for contents and AES-128-CTS-CBC for filenames
-- Speck128/256-XTS for contents and Speck128/256-CTS-CBC for filenames
It is strongly recommended to use AES-256-XTS for contents encryption.
AES-128-CBC was added only for low-powered embedded devices with
crypto accelerators such as CAAM or CESA that do not support XTS.
-Similarly, Speck128/256 support was only added for older or low-end
-CPUs which cannot do AES fast enough -- especially ARM CPUs which have
-NEON instructions but not the Cryptography Extensions -- and for which
-it would not otherwise be feasible to use encryption at all. It is
-not recommended to use Speck on CPUs that have AES instructions.
-Speck support is only available if it has been enabled in the crypto
-API via CONFIG_CRYPTO_SPECK. Also, on ARM platforms, to get
-acceptable performance CONFIG_CRYPTO_SPECK_NEON must be enabled.
-
New encryption modes can be added relatively easily, without changes
to individual filesystems. However, authenticated encryption (AE)
modes are not currently supported because of the difficulty of dealing
diff --git a/Documentation/filesystems/nfs/00-INDEX b/Documentation/filesystems/nfs/00-INDEX
deleted file mode 100644
index 53f3b596ac0d..000000000000
--- a/Documentation/filesystems/nfs/00-INDEX
+++ /dev/null
@@ -1,26 +0,0 @@
-00-INDEX
- - this file (nfs-related documentation).
-Exporting
- - explanation of how to make filesystems exportable.
-fault_injection.txt
- - information for using fault injection on the server
-knfsd-stats.txt
- - statistics which the NFS server makes available to user space.
-nfs.txt
- - nfs client, and DNS resolution for fs_locations.
-nfs41-server.txt
- - info on the Linux server implementation of NFSv4 minor version 1.
-nfs-rdma.txt
- - how to install and setup the Linux NFS/RDMA client and server software
-nfsd-admin-interfaces.txt
- - Administrative interfaces for nfsd.
-nfsroot.txt
- - short guide on setting up a diskless box with NFS root filesystem.
-pnfs.txt
- - short explanation of some of the internals of the pnfs client code
-rpc-cache.txt
- - introduction to the caching mechanisms in the sunrpc layer.
-idmapper.txt
- - information for configuring request-keys to be used by idmapper
-rpc-server-gss.txt
- - Information on GSS authentication support in the NFS Server
diff --git a/Documentation/filesystems/nfs/rpc-cache.txt b/Documentation/filesystems/nfs/rpc-cache.txt
index ebcaaee21616..c4dac829db0f 100644
--- a/Documentation/filesystems/nfs/rpc-cache.txt
+++ b/Documentation/filesystems/nfs/rpc-cache.txt
@@ -84,7 +84,7 @@ Creating a Cache
A message from user space has arrived to fill out a
cache entry. It is in 'buf' of length 'len'.
cache_parse should parse this, find the item in the
- cache with sunrpc_cache_lookup, and update the item
+ cache with sunrpc_cache_lookup_rcu, and update the item
with sunrpc_cache_update.
@@ -95,7 +95,7 @@ Creating a Cache
Using a cache
-------------
-To find a value in a cache, call sunrpc_cache_lookup passing a pointer
+To find a value in a cache, call sunrpc_cache_lookup_rcu passing a pointer
to the cache_head in a sample item with the 'key' fields filled in.
This will be passed to ->match to identify the target entry. If no
entry is found, a new entry will be create, added to the cache, and
@@ -116,7 +116,7 @@ item does become valid, the deferred copy of the request will be
revisited (->revisit). It is expected that this method will
reschedule the request for processing.
-The value returned by sunrpc_cache_lookup can also be passed to
+The value returned by sunrpc_cache_lookup_rcu can also be passed to
sunrpc_cache_update to set the content for the item. A second item is
passed which should hold the content. If the item found by _lookup
has valid data, then it is discarded and a new item is created. This
diff --git a/Documentation/filesystems/overlayfs.txt b/Documentation/filesystems/overlayfs.txt
index 51c136c821bf..eef7d9d259e8 100644
--- a/Documentation/filesystems/overlayfs.txt
+++ b/Documentation/filesystems/overlayfs.txt
@@ -286,6 +286,12 @@ pointed by REDIRECT. This should not be possible on local system as setting
"trusted." xattrs will require CAP_SYS_ADMIN. But it should be possible
for untrusted layers like from a pen drive.
+Note: redirect_dir={off|nofollow|follow(*)} conflicts with metacopy=on, and
+results in an error.
+
+(*) redirect_dir=follow only conflicts with metacopy=on if upperdir=... is
+given.
+
Sharing and copying layers
--------------------------
diff --git a/Documentation/filesystems/pohmelfs/design_notes.txt b/Documentation/filesystems/pohmelfs/design_notes.txt
deleted file mode 100644
index 106d17fbb05f..000000000000
--- a/Documentation/filesystems/pohmelfs/design_notes.txt
+++ /dev/null
@@ -1,72 +0,0 @@
-POHMELFS: Parallel Optimized Host Message Exchange Layered File System.
-
- Evgeniy Polyakov <zbr@ioremap.net>
-
-Homepage: http://www.ioremap.net/projects/pohmelfs
-
-POHMELFS first began as a network filesystem with coherent local data and
-metadata caches but is now evolving into a parallel distributed filesystem.
-
-Main features of this FS include:
- * Locally coherent cache for data and metadata with (potentially) byte-range locks.
- Since all Linux filesystems lock the whole inode during writing, algorithm
- is very simple and does not use byte-ranges, although they are sent in
- locking messages.
- * Completely async processing of all events except creation of hard and symbolic
- links, and rename events.
- Object creation and data reading and writing are processed asynchronously.
- * Flexible object architecture optimized for network processing.
- Ability to create long paths to objects and remove arbitrarily huge
- directories with a single network command.
- (like removing the whole kernel tree via a single network command).
- * Very high performance.
- * Fast and scalable multithreaded userspace server. Being in userspace it works
- with any underlying filesystem and still is much faster than async in-kernel NFS one.
- * Client is able to switch between different servers (if one goes down, client
- automatically reconnects to second and so on).
- * Transactions support. Full failover for all operations.
- Resending transactions to different servers on timeout or error.
- * Read request (data read, directory listing, lookup requests) balancing between multiple servers.
- * Write requests are replicated to multiple servers and completed only when all of them are acked.
- * Ability to add and/or remove servers from the working set at run-time.
- * Strong authentication and possible data encryption in network channel.
- * Extended attributes support.
-
-POHMELFS is based on transactions, which are potentially long-standing objects that live
-in the client's memory. Each transaction contains all the information needed to process a given
-command (or set of commands, which is frequently used during data writing: single transactions
-can contain creation and data writing commands). Transactions are committed by all the servers
-to which they are sent and, in case of failures, are eventually resent or dropped with an error.
-For example, reading will return an error if no servers are available.
-
-POHMELFS uses a asynchronous approach to data processing. Courtesy of transactions, it is
-possible to detach replies from requests and, if the command requires data to be received, the
-caller sleeps waiting for it. Thus, it is possible to issue multiple read commands to different
-servers and async threads will pick up replies in parallel, find appropriate transactions in the
-system and put the data where it belongs (like the page or inode cache).
-
-The main feature of POHMELFS is writeback data and the metadata cache.
-Only a few non-performance critical operations use the write-through cache and
-are synchronous: hard and symbolic link creation, and object rename. Creation,
-removal of objects and data writing are asynchronous and are sent to
-the server during system writeback. Only one writer at a time is allowed for any
-given inode, which is guarded by an appropriate locking protocol.
-Because of this feature, POHMELFS is extremely fast at metadata intensive
-workloads and can fully utilize the bandwidth to the servers when doing bulk
-data transfers.
-
-POHMELFS clients operate with a working set of servers and are capable of balancing read-only
-operations (like lookups or directory listings) between them according to IO priorities.
-Administrators can add or remove servers from the set at run-time via special commands (described
-in Documentation/filesystems/pohmelfs/info.txt file). Writes are replicated to all servers, which
-are connected with write permission turned on. IO priority and permissions can be changed in
-run-time.
-
-POHMELFS is capable of full data channel encryption and/or strong crypto hashing.
-One can select any kernel supported cipher, encryption mode, hash type and operation mode
-(hmac or digest). It is also possible to use both or neither (default). Crypto configuration
-is checked during mount time and, if the server does not support it, appropriate capabilities
-will be disabled or mount will fail (if 'crypto_fail_unsupported' mount option is specified).
-Crypto performance heavily depends on the number of crypto threads, which asynchronously perform
-crypto operations and send the resulting data to server or submit it up the stack. This number
-can be controlled via a mount option.
diff --git a/Documentation/filesystems/pohmelfs/info.txt b/Documentation/filesystems/pohmelfs/info.txt
deleted file mode 100644
index db2e41393626..000000000000
--- a/Documentation/filesystems/pohmelfs/info.txt
+++ /dev/null
@@ -1,99 +0,0 @@
-POHMELFS usage information.
-
-Mount options.
-All but index, number of crypto threads and maximum IO size can changed via remount.
-
-idx=%u
- Each mountpoint is associated with a special index via this option.
- Administrator can add or remove servers from the given index, so all mounts,
- which were attached to it, are updated.
- Default it is 0.
-
-trans_scan_timeout=%u
- This timeout, expressed in milliseconds, specifies time to scan transaction
- trees looking for stale requests, which have to be resent, or if number of
- retries exceed specified limit, dropped with error.
- Default is 5 seconds.
-
-drop_scan_timeout=%u
- Internal timeout, expressed in milliseconds, which specifies how frequently
- inodes marked to be dropped are freed. It also specifies how frequently
- the system checks that servers have to be added or removed from current working set.
- Default is 1 second.
-
-wait_on_page_timeout=%u
- Number of milliseconds to wait for reply from remote server for data reading command.
- If this timeout is exceeded, reading returns an error.
- Default is 5 seconds.
-
-trans_retries=%u
- This is the number of times that a transaction will be resent to a server that did
- not answer for the last @trans_scan_timeout milliseconds.
- When the number of resends exceeds this limit, the transaction is completed with error.
- Default is 5 resends.
-
-crypto_thread_num=%u
- Number of crypto processing threads. Threads are used both for RX and TX traffic.
- Default is 2, or no threads if crypto operations are not supported.
-
-trans_max_pages=%u
- Maximum number of pages in a single transaction. This parameter also controls
- the number of pages, allocated for crypto processing (each crypto thread has
- pool of pages, the number of which is equal to 'trans_max_pages'.
- Default is 100 pages.
-
-crypto_fail_unsupported
- If specified, mount will fail if the server does not support requested crypto operations.
- By default mount will disable non-matching crypto operations.
-
-mcache_timeout=%u
- Maximum number of milliseconds to wait for the mcache objects to be processed.
- Mcache includes locks (given lock should be granted by server), attributes (they should be
- fully received in the given timeframe).
- Default is 5 seconds.
-
-Usage examples.
-
-Add server server1.net:1025 into the working set with index $idx
-with appropriate hash algorithm and key file and cipher algorithm, mode and key file:
-$cfg A add -a server1.net -p 1025 -i $idx -K $hash_key -k $cipher_key
-
-Mount filesystem with given index $idx to /mnt mountpoint.
-Client will connect to all servers specified in the working set via previous command:
-mount -t pohmel -o idx=$idx q /mnt
-
-Change permissions to read-only (-I 1 option, '-I 2' - write-only, 3 - rw):
-$cfg A modify -a server1.net -p 1025 -i $idx -I 1
-
-Change IO priority to 123 (node with the highest priority gets read requests).
-$cfg A modify -a server1.net -p 1025 -i $idx -P 123
-
-One can check currect status of all connections in the mountstats file:
-# cat /proc/$PID/mountstats
-...
-device none mounted on /mnt with fstype pohmel
-idx addr(:port) socket_type protocol active priority permissions
-0 server1.net:1026 1 6 1 250 1
-0 server2.net:1025 1 6 1 123 3
-
-Server installation.
-
-Creating a server, which listens at port 1025 and 0.0.0.0 address.
-Working root directory (note, that server chroots there, so you have to have appropriate permissions)
-is set to /mnt, server will negotiate hash/cipher with client, in case client requested it, there
-are appropriate key files.
-Number of working threads is set to 10.
-
-# ./fserver -a 0.0.0.0 -p 1025 -r /mnt -w 10 -K hash_key -k cipher_key
-
- -A 6 - listen on ipv6 address. Default: Disabled.
- -r root - path to root directory. Default: /tmp.
- -a addr - listen address. Default: 0.0.0.0.
- -p port - listen port. Default: 1025.
- -w workers - number of workers per connected client. Default: 1.
- -K file - hash key size. Default: none.
- -k file - cipher key size. Default: none.
- -h - this help.
-
-Number of worker threads specifies how many workers will be created for each client.
-Bulk single-client transafers usually are better handled with smaller number (like 1-3).
diff --git a/Documentation/filesystems/pohmelfs/network_protocol.txt b/Documentation/filesystems/pohmelfs/network_protocol.txt
deleted file mode 100644
index c680b4b5353d..000000000000
--- a/Documentation/filesystems/pohmelfs/network_protocol.txt
+++ /dev/null
@@ -1,227 +0,0 @@
-POHMELFS network protocol.
-
-Basic structure used in network communication is following command:
-
-struct netfs_cmd
-{
- __u16 cmd; /* Command number */
- __u16 csize; /* Attached crypto information size */
- __u16 cpad; /* Attached padding size */
- __u16 ext; /* External flags */
- __u32 size; /* Size of the attached data */
- __u32 trans; /* Transaction id */
- __u64 id; /* Object ID to operate on. Used for feedback.*/
- __u64 start; /* Start of the object. */
- __u64 iv; /* IV sequence */
- __u8 data[0];
-};
-
-Commands can be embedded into transaction command (which in turn has own command),
-so one can extend protocol as needed without breaking backward compatibility as long
-as old commands are supported. All string lengths include tail 0 byte.
-
-All commands are transferred over the network in big-endian. CPU endianness is used at the end peers.
-
-@cmd - command number, which specifies command to be processed. Following
- commands are used currently:
-
- NETFS_READDIR = 1, /* Read directory for given inode number */
- NETFS_READ_PAGE, /* Read data page from the server */
- NETFS_WRITE_PAGE, /* Write data page to the server */
- NETFS_CREATE, /* Create directory entry */
- NETFS_REMOVE, /* Remove directory entry */
- NETFS_LOOKUP, /* Lookup single object */
- NETFS_LINK, /* Create a link */
- NETFS_TRANS, /* Transaction */
- NETFS_OPEN, /* Open intent */
- NETFS_INODE_INFO, /* Metadata cache coherency synchronization message */
- NETFS_PAGE_CACHE, /* Page cache invalidation message */
- NETFS_READ_PAGES, /* Read multiple contiguous pages in one go */
- NETFS_RENAME, /* Rename object */
- NETFS_CAPABILITIES, /* Capabilities of the client, for example supported crypto */
- NETFS_LOCK, /* Distributed lock message */
- NETFS_XATTR_SET, /* Set extended attribute */
- NETFS_XATTR_GET, /* Get extended attribute */
-
-@ext - external flags. Used by different commands to specify some extra arguments
- like partial size of the embedded objects or creation flags.
-
-@size - size of the attached data. For NETFS_READ_PAGE and NETFS_READ_PAGES no data is attached,
- but size of the requested data is incorporated here. It does not include size of the command
- header (struct netfs_cmd) itself.
-
-@id - id of the object this command operates on. Each command can use it for own purpose.
-
-@start - start of the object this command operates on. Each command can use it for own purpose.
-
-@csize, @cpad - size and padding size of the (attached if needed) crypto information.
-
-Command specifications.
-
-@NETFS_READDIR
-This command is used to sync content of the remote dir to the client.
-
-@ext - length of the path to object.
-@size - the same.
-@id - local inode number of the directory to read.
-@start - zero.
-
-
-@NETFS_READ_PAGE
-This command is used to read data from remote server.
-Data size does not exceed local page cache size.
-
-@id - inode number.
-@start - first byte offset.
-@size - number of bytes to read plus length of the path to object.
-@ext - object path length.
-
-
-@NETFS_CREATE
-Used to create object.
-It does not require that all directories on top of the object were
-already created, it will create them automatically. Each object has
-associated @netfs_path_entry data structure, which contains creation
-mode (permissions and type) and length of the name as long as name itself.
-
-@start - 0
-@size - size of the all data structures needed to create a path
-@id - local inode number
-@ext - 0
-
-
-@NETFS_REMOVE
-Used to remove object.
-
-@ext - length of the path to object.
-@size - the same.
-@id - local inode number.
-@start - zero.
-
-
-@NETFS_LOOKUP
-Lookup information about object on server.
-
-@ext - length of the path to object.
-@size - the same.
-@id - local inode number of the directory to look object in.
-@start - local inode number of the object to look at.
-
-
-@NETFS_LINK
-Create hard of symlink.
-Command is sent as "object_path|target_path".
-
-@size - size of the above string.
-@id - parent local inode number.
-@start - 1 for symlink, 0 for hardlink.
-@ext - size of the "object_path" above.
-
-
-@NETFS_TRANS
-Transaction header.
-
-@size - incorporates all embedded command sizes including theirs header sizes.
-@start - transaction generation number - unique id used to find transaction.
-@ext - transaction flags. Unused at the moment.
-@id - 0.
-
-
-@NETFS_OPEN
-Open intent for given transaction.
-
-@id - local inode number.
-@start - 0.
-@size - path length to the object.
-@ext - open flags (O_RDWR and so on).
-
-
-@NETFS_INODE_INFO
-Metadata update command.
-It is sent to servers when attributes of the object are changed and received
-when data or metadata were updated. It operates with the following structure:
-
-struct netfs_inode_info
-{
- unsigned int mode;
- unsigned int nlink;
- unsigned int uid;
- unsigned int gid;
- unsigned int blocksize;
- unsigned int padding;
- __u64 ino;
- __u64 blocks;
- __u64 rdev;
- __u64 size;
- __u64 version;
-};
-
-It effectively mirrors stat(2) returned data.
-
-
-@ext - path length to the object.
-@size - the same plus size of the netfs_inode_info structure.
-@id - local inode number.
-@start - 0.
-
-
-@NETFS_PAGE_CACHE
-Command is only received by clients. It contains information about
-page to be marked as not up-to-date.
-
-@id - client's inode number.
-@start - last byte of the page to be invalidated. If it is not equal to
- current inode size, it will be vmtruncated().
-@size - 0
-@ext - 0
-
-
-@NETFS_READ_PAGES
-Used to read multiple contiguous pages in one go.
-
-@start - first byte of the contiguous region to read.
-@size - contains of two fields: lower 8 bits are used to represent page cache shift
- used by client, another 3 bytes are used to get number of pages.
-@id - local inode number.
-@ext - path length to the object.
-
-
-@NETFS_RENAME
-Used to rename object.
-Attached data is formed into following string: "old_path|new_path".
-
-@id - local inode number.
-@start - parent inode number.
-@size - length of the above string.
-@ext - length of the old path part.
-
-
-@NETFS_CAPABILITIES
-Used to exchange crypto capabilities with server.
-If crypto capabilities are not supported by server, then client will disable it
-or fail (if 'crypto_fail_unsupported' mount options was specified).
-
-@id - superblock index. Used to specify crypto information for group of servers.
-@size - size of the attached capabilities structure.
-@start - 0.
-@size - 0.
-@scsize - 0.
-
-@NETFS_LOCK
-Used to send lock request/release messages. Although it sends byte range request
-and is capable of flushing pages based on that, it is not used, since all Linux
-filesystems lock the whole inode.
-
-@id - lock generation number.
-@start - start of the locked range.
-@size - size of the locked range.
-@ext - lock type: read/write. Not used actually. 15'th bit is used to determine,
- if it is lock request (1) or release (0).
-
-@NETFS_XATTR_SET
-@NETFS_XATTR_GET
-Used to set/get extended attributes for given inode.
-@id - attribute generation number or xattr setting type
-@start - size of the attribute (request or attached)
-@size - name length, path len and data size for given attribute
-@ext - path length for given object
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index 7b7b845c490a..321d74b73937 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -622,3 +622,14 @@ in your dentry operations instead.
alloc_file_clone(file, flags, ops) does not affect any caller's references.
On success you get a new struct file sharing the mount/dentry with the
original, on failure - ERR_PTR().
+--
+[recommended]
+ ->lookup() instances doing an equivalent of
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+ return d_splice_alias(inode, dentry);
+ don't need to bother with the check - d_splice_alias() will do the
+ right thing when given ERR_PTR(...) as inode. Moreover, passing NULL
+ inode to d_splice_alias() will also do the right thing (equivalent of
+ d_add(dentry, NULL); return NULL;), so that kind of special cases
+ also doesn't need a separate treatment.
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 22b4b00dee31..12a5e6e693b6 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -858,6 +858,7 @@ Writeback: 0 kB
AnonPages: 861800 kB
Mapped: 280372 kB
Shmem: 644 kB
+KReclaimable: 168048 kB
Slab: 284364 kB
SReclaimable: 159856 kB
SUnreclaim: 124508 kB
@@ -925,6 +926,9 @@ AnonHugePages: Non-file backed huge pages mapped into userspace page tables
ShmemHugePages: Memory used by shared memory (shmem) and tmpfs allocated
with huge pages
ShmemPmdMapped: Shared memory mapped into userspace with huge pages
+KReclaimable: Kernel allocations that the kernel will attempt to reclaim
+ under memory pressure. Includes SReclaimable (below), and other
+ direct allocations with a shrinker.
Slab: in-kernel data structures cache
SReclaimable: Part of Slab, that might be reclaimed, such as caches
SUnreclaim: Part of Slab, that cannot be reclaimed on memory pressure
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 4b2084d0f1fb..a6c6a8af48a2 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -848,7 +848,7 @@ struct file_operations
----------------------
This describes how the VFS can manipulate an open file. As of kernel
-4.1, the following members are defined:
+4.18, the following members are defined:
struct file_operations {
struct module *owner;
@@ -858,11 +858,11 @@ struct file_operations {
ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
int (*iterate) (struct file *, struct dir_context *);
+ int (*iterate_shared) (struct file *, struct dir_context *);
__poll_t (*poll) (struct file *, struct poll_table_struct *);
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
int (*mmap) (struct file *, struct vm_area_struct *);
- int (*mremap)(struct file *, struct vm_area_struct *);
int (*open) (struct inode *, struct file *);
int (*flush) (struct file *, fl_owner_t id);
int (*release) (struct inode *, struct file *);
@@ -882,6 +882,10 @@ struct file_operations {
#ifndef CONFIG_MMU
unsigned (*mmap_capabilities)(struct file *);
#endif
+ ssize_t (*copy_file_range)(struct file *, loff_t, struct file *, loff_t, size_t, unsigned int);
+ int (*clone_file_range)(struct file *, loff_t, struct file *, loff_t, u64);
+ int (*dedupe_file_range)(struct file *, loff_t, struct file *, loff_t, u64);
+ int (*fadvise)(struct file *, loff_t, loff_t, int);
};
Again, all methods are called without any locks being held, unless
@@ -899,6 +903,9 @@ otherwise noted.
iterate: called when the VFS needs to read the directory contents
+ iterate_shared: called when the VFS needs to read the directory contents
+ when filesystem supports concurrent dir iterators
+
poll: called by the VFS when a process wants to check if there is
activity on this file and (optionally) go to sleep until there
is activity. Called by the select(2) and poll(2) system calls
@@ -951,6 +958,16 @@ otherwise noted.
fallocate: called by the VFS to preallocate blocks or punch a hole.
+ copy_file_range: called by the copy_file_range(2) system call.
+
+ clone_file_range: called by the ioctl(2) system call for FICLONERANGE and
+ FICLONE commands.
+
+ dedupe_file_range: called by the ioctl(2) system call for FIDEDUPERANGE
+ command.
+
+ fadvise: possibly called by the fadvise64() system call.
+
Note that the file operations are implemented by the specific
filesystem in which the inode resides. When opening a device node
(character or block special) most filesystems will call special