Diffstat (limited to 'Documentation/filesystems')
-rw-r--r--  Documentation/filesystems/Locking      |   8
-rw-r--r--  Documentation/filesystems/btrfs.txt    |  91
-rw-r--r--  Documentation/filesystems/ext4.txt     |  85
-rw-r--r--  Documentation/filesystems/proc.txt     | 289
-rw-r--r--  Documentation/filesystems/squashfs.txt | 225
-rw-r--r--  Documentation/filesystems/vfs.txt      |   8
6 files changed, 394 insertions, 312 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index cfbfa15a46ba..ec6a9392a173 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -97,8 +97,8 @@ prototypes:
void (*put_super) (struct super_block *);
void (*write_super) (struct super_block *);
int (*sync_fs)(struct super_block *sb, int wait);
- void (*write_super_lockfs) (struct super_block *);
- void (*unlockfs) (struct super_block *);
+ int (*freeze_fs) (struct super_block *);
+ int (*unfreeze_fs) (struct super_block *);
int (*statfs) (struct dentry *, struct kstatfs *);
int (*remount_fs) (struct super_block *, int *, char *);
void (*clear_inode) (struct inode *);
@@ -119,8 +119,8 @@ delete_inode: no
put_super: yes yes no
write_super: no yes read
sync_fs: no no read
-write_super_lockfs: ?
-unlockfs: ?
+freeze_fs: ?
+unfreeze_fs: ?
statfs: no no no
remount_fs: yes yes maybe (see below)
clear_inode: no
diff --git a/Documentation/filesystems/btrfs.txt b/Documentation/filesystems/btrfs.txt
new file mode 100644
index 000000000000..64087c34327f
--- /dev/null
+++ b/Documentation/filesystems/btrfs.txt
@@ -0,0 +1,91 @@
+
+ BTRFS
+ =====
+
+Btrfs is a new copy-on-write filesystem for Linux aimed at
+implementing advanced features while focusing on fault tolerance,
+repair and easy administration. Initially developed by Oracle, Btrfs
+is licensed under the GPL and open for contribution from anyone.
+
+Linux has a wealth of filesystems to choose from, but we are facing a
+number of challenges with scaling to the large storage subsystems that
+are becoming common in today's data centers. Filesystems need to scale
+in their ability to address and manage large storage, and also in
+their ability to detect, repair and tolerate errors in the data stored
+on disk. Btrfs is under heavy development, and is not suitable for
+any uses other than benchmarking and review. The Btrfs disk format is
+not yet finalized.
+
+The main Btrfs features include:
+
+ * Extent based file storage (2^64 max file size)
+ * Space efficient packing of small files
+ * Space efficient indexed directories
+ * Dynamic inode allocation
+ * Writable snapshots
+ * Subvolumes (separate internal filesystem roots)
+ * Object level mirroring and striping
+ * Checksums on data and metadata (multiple algorithms available)
+ * Compression
+ * Integrated multiple device support, with several raid algorithms
+ * Online filesystem check (not yet implemented)
+ * Very fast offline filesystem check
+ * Efficient incremental backup and FS mirroring (not yet implemented)
+ * Online filesystem defragmentation
+
+
+
+ MAILING LIST
+ ============
+
+There is a Btrfs mailing list hosted on vger.kernel.org. You can
+find details on how to subscribe here:
+
+http://vger.kernel.org/vger-lists.html#linux-btrfs
+
+Mailing list archives are available from gmane:
+
+http://dir.gmane.org/gmane.comp.file-systems.btrfs
+
+
+
+ IRC
+ ===
+
+Discussion of Btrfs also occurs on the #btrfs channel of the Freenode
+IRC network.
+
+
+
+ UTILITIES
+ =========
+
+Userspace tools for creating and manipulating Btrfs file systems are
+available from the git repository at the following location:
+
+ http://git.kernel.org/?p=linux/kernel/git/mason/btrfs-progs-unstable.git
+ git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-progs-unstable.git
+
+These include the following tools:
+
+mkfs.btrfs: create a filesystem
+
+btrfsctl: control program to create snapshots and subvolumes:
+
+ mount /dev/sda2 /mnt
+ btrfsctl -s new_subvol_name /mnt
+ btrfsctl -s snapshot_of_default /mnt/default
+ btrfsctl -s snapshot_of_new_subvol /mnt/new_subvol_name
+ btrfsctl -s snapshot_of_a_snapshot /mnt/snapshot_of_new_subvol
+ ls /mnt
+ default snapshot_of_a_snapshot snapshot_of_new_subvol
+ new_subvol_name snapshot_of_default
+
+ Snapshots and subvolumes cannot be deleted right now, but you can
+ rm -rf all the files and directories inside them.
+
+btrfsck: do a limited check of the FS extent trees.
+
+btrfs-debug-tree: print all of the FS metadata in text form. Example:
+
+ btrfs-debug-tree /dev/sda2 >& big_output_file
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 174eaff7ded9..cec829bc7291 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -58,13 +58,22 @@ Note: More extensive information for getting started with ext4 can be
# mount -t ext4 /dev/hda1 /wherever
- - When comparing performance with other filesystems, remember that
- ext3/4 by default offers higher data integrity guarantees than most.
- So when comparing with a metadata-only journalling filesystem, such
- as ext3, use `mount -o data=writeback'. And you might as well use
- `mount -o nobh' too along with it. Making the journal larger than
- the mke2fs default often helps performance with metadata-intensive
- workloads.
+ - When comparing performance with other filesystems, it's always
+ important to try multiple workloads; very often a subtle change in a
+ workload parameter can completely change the ranking of which
+ filesystems do well compared to others. When comparing against
+ ext3, note that ext4 enables write barriers by default, while
+ ext3 does not. So it is useful to explicitly specify whether
+ barriers are enabled via the '-o barrier=[0|1]' mount option for
+ both ext3 and ext4 filesystems
+ for a fair comparison. When tuning ext3 for best benchmark numbers,
+ it is often worthwhile to try changing the data journaling mode; '-o
+ data=writeback,nobh' can be faster for some workloads. (Note
+ however that running mounted with data=writeback can potentially
+ leave stale data exposed in recently written files in case of an
+ unclean shutdown, which could be a security exposure in some
+ situations.) Configuring the filesystem with a large journal can
+ also be helpful for metadata-intensive workloads.
2. Features
===========
@@ -74,7 +83,7 @@ Note: More extensive information for getting started with ext4 can be
* ability to use filesystems > 16TB (e2fsprogs support not available yet)
* extent format reduces metadata overhead (RAM, IO for access, transactions)
* extent format more robust in face of on-disk corruption due to magics,
-* internal redunancy in tree
+* internal redundancy in tree
* improved file allocation (multi-block alloc)
* fix 32000 subdirectory limit
* nsec timestamps for mtime, atime, ctime, create time
@@ -116,10 +125,11 @@ grouping of bitmaps and inode tables. Some test results available here:
When mounting an ext4 filesystem, the following options are accepted:
(*) == default
-extents (*) ext4 will use extents to address file data. The
- file system will no longer be mountable by ext3.
-
-noextents ext4 will not use extents for newly created files
+ro Mount filesystem read only. Note that ext4 will
+ replay the journal (and thus write to the
+ partition) even when mounted "read only". The
+ mount options "ro,noload" can be used to prevent
+ writes to the filesystem.
journal_checksum Enable checksumming of the journal transactions.
This will allow the recovery code in e2fsck and the
@@ -134,17 +144,17 @@ journal_async_commit Commit block can be written to disk without waiting
journal=update Update the ext4 file system's journal to the current
format.
-journal=inum When a journal already exists, this option is ignored.
- Otherwise, it specifies the number of the inode which
- will represent the ext4 file system's journal file.
-
journal_dev=devnum When the external journal device's major/minor numbers
have changed, this option allows the user to specify
the new journal location. The journal device is
identified through its new major/minor numbers encoded
in devnum.
-noload Don't load the journal on mounting.
+noload Don't load the journal on mounting. Note that
+ if the filesystem was not unmounted cleanly,
+ skipping the journal replay will leave the
+ filesystem containing inconsistencies that can
+ lead to any number of problems.
data=journal All data are committed into the journal prior to being
written into the main file system.
@@ -219,9 +229,12 @@ minixdf Make 'df' act like Minix.
debug Extra debugging information is sent to syslog.
-errors=remount-ro(*) Remount the filesystem read-only on an error.
+errors=remount-ro Remount the filesystem read-only on an error.
errors=continue Keep going on a filesystem error.
errors=panic Panic and halt the machine if an error occurs.
+ (These mount options override the errors behavior
+ specified in the superblock, which can be configured
+ using tune2fs.)
data_err=ignore(*) Just print an error message if an error occurs
in a file data buffer in ordered mode.
@@ -261,6 +274,42 @@ delalloc (*) Deferring block allocation until write-out time.
nodelalloc Disable delayed allocation. Blocks are allocated
when data is copied from user space to the page cache.
+max_batch_time=usec Maximum amount of time ext4 should wait for
+ additional filesystem operations to be batched
+ together with a synchronous write operation.
+ Since a synchronous write operation forces a
+ commit and then waits for the I/O to complete,
+ it doesn't cost much, and can be a huge
+ throughput win, to wait for a small amount of
+ time to see if any other transactions can
+ piggyback on the synchronous write. The
+ algorithm used is designed to automatically tune
+ for the speed of the disk, by measuring the
+ amount of time (on average) that it takes to
+ finish committing a transaction. Call this time
+ the "commit time". If the time that the
+ transaction has been running is less than the
+ commit time, ext4 will try sleeping for the
+ commit time to see if other operations will join
+ the transaction. The commit time is capped by
+ max_batch_time, which defaults to 15000us
+ (15ms). This optimization can be turned off
+ entirely by setting max_batch_time to 0. (A
+ sketch of this batching logic appears after this
+ option list.)
+
+min_batch_time=usec This parameter sets the commit time (as
+ described above) to be at least min_batch_time.
+ It defaults to zero microseconds. Increasing
+ this parameter may improve the throughput of
+ multi-threaded, synchronous workloads on very
+ fast disks, at the cost of increasing latency.
+
+journal_ioprio=prio The I/O priority (from 0 to 7, where 0 is the
+ highest priority) which should be used for I/O
+ operations submitted by kjournald2 during a
+ commit operation. This defaults to 3, which is
+ a slightly higher priority than the default I/O
+ priority.
+
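As a rough illustration of the batching heuristic described under
max_batch_time above, the logic amounts to something like the sketch
below. This is simplified C, not the actual jbd2 implementation; the
structure and helper functions are invented for illustration, and all
times are in microseconds:

    struct journal_sketch {
            unsigned long average_commit_time;  /* measured average */
            unsigned long min_batch_time;       /* mount option */
            unsigned long max_batch_time;       /* mount option */
    };

    extern unsigned long transaction_age(struct journal_sketch *j);
    extern void sleep_usec(unsigned long usec);
    extern void commit_and_wait(struct journal_sketch *j);

    void sync_write_commit(struct journal_sketch *j)
    {
            unsigned long commit_time = j->average_commit_time;

            /* Apply the floor first, then the cap, so that setting
             * max_batch_time to 0 disables batching entirely. */
            if (commit_time < j->min_batch_time)
                    commit_time = j->min_batch_time;
            if (commit_time > j->max_batch_time)
                    commit_time = j->max_batch_time;

            /* If the running transaction is younger than one commit
             * time, sleep so other operations can join it before the
             * commit is forced. */
            if (commit_time && transaction_age(j) < commit_time)
                    sleep_usec(commit_time);

            commit_and_wait(j);     /* force the commit, wait for I/O */
    }
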
Data Mode
=========
There are 3 different data modes:
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 32e94635484f..bbebc3a43ac0 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -140,6 +140,7 @@ Table 1-1: Process specific entries in /proc
statm Process memory status information
status Process status in human readable form
wchan If CONFIG_KALLSYMS is set, a pre-decoded wchan
+ stack Report full stack trace; available if CONFIG_STACKTRACE is set
smaps Extension based on maps, the rss size for each mapped file
..............................................................................
@@ -1370,292 +1371,8 @@ auto_msgmni default value is 1.
2.4 /proc/sys/vm - The virtual memory subsystem
-----------------------------------------------
-The files in this directory can be used to tune the operation of the virtual
-memory (VM) subsystem of the Linux kernel.
-
-vfs_cache_pressure
-------------------
-
-Controls the tendency of the kernel to reclaim the memory which is used for
-caching of directory and inode objects.
-
-At the default value of vfs_cache_pressure=100 the kernel will attempt to
-reclaim dentries and inodes at a "fair" rate with respect to pagecache and
-swapcache reclaim. Decreasing vfs_cache_pressure causes the kernel to prefer
-to retain dentry and inode caches. Increasing vfs_cache_pressure beyond 100
-causes the kernel to prefer to reclaim dentries and inodes.
-
-dirty_background_bytes
-----------------------
-
-Contains the amount of dirty memory at which the pdflush background writeback
-daemon will start writeback.
-
-If dirty_background_bytes is written, dirty_background_ratio becomes a function
-of its value (dirty_background_bytes / the amount of dirtyable system memory).
-
-dirty_background_ratio
-----------------------
-
-Contains, as a percentage of the dirtyable system memory (free pages + mapped
-pages + file cache, not including locked pages and HugePages), the number of
-pages at which the pdflush background writeback daemon will start writing out
-dirty data.
-
-If dirty_background_ratio is written, dirty_background_bytes becomes a function
-of its value (dirty_background_ratio * the amount of dirtyable system memory).
-
-dirty_bytes
------------
-
-Contains the amount of dirty memory at which a process generating disk writes
-will itself start writeback.
-
-If dirty_bytes is written, dirty_ratio becomes a function of its value
-(dirty_bytes / the amount of dirtyable system memory).
-
-dirty_ratio
------------
-
-Contains, as a percentage of the dirtyable system memory (free pages + mapped
-pages + file cache, not including locked pages and HugePages), the number of
-pages at which a process which is generating disk writes will itself start
-writing out dirty data.
-
-If dirty_ratio is written, dirty_bytes becomes a function of its value
-(dirty_ratio * the amount of dirtyable system memory).
-
-dirty_writeback_centisecs
--------------------------
-
-The pdflush writeback daemons will periodically wake up and write `old' data
-out to disk. This tunable expresses the interval between those wakeups, in
-100'ths of a second.
-
-Setting this to zero disables periodic writeback altogether.
-
-dirty_expire_centisecs
-----------------------
-
-This tunable is used to define when dirty data is old enough to be eligible
-for writeout by the pdflush daemons. It is expressed in 100'ths of a second.
-Data which has been dirty in-memory for longer than this interval will be
-written out next time a pdflush daemon wakes up.
-
-highmem_is_dirtyable
---------------------
-
-Only present if CONFIG_HIGHMEM is set.
-
-This defaults to 0 (false), meaning that the ratios set above are calculated
-as a percentage of lowmem only. This protects against excessive scanning
-in page reclaim, swapping and general VM distress.
-
-Setting this to 1 can be useful on 32 bit machines where you want to make
-random changes within an MMAPed file that is larger than your available
-lowmem without causing large quantities of random IO. Is is safe if the
-behavior of all programs running on the machine is known and memory will
-not be otherwise stressed.
-
-legacy_va_layout
-----------------
-
-If non-zero, this sysctl disables the new 32-bit mmap mmap layout - the kernel
-will use the legacy (2.4) layout for all processes.
-
-lowmem_reserve_ratio
----------------------
-
-For some specialised workloads on highmem machines it is dangerous for
-the kernel to allow process memory to be allocated from the "lowmem"
-zone. This is because that memory could then be pinned via the mlock()
-system call, or by unavailability of swapspace.
-
-And on large highmem machines this lack of reclaimable lowmem memory
-can be fatal.
-
-So the Linux page allocator has a mechanism which prevents allocations
-which _could_ use highmem from using too much lowmem. This means that
-a certain amount of lowmem is defended from the possibility of being
-captured into pinned user memory.
-
-(The same argument applies to the old 16 megabyte ISA DMA region. This
-mechanism will also defend that region from allocations which could use
-highmem or lowmem).
-
-The `lowmem_reserve_ratio' tunable determines how aggressive the kernel is
-in defending these lower zones.
-
-If you have a machine which uses highmem or ISA DMA and your
-applications are using mlock(), or if you are running with no swap then
-you probably should change the lowmem_reserve_ratio setting.
-
-The lowmem_reserve_ratio is an array. You can see them by reading this file.
--
-% cat /proc/sys/vm/lowmem_reserve_ratio
-256 256 32
--
-Note: # of this elements is one fewer than number of zones. Because the highest
- zone's value is not necessary for following calculation.
-
-But, these values are not used directly. The kernel calculates # of protection
-pages for each zones from them. These are shown as array of protection pages
-in /proc/zoneinfo like followings. (This is an example of x86-64 box).
-Each zone has an array of protection pages like this.
-
--
-Node 0, zone DMA
- pages free 1355
- min 3
- low 3
- high 4
- :
- :
- numa_other 0
- protection: (0, 2004, 2004, 2004)
- ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
- pagesets
- cpu: 0 pcp: 0
- :
--
-These protections are added to score to judge whether this zone should be used
-for page allocation or should be reclaimed.
-
-In this example, if normal pages (index=2) are required to this DMA zone and
-pages_high is used for watermark, the kernel judges this zone should not be
-used because pages_free(1355) is smaller than watermark + protection[2]
-(4 + 2004 = 2008). If this protection value is 0, this zone would be used for
-normal page requirement. If requirement is DMA zone(index=0), protection[0]
-(=0) is used.
-
-zone[i]'s protection[j] is calculated by following expression.
-
-(i < j):
- zone[i]->protection[j]
- = (total sums of present_pages from zone[i+1] to zone[j] on the node)
- / lowmem_reserve_ratio[i];
-(i = j):
- (should not be protected. = 0;
-(i > j):
- (not necessary, but looks 0)
-
-The default values of lowmem_reserve_ratio[i] are
- 256 (if zone[i] means DMA or DMA32 zone)
- 32 (others).
-As above expression, they are reciprocal number of ratio.
-256 means 1/256. # of protection pages becomes about "0.39%" of total present
-pages of higher zones on the node.
-
-If you would like to protect more pages, smaller values are effective.
-The minimum value is 1 (1/1 -> 100%).
-
-page-cluster
-------------
-
-page-cluster controls the number of pages which are written to swap in
-a single attempt. The swap I/O size.
-
-It is a logarithmic value - setting it to zero means "1 page", setting
-it to 1 means "2 pages", setting it to 2 means "4 pages", etc.
-
-The default value is three (eight pages at a time). There may be some
-small benefits in tuning this to a different value if your workload is
-swap-intensive.
-
-overcommit_memory
------------------
-
-Controls overcommit of system memory, possibly allowing processes
-to allocate (but not use) more memory than is actually available.
-
-
-0 - Heuristic overcommit handling. Obvious overcommits of
- address space are refused. Used for a typical system. It
- ensures a seriously wild allocation fails while allowing
- overcommit to reduce swap usage. root is allowed to
- allocate slightly more memory in this mode. This is the
- default.
-
-1 - Always overcommit. Appropriate for some scientific
- applications.
-
-2 - Don't overcommit. The total address space commit
- for the system is not permitted to exceed swap plus a
- configurable percentage (default is 50) of physical RAM.
- Depending on the percentage you use, in most situations
- this means a process will not be killed while attempting
- to use already-allocated memory but will receive errors
- on memory allocation as appropriate.
-
-overcommit_ratio
-----------------
-
-Percentage of physical memory size to include in overcommit calculations
-(see above.)
-
-Memory allocation limit = swapspace + physmem * (overcommit_ratio / 100)
-
- swapspace = total size of all swap areas
- physmem = size of physical memory in system
-
-nr_hugepages and hugetlb_shm_group
-----------------------------------
-
-nr_hugepages configures number of hugetlb page reserved for the system.
-
-hugetlb_shm_group contains group id that is allowed to create SysV shared
-memory segment using hugetlb page.
-
-hugepages_treat_as_movable
---------------------------
-
-This parameter is only useful when kernelcore= is specified at boot time to
-create ZONE_MOVABLE for pages that may be reclaimed or migrated. Huge pages
-are not movable so are not normally allocated from ZONE_MOVABLE. A non-zero
-value written to hugepages_treat_as_movable allows huge pages to be allocated
-from ZONE_MOVABLE.
-
-Once enabled, the ZONE_MOVABLE is treated as an area of memory the huge
-pages pool can easily grow or shrink within. Assuming that applications are
-not running that mlock() a lot of memory, it is likely the huge pages pool
-can grow to the size of ZONE_MOVABLE by repeatedly entering the desired value
-into nr_hugepages and triggering page reclaim.
-
-laptop_mode
------------
-
-laptop_mode is a knob that controls "laptop mode". All the things that are
-controlled by this knob are discussed in Documentation/laptops/laptop-mode.txt.
-
-block_dump
-----------
-
-block_dump enables block I/O debugging when set to a nonzero value. More
-information on block I/O debugging is in Documentation/laptops/laptop-mode.txt.
-
-swap_token_timeout
-------------------
-
-This file contains valid hold time of swap out protection token. The Linux
-VM has token based thrashing control mechanism and uses the token to prevent
-unnecessary page faults in thrashing situation. The unit of the value is
-second. The value would be useful to tune thrashing behavior.
-
-drop_caches
------------
-
-Writing to this will cause the kernel to drop clean caches, dentries and
-inodes from memory, causing that memory to become free.
-
-To free pagecache:
- echo 1 > /proc/sys/vm/drop_caches
-To free dentries and inodes:
- echo 2 > /proc/sys/vm/drop_caches
-To free pagecache, dentries and inodes:
- echo 3 > /proc/sys/vm/drop_caches
-
-As this is a non-destructive operation and dirty objects are not freeable, the
-user should run `sync' first.
+Please see Documentation/sysctl/vm.txt for a description of these
+entries.
2.5 /proc/sys/dev - Device specific parameters
diff --git a/Documentation/filesystems/squashfs.txt b/Documentation/filesystems/squashfs.txt
new file mode 100644
index 000000000000..3e79e4a7a392
--- /dev/null
+++ b/Documentation/filesystems/squashfs.txt
@@ -0,0 +1,225 @@
+SQUASHFS 4.0 FILESYSTEM
+=======================
+
+Squashfs is a compressed read-only filesystem for Linux.
+It uses zlib compression to compress files, inodes and directories.
+Inodes in the system are very small and all blocks are packed to minimise
+data overhead. Block sizes greater than 4K are supported up to a maximum
+of 1 Mbyte (default block size 128K).
+
+Squashfs is intended for general read-only filesystem use, for archival
+use (i.e. in cases where a .tar.gz file may be used), and in constrained
+block device/memory systems (e.g. embedded systems) where low overhead is
+needed.
+
+Mailing list: squashfs-devel@lists.sourceforge.net
+Web site: www.squashfs.org
+
+1. FILESYSTEM FEATURES
+----------------------
+
+Squashfs filesystem features versus Cramfs:
+
+                                     Squashfs          Cramfs
+
+Max filesystem size:                 2^64              16 MiB
+Max file size:                       ~ 2 TiB           16 MiB
+Max files:                           unlimited         unlimited
+Max directories:                     unlimited         unlimited
+Max entries per directory:           unlimited         unlimited
+Max block size:                      1 MiB             4 KiB
+Metadata compression:                yes               no
+Directory indexes:                   yes               no
+Sparse file support:                 yes               no
+Tail-end packing (fragments):        yes               no
+Exportable (NFS etc.):               yes               no
+Hard link support:                   yes               no
+"." and ".." in readdir:             yes               no
+Real inode numbers:                  yes               no
+32-bit uids/gids:                    yes               no
+File creation time:                  yes               no
+Xattr and ACL support:               no                no
+
+Squashfs compresses data, inodes and directories. In addition, inode and
+directory data are highly compacted, and packed on byte boundaries. Each
+compressed inode is on average 8 bytes in length (the exact length varies with
+file type, i.e. regular file, directory, symbolic link, and block/char device
+inodes have different sizes).
+
+2. USING SQUASHFS
+-----------------
+
+As squashfs is a read-only filesystem, the mksquashfs program must be used to
+create populated squashfs filesystems. This and other squashfs utilities
+can be obtained from http://www.squashfs.org. Usage instructions can be
+obtained from this site also.
+
+
+3. SQUASHFS FILESYSTEM DESIGN
+-----------------------------
+
+A squashfs filesystem consists of seven parts, packed together on a byte
+alignment:
+
+ ---------------
+ | superblock |
+ |---------------|
+ | datablocks |
+ | & fragments |
+ |---------------|
+ | inode table |
+ |---------------|
+ | directory |
+ | table |
+ |---------------|
+ | fragment |
+ | table |
+ |---------------|
+ | export |
+ | table |
+ |---------------|
+ | uid/gid |
+ | lookup table |
+ ---------------
+
+Compressed data blocks are written to the filesystem as files are read from
+the source directory, and checked for duplicates. Once all file data has been
+written the completed inode, directory, fragment, export and uid/gid lookup
+tables are written.
+
+3.1 Inodes
+----------
+
+Metadata (inodes and directories) are compressed in 8Kbyte blocks. Each
+compressed block is prefixed by a two byte length; the top bit is set if the
+block is uncompressed. A block will be uncompressed if the -noI option is set,
+or if the compressed block was larger than the uncompressed block.
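+
+For illustration, the two byte length field can be decoded along the
+following lines. This is a sketch based only on the description above;
+the constant and function names are ours, not taken from the Squashfs
+sources:
+
+    /* The low 15 bits give the on-disk length of the metadata block;
+     * the top bit is set when the block is stored uncompressed. */
+    #define UNCOMPRESSED_BIT        0x8000
+
+    static unsigned int header_length(unsigned short header,
+                                      int *is_compressed)
+    {
+            *is_compressed = !(header & UNCOMPRESSED_BIT);
+            return header & ~UNCOMPRESSED_BIT;
+    }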
+
+Inodes are packed into the metadata blocks, and are not aligned to block
+boundaries, therefore inodes overlap compressed blocks. Inodes are identified
+by a 48-bit number which encodes the location of the compressed metadata block
+containing the inode, and the byte offset into that block where the inode is
+placed (<block, offset>).
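+
+A 48-bit inode reference of this form might be unpacked as shown
+below. This is a sketch inferred from the text: since decompressed
+metadata blocks are 8K, a 16-bit byte offset is assumed here (an
+assumption on our part, not the on-disk format definition), with the
+remaining bits locating the compressed metadata block:
+
+    /* Split an inode reference into its <block, offset> pair. */
+    static inline unsigned long long inode_block(unsigned long long ref)
+    {
+            return ref >> 16;       /* compressed metadata block */
+    }
+
+    static inline unsigned int inode_offset(unsigned long long ref)
+    {
+            return ref & 0xffff;    /* byte offset within the block */
+    }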
+
+To maximise compression there are different inodes for each file type
+(regular file, directory, device, etc.), the inode contents and length
+varying with the type.
+
+To further maximise compression, two types of regular file inode and
+directory inode are defined: inodes optimised for frequently occurring
+regular files and directories, and extended types where extra
+information has to be stored.
+
+3.2 Directories
+---------------
+
+Like inodes, directories are packed into compressed metadata blocks, stored
+in a directory table. Directories are accessed using the start address of
+the metablock containing the directory and the offset into the
+decompressed block (<block, offset>).
+
+Directories are organised in a slightly complex way, and are not simply
+a list of file names. The organisation takes advantage of the
+fact that (in most cases) the inodes of the files will be in the same
+compressed metadata block, and therefore, can share the start block.
+Directories are therefore organised as a two level list: a directory
+header containing the shared start block value, followed by a sequence
+of directory entries, each of which shares that start block. A new
+directory header is written once/if the inode start block changes. The
+directory header/directory entry list is repeated as many times as
+necessary.
+
+Directories are sorted, and can contain a directory index to speed up
+file lookup. Directory indexes store one entry per metablock, each entry
+storing the index/filename mapping to the first directory header
+in each metadata block. Directories are sorted in alphabetical order,
+and at lookup the index is scanned linearly looking for the first filename
+alphabetically larger than the filename being looked up. At this point the
+location of the metadata block the filename is in has been found.
+The general idea of the index is to ensure only one metadata block needs to be
+decompressed to do a lookup irrespective of the length of the directory.
+This scheme has the advantage that it doesn't require extra memory overhead
+and doesn't require much extra storage on disk.
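+
+The index scan itself can be sketched as follows. The structures here
+are invented for illustration, but the linear scan mirrors the
+description above:
+
+    #include <string.h>
+
+    /* One index entry per metadata block in the directory. */
+    struct dir_index {
+            const char *first_name; /* first filename in that block */
+            unsigned int block;     /* start of that metadata block */
+    };
+
+    /* Return the metadata block that must contain "name": walk the
+     * sorted index and stop at the first entry alphabetically larger
+     * than the name being looked up. */
+    static unsigned int lookup_block(const struct dir_index *index,
+                                     int entries, const char *name,
+                                     unsigned int first_block)
+    {
+            unsigned int block = first_block;
+            int i;
+
+            for (i = 0; i < entries; i++) {
+                    if (strcmp(index[i].first_name, name) > 0)
+                            break;
+                    block = index[i].block;
+            }
+            return block;   /* only this block needs decompressing */
+    }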
+
+3.3 File data
+-------------
+
+Regular files consist of a sequence of contiguous compressed blocks, and/or a
+compressed fragment block (tail-end packed block). The compressed size
+of each datablock is stored in a block list contained within the
+file inode.
+
+To speed up access to datablocks when reading 'large' files (256 Mbytes or
+larger), the code implements an index cache that caches the mapping from
+block index to datablock location on disk.
+
+The index cache allows Squashfs to handle large files (up to 1.75 TiB) while
+retaining a simple and space-efficient block list on disk. The cache
+is split into slots, caching up to eight 224 GiB files (128 KiB blocks).
+Larger files use multiple slots, with 1.75 TiB files using all 8 slots.
+The index cache is designed to be memory efficient, and by default uses
+16 KiB.
+
+3.4 Fragment lookup table
+-------------------------
+
+Regular files can contain a fragment index which is mapped to a fragment
+location on disk and compressed size using a fragment lookup table. This
+fragment lookup table is itself stored compressed into metadata blocks.
+A second index table is used to locate these. For speed of access (and
+because it is small), this second index table is read at mount time and
+cached in memory.
+
+3.5 Uid/gid lookup table
+------------------------
+
+For space efficiency regular files store uid and gid indexes, which are
+converted to 32-bit uids/gids using an id look up table. This table is
+stored compressed into metadata blocks. A second index table is used to
+locate these. For speed of access (and because it is small), this second
+index table is read at mount time and cached in memory.
+
+3.6 Export table
+----------------
+
+To enable Squashfs filesystems to be exportable (via NFS etc.) filesystems
+can optionally (disabled with the -no-exports Mksquashfs option) contain
+an inode number to inode disk location lookup table. This is required to
+enable Squashfs to map inode numbers passed in filehandles to the inode
+location on disk, which is necessary when the export code reinstantiates
+expired/flushed inodes.
+
+This table is stored compressed into metadata blocks. A second index table is
+used to locate these. For speed of access (and because it is small), this
+second index table is read at mount time and cached in memory.
+
+
+4. TODOS AND OUTSTANDING ISSUES
+-------------------------------
+
+4.1 Todo list
+-------------
+
+Implement Xattr and ACL support. The Squashfs 4.0 filesystem layout has hooks
+for these but the code has not been written. Once the code has been written
+the existing layout should not require modification.
+
+4.2 Squashfs internal cache
+---------------------------
+
+Blocks in Squashfs are compressed. To avoid repeatedly decompressing
+recently accessed data, Squashfs uses two small metadata and fragment caches.
+
+The cache is not used for file datablocks; these are decompressed and cached in
+the page-cache in the normal way. The cache is used to temporarily cache
+fragment and metadata blocks which have been read as a result of a metadata
+(i.e. inode or directory) or fragment access. Because metadata and fragments
+are packed together into blocks (to gain greater compression) the read of a
+particular piece of metadata or fragment will retrieve other metadata/fragments
+which have been packed with it; because of locality of reference, these may be
+read in the near future. Temporarily caching them ensures they are available
+for near future access without requiring an additional read and decompress.
+
+In the future this internal cache may be replaced with an implementation which
+uses the kernel page cache. Because the page cache operates on page sized
+units this may introduce additional complexity in terms of locking and
+associated race conditions.
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index ef19afa186a9..deeeed0faa8f 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -210,8 +210,8 @@ struct super_operations {
void (*put_super) (struct super_block *);
void (*write_super) (struct super_block *);
int (*sync_fs)(struct super_block *sb, int wait);
- void (*write_super_lockfs) (struct super_block *);
- void (*unlockfs) (struct super_block *);
+ int (*freeze_fs) (struct super_block *);
+ int (*unfreeze_fs) (struct super_block *);
int (*statfs) (struct dentry *, struct kstatfs *);
int (*remount_fs) (struct super_block *, int *, char *);
void (*clear_inode) (struct inode *);
@@ -270,11 +270,11 @@ or bottom half).
a superblock. The second parameter indicates whether the method
should wait until the write out has been completed. Optional.
- write_super_lockfs: called when VFS is locking a filesystem and
+ freeze_fs: called when VFS is locking a filesystem and
forcing it into a consistent state. This method is currently
used by the Logical Volume Manager (LVM).
- unlockfs: called when VFS is unlocking a filesystem and making it writable
+ unfreeze_fs: called when VFS is unlocking a filesystem and making it writable
again.
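
  As an illustration of the new interface, a filesystem might wire up
  these methods roughly as follows. This is a minimal sketch, not code
  from any in-tree filesystem; all myfs_* names are hypothetical:

    /* Unlike the old write_super_lockfs/unlockfs methods, these
     * return an int, so a failure to freeze can be reported. */
    static int myfs_freeze_fs(struct super_block *sb)
    {
            struct myfs_sb_info *sbi = sb->s_fs_info;
            int err;

            /* Flush everything so the on-disk image stays consistent
             * for the duration of the freeze. */
            err = myfs_sync_all(sbi);
            if (err)
                    return err;

            sbi->frozen = 1;        /* block new transactions */
            return 0;
    }

    static int myfs_unfreeze_fs(struct super_block *sb)
    {
            struct myfs_sb_info *sbi = sb->s_fs_info;

            sbi->frozen = 0;        /* allow transactions again */
            return 0;
    }

    static const struct super_operations myfs_sops = {
            .freeze_fs      = myfs_freeze_fs,
            .unfreeze_fs    = myfs_unfreeze_fs,
            /* ... other methods ... */
    };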
statfs: called when the VFS needs to get filesystem statistics. This