aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation/filesystems
diff options
context:
space:
mode:
authorMike Marshall <hubcap@omnibond.com>2016-03-14 15:39:42 -0400
committerMike Marshall <hubcap@omnibond.com>2016-03-14 15:39:42 -0400
commitab6652524aaf834d5dcdb46dd7695813b8d63da5 (patch)
treebb3876a9b61254be902416f7dbf578deb28e9f22 /Documentation/filesystems
parentorangefs: make fs_mount_pending static (diff)
parentLinux 4.5 (diff)
downloadlinux-dev-ab6652524aaf834d5dcdb46dd7695813b8d63da5.tar.xz
linux-dev-ab6652524aaf834d5dcdb46dd7695813b8d63da5.zip
Orangefs: merge to v4.5
Merge tag 'v4.5' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux into current Linux 4.5
Diffstat (limited to 'Documentation/filesystems')
-rw-r--r--Documentation/filesystems/Locking6
-rw-r--r--Documentation/filesystems/configfs/configfs.txt57
-rw-r--r--Documentation/filesystems/efivarfs.txt7
-rw-r--r--Documentation/filesystems/f2fs.txt10
-rw-r--r--Documentation/filesystems/porting21
-rw-r--r--Documentation/filesystems/proc.txt38
-rw-r--r--Documentation/filesystems/sharedsubtree.txt2
-rw-r--r--Documentation/filesystems/tmpfs.txt8
-rw-r--r--Documentation/filesystems/vfat.txt10
-rw-r--r--Documentation/filesystems/vfs.txt21
10 files changed, 134 insertions, 46 deletions
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 06d443450f21..619af9bfdcb3 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -50,8 +50,7 @@ prototypes:
int (*rename2) (struct inode *, struct dentry *,
struct inode *, struct dentry *, unsigned int);
int (*readlink) (struct dentry *, char __user *,int);
- const char *(*follow_link) (struct dentry *, void **);
- void (*put_link) (struct inode *, void *);
+ const char *(*get_link) (struct dentry *, struct inode *, void **);
void (*truncate) (struct inode *);
int (*permission) (struct inode *, int, unsigned int);
int (*get_acl)(struct inode *, int);
@@ -83,8 +82,7 @@ rmdir: yes (both) (see below)
rename: yes (all) (see below)
rename2: yes (all) (see below)
readlink: no
-follow_link: no
-put_link: no
+get_link: no
setattr: yes
permission: no (may not block if called in rcu-walk mode)
get_acl: no
diff --git a/Documentation/filesystems/configfs/configfs.txt b/Documentation/filesystems/configfs/configfs.txt
index af68efdbbfad..e5fe521eea1d 100644
--- a/Documentation/filesystems/configfs/configfs.txt
+++ b/Documentation/filesystems/configfs/configfs.txt
@@ -51,15 +51,27 @@ configfs tree is always there, whether mounted on /config or not.
An item is created via mkdir(2). The item's attributes will also
appear at this time. readdir(3) can determine what the attributes are,
read(2) can query their default values, and write(2) can store new
-values. Like sysfs, attributes should be ASCII text files, preferably
-with only one value per file. The same efficiency caveats from sysfs
-apply. Don't mix more than one attribute in one attribute file.
-
-Like sysfs, configfs expects write(2) to store the entire buffer at
-once. When writing to configfs attributes, userspace processes should
-first read the entire file, modify the portions they wish to change, and
-then write the entire buffer back. Attribute files have a maximum size
-of one page (PAGE_SIZE, 4096 on i386).
+values. Don't mix more than one attribute in one attribute file.
+
+There are two types of configfs attributes:
+
+* Normal attributes, which similar to sysfs attributes, are small ASCII text
+files, with a maximum size of one page (PAGE_SIZE, 4096 on i386). Preferably
+only one value per file should be used, and the same caveats from sysfs apply.
+Configfs expects write(2) to store the entire buffer at once. When writing to
+normal configfs attributes, userspace processes should first read the entire
+file, modify the portions they wish to change, and then write the entire
+buffer back.
+
+* Binary attributes, which are somewhat similar to sysfs binary attributes,
+but with a few slight changes to semantics. The PAGE_SIZE limitation does not
+apply, but the whole binary item must fit in single kernel vmalloc'ed buffer.
+The write(2) calls from user space are buffered, and the attributes'
+write_bin_attribute method will be invoked on the final close, therefore it is
+imperative for user-space to check the return code of close(2) in order to
+verify that the operation finished successfully.
+To avoid a malicious user OOMing the kernel, there's a per-binary attribute
+maximum buffer value.
When an item needs to be destroyed, remove it with rmdir(2). An
item cannot be destroyed if any other item has a link to it (via
@@ -171,6 +183,7 @@ among other things. For that, it needs a type.
struct configfs_item_operations *ct_item_ops;
struct configfs_group_operations *ct_group_ops;
struct configfs_attribute **ct_attrs;
+ struct configfs_bin_attribute **ct_bin_attrs;
};
The most basic function of a config_item_type is to define what
@@ -201,6 +214,32 @@ be called whenever userspace asks for a read(2) on the attribute. If an
attribute is writable and provides a ->store method, that method will be
be called whenever userspace asks for a write(2) on the attribute.
+[struct configfs_bin_attribute]
+
+ struct configfs_attribute {
+ struct configfs_attribute cb_attr;
+ void *cb_private;
+ size_t cb_max_size;
+ };
+
+The binary attribute is used when the one needs to use binary blob to
+appear as the contents of a file in the item's configfs directory.
+To do so add the binary attribute to the NULL-terminated array
+config_item_type->ct_bin_attrs, and the item appears in configfs, the
+attribute file will appear with the configfs_bin_attribute->cb_attr.ca_name
+filename. configfs_bin_attribute->cb_attr.ca_mode specifies the file
+permissions.
+The cb_private member is provided for use by the driver, while the
+cb_max_size member specifies the maximum amount of vmalloc buffer
+to be used.
+
+If binary attribute is readable and the config_item provides a
+ct_item_ops->read_bin_attribute() method, that method will be called
+whenever userspace asks for a read(2) on the attribute. The converse
+will happen for write(2). The reads/writes are bufferred so only a
+single read/write will occur; the attributes' need not concern itself
+with it.
+
[struct config_group]
A config_item cannot live in a vacuum. The only way one can be created
diff --git a/Documentation/filesystems/efivarfs.txt b/Documentation/filesystems/efivarfs.txt
index c477af086e65..686a64bba775 100644
--- a/Documentation/filesystems/efivarfs.txt
+++ b/Documentation/filesystems/efivarfs.txt
@@ -14,3 +14,10 @@ filesystem.
efivarfs is typically mounted like this,
mount -t efivarfs none /sys/firmware/efi/efivars
+
+Due to the presence of numerous firmware bugs where removing non-standard
+UEFI variables causes the system firmware to fail to POST, efivarfs
+files that are not well-known standardized variables are created
+as immutable files. This doesn't prevent removal - "chattr -i" will work -
+but it does prevent this kind of failure from being accomplished
+accidentally.
diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt
index b102b436563e..e1c9f0849da6 100644
--- a/Documentation/filesystems/f2fs.txt
+++ b/Documentation/filesystems/f2fs.txt
@@ -102,7 +102,7 @@ background_gc=%s Turn on/off cleaning operations, namely garbage
collection, triggered in background when I/O subsystem is
idle. If background_gc=on, it will turn on the garbage
collection and if background_gc=off, garbage collection
- will be truned off. If background_gc=sync, it will turn
+ will be turned off. If background_gc=sync, it will turn
on synchronous garbage collection running in background.
Default value for this option is on. So garbage
collection is on by default.
@@ -145,10 +145,12 @@ extent_cache Enable an extent cache based on rb-tree, it can cache
as many as extent which map between contiguous logical
address and physical address per inode, resulting in
increasing the cache hit ratio. Set by default.
-noextent_cache Diable an extent cache based on rb-tree explicitly, see
+noextent_cache Disable an extent cache based on rb-tree explicitly, see
the above extent_cache mount option.
noinline_data Disable the inline data feature, inline data feature is
enabled by default.
+data_flush Enable data flushing before checkpoint in order to
+ persist data of regular and symlink.
================================================================================
DEBUGFS ENTRIES
@@ -192,7 +194,7 @@ Files in /sys/fs/f2fs/<devname>
policy for garbage collection. Setting gc_idle = 0
(default) will disable this option. Setting
gc_idle = 1 will select the Cost Benefit approach
- & setting gc_idle = 2 will select the greedy aproach.
+ & setting gc_idle = 2 will select the greedy approach.
reclaim_segments This parameter controls the number of prefree
segments to be reclaimed. If the number of prefree
@@ -298,7 +300,7 @@ The dump.f2fs shows the information of specific inode and dumps SSA and SIT to
file. Each file is dump_ssa and dump_sit.
The dump.f2fs is used to debug on-disk data structures of the f2fs filesystem.
-It shows on-disk inode information reconized by a given inode number, and is
+It shows on-disk inode information recognized by a given inode number, and is
able to dump all the SSA and SIT entries into predefined files, ./dump_ssa and
./dump_sit respectively.
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index f24d1b833957..f1b87d8aa2da 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -504,3 +504,24 @@ in your dentry operations instead.
[mandatory]
__fd_install() & fd_install() can now sleep. Callers should not
hold a spinlock or other resources that do not allow a schedule.
+--
+[mandatory]
+ any symlink that might use page_follow_link_light/page_put_link() must
+ have inode_nohighmem(inode) called before anything might start playing with
+ its pagecache. No highmem pages should end up in the pagecache of such
+ symlinks. That includes any preseeding that might be done during symlink
+ creation. __page_symlink() will honour the mapping gfp flags, so once
+ you've done inode_nohighmem() it's safe to use, but if you allocate and
+ insert the page manually, make sure to use the right gfp flags.
+--
+[mandatory]
+ ->follow_link() is replaced with ->get_link(); same API, except that
+ * ->get_link() gets inode as a separate argument
+ * ->get_link() may be called in RCU mode - in that case NULL
+ dentry is passed
+--
+[mandatory]
+ ->get_link() gets struct delayed_call *done now, and should do
+ set_delayed_call() where it used to set *cookie.
+ ->put_link() is gone - just give the destructor to set_delayed_call()
+ in ->get_link().
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 402ab99e409f..843b045b4069 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -169,6 +169,9 @@ read the file /proc/PID/status:
VmLck: 0 kB
VmHWM: 476 kB
VmRSS: 476 kB
+ RssAnon: 352 kB
+ RssFile: 120 kB
+ RssShmem: 4 kB
VmData: 156 kB
VmStk: 88 kB
VmExe: 68 kB
@@ -231,14 +234,20 @@ Table 1-2: Contents of the status files (as of 4.1)
VmSize total program size
VmLck locked memory size
VmHWM peak resident set size ("high water mark")
- VmRSS size of memory portions
- VmData size of data, stack, and text segments
- VmStk size of data, stack, and text segments
+ VmRSS size of memory portions. It contains the three
+ following parts (VmRSS = RssAnon + RssFile + RssShmem)
+ RssAnon size of resident anonymous memory
+ RssFile size of resident file mappings
+ RssShmem size of resident shmem memory (includes SysV shm,
+ mapping of tmpfs and shared anonymous mappings)
+ VmData size of private data segments
+ VmStk size of stack segments
VmExe size of text segment
VmLib size of shared library code
VmPTE size of page table entries
VmPMD size of second level page tables
- VmSwap size of swap usage (the number of referred swapents)
+ VmSwap amount of swap used by anonymous private data
+ (shmem swap usage is not included)
HugetlbPages size of hugetlb memory portions
Threads number of threads
SigQ number of signals queued/max. number for queue
@@ -265,7 +274,8 @@ Table 1-3: Contents of the statm files (as of 2.6.8-rc3)
Field Content
size total program size (pages) (same as VmSize in status)
resident size of memory portions (pages) (same as VmRSS in status)
- shared number of pages that are shared (i.e. backed by a file)
+ shared number of pages that are shared (i.e. backed by a file, same
+ as RssFile+RssShmem in status)
trs number of pages that are 'code' (not including libs; broken,
includes data segment)
lrs number of pages of library (always 0 on 2.6)
@@ -346,7 +356,7 @@ address perms offset dev inode pathname
a7cb1000-a7cb2000 ---p 00000000 00:00 0
a7cb2000-a7eb2000 rw-p 00000000 00:00 0
a7eb2000-a7eb3000 ---p 00000000 00:00 0
-a7eb3000-a7ed5000 rw-p 00000000 00:00 0 [stack:1001]
+a7eb3000-a7ed5000 rw-p 00000000 00:00 0
a7ed5000-a8008000 r-xp 00000000 03:00 4222 /lib/libc.so.6
a8008000-a800a000 r--p 00133000 03:00 4222 /lib/libc.so.6
a800a000-a800b000 rw-p 00135000 03:00 4222 /lib/libc.so.6
@@ -378,7 +388,6 @@ is not associated with a file:
[heap] = the heap of the program
[stack] = the stack of the main process
- [stack:1001] = the stack of the thread with tid 1001
[vdso] = the "virtual dynamic shared object",
the kernel system call handler
@@ -386,10 +395,8 @@ is not associated with a file:
The /proc/PID/task/TID/maps is a view of the virtual memory from the viewpoint
of the individual tasks of a process. In this file you will see a mapping marked
-as [stack] if that task sees it as a stack. This is a key difference from the
-content of /proc/PID/maps, where you will see all mappings that are being used
-as stack by all of those tasks. Hence, for the example above, the task-level
-map, i.e. /proc/PID/task/TID/maps for thread 1001 will look like this:
+as [stack] if that task sees it as a stack. Hence, for the example above, the
+task-level map, i.e. /proc/PID/task/TID/maps for thread 1001 will look like this:
08048000-08049000 r-xp 00000000 03:00 8312 /opt/test
08049000-0804a000 rw-p 00001000 03:00 8312 /opt/test
@@ -459,7 +466,10 @@ and a page is modified, the file page is replaced by a private anonymous copy.
hugetlbfs page which is *not* counted in "RSS" or "PSS" field for historical
reasons. And these are not included in {Shared,Private}_{Clean,Dirty} field.
"Swap" shows how much would-be-anonymous memory is also used, but out on swap.
-"SwapPss" shows proportional swap share of this mapping.
+For shmem mappings, "Swap" includes also the size of the mapped (and not
+replaced by copy-on-write) part of the underlying shmem object out on swap.
+"SwapPss" shows proportional swap share of this mapping. Unlike "Swap", this
+does not take into account swapped out page of underlying shmem objects.
"Locked" indicates whether the mapping is locked in memory or not.
"VmFlags" field deserves a separate description. This member represents the kernel
@@ -807,7 +817,7 @@ by migrate-type and finishes with details on how many page blocks of each
type exist.
If min_free_kbytes has been tuned correctly (recommendations made by hugeadm
-from libhugetlbfs http://sourceforge.net/projects/libhugetlbfs/), one can
+from libhugetlbfs https://github.com/libhugetlbfs/libhugetlbfs/), one can
make an estimate of the likely number of huge pages that can be allocated
at a given point in time. All the "Movable" blocks should be allocatable
unless memory has been mlock()'d. Some of the Reclaimable blocks should
@@ -842,6 +852,7 @@ Dirty: 968 kB
Writeback: 0 kB
AnonPages: 861800 kB
Mapped: 280372 kB
+Shmem: 644 kB
Slab: 284364 kB
SReclaimable: 159856 kB
SUnreclaim: 124508 kB
@@ -898,6 +909,7 @@ MemAvailable: An estimate of how much memory is available for starting new
AnonPages: Non-file backed pages mapped into userspace page tables
AnonHugePages: Non-file backed huge pages mapped into userspace page tables
Mapped: files which have been mmaped, such as libraries
+ Shmem: Total memory used by shared memory (shmem) and tmpfs
Slab: in-kernel data structures cache
SReclaimable: Part of Slab, that might be reclaimed, such as caches
SUnreclaim: Part of Slab, that cannot be reclaimed on memory pressure
diff --git a/Documentation/filesystems/sharedsubtree.txt b/Documentation/filesystems/sharedsubtree.txt
index 32a173dd3158..e3f4c778eb98 100644
--- a/Documentation/filesystems/sharedsubtree.txt
+++ b/Documentation/filesystems/sharedsubtree.txt
@@ -664,7 +664,7 @@ replicas continue to be exactly same.
if one rbind mounts a tree within the same subtree 'n' times
the number of mounts created is an exponential function of 'n'.
Having unbindable mount can help prune the unneeded bind
- mounts. Here is a example.
+ mounts. Here is an example.
step 1:
let's say the root tree has just two directories with
diff --git a/Documentation/filesystems/tmpfs.txt b/Documentation/filesystems/tmpfs.txt
index 98ef55124158..d392e1505f17 100644
--- a/Documentation/filesystems/tmpfs.txt
+++ b/Documentation/filesystems/tmpfs.txt
@@ -17,10 +17,10 @@ RAM, where you have to create an ordinary filesystem on top. Ramdisks
cannot swap and you do not have the possibility to resize them.
Since tmpfs lives completely in the page cache and on swap, all tmpfs
-pages currently in memory will show up as cached. It will not show up
-as shared or something like that. Further on you can check the actual
-RAM+swap use of a tmpfs instance with df(1) and du(1).
-
+pages will be shown as "Shmem" in /proc/meminfo and "Shared" in
+free(1). Notice that these counters also include shared memory
+(shmem, see ipcs(1)). The most reliable way to get the count is
+using df(1) and du(1).
tmpfs has the following uses:
diff --git a/Documentation/filesystems/vfat.txt b/Documentation/filesystems/vfat.txt
index ce1126aceed8..223c32171dcc 100644
--- a/Documentation/filesystems/vfat.txt
+++ b/Documentation/filesystems/vfat.txt
@@ -180,6 +180,16 @@ dos1xfloppy -- If set, use a fallback default BIOS Parameter Block
<bool>: 0,1,yes,no,true,false
+LIMITATION
+---------------------------------------------------------------------
+* The fallocated region of file is discarded at umount/evict time
+ when using fallocate with FALLOC_FL_KEEP_SIZE.
+ So, User should assume that fallocated region can be discarded at
+ last close if there is memory pressure resulting in eviction of
+ the inode from the memory. As a result, for any dependency on
+ the fallocated region, user should make sure to recheck fallocate
+ after reopening the file.
+
TODO
----------------------------------------------------------------------
* Need to get rid of the raw scanning stuff. Instead, always use
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 8c6f07ad373a..b02a7d598258 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -350,8 +350,8 @@ struct inode_operations {
int (*rename2) (struct inode *, struct dentry *,
struct inode *, struct dentry *, unsigned int);
int (*readlink) (struct dentry *, char __user *,int);
- const char *(*follow_link) (struct dentry *, void **);
- void (*put_link) (struct inode *, void *);
+ const char *(*get_link) (struct dentry *, struct inode *,
+ struct delayed_call *);
int (*permission) (struct inode *, int);
int (*get_acl)(struct inode *, int);
int (*setattr) (struct dentry *, struct iattr *);
@@ -434,20 +434,19 @@ otherwise noted.
readlink: called by the readlink(2) system call. Only required if
you want to support reading symbolic links
- follow_link: called by the VFS to follow a symbolic link to the
+ get_link: called by the VFS to follow a symbolic link to the
inode it points to. Only required if you want to support
symbolic links. This method returns the symlink body
to traverse (and possibly resets the current position with
nd_jump_link()). If the body won't go away until the inode
is gone, nothing else is needed; if it needs to be otherwise
- pinned, the data needed to release whatever we'd grabbed
- is to be stored in void * variable passed by address to
- follow_link() instance.
-
- put_link: called by the VFS to release resources allocated by
- follow_link(). The cookie stored by follow_link() is passed
- to this method as the last parameter; only called when
- cookie isn't NULL.
+ pinned, arrange for its release by having get_link(..., ..., done)
+ do set_delayed_call(done, destructor, argument).
+ In that case destructor(argument) will be called once VFS is
+ done with the body you've returned.
+ May be called in RCU mode; that is indicated by NULL dentry
+ argument. If request can't be handled without leaving RCU mode,
+ have it return ERR_PTR(-ECHILD).
permission: called by the VFS to check for access rights on a POSIX-like
filesystem.