aboutsummaryrefslogtreecommitdiffstats
path: root/fs/dcache.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/dcache.c')
-rw-r--r--fs/dcache.c616
1 files changed, 432 insertions, 184 deletions
diff --git a/fs/dcache.c b/fs/dcache.c
index ca8e9cd60f87..41000305d716 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -37,6 +37,7 @@
#include <linux/rculist_bl.h>
#include <linux/prefetch.h>
#include <linux/ratelimit.h>
+#include <linux/list_lru.h>
#include "internal.h"
#include "mount.h"
@@ -48,7 +49,7 @@
* - the dcache hash table
* s_anon bl list spinlock protects:
* - the s_anon list (see __d_drop)
- * dcache_lru_lock protects:
+ * dentry->d_sb->s_dentry_lru_lock protects:
* - the dcache lru lists and counters
* d_lock protects:
* - d_flags
@@ -63,7 +64,7 @@
* Ordering:
* dentry->d_inode->i_lock
* dentry->d_lock
- * dcache_lru_lock
+ * dentry->d_sb->s_dentry_lru_lock
* dcache_hash_bucket lock
* s_anon lock
*
@@ -81,13 +82,41 @@
int sysctl_vfs_cache_pressure __read_mostly = 100;
EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
-static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lru_lock);
__cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
EXPORT_SYMBOL(rename_lock);
static struct kmem_cache *dentry_cache __read_mostly;
+/**
+ * read_seqbegin_or_lock - begin a sequence number check or locking block
+ * @lock: sequence lock
+ * @seq: in/out sequence number; its parity selects the mode
+ *
+ * First try it once optimistically without taking the lock. If that fails,
+ * take the lock. The sequence number is also used as a marker for deciding
+ * whether to be a lockless reader (even) or a locked exclusive reader (odd).
+ * N.B. seq must be initialized to an even number to begin with.
+ */
+static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq)
+{
+ if (!(*seq & 1)) /* Even: lockless pass, sample the sequence */
+ *seq = read_seqbegin(lock);
+ else /* Odd: caller asked for the locked pass */
+ read_seqlock_excl(lock);
+}
+
+static inline int need_seqretry(seqlock_t *lock, int seq)
+{
+ return !(seq & 1) && read_seqretry(lock, seq); /* odd seq (locked pass) never retries */
+}
+
+static inline void done_seqretry(seqlock_t *lock, int seq)
+{
+ if (seq & 1) /* odd seq: read_seqbegin_or_lock() took the lock, drop it */
+ read_sequnlock_excl(lock);
+}
+
/*
* This is the single most critical data structure when it comes
* to the dcache: the hashtable for lookups. Somebody should try
@@ -117,23 +146,47 @@ struct dentry_stat_t dentry_stat = {
.age_limit = 45,
};
-static DEFINE_PER_CPU(unsigned int, nr_dentry);
+static DEFINE_PER_CPU(long, nr_dentry);
+static DEFINE_PER_CPU(long, nr_dentry_unused);
#if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
-static int get_nr_dentry(void)
+
+/*
+ * Here we resort to our own counters instead of using generic per-cpu counters
+ * for consistency with what the vfs inode code does. We are expected to harvest
+ * better code and performance by having our own specialized counters.
+ *
+ * Please note that the loop is done over all possible CPUs, not over all online
+ * CPUs. The reason for this is that we don't want to play games with CPUs going
+ * on and off. If one of them goes off, we will just keep their counters.
+ *
+ * glommer: See cffbc8a for details, and if you ever intend to change this,
+ * please update all vfs counters to match.
+ */
+static long get_nr_dentry(void)
{
int i;
- int sum = 0;
+ long sum = 0; /* a negative total is reported as 0 below */
for_each_possible_cpu(i)
sum += per_cpu(nr_dentry, i);
return sum < 0 ? 0 : sum;
}
+static long get_nr_dentry_unused(void)
+{
+ int i;
+ long sum = 0; /* a negative total is reported as 0 below */
+ for_each_possible_cpu(i) /* all possible CPUs, same as get_nr_dentry() */
+ sum += per_cpu(nr_dentry_unused, i);
+ return sum < 0 ? 0 : sum;
+}
+
int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
size_t *lenp, loff_t *ppos)
{
dentry_stat.nr_dentry = get_nr_dentry();
- return proc_dointvec(table, write, buffer, lenp, ppos);
+ dentry_stat.nr_unused = get_nr_dentry_unused(); /* refreshed from the per-cpu counters */
+ return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); /* NOTE(review): presumably dentry_stat fields are long now - confirm */
}
#endif
@@ -304,52 +357,96 @@ static void dentry_unlink_inode(struct dentry * dentry)
}
/*
- * dentry_lru_(add|del|prune|move_tail) must be called with d_lock held.
+ * The DCACHE_LRU_LIST bit is set whenever the 'd_lru' entry
+ * is in use - which includes both the "real" per-superblock
+ * LRU list _and_ the DCACHE_SHRINK_LIST use.
+ *
+ * The DCACHE_SHRINK_LIST bit is set whenever the dentry is
+ * on the shrink list (ie not on the superblock LRU list).
+ *
+ * The per-cpu "nr_dentry_unused" counters are updated with
+ * the DCACHE_LRU_LIST bit.
+ *
+ * These helper functions make sure we always follow the
+ * rules. d_lock must be held by the caller.
*/
-static void dentry_lru_add(struct dentry *dentry)
+#define D_FLAG_VERIFY(dentry,x) WARN_ON_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x))
+static void d_lru_add(struct dentry *dentry)
{
- if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) {
- spin_lock(&dcache_lru_lock);
- dentry->d_flags |= DCACHE_LRU_LIST;
- list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
- dentry->d_sb->s_nr_dentry_unused++;
- dentry_stat.nr_unused++;
- spin_unlock(&dcache_lru_lock);
- }
+ D_FLAG_VERIFY(dentry, 0); /* expect neither LRU_LIST nor SHRINK_LIST set */
+ dentry->d_flags |= DCACHE_LRU_LIST;
+ this_cpu_inc(nr_dentry_unused); /* counter tracks the DCACHE_LRU_LIST bit */
+ WARN_ON_ONCE(!list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru)); /* warns when list_lru_add() returns 0 */
}
-static void __dentry_lru_del(struct dentry *dentry)
+static void d_lru_del(struct dentry *dentry)
{
+ D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); /* expect LRU_LIST set and SHRINK_LIST clear */
+ dentry->d_flags &= ~DCACHE_LRU_LIST;
+ this_cpu_dec(nr_dentry_unused); /* counter tracks the DCACHE_LRU_LIST bit */
+ WARN_ON_ONCE(!list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru)); /* warns when list_lru_del() returns 0 */
+}
+
+static void d_shrink_del(struct dentry *dentry)
+{
+ D_FLAG_VERIFY(dentry, DCACHE_SHRINK_LIST | DCACHE_LRU_LIST); /* expect both bits set */
list_del_init(&dentry->d_lru);
dentry->d_flags &= ~(DCACHE_SHRINK_LIST | DCACHE_LRU_LIST);
- dentry->d_sb->s_nr_dentry_unused--;
- dentry_stat.nr_unused--;
+ this_cpu_dec(nr_dentry_unused); /* LRU_LIST bit cleared above, so drop the count */
+}
+
+static void d_shrink_add(struct dentry *dentry, struct list_head *list)
+{
+ D_FLAG_VERIFY(dentry, 0); /* expect neither LRU_LIST nor SHRINK_LIST set */
+ list_add(&dentry->d_lru, list); /* plain list op on the caller-supplied list */
+ dentry->d_flags |= DCACHE_SHRINK_LIST | DCACHE_LRU_LIST;
+ this_cpu_inc(nr_dentry_unused); /* LRU_LIST bit set above, so bump the count */
}
/*
- * Remove a dentry with references from the LRU.
+ * These can only be called with the LRU list lock held, ie during the
+ * callback for freeing the LRU list. "isolate" removes it from the
+ * LRU lists entirely, while shrink_move moves it to the indicated
+ * private list.
*/
-static void dentry_lru_del(struct dentry *dentry)
+static void d_lru_isolate(struct dentry *dentry)
{
- if (!list_empty(&dentry->d_lru)) {
- spin_lock(&dcache_lru_lock);
- __dentry_lru_del(dentry);
- spin_unlock(&dcache_lru_lock);
- }
+ D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); /* expect LRU_LIST set and SHRINK_LIST clear */
+ dentry->d_flags &= ~DCACHE_LRU_LIST;
+ this_cpu_dec(nr_dentry_unused); /* counter tracks the DCACHE_LRU_LIST bit */
+ list_del_init(&dentry->d_lru); /* direct unlink, unlike d_lru_del() which calls list_lru_del() */
}
-static void dentry_lru_move_list(struct dentry *dentry, struct list_head *list)
+static void d_lru_shrink_move(struct dentry *dentry, struct list_head *list)
{
- spin_lock(&dcache_lru_lock);
- if (list_empty(&dentry->d_lru)) {
- dentry->d_flags |= DCACHE_LRU_LIST;
- list_add_tail(&dentry->d_lru, list);
- dentry->d_sb->s_nr_dentry_unused++;
- dentry_stat.nr_unused++;
- } else {
- list_move_tail(&dentry->d_lru, list);
+ D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); /* expect LRU_LIST set and SHRINK_LIST clear */
+ dentry->d_flags |= DCACHE_SHRINK_LIST; /* LRU_LIST stays set: nr_dentry_unused is not touched */
+ list_move_tail(&dentry->d_lru, list);
+}
+
+/*
+ * dentry_lru_(add|del) must be called with d_lock held.
+ */
+static void dentry_lru_add(struct dentry *dentry)
+{
+ if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) /* skip if d_lru is already in use */
+ d_lru_add(dentry);
+}
+
+/*
+ * Remove a dentry with references from the LRU.
+ *
+ * If we are on the shrink list, then we can get to try_prune_one_dentry() and
+ * lose our last reference through the parent walk. In this case, we need to
+ * remove ourselves from the shrink list, not the LRU.
+ */
+static void dentry_lru_del(struct dentry *dentry)
+{
+ if (dentry->d_flags & DCACHE_LRU_LIST) { /* d_lru is in use: LRU or a shrink list */
+ if (dentry->d_flags & DCACHE_SHRINK_LIST)
+ return d_shrink_del(dentry); /* shrink list: direct list_del_init() */
+ d_lru_del(dentry); /* superblock LRU: goes through list_lru_del() */
}
- spin_unlock(&dcache_lru_lock);
}
/**
@@ -445,7 +542,8 @@ EXPORT_SYMBOL(d_drop);
* If ref is non-zero, then decrement the refcount too.
* Returns dentry requiring refcount drop, or NULL if we're done.
*/
-static inline struct dentry *dentry_kill(struct dentry *dentry)
+static inline struct dentry *
+dentry_kill(struct dentry *dentry, int unlock_on_failure)
__releases(dentry->d_lock)
{
struct inode *inode;
@@ -454,8 +552,10 @@ static inline struct dentry *dentry_kill(struct dentry *dentry)
inode = dentry->d_inode;
if (inode && !spin_trylock(&inode->i_lock)) {
relock:
- spin_unlock(&dentry->d_lock);
- cpu_relax();
+ if (unlock_on_failure) {
+ spin_unlock(&dentry->d_lock);
+ cpu_relax();
+ }
return dentry; /* try again with same dentry */
}
if (IS_ROOT(dentry))
@@ -538,7 +638,7 @@ repeat:
return;
kill_it:
- dentry = dentry_kill(dentry);
+ dentry = dentry_kill(dentry, 1);
if (dentry)
goto repeat;
}
@@ -758,12 +858,12 @@ EXPORT_SYMBOL(d_prune_aliases);
*
* This may fail if locks cannot be acquired no problem, just try again.
*/
-static void try_prune_one_dentry(struct dentry *dentry)
+static struct dentry * try_prune_one_dentry(struct dentry *dentry)
__releases(dentry->d_lock)
{
struct dentry *parent;
- parent = dentry_kill(dentry);
+ parent = dentry_kill(dentry, 0);
/*
* If dentry_kill returns NULL, we have nothing more to do.
* if it returns the same dentry, trylocks failed. In either
@@ -775,17 +875,18 @@ static void try_prune_one_dentry(struct dentry *dentry)
* fragmentation.
*/
if (!parent)
- return;
+ return NULL;
if (parent == dentry)
- return;
+ return dentry;
/* Prune ancestors. */
dentry = parent;
while (dentry) {
if (lockref_put_or_lock(&dentry->d_lockref))
- return;
- dentry = dentry_kill(dentry);
+ return NULL;
+ dentry = dentry_kill(dentry, 1);
}
+ return NULL;
}
static void shrink_dentry_list(struct list_head *list)
@@ -797,6 +898,12 @@ static void shrink_dentry_list(struct list_head *list)
dentry = list_entry_rcu(list->prev, struct dentry, d_lru);
if (&dentry->d_lru == list)
break; /* empty */
+
+ /*
+ * Get the dentry lock, and re-verify that the dentry is
+ * still on the shrinking list. If it is, we know that
+ * DCACHE_SHRINK_LIST and DCACHE_LRU_LIST are set.
+ */
spin_lock(&dentry->d_lock);
if (dentry != list_entry(list->prev, struct dentry, d_lru)) {
spin_unlock(&dentry->d_lock);
@@ -804,76 +911,146 @@ static void shrink_dentry_list(struct list_head *list)
}
/*
+ * The dispose list is isolated and dentries are not accounted
+ * to the LRU here, so we can simply remove it from the list
+ * here regardless of whether it is referenced or not.
+ */
+ d_shrink_del(dentry);
+
+ /*
* We found an inuse dentry which was not removed from
- * the LRU because of laziness during lookup. Do not free
- * it - just keep it off the LRU list.
+ * the LRU because of laziness during lookup. Do not free it.
*/
if (dentry->d_lockref.count) {
- dentry_lru_del(dentry);
spin_unlock(&dentry->d_lock);
continue;
}
-
rcu_read_unlock();
- try_prune_one_dentry(dentry);
+ /*
+ * If 'try_prune_one_dentry()' returns a dentry, it will
+ * be the same one we passed in, and d_lock will
+ * have been held the whole time, so it will not
+ * have been added to any other lists. We failed
+ * to get the inode lock.
+ *
+ * We just add it back to the shrink list.
+ */
+ dentry = try_prune_one_dentry(dentry);
rcu_read_lock();
+ if (dentry) {
+ d_shrink_add(dentry, list);
+ spin_unlock(&dentry->d_lock);
+ }
}
rcu_read_unlock();
}
+static enum lru_status
+dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
+{
+ struct list_head *freeable = arg; /* list that collects dentries to be freed */
+ struct dentry *dentry = container_of(item, struct dentry, d_lru);
+
+
+ /*
+ * We are inverting the lru lock/dentry->d_lock order here,
+ * so use a trylock. If we fail to get the lock, just skip
+ * it.
+ */
+ if (!spin_trylock(&dentry->d_lock))
+ return LRU_SKIP;
+
+ /*
+ * Referenced dentries are still in use. If they have active
+ * counts, just remove them from the LRU. Otherwise give them
+ * another pass through the LRU.
+ */
+ if (dentry->d_lockref.count) {
+ d_lru_isolate(dentry); /* off the LRU, but not queued on 'freeable' */
+ spin_unlock(&dentry->d_lock);
+ return LRU_REMOVED;
+ }
+
+ if (dentry->d_flags & DCACHE_REFERENCED) {
+ dentry->d_flags &= ~DCACHE_REFERENCED;
+ spin_unlock(&dentry->d_lock);
+
+ /*
+ * The list move itself will be made by the common LRU code. At
+ * this point, we've dropped the dentry->d_lock but keep the
+ * lru lock. This is safe to do, since every list movement is
+ * protected by the lru lock even if both locks are held.
+ *
+ * This is guaranteed by the fact that all LRU management
+ * functions are intermediated by the LRU API calls like
+ * list_lru_add and list_lru_del. List movement in this file
+ * only ever occurs through these functions or through callbacks
+ * like this one, that are called from the LRU API.
+ *
+ * The only exceptions to this are functions like
+ * shrink_dentry_list, and code that first checks for the
+ * DCACHE_SHRINK_LIST flag. Those are guaranteed to be
+ * operating only with stack provided lists after they are
+ * properly isolated from the main list. It is thus always a
+ * local access.
+ */
+ return LRU_ROTATE;
+ }
+
+ d_lru_shrink_move(dentry, freeable); /* unreferenced and unused: queue it for freeing */
+ spin_unlock(&dentry->d_lock);
+
+ return LRU_REMOVED;
+}
+
/**
* prune_dcache_sb - shrink the dcache
* @sb: superblock
- * @count: number of entries to try to free
+ * @nr_to_scan: number of entries to try to free
+ * @nid: which node to scan for freeable entities
*
- * Attempt to shrink the superblock dcache LRU by @count entries. This is
+ * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is
* done when we need more memory an called from the superblock shrinker
* function.
*
* This function may fail to free any resources if all the dentries are in
* use.
*/
-void prune_dcache_sb(struct super_block *sb, int count)
+long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan,
+ int nid)
{
- struct dentry *dentry;
- LIST_HEAD(referenced);
- LIST_HEAD(tmp);
+ LIST_HEAD(dispose); /* private list filled by dentry_lru_isolate() */
+ long freed;
-relock:
- spin_lock(&dcache_lru_lock);
- while (!list_empty(&sb->s_dentry_lru)) {
- dentry = list_entry(sb->s_dentry_lru.prev,
- struct dentry, d_lru);
- BUG_ON(dentry->d_sb != sb);
-
- if (!spin_trylock(&dentry->d_lock)) {
- spin_unlock(&dcache_lru_lock);
- cpu_relax();
- goto relock;
- }
+ freed = list_lru_walk_node(&sb->s_dentry_lru, nid, dentry_lru_isolate,
+ &dispose, &nr_to_scan);
+ shrink_dentry_list(&dispose); /* the isolated dentries are pruned here */
+ return freed; /* value reported by list_lru_walk_node() */
+}
- if (dentry->d_flags & DCACHE_REFERENCED) {
- dentry->d_flags &= ~DCACHE_REFERENCED;
- list_move(&dentry->d_lru, &referenced);
- spin_unlock(&dentry->d_lock);
- } else {
- list_move_tail(&dentry->d_lru, &tmp);
- dentry->d_flags |= DCACHE_SHRINK_LIST;
- spin_unlock(&dentry->d_lock);
- if (!--count)
- break;
- }
- cond_resched_lock(&dcache_lru_lock);
- }
- if (!list_empty(&referenced))
- list_splice(&referenced, &sb->s_dentry_lru);
- spin_unlock(&dcache_lru_lock);
+static enum lru_status dentry_lru_isolate_shrink(struct list_head *item,
+ spinlock_t *lru_lock, void *arg)
+{
+ struct list_head *freeable = arg; /* list that collects dentries to be freed */
+ struct dentry *dentry = container_of(item, struct dentry, d_lru);
- shrink_dentry_list(&tmp);
+ /*
+ * We are inverting the lru lock/dentry->d_lock order here,
+ * so use a trylock. If we fail to get the lock, just skip
+ * it.
+ */
+ if (!spin_trylock(&dentry->d_lock))
+ return LRU_SKIP;
+
+ d_lru_shrink_move(dentry, freeable); /* no refcount or DCACHE_REFERENCED check, unlike dentry_lru_isolate() */
+ spin_unlock(&dentry->d_lock);
+
+ return LRU_REMOVED;
}
+
/**
* shrink_dcache_sb - shrink dcache for a superblock
* @sb: superblock
@@ -883,16 +1060,17 @@ relock:
*/
void shrink_dcache_sb(struct super_block *sb)
{
- LIST_HEAD(tmp);
+ long freed;
- spin_lock(&dcache_lru_lock);
- while (!list_empty(&sb->s_dentry_lru)) {
- list_splice_init(&sb->s_dentry_lru, &tmp);
- spin_unlock(&dcache_lru_lock);
- shrink_dentry_list(&tmp);
- spin_lock(&dcache_lru_lock);
- }
- spin_unlock(&dcache_lru_lock);
+ do {
+ LIST_HEAD(dispose); /* fresh private list for each pass */
+
+ freed = list_lru_walk(&sb->s_dentry_lru,
+ dentry_lru_isolate_shrink, &dispose, UINT_MAX);
+
+ /* No bulk nr_dentry_unused fixup: d_shrink_del() already drops it per dentry. */
+ shrink_dentry_list(&dispose);
+ } while (freed > 0); /* repeat until a walk isolates nothing */
}
EXPORT_SYMBOL(shrink_dcache_sb);
@@ -1012,7 +1190,7 @@ void shrink_dcache_for_umount(struct super_block *sb)
* the parenthood after dropping the lock and check
* that the sequence number still matches.
*/
-static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq)
+static struct dentry *try_to_ascend(struct dentry *old, unsigned seq)
{
struct dentry *new = old->d_parent;
@@ -1026,7 +1204,7 @@ static struct dentry *try_to_ascend(struct dentry *old, int locked, unsigned seq
*/
if (new != old->d_parent ||
(old->d_flags & DCACHE_DENTRY_KILLED) ||
- (!locked && read_seqretry(&rename_lock, seq))) {
+ need_seqretry(&rename_lock, seq)) {
spin_unlock(&new->d_lock);
new = NULL;
}
@@ -1063,13 +1241,12 @@ static void d_walk(struct dentry *parent, void *data,
{
struct dentry *this_parent;
struct list_head *next;
- unsigned seq;
- int locked = 0;
+ unsigned seq = 0;
enum d_walk_ret ret;
bool retry = true;
- seq = read_seqbegin(&rename_lock);
again:
+ read_seqbegin_or_lock(&rename_lock, &seq);
this_parent = parent;
spin_lock(&this_parent->d_lock);
@@ -1123,13 +1300,13 @@ resume:
*/
if (this_parent != parent) {
struct dentry *child = this_parent;
- this_parent = try_to_ascend(this_parent, locked, seq);
+ this_parent = try_to_ascend(this_parent, seq);
if (!this_parent)
goto rename_retry;
next = child->d_u.d_child.next;
goto resume;
}
- if (!locked && read_seqretry(&rename_lock, seq)) {
+ if (need_seqretry(&rename_lock, seq)) {
spin_unlock(&this_parent->d_lock);
goto rename_retry;
}
@@ -1138,17 +1315,13 @@ resume:
out_unlock:
spin_unlock(&this_parent->d_lock);
- if (locked)
- write_sequnlock(&rename_lock);
+ done_seqretry(&rename_lock, seq);
return;
rename_retry:
if (!retry)
return;
- if (locked)
- goto again;
- locked = 1;
- write_seqlock(&rename_lock);
+ seq = 1;
goto again;
}
@@ -1259,8 +1432,13 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
if (dentry->d_lockref.count) {
dentry_lru_del(dentry);
} else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
- dentry_lru_move_list(dentry, &data->dispose);
- dentry->d_flags |= DCACHE_SHRINK_LIST;
+ /*
+ * We can't use d_lru_shrink_move() because we
+ * need to get the global LRU lock and do the
+ * LRU accounting.
+ */
+ d_lru_del(dentry);
+ d_shrink_add(dentry, &data->dispose);
data->found++;
ret = D_WALK_NORETRY;
}
@@ -2647,9 +2825,39 @@ static int prepend(char **buffer, int *buflen, const char *str, int namelen)
return 0;
}
+/**
+ * prepend_name - prepend a pathname in front of current buffer pointer
+ * @buffer: buffer pointer
+ * @buflen: space left in the buffer, reduced by name length + 1 on success
+ * @name: name string and length qstr structure
+ *
+ * With RCU path tracing, it may race with d_move(). Use ACCESS_ONCE() to
+ * make sure that either the old or the new name pointer and length are
+ * fetched. However, there may be mismatch between length and pointer.
+ * The length cannot be trusted, we need to copy it byte-by-byte until
+ * the length is reached or a null byte is found. It also prepends "/" at
+ * the beginning of the name. The sequence number check at the caller will
+ * retry it again when a d_move() does happen. So any garbage in the buffer
+ * due to mismatched pointer and length will be discarded.
+ */
static int prepend_name(char **buffer, int *buflen, struct qstr *name)
{
- return prepend(buffer, buflen, name->name, name->len);
+ const char *dname = ACCESS_ONCE(name->name);
+ u32 dlen = ACCESS_ONCE(name->len);
+ char *p;
+
+ if (*buflen < 0 || (u32)*buflen < dlen + 1) /* int vs u32: a negative *buflen must fail, not wrap */
+ return -ENAMETOOLONG;
+ *buflen -= dlen + 1;
+ p = *buffer -= dlen + 1;
+ *p++ = '/';
+ while (dlen--) {
+ char c = *dname++;
+ if (!c) /* name shorter than the fetched length: stop early */
+ break;
+ *p++ = c;
+ }
+ return 0;
}
/**
@@ -2659,7 +2867,15 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
* @buffer: pointer to the end of the buffer
* @buflen: pointer to buffer length
*
- * Caller holds the rename_lock.
+ * The function will first try to write out the pathname without taking any
+ * lock other than the RCU read lock to make sure that dentries won't go away.
+ * It only checks the sequence number of the global rename_lock as any change
+ * in the dentry's d_seq will be preceded by changes in the rename_lock
+ * sequence number. If the sequence number had been changed, it will restart
+ * the whole pathname back-tracing sequence again by taking the rename_lock.
+ * In this case, there is no need to take the RCU read lock as the recursive
+ * parent pointer references will keep the dentry chain alive as long as no
+ * rename operation is performed.
*/
static int prepend_path(const struct path *path,
const struct path *root,
@@ -2668,54 +2884,66 @@ static int prepend_path(const struct path *path,
struct dentry *dentry = path->dentry;
struct vfsmount *vfsmnt = path->mnt;
struct mount *mnt = real_mount(vfsmnt);
- bool slash = false;
int error = 0;
+ int seq = 0; /* int, to match read_seqbegin_or_lock(seqlock_t *, int *) */
+ char *bptr;
+ int blen;
+ rcu_read_lock();
+restart:
+ /*
+ * A retry rebuilds the path into a reset buffer, so the walk must
+ * also start over from the original dentry/mount; otherwise the
+ * second pass resumes wherever the first one stopped.
+ */
+ dentry = path->dentry;
+ vfsmnt = path->mnt;
+ mnt = real_mount(vfsmnt);
+ error = 0;
+ bptr = *buffer;
+ blen = *buflen;
+ read_seqbegin_or_lock(&rename_lock, &seq);
while (dentry != root->dentry || vfsmnt != root->mnt) {
struct dentry * parent;
if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) {
/* Global root? */
- if (!mnt_has_parent(mnt))
- goto global_root;
- dentry = mnt->mnt_mountpoint;
- mnt = mnt->mnt_parent;
- vfsmnt = &mnt->mnt;
- continue;
+ if (mnt_has_parent(mnt)) {
+ dentry = mnt->mnt_mountpoint;
+ mnt = mnt->mnt_parent;
+ vfsmnt = &mnt->mnt;
+ continue;
+ }
+ /*
+ * Filesystems needing to implement special "root names"
+ * should do so with ->d_dname()
+ */
+ if (IS_ROOT(dentry) &&
+ (dentry->d_name.len != 1 ||
+ dentry->d_name.name[0] != '/')) {
+ WARN(1, "Root dentry has weird name <%.*s>\n",
+ (int) dentry->d_name.len,
+ dentry->d_name.name);
+ }
+ if (!error)
+ error = is_mounted(vfsmnt) ? 1 : 2;
+ break;
}
parent = dentry->d_parent;
prefetch(parent);
- spin_lock(&dentry->d_lock);
- error = prepend_name(buffer, buflen, &dentry->d_name);
- spin_unlock(&dentry->d_lock);
- if (!error)
- error = prepend(buffer, buflen, "/", 1);
+ error = prepend_name(&bptr, &blen, &dentry->d_name);
if (error)
break;
- slash = true;
dentry = parent;
}
+ if (!(seq & 1)) /* only the lockless pass still holds the RCU read lock */
+ rcu_read_unlock();
+ if (need_seqretry(&rename_lock, seq)) {
+ seq = 1; /* odd: the next pass takes rename_lock */
+ goto restart;
+ }
+ done_seqretry(&rename_lock, seq);
- if (!error && !slash)
- error = prepend(buffer, buflen, "/", 1);
-
- return error;
-
-global_root:
- /*
- * Filesystems needing to implement special "root names"
- * should do so with ->d_dname()
- */
- if (IS_ROOT(dentry) &&
- (dentry->d_name.len != 1 || dentry->d_name.name[0] != '/')) {
- WARN(1, "Root dentry has weird name <%.*s>\n",
- (int) dentry->d_name.len, dentry->d_name.name);
- }
- if (!slash)
- error = prepend(buffer, buflen, "/", 1);
- if (!error)
- error = is_mounted(vfsmnt) ? 1 : 2;
+ if (error >= 0 && bptr == *buffer) { /* nothing was prepended: emit a lone "/" */
+ if (--blen < 0)
+ error = -ENAMETOOLONG;
+ else
+ *--bptr = '/';
+ }
+ *buffer = bptr;
+ *buflen = blen;
return error;
}
@@ -2744,9 +2972,7 @@ char *__d_path(const struct path *path,
prepend(&res, &buflen, "\0", 1);
br_read_lock(&vfsmount_lock);
- write_seqlock(&rename_lock);
error = prepend_path(path, root, &res, &buflen);
- write_sequnlock(&rename_lock);
br_read_unlock(&vfsmount_lock);
if (error < 0)
@@ -2765,9 +2991,7 @@ char *d_absolute_path(const struct path *path,
prepend(&res, &buflen, "\0", 1);
br_read_lock(&vfsmount_lock);
- write_seqlock(&rename_lock);
error = prepend_path(path, &root, &res, &buflen);
- write_sequnlock(&rename_lock);
br_read_unlock(&vfsmount_lock);
if (error > 1)
@@ -2799,6 +3023,16 @@ static int prepend_unreachable(char **buffer, int *buflen)
return prepend(buffer, buflen, "(unreachable)", 13);
}
+static void get_fs_root_rcu(struct fs_struct *fs, struct path *root)
+{
+ unsigned seq;
+
+ do {
+ seq = read_seqcount_begin(&fs->seq);
+ *root = fs->root; /* plain struct copy; no reference is taken here */
+ } while (read_seqcount_retry(&fs->seq, seq)); /* re-read if fs->seq changed meanwhile */
+}
+
/**
* d_path - return the path of a dentry
* @path: path to report
@@ -2831,15 +3065,15 @@ char *d_path(const struct path *path, char *buf, int buflen)
if (path->dentry->d_op && path->dentry->d_op->d_dname)
return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
- get_fs_root(current->fs, &root);
+ rcu_read_lock();
+ get_fs_root_rcu(current->fs, &root);
br_read_lock(&vfsmount_lock);
- write_seqlock(&rename_lock);
error = path_with_deleted(path, &root, &res, &buflen);
- write_sequnlock(&rename_lock);
br_read_unlock(&vfsmount_lock);
+ rcu_read_unlock();
+
if (error < 0)
res = ERR_PTR(error);
- path_put(&root);
return res;
}
EXPORT_SYMBOL(d_path);
@@ -2870,10 +3104,10 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen)
char *end = buffer + buflen;
/* these dentries are never renamed, so d_lock is not needed */
if (prepend(&end, &buflen, " (deleted)", 11) ||
- prepend_name(&end, &buflen, &dentry->d_name) ||
+ prepend(&end, &buflen, dentry->d_name.name, dentry->d_name.len) ||
prepend(&end, &buflen, "/", 1))
end = ERR_PTR(-ENAMETOOLONG);
- return end;
+ return end;
}
/*
@@ -2881,30 +3115,42 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen)
*/
static char *__dentry_path(struct dentry *dentry, char *buf, int buflen)
{
- char *end = buf + buflen;
- char *retval;
+ struct dentry *start = dentry; /* walk origin, restored on every restart */
+ char *end, *retval;
+ int len, seq = 0;
+ int error = 0;
- prepend(&end, &buflen, "\0", 1);
- if (buflen < 1)
- goto Elong;
+ /*
+ * Need room for at least "/" plus the trailing NUL. Reject short
+ * buffers before rcu_read_lock() so that the Elong exit never
+ * returns with the RCU read lock still held.
+ */
+ if (buflen < 2)
+ goto Elong;
+ rcu_read_lock();
+restart:
+ /* A retry rebuilds the path from scratch: reset walk position and status. */
+ dentry = start;
+ error = 0;
+ end = buf + buflen;
+ len = buflen;
+ prepend(&end, &len, "\0", 1);
/* Get '/' right */
retval = end-1;
*retval = '/';
-
+ read_seqbegin_or_lock(&rename_lock, &seq);
while (!IS_ROOT(dentry)) {
struct dentry *parent = dentry->d_parent;
- int error;
prefetch(parent);
- spin_lock(&dentry->d_lock);
- error = prepend_name(&end, &buflen, &dentry->d_name);
- spin_unlock(&dentry->d_lock);
- if (error != 0 || prepend(&end, &buflen, "/", 1) != 0)
- goto Elong;
+ /* Uses the function-scope 'error' so the check after the loop sees it. */
+ error = prepend_name(&end, &len, &dentry->d_name);
+ if (error)
+ break;
retval = end;
dentry = parent;
}
+ if (!(seq & 1)) /* only the lockless pass still holds the RCU read lock */
+ rcu_read_unlock();
+ if (need_seqretry(&rename_lock, seq)) {
+ seq = 1; /* odd: the next pass takes rename_lock */
+ goto restart;
+ }
+ done_seqretry(&rename_lock, seq);
+ if (error)
+ goto Elong;
return retval;
Elong:
return ERR_PTR(-ENAMETOOLONG);
@@ -2912,13 +3158,7 @@ Elong:
char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen)
{
- char *retval;
-
- write_seqlock(&rename_lock);
- retval = __dentry_path(dentry, buf, buflen);
- write_sequnlock(&rename_lock);
-
- return retval;
+ return __dentry_path(dentry, buf, buflen); /* __dentry_path() does its own rename_lock handling */
}
EXPORT_SYMBOL(dentry_path_raw);
@@ -2927,7 +3167,6 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
char *p = NULL;
char *retval;
- write_seqlock(&rename_lock);
if (d_unlinked(dentry)) {
p = buf + buflen;
if (prepend(&p, &buflen, "//deleted", 10) != 0)
@@ -2935,7 +3174,6 @@ char *dentry_path(struct dentry *dentry, char *buf, int buflen)
buflen++;
}
retval = __dentry_path(dentry, buf, buflen);
- write_sequnlock(&rename_lock);
if (!IS_ERR(retval) && p)
*p = '/'; /* restore '/' overriden with '\0' */
return retval;
@@ -2943,6 +3181,18 @@ Elong:
return ERR_PTR(-ENAMETOOLONG);
}
+static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root,
+ struct path *pwd)
+{
+ unsigned seq;
+
+ do {
+ seq = read_seqcount_begin(&fs->seq);
+ *root = fs->root; /* plain struct copies; no references are taken here */
+ *pwd = fs->pwd;
+ } while (read_seqcount_retry(&fs->seq, seq)); /* re-read both if fs->seq changed meanwhile */
+}
+
/*
* NOTE! The user-level library version returns a
* character pointer. The kernel system call just
@@ -2965,25 +3215,25 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
{
int error;
struct path pwd, root;
- char *page = (char *) __get_free_page(GFP_USER);
+ char *page = __getname();
if (!page)
return -ENOMEM;
- get_fs_root_and_pwd(current->fs, &root, &pwd);
+ rcu_read_lock();
+ get_fs_root_and_pwd_rcu(current->fs, &root, &pwd);
error = -ENOENT;
br_read_lock(&vfsmount_lock);
- write_seqlock(&rename_lock);
if (!d_unlinked(pwd.dentry)) {
unsigned long len;
- char *cwd = page + PAGE_SIZE;
- int buflen = PAGE_SIZE;
+ char *cwd = page + PATH_MAX;
+ int buflen = PATH_MAX;
prepend(&cwd, &buflen, "\0", 1);
error = prepend_path(&pwd, &root, &cwd, &buflen);
- write_sequnlock(&rename_lock);
br_read_unlock(&vfsmount_lock);
+ rcu_read_unlock();
if (error < 0)
goto out;
@@ -2996,21 +3246,19 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
}
error = -ERANGE;
- len = PAGE_SIZE + page - cwd;
+ len = PATH_MAX + page - cwd;
if (len <= size) {
error = len;
if (copy_to_user(buf, cwd, len))
error = -EFAULT;
}
} else {
- write_sequnlock(&rename_lock);
br_read_unlock(&vfsmount_lock);
+ rcu_read_unlock();
}
out:
- path_put(&pwd);
- path_put(&root);
- free_page((unsigned long) page);
+ __putname(page);
return error;
}