From 17e6afc75ad0150d265a86a8f155b2871f9c07fe Mon Sep 17 00:00:00 2001
From: Dave Kleikamp
Date: Tue, 9 Jan 2007 08:57:34 -0600
Subject: JFS: Avoid BUG() on a damaged file system

On Mon, 2006-12-18 at 19:51 +0100, Eric Sesterhenn wrote:
> hi,
>
> while playing around with fsfuzzer, i got the following oops with jfs:
>
> [ 851.804875] BUG at fs/jfs/jfs_xtree.c:760
> assert(!BT_STACK_FULL(btstack))
> [ 851.805179] ------------[ cut here ]------------
> [ 851.805238] kernel BUG at fs/jfs/jfs_xtree.c:760!

JFS should mark the superblock dirty and return an error rather than
calling BUG().

Signed-off-by: Dave Kleikamp
---
 fs/jfs/jfs_xtree.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/fs/jfs/jfs_xtree.c b/fs/jfs/jfs_xtree.c
index e98eb03e5310..acc97c46d8a4 100644
--- a/fs/jfs/jfs_xtree.c
+++ b/fs/jfs/jfs_xtree.c
@@ -757,6 +757,11 @@ static int xtSearch(struct inode *ip, s64 xoff, s64 *nextp,
 		nsplit = 0;
 
 		/* push (bn, index) of the parent page/entry */
+		if (BT_STACK_FULL(btstack)) {
+			jfs_error(ip->i_sb, "stack overrun in xtSearch!");
+			XT_PUTPAGE(mp);
+			return -EIO;
+		}
 		BT_PUSH(btstack, bn, index);
 
 		/* get the child page block number */
@@ -3915,6 +3920,11 @@ s64 xtTruncate(tid_t tid, struct inode *ip, s64 newsize, int flag)
 	 */
       getChild:
 	/* save current parent entry for the child page */
+	if (BT_STACK_FULL(&btstack)) {
+		jfs_error(ip->i_sb, "stack overrun in xtTruncate!");
+		XT_PUTPAGE(mp);
+		return -EIO;
+	}
 	BT_PUSH(&btstack, bn, index);
 
 	/* get child page */
@@ -4112,6 +4122,11 @@ s64 xtTruncate_pmap(tid_t tid, struct inode *ip, s64 committed_size)
 	 */
       getChild:
 	/* save current parent entry for the child page */
+	if (BT_STACK_FULL(&btstack)) {
+		jfs_error(ip->i_sb, "stack overrun in xtTruncate_pmap!");
+		XT_PUTPAGE(mp);
+		return -EIO;
+	}
 	BT_PUSH(&btstack, bn, index);
 
 	/* get child page */
--
cgit v1.2.3-59-g8ed1b


From 82d5b9a7c63054a9a2cd838ffd177697f86e7e34 Mon Sep 17 00:00:00 2001
From: Dave Kleikamp
Date: Tue, 9 Jan 2007 14:14:48 -0600
Subject: JFS: Add lockdep annotations

Yeah, it's about time.

Signed-off-by: Dave Kleikamp
---
 fs/jfs/inode.c      |  6 +++---
 fs/jfs/jfs_dmap.c   | 16 ++++++++--------
 fs/jfs/jfs_imap.c   | 16 ++++++++--------
 fs/jfs/jfs_incore.h | 29 +++++++++++++++++++++++++++--
 fs/jfs/namei.c      | 48 ++++++++++++++++++++++++++++--------------------
 5 files changed, 74 insertions(+), 41 deletions(-)

diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index f5719117edfe..e285022f006c 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -182,9 +182,9 @@ int jfs_get_block(struct inode *ip, sector_t lblock,
 	 * Take appropriate lock on inode
 	 */
 	if (create)
-		IWRITE_LOCK(ip);
+		IWRITE_LOCK(ip, RDWRLOCK_NORMAL);
 	else
-		IREAD_LOCK(ip);
+		IREAD_LOCK(ip, RDWRLOCK_NORMAL);
 
 	if (((lblock64 << ip->i_sb->s_blocksize_bits) < ip->i_size) &&
 	    (!xtLookup(ip, lblock64, xlen, &xflag, &xaddr, &xlen, 0)) &&
@@ -359,7 +359,7 @@ void jfs_truncate(struct inode *ip)
 
 	nobh_truncate_page(ip->i_mapping, ip->i_size);
 
-	IWRITE_LOCK(ip);
+	IWRITE_LOCK(ip, RDWRLOCK_NORMAL);
 	jfs_truncate_nolock(ip, ip->i_size);
 	IWRITE_UNLOCK(ip);
 }
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 23546c8fd48b..82b0544bd76d 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -337,7 +337,7 @@ int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
 	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
 	struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
 
-	IREAD_LOCK(ipbmap);
+	IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
 
 	/* block to be freed better be within the mapsize.
	 */
 	if (unlikely((blkno == 0) || (blkno + nblocks > bmp->db_mapsize))) {
@@ -733,7 +733,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
 	 * allocation group size, try to allocate anywhere.
 	 */
 	if (l2nb > bmp->db_agl2size) {
-		IWRITE_LOCK(ipbmap);
+		IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP);
 
 		rc = dbAllocAny(bmp, nblocks, l2nb, results);
 
@@ -774,7 +774,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
 	 * the hint using a tiered strategy.
 	 */
 	if (nblocks <= BPERDMAP) {
-		IREAD_LOCK(ipbmap);
+		IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
 
 		/* get the buffer for the dmap containing the hint.
 		 */
@@ -844,7 +844,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
 	/* try to satisfy the allocation request with blocks within
 	 * the same allocation group as the hint.
 	 */
-	IWRITE_LOCK(ipbmap);
+	IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP);
 	if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) != -ENOSPC)
 		goto write_unlock;
 
@@ -856,7 +856,7 @@ int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
 	 * Let dbNextAG recommend a preferred allocation group
 	 */
 	agno = dbNextAG(ipbmap);
-	IWRITE_LOCK(ipbmap);
+	IWRITE_LOCK(ipbmap, RDWRLOCK_DMAP);
 	/* Try to allocate within this allocation group. if that fails, try to
 	 * allocate anywhere in the map.
 	 */
@@ -900,7 +900,7 @@ int dbAllocExact(struct inode *ip, s64 blkno, int nblocks)
 	s64 lblkno;
 	struct metapage *mp;
 
-	IREAD_LOCK(ipbmap);
+	IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
 
 	/*
 	 * validate extent request:
@@ -1050,7 +1050,7 @@ static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks)
 	 */
 	extblkno = lastblkno + 1;
 
-	IREAD_LOCK(ipbmap);
+	IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
 
 	/* better be within the file system */
 	bmp = sbi->bmap;
@@ -3116,7 +3116,7 @@ int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks)
 	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
 	struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
 
-	IREAD_LOCK(ipbmap);
+	IREAD_LOCK(ipbmap, RDWRLOCK_DMAP);
 
 	/* block to be allocated better be within the mapsize. */
 	ASSERT(nblocks <= bmp->db_mapsize - blkno);
diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c
index 53f63b47a6d3..aa5124b643b1 100644
--- a/fs/jfs/jfs_imap.c
+++ b/fs/jfs/jfs_imap.c
@@ -331,7 +331,7 @@ int diRead(struct inode *ip)
 
 	/* read the iag */
 	imap = JFS_IP(ipimap)->i_imap;
-	IREAD_LOCK(ipimap);
+	IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
 	rc = diIAGRead(imap, iagno, &mp);
 	IREAD_UNLOCK(ipimap);
 	if (rc) {
@@ -920,7 +920,7 @@ int diFree(struct inode *ip)
 	/* Obtain read lock in imap inode. Don't release it until we have
 	 * read all of the IAG's that we are going to.
 	 */
-	IREAD_LOCK(ipimap);
+	IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
 
 	/* read the iag.
 	 */
@@ -1415,7 +1415,7 @@ int diAlloc(struct inode *pip, bool dir, struct inode *ip)
 	AG_LOCK(imap, agno);
 
 	/* Get read lock on imap inode */
-	IREAD_LOCK(ipimap);
+	IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
 
 	/* get the iag number and read the iag */
 	iagno = INOTOIAG(inum);
@@ -1808,7 +1808,7 @@ static int diAllocIno(struct inomap * imap, int agno, struct inode *ip)
 		return -ENOSPC;
 
 	/* obtain read lock on imap inode */
-	IREAD_LOCK(imap->im_ipimap);
+	IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP);
 
 	/* read the iag at the head of the list.
 	 */
@@ -1946,7 +1946,7 @@ static int diAllocExt(struct inomap * imap, int agno, struct inode *ip)
 	} else {
 		/* read the iag.
		 */
-		IREAD_LOCK(imap->im_ipimap);
+		IREAD_LOCK(imap->im_ipimap, RDWRLOCK_IMAP);
 		if ((rc = diIAGRead(imap, iagno, &mp))) {
 			IREAD_UNLOCK(imap->im_ipimap);
 			jfs_error(ip->i_sb, "diAllocExt: error reading iag");
@@ -2509,7 +2509,7 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
 	 */
 
 	/* acquire inode map lock */
-	IWRITE_LOCK(ipimap);
+	IWRITE_LOCK(ipimap, RDWRLOCK_IMAP);
 
 	if (ipimap->i_size >> L2PSIZE != imap->im_nextiag + 1) {
 		IWRITE_UNLOCK(ipimap);
@@ -2648,7 +2648,7 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp)
 	}
 
 	/* obtain read lock on map */
-	IREAD_LOCK(ipimap);
+	IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
 
 	/* read the iag */
 	if ((rc = diIAGRead(imap, iagno, &mp))) {
@@ -2779,7 +2779,7 @@ diUpdatePMap(struct inode *ipimap,
 		return -EIO;
 	}
 	/* read the iag */
-	IREAD_LOCK(ipimap);
+	IREAD_LOCK(ipimap, RDWRLOCK_IMAP);
 	rc = diIAGRead(imap, iagno, &mp);
 	IREAD_UNLOCK(ipimap);
 	if (rc)
diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h
index 94005584445a..8f453eff3c83 100644
--- a/fs/jfs/jfs_incore.h
+++ b/fs/jfs/jfs_incore.h
@@ -109,9 +109,11 @@ struct jfs_inode_info {
 
 #define JFS_ACL_NOT_CACHED ((void *)-1)
 
-#define IREAD_LOCK(ip)		down_read(&JFS_IP(ip)->rdwrlock)
+#define IREAD_LOCK(ip, subclass) \
+	down_read_nested(&JFS_IP(ip)->rdwrlock, subclass)
 #define IREAD_UNLOCK(ip)	up_read(&JFS_IP(ip)->rdwrlock)
-#define IWRITE_LOCK(ip)		down_write(&JFS_IP(ip)->rdwrlock)
+#define IWRITE_LOCK(ip, subclass) \
+	down_write_nested(&JFS_IP(ip)->rdwrlock, subclass)
 #define IWRITE_UNLOCK(ip)	up_write(&JFS_IP(ip)->rdwrlock)
 
 /*
@@ -127,6 +129,29 @@ enum cflags {
 	COMMIT_Synclist,	/* metadata pages on group commit synclist */
 };
 
+/*
+ * commit_mutex nesting subclasses:
+ */
+enum commit_mutex_class
+{
+	COMMIT_MUTEX_PARENT,
+	COMMIT_MUTEX_CHILD,
+	COMMIT_MUTEX_SECOND_PARENT,	/* Renaming */
+	COMMIT_MUTEX_VICTIM		/* Inode being unlinked due to rename */
+};
+
+/*
+ * rdwrlock subclasses:
+ * The dmap inode may be locked while a normal inode or the imap inode are
+ * locked.
+ */ +enum rdwrlock_class +{ + RDWRLOCK_NORMAL, + RDWRLOCK_IMAP, + RDWRLOCK_DMAP +}; + #define set_cflag(flag, ip) set_bit(flag, &(JFS_IP(ip)->cflag)) #define clear_cflag(flag, ip) clear_bit(flag, &(JFS_IP(ip)->cflag)) #define test_cflag(flag, ip) test_bit(flag, &(JFS_IP(ip)->cflag)) diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index a6a8c16c872c..7ab47561b68d 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -104,8 +104,8 @@ static int jfs_create(struct inode *dip, struct dentry *dentry, int mode, tid = txBegin(dip->i_sb, 0); - mutex_lock(&JFS_IP(dip)->commit_mutex); - mutex_lock(&JFS_IP(ip)->commit_mutex); + mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); rc = jfs_init_acl(tid, ip, dip); if (rc) @@ -238,8 +238,8 @@ static int jfs_mkdir(struct inode *dip, struct dentry *dentry, int mode) tid = txBegin(dip->i_sb, 0); - mutex_lock(&JFS_IP(dip)->commit_mutex); - mutex_lock(&JFS_IP(ip)->commit_mutex); + mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); rc = jfs_init_acl(tid, ip, dip); if (rc) @@ -365,8 +365,8 @@ static int jfs_rmdir(struct inode *dip, struct dentry *dentry) tid = txBegin(dip->i_sb, 0); - mutex_lock(&JFS_IP(dip)->commit_mutex); - mutex_lock(&JFS_IP(ip)->commit_mutex); + mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); iplist[0] = dip; iplist[1] = ip; @@ -483,12 +483,12 @@ static int jfs_unlink(struct inode *dip, struct dentry *dentry) if ((rc = get_UCSname(&dname, dentry))) goto out; - IWRITE_LOCK(ip); + IWRITE_LOCK(ip, RDWRLOCK_NORMAL); tid = txBegin(dip->i_sb, 0); - mutex_lock(&JFS_IP(dip)->commit_mutex); - mutex_lock(&JFS_IP(ip)->commit_mutex); + mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); iplist[0] = dip; iplist[1] = ip; @@ -802,8 +802,8 @@ static int jfs_link(struct dentry *old_dentry, tid = txBegin(ip->i_sb, 0); - mutex_lock(&JFS_IP(dir)->commit_mutex); - mutex_lock(&JFS_IP(ip)->commit_mutex); + mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); /* * scan parent directory for entry/freespace @@ -913,8 +913,8 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, tid = txBegin(dip->i_sb, 0); - mutex_lock(&JFS_IP(dip)->commit_mutex); - mutex_lock(&JFS_IP(ip)->commit_mutex); + mutex_lock_nested(&JFS_IP(dip)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); rc = jfs_init_security(tid, ip, dip); if (rc) @@ -1127,7 +1127,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out3; } } else if (new_ip) { - IWRITE_LOCK(new_ip); + IWRITE_LOCK(new_ip, RDWRLOCK_NORMAL); /* Init inode for quota operations. */ DQUOT_INIT(new_ip); } @@ -1137,13 +1137,21 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, */ tid = txBegin(new_dir->i_sb, 0); - mutex_lock(&JFS_IP(new_dir)->commit_mutex); - mutex_lock(&JFS_IP(old_ip)->commit_mutex); + /* + * How do we know the locking is safe from deadlocks? + * The vfs does the hard part for us. Any time we are taking nested + * commit_mutexes, the vfs already has i_mutex held on the parent. + * Here, the vfs has already taken i_mutex on both old_dir and new_dir. 
+ */ + mutex_lock_nested(&JFS_IP(new_dir)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(old_ip)->commit_mutex, COMMIT_MUTEX_CHILD); if (old_dir != new_dir) - mutex_lock(&JFS_IP(old_dir)->commit_mutex); + mutex_lock_nested(&JFS_IP(old_dir)->commit_mutex, + COMMIT_MUTEX_SECOND_PARENT); if (new_ip) { - mutex_lock(&JFS_IP(new_ip)->commit_mutex); + mutex_lock_nested(&JFS_IP(new_ip)->commit_mutex, + COMMIT_MUTEX_VICTIM); /* * Change existing directory entry to new inode number */ @@ -1357,8 +1365,8 @@ static int jfs_mknod(struct inode *dir, struct dentry *dentry, tid = txBegin(dir->i_sb, 0); - mutex_lock(&JFS_IP(dir)->commit_mutex); - mutex_lock(&JFS_IP(ip)->commit_mutex); + mutex_lock_nested(&JFS_IP(dir)->commit_mutex, COMMIT_MUTEX_PARENT); + mutex_lock_nested(&JFS_IP(ip)->commit_mutex, COMMIT_MUTEX_CHILD); rc = jfs_init_acl(tid, ip, dir); if (rc) -- cgit v1.2.3-59-g8ed1b From 4aa0d230c2cfc1ac4bcf7c5466f9943cf14233a9 Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Wed, 17 Jan 2007 21:18:35 -0600 Subject: JFS: call io_schedule() instead of schedule() to avoid deadlock The introduction of Jens Axboe's explicit i/o plugging patches introduced a deadlock in jfs. This was caused by the process initiating I/O not unplugging the queue before waiting on the commit thread. The commit thread itself was waiting for that I/O to complete. Calling io_schedule() rather than schedule() unplugs the I/O queue avoiding the deadlock, and it appears to be the right function to call in any case. Signed-off-by: Dave Kleikamp --- fs/jfs/jfs_lock.h | 2 +- fs/jfs/jfs_metapage.c | 2 +- fs/jfs/jfs_txnmgr.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/jfs/jfs_lock.h b/fs/jfs/jfs_lock.h index 7d78e83d7c40..df48ece4b7a3 100644 --- a/fs/jfs/jfs_lock.h +++ b/fs/jfs/jfs_lock.h @@ -42,7 +42,7 @@ do { \ if (cond) \ break; \ unlock_cmd; \ - schedule(); \ + io_schedule(); \ lock_cmd; \ } \ current->state = TASK_RUNNING; \ diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index ceaf03b94935..58deae007507 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -56,7 +56,7 @@ static inline void __lock_metapage(struct metapage *mp) set_current_state(TASK_UNINTERRUPTIBLE); if (metapage_locked(mp)) { unlock_page(mp->page); - schedule(); + io_schedule(); lock_page(mp->page); } } while (trylock_metapage(mp)); diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index d558e51b0df8..6988a1082f58 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -135,7 +135,7 @@ static inline void TXN_SLEEP_DROP_LOCK(wait_queue_head_t * event) add_wait_queue(event, &wait); set_current_state(TASK_UNINTERRUPTIBLE); TXN_UNLOCK(); - schedule(); + io_schedule(); current->state = TASK_RUNNING; remove_wait_queue(event, &wait); } -- cgit v1.2.3-59-g8ed1b From 7220c0177b45600eef2cfd3e5e57ab5b96f3222c Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Fri, 26 Jan 2007 10:14:36 -0600 Subject: JFS: Remove incorrect kgdb define jfs_debug.h uses an incorrect CONFIG_KERNEL_ASSERT ifdef to redefine the assert macro for kgdb use. I believe the code worked a long time ago, but today it's not a valid config option. Since I'm not aware of anybody interested in debugging jfs with kgdb, it should just be removed. Thanks to Robert P. J. Day for reporting this. 

Signed-off-by: Dave Kleikamp
---
 fs/jfs/jfs_debug.h | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/fs/jfs/jfs_debug.h b/fs/jfs/jfs_debug.h
index ddffbbd4d955..7378798f0b21 100644
--- a/fs/jfs/jfs_debug.h
+++ b/fs/jfs/jfs_debug.h
@@ -39,10 +39,6 @@ extern void jfs_proc_clean(void);
 /*
  * assert with traditional printf/panic
  */
-#ifdef CONFIG_KERNEL_ASSERTS
-/* kgdb stuff */
-#define assert(p) KERNEL_ASSERT(#p, p)
-#else
 #define assert(p) do {	\
 	if (!(p)) {	\
 		printk(KERN_CRIT "BUG at %s:%d assert(%s)\n",	\
@@ -50,7 +46,6 @@ extern void jfs_proc_clean(void);
 		BUG();	\
 	}	\
 } while (0)
-#endif
 
 /*
  * debug ON
--
cgit v1.2.3-59-g8ed1b
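
A note on the lockdep annotations in the second patch above: mutex_lock_nested() and the
down_read_nested()/down_write_nested() variants take a subclass argument so that lockdep
treats a deliberate, fixed nesting of two locks from the same lock class as intentional
rather than reporting it as a possible recursive deadlock. The stand-alone sketch below
shows that pattern outside of JFS; it is an illustration only, and every demo_* name in it
is invented for the example rather than taken from the patches or from existing kernel code.

/* Sketch only: demo_* names are made up; this is not JFS code. */
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/slab.h>

/* One subclass per role a lock of this class can play (cf. commit_mutex_class). */
enum demo_mutex_class {
	DEMO_MUTEX_PARENT,
	DEMO_MUTEX_CHILD,
};

struct demo_node {
	struct mutex lock;
};

static struct demo_node *demo_node_alloc(void)
{
	struct demo_node *n = kzalloc(sizeof(*n), GFP_KERNEL);

	if (n)
		mutex_init(&n->lock);	/* one init site, so every node's lock shares a lockdep class */
	return n;
}

static void demo_lock_pair(struct demo_node *parent, struct demo_node *child)
{
	/*
	 * Two locks of the same class normally look like a recursive
	 * deadlock to lockdep.  The subclass arguments document the fixed
	 * parent -> child order, as COMMIT_MUTEX_PARENT/CHILD do in jfs.
	 */
	mutex_lock_nested(&parent->lock, DEMO_MUTEX_PARENT);
	mutex_lock_nested(&child->lock, DEMO_MUTEX_CHILD);
}

static void demo_unlock_pair(struct demo_node *parent, struct demo_node *child)
{
	mutex_unlock(&child->lock);
	mutex_unlock(&parent->lock);
}

static int __init demo_init(void)
{
	struct demo_node *dir = demo_node_alloc();
	struct demo_node *file = demo_node_alloc();

	if (dir && file) {
		demo_lock_pair(dir, file);
		/* ... work that needs both locks ... */
		demo_unlock_pair(dir, file);
	}
	kfree(file);
	kfree(dir);
	return 0;
}

static void __exit demo_exit(void)
{
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

When lockdep (CONFIG_DEBUG_LOCK_ALLOC) is not enabled, the _nested variants fall back to
the plain locking calls, so the subclass constants affect validation only, not runtime
behavior.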