From 5db53f3e80dee2d9dff5e534f9e9fe1db17c9936 Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Fri, 20 Nov 2009 20:13:39 +0100 Subject: [LogFS] add new flash file system This is a new flash file system. See Documentation/filesystems/logfs.txt Signed-off-by: Joern Engel --- fs/logfs/dev_mtd.c | 253 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 253 insertions(+) create mode 100644 fs/logfs/dev_mtd.c (limited to 'fs/logfs/dev_mtd.c') diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c new file mode 100644 index 000000000000..68e99d046c23 --- /dev/null +++ b/fs/logfs/dev_mtd.c @@ -0,0 +1,253 @@ +/* + * fs/logfs/dev_mtd.c - Device access methods for MTD + * + * As should be obvious for Linux kernel code, license is GPLv2 + * + * Copyright (c) 2005-2008 Joern Engel + */ +#include "logfs.h" +#include +#include +#include + +#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) + +static int mtd_read(struct super_block *sb, loff_t ofs, size_t len, void *buf) +{ + struct mtd_info *mtd = logfs_super(sb)->s_mtd; + size_t retlen; + int ret; + + ret = mtd->read(mtd, ofs, len, &retlen, buf); + BUG_ON(ret == -EINVAL); + if (ret) + return ret; + + /* Not sure if we should loop instead. */ + if (retlen != len) + return -EIO; + + return 0; +} + +static int mtd_write(struct super_block *sb, loff_t ofs, size_t len, void *buf) +{ + struct logfs_super *super = logfs_super(sb); + struct mtd_info *mtd = super->s_mtd; + size_t retlen; + loff_t page_start, page_end; + int ret; + + if (super->s_flags & LOGFS_SB_FLAG_RO) + return -EROFS; + + BUG_ON((ofs >= mtd->size) || (len > mtd->size - ofs)); + BUG_ON(ofs != (ofs >> super->s_writeshift) << super->s_writeshift); + BUG_ON(len > PAGE_CACHE_SIZE); + page_start = ofs & PAGE_CACHE_MASK; + page_end = PAGE_CACHE_ALIGN(ofs + len) - 1; + ret = mtd->write(mtd, ofs, len, &retlen, buf); + if (ret || (retlen != len)) + return -EIO; + + return 0; +} + +/* + * For as long as I can remember (since about 2001) mtd->erase has been an + * asynchronous interface lacking the first driver to actually use the + * asynchronous properties. So just to prevent the first implementor of such + * a thing from breaking logfs in 2350, we do the usual pointless dance to + * declare a completion variable and wait for completion before returning + * from mtd_erase(). What an excercise in futility! + */ +static void logfs_erase_callback(struct erase_info *ei) +{ + complete((struct completion *)ei->priv); +} + +static int mtd_erase_mapping(struct super_block *sb, loff_t ofs, size_t len) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + struct page *page; + pgoff_t index = ofs >> PAGE_SHIFT; + + for (index = ofs >> PAGE_SHIFT; index < (ofs + len) >> PAGE_SHIFT; index++) { + page = find_get_page(mapping, index); + if (!page) + continue; + memset(page_address(page), 0xFF, PAGE_SIZE); + page_cache_release(page); + } + return 0; +} + +static int mtd_erase(struct super_block *sb, loff_t ofs, size_t len) +{ + struct mtd_info *mtd = logfs_super(sb)->s_mtd; + struct erase_info ei; + DECLARE_COMPLETION_ONSTACK(complete); + int ret; + + BUG_ON(len % mtd->erasesize); + if (logfs_super(sb)->s_flags & LOGFS_SB_FLAG_RO) + return -EROFS; + + memset(&ei, 0, sizeof(ei)); + ei.mtd = mtd; + ei.addr = ofs; + ei.len = len; + ei.callback = logfs_erase_callback; + ei.priv = (long)&complete; + ret = mtd->erase(mtd, &ei); + if (ret) + return -EIO; + + wait_for_completion(&complete); + if (ei.state != MTD_ERASE_DONE) + return -EIO; + return mtd_erase_mapping(sb, ofs, len); +} + +static void mtd_sync(struct super_block *sb) +{ + struct mtd_info *mtd = logfs_super(sb)->s_mtd; + + if (mtd->sync) + mtd->sync(mtd); +} + +static int mtd_readpage(void *_sb, struct page *page) +{ + struct super_block *sb = _sb; + int err; + + err = mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE, + page_address(page)); + if (err == -EUCLEAN) { + err = 0; + /* FIXME: force GC this segment */ + } + if (err) { + ClearPageUptodate(page); + SetPageError(page); + } else { + SetPageUptodate(page); + ClearPageError(page); + } + unlock_page(page); + return err; +} + +static struct page *mtd_find_first_sb(struct super_block *sb, u64 *ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = mtd_readpage; + struct mtd_info *mtd = super->s_mtd; + + if (!mtd->block_isbad) + return NULL; + + *ofs = 0; + while (mtd->block_isbad(mtd, *ofs)) { + *ofs += mtd->erasesize; + if (*ofs >= mtd->size) + return NULL; + } + BUG_ON(*ofs & ~PAGE_MASK); + return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb); +} + +static struct page *mtd_find_last_sb(struct super_block *sb, u64 *ofs) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + filler_t *filler = mtd_readpage; + struct mtd_info *mtd = super->s_mtd; + + if (!mtd->block_isbad) + return NULL; + + *ofs = mtd->size - mtd->erasesize; + while (mtd->block_isbad(mtd, *ofs)) { + *ofs -= mtd->erasesize; + if (*ofs <= 0) + return NULL; + } + *ofs = *ofs + mtd->erasesize - 0x1000; + BUG_ON(*ofs & ~PAGE_MASK); + return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb); +} + +static int __mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, + size_t nr_pages) +{ + struct logfs_super *super = logfs_super(sb); + struct address_space *mapping = super->s_mapping_inode->i_mapping; + struct page *page; + int i, err; + + for (i = 0; i < nr_pages; i++) { + page = find_lock_page(mapping, index + i); + BUG_ON(!page); + + err = mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE, + page_address(page)); + unlock_page(page); + page_cache_release(page); + if (err) + return err; + } + return 0; +} + +static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len) +{ + struct logfs_super *super = logfs_super(sb); + int head; + + if (super->s_flags & LOGFS_SB_FLAG_RO) + return; + + if (len == 0) { + /* This can happen when the object fit perfectly into a + * segment, the segment gets written per sync and subsequently + * closed. + */ + return; + } + head = ofs & (PAGE_SIZE - 1); + if (head) { + ofs -= head; + len += head; + } + len = PAGE_ALIGN(len); + __mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT); +} + +static void mtd_put_device(struct super_block *sb) +{ + put_mtd_device(logfs_super(sb)->s_mtd); +} + +static const struct logfs_device_ops mtd_devops = { + .find_first_sb = mtd_find_first_sb, + .find_last_sb = mtd_find_last_sb, + .readpage = mtd_readpage, + .writeseg = mtd_writeseg, + .erase = mtd_erase, + .sync = mtd_sync, + .put_device = mtd_put_device, +}; + +int logfs_get_sb_mtd(struct file_system_type *type, int flags, + int mtdnr, struct vfsmount *mnt) +{ + struct mtd_info *mtd; + const struct logfs_device_ops *devops = &mtd_devops; + + mtd = get_mtd_device(NULL, mtdnr); + return logfs_get_sb_device(type, flags, mtd, NULL, devops, mnt); +} -- cgit v1.2.3-59-g8ed1b From 9421502b4fc894cc477be8fc49776830e37ca157 Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Thu, 4 Mar 2010 21:30:58 +0100 Subject: [LogFS] Fix bdev erases Erases for block devices were always just emulated by writing 0xff. Some time back the write was removed and only the page cache was changed to 0xff. Superficialy a good idea with two problems: 1. Touching the page cache isn't necessary either. 2. However, writing out 0xff _is_ necessary for the journal. As the journal is scanned linearly, an old non-overwritten commit entry can be used on next mount and cause havoc. This should fix both aspects. --- fs/logfs/dev_bdev.c | 88 +++++++++++++++++++++++++++++++++++++++++++++-------- fs/logfs/dev_mtd.c | 3 +- fs/logfs/journal.c | 2 +- fs/logfs/logfs.h | 6 ++-- fs/logfs/segment.c | 6 ++-- fs/logfs/super.c | 12 +++++++- 6 files changed, 97 insertions(+), 20 deletions(-) (limited to 'fs/logfs/dev_mtd.c') diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c index 58a057b6e1af..9718c22f186d 100644 --- a/fs/logfs/dev_bdev.c +++ b/fs/logfs/dev_bdev.c @@ -167,27 +167,91 @@ static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len) generic_unplug_device(bdev_get_queue(logfs_super(sb)->s_bdev)); } -static int bdev_erase(struct super_block *sb, loff_t to, size_t len) + +static void erase_end_io(struct bio *bio, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct super_block *sb = bio->bi_private; + struct logfs_super *super = logfs_super(sb); + + BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */ + BUG_ON(err); + BUG_ON(bio->bi_vcnt == 0); + bio_put(bio); + if (atomic_dec_and_test(&super->s_pending_writes)) + wake_up(&wq); +} + +static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index, + size_t nr_pages) +{ + struct logfs_super *super = logfs_super(sb); + struct bio *bio; + struct request_queue *q = bdev_get_queue(sb->s_bdev); + unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9); + int i; + + bio = bio_alloc(GFP_NOFS, max_pages); + BUG_ON(!bio); /* FIXME: handle this */ + + for (i = 0; i < nr_pages; i++) { + if (i >= max_pages) { + /* Block layer cannot split bios :( */ + bio->bi_vcnt = i; + bio->bi_idx = 0; + bio->bi_size = i * PAGE_SIZE; + bio->bi_bdev = super->s_bdev; + bio->bi_sector = ofs >> 9; + bio->bi_private = sb; + bio->bi_end_io = erase_end_io; + atomic_inc(&super->s_pending_writes); + submit_bio(WRITE, bio); + + ofs += i * PAGE_SIZE; + index += i; + nr_pages -= i; + i = 0; + + bio = bio_alloc(GFP_NOFS, max_pages); + BUG_ON(!bio); + } + bio->bi_io_vec[i].bv_page = super->s_erase_page; + bio->bi_io_vec[i].bv_len = PAGE_SIZE; + bio->bi_io_vec[i].bv_offset = 0; + } + bio->bi_vcnt = nr_pages; + bio->bi_idx = 0; + bio->bi_size = nr_pages * PAGE_SIZE; + bio->bi_bdev = super->s_bdev; + bio->bi_sector = ofs >> 9; + bio->bi_private = sb; + bio->bi_end_io = erase_end_io; + atomic_inc(&super->s_pending_writes); + submit_bio(WRITE, bio); + return 0; +} + +static int bdev_erase(struct super_block *sb, loff_t to, size_t len, + int ensure_write) { struct logfs_super *super = logfs_super(sb); - struct address_space *mapping = super->s_mapping_inode->i_mapping; - struct page *page; - pgoff_t index = to >> PAGE_SHIFT; - int i, nr_pages = len >> PAGE_SHIFT; BUG_ON(to & (PAGE_SIZE - 1)); BUG_ON(len & (PAGE_SIZE - 1)); - if (logfs_super(sb)->s_flags & LOGFS_SB_FLAG_RO) + if (super->s_flags & LOGFS_SB_FLAG_RO) return -EROFS; - for (i = 0; i < nr_pages; i++) { - page = find_get_page(mapping, index + i); - if (page) { - memset(page_address(page), 0xFF, PAGE_SIZE); - page_cache_release(page); - } + if (ensure_write) { + /* + * Object store doesn't care whether erases happen or not. + * But for the journal they are required. Otherwise a scan + * can find an old commit entry and assume it is the current + * one, travelling back in time. + */ + do_erase(sb, to, to >> PAGE_SHIFT, len >> PAGE_SHIFT); } + return 0; } diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c index 68e99d046c23..cafb6ef2e05b 100644 --- a/fs/logfs/dev_mtd.c +++ b/fs/logfs/dev_mtd.c @@ -83,7 +83,8 @@ static int mtd_erase_mapping(struct super_block *sb, loff_t ofs, size_t len) return 0; } -static int mtd_erase(struct super_block *sb, loff_t ofs, size_t len) +static int mtd_erase(struct super_block *sb, loff_t ofs, size_t len, + int ensure_write) { struct mtd_info *mtd = logfs_super(sb)->s_mtd; struct erase_info ei; diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c index 2f2e8e4fd02d..c0e7d63221d4 100644 --- a/fs/logfs/journal.c +++ b/fs/logfs/journal.c @@ -392,7 +392,7 @@ static int journal_erase_segment(struct logfs_area *area) u64 ofs; int err; - err = logfs_erase_segment(sb, area->a_segno); + err = logfs_erase_segment(sb, area->a_segno, 1); if (err) return err; diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h index e3082abe9e3b..72592114a28f 100644 --- a/fs/logfs/logfs.h +++ b/fs/logfs/logfs.h @@ -151,7 +151,8 @@ struct logfs_device_ops { int (*write_sb)(struct super_block *sb, struct page *page); int (*readpage)(void *_sb, struct page *page); void (*writeseg)(struct super_block *sb, u64 ofs, size_t len); - int (*erase)(struct super_block *sb, loff_t ofs, size_t len); + int (*erase)(struct super_block *sb, loff_t ofs, size_t len, + int ensure_write); void (*sync)(struct super_block *sb); void (*put_device)(struct super_block *sb); }; @@ -327,6 +328,7 @@ struct logfs_super { u64 s_feature_compat; u64 s_feature_flags; u64 s_sb_ofs[2]; + struct page *s_erase_page; /* for dev_bdev.c */ /* alias.c fields */ struct btree_head32 s_segment_alias; /* remapped segments */ int s_no_object_aliases; @@ -572,7 +574,7 @@ int get_page_reserve(struct inode *inode, struct page *page); extern struct logfs_block_ops indirect_block_ops; /* segment.c */ -int logfs_erase_segment(struct super_block *sb, u32 ofs); +int logfs_erase_segment(struct super_block *sb, u32 ofs, int ensure_erase); int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf); int logfs_segment_read(struct inode *inode, struct page *page, u64 ofs, u64 bix, level_t level); diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c index 5f58b74516ca..664cd0dd3576 100644 --- a/fs/logfs/segment.c +++ b/fs/logfs/segment.c @@ -25,14 +25,14 @@ static int logfs_mark_segment_bad(struct super_block *sb, u32 segno) return 0; } -int logfs_erase_segment(struct super_block *sb, u32 segno) +int logfs_erase_segment(struct super_block *sb, u32 segno, int ensure_erase) { struct logfs_super *super = logfs_super(sb); super->s_gec++; return super->s_devops->erase(sb, (u64)segno << super->s_segshift, - super->s_segsize); + super->s_segsize, ensure_erase); } static s64 logfs_get_free_bytes(struct logfs_area *area, size_t bytes) @@ -798,7 +798,7 @@ static int ostore_erase_segment(struct logfs_area *area) u64 ofs; int err; - err = logfs_erase_segment(sb, area->a_segno); + err = logfs_erase_segment(sb, area->a_segno, 0); if (err) return err; diff --git a/fs/logfs/super.c b/fs/logfs/super.c index d128a2c1c8d1..94d80f7ee7c2 100644 --- a/fs/logfs/super.c +++ b/fs/logfs/super.c @@ -317,6 +317,7 @@ static int logfs_make_writeable(struct super_block *sb) static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt) { + struct logfs_super *super = logfs_super(sb); struct inode *rootdir; int err; @@ -329,15 +330,22 @@ static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt) if (!sb->s_root) goto fail; + super->s_erase_page = alloc_pages(GFP_KERNEL, 0); + if (!super->s_erase_page) + goto fail2; + memset(page_address(super->s_erase_page), 0xFF, PAGE_SIZE); + /* FIXME: check for read-only mounts */ err = logfs_make_writeable(sb); if (err) - goto fail2; + goto fail3; log_super("LogFS: Finished mounting\n"); simple_set_mnt(mnt, sb); return 0; +fail3: + __free_page(super->s_erase_page); fail2: iput(rootdir); fail: @@ -498,6 +506,8 @@ static void logfs_kill_sb(struct super_block *sb) logfs_cleanup_journal(sb); logfs_cleanup_areas(sb); logfs_cleanup_rw(sb); + if (super->s_erase_page) + __free_page(super->s_erase_page); super->s_devops->put_device(sb); mempool_destroy(super->s_btree_pool); mempool_destroy(super->s_alias_pool); -- cgit v1.2.3-59-g8ed1b