From 1bf94ca73ea524228b864275efa44373ebb939a0 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 25 May 2010 15:06:06 +0200 Subject: fuse: use get_user_pages_fast() Replace uses of get_user_pages() with get_user_pages_fast(). It looks nicer and should be faster in most cases. Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'fs/fuse/dev.c') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index eb7e9423691f..4623018e104a 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -551,10 +551,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) cs->iov++; cs->nr_segs--; } - down_read(¤t->mm->mmap_sem); - err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0, - &cs->pg, NULL); - up_read(¤t->mm->mmap_sem); + err = get_user_pages_fast(cs->addr, 1, cs->write, &cs->pg); if (err < 0) return err; BUG_ON(err != 1); -- cgit v1.3-8-gc7d7 From dd3bb14f44a6382de2508ec387c7e5569ad2d4f1 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 25 May 2010 15:06:06 +0200 Subject: fuse: support splice() writing to fuse device Allow userspace filesystem implementation to use splice() to write to the fuse device. The semantics of using splice() are: 1) buffer the message header and data in a temporary pipe 2) with a *single* splice() call move the message from the temporary pipe to the fuse device The READ reply message has the most interesting use for this, since now the data from an arbitrary file descriptor (which could be a regular file, a block device or a socket) can be tranferred into the fuse device without having to go through a userspace buffer. It will also allow zero copy moving of pages. One caveat is that the protocol on the fuse device requires the length of the whole message to be written into the header. But the length of the data transferred into the temporary pipe may not be known in advance. The current library implementation works around this by using vmplice to write the header and modifying the header after splicing the data into the pipe (error handling omitted): struct fuse_out_header out; iov.iov_base = &out; iov.iov_len = sizeof(struct fuse_out_header); vmsplice(pip[1], &iov, 1, 0); len = splice(input_fd, input_offset, pip[1], NULL, len, 0); /* retrospectively modify the header: */ out.len = len + sizeof(struct fuse_out_header); splice(pip[0], NULL, fuse_chan_fd(req->ch), NULL, out.len, flags); This works since vmsplice only saves a pointer to the data, it does not copy the data itself. Since pipes are currently limited to 16 pages and messages need to be spliced atomically, the length of the data is limited to 15 pages (or 60kB for 4k pages). Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 175 ++++++++++++++++++++++++++++++++++++++++++--------- include/linux/fuse.h | 5 +- 2 files changed, 148 insertions(+), 32 deletions(-) (limited to 'fs/fuse/dev.c') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 4623018e104a..2795045484ee 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -16,6 +16,7 @@ #include #include #include +#include MODULE_ALIAS_MISCDEV(FUSE_MINOR); @@ -498,6 +499,9 @@ struct fuse_copy_state { int write; struct fuse_req *req; const struct iovec *iov; + struct pipe_buffer *pipebufs; + struct pipe_buffer *currbuf; + struct pipe_inode_info *pipe; unsigned long nr_segs; unsigned long seglen; unsigned long addr; @@ -522,7 +526,14 @@ static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc, /* Unmap and put previous page of userspace buffer */ static void fuse_copy_finish(struct fuse_copy_state *cs) { - if (cs->mapaddr) { + if (cs->currbuf) { + struct pipe_buffer *buf = cs->currbuf; + + buf->ops->unmap(cs->pipe, buf, cs->mapaddr); + + cs->currbuf = NULL; + cs->mapaddr = NULL; + } else if (cs->mapaddr) { kunmap_atomic(cs->mapaddr, KM_USER0); if (cs->write) { flush_dcache_page(cs->pg); @@ -544,23 +555,39 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) unlock_request(cs->fc, cs->req); fuse_copy_finish(cs); - if (!cs->seglen) { + if (cs->pipebufs) { + struct pipe_buffer *buf = cs->pipebufs; + + err = buf->ops->confirm(cs->pipe, buf); + if (err) + return err; + BUG_ON(!cs->nr_segs); - cs->seglen = cs->iov[0].iov_len; - cs->addr = (unsigned long) cs->iov[0].iov_base; - cs->iov++; + cs->currbuf = buf; + cs->mapaddr = buf->ops->map(cs->pipe, buf, 1); + cs->len = buf->len; + cs->buf = cs->mapaddr + buf->offset; + cs->pipebufs++; cs->nr_segs--; + } else { + if (!cs->seglen) { + BUG_ON(!cs->nr_segs); + cs->seglen = cs->iov[0].iov_len; + cs->addr = (unsigned long) cs->iov[0].iov_base; + cs->iov++; + cs->nr_segs--; + } + err = get_user_pages_fast(cs->addr, 1, cs->write, &cs->pg); + if (err < 0) + return err; + BUG_ON(err != 1); + offset = cs->addr % PAGE_SIZE; + cs->mapaddr = kmap_atomic(cs->pg, KM_USER0); + cs->buf = cs->mapaddr + offset; + cs->len = min(PAGE_SIZE - offset, cs->seglen); + cs->seglen -= cs->len; + cs->addr += cs->len; } - err = get_user_pages_fast(cs->addr, 1, cs->write, &cs->pg); - if (err < 0) - return err; - BUG_ON(err != 1); - offset = cs->addr % PAGE_SIZE; - cs->mapaddr = kmap_atomic(cs->pg, KM_USER0); - cs->buf = cs->mapaddr + offset; - cs->len = min(PAGE_SIZE - offset, cs->seglen); - cs->seglen -= cs->len; - cs->addr += cs->len; return lock_request(cs->fc, cs->req); } @@ -984,23 +1011,17 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out, * it from the list and copy the rest of the buffer to the request. * The request is finished by calling request_end() */ -static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static ssize_t fuse_dev_do_write(struct fuse_conn *fc, + struct fuse_copy_state *cs, size_t nbytes) { int err; - size_t nbytes = iov_length(iov, nr_segs); struct fuse_req *req; struct fuse_out_header oh; - struct fuse_copy_state cs; - struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp); - if (!fc) - return -EPERM; - fuse_copy_init(&cs, fc, 0, NULL, iov, nr_segs); if (nbytes < sizeof(struct fuse_out_header)) return -EINVAL; - err = fuse_copy_one(&cs, &oh, sizeof(oh)); + err = fuse_copy_one(cs, &oh, sizeof(oh)); if (err) goto err_finish; @@ -1013,7 +1034,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, * and error contains notification code. */ if (!oh.unique) { - err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), &cs); + err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs); return err ? err : nbytes; } @@ -1032,7 +1053,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, if (req->aborted) { spin_unlock(&fc->lock); - fuse_copy_finish(&cs); + fuse_copy_finish(cs); spin_lock(&fc->lock); request_end(fc, req); return -ENOENT; @@ -1049,7 +1070,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, queue_interrupt(fc, req); spin_unlock(&fc->lock); - fuse_copy_finish(&cs); + fuse_copy_finish(cs); return nbytes; } @@ -1057,11 +1078,11 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, list_move(&req->list, &fc->io); req->out.h = oh; req->locked = 1; - cs.req = req; + cs->req = req; spin_unlock(&fc->lock); - err = copy_out_args(&cs, &req->out, nbytes); - fuse_copy_finish(&cs); + err = copy_out_args(cs, &req->out, nbytes); + fuse_copy_finish(cs); spin_lock(&fc->lock); req->locked = 0; @@ -1077,10 +1098,101 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, err_unlock: spin_unlock(&fc->lock); err_finish: - fuse_copy_finish(&cs); + fuse_copy_finish(cs); return err; } +static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct fuse_copy_state cs; + struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp); + if (!fc) + return -EPERM; + + fuse_copy_init(&cs, fc, 0, NULL, iov, nr_segs); + + return fuse_dev_do_write(fc, &cs, iov_length(iov, nr_segs)); +} + +static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, + size_t len, unsigned int flags) +{ + unsigned nbuf; + unsigned idx; + struct pipe_buffer *bufs; + struct fuse_copy_state cs; + struct fuse_conn *fc; + size_t rem; + ssize_t ret; + + fc = fuse_get_conn(out); + if (!fc) + return -EPERM; + + bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL); + if (!bufs) + return -ENOMEM; + + pipe_lock(pipe); + nbuf = 0; + rem = 0; + for (idx = 0; idx < pipe->nrbufs && rem < len; idx++) + rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len; + + ret = -EINVAL; + if (rem < len) { + pipe_unlock(pipe); + goto out; + } + + rem = len; + while (rem) { + struct pipe_buffer *ibuf; + struct pipe_buffer *obuf; + + BUG_ON(nbuf >= pipe->buffers); + BUG_ON(!pipe->nrbufs); + ibuf = &pipe->bufs[pipe->curbuf]; + obuf = &bufs[nbuf]; + + if (rem >= ibuf->len) { + *obuf = *ibuf; + ibuf->ops = NULL; + pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1); + pipe->nrbufs--; + } else { + ibuf->ops->get(pipe, ibuf); + *obuf = *ibuf; + obuf->flags &= ~PIPE_BUF_FLAG_GIFT; + obuf->len = rem; + ibuf->offset += obuf->len; + ibuf->len -= obuf->len; + } + nbuf++; + rem -= obuf->len; + } + pipe_unlock(pipe); + + memset(&cs, 0, sizeof(struct fuse_copy_state)); + cs.fc = fc; + cs.write = 0; + cs.pipebufs = bufs; + cs.nr_segs = nbuf; + cs.pipe = pipe; + + ret = fuse_dev_do_write(fc, &cs, len); + + for (idx = 0; idx < nbuf; idx++) { + struct pipe_buffer *buf = &bufs[idx]; + buf->ops->release(pipe, buf); + } +out: + kfree(bufs); + return ret; +} + static unsigned fuse_dev_poll(struct file *file, poll_table *wait) { unsigned mask = POLLOUT | POLLWRNORM; @@ -1224,6 +1336,7 @@ const struct file_operations fuse_dev_operations = { .aio_read = fuse_dev_read, .write = do_sync_write, .aio_write = fuse_dev_write, + .splice_write = fuse_dev_splice_write, .poll = fuse_dev_poll, .release = fuse_dev_release, .fasync = fuse_dev_fasync, diff --git a/include/linux/fuse.h b/include/linux/fuse.h index 3e2925a34bf0..88e0eb596919 100644 --- a/include/linux/fuse.h +++ b/include/linux/fuse.h @@ -34,6 +34,9 @@ * 7.13 * - make max number of background requests and congestion threshold * tunables + * + * 7.14 + * - add splice support to fuse device */ #ifndef _LINUX_FUSE_H @@ -65,7 +68,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 13 +#define FUSE_KERNEL_MINOR_VERSION 14 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 -- cgit v1.3-8-gc7d7 From ce534fb052928ce556639d7ecf01cbf4e01321e1 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 25 May 2010 15:06:07 +0200 Subject: fuse: allow splice to move pages When splicing buffers to the fuse device with SPLICE_F_MOVE, try to move pages from the pipe buffer into the page cache. This allows populating the fuse filesystem's cache without ever touching the page contents, i.e. zero copy read capability. The following steps are performed when trying to move a page into the page cache: - buf->ops->confirm() to make sure the new page is uptodate - buf->ops->steal() to try to remove the new page from it's previous place - remove_from_page_cache() on the old page - add_to_page_cache_locked() on the new page If any of the above steps fail (non fatally) then the code falls back to copying the page. In particular ->steal() will fail if there are external references (other than the page cache and the pipe buffer) to the page. Also since the remove_from_page_cache() + add_to_page_cache_locked() are non-atomic it is possible that the page cache is repopulated in between the two and add_to_page_cache_locked() will fail. This could be fixed by creating a new atomic replace_page_cache_page() function. fuse_readpages_end() needed to be reworked so it works even if page->mapping is NULL for some or all pages which can happen if the add_to_page_cache_locked() failed. A number of sanity checks were added to make sure the stolen pages don't have weird flags set, etc... These could be moved into generic splice/steal code. Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 151 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- fs/fuse/file.c | 28 +++++++---- fs/fuse/fuse_i.h | 3 ++ 3 files changed, 167 insertions(+), 15 deletions(-) (limited to 'fs/fuse/dev.c') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 2795045484ee..b070d3adf9b0 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -17,6 +17,8 @@ #include #include #include +#include +#include MODULE_ALIAS_MISCDEV(FUSE_MINOR); @@ -509,6 +511,7 @@ struct fuse_copy_state { void *mapaddr; void *buf; unsigned len; + unsigned move_pages:1; }; static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc, @@ -609,13 +612,135 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size) return ncpy; } +static int fuse_check_page(struct page *page) +{ + if (page_mapcount(page) || + page->mapping != NULL || + page_count(page) != 1 || + (page->flags & PAGE_FLAGS_CHECK_AT_PREP & + ~(1 << PG_locked | + 1 << PG_referenced | + 1 << PG_uptodate | + 1 << PG_lru | + 1 << PG_active | + 1 << PG_reclaim))) { + printk(KERN_WARNING "fuse: trying to steal weird page\n"); + printk(KERN_WARNING " page=%p index=%li flags=%08lx, count=%i, mapcount=%i, mapping=%p\n", page, page->index, page->flags, page_count(page), page_mapcount(page), page->mapping); + return 1; + } + return 0; +} + +static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) +{ + int err; + struct page *oldpage = *pagep; + struct page *newpage; + struct pipe_buffer *buf = cs->pipebufs; + struct address_space *mapping; + pgoff_t index; + + unlock_request(cs->fc, cs->req); + fuse_copy_finish(cs); + + err = buf->ops->confirm(cs->pipe, buf); + if (err) + return err; + + BUG_ON(!cs->nr_segs); + cs->currbuf = buf; + cs->len = buf->len; + cs->pipebufs++; + cs->nr_segs--; + + if (cs->len != PAGE_SIZE) + goto out_fallback; + + if (buf->ops->steal(cs->pipe, buf) != 0) + goto out_fallback; + + newpage = buf->page; + + if (WARN_ON(!PageUptodate(newpage))) + return -EIO; + + ClearPageMappedToDisk(newpage); + + if (fuse_check_page(newpage) != 0) + goto out_fallback_unlock; + + mapping = oldpage->mapping; + index = oldpage->index; + + /* + * This is a new and locked page, it shouldn't be mapped or + * have any special flags on it + */ + if (WARN_ON(page_mapped(oldpage))) + goto out_fallback_unlock; + if (WARN_ON(page_has_private(oldpage))) + goto out_fallback_unlock; + if (WARN_ON(PageDirty(oldpage) || PageWriteback(oldpage))) + goto out_fallback_unlock; + if (WARN_ON(PageMlocked(oldpage))) + goto out_fallback_unlock; + + remove_from_page_cache(oldpage); + page_cache_release(oldpage); + + err = add_to_page_cache_locked(newpage, mapping, index, GFP_KERNEL); + if (err) { + printk(KERN_WARNING "fuse_try_move_page: failed to add page"); + goto out_fallback_unlock; + } + page_cache_get(newpage); + + if (!(buf->flags & PIPE_BUF_FLAG_LRU)) + lru_cache_add_file(newpage); + + err = 0; + spin_lock(&cs->fc->lock); + if (cs->req->aborted) + err = -ENOENT; + else + *pagep = newpage; + spin_unlock(&cs->fc->lock); + + if (err) { + unlock_page(newpage); + page_cache_release(newpage); + return err; + } + + unlock_page(oldpage); + page_cache_release(oldpage); + cs->len = 0; + + return 0; + +out_fallback_unlock: + unlock_page(newpage); +out_fallback: + cs->mapaddr = buf->ops->map(cs->pipe, buf, 1); + cs->buf = cs->mapaddr + buf->offset; + + err = lock_request(cs->fc, cs->req); + if (err) + return err; + + return 1; +} + /* * Copy a page in the request to/from the userspace buffer. Must be * done atomically */ -static int fuse_copy_page(struct fuse_copy_state *cs, struct page *page, +static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, unsigned offset, unsigned count, int zeroing) { + int err; + struct page *page = *pagep; + if (page && zeroing && count < PAGE_SIZE) { void *mapaddr = kmap_atomic(page, KM_USER1); memset(mapaddr, 0, PAGE_SIZE); @@ -623,9 +748,16 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page *page, } while (count) { if (!cs->len) { - int err = fuse_copy_fill(cs); - if (err) - return err; + if (cs->move_pages && page && + offset == 0 && count == PAGE_SIZE) { + err = fuse_try_move_page(cs, pagep); + if (err <= 0) + return err; + } else { + err = fuse_copy_fill(cs); + if (err) + return err; + } } if (page) { void *mapaddr = kmap_atomic(page, KM_USER1); @@ -650,8 +782,10 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset); for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) { - struct page *page = req->pages[i]; - int err = fuse_copy_page(cs, page, offset, count, zeroing); + int err; + + err = fuse_copy_page(cs, &req->pages[i], offset, count, + zeroing); if (err) return err; @@ -1079,6 +1213,8 @@ static ssize_t fuse_dev_do_write(struct fuse_conn *fc, req->out.h = oh; req->locked = 1; cs->req = req; + if (!req->out.page_replace) + cs->move_pages = 0; spin_unlock(&fc->lock); err = copy_out_args(cs, &req->out, nbytes); @@ -1182,6 +1318,9 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, cs.nr_segs = nbuf; cs.pipe = pipe; + if (flags & SPLICE_F_MOVE) + cs.move_pages = 1; + ret = fuse_dev_do_write(fc, &cs, len); for (idx = 0; idx < nbuf; idx++) { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 9ca68edcbdbe..06e3775b2282 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -517,17 +517,26 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) int i; size_t count = req->misc.read.in.size; size_t num_read = req->out.args[0].size; - struct inode *inode = req->pages[0]->mapping->host; + struct address_space *mapping = NULL; - /* - * Short read means EOF. If file size is larger, truncate it - */ - if (!req->out.h.error && num_read < count) { - loff_t pos = page_offset(req->pages[0]) + num_read; - fuse_read_update_size(inode, pos, req->misc.read.attr_ver); - } + for (i = 0; mapping == NULL && i < req->num_pages; i++) + mapping = req->pages[i]->mapping; - fuse_invalidate_attr(inode); /* atime changed */ + if (mapping) { + struct inode *inode = mapping->host; + + /* + * Short read means EOF. If file size is larger, truncate it + */ + if (!req->out.h.error && num_read < count) { + loff_t pos; + + pos = page_offset(req->pages[0]) + num_read; + fuse_read_update_size(inode, pos, + req->misc.read.attr_ver); + } + fuse_invalidate_attr(inode); /* atime changed */ + } for (i = 0; i < req->num_pages; i++) { struct page *page = req->pages[i]; @@ -551,6 +560,7 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file) req->out.argpages = 1; req->out.page_zeroing = 1; + req->out.page_replace = 1; fuse_read_fill(req, file, pos, count, FUSE_READ); req->misc.read.attr_ver = fuse_get_attr_version(fc); if (fc->async_read) { diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 01cc462ff45d..9d0a51852d8a 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -177,6 +177,9 @@ struct fuse_out { /** Zero partially or not copied pages */ unsigned page_zeroing:1; + /** Pages may be replaced with new ones */ + unsigned page_replace:1; + /** Number or arguments */ unsigned numargs; -- cgit v1.3-8-gc7d7 From c3021629a0d820247ee12b6c5192a1d5380e21c6 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 25 May 2010 15:06:07 +0200 Subject: fuse: support splice() reading from fuse device Allow userspace filesystem implementation to use splice() to read from the fuse device. The userspace filesystem can now transfer data coming from a WRITE request to an arbitrary file descriptor (regular file, block device or socket) without having to go through a userspace buffer. The semantics of using splice() to read messages are: 1) with a single splice() call move the whole message from the fuse device to a temporary pipe 2) read the header from the pipe and determine the message type 3a) if message is a WRITE then splice data from pipe to destination 3b) else read rest of message to userspace buffer Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 228 +++++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 187 insertions(+), 41 deletions(-) (limited to 'fs/fuse/dev.c') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index b070d3adf9b0..4413f5e7b133 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -515,13 +515,12 @@ struct fuse_copy_state { }; static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc, - int write, struct fuse_req *req, + int write, const struct iovec *iov, unsigned long nr_segs) { memset(cs, 0, sizeof(*cs)); cs->fc = fc; cs->write = write; - cs->req = req; cs->iov = iov; cs->nr_segs = nr_segs; } @@ -532,8 +531,12 @@ static void fuse_copy_finish(struct fuse_copy_state *cs) if (cs->currbuf) { struct pipe_buffer *buf = cs->currbuf; - buf->ops->unmap(cs->pipe, buf, cs->mapaddr); - + if (!cs->write) { + buf->ops->unmap(cs->pipe, buf, cs->mapaddr); + } else { + kunmap_atomic(cs->mapaddr, KM_USER0); + buf->len = PAGE_SIZE - cs->len; + } cs->currbuf = NULL; cs->mapaddr = NULL; } else if (cs->mapaddr) { @@ -561,17 +564,39 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) if (cs->pipebufs) { struct pipe_buffer *buf = cs->pipebufs; - err = buf->ops->confirm(cs->pipe, buf); - if (err) - return err; + if (!cs->write) { + err = buf->ops->confirm(cs->pipe, buf); + if (err) + return err; + + BUG_ON(!cs->nr_segs); + cs->currbuf = buf; + cs->mapaddr = buf->ops->map(cs->pipe, buf, 1); + cs->len = buf->len; + cs->buf = cs->mapaddr + buf->offset; + cs->pipebufs++; + cs->nr_segs--; + } else { + struct page *page; - BUG_ON(!cs->nr_segs); - cs->currbuf = buf; - cs->mapaddr = buf->ops->map(cs->pipe, buf, 1); - cs->len = buf->len; - cs->buf = cs->mapaddr + buf->offset; - cs->pipebufs++; - cs->nr_segs--; + if (cs->nr_segs == cs->pipe->buffers) + return -EIO; + + page = alloc_page(GFP_HIGHUSER); + if (!page) + return -ENOMEM; + + buf->page = page; + buf->offset = 0; + buf->len = 0; + + cs->currbuf = buf; + cs->mapaddr = kmap_atomic(page, KM_USER0); + cs->buf = cs->mapaddr; + cs->len = PAGE_SIZE; + cs->pipebufs++; + cs->nr_segs++; + } } else { if (!cs->seglen) { BUG_ON(!cs->nr_segs); @@ -731,6 +756,30 @@ out_fallback: return 1; } +static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, + unsigned offset, unsigned count) +{ + struct pipe_buffer *buf; + + if (cs->nr_segs == cs->pipe->buffers) + return -EIO; + + unlock_request(cs->fc, cs->req); + fuse_copy_finish(cs); + + buf = cs->pipebufs; + page_cache_get(page); + buf->page = page; + buf->offset = offset; + buf->len = count; + + cs->pipebufs++; + cs->nr_segs++; + cs->len = 0; + + return 0; +} + /* * Copy a page in the request to/from the userspace buffer. Must be * done atomically @@ -747,7 +796,9 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, kunmap_atomic(mapaddr, KM_USER1); } while (count) { - if (!cs->len) { + if (cs->write && cs->pipebufs && page) { + return fuse_ref_page(cs, page, offset, count); + } else if (!cs->len) { if (cs->move_pages && page && offset == 0 && count == PAGE_SIZE) { err = fuse_try_move_page(cs, pagep); @@ -862,11 +913,10 @@ __acquires(&fc->lock) * * Called with fc->lock held, releases it */ -static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req, - const struct iovec *iov, unsigned long nr_segs) +static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs, + size_t nbytes, struct fuse_req *req) __releases(&fc->lock) { - struct fuse_copy_state cs; struct fuse_in_header ih; struct fuse_interrupt_in arg; unsigned reqsize = sizeof(ih) + sizeof(arg); @@ -882,14 +932,13 @@ __releases(&fc->lock) arg.unique = req->in.h.unique; spin_unlock(&fc->lock); - if (iov_length(iov, nr_segs) < reqsize) + if (nbytes < reqsize) return -EINVAL; - fuse_copy_init(&cs, fc, 1, NULL, iov, nr_segs); - err = fuse_copy_one(&cs, &ih, sizeof(ih)); + err = fuse_copy_one(cs, &ih, sizeof(ih)); if (!err) - err = fuse_copy_one(&cs, &arg, sizeof(arg)); - fuse_copy_finish(&cs); + err = fuse_copy_one(cs, &arg, sizeof(arg)); + fuse_copy_finish(cs); return err ? err : reqsize; } @@ -903,18 +952,13 @@ __releases(&fc->lock) * request_end(). Otherwise add it to the processing list, and set * the 'sent' flag. */ -static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) +static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file, + struct fuse_copy_state *cs, size_t nbytes) { int err; struct fuse_req *req; struct fuse_in *in; - struct fuse_copy_state cs; unsigned reqsize; - struct file *file = iocb->ki_filp; - struct fuse_conn *fc = fuse_get_conn(file); - if (!fc) - return -EPERM; restart: spin_lock(&fc->lock); @@ -934,7 +978,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, if (!list_empty(&fc->interrupts)) { req = list_entry(fc->interrupts.next, struct fuse_req, intr_entry); - return fuse_read_interrupt(fc, req, iov, nr_segs); + return fuse_read_interrupt(fc, cs, nbytes, req); } req = list_entry(fc->pending.next, struct fuse_req, list); @@ -944,7 +988,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, in = &req->in; reqsize = in->h.len; /* If request is too large, reply with an error and restart the read */ - if (iov_length(iov, nr_segs) < reqsize) { + if (nbytes < reqsize) { req->out.h.error = -EIO; /* SETXATTR is special, since it may contain too large data */ if (in->h.opcode == FUSE_SETXATTR) @@ -953,12 +997,12 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, goto restart; } spin_unlock(&fc->lock); - fuse_copy_init(&cs, fc, 1, req, iov, nr_segs); - err = fuse_copy_one(&cs, &in->h, sizeof(in->h)); + cs->req = req; + err = fuse_copy_one(cs, &in->h, sizeof(in->h)); if (!err) - err = fuse_copy_args(&cs, in->numargs, in->argpages, + err = fuse_copy_args(cs, in->numargs, in->argpages, (struct fuse_arg *) in->args, 0); - fuse_copy_finish(&cs); + fuse_copy_finish(cs); spin_lock(&fc->lock); req->locked = 0; if (req->aborted) { @@ -986,6 +1030,110 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, return err; } +static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct fuse_copy_state cs; + struct file *file = iocb->ki_filp; + struct fuse_conn *fc = fuse_get_conn(file); + if (!fc) + return -EPERM; + + fuse_copy_init(&cs, fc, 1, iov, nr_segs); + + return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs)); +} + +static int fuse_dev_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + return 1; +} + +static const struct pipe_buf_operations fuse_dev_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = generic_pipe_buf_release, + .steal = fuse_dev_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, unsigned int flags) +{ + int ret; + int page_nr = 0; + int do_wakeup = 0; + struct pipe_buffer *bufs; + struct fuse_copy_state cs; + struct fuse_conn *fc = fuse_get_conn(in); + if (!fc) + return -EPERM; + + bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL); + if (!bufs) + return -ENOMEM; + + fuse_copy_init(&cs, fc, 1, NULL, 0); + cs.pipebufs = bufs; + cs.pipe = pipe; + ret = fuse_dev_do_read(fc, in, &cs, len); + if (ret < 0) + goto out; + + ret = 0; + pipe_lock(pipe); + + if (!pipe->readers) { + send_sig(SIGPIPE, current, 0); + if (!ret) + ret = -EPIPE; + goto out_unlock; + } + + if (pipe->nrbufs + cs.nr_segs > pipe->buffers) { + ret = -EIO; + goto out_unlock; + } + + while (page_nr < cs.nr_segs) { + int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1); + struct pipe_buffer *buf = pipe->bufs + newbuf; + + buf->page = bufs[page_nr].page; + buf->offset = bufs[page_nr].offset; + buf->len = bufs[page_nr].len; + buf->ops = &fuse_dev_pipe_buf_ops; + + pipe->nrbufs++; + page_nr++; + ret += buf->len; + + if (pipe->inode) + do_wakeup = 1; + } + +out_unlock: + pipe_unlock(pipe); + + if (do_wakeup) { + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + } + +out: + for (; page_nr < cs.nr_segs; page_nr++) + page_cache_release(bufs[page_nr].page); + + kfree(bufs); + return ret; +} + static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size, struct fuse_copy_state *cs) { @@ -1246,7 +1394,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, if (!fc) return -EPERM; - fuse_copy_init(&cs, fc, 0, NULL, iov, nr_segs); + fuse_copy_init(&cs, fc, 0, iov, nr_segs); return fuse_dev_do_write(fc, &cs, iov_length(iov, nr_segs)); } @@ -1311,11 +1459,8 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, } pipe_unlock(pipe); - memset(&cs, 0, sizeof(struct fuse_copy_state)); - cs.fc = fc; - cs.write = 0; + fuse_copy_init(&cs, fc, 0, NULL, nbuf); cs.pipebufs = bufs; - cs.nr_segs = nbuf; cs.pipe = pipe; if (flags & SPLICE_F_MOVE) @@ -1473,6 +1618,7 @@ const struct file_operations fuse_dev_operations = { .llseek = no_llseek, .read = do_sync_read, .aio_read = fuse_dev_read, + .splice_read = fuse_dev_splice_read, .write = do_sync_write, .aio_write = fuse_dev_write, .splice_write = fuse_dev_splice_write, -- cgit v1.3-8-gc7d7 From 578454ff7eab61d13a26b568f99a89a2c9edc881 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Thu, 20 May 2010 18:07:20 +0200 Subject: driver core: add devname module aliases to allow module on-demand auto-loading This adds: alias: devname: to some common kernel modules, which will allow the on-demand loading of the kernel module when the device node is accessed. Ideally all these modules would be compiled-in, but distros seems too much in love with their modularization that we need to cover the common cases with this new facility. It will allow us to remove a bunch of pretty useless init scripts and modprobes from init scripts. The static device node aliases will be carried in the module itself. The program depmod will extract this information to a file in the module directory: $ cat /lib/modules/2.6.34-00650-g537b60d-dirty/modules.devname # Device nodes to trigger on-demand module loading. microcode cpu/microcode c10:184 fuse fuse c10:229 ppp_generic ppp c108:0 tun net/tun c10:200 dm_mod mapper/control c10:235 Udev will pick up the depmod created file on startup and create all the static device nodes which the kernel modules specify, so that these modules get automatically loaded when the device node is accessed: $ /sbin/udevd --debug ... static_dev_create_from_modules: mknod '/dev/cpu/microcode' c10:184 static_dev_create_from_modules: mknod '/dev/fuse' c10:229 static_dev_create_from_modules: mknod '/dev/ppp' c108:0 static_dev_create_from_modules: mknod '/dev/net/tun' c10:200 static_dev_create_from_modules: mknod '/dev/mapper/control' c10:235 udev_rules_apply_static_dev_perms: chmod '/dev/net/tun' 0666 udev_rules_apply_static_dev_perms: chmod '/dev/fuse' 0666 A few device nodes are switched to statically allocated numbers, to allow the static nodes to work. This might also useful for systems which still run a plain static /dev, which is completely unsafe to use with any dynamic minor numbers. Note: The devname aliases must be limited to the *common* and *single*instance* device nodes, like the misc devices, and never be used for conceptually limited systems like the loop devices, which should rather get fixed properly and get a control node for losetup to talk to, instead of creating a random number of device nodes in advance, regardless if they are ever used. This facility is to hide the mess distros are creating with too modualized kernels, and just to hide that these modules are not compiled-in, and not to paper-over broken concepts. Thanks! :) Cc: Greg Kroah-Hartman Cc: David S. Miller Cc: Miklos Szeredi Cc: Chris Mason Cc: Alasdair G Kergon Cc: Tigran Aivazian Cc: Ian Kent Signed-Off-By: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- Documentation/devices.txt | 2 ++ arch/x86/kernel/microcode_core.c | 1 + drivers/net/ppp_generic.c | 4 ++-- drivers/net/tun.c | 1 + fs/autofs4/dev-ioctl.c | 5 ++++- fs/btrfs/super.c | 5 ++++- fs/fuse/dev.c | 1 + include/linux/miscdevice.h | 2 ++ 8 files changed, 17 insertions(+), 4 deletions(-) (limited to 'fs/fuse/dev.c') diff --git a/Documentation/devices.txt b/Documentation/devices.txt index 53d64d382343..1d83d124056c 100644 --- a/Documentation/devices.txt +++ b/Documentation/devices.txt @@ -443,6 +443,8 @@ Your cooperation is appreciated. 231 = /dev/snapshot System memory snapshot device 232 = /dev/kvm Kernel-based virtual machine (hardware virtualization extensions) 233 = /dev/kmview View-OS A process with a view + 234 = /dev/btrfs-control Btrfs control device + 235 = /dev/autofs Autofs control device 240-254 Reserved for local use 255 Reserved for MISC_DYNAMIC_MINOR diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 2cd8c544e41a..fa6551d36c10 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -260,6 +260,7 @@ static void microcode_dev_exit(void) } MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); +MODULE_ALIAS("devname:cpu/microcode"); #else #define microcode_dev_init() 0 #define microcode_dev_exit() do { } while (0) diff --git a/drivers/net/ppp_generic.c b/drivers/net/ppp_generic.c index 5441688daba7..c5f8eb102bf7 100644 --- a/drivers/net/ppp_generic.c +++ b/drivers/net/ppp_generic.c @@ -2926,5 +2926,5 @@ EXPORT_SYMBOL(ppp_output_wakeup); EXPORT_SYMBOL(ppp_register_compressor); EXPORT_SYMBOL(ppp_unregister_compressor); MODULE_LICENSE("GPL"); -MODULE_ALIAS_CHARDEV_MAJOR(PPP_MAJOR); -MODULE_ALIAS("/dev/ppp"); +MODULE_ALIAS_CHARDEV(PPP_MAJOR, 0); +MODULE_ALIAS("devname:ppp"); diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 97b25533e5fb..005cad689578 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1649,3 +1649,4 @@ MODULE_DESCRIPTION(DRV_DESCRIPTION); MODULE_AUTHOR(DRV_COPYRIGHT); MODULE_LICENSE("GPL"); MODULE_ALIAS_MISCDEV(TUN_MINOR); +MODULE_ALIAS("devname:net/tun"); diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index d29b7f6df862..d832062869f6 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -736,11 +736,14 @@ static const struct file_operations _dev_ioctl_fops = { }; static struct miscdevice _autofs_dev_ioctl_misc = { - .minor = MISC_DYNAMIC_MINOR, + .minor = AUTOFS_MINOR, .name = AUTOFS_DEVICE_NAME, .fops = &_dev_ioctl_fops }; +MODULE_ALIAS_MISCDEV(AUTOFS_MINOR); +MODULE_ALIAS("devname:autofs"); + /* Register/deregister misc character device */ int autofs_dev_ioctl_init(void) { diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 1866dff0538e..2909a03e5230 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -832,11 +832,14 @@ static const struct file_operations btrfs_ctl_fops = { }; static struct miscdevice btrfs_misc = { - .minor = MISC_DYNAMIC_MINOR, + .minor = BTRFS_MINOR, .name = "btrfs-control", .fops = &btrfs_ctl_fops }; +MODULE_ALIAS_MISCDEV(BTRFS_MINOR); +MODULE_ALIAS("devname:btrfs-control"); + static int btrfs_interface_init(void) { return misc_register(&btrfs_misc); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index eb7e9423691f..e53df5ebb2b8 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -18,6 +18,7 @@ #include MODULE_ALIAS_MISCDEV(FUSE_MINOR); +MODULE_ALIAS("devname:fuse"); static struct kmem_cache *fuse_req_cachep; diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h index 8b5f7cc0fba6..b631c46cffd9 100644 --- a/include/linux/miscdevice.h +++ b/include/linux/miscdevice.h @@ -31,6 +31,8 @@ #define FUSE_MINOR 229 #define KVM_MINOR 232 #define VHOST_NET_MINOR 233 +#define BTRFS_MINOR 234 +#define AUTOFS_MINOR 235 #define MISC_DYNAMIC_MINOR 255 struct device; -- cgit v1.3-8-gc7d7