From 627ad9fd0733f0a31a266ff98a4a933eee710f0b Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 20 Jun 2009 23:21:41 -0400 Subject: ext4: Fix type warning on 64-bit platforms in tracing events header Signed-off-by: "Theodore Ts'o" --- include/trace/events/ext4.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index acf4cc9cd36d..b456fb0a3c57 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -34,7 +34,8 @@ TRACE_EVENT(ext4_free_inode, TP_printk("dev %s ino %lu mode %d uid %u gid %u blocks %llu", jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->mode, - __entry->uid, __entry->gid, __entry->blocks) + __entry->uid, __entry->gid, + (unsigned long long) __entry->blocks) ); TRACE_EVENT(ext4_request_inode, -- cgit v1.2.3-59-g8ed1b From 43ce1d23b43330634507a049b55c36e91d27282e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sun, 14 Jun 2009 17:58:45 -0400 Subject: ext4: Fix mmap/truncate race when blocksize < pagesize && !nodellaoc This patch fixes the mmap/truncate race that was fixed for delayed allocation by merging ext4_{journalled,normal,da}_writepage() into ext4_writepage(). Signed-off-by: Aneesh Kumar K.V Acked-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 234 +++++++++++--------------------------------- include/trace/events/ext4.h | 45 +-------- 2 files changed, 58 insertions(+), 221 deletions(-) (limited to 'include') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1275f34589c7..97c48b5b0578 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -47,6 +47,10 @@ #define MPAGE_DA_EXTENT_TAIL 0x01 +static int __ext4_journalled_writepage(struct page *page, + struct writeback_control *wbc, + unsigned int len); + static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { @@ -2392,7 +2396,7 @@ static int __mpage_da_writepage(struct page *page, * We need to try to allocate * unmapped blocks in the same page. * Otherwise we won't make progress - * with the page in ext4_da_writepage + * with the page in ext4_writepage */ if (ext4_bh_delay_or_unwritten(NULL, bh)) { mpage_add_bh_to_extent(mpd, logical, @@ -2519,13 +2523,47 @@ static int noalloc_get_block_write(struct inode *inode, sector_t iblock, } /* + * Note that we don't need to start a transaction unless we're journaling data + * because we should have holes filled from ext4_page_mkwrite(). We even don't + * need to file the inode to the transaction's list in ordered mode because if + * we are writing back data added by write(), the inode is already there and if + * we are writing back data modified via mmap(), noone guarantees in which + * transaction the data will hit the disk. In case we are journaling data, we + * cannot start transaction directly because transaction start ranks above page + * lock so we have to do some magic. + * * This function can get called via... * - ext4_da_writepages after taking page lock (have journal handle) * - journal_submit_inode_data_buffers (no journal handle) * - shrink_page_list via pdflush (no journal handle) * - grab_page_cache when doing write_begin (have journal handle) + * + * We don't do any block allocation in this function. If we have page with + * multiple blocks we need to write those buffer_heads that are mapped. This + * is important for mmaped based write. So if we do with blocksize 1K + * truncate(f, 1024); + * a = mmap(f, 0, 4096); + * a[0] = 'a'; + * truncate(f, 4096); + * we have in the page first buffer_head mapped via page_mkwrite call back + * but other bufer_heads would be unmapped but dirty(dirty done via the + * do_wp_page). So writepage should write the first block. If we modify + * the mmap area beyond 1024 we will again get a page_fault and the + * page_mkwrite callback will do the block allocation and mark the + * buffer_heads mapped. + * + * We redirty the page if we have any buffer_heads that is either delay or + * unwritten in the page. + * + * We can get recursively called as show below. + * + * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> + * ext4_writepage() + * + * But since we don't do any block allocation we should not deadlock. + * Page also have the dirty flag cleared so we don't get recurive page_lock. */ -static int ext4_da_writepage(struct page *page, +static int ext4_writepage(struct page *page, struct writeback_control *wbc) { int ret = 0; @@ -2534,7 +2572,7 @@ static int ext4_da_writepage(struct page *page, struct buffer_head *page_bufs; struct inode *inode = page->mapping->host; - trace_ext4_da_writepage(inode, page); + trace_ext4_writepage(inode, page); size = i_size_read(inode); if (page->index == size >> PAGE_CACHE_SHIFT) len = size & ~PAGE_CACHE_MASK; @@ -2596,6 +2634,15 @@ static int ext4_da_writepage(struct page *page, block_commit_write(page, 0, len); } + if (PageChecked(page) && ext4_should_journal_data(inode)) { + /* + * It's mmapped pagecache. Add buffers and journal it. There + * doesn't seem much point in redirtying the page here. + */ + ClearPageChecked(page); + return __ext4_journalled_writepage(page, wbc, len); + } + if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) ret = nobh_writepage(page, noalloc_get_block_write, wbc); else @@ -3135,112 +3182,10 @@ static int bput_one(handle_t *handle, struct buffer_head *bh) return 0; } -/* - * Note that we don't need to start a transaction unless we're journaling data - * because we should have holes filled from ext4_page_mkwrite(). We even don't - * need to file the inode to the transaction's list in ordered mode because if - * we are writing back data added by write(), the inode is already there and if - * we are writing back data modified via mmap(), noone guarantees in which - * transaction the data will hit the disk. In case we are journaling data, we - * cannot start transaction directly because transaction start ranks above page - * lock so we have to do some magic. - * - * In all journaling modes block_write_full_page() will start the I/O. - * - * Problem: - * - * ext4_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() -> - * ext4_writepage() - * - * Similar for: - * - * ext4_file_write() -> generic_file_write() -> __alloc_pages() -> ... - * - * Same applies to ext4_get_block(). We will deadlock on various things like - * lock_journal and i_data_sem - * - * Setting PF_MEMALLOC here doesn't work - too many internal memory - * allocations fail. - * - * 16May01: If we're reentered then journal_current_handle() will be - * non-zero. We simply *return*. - * - * 1 July 2001: @@@ FIXME: - * In journalled data mode, a data buffer may be metadata against the - * current transaction. But the same file is part of a shared mapping - * and someone does a writepage() on it. - * - * We will move the buffer onto the async_data list, but *after* it has - * been dirtied. So there's a small window where we have dirty data on - * BJ_Metadata. - * - * Note that this only applies to the last partial page in the file. The - * bit which block_write_full_page() uses prepare/commit for. (That's - * broken code anyway: it's wrong for msync()). - * - * It's a rare case: affects the final partial page, for journalled data - * where the file is subject to bith write() and writepage() in the same - * transction. To fix it we'll need a custom block_write_full_page(). - * We'll probably need that anyway for journalling writepage() output. - * - * We don't honour synchronous mounts for writepage(). That would be - * disastrous. Any write() or metadata operation will sync the fs for - * us. - * - */ -static int __ext4_normal_writepage(struct page *page, - struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - - if (test_opt(inode->i_sb, NOBH)) - return nobh_writepage(page, noalloc_get_block_write, wbc); - else - return block_write_full_page(page, noalloc_get_block_write, - wbc); -} - -static int ext4_normal_writepage(struct page *page, - struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - loff_t size = i_size_read(inode); - loff_t len; - - trace_ext4_normal_writepage(inode, page); - J_ASSERT(PageLocked(page)); - if (page->index == size >> PAGE_CACHE_SHIFT) - len = size & ~PAGE_CACHE_MASK; - else - len = PAGE_CACHE_SIZE; - - if (page_has_buffers(page)) { - /* if page has buffers it should all be mapped - * and allocated. If there are not buffers attached - * to the page we know the page is dirty but it lost - * buffers. That means that at some moment in time - * after write_begin() / write_end() has been called - * all buffers have been clean and thus they must have been - * written at least once. So they are all mapped and we can - * happily proceed with mapping them and writing the page. - */ - BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, - ext4_bh_delay_or_unwritten)); - } - - if (!ext4_journal_current_handle()) - return __ext4_normal_writepage(page, wbc); - - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; -} - static int __ext4_journalled_writepage(struct page *page, - struct writeback_control *wbc) + struct writeback_control *wbc, + unsigned int len) { - loff_t size; - unsigned int len; struct address_space *mapping = page->mapping; struct inode *inode = mapping->host; struct buffer_head *page_bufs; @@ -3248,16 +3193,8 @@ static int __ext4_journalled_writepage(struct page *page, int ret = 0; int err; - size = i_size_read(inode); - if (page->index == size >> PAGE_CACHE_SHIFT) - len = size & ~PAGE_CACHE_MASK; - else - len = PAGE_CACHE_SIZE; - ret = block_prepare_write(page, 0, len, noalloc_get_block_write); - if (ret != 0) - goto out_unlock; - page_bufs = page_buffers(page); + BUG_ON(!page_bufs); walk_page_buffers(handle, page_bufs, 0, len, NULL, bget_one); /* As soon as we unlock the page, it can go away, but we have * references to buffers so we are safe */ @@ -3282,67 +3219,10 @@ static int __ext4_journalled_writepage(struct page *page, walk_page_buffers(handle, page_bufs, 0, len, NULL, bput_one); EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; - goto out; - -out_unlock: - unlock_page(page); out: return ret; } -static int ext4_journalled_writepage(struct page *page, - struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - loff_t size = i_size_read(inode); - loff_t len; - - trace_ext4_journalled_writepage(inode, page); - J_ASSERT(PageLocked(page)); - if (page->index == size >> PAGE_CACHE_SHIFT) - len = size & ~PAGE_CACHE_MASK; - else - len = PAGE_CACHE_SIZE; - - if (page_has_buffers(page)) { - /* if page has buffers it should all be mapped - * and allocated. If there are not buffers attached - * to the page we know the page is dirty but it lost - * buffers. That means that at some moment in time - * after write_begin() / write_end() has been called - * all buffers have been clean and thus they must have been - * written at least once. So they are all mapped and we can - * happily proceed with mapping them and writing the page. - */ - BUG_ON(walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, - ext4_bh_delay_or_unwritten)); - } - - if (ext4_journal_current_handle()) - goto no_write; - - if (PageChecked(page)) { - /* - * It's mmapped pagecache. Add buffers and journal it. There - * doesn't seem much point in redirtying the page here. - */ - ClearPageChecked(page); - return __ext4_journalled_writepage(page, wbc); - } else { - /* - * It may be a page full of checkpoint-mode buffers. We don't - * really know unless we go poke around in the buffer_heads. - * But block_write_full_page will do the right thing. - */ - return block_write_full_page(page, noalloc_get_block_write, - wbc); - } -no_write: - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; -} - static int ext4_readpage(struct file *file, struct page *page) { return mpage_readpage(page, ext4_get_block); @@ -3489,7 +3369,7 @@ static int ext4_journalled_set_page_dirty(struct page *page) static const struct address_space_operations ext4_ordered_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, - .writepage = ext4_normal_writepage, + .writepage = ext4_writepage, .sync_page = block_sync_page, .write_begin = ext4_write_begin, .write_end = ext4_ordered_write_end, @@ -3504,7 +3384,7 @@ static const struct address_space_operations ext4_ordered_aops = { static const struct address_space_operations ext4_writeback_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, - .writepage = ext4_normal_writepage, + .writepage = ext4_writepage, .sync_page = block_sync_page, .write_begin = ext4_write_begin, .write_end = ext4_writeback_write_end, @@ -3519,7 +3399,7 @@ static const struct address_space_operations ext4_writeback_aops = { static const struct address_space_operations ext4_journalled_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, - .writepage = ext4_journalled_writepage, + .writepage = ext4_writepage, .sync_page = block_sync_page, .write_begin = ext4_write_begin, .write_end = ext4_journalled_write_end, @@ -3533,7 +3413,7 @@ static const struct address_space_operations ext4_journalled_aops = { static const struct address_space_operations ext4_da_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, - .writepage = ext4_da_writepage, + .writepage = ext4_writepage, .writepages = ext4_da_writepages, .sync_page = block_sync_page, .write_begin = ext4_da_write_begin, diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index b456fb0a3c57..dfbc9b0edc88 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -190,7 +190,7 @@ TRACE_EVENT(ext4_journalled_write_end, __entry->copied) ); -TRACE_EVENT(ext4_da_writepage, +TRACE_EVENT(ext4_writepage, TP_PROTO(struct inode *inode, struct page *page), TP_ARGS(inode, page), @@ -342,49 +342,6 @@ TRACE_EVENT(ext4_da_write_end, __entry->copied) ); -TRACE_EVENT(ext4_normal_writepage, - TP_PROTO(struct inode *inode, struct page *page), - - TP_ARGS(inode, page), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) - __field( pgoff_t, index ) - ), - - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->index = page->index; - ), - - TP_printk("dev %s ino %lu page_index %lu", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->index) -); - -TRACE_EVENT(ext4_journalled_writepage, - TP_PROTO(struct inode *inode, struct page *page), - - TP_ARGS(inode, page), - - TP_STRUCT__entry( - __field( dev_t, dev ) - __field( ino_t, ino ) - __field( pgoff_t, index ) - - ), - - TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; - __entry->ino = inode->i_ino; - __entry->index = page->index; - ), - - TP_printk("dev %s ino %lu page_index %lu", - jbd2_dev_to_name(__entry->dev), __entry->ino, __entry->index) -); - TRACE_EVENT(ext4_discard_blocks, TP_PROTO(struct super_block *sb, unsigned long long blk, unsigned long long count), -- cgit v1.2.3-59-g8ed1b From 6fdc03709433ccc2005f0f593ae9d9dd04f7b485 Mon Sep 17 00:00:00 2001 From: Stefan Richter Date: Sat, 20 Jun 2009 13:23:59 +0200 Subject: firewire: core: do not DMA-map stack addresses The DMA mapping API cannot map on-stack addresses, as explained in Documentation/DMA-mapping.txt. Convert the two cases of on-stack packet payload buffers in firewire-core (payload of lock requests in the bus manager work and in iso resource management) to slab-allocated memory. There are a number on-stack buffers for quadlet write or quadlet read requests in firewire-core and firewire-sbp2. These are harmless; they are copied to/ from card driver internal DMA buffers since quadlet payloads are inlined with packet headers. Signed-off-by: Stefan Richter --- drivers/firewire/core-card.c | 14 +++++++------- drivers/firewire/core-cdev.c | 4 +++- drivers/firewire/core-iso.c | 24 +++++++++++++----------- drivers/firewire/core.h | 3 ++- include/linux/firewire.h | 1 + 5 files changed, 26 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/drivers/firewire/core-card.c b/drivers/firewire/core-card.c index 543fccac81bb..f74edae5cb4c 100644 --- a/drivers/firewire/core-card.c +++ b/drivers/firewire/core-card.c @@ -196,8 +196,8 @@ static void allocate_broadcast_channel(struct fw_card *card, int generation) { int channel, bandwidth = 0; - fw_iso_resource_manage(card, generation, 1ULL << 31, - &channel, &bandwidth, true); + fw_iso_resource_manage(card, generation, 1ULL << 31, &channel, + &bandwidth, true, card->bm_transaction_data); if (channel == 31) { card->broadcast_channel_allocated = true; device_for_each_child(card->device, (void *)(long)generation, @@ -230,7 +230,6 @@ static void fw_card_bm_work(struct work_struct *work) bool do_reset = false; bool root_device_is_running; bool root_device_is_cmc; - __be32 lock_data[2]; spin_lock_irqsave(&card->lock, flags); @@ -273,22 +272,23 @@ static void fw_card_bm_work(struct work_struct *work) goto pick_me; } - lock_data[0] = cpu_to_be32(0x3f); - lock_data[1] = cpu_to_be32(local_id); + card->bm_transaction_data[0] = cpu_to_be32(0x3f); + card->bm_transaction_data[1] = cpu_to_be32(local_id); spin_unlock_irqrestore(&card->lock, flags); rcode = fw_run_transaction(card, TCODE_LOCK_COMPARE_SWAP, irm_id, generation, SCODE_100, CSR_REGISTER_BASE + CSR_BUS_MANAGER_ID, - lock_data, sizeof(lock_data)); + card->bm_transaction_data, + sizeof(card->bm_transaction_data)); if (rcode == RCODE_GENERATION) /* Another bus reset, BM work has been rescheduled. */ goto out; if (rcode == RCODE_COMPLETE && - lock_data[0] != cpu_to_be32(0x3f)) { + card->bm_transaction_data[0] != cpu_to_be32(0x3f)) { /* Somebody else is BM. Only act as IRM. */ if (local_id == irm_id) diff --git a/drivers/firewire/core-cdev.c b/drivers/firewire/core-cdev.c index d1d30c615b0f..ced186d7e9a9 100644 --- a/drivers/firewire/core-cdev.c +++ b/drivers/firewire/core-cdev.c @@ -125,6 +125,7 @@ struct iso_resource { int generation; u64 channels; s32 bandwidth; + __be32 transaction_data[2]; struct iso_resource_event *e_alloc, *e_dealloc; }; @@ -1049,7 +1050,8 @@ static void iso_resource_work(struct work_struct *work) r->channels, &channel, &bandwidth, todo == ISO_RES_ALLOC || todo == ISO_RES_REALLOC || - todo == ISO_RES_ALLOC_ONCE); + todo == ISO_RES_ALLOC_ONCE, + r->transaction_data); /* * Is this generation outdated already? As long as this resource sticks * in the idr, it will be scheduled again for a newer generation or at diff --git a/drivers/firewire/core-iso.c b/drivers/firewire/core-iso.c index 166f19c6d38d..110e731f5574 100644 --- a/drivers/firewire/core-iso.c +++ b/drivers/firewire/core-iso.c @@ -177,9 +177,8 @@ EXPORT_SYMBOL(fw_iso_context_stop); */ static int manage_bandwidth(struct fw_card *card, int irm_id, int generation, - int bandwidth, bool allocate) + int bandwidth, bool allocate, __be32 data[2]) { - __be32 data[2]; int try, new, old = allocate ? BANDWIDTH_AVAILABLE_INITIAL : 0; /* @@ -215,9 +214,9 @@ static int manage_bandwidth(struct fw_card *card, int irm_id, int generation, } static int manage_channel(struct fw_card *card, int irm_id, int generation, - u32 channels_mask, u64 offset, bool allocate) + u32 channels_mask, u64 offset, bool allocate, __be32 data[2]) { - __be32 data[2], c, all, old; + __be32 c, all, old; int i, retry = 5; old = all = allocate ? cpu_to_be32(~0) : 0; @@ -260,7 +259,7 @@ static int manage_channel(struct fw_card *card, int irm_id, int generation, } static void deallocate_channel(struct fw_card *card, int irm_id, - int generation, int channel) + int generation, int channel, __be32 buffer[2]) { u32 mask; u64 offset; @@ -269,7 +268,7 @@ static void deallocate_channel(struct fw_card *card, int irm_id, offset = channel < 32 ? CSR_REGISTER_BASE + CSR_CHANNELS_AVAILABLE_HI : CSR_REGISTER_BASE + CSR_CHANNELS_AVAILABLE_LO; - manage_channel(card, irm_id, generation, mask, offset, false); + manage_channel(card, irm_id, generation, mask, offset, false, buffer); } /** @@ -298,7 +297,7 @@ static void deallocate_channel(struct fw_card *card, int irm_id, */ void fw_iso_resource_manage(struct fw_card *card, int generation, u64 channels_mask, int *channel, int *bandwidth, - bool allocate) + bool allocate, __be32 buffer[2]) { u32 channels_hi = channels_mask; /* channels 31...0 */ u32 channels_lo = channels_mask >> 32; /* channels 63...32 */ @@ -310,10 +309,12 @@ void fw_iso_resource_manage(struct fw_card *card, int generation, if (channels_hi) c = manage_channel(card, irm_id, generation, channels_hi, - CSR_REGISTER_BASE + CSR_CHANNELS_AVAILABLE_HI, allocate); + CSR_REGISTER_BASE + CSR_CHANNELS_AVAILABLE_HI, + allocate, buffer); if (channels_lo && c < 0) { c = manage_channel(card, irm_id, generation, channels_lo, - CSR_REGISTER_BASE + CSR_CHANNELS_AVAILABLE_LO, allocate); + CSR_REGISTER_BASE + CSR_CHANNELS_AVAILABLE_LO, + allocate, buffer); if (c >= 0) c += 32; } @@ -325,12 +326,13 @@ void fw_iso_resource_manage(struct fw_card *card, int generation, if (*bandwidth == 0) return; - ret = manage_bandwidth(card, irm_id, generation, *bandwidth, allocate); + ret = manage_bandwidth(card, irm_id, generation, *bandwidth, + allocate, buffer); if (ret < 0) *bandwidth = 0; if (allocate && ret < 0 && c >= 0) { - deallocate_channel(card, irm_id, generation, c); + deallocate_channel(card, irm_id, generation, c, buffer); *channel = ret; } } diff --git a/drivers/firewire/core.h b/drivers/firewire/core.h index c3cfc647e5e3..6052816be353 100644 --- a/drivers/firewire/core.h +++ b/drivers/firewire/core.h @@ -120,7 +120,8 @@ void fw_node_event(struct fw_card *card, struct fw_node *node, int event); int fw_iso_buffer_map(struct fw_iso_buffer *buffer, struct vm_area_struct *vma); void fw_iso_resource_manage(struct fw_card *card, int generation, - u64 channels_mask, int *channel, int *bandwidth, bool allocate); + u64 channels_mask, int *channel, int *bandwidth, + bool allocate, __be32 buffer[2]); /* -topology */ diff --git a/include/linux/firewire.h b/include/linux/firewire.h index 9823946adbc5..192d1e43c43c 100644 --- a/include/linux/firewire.h +++ b/include/linux/firewire.h @@ -127,6 +127,7 @@ struct fw_card { struct delayed_work work; int bm_retries; int bm_generation; + __be32 bm_transaction_data[2]; bool broadcast_channel_allocated; u32 broadcast_channel; -- cgit v1.2.3-59-g8ed1b From 73f1d9391a6aa72efdcea2f302ee7bfcd313c631 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Wed, 24 Jun 2009 01:04:36 +0900 Subject: asm-generic/vmlinux.lds.h: Fix up RW_DATA_SECTION definition. RW_DATA_SECTION is defined to take 4 different alignment parameters, while NOSAVE_DATA currently uses a fixed PAGE_SIZE alignment as noted in the comments. There are presently no in-tree users of this at present, and I just stumbled across this while implementing the simplified script on a new architecture port, which subsequently resulted in a syntax error. Signed-off-by: Paul Mundt Signed-off-by: Sam Ravnborg --- include/asm-generic/vmlinux.lds.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 92b73b6140ff..f92e730695c8 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -704,7 +704,7 @@ * matches the requirment of PAGE_ALIGNED_DATA. * * use 0 as page_align if page_aligned data is not used */ -#define RW_DATA_SECTION(cacheline, nosave, pagealigned, inittask) \ +#define RW_DATA_SECTION(cacheline, pagealigned, inittask) \ . = ALIGN(PAGE_SIZE); \ .data : AT(ADDR(.data) - LOAD_OFFSET) { \ INIT_TASK(inittask) \ @@ -712,7 +712,7 @@ READ_MOSTLY_DATA(cacheline) \ DATA_DATA \ CONSTRUCTORS \ - NOSAVE_DATA(nosave) \ + NOSAVE_DATA \ PAGE_ALIGNED_DATA(pagealigned) \ } -- cgit v1.2.3-59-g8ed1b From d2af12aeadaedf657c9fb9c3df984d2c5ab25f4c Mon Sep 17 00:00:00 2001 From: Tim Abbott Date: Tue, 23 Jun 2009 19:59:35 -0400 Subject: Add new macros for page-aligned data and bss sections. This patch is preparation for replacing most uses of ".bss.page_aligned" and ".data.page_aligned" in the kernel with macros, so that the section name can later be changed without having to touch a lot of the kernel. The long-term goal here is to be able to change the kernel's magic section names to those that are compatible with -ffunction-sections -fdata-sections. This requires renaming all magic sections with names of the form ".data.foo". Signed-off-by: Tim Abbott Acked-by: David Howells Signed-off-by: Sam Ravnborg --- include/linux/linkage.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include') diff --git a/include/linux/linkage.h b/include/linux/linkage.h index fee9e59649c1..691f59171c6c 100644 --- a/include/linux/linkage.h +++ b/include/linux/linkage.h @@ -21,6 +21,15 @@ #define __page_aligned_data __section(.data.page_aligned) __aligned(PAGE_SIZE) #define __page_aligned_bss __section(.bss.page_aligned) __aligned(PAGE_SIZE) +/* + * For assembly routines. + * + * Note when using these that you must specify the appropriate + * alignment directives yourself + */ +#define __PAGE_ALIGNED_DATA .section ".data.page_aligned", "aw" +#define __PAGE_ALIGNED_BSS .section ".bss.page_aligned", "aw" + /* * This is used by architectures to keep arguments on the stack * untouched by the compiler by keeping them live until the end. -- cgit v1.2.3-59-g8ed1b From 39a449d96ac3db9b6d498b6ffbf4c763746d5e8b Mon Sep 17 00:00:00 2001 From: Tim Abbott Date: Tue, 23 Jun 2009 18:53:15 -0400 Subject: asm-generic/vmlinux.lds.h: shuffle INIT_TASK* macro names in vmlinux.lds.h We recently added a INIT_TASK(align) in include/asm-generic/vmlinux.lds.h, but there is already a macro INIT_TASK in include/linux/init_task.h, which is quite confusing. We should switch the macro in the linker script to INIT_TASK_DATA. (Sorry that I missed this in reviewing the patch). Since the macros are new, there is only one user of the INIT_TASK in vmlinux.lds.h, arch/mn10300/kernel/vmlinux.lds.S. However, we are currently using INIT_TASK_DATA for laying down an entire .data.init_task section. So rename that to INIT_TASK_DATA_SECTION. I would be worried about changing the meaning of INIT_TASK_DATA, but the old INIT_TASK_DATA implementation had no users, and in fact if anyone had tried to use it, it would have failed to compile because it didn't pass the alignment to the old INIT_TASK. Signed-off-by: Tim Abbott Cc: David Howells Cc: Jesper Nilsson --- arch/mn10300/kernel/vmlinux.lds.S | 2 +- include/asm-generic/vmlinux.lds.h | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/arch/mn10300/kernel/vmlinux.lds.S b/arch/mn10300/kernel/vmlinux.lds.S index bcebcefb4ad7..c96ba3da95ac 100644 --- a/arch/mn10300/kernel/vmlinux.lds.S +++ b/arch/mn10300/kernel/vmlinux.lds.S @@ -61,7 +61,7 @@ SECTIONS _edata = .; /* End of data section */ } - .data.init_task : { INIT_TASK(THREAD_SIZE); } + .data.init_task : { INIT_TASK_DATA(THREAD_SIZE); } /* might get freed after init */ . = ALIGN(PAGE_SIZE); diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index f92e730695c8..720af4c72206 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -191,7 +191,7 @@ . = ALIGN(align); \ *(.data.cacheline_aligned) -#define INIT_TASK(align) \ +#define INIT_TASK_DATA(align) \ . = ALIGN(align); \ *(.data.init_task) @@ -434,10 +434,10 @@ /* * Init task */ -#define INIT_TASK_DATA(align) \ +#define INIT_TASK_DATA_SECTION(align) \ . = ALIGN(align); \ .data.init_task : { \ - INIT_TASK \ + INIT_TASK_DATA(align) \ } #ifdef CONFIG_CONSTRUCTORS @@ -707,7 +707,7 @@ #define RW_DATA_SECTION(cacheline, pagealigned, inittask) \ . = ALIGN(PAGE_SIZE); \ .data : AT(ADDR(.data) - LOAD_OFFSET) { \ - INIT_TASK(inittask) \ + INIT_TASK_DATA(inittask) \ CACHELINE_ALIGNED_DATA(cacheline) \ READ_MOSTLY_DATA(cacheline) \ DATA_DATA \ -- cgit v1.2.3-59-g8ed1b From 857eceebd2803c9a3459f784acf45e5266921e4d Mon Sep 17 00:00:00 2001 From: Tim Abbott Date: Tue, 23 Jun 2009 19:59:36 -0400 Subject: Add new __init_task_data macro to be used in arch init_task.c files. This patch is preparation for replacing most ".data.init_task" in the kernel with macros, so that the section name can later be changed without having to touch a lot of the kernel. The long-term goal here is to be able to change the kernel's magic section names to those that are compatible with -ffunction-sections -fdata-sections. This requires renaming all magic sections with names of the form ".data.foo". Signed-off-by: Tim Abbott Signed-off-by: Sam Ravnborg --- include/linux/init_task.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 5368fbdc7801..7fc01b13be43 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -183,5 +183,8 @@ extern struct cred init_cred; LIST_HEAD_INIT(cpu_timers[2]), \ } +/* Attach to the init_task data structure for proper alignment */ +#define __init_task_data __attribute__((__section__(".data.init_task"))) + #endif -- cgit v1.2.3-59-g8ed1b From bab7614d6d1b1fc96ec6c5a7ca34c8641060e659 Mon Sep 17 00:00:00 2001 From: Eric Miao Date: Mon, 29 Jun 2009 00:20:52 -0700 Subject: Input: add support for generic GPIO-based matrix keypad Original patch by Marek Vasut, modified by Eric in: 1. use delayed work to simplify the debouncing 2. combine col_polarity/row_polarity into a single active_low field 3. use a generic bit array based XOR algorithm to detect key press/release, which should make the column assertion time shorter and code a bit cleaner 4. remove the ALT_FN handling, which is no way generic, the ALT_FN key should be treated as no different from other keys, and translation will be done by user space by commands like 'loadkeys'. 5. explicitly disable row IRQs and flush potential pending work, and schedule an immediate scan after resuming as suggested by Uli Luckas 6. incorporate review comments from many others Patch tested on Littleton/PXA310 (though PXA310 has a dedicate keypad controller, I have to configure those pins as generic GPIO to use this driver, works quite well, though), and Sharp Zaurus model SL-C7x0 and SL-C1000. [dtor@mail.ru: fix error unwinding path, support changing keymap from userspace] Signed-off-by: Marek Vasut Reviewed-by: Trilok Soni Reviewed-by: Uli Luckas Reviewed-by: Russell King Reviewed-by: Robert Jarzmik Signed-off-by: Eric Miao Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/Kconfig | 13 +- drivers/input/keyboard/Makefile | 1 + drivers/input/keyboard/matrix_keypad.c | 453 +++++++++++++++++++++++++++++++++ include/linux/input/matrix_keypad.h | 65 +++++ 4 files changed, 530 insertions(+), 2 deletions(-) create mode 100644 drivers/input/keyboard/matrix_keypad.c create mode 100644 include/linux/input/matrix_keypad.h (limited to 'include') diff --git a/drivers/input/keyboard/Kconfig b/drivers/input/keyboard/Kconfig index d2df1030675a..a6b989a9dc07 100644 --- a/drivers/input/keyboard/Kconfig +++ b/drivers/input/keyboard/Kconfig @@ -158,7 +158,16 @@ config KEYBOARD_GPIO with configuration data saying which GPIOs are used. To compile this driver as a module, choose M here: the - module will be called gpio-keys. + module will be called gpio_keys. + +config KEYBOARD_MATRIX + tristate "GPIO driven matrix keypad support" + depends on GENERIC_GPIO + help + Enable support for GPIO driven matrix keypad. + + To compile this driver as a module, choose M here: the + module will be called matrix_keypad. config KEYBOARD_HIL_OLD tristate "HP HIL keyboard support (simple driver)" @@ -254,7 +263,7 @@ config KEYBOARD_PXA27x tristate "PXA27x/PXA3xx keypad support" depends on PXA27x || PXA3xx help - Enable support for PXA27x/PXA3xx keypad controller + Enable support for PXA27x/PXA3xx keypad controller. To compile this driver as a module, choose M here: the module will be called pxa27x_keypad. diff --git a/drivers/input/keyboard/Makefile b/drivers/input/keyboard/Makefile index 632efbc18c44..b5b5eae9724f 100644 --- a/drivers/input/keyboard/Makefile +++ b/drivers/input/keyboard/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_KEYBOARD_LKKBD) += lkkbd.o obj-$(CONFIG_KEYBOARD_LM8323) += lm8323.o obj-$(CONFIG_KEYBOARD_LOCOMO) += locomokbd.o obj-$(CONFIG_KEYBOARD_MAPLE) += maple_keyb.o +obj-$(CONFIG_KEYBOARD_MATRIX) += matrix_keypad.o obj-$(CONFIG_KEYBOARD_NEWTON) += newtonkbd.o obj-$(CONFIG_KEYBOARD_OMAP) += omap-keypad.o obj-$(CONFIG_KEYBOARD_PXA27x) += pxa27x_keypad.o diff --git a/drivers/input/keyboard/matrix_keypad.c b/drivers/input/keyboard/matrix_keypad.c new file mode 100644 index 000000000000..e9b2e7cb05be --- /dev/null +++ b/drivers/input/keyboard/matrix_keypad.c @@ -0,0 +1,453 @@ +/* + * GPIO driven matrix keyboard driver + * + * Copyright (c) 2008 Marek Vasut + * + * Based on corgikbd.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct matrix_keypad { + const struct matrix_keypad_platform_data *pdata; + struct input_dev *input_dev; + unsigned short *keycodes; + + uint32_t last_key_state[MATRIX_MAX_COLS]; + struct delayed_work work; + bool scan_pending; + bool stopped; + spinlock_t lock; +}; + +/* + * NOTE: normally the GPIO has to be put into HiZ when de-activated to cause + * minmal side effect when scanning other columns, here it is configured to + * be input, and it should work on most platforms. + */ +static void __activate_col(const struct matrix_keypad_platform_data *pdata, + int col, bool on) +{ + bool level_on = !pdata->active_low; + + if (on) { + gpio_direction_output(pdata->col_gpios[col], level_on); + } else { + gpio_set_value_cansleep(pdata->col_gpios[col], !level_on); + gpio_direction_input(pdata->col_gpios[col]); + } +} + +static void activate_col(const struct matrix_keypad_platform_data *pdata, + int col, bool on) +{ + __activate_col(pdata, col, on); + + if (on && pdata->col_scan_delay_us) + udelay(pdata->col_scan_delay_us); +} + +static void activate_all_cols(const struct matrix_keypad_platform_data *pdata, + bool on) +{ + int col; + + for (col = 0; col < pdata->num_col_gpios; col++) + __activate_col(pdata, col, on); +} + +static bool row_asserted(const struct matrix_keypad_platform_data *pdata, + int row) +{ + return gpio_get_value_cansleep(pdata->row_gpios[row]) ? + !pdata->active_low : pdata->active_low; +} + +static void enable_row_irqs(struct matrix_keypad *keypad) +{ + const struct matrix_keypad_platform_data *pdata = keypad->pdata; + int i; + + for (i = 0; i < pdata->num_row_gpios; i++) + enable_irq(gpio_to_irq(pdata->row_gpios[i])); +} + +static void disable_row_irqs(struct matrix_keypad *keypad) +{ + const struct matrix_keypad_platform_data *pdata = keypad->pdata; + int i; + + for (i = 0; i < pdata->num_row_gpios; i++) + disable_irq_nosync(gpio_to_irq(pdata->row_gpios[i])); +} + +/* + * This gets the keys from keyboard and reports it to input subsystem + */ +static void matrix_keypad_scan(struct work_struct *work) +{ + struct matrix_keypad *keypad = + container_of(work, struct matrix_keypad, work.work); + struct input_dev *input_dev = keypad->input_dev; + const struct matrix_keypad_platform_data *pdata = keypad->pdata; + uint32_t new_state[MATRIX_MAX_COLS]; + int row, col, code; + + /* de-activate all columns for scanning */ + activate_all_cols(pdata, false); + + memset(new_state, 0, sizeof(new_state)); + + /* assert each column and read the row status out */ + for (col = 0; col < pdata->num_col_gpios; col++) { + + activate_col(pdata, col, true); + + for (row = 0; row < pdata->num_row_gpios; row++) + new_state[col] |= + row_asserted(pdata, row) ? (1 << row) : 0; + + activate_col(pdata, col, false); + } + + for (col = 0; col < pdata->num_col_gpios; col++) { + uint32_t bits_changed; + + bits_changed = keypad->last_key_state[col] ^ new_state[col]; + if (bits_changed == 0) + continue; + + for (row = 0; row < pdata->num_row_gpios; row++) { + if ((bits_changed & (1 << row)) == 0) + continue; + + code = (row << 4) + col; + input_event(input_dev, EV_MSC, MSC_SCAN, code); + input_report_key(input_dev, + keypad->keycodes[code], + new_state[col] & (1 << row)); + } + } + input_sync(input_dev); + + memcpy(keypad->last_key_state, new_state, sizeof(new_state)); + + activate_all_cols(pdata, true); + + /* Enable IRQs again */ + spin_lock_irq(&keypad->lock); + keypad->scan_pending = false; + enable_row_irqs(keypad); + spin_unlock_irq(&keypad->lock); +} + +static irqreturn_t matrix_keypad_interrupt(int irq, void *id) +{ + struct matrix_keypad *keypad = id; + unsigned long flags; + + spin_lock_irqsave(&keypad->lock, flags); + + /* + * See if another IRQ beaten us to it and scheduled the + * scan already. In that case we should not try to + * disable IRQs again. + */ + if (unlikely(keypad->scan_pending || keypad->stopped)) + goto out; + + disable_row_irqs(keypad); + keypad->scan_pending = true; + schedule_delayed_work(&keypad->work, + msecs_to_jiffies(keypad->pdata->debounce_ms)); + +out: + spin_unlock_irqrestore(&keypad->lock, flags); + return IRQ_HANDLED; +} + +static int matrix_keypad_start(struct input_dev *dev) +{ + struct matrix_keypad *keypad = input_get_drvdata(dev); + + keypad->stopped = false; + mb(); + + /* + * Schedule an immediate key scan to capture current key state; + * columns will be activated and IRQs be enabled after the scan. + */ + schedule_delayed_work(&keypad->work, 0); + + return 0; +} + +static void matrix_keypad_stop(struct input_dev *dev) +{ + struct matrix_keypad *keypad = input_get_drvdata(dev); + + keypad->stopped = true; + mb(); + flush_work(&keypad->work.work); + /* + * matrix_keypad_scan() will leave IRQs enabled; + * we should disable them now. + */ + disable_row_irqs(keypad); +} + +#ifdef CONFIG_PM +static int matrix_keypad_suspend(struct platform_device *pdev, pm_message_t state) +{ + struct matrix_keypad *keypad = platform_get_drvdata(pdev); + const struct matrix_keypad_platform_data *pdata = keypad->pdata; + int i; + + matrix_keypad_stop(keypad->input_dev); + + if (device_may_wakeup(&pdev->dev)) + for (i = 0; i < pdata->num_row_gpios; i++) + enable_irq_wake(gpio_to_irq(pdata->row_gpios[i])); + + return 0; +} + +static int matrix_keypad_resume(struct platform_device *pdev) +{ + struct matrix_keypad *keypad = platform_get_drvdata(pdev); + const struct matrix_keypad_platform_data *pdata = keypad->pdata; + int i; + + if (device_may_wakeup(&pdev->dev)) + for (i = 0; i < pdata->num_row_gpios; i++) + disable_irq_wake(gpio_to_irq(pdata->row_gpios[i])); + + matrix_keypad_start(keypad->input_dev); + + return 0; +} +#else +#define matrix_keypad_suspend NULL +#define matrix_keypad_resume NULL +#endif + +static int __devinit init_matrix_gpio(struct platform_device *pdev, + struct matrix_keypad *keypad) +{ + const struct matrix_keypad_platform_data *pdata = keypad->pdata; + int i, err = -EINVAL; + + /* initialized strobe lines as outputs, activated */ + for (i = 0; i < pdata->num_col_gpios; i++) { + err = gpio_request(pdata->col_gpios[i], "matrix_kbd_col"); + if (err) { + dev_err(&pdev->dev, + "failed to request GPIO%d for COL%d\n", + pdata->col_gpios[i], i); + goto err_free_cols; + } + + gpio_direction_output(pdata->col_gpios[i], !pdata->active_low); + } + + for (i = 0; i < pdata->num_row_gpios; i++) { + err = gpio_request(pdata->row_gpios[i], "matrix_kbd_row"); + if (err) { + dev_err(&pdev->dev, + "failed to request GPIO%d for ROW%d\n", + pdata->row_gpios[i], i); + goto err_free_rows; + } + + gpio_direction_input(pdata->row_gpios[i]); + } + + for (i = 0; i < pdata->num_row_gpios; i++) { + err = request_irq(gpio_to_irq(pdata->row_gpios[i]), + matrix_keypad_interrupt, + IRQF_DISABLED | + IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING, + "matrix-keypad", keypad); + if (err) { + dev_err(&pdev->dev, + "Unable to acquire interrupt for GPIO line %i\n", + pdata->row_gpios[i]); + goto err_free_irqs; + } + } + + /* initialized as disabled - enabled by input->open */ + disable_row_irqs(keypad); + return 0; + +err_free_irqs: + while (--i >= 0) + free_irq(gpio_to_irq(pdata->row_gpios[i]), keypad); + i = pdata->num_row_gpios; +err_free_rows: + while (--i >= 0) + gpio_free(pdata->row_gpios[i]); + i = pdata->num_col_gpios; +err_free_cols: + while (--i >= 0) + gpio_free(pdata->col_gpios[i]); + + return err; +} + +static int __devinit matrix_keypad_probe(struct platform_device *pdev) +{ + const struct matrix_keypad_platform_data *pdata; + const struct matrix_keymap_data *keymap_data; + struct matrix_keypad *keypad; + struct input_dev *input_dev; + unsigned short *keycodes; + int i; + int err; + + pdata = pdev->dev.platform_data; + if (!pdata) { + dev_err(&pdev->dev, "no platform data defined\n"); + return -EINVAL; + } + + keymap_data = pdata->keymap_data; + if (!keymap_data) { + dev_err(&pdev->dev, "no keymap data defined\n"); + return -EINVAL; + } + + if (!keymap_data->max_keymap_size) { + dev_err(&pdev->dev, "invalid keymap data supplied\n"); + return -EINVAL; + } + + keypad = kzalloc(sizeof(struct matrix_keypad), GFP_KERNEL); + keycodes = kzalloc(keymap_data->max_keymap_size * + sizeof(keypad->keycodes), + GFP_KERNEL); + input_dev = input_allocate_device(); + if (!keypad || !keycodes || !input_dev) { + err = -ENOMEM; + goto err_free_mem; + } + + keypad->input_dev = input_dev; + keypad->pdata = pdata; + keypad->keycodes = keycodes; + keypad->stopped = true; + INIT_DELAYED_WORK(&keypad->work, matrix_keypad_scan); + spin_lock_init(&keypad->lock); + + input_dev->name = pdev->name; + input_dev->id.bustype = BUS_HOST; + input_dev->dev.parent = &pdev->dev; + input_dev->evbit[0] = BIT_MASK(EV_KEY) | BIT_MASK(EV_REP); + input_dev->open = matrix_keypad_start; + input_dev->close = matrix_keypad_stop; + + input_dev->keycode = keycodes; + input_dev->keycodesize = sizeof(*keycodes); + input_dev->keycodemax = keymap_data->max_keymap_size; + + for (i = 0; i < keymap_data->keymap_size; i++) { + unsigned int key = keymap_data->keymap[i]; + unsigned int row = KEY_ROW(key); + unsigned int col = KEY_COL(key); + unsigned short code = KEY_VAL(key); + + keycodes[(row << 4) + col] = code; + __set_bit(code, input_dev->keybit); + } + __clear_bit(KEY_RESERVED, input_dev->keybit); + + input_set_capability(input_dev, EV_MSC, MSC_SCAN); + input_set_drvdata(input_dev, keypad); + + err = init_matrix_gpio(pdev, keypad); + if (err) + goto err_free_mem; + + err = input_register_device(keypad->input_dev); + if (err) + goto err_free_mem; + + device_init_wakeup(&pdev->dev, pdata->wakeup); + platform_set_drvdata(pdev, keypad); + + return 0; + +err_free_mem: + input_free_device(input_dev); + kfree(keycodes); + kfree(keypad); + return err; +} + +static int __devexit matrix_keypad_remove(struct platform_device *pdev) +{ + struct matrix_keypad *keypad = platform_get_drvdata(pdev); + const struct matrix_keypad_platform_data *pdata = keypad->pdata; + int i; + + device_init_wakeup(&pdev->dev, 0); + + for (i = 0; i < pdata->num_row_gpios; i++) { + free_irq(gpio_to_irq(pdata->row_gpios[i]), keypad); + gpio_free(pdata->row_gpios[i]); + } + + for (i = 0; i < pdata->num_col_gpios; i++) + gpio_free(pdata->col_gpios[i]); + + input_unregister_device(keypad->input_dev); + platform_set_drvdata(pdev, NULL); + kfree(keypad->keycodes); + kfree(keypad); + + return 0; +} + +static struct platform_driver matrix_keypad_driver = { + .probe = matrix_keypad_probe, + .remove = __devexit_p(matrix_keypad_remove), + .suspend = matrix_keypad_suspend, + .resume = matrix_keypad_resume, + .driver = { + .name = "matrix-keypad", + .owner = THIS_MODULE, + }, +}; + +static int __init matrix_keypad_init(void) +{ + return platform_driver_register(&matrix_keypad_driver); +} + +static void __exit matrix_keypad_exit(void) +{ + platform_driver_unregister(&matrix_keypad_driver); +} + +module_init(matrix_keypad_init); +module_exit(matrix_keypad_exit); + +MODULE_AUTHOR("Marek Vasut "); +MODULE_DESCRIPTION("GPIO Driven Matrix Keypad Driver"); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("platform:matrix-keypad"); diff --git a/include/linux/input/matrix_keypad.h b/include/linux/input/matrix_keypad.h new file mode 100644 index 000000000000..7964516c6954 --- /dev/null +++ b/include/linux/input/matrix_keypad.h @@ -0,0 +1,65 @@ +#ifndef _MATRIX_KEYPAD_H +#define _MATRIX_KEYPAD_H + +#include +#include + +#define MATRIX_MAX_ROWS 16 +#define MATRIX_MAX_COLS 16 + +#define KEY(row, col, val) ((((row) & (MATRIX_MAX_ROWS - 1)) << 24) |\ + (((col) & (MATRIX_MAX_COLS - 1)) << 16) |\ + (val & 0xffff)) + +#define KEY_ROW(k) (((k) >> 24) & 0xff) +#define KEY_COL(k) (((k) >> 16) & 0xff) +#define KEY_VAL(k) ((k) & 0xffff) + +/** + * struct matrix_keymap_data - keymap for matrix keyboards + * @keymap: pointer to array of uint32 values encoded with KEY() macro + * representing keymap + * @keymap_size: number of entries (initialized) in this keymap + * @max_keymap_size: maximum size of keymap supported by the device + * + * This structure is supposed to be used by platform code to supply + * keymaps to drivers that implement matrix-like keypads/keyboards. + */ +struct matrix_keymap_data { + const uint32_t *keymap; + unsigned int keymap_size; + unsigned int max_keymap_size; +}; + +/** + * struct matrix_keypad_platform_data - platform-dependent keypad data + * @keymap_data: pointer to &matrix_keymap_data + * @row_gpios: array of gpio numbers reporesenting rows + * @col_gpios: array of gpio numbers reporesenting colums + * @num_row_gpios: actual number of row gpios used by device + * @num_col_gpios: actual number of col gpios used by device + * @col_scan_delay_us: delay, measured in microseconds, that is + * needed before we can keypad after activating column gpio + * @debounce_ms: debounce interval in milliseconds + * + * This structure represents platform-specific data that use used by + * matrix_keypad driver to perform proper initialization. + */ +struct matrix_keypad_platform_data { + const struct matrix_keymap_data *keymap_data; + + unsigned int row_gpios[MATRIX_MAX_ROWS]; + unsigned int col_gpios[MATRIX_MAX_COLS]; + unsigned int num_row_gpios; + unsigned int num_col_gpios; + + unsigned int col_scan_delay_us; + + /* key debounce interval in milli-second */ + unsigned int debounce_ms; + + bool active_low; + bool wakeup; +}; + +#endif /* _MATRIX_KEYPAD_H */ -- cgit v1.2.3-59-g8ed1b From 2fc90f6133a87da8177636866557d4cc5f56e661 Mon Sep 17 00:00:00 2001 From: Alexey Zaytsev Date: Wed, 24 Jun 2009 16:22:30 +0400 Subject: PCI: make pci_name() take const argument Since this function should never modify it (saves warnings when called with const args too). Signed-off-by: Alexey Zaytsev Signed-off-by: Jesse Barnes --- include/linux/pci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/pci.h b/include/linux/pci.h index d304ddf412d0..115fb7ba5089 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1145,7 +1145,7 @@ static inline void pci_set_drvdata(struct pci_dev *pdev, void *data) /* If you want to know what to call your pci_dev, ask this function. * Again, it's a wrapper around the generic device. */ -static inline const char *pci_name(struct pci_dev *pdev) +static inline const char *pci_name(const struct pci_dev *pdev) { return dev_name(&pdev->dev); } -- cgit v1.2.3-59-g8ed1b From 887b5ea3683ce04966e35fa2e5fe215bedcde73c Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 1 Jul 2009 13:38:26 +0000 Subject: if_ether: add define for 1588 aka Timesync This patch adds ETH_P_1588 protocol ID define. Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- include/linux/if_ether.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/if_ether.h b/include/linux/if_ether.h index ae3a1871413d..70fdba2bbf71 100644 --- a/include/linux/if_ether.h +++ b/include/linux/if_ether.h @@ -78,6 +78,7 @@ #define ETH_P_PAE 0x888E /* Port Access Entity (IEEE 802.1X) */ #define ETH_P_AOE 0x88A2 /* ATA over Ethernet */ #define ETH_P_TIPC 0x88CA /* TIPC */ +#define ETH_P_1588 0x88F7 /* IEEE 1588 Timesync */ #define ETH_P_FCOE 0x8906 /* Fibre Channel over Ethernet */ #define ETH_P_FIP 0x8914 /* FCoE Initialization Protocol */ #define ETH_P_EDSA 0xDADA /* Ethertype DSA [ NOT AN OFFICIALLY REGISTERED ID ] */ -- cgit v1.2.3-59-g8ed1b From 509dd025a4ac6f32921211557cfd434b81d8d7a7 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Tue, 30 Jun 2009 16:07:26 -0300 Subject: V4L/DVB (12148): move V4L2_PIX_FMT_SGRBG8 to the proper place Instead of defining a new pif format on an internal header, move it to the V4L2 API header. Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/gspca/stv06xx/stv06xx.h | 4 ---- include/linux/videodev2.h | 2 ++ 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/media/video/gspca/stv06xx/stv06xx.h b/drivers/media/video/gspca/stv06xx/stv06xx.h index 9df7137fe67e..992ce530f138 100644 --- a/drivers/media/video/gspca/stv06xx/stv06xx.h +++ b/drivers/media/video/gspca/stv06xx/stv06xx.h @@ -36,10 +36,6 @@ #define STV_ISOC_ENDPOINT_ADDR 0x81 -#ifndef V4L2_PIX_FMT_SGRBG8 -#define V4L2_PIX_FMT_SGRBG8 v4l2_fourcc('G', 'R', 'B', 'G') -#endif - #define STV_REG23 0x0423 /* Control registers of the STV0600 ASIC */ diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h index 8a025d510904..95846d988011 100644 --- a/include/linux/videodev2.h +++ b/include/linux/videodev2.h @@ -318,6 +318,8 @@ struct v4l2_pix_format { /* see http://www.siliconimaging.com/RGB%20Bayer.htm */ #define V4L2_PIX_FMT_SBGGR8 v4l2_fourcc('B', 'A', '8', '1') /* 8 BGBG.. GRGR.. */ #define V4L2_PIX_FMT_SGBRG8 v4l2_fourcc('G', 'B', 'R', 'G') /* 8 GBGB.. RGRG.. */ +#define V4L2_PIX_FMT_SGRBG8 v4l2_fourcc('G', 'R', 'B', 'G') /* 8 GRGR.. BGBG.. */ + /* * 10bit raw bayer, expanded to 16 bits * xxxxrrrrrrrrrrxxxxgggggggggg xxxxggggggggggxxxxbbbbbbbbbb... -- cgit v1.2.3-59-g8ed1b From 7dfba00d05f3c7db9510f3b54a472981cf1521af Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 29 Jun 2009 05:41:26 -0300 Subject: V4L/DVB (12135): Add a driver for mt9v011 sensor Adds driver for mt9v011 based on its datasheet, available at: http://download.micron.com/pdf/datasheets/imaging/MT9V011.pdf The driver was tested with a webcam that will be added on a next patch. Signed-off-by: Mauro Carvalho Chehab --- drivers/media/video/Kconfig | 8 + drivers/media/video/Makefile | 1 + drivers/media/video/mt9v011.c | 355 ++++++++++++++++++++++++++++++++++++++++ drivers/media/video/mt9v011.h | 35 ++++ include/media/v4l2-chip-ident.h | 3 + 5 files changed, 402 insertions(+) create mode 100644 drivers/media/video/mt9v011.c create mode 100644 drivers/media/video/mt9v011.h (limited to 'include') diff --git a/drivers/media/video/Kconfig b/drivers/media/video/Kconfig index 061e147f6f26..84b6fc15519d 100644 --- a/drivers/media/video/Kconfig +++ b/drivers/media/video/Kconfig @@ -312,6 +312,14 @@ config VIDEO_OV7670 OV7670 VGA camera. It currently only works with the M88ALP01 controller. +config VIDEO_MT9V011 + tristate "Micron mt9v011 sensor support" + depends on I2C && VIDEO_V4L2 + ---help--- + This is a Video4Linux2 sensor-level driver for the Micron + mt0v011 1.3 Mpixel camera. It currently only works with the + em28xx driver. + config VIDEO_TCM825X tristate "TCM825x camera sensor support" depends on I2C && VIDEO_V4L2 diff --git a/drivers/media/video/Makefile b/drivers/media/video/Makefile index 7fb3add1b387..9f2e3214a482 100644 --- a/drivers/media/video/Makefile +++ b/drivers/media/video/Makefile @@ -69,6 +69,7 @@ obj-$(CONFIG_VIDEO_UPD64083) += upd64083.o obj-$(CONFIG_VIDEO_OV7670) += ov7670.o obj-$(CONFIG_VIDEO_TCM825X) += tcm825x.o obj-$(CONFIG_VIDEO_TVEEPROM) += tveeprom.o +obj-$(CONFIG_VIDEO_MT9V011) += mt9v011.o obj-$(CONFIG_SOC_CAMERA_MT9M001) += mt9m001.o obj-$(CONFIG_SOC_CAMERA_MT9M111) += mt9m111.o diff --git a/drivers/media/video/mt9v011.c b/drivers/media/video/mt9v011.c new file mode 100644 index 000000000000..4389197cedd7 --- /dev/null +++ b/drivers/media/video/mt9v011.c @@ -0,0 +1,355 @@ +/* + * mt9v011 -Micron 1/4-Inch VGA Digital Image Sensor + * + * Copyright (c) 2009 Mauro Carvalho Chehab (mchehab@redhat.com) + * This code is placed under the terms of the GNU General Public License v2 + */ + +#include +#include +#include +#include +#include "mt9v011.h" +#include +#include + +MODULE_DESCRIPTION("Micron mt9v011 sensor driver"); +MODULE_AUTHOR("Mauro Carvalho Chehab "); +MODULE_LICENSE("GPL"); + + +static int debug; +module_param(debug, int, 0); +MODULE_PARM_DESC(debug, "Debug level (0-2)"); + +/* supported controls */ +static struct v4l2_queryctrl mt9v011_qctrl[] = { + { + .id = V4L2_CID_GAIN, + .type = V4L2_CTRL_TYPE_INTEGER, + .name = "Gain", + .minimum = 0, + .maximum = (1 << 10) - 1, + .step = 1, + .default_value = 0x0020, + .flags = 0, + }, { + .id = V4L2_CID_RED_BALANCE, + .type = V4L2_CTRL_TYPE_INTEGER, + .name = "Red Balance", + .minimum = -1 << 9, + .maximum = (1 << 9) - 1, + .step = 1, + .default_value = 0, + .flags = 0, + }, { + .id = V4L2_CID_BLUE_BALANCE, + .type = V4L2_CTRL_TYPE_INTEGER, + .name = "Blue Balance", + .minimum = -1 << 9, + .maximum = (1 << 9) - 1, + .step = 1, + .default_value = 0, + .flags = 0, + }, +}; + +struct mt9v011 { + struct v4l2_subdev sd; + + u16 global_gain, red_bal, blue_bal; +}; + +static inline struct mt9v011 *to_mt9v011(struct v4l2_subdev *sd) +{ + return container_of(sd, struct mt9v011, sd); +} + +static int mt9v011_read(struct v4l2_subdev *sd, unsigned char addr) +{ + struct i2c_client *c = v4l2_get_subdevdata(sd); + __be16 buffer; + int rc, val; + + if (1 != (rc = i2c_master_send(c, &addr, 1))) + v4l2_dbg(0, debug, sd, + "i2c i/o error: rc == %d (should be 1)\n", rc); + + msleep(10); + + if (2 != (rc = i2c_master_recv(c, (char *)&buffer, 2))) + v4l2_dbg(0, debug, sd, + "i2c i/o error: rc == %d (should be 1)\n", rc); + + val = be16_to_cpu(buffer); + + v4l2_dbg(2, debug, sd, "mt9v011: read 0x%02x = 0x%04x\n", addr, val); + + return val; +} + +static void mt9v011_write(struct v4l2_subdev *sd, unsigned char addr, + u16 value) +{ + struct i2c_client *c = v4l2_get_subdevdata(sd); + unsigned char buffer[3]; + int rc; + + buffer[0] = addr; + buffer[1] = value >> 8; + buffer[2] = value & 0xff; + + v4l2_dbg(2, debug, sd, + "mt9v011: writing 0x%02x 0x%04x\n", buffer[0], value); + if (3 != (rc = i2c_master_send(c, buffer, 3))) + v4l2_dbg(0, debug, sd, + "i2c i/o error: rc == %d (should be 3)\n", rc); +} + + +struct i2c_reg_value { + unsigned char reg; + u16 value; +}; + +/* + * Values used at the original driver + * Some values are marked as Reserved at the datasheet + */ +static const struct i2c_reg_value mt9v011_init_default[] = { + /* guessed meaning - as mt9m111 */ + { R0D_MT9V011_RESET, 0x0001 }, + { R0D_MT9V011_RESET, 0x0000 }, + { R01_MT9V011_ROWSTART, 0x0008 }, + { R02_MT9V011_COLSTART, 0x0014 }, + { R03_MT9V011_HEIGHT, 0x01e0 }, + { R04_MT9V011_WIDTH, 0x0280 }, + { R05_MT9V011_HBLANK, 0x0001 }, + { R05_MT9V011_HBLANK, 0x0001 }, + { R0A_MT9V011_CLK_SPEED, 0x0000 }, + { R05_MT9V011_HBLANK, 0x000a }, + { 0x30, 0x0005 }, + { 0x34, 0x0100 }, + { 0x3d, 0x068f }, + { 0x40, 0x01e0 }, + { 0x52, 0x0100 }, + { 0x58, 0x0038 }, /* Datasheet default 0x0078 */ + { 0x59, 0x0723 }, /* Datasheet default 0x0703 */ + { 0x62, 0x041a }, /* Datasheet default 0x0418 */ + { R09_MT9V011_SHUTTER_WIDTH, 0x0418 }, + { R20_MT9V011_READ_MODE, 0x1100 }, +}; + +static void set_balance(struct v4l2_subdev *sd) +{ + struct mt9v011 *core = to_mt9v011(sd); + u16 green1_gain, green2_gain, blue_gain, red_gain; + + green1_gain = core->global_gain; + green2_gain = core->global_gain; + + blue_gain = core->global_gain + + core->global_gain * core->blue_bal / (1 << 9); + + red_gain = core->global_gain + + core->global_gain * core->blue_bal / (1 << 9); + + mt9v011_write(sd, R2B_MT9V011_GREEN_1_GAIN, green1_gain); + mt9v011_write(sd, R2E_MT9V011_GREEN_2_GAIN, green1_gain); + mt9v011_write(sd, R2C_MT9V011_BLUE_GAIN, blue_gain); + mt9v011_write(sd, R2D_MT9V011_RED_GAIN, red_gain); +} + +static int mt9v011_reset(struct v4l2_subdev *sd, u32 val) +{ + u16 version; + int i; + + version = mt9v011_read(sd, R00_MT9V011_CHIP_VERSION); + + if (version != MT9V011_VERSION) { + v4l2_info(sd, "*** unknown micron chip detected (0x%04x.\n", + version); + return -EINVAL; + + } + + for (i = 0; i < ARRAY_SIZE(mt9v011_init_default); i++) + mt9v011_write(sd, mt9v011_init_default[i].reg, + mt9v011_init_default[i].value); + + set_balance(sd); + + return 0; +}; + +static int mt9v011_g_ctrl(struct v4l2_subdev *sd, struct v4l2_control *ctrl) +{ + struct mt9v011 *core = to_mt9v011(sd); + + v4l2_dbg(1, debug, sd, "g_ctrl called\n"); + + switch (ctrl->id) { + case V4L2_CID_GAIN: + ctrl->value = core->global_gain; + return 0; + case V4L2_CID_RED_BALANCE: + ctrl->value = core->red_bal; + return 0; + case V4L2_CID_BLUE_BALANCE: + ctrl->value = core->blue_bal; + return 0; + } + return -EINVAL; +} + +static int mt9v011_s_ctrl(struct v4l2_subdev *sd, struct v4l2_control *ctrl) +{ + struct mt9v011 *core = to_mt9v011(sd); + u8 i, n; + n = ARRAY_SIZE(mt9v011_qctrl); + + for (i = 0; i < n; i++) { + if (ctrl->id != mt9v011_qctrl[i].id) + continue; + if (ctrl->value < mt9v011_qctrl[i].minimum || + ctrl->value > mt9v011_qctrl[i].maximum) + return -ERANGE; + v4l2_dbg(1, debug, sd, "s_ctrl: id=%d, value=%d\n", + ctrl->id, ctrl->value); + break; + } + + switch (ctrl->id) { + case V4L2_CID_GAIN: + core->global_gain = ctrl->value; + break; + case V4L2_CID_RED_BALANCE: + core->red_bal = ctrl->value; + break; + case V4L2_CID_BLUE_BALANCE: + core->blue_bal = ctrl->value; + break; + default: + return -EINVAL; + } + + set_balance(sd); + + return 0; +} + +#ifdef CONFIG_VIDEO_ADV_DEBUG +static int mt9v011_g_register(struct v4l2_subdev *sd, + struct v4l2_dbg_register *reg) +{ + struct i2c_client *client = v4l2_get_subdevdata(sd); + + if (!v4l2_chip_match_i2c_client(client, ®->match)) + return -EINVAL; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + reg->val = mt9v011_read(sd, reg->reg & 0xff); + reg->size = 2; + + return 0; +} + +static int mt9v011_s_register(struct v4l2_subdev *sd, + struct v4l2_dbg_register *reg) +{ + struct i2c_client *client = v4l2_get_subdevdata(sd); + + if (!v4l2_chip_match_i2c_client(client, ®->match)) + return -EINVAL; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + mt9v011_write(sd, reg->reg & 0xff, reg->val & 0xffff); + + return 0; +} +#endif + +static int mt9v011_g_chip_ident(struct v4l2_subdev *sd, + struct v4l2_dbg_chip_ident *chip) +{ + struct i2c_client *client = v4l2_get_subdevdata(sd); + + return v4l2_chip_ident_i2c_client(client, chip, V4L2_IDENT_MT9V011, + MT9V011_VERSION); +} + +static const struct v4l2_subdev_core_ops mt9v011_core_ops = { + .g_ctrl = mt9v011_g_ctrl, + .s_ctrl = mt9v011_s_ctrl, + .reset = mt9v011_reset, + .g_chip_ident = mt9v011_g_chip_ident, +#ifdef CONFIG_VIDEO_ADV_DEBUG + .g_register = mt9v011_g_register, + .s_register = mt9v011_s_register, +#endif +}; + +static const struct v4l2_subdev_ops mt9v011_ops = { + .core = &mt9v011_core_ops, +}; + + +/**************************************************************************** + I2C Client & Driver + ****************************************************************************/ + +static int mt9v011_probe(struct i2c_client *c, + const struct i2c_device_id *id) +{ + struct mt9v011 *core; + struct v4l2_subdev *sd; + + /* Check if the adapter supports the needed features */ + if (!i2c_check_functionality(c->adapter, + I2C_FUNC_SMBUS_READ_BYTE | I2C_FUNC_SMBUS_WRITE_BYTE_DATA)) + return -EIO; + + core = kzalloc(sizeof(struct mt9v011), GFP_KERNEL); + if (!core) + return -ENOMEM; + + core->global_gain = 0x0024; + + sd = &core->sd; + v4l2_i2c_subdev_init(sd, c, &mt9v011_ops); + v4l_info(c, "chip found @ 0x%02x (%s)\n", + c->addr << 1, c->adapter->name); + + return 0; +} + +static int mt9v011_remove(struct i2c_client *c) +{ + struct v4l2_subdev *sd = i2c_get_clientdata(c); + + v4l2_dbg(1, debug, sd, + "mt9v011.c: removing mt9v011 adapter on address 0x%x\n", + c->addr << 1); + + v4l2_device_unregister_subdev(sd); + kfree(to_mt9v011(sd)); + return 0; +} + +/* ----------------------------------------------------------------------- */ + +static const struct i2c_device_id mt9v011_id[] = { + { "mt9v011", 0 }, + { } +}; +MODULE_DEVICE_TABLE(i2c, mt9v011_id); + +static struct v4l2_i2c_driver_data v4l2_i2c_data = { + .name = "mt9v011", + .probe = mt9v011_probe, + .remove = mt9v011_remove, + .id_table = mt9v011_id, +}; diff --git a/drivers/media/video/mt9v011.h b/drivers/media/video/mt9v011.h new file mode 100644 index 000000000000..9e443ee30558 --- /dev/null +++ b/drivers/media/video/mt9v011.h @@ -0,0 +1,35 @@ +/* + * mt9v011 -Micron 1/4-Inch VGA Digital Image Sensor + * + * Copyright (c) 2009 Mauro Carvalho Chehab (mchehab@redhat.com) + * This code is placed under the terms of the GNU General Public License v2 + */ + +#ifndef MT9V011_H_ +#define MT9V011_H_ + +#define R00_MT9V011_CHIP_VERSION 0x00 +#define R01_MT9V011_ROWSTART 0x01 +#define R02_MT9V011_COLSTART 0x02 +#define R03_MT9V011_HEIGHT 0x03 +#define R04_MT9V011_WIDTH 0x04 +#define R05_MT9V011_HBLANK 0x05 +#define R06_MT9V011_VBLANK 0x06 +#define R07_MT9V011_OUT_CTRL 0x07 +#define R09_MT9V011_SHUTTER_WIDTH 0x09 +#define R0A_MT9V011_CLK_SPEED 0x0a +#define R0B_MT9V011_RESTART 0x0b +#define R0C_MT9V011_SHUTTER_DELAY 0x0c +#define R0D_MT9V011_RESET 0x0d +#define R1E_MT9V011_DIGITAL_ZOOM 0x1e +#define R20_MT9V011_READ_MODE 0x20 +#define R2B_MT9V011_GREEN_1_GAIN 0x2b +#define R2C_MT9V011_BLUE_GAIN 0x2c +#define R2D_MT9V011_RED_GAIN 0x2d +#define R2E_MT9V011_GREEN_2_GAIN 0x2e +#define R35_MT9V011_GLOBAL_GAIN 0x35 +#define RF1_MT9V011_CHIP_ENABLE 0xf1 + +#define MT9V011_VERSION 0x8243 + +#endif diff --git a/include/media/v4l2-chip-ident.h b/include/media/v4l2-chip-ident.h index 4d7e2272c42f..11a4a2d3e364 100644 --- a/include/media/v4l2-chip-ident.h +++ b/include/media/v4l2-chip-ident.h @@ -155,6 +155,9 @@ enum { /* module cafe_ccic, just ident 8801 */ V4L2_IDENT_CAFE = 8801, + /* module mt9v011, just ident 8243 */ + V4L2_IDENT_MT9V011 = 8243, + /* module tw9910: just ident 9910 */ V4L2_IDENT_TW9910 = 9910, -- cgit v1.2.3-59-g8ed1b From a65e7bfcd74e4c0939f235d2bf9f48ddb3a57991 Mon Sep 17 00:00:00 2001 From: Hui Zhu Date: Sun, 5 Jul 2009 12:08:15 -0700 Subject: elf: fix multithreaded program core dumping on arm Fix the multithread program core thread message error. This issue affects arches with neither has CORE_DUMP_USE_REGSET nor ELF_CORE_COPY_TASK_REGS, ARM is one of them. The thread message of core file is generated in elf_dump_thread_status. The register values is set by elf_core_copy_task_regs in this function. If an arch doesn't define ELF_CORE_COPY_TASK_REGS, elf_core_copy_task_regs() will do nothing. Then the core file will not have the register message of thread. So add elf_core_copy_regs to set regiser values if ELF_CORE_COPY_TASK_REGS doesn't define. The following is how to reproduce this issue: cat 1.c #include #include #include void td1(void * i) { while (1) { printf ("1\n"); sleep (1); } return; } void td2(void * i) { while (1) { printf ("2\n"); sleep (1); } return; } int main(int argc,char *argv[],char *envp[]) { pthread_t t1,t2; pthread_create(&t1, NULL, (void*)td1, NULL); pthread_create(&t2, NULL, (void*)td2, NULL); sleep (10); assert(0); return (0); } arm-xxx-gcc -g -lpthread 1.c -o 1 copy 1.c and 1 to a arm board. Goto this board. ulimit -c 1800000 ./1 # ./1 1 2 1 ... ... 1 1: 1.c:37: main: Assertion `0' failed. Aborted (core dumped) Then you can get a core file. gdb 1 core.xxx Without the patch: (gdb) info threads 3 process 909 0x00000000 in ?? () 2 process 908 0x00000000 in ?? () * 1 process 907 0x4a6e2238 in raise () from /lib/libc.so.6 You can found that the pc of 909 and 908 is 0x00000000. With the patch: (gdb) info threads 3 process 885 0x4a749974 in nanosleep () from /lib/libc.so.6 2 process 884 0x4a749974 in nanosleep () from /lib/libc.so.6 * 1 process 883 0x4a6e2238 in raise () from /lib/libc.so.6 The pc of 885 and 884 is right. Signed-off-by: Hui Zhu Cc: Amerigo Wang Cc: Al Viro Cc: David Howells Cc: Roland McGrath Cc: Jakub Jelinek Cc: Russell King Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/elfcore.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/elfcore.h b/include/linux/elfcore.h index 7605c5e9589f..03ec16779802 100644 --- a/include/linux/elfcore.h +++ b/include/linux/elfcore.h @@ -125,6 +125,8 @@ static inline int elf_core_copy_task_regs(struct task_struct *t, elf_gregset_t* #ifdef ELF_CORE_COPY_TASK_REGS return ELF_CORE_COPY_TASK_REGS(t, elfregs); +#else + elf_core_copy_regs(elfregs, task_pt_regs(t)); #endif return 0; } -- cgit v1.2.3-59-g8ed1b From 82e3310ace59794ecf0f531eca94647b2863dfda Mon Sep 17 00:00:00 2001 From: Tobias Doerffel Date: Sun, 5 Jul 2009 12:08:23 -0700 Subject: linux/sysrq.h needs linux/errno.h In include/linux/sysrq.h the constant EINVAL is being used but is undefined if include/linux/errno.h is not included before. Fix this by adding #include at the beginning. Signed-off-by: Tobias Doerffel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sysrq.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/sysrq.h b/include/linux/sysrq.h index 98a1d8cfb73d..99adcdc0d3ca 100644 --- a/include/linux/sysrq.h +++ b/include/linux/sysrq.h @@ -14,6 +14,8 @@ #ifndef _LINUX_SYSRQ_H #define _LINUX_SYSRQ_H +#include + struct pt_regs; struct tty_struct; -- cgit v1.2.3-59-g8ed1b From 7afdbf23c3acdec3eaf1b94f87132fff3d81ce73 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 7 Jul 2009 10:23:43 +0200 Subject: signals: declare sys_rt_tgsigqueueinfo in syscalls.h sys_rt_tgsigqueueinfo needs to be declared in linux/syscalls.h so that architectures defining the system call table in C can reference it. Signed-off-by: Arnd Bergmann LKML-Reference: <200907071023.44008.arnd@arndb.de> Signed-off-by: Thomas Gleixner --- include/linux/syscalls.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index fa4242cdade8..80de7003d8c2 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -321,6 +321,8 @@ asmlinkage long sys_rt_sigtimedwait(const sigset_t __user *uthese, siginfo_t __user *uinfo, const struct timespec __user *uts, size_t sigsetsize); +asmlinkage long sys_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, + siginfo_t __user *uinfo); asmlinkage long sys_kill(int pid, int sig); asmlinkage long sys_tgkill(int tgid, int pid, int sig); asmlinkage long sys_tkill(int pid, int sig); -- cgit v1.2.3-59-g8ed1b From e4f7c0b44a8ac8935f223195af9ea637d0c08091 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 7 Jul 2009 10:32:59 +0100 Subject: kmemleak: Trace the kmalloc_large* functions in slub The kmalloc_large() and kmalloc_large_node() functions were missed when adding the kmemleak hooks to the slub allocator. However, they should be traced to avoid false positives. Signed-off-by: Catalin Marinas Cc: Christoph Lameter Acked-by: Pekka Enberg --- include/linux/slub_def.h | 2 ++ mm/slub.c | 10 ++++++---- 2 files changed, 8 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 4dcbc2c71491..c1c862b1d01a 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -11,6 +11,7 @@ #include #include #include +#include enum stat_item { ALLOC_FASTPATH, /* Allocation from cpu slab */ @@ -233,6 +234,7 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) unsigned int order = get_order(size); void *ret = (void *) __get_free_pages(flags | __GFP_COMP, order); + kmemleak_alloc(ret, size, 1, flags); trace_kmalloc(_THIS_IP_, ret, size, PAGE_SIZE << order, flags); return ret; diff --git a/mm/slub.c b/mm/slub.c index a9201d83178b..b9f1491a58a1 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include @@ -2835,13 +2834,15 @@ EXPORT_SYMBOL(__kmalloc); static void *kmalloc_large_node(size_t size, gfp_t flags, int node) { struct page *page; + void *ptr = NULL; flags |= __GFP_COMP | __GFP_NOTRACK; page = alloc_pages_node(node, flags, get_order(size)); if (page) - return page_address(page); - else - return NULL; + ptr = page_address(page); + + kmemleak_alloc(ptr, size, 1, flags); + return ptr; } #ifdef CONFIG_NUMA @@ -2926,6 +2927,7 @@ void kfree(const void *x) page = virt_to_head_page(x); if (unlikely(!PageSlab(page))) { BUG_ON(!PageCompound(page)); + kmemleak_free(x); put_page(page); return; } -- cgit v1.2.3-59-g8ed1b From 53238a60dd4a679f6fe5613a7ed46899587205cf Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 7 Jul 2009 10:33:00 +0100 Subject: kmemleak: Allow partial freeing of memory blocks Functions like free_bootmem() are allowed to free only part of a memory block. This patch adds support for this via the kmemleak_free_part() callback which removes the original object and creates one or two additional objects as a result of the memory block split. Signed-off-by: Catalin Marinas Cc: Ingo Molnar Acked-by: Pekka Enberg --- include/linux/kmemleak.h | 4 ++ mm/kmemleak.c | 95 +++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 85 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h index 7796aed6cdd5..6a63807f714e 100644 --- a/include/linux/kmemleak.h +++ b/include/linux/kmemleak.h @@ -27,6 +27,7 @@ extern void kmemleak_init(void); extern void kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp); extern void kmemleak_free(const void *ptr); +extern void kmemleak_free_part(const void *ptr, size_t size); extern void kmemleak_padding(const void *ptr, unsigned long offset, size_t size); extern void kmemleak_not_leak(const void *ptr); @@ -71,6 +72,9 @@ static inline void kmemleak_alloc_recursive(const void *ptr, size_t size, static inline void kmemleak_free(const void *ptr) { } +static inline void kmemleak_free_part(const void *ptr, size_t size) +{ +} static inline void kmemleak_free_recursive(const void *ptr, unsigned long flags) { } diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 466d39007264..5aabd41ffb8f 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -210,6 +210,7 @@ static DEFINE_MUTEX(scan_mutex); enum { KMEMLEAK_ALLOC, KMEMLEAK_FREE, + KMEMLEAK_FREE_PART, KMEMLEAK_NOT_LEAK, KMEMLEAK_IGNORE, KMEMLEAK_SCAN_AREA, @@ -523,27 +524,17 @@ out: * Remove the metadata (struct kmemleak_object) for a memory block from the * object_list and object_tree_root and decrement its use_count. */ -static void delete_object(unsigned long ptr) +static void __delete_object(struct kmemleak_object *object) { unsigned long flags; - struct kmemleak_object *object; write_lock_irqsave(&kmemleak_lock, flags); - object = lookup_object(ptr, 0); - if (!object) { -#ifdef DEBUG - kmemleak_warn("Freeing unknown object at 0x%08lx\n", - ptr); -#endif - write_unlock_irqrestore(&kmemleak_lock, flags); - return; - } prio_tree_remove(&object_tree_root, &object->tree_node); list_del_rcu(&object->object_list); write_unlock_irqrestore(&kmemleak_lock, flags); WARN_ON(!(object->flags & OBJECT_ALLOCATED)); - WARN_ON(atomic_read(&object->use_count) < 1); + WARN_ON(atomic_read(&object->use_count) < 2); /* * Locking here also ensures that the corresponding memory block @@ -555,6 +546,64 @@ static void delete_object(unsigned long ptr) put_object(object); } +/* + * Look up the metadata (struct kmemleak_object) corresponding to ptr and + * delete it. + */ +static void delete_object_full(unsigned long ptr) +{ + struct kmemleak_object *object; + + object = find_and_get_object(ptr, 0); + if (!object) { +#ifdef DEBUG + kmemleak_warn("Freeing unknown object at 0x%08lx\n", + ptr); +#endif + return; + } + __delete_object(object); + put_object(object); +} + +/* + * Look up the metadata (struct kmemleak_object) corresponding to ptr and + * delete it. If the memory block is partially freed, the function may create + * additional metadata for the remaining parts of the block. + */ +static void delete_object_part(unsigned long ptr, size_t size) +{ + struct kmemleak_object *object; + unsigned long start, end; + + object = find_and_get_object(ptr, 1); + if (!object) { +#ifdef DEBUG + kmemleak_warn("Partially freeing unknown object at 0x%08lx " + "(size %zu)\n", ptr, size); +#endif + return; + } + __delete_object(object); + + /* + * Create one or two objects that may result from the memory block + * split. Note that partial freeing is only done by free_bootmem() and + * this happens before kmemleak_init() is called. The path below is + * only executed during early log recording in kmemleak_init(), so + * GFP_KERNEL is enough. + */ + start = object->pointer; + end = object->pointer + object->size; + if (ptr > start) + create_object(start, ptr - start, object->min_count, + GFP_KERNEL); + if (ptr + size < end) + create_object(ptr + size, end - ptr - size, object->min_count, + GFP_KERNEL); + + put_object(object); +} /* * Make a object permanently as gray-colored so that it can no longer be * reported as a leak. This is used in general to mark a false positive. @@ -719,12 +768,27 @@ void kmemleak_free(const void *ptr) pr_debug("%s(0x%p)\n", __func__, ptr); if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) - delete_object((unsigned long)ptr); + delete_object_full((unsigned long)ptr); else if (atomic_read(&kmemleak_early_log)) log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0); } EXPORT_SYMBOL_GPL(kmemleak_free); +/* + * Partial memory freeing function callback. This function is usually called + * from bootmem allocator when (part of) a memory block is freed. + */ +void kmemleak_free_part(const void *ptr, size_t size) +{ + pr_debug("%s(0x%p)\n", __func__, ptr); + + if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + delete_object_part((unsigned long)ptr, size); + else if (atomic_read(&kmemleak_early_log)) + log_early(KMEMLEAK_FREE_PART, ptr, size, 0, 0, 0); +} +EXPORT_SYMBOL_GPL(kmemleak_free_part); + /* * Mark an already allocated memory block as a false positive. This will cause * the block to no longer be reported as leak and always be scanned. @@ -1318,7 +1382,7 @@ static int kmemleak_cleanup_thread(void *arg) rcu_read_lock(); list_for_each_entry_rcu(object, &object_list, object_list) - delete_object(object->pointer); + delete_object_full(object->pointer); rcu_read_unlock(); mutex_unlock(&scan_mutex); @@ -1413,6 +1477,9 @@ void __init kmemleak_init(void) case KMEMLEAK_FREE: kmemleak_free(log->ptr); break; + case KMEMLEAK_FREE_PART: + kmemleak_free_part(log->ptr, log->size); + break; case KMEMLEAK_NOT_LEAK: kmemleak_not_leak(log->ptr); break; -- cgit v1.2.3-59-g8ed1b From e9bf0cc7cbfbf3952cdf8028aa0d348d09ecdba1 Mon Sep 17 00:00:00 2001 From: Parag Warudkar Date: Wed, 8 Jul 2009 11:46:02 -0400 Subject: elfcore.h : Fix UML build breakage Commit a65e7bfcd74e4c0939f235d2bf9f48ddb3a57991 broke the UML build with the following error - In file included from fs/proc/kcore.c:17: include/linux/elfcore.h: In function 'elf_core_copy_task_regs': include/linux/elfcore.h:129: error: implicit declaration of function 'task_pt_regs' Fix this by restoring the previous behavior of returning 0 for all arches like UML that don't define task_pt_regs. Signed-off-by: Parag Warudkar Acked-by: Amerigo Wang Signed-off-by: Linus Torvalds --- include/linux/elfcore.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/elfcore.h b/include/linux/elfcore.h index 03ec16779802..00d6a68d0421 100644 --- a/include/linux/elfcore.h +++ b/include/linux/elfcore.h @@ -122,10 +122,9 @@ static inline void elf_core_copy_kernel_regs(elf_gregset_t *elfregs, struct pt_r static inline int elf_core_copy_task_regs(struct task_struct *t, elf_gregset_t* elfregs) { -#ifdef ELF_CORE_COPY_TASK_REGS - +#if defined (ELF_CORE_COPY_TASK_REGS) return ELF_CORE_COPY_TASK_REGS(t, elfregs); -#else +#elif defined (task_pt_regs) elf_core_copy_regs(elfregs, task_pt_regs(t)); #endif return 0; -- cgit v1.2.3-59-g8ed1b From b43f3cbd21ffbd719fd4fa6642bfe6af255ded34 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 8 Jul 2009 01:54:37 +0400 Subject: headers: mnt_namespace.h redux Fix various silly problems wrt mnt_namespace.h: - exit_mnt_ns() isn't used, remove it - done that, sched.h and nsproxy.h inclusions aren't needed - mount.h inclusion was need for vfsmount_lock, but no longer - remove mnt_namespace.h inclusion from files which don't use anything from mnt_namespace.h Signed-off-by: Alexey Dobriyan Signed-off-by: Linus Torvalds --- fs/afs/mntpt.c | 1 - fs/namespace.c | 1 + fs/nfs/getroot.c | 1 - fs/reiserfs/super.c | 1 - include/linux/mnt_namespace.h | 13 ++----------- kernel/exit.c | 1 - kernel/fork.c | 1 - kernel/kmod.c | 1 - 8 files changed, 3 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index c52be53f6946..5ffb570cd3a8 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -17,7 +17,6 @@ #include #include #include -#include #include "internal.h" diff --git a/fs/namespace.c b/fs/namespace.c index 3dc283fd4716..277c28a63ead 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index 46177cb87064..b35d2a616066 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index d3aeb061612b..7adea74d6a8a 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h index 3beb2592b03f..d74785c2393a 100644 --- a/include/linux/mnt_namespace.h +++ b/include/linux/mnt_namespace.h @@ -2,10 +2,9 @@ #define _NAMESPACE_H_ #ifdef __KERNEL__ -#include -#include -#include +#include #include +#include struct mnt_namespace { atomic_t count; @@ -28,14 +27,6 @@ extern struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt); extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *, struct fs_struct *); extern void put_mnt_ns(struct mnt_namespace *ns); - -static inline void exit_mnt_ns(struct task_struct *p) -{ - struct mnt_namespace *ns = p->nsproxy->mnt_ns; - if (ns) - put_mnt_ns(ns); -} - static inline void get_mnt_ns(struct mnt_namespace *ns) { atomic_inc(&ns->count); diff --git a/kernel/exit.c b/kernel/exit.c index 628d41f0dd54..869dc221733e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/fork.c b/kernel/fork.c index 467746b3f0aa..bd2959228871 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/kmod.c b/kernel/kmod.c index 7e95bedb2bfc..385c31a1bdbf 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include -- cgit v1.2.3-59-g8ed1b From 1ce822fa04fd6878f079461a4b8affe4bb5ec27b Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 8 Jul 2009 21:25:54 +0530 Subject: includecheck fix: include/linux, rfkill.h fix the following 'make includecheck' warning: include/linux/rfkill.h: linux/types.h is included more than once. Signed-off-by: Jaswinder Singh Rajput Signed-off-by: John W. Linville --- include/linux/rfkill.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h index e73e2429a1b1..2ce29831feb6 100644 --- a/include/linux/rfkill.h +++ b/include/linux/rfkill.h @@ -99,7 +99,6 @@ enum rfkill_user_states { #undef RFKILL_STATE_UNBLOCKED #undef RFKILL_STATE_HARD_BLOCKED -#include #include #include #include -- cgit v1.2.3-59-g8ed1b From a57de0b4336e48db2811a2030bb68dba8dd09d88 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 8 Jul 2009 12:09:13 +0000 Subject: net: adding memory barrier to the poll and receive callbacks Adding memory barrier after the poll_wait function, paired with receive callbacks. Adding fuctions sock_poll_wait and sk_has_sleeper to wrap the memory barrier. Without the memory barrier, following race can happen. The race fires, when following code paths meet, and the tp->rcv_nxt and __add_wait_queue updates stay in CPU caches. CPU1 CPU2 sys_select receive packet ... ... __add_wait_queue update tp->rcv_nxt ... ... tp->rcv_nxt check sock_def_readable ... { schedule ... if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) wake_up_interruptible(sk->sk_sleep) ... } If there was no cache the code would work ok, since the wait_queue and rcv_nxt are opposit to each other. Meaning that once tp->rcv_nxt is updated by CPU2, the CPU1 either already passed the tp->rcv_nxt check and sleeps, or will get the new value for tp->rcv_nxt and will return with new data mask. In both cases the process (CPU1) is being added to the wait queue, so the waitqueue_active (CPU2) call cannot miss and will wake up CPU1. The bad case is when the __add_wait_queue changes done by CPU1 stay in its cache, and so does the tp->rcv_nxt update on CPU2 side. The CPU1 will then endup calling schedule and sleep forever if there are no more data on the socket. Calls to poll_wait in following modules were ommited: net/bluetooth/af_bluetooth.c net/irda/af_irda.c net/irda/irnet/irnet_ppp.c net/mac80211/rc80211_pid_debugfs.c net/phonet/socket.c net/rds/af_rds.c net/rfkill/core.c net/sunrpc/cache.c net/sunrpc/rpc_pipe.c net/tipc/socket.c Signed-off-by: Jiri Olsa Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++ net/atm/common.c | 6 ++--- net/core/datagram.c | 2 +- net/core/sock.c | 8 +++---- net/dccp/output.c | 2 +- net/dccp/proto.c | 2 +- net/ipv4/tcp.c | 2 +- net/iucv/af_iucv.c | 4 ++-- net/rxrpc/af_rxrpc.c | 4 ++-- net/unix/af_unix.c | 8 +++---- 10 files changed, 85 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 352f06bbd7a9..4eb8409249f6 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -54,6 +54,7 @@ #include #include +#include #include #include @@ -1241,6 +1242,71 @@ static inline int sk_has_allocations(const struct sock *sk) return sk_wmem_alloc_get(sk) || sk_rmem_alloc_get(sk); } +/** + * sk_has_sleeper - check if there are any waiting processes + * @sk: socket + * + * Returns true if socket has waiting processes + * + * The purpose of the sk_has_sleeper and sock_poll_wait is to wrap the memory + * barrier call. They were added due to the race found within the tcp code. + * + * Consider following tcp code paths: + * + * CPU1 CPU2 + * + * sys_select receive packet + * ... ... + * __add_wait_queue update tp->rcv_nxt + * ... ... + * tp->rcv_nxt check sock_def_readable + * ... { + * schedule ... + * if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + * wake_up_interruptible(sk->sk_sleep) + * ... + * } + * + * The race for tcp fires when the __add_wait_queue changes done by CPU1 stay + * in its cache, and so does the tp->rcv_nxt update on CPU2 side. The CPU1 + * could then endup calling schedule and sleep forever if there are no more + * data on the socket. + */ +static inline int sk_has_sleeper(struct sock *sk) +{ + /* + * We need to be sure we are in sync with the + * add_wait_queue modifications to the wait queue. + * + * This memory barrier is paired in the sock_poll_wait. + */ + smp_mb(); + return sk->sk_sleep && waitqueue_active(sk->sk_sleep); +} + +/** + * sock_poll_wait - place memory barrier behind the poll_wait call. + * @filp: file + * @wait_address: socket wait queue + * @p: poll_table + * + * See the comments in the sk_has_sleeper function. + */ +static inline void sock_poll_wait(struct file *filp, + wait_queue_head_t *wait_address, poll_table *p) +{ + if (p && wait_address) { + poll_wait(filp, wait_address, p); + /* + * We need to be sure we are in sync with the + * socket flags modification. + * + * This memory barrier is paired in the sk_has_sleeper. + */ + smp_mb(); + } +} + /* * Queue a received datagram if it will fit. Stream and sequenced * protocols can't normally use this as they need to fit buffers in diff --git a/net/atm/common.c b/net/atm/common.c index c1c97936192c..8c4d843eb17f 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -92,7 +92,7 @@ static void vcc_sock_destruct(struct sock *sk) static void vcc_def_wakeup(struct sock *sk) { read_lock(&sk->sk_callback_lock); - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + if (sk_has_sleeper(sk)) wake_up(sk->sk_sleep); read_unlock(&sk->sk_callback_lock); } @@ -110,7 +110,7 @@ static void vcc_write_space(struct sock *sk) read_lock(&sk->sk_callback_lock); if (vcc_writable(sk)) { - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + if (sk_has_sleeper(sk)) wake_up_interruptible(sk->sk_sleep); sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); @@ -594,7 +594,7 @@ unsigned int vcc_poll(struct file *file, struct socket *sock, poll_table *wait) struct atm_vcc *vcc; unsigned int mask; - poll_wait(file, sk->sk_sleep, wait); + sock_poll_wait(file, sk->sk_sleep, wait); mask = 0; vcc = ATM_SD(sock); diff --git a/net/core/datagram.c b/net/core/datagram.c index 58abee1f1df1..b0fe69211eef 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -712,7 +712,7 @@ unsigned int datagram_poll(struct file *file, struct socket *sock, struct sock *sk = sock->sk; unsigned int mask; - poll_wait(file, sk->sk_sleep, wait); + sock_poll_wait(file, sk->sk_sleep, wait); mask = 0; /* exceptional events? */ diff --git a/net/core/sock.c b/net/core/sock.c index b0ba569bc973..6354863b1c68 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1715,7 +1715,7 @@ EXPORT_SYMBOL(sock_no_sendpage); static void sock_def_wakeup(struct sock *sk) { read_lock(&sk->sk_callback_lock); - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + if (sk_has_sleeper(sk)) wake_up_interruptible_all(sk->sk_sleep); read_unlock(&sk->sk_callback_lock); } @@ -1723,7 +1723,7 @@ static void sock_def_wakeup(struct sock *sk) static void sock_def_error_report(struct sock *sk) { read_lock(&sk->sk_callback_lock); - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + if (sk_has_sleeper(sk)) wake_up_interruptible_poll(sk->sk_sleep, POLLERR); sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR); read_unlock(&sk->sk_callback_lock); @@ -1732,7 +1732,7 @@ static void sock_def_error_report(struct sock *sk) static void sock_def_readable(struct sock *sk, int len) { read_lock(&sk->sk_callback_lock); - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + if (sk_has_sleeper(sk)) wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN | POLLRDNORM | POLLRDBAND); sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN); @@ -1747,7 +1747,7 @@ static void sock_def_write_space(struct sock *sk) * progress. --DaveM */ if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) { - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + if (sk_has_sleeper(sk)) wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT | POLLWRNORM | POLLWRBAND); diff --git a/net/dccp/output.c b/net/dccp/output.c index c0e88c16d088..c96119fda688 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -196,7 +196,7 @@ void dccp_write_space(struct sock *sk) { read_lock(&sk->sk_callback_lock); - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + if (sk_has_sleeper(sk)) wake_up_interruptible(sk->sk_sleep); /* Should agree with poll, otherwise some programs break */ if (sock_writeable(sk)) diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 314a1b5c033c..94ca8eaace7d 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -311,7 +311,7 @@ unsigned int dccp_poll(struct file *file, struct socket *sock, unsigned int mask; struct sock *sk = sock->sk; - poll_wait(file, sk->sk_sleep, wait); + sock_poll_wait(file, sk->sk_sleep, wait); if (sk->sk_state == DCCP_LISTEN) return inet_csk_listen_poll(sk); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 7870a535dac6..91145244ea63 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -339,7 +339,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) struct sock *sk = sock->sk; struct tcp_sock *tp = tcp_sk(sk); - poll_wait(file, sk->sk_sleep, wait); + sock_poll_wait(file, sk->sk_sleep, wait); if (sk->sk_state == TCP_LISTEN) return inet_csk_listen_poll(sk); diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index 6be5f92d1094..49c15b48408e 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -306,7 +306,7 @@ static inline int iucv_below_msglim(struct sock *sk) static void iucv_sock_wake_msglim(struct sock *sk) { read_lock(&sk->sk_callback_lock); - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + if (sk_has_sleeper(sk)) wake_up_interruptible_all(sk->sk_sleep); sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); read_unlock(&sk->sk_callback_lock); @@ -1256,7 +1256,7 @@ unsigned int iucv_sock_poll(struct file *file, struct socket *sock, struct sock *sk = sock->sk; unsigned int mask = 0; - poll_wait(file, sk->sk_sleep, wait); + sock_poll_wait(file, sk->sk_sleep, wait); if (sk->sk_state == IUCV_LISTEN) return iucv_accept_poll(sk); diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index eac5e7bb7365..bfe493ebf27c 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -63,7 +63,7 @@ static void rxrpc_write_space(struct sock *sk) _enter("%p", sk); read_lock(&sk->sk_callback_lock); if (rxrpc_writable(sk)) { - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + if (sk_has_sleeper(sk)) wake_up_interruptible(sk->sk_sleep); sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); } @@ -588,7 +588,7 @@ static unsigned int rxrpc_poll(struct file *file, struct socket *sock, unsigned int mask; struct sock *sk = sock->sk; - poll_wait(file, sk->sk_sleep, wait); + sock_poll_wait(file, sk->sk_sleep, wait); mask = 0; /* the socket is readable if there are any messages waiting on the Rx diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 36d4e44d6233..fc3ebb906911 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -315,7 +315,7 @@ static void unix_write_space(struct sock *sk) { read_lock(&sk->sk_callback_lock); if (unix_writable(sk)) { - if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) + if (sk_has_sleeper(sk)) wake_up_interruptible_sync(sk->sk_sleep); sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); } @@ -1985,7 +1985,7 @@ static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table struct sock *sk = sock->sk; unsigned int mask; - poll_wait(file, sk->sk_sleep, wait); + sock_poll_wait(file, sk->sk_sleep, wait); mask = 0; /* exceptional events? */ @@ -2022,7 +2022,7 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock, struct sock *sk = sock->sk, *other; unsigned int mask, writable; - poll_wait(file, sk->sk_sleep, wait); + sock_poll_wait(file, sk->sk_sleep, wait); mask = 0; /* exceptional events? */ @@ -2053,7 +2053,7 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock, other = unix_peer_get(sk); if (other) { if (unix_peer(other) != sk) { - poll_wait(file, &unix_sk(other)->peer_wait, + sock_poll_wait(file, &unix_sk(other)->peer_wait, wait); if (unix_recvq_full(other)) writable = 0; -- cgit v1.2.3-59-g8ed1b From ad46276952f1af34cd91d46d49ba13d347d56367 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Wed, 8 Jul 2009 12:10:31 +0000 Subject: memory barrier: adding smp_mb__after_lock Adding smp_mb__after_lock define to be used as a smp_mb call after a lock. Making it nop for x86, since {read|write|spin}_lock() on x86 are full memory barriers. Signed-off-by: Jiri Olsa Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- arch/x86/include/asm/spinlock.h | 4 ++++ include/linux/spinlock.h | 5 +++++ include/net/sock.h | 5 ++++- 3 files changed, 13 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index b7e5db876399..4e77853321db 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -302,4 +302,8 @@ static inline void __raw_write_unlock(raw_rwlock_t *rw) #define _raw_read_relax(lock) cpu_relax() #define _raw_write_relax(lock) cpu_relax() +/* The {read|write|spin}_lock() on x86 are full memory barriers. */ +static inline void smp_mb__after_lock(void) { } +#define ARCH_HAS_SMP_MB_AFTER_LOCK + #endif /* _ASM_X86_SPINLOCK_H */ diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 252b245cfcf4..4be57ab03478 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -132,6 +132,11 @@ do { \ #endif /*__raw_spin_is_contended*/ #endif +/* The lock does not imply full memory barrier. */ +#ifndef ARCH_HAS_SMP_MB_AFTER_LOCK +static inline void smp_mb__after_lock(void) { smp_mb(); } +#endif + /** * spin_unlock_wait - wait until the spinlock gets unlocked * @lock: the spinlock in question. diff --git a/include/net/sock.h b/include/net/sock.h index 4eb8409249f6..2c0da9239b95 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1271,6 +1271,9 @@ static inline int sk_has_allocations(const struct sock *sk) * in its cache, and so does the tp->rcv_nxt update on CPU2 side. The CPU1 * could then endup calling schedule and sleep forever if there are no more * data on the socket. + * + * The sk_has_sleeper is always called right after a call to read_lock, so we + * can use smp_mb__after_lock barrier. */ static inline int sk_has_sleeper(struct sock *sk) { @@ -1280,7 +1283,7 @@ static inline int sk_has_sleeper(struct sock *sk) * * This memory barrier is paired in the sock_poll_wait. */ - smp_mb(); + smp_mb__after_lock(); return sk->sk_sleep && waitqueue_active(sk->sk_sleep); } -- cgit v1.2.3-59-g8ed1b From 6ff7041dbfeb3bd7dfe9aa67275c21199ef760d6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Jul 2009 14:57:05 +0200 Subject: hrtimer: Fix migration expiry check The timer migration expiry check should prevent the migration of a timer to another CPU when the timer expires before the next event is scheduled on the other CPU. Migrating the timer might delay it because we can not reprogram the clock event device on the other CPU. But the code implementing that check has two flaws: - for !HIGHRES the check compares the expiry value with the clock events device expiry value which is wrong for CLOCK_REALTIME based timers. - the check is racy. It holds the hrtimer base lock of the target CPU, but the clock event device expiry value can be modified nevertheless, e.g. by an timer interrupt firing. The !HIGHRES case is easy to fix as we can enqueue the timer on the cpu which was selected by the load balancer. It runs the idle balancing code once per jiffy anyway. So the maximum delay for the timer is the same as when we keep the tick on the current cpu going. In the HIGHRES case we can get the next expiry value from the hrtimer cpu_base of the target CPU and serialize the update with the cpu_base lock. This moves the lock section in hrtimer_interrupt() so we can set next_event to KTIME_MAX while we are handling the expired timers and set it to the next expiry value after we handled the timers under the base lock. While the expired timers are processed timer migration is blocked because the expiry time of the timer is always <= KTIME_MAX. Also remove the now useless clockevents_get_next_event() function. Signed-off-by: Thomas Gleixner --- include/linux/clockchips.h | 9 ---- kernel/hrtimer.c | 121 ++++++++++++++++++++++++--------------------- kernel/time/clockevents.c | 11 ----- 3 files changed, 64 insertions(+), 77 deletions(-) (limited to 'include') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 20a100fe2b4f..3a1dbba4d3ae 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -143,12 +143,3 @@ extern void clockevents_notify(unsigned long reason, void *arg); #endif #endif - -#ifdef CONFIG_GENERIC_CLOCKEVENTS -extern ktime_t clockevents_get_next_event(int cpu); -#else -static inline ktime_t clockevents_get_next_event(int cpu) -{ - return (ktime_t) { .tv64 = KTIME_MAX }; -} -#endif diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 126b9808f287..49da79ab8486 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -191,6 +191,46 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, } } + +/* + * Get the preferred target CPU for NOHZ + */ +static int hrtimer_get_target(int this_cpu, int pinned) +{ +#ifdef CONFIG_NO_HZ + if (!pinned && get_sysctl_timer_migration() && idle_cpu(this_cpu)) { + int preferred_cpu = get_nohz_load_balancer(); + + if (preferred_cpu >= 0) + return preferred_cpu; + } +#endif + return this_cpu; +} + +/* + * With HIGHRES=y we do not migrate the timer when it is expiring + * before the next event on the target cpu because we cannot reprogram + * the target cpu hardware and we would cause it to fire late. + * + * Called with cpu_base->lock of target cpu held. + */ +static int +hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) +{ +#ifdef CONFIG_HIGH_RES_TIMERS + ktime_t expires; + + if (!new_base->cpu_base->hres_active) + return 0; + + expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset); + return expires.tv64 <= new_base->cpu_base->expires_next.tv64; +#else + return 0; +#endif +} + /* * Switch the timer base to the current CPU when possible. */ @@ -200,27 +240,8 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, { struct hrtimer_clock_base *new_base; struct hrtimer_cpu_base *new_cpu_base; - int cpu, preferred_cpu = -1; - - cpu = smp_processor_id(); -#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP) - if (!pinned && get_sysctl_timer_migration() && idle_cpu(cpu)) { - preferred_cpu = get_nohz_load_balancer(); - if (preferred_cpu >= 0) { - /* - * We must not check the expiry value when - * preferred_cpu is the current cpu. If base - * != new_base we would loop forever when the - * timer expires before the current programmed - * next timer event. - */ - if (preferred_cpu != cpu) - cpu = preferred_cpu; - else - preferred_cpu = -1; - } - } -#endif + int this_cpu = smp_processor_id(); + int cpu = hrtimer_get_target(this_cpu, pinned); again: new_cpu_base = &per_cpu(hrtimer_bases, cpu); @@ -228,7 +249,7 @@ again: if (base != new_base) { /* - * We are trying to schedule the timer on the local CPU. + * We are trying to move timer to new_base. * However we can't change timer's base while it is running, * so we keep it on the same CPU. No hassle vs. reprogramming * the event source in the high resolution case. The softirq @@ -244,38 +265,12 @@ again: spin_unlock(&base->cpu_base->lock); spin_lock(&new_base->cpu_base->lock); - /* Optimized away for NOHZ=n SMP=n */ - if (cpu == preferred_cpu) { - /* Calculate clock monotonic expiry time */ -#ifdef CONFIG_HIGH_RES_TIMERS - ktime_t expires = ktime_sub(hrtimer_get_expires(timer), - new_base->offset); -#else - ktime_t expires = hrtimer_get_expires(timer); -#endif - - /* - * Get the next event on target cpu from the - * clock events layer. - * This covers the highres=off nohz=on case as well. - */ - ktime_t next = clockevents_get_next_event(cpu); - - ktime_t delta = ktime_sub(expires, next); - - /* - * We do not migrate the timer when it is expiring - * before the next event on the target cpu because - * we cannot reprogram the target cpu hardware and - * we would cause it to fire late. - */ - if (delta.tv64 < 0) { - cpu = smp_processor_id(); - spin_unlock(&new_base->cpu_base->lock); - spin_lock(&base->cpu_base->lock); - timer->base = base; - goto again; - } + if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) { + cpu = this_cpu; + spin_unlock(&new_base->cpu_base->lock); + spin_lock(&base->cpu_base->lock); + timer->base = base; + goto again; } timer->base = new_base; } @@ -1287,14 +1282,22 @@ void hrtimer_interrupt(struct clock_event_device *dev) expires_next.tv64 = KTIME_MAX; + spin_lock(&cpu_base->lock); + /* + * We set expires_next to KTIME_MAX here with cpu_base->lock + * held to prevent that a timer is enqueued in our queue via + * the migration code. This does not affect enqueueing of + * timers which run their callback and need to be requeued on + * this CPU. + */ + cpu_base->expires_next.tv64 = KTIME_MAX; + base = cpu_base->clock_base; for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { ktime_t basenow; struct rb_node *node; - spin_lock(&cpu_base->lock); - basenow = ktime_add(now, base->offset); while ((node = base->first)) { @@ -1327,11 +1330,15 @@ void hrtimer_interrupt(struct clock_event_device *dev) __run_hrtimer(timer); } - spin_unlock(&cpu_base->lock); base++; } + /* + * Store the new expiry value so the migration code can verify + * against it. + */ cpu_base->expires_next = expires_next; + spin_unlock(&cpu_base->lock); /* Reprogramming necessary ? */ if (expires_next.tv64 != KTIME_MAX) { diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 1ad6dd461119..a6dcd67b041d 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -254,15 +254,4 @@ void clockevents_notify(unsigned long reason, void *arg) spin_unlock(&clockevents_lock); } EXPORT_SYMBOL_GPL(clockevents_notify); - -ktime_t clockevents_get_next_event(int cpu) -{ - struct tick_device *td; - struct clock_event_device *dev; - - td = &per_cpu(tick_cpu_device, cpu); - dev = td->evtdev; - - return dev->next_event; -} #endif -- cgit v1.2.3-59-g8ed1b From f9f868dbcca961ed62f1df1d114abd0c38c47dce Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Thu, 9 Jul 2009 11:24:26 +0200 Subject: timer stats: fix quick check optimization git commit 507e1231 "timer stats: Optimize by adding quick check to avoid function calls" added one wrong check so that one unnecessary function call isn't elimated. time_stats_account_hrtimer() checks if timer->start_pid isn't initialized in order to find out if timer_stats_update_stats() should be called. However start_pid is initialized with -1 instead of 0, so that the function call always happens. Check timer->start_site like in timer_stats_account_timer() to fix this. Signed-off-by: Heiko Carstens Signed-off-by: Thomas Gleixner --- include/linux/hrtimer.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 54648e625efd..4759917adc71 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -448,7 +448,7 @@ extern void timer_stats_update_stats(void *timer, pid_t pid, void *startf, static inline void timer_stats_account_hrtimer(struct hrtimer *timer) { - if (likely(!timer->start_pid)) + if (likely(!timer->start_site)) return; timer_stats_update_stats(timer, timer->start_pid, timer->start_site, timer->function, timer->start_comm, 0); -- cgit v1.2.3-59-g8ed1b From 8aa7e847d834ed937a9ad37a0f2ad5b8584c1ab0 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 9 Jul 2009 14:52:32 +0200 Subject: Fix congestion_wait() sync/async vs read/write confusion Commit 1faa16d22877f4839bd433547d770c676d1d964c accidentally broke the bdi congestion wait queue logic, causing us to wait on congestion for WRITE (== 1) when we really wanted BLK_RW_ASYNC (== 0) instead. Signed-off-by: Jens Axboe --- arch/x86/lib/usercopy_32.c | 2 +- drivers/block/pktcdvd.c | 10 ++++++---- drivers/md/dm-crypt.c | 2 +- fs/fat/file.c | 2 +- fs/fuse/dev.c | 8 ++++---- fs/nfs/write.c | 8 +++++--- fs/reiserfs/journal.c | 2 +- fs/xfs/linux-2.6/kmem.c | 4 ++-- fs/xfs/linux-2.6/xfs_buf.c | 2 +- include/linux/backing-dev.h | 6 +++--- include/linux/blkdev.h | 8 ++++---- mm/backing-dev.c | 7 +++---- mm/memcontrol.c | 2 +- mm/page-writeback.c | 8 ++++---- mm/page_alloc.c | 4 ++-- mm/vmscan.c | 8 ++++---- 16 files changed, 43 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 7c8ca91bb9ec..1f118d462acc 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -751,7 +751,7 @@ survive: if (retval == -ENOMEM && is_global_init(current)) { up_read(¤t->mm->mmap_sem); - congestion_wait(WRITE, HZ/50); + congestion_wait(BLK_RW_ASYNC, HZ/50); goto survive; } diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 83650e00632d..99a506f619b7 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -1372,8 +1372,10 @@ try_next_bio: wakeup = (pd->write_congestion_on > 0 && pd->bio_queue_size <= pd->write_congestion_off); spin_unlock(&pd->lock); - if (wakeup) - clear_bdi_congested(&pd->disk->queue->backing_dev_info, WRITE); + if (wakeup) { + clear_bdi_congested(&pd->disk->queue->backing_dev_info, + BLK_RW_ASYNC); + } pkt->sleep_time = max(PACKET_WAIT_TIME, 1); pkt_set_state(pkt, PACKET_WAITING_STATE); @@ -2592,10 +2594,10 @@ static int pkt_make_request(struct request_queue *q, struct bio *bio) spin_lock(&pd->lock); if (pd->write_congestion_on > 0 && pd->bio_queue_size >= pd->write_congestion_on) { - set_bdi_congested(&q->backing_dev_info, WRITE); + set_bdi_congested(&q->backing_dev_info, BLK_RW_ASYNC); do { spin_unlock(&pd->lock); - congestion_wait(WRITE, HZ); + congestion_wait(BLK_RW_ASYNC, HZ); spin_lock(&pd->lock); } while(pd->bio_queue_size > pd->write_congestion_off); } diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 9933eb861c71..529e2ba505c3 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -776,7 +776,7 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) * But don't wait if split was due to the io size restriction */ if (unlikely(out_of_pages)) - congestion_wait(WRITE, HZ/100); + congestion_wait(BLK_RW_ASYNC, HZ/100); /* * With async crypto it is unsafe to share the crypto context diff --git a/fs/fat/file.c b/fs/fat/file.c index b28ea646ff60..f042b965c95c 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -134,7 +134,7 @@ static int fat_file_release(struct inode *inode, struct file *filp) if ((filp->f_mode & FMODE_WRITE) && MSDOS_SB(inode->i_sb)->options.flush) { fat_flush_inodes(inode->i_sb, inode, NULL); - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); } return 0; } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index f58ecbc416c8..6484eb75acd6 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -286,8 +286,8 @@ __releases(&fc->lock) } if (fc->num_background == FUSE_CONGESTION_THRESHOLD && fc->connected && fc->bdi_initialized) { - clear_bdi_congested(&fc->bdi, READ); - clear_bdi_congested(&fc->bdi, WRITE); + clear_bdi_congested(&fc->bdi, BLK_RW_SYNC); + clear_bdi_congested(&fc->bdi, BLK_RW_ASYNC); } fc->num_background--; fc->active_background--; @@ -414,8 +414,8 @@ static void fuse_request_send_nowait_locked(struct fuse_conn *fc, fc->blocked = 1; if (fc->num_background == FUSE_CONGESTION_THRESHOLD && fc->bdi_initialized) { - set_bdi_congested(&fc->bdi, READ); - set_bdi_congested(&fc->bdi, WRITE); + set_bdi_congested(&fc->bdi, BLK_RW_SYNC); + set_bdi_congested(&fc->bdi, BLK_RW_ASYNC); } list_add_tail(&req->list, &fc->bg_queue); flush_bg_queue(fc); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index ce728829f79a..0a0a2ff767c3 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -202,8 +202,10 @@ static int nfs_set_page_writeback(struct page *page) struct nfs_server *nfss = NFS_SERVER(inode); if (atomic_long_inc_return(&nfss->writeback) > - NFS_CONGESTION_ON_THRESH) - set_bdi_congested(&nfss->backing_dev_info, WRITE); + NFS_CONGESTION_ON_THRESH) { + set_bdi_congested(&nfss->backing_dev_info, + BLK_RW_ASYNC); + } } return ret; } @@ -215,7 +217,7 @@ static void nfs_end_page_writeback(struct page *page) end_page_writeback(page); if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) - clear_bdi_congested(&nfss->backing_dev_info, WRITE); + clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); } /* diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 77f5bb746bf0..90622200b39c 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -997,7 +997,7 @@ static int reiserfs_async_progress_wait(struct super_block *s) DEFINE_WAIT(wait); struct reiserfs_journal *j = SB_JOURNAL(s); if (atomic_read(&j->j_async_throttle)) - congestion_wait(WRITE, HZ / 10); + congestion_wait(BLK_RW_ASYNC, HZ / 10); return 0; } diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c index 1cd3b55ee3d2..2d3f90afe5f1 100644 --- a/fs/xfs/linux-2.6/kmem.c +++ b/fs/xfs/linux-2.6/kmem.c @@ -53,7 +53,7 @@ kmem_alloc(size_t size, unsigned int __nocast flags) printk(KERN_ERR "XFS: possible memory allocation " "deadlock in %s (mode:0x%x)\n", __func__, lflags); - congestion_wait(WRITE, HZ/50); + congestion_wait(BLK_RW_ASYNC, HZ/50); } while (1); } @@ -130,7 +130,7 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags) printk(KERN_ERR "XFS: possible memory allocation " "deadlock in %s (mode:0x%x)\n", __func__, lflags); - congestion_wait(WRITE, HZ/50); + congestion_wait(BLK_RW_ASYNC, HZ/50); } while (1); } diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 1418b916fc27..0c93c7ef3d18 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -412,7 +412,7 @@ _xfs_buf_lookup_pages( XFS_STATS_INC(xb_page_retries); xfsbufd_wakeup(0, gfp_mask); - congestion_wait(WRITE, HZ/50); + congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry; } diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 0ec2c594868e..3a52a63c1351 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -229,9 +229,9 @@ static inline int bdi_rw_congested(struct backing_dev_info *bdi) (1 << BDI_async_congested)); } -void clear_bdi_congested(struct backing_dev_info *bdi, int rw); -void set_bdi_congested(struct backing_dev_info *bdi, int rw); -long congestion_wait(int rw, long timeout); +void clear_bdi_congested(struct backing_dev_info *bdi, int sync); +void set_bdi_congested(struct backing_dev_info *bdi, int sync); +long congestion_wait(int sync, long timeout); static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 49ae07951d55..bb3d39978701 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -779,18 +779,18 @@ extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, * congested queues, and wake up anyone who was waiting for requests to be * put back. */ -static inline void blk_clear_queue_congested(struct request_queue *q, int rw) +static inline void blk_clear_queue_congested(struct request_queue *q, int sync) { - clear_bdi_congested(&q->backing_dev_info, rw); + clear_bdi_congested(&q->backing_dev_info, sync); } /* * A queue has just entered congestion. Flag that in the queue's VM-visible * state flags and increment the global gounter of congested queues. */ -static inline void blk_set_queue_congested(struct request_queue *q, int rw) +static inline void blk_set_queue_congested(struct request_queue *q, int sync) { - set_bdi_congested(&q->backing_dev_info, rw); + set_bdi_congested(&q->backing_dev_info, sync); } extern void blk_start_queue(struct request_queue *q); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 493b468a5035..c86edd244294 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -283,7 +283,6 @@ static wait_queue_head_t congestion_wqh[2] = { __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) }; - void clear_bdi_congested(struct backing_dev_info *bdi, int sync) { enum bdi_state bit; @@ -308,18 +307,18 @@ EXPORT_SYMBOL(set_bdi_congested); /** * congestion_wait - wait for a backing_dev to become uncongested - * @rw: READ or WRITE + * @sync: SYNC or ASYNC IO * @timeout: timeout in jiffies * * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit * write congestion. If no backing_devs are congested then just wait for the * next write to be completed. */ -long congestion_wait(int rw, long timeout) +long congestion_wait(int sync, long timeout) { long ret; DEFINE_WAIT(wait); - wait_queue_head_t *wqh = &congestion_wqh[rw]; + wait_queue_head_t *wqh = &congestion_wqh[sync]; prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); ret = io_schedule_timeout(timeout); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e2fa20dadf40..e717964cb5a0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1973,7 +1973,7 @@ try_to_free: if (!progress) { nr_retries--; /* maybe some writeback is necessary */ - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); } } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7687879253b9..81627ebcd313 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -575,7 +575,7 @@ static void balance_dirty_pages(struct address_space *mapping) if (pages_written >= write_chunk) break; /* We've done our duty */ - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); } if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && @@ -669,7 +669,7 @@ void throttle_vm_writeout(gfp_t gfp_mask) if (global_page_state(NR_UNSTABLE_NFS) + global_page_state(NR_WRITEBACK) <= dirty_thresh) break; - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); /* * The caller might hold locks which can prevent IO completion @@ -715,7 +715,7 @@ static void background_writeout(unsigned long _min_pages) if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { /* Wrote less than expected */ if (wbc.encountered_congestion || wbc.more_io) - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); else break; } @@ -787,7 +787,7 @@ static void wb_kupdate(unsigned long arg) writeback_inodes(&wbc); if (wbc.nr_to_write > 0) { if (wbc.encountered_congestion || wbc.more_io) - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); else break; /* All the old data is written */ } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ad7cd1c56b07..a35eeab2724c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1666,7 +1666,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, preferred_zone, migratetype); if (!page && gfp_mask & __GFP_NOFAIL) - congestion_wait(WRITE, HZ/50); + congestion_wait(BLK_RW_ASYNC, HZ/50); } while (!page && (gfp_mask & __GFP_NOFAIL)); return page; @@ -1831,7 +1831,7 @@ rebalance: pages_reclaimed += did_some_progress; if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { /* Wait for some write requests to complete then retry */ - congestion_wait(WRITE, HZ/50); + congestion_wait(BLK_RW_ASYNC, HZ/50); goto rebalance; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 54155268dfca..dea7abd31098 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1104,7 +1104,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, */ if (nr_freed < nr_taken && !current_is_kswapd() && lumpy_reclaim) { - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); /* * The attempt at page out may have made some @@ -1721,7 +1721,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, /* Take a nap, wait for some writeback to complete */ if (sc->nr_scanned && priority < DEF_PRIORITY - 2) - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); } /* top priority shrink_zones still had more to do? don't OOM, then */ if (!sc->all_unreclaimable && scanning_global_lru(sc)) @@ -1960,7 +1960,7 @@ loop_again: * another pass across the zones. */ if (total_scanned && priority < DEF_PRIORITY - 2) - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); /* * We do this so kswapd doesn't build up large priorities for @@ -2233,7 +2233,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) goto out; if (sc.nr_scanned && prio < DEF_PRIORITY - 2) - congestion_wait(WRITE, HZ / 10); + congestion_wait(BLK_RW_ASYNC, HZ / 10); } } -- cgit v1.2.3-59-g8ed1b From ecb554a846f8e9d2a58f6d6c118168a63ac065aa Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Thu, 9 Jul 2009 14:46:53 +0200 Subject: block: fix sg SG_DXFER_TO_FROM_DEV regression I overlooked SG_DXFER_TO_FROM_DEV support when I converted sg to use the block layer mapping API (2.6.28). Douglas Gilbert explained SG_DXFER_TO_FROM_DEV: http://www.spinics.net/lists/linux-scsi/msg37135.html = The semantics of SG_DXFER_TO_FROM_DEV were: - copy user space buffer to kernel (LLD) buffer - do SCSI command which is assumed to be of the DATA_IN (data from device) variety. This would overwrite some or all of the kernel buffer - copy kernel (LLD) buffer back to the user space. The idea was to detect short reads by filling the original user space buffer with some marker bytes ("0xec" it would seem in this report). The "resid" value is a better way of detecting short reads but that was only added this century and requires co-operation from the LLD. = This patch changes the block layer mapping API to support this semantics. This simply adds another field to struct rq_map_data and enables __bio_copy_iov() to copy data from user space even with READ requests. It's better to add the flags field and kills null_mapped and the new from_user fields in struct rq_map_data but that approach makes it difficult to send this patch to stable trees because st and osst drivers use struct rq_map_data (they were converted to use the block layer in 2.6.29 and 2.6.30). Well, I should clean up the block layer mapping API. zhou sf reported this regiression and tested this patch: http://www.spinics.net/lists/linux-scsi/msg37128.html http://www.spinics.net/lists/linux-scsi/msg37168.html Reported-by: zhou sf Tested-by: zhou sf Cc: stable@kernel.org Signed-off-by: FUJITA Tomonori Signed-off-by: Jens Axboe --- drivers/scsi/sg.c | 4 ++++ fs/bio.c | 22 ++++++++++++---------- include/linux/blkdev.h | 1 + 3 files changed, 17 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 4d6f2fe1cfe9..9230402c45af 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1656,6 +1656,10 @@ static int sg_start_req(Sg_request *srp, unsigned char *cmd) md->nr_entries = req_schp->k_use_sg; md->offset = 0; md->null_mapped = hp->dxferp ? 0 : 1; + if (dxfer_dir == SG_DXFER_TO_FROM_DEV) + md->from_user = 1; + else + md->from_user = 0; } if (iov_count) { diff --git a/fs/bio.c b/fs/bio.c index 1486b19fc431..76738005c8e8 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -705,14 +705,13 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count, } static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, - struct sg_iovec *iov, int iov_count, int uncopy, - int do_free_page) + struct sg_iovec *iov, int iov_count, + int to_user, int from_user, int do_free_page) { int ret = 0, i; struct bio_vec *bvec; int iov_idx = 0; unsigned int iov_off = 0; - int read = bio_data_dir(bio) == READ; __bio_for_each_segment(bvec, bio, i, 0) { char *bv_addr = page_address(bvec->bv_page); @@ -727,13 +726,14 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, iov_addr = iov[iov_idx].iov_base + iov_off; if (!ret) { - if (!read && !uncopy) - ret = copy_from_user(bv_addr, iov_addr, - bytes); - if (read && uncopy) + if (to_user) ret = copy_to_user(iov_addr, bv_addr, bytes); + if (from_user) + ret = copy_from_user(bv_addr, iov_addr, + bytes); + if (ret) ret = -EFAULT; } @@ -770,7 +770,8 @@ int bio_uncopy_user(struct bio *bio) if (!bio_flagged(bio, BIO_NULL_MAPPED)) ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs, - bmd->nr_sgvecs, 1, bmd->is_our_pages); + bmd->nr_sgvecs, bio_data_dir(bio) == READ, + 0, bmd->is_our_pages); bio_free_map_data(bmd); bio_put(bio); return ret; @@ -875,8 +876,9 @@ struct bio *bio_copy_user_iov(struct request_queue *q, /* * success */ - if (!write_to_vm && (!map_data || !map_data->null_mapped)) { - ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 0); + if ((!write_to_vm && (!map_data || !map_data->null_mapped)) || + (map_data && map_data->from_user)) { + ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 1, 0); if (ret) goto cleanup; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index bb3d39978701..0146e0fecf1a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -723,6 +723,7 @@ struct rq_map_data { int nr_entries; unsigned long offset; int null_mapped; + int from_user; }; struct req_iterator { -- cgit v1.2.3-59-g8ed1b From c99e6efe1ba04561e7d93a81f0be07e37427e835 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 10 Jul 2009 14:57:56 +0200 Subject: sched: INIT_PREEMPT_COUNT Pull the initial preempt_count value into a single definition site. Maintainers for: alpha, ia64 and m68k, please have a look, your arch code is funny. The header magic is a bit odd, but similar to the KERNEL_DS one, CPP waits with expanding these macros until the INIT_THREAD_INFO macro itself is expanded, which is in arch/*/kernel/init_task.c where we've already included sched.h so we're good. Cc: tony.luck@intel.com Cc: rth@twiddle.net Cc: geert@linux-m68k.org Signed-off-by: Peter Zijlstra Acked-by: Matt Mackall Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/thread_info.h | 1 + arch/arm/include/asm/thread_info.h | 2 +- arch/avr32/include/asm/thread_info.h | 2 +- arch/blackfin/include/asm/thread_info.h | 2 +- arch/cris/include/asm/thread_info.h | 4 +--- arch/frv/include/asm/thread_info.h | 4 +--- arch/h8300/include/asm/thread_info.h | 2 +- arch/ia64/include/asm/thread_info.h | 2 +- arch/m32r/include/asm/thread_info.h | 4 +--- arch/m68k/include/asm/thread_info_mm.h | 1 + arch/m68k/include/asm/thread_info_no.h | 1 + arch/microblaze/include/asm/thread_info.h | 4 +--- arch/mips/include/asm/thread_info.h | 4 +--- arch/mn10300/include/asm/thread_info.h | 4 +--- arch/parisc/include/asm/thread_info.h | 2 +- arch/powerpc/include/asm/thread_info.h | 4 +--- arch/s390/include/asm/thread_info.h | 2 +- arch/sh/include/asm/thread_info.h | 2 +- arch/sparc/include/asm/thread_info_32.h | 4 +--- arch/sparc/include/asm/thread_info_64.h | 4 +--- arch/um/include/asm/thread_info.h | 2 +- arch/x86/include/asm/thread_info.h | 2 +- arch/xtensa/include/asm/thread_info.h | 4 +--- include/linux/sched.h | 6 ++++++ 24 files changed, 29 insertions(+), 40 deletions(-) (limited to 'include') diff --git a/arch/alpha/include/asm/thread_info.h b/arch/alpha/include/asm/thread_info.h index d069526bd767..60c83abfde70 100644 --- a/arch/alpha/include/asm/thread_info.h +++ b/arch/alpha/include/asm/thread_info.h @@ -37,6 +37,7 @@ struct thread_info { .task = &tsk, \ .exec_domain = &default_exec_domain, \ .addr_limit = KERNEL_DS, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .restart_block = { \ .fn = do_no_restart_syscall, \ }, \ diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h index 4f8848260ee2..73394e50cbca 100644 --- a/arch/arm/include/asm/thread_info.h +++ b/arch/arm/include/asm/thread_info.h @@ -73,7 +73,7 @@ struct thread_info { .task = &tsk, \ .exec_domain = &default_exec_domain, \ .flags = 0, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ .cpu_domain = domain_val(DOMAIN_USER, DOMAIN_MANAGER) | \ domain_val(DOMAIN_KERNEL, DOMAIN_MANAGER) | \ diff --git a/arch/avr32/include/asm/thread_info.h b/arch/avr32/include/asm/thread_info.h index 4442f8d2d423..fc42de5ca209 100644 --- a/arch/avr32/include/asm/thread_info.h +++ b/arch/avr32/include/asm/thread_info.h @@ -40,7 +40,7 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .flags = 0, \ .cpu = 0, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .restart_block = { \ .fn = do_no_restart_syscall \ } \ diff --git a/arch/blackfin/include/asm/thread_info.h b/arch/blackfin/include/asm/thread_info.h index 2920087516f2..2bbfdd950afc 100644 --- a/arch/blackfin/include/asm/thread_info.h +++ b/arch/blackfin/include/asm/thread_info.h @@ -77,7 +77,7 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .flags = 0, \ .cpu = 0, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .restart_block = { \ .fn = do_no_restart_syscall, \ }, \ diff --git a/arch/cris/include/asm/thread_info.h b/arch/cris/include/asm/thread_info.h index bc5b2935ca53..c3aade36c330 100644 --- a/arch/cris/include/asm/thread_info.h +++ b/arch/cris/include/asm/thread_info.h @@ -50,8 +50,6 @@ struct thread_info { /* * macros/functions for gaining access to the thread information structure - * - * preempt_count needs to be 1 initially, until the scheduler is functional. */ #ifndef __ASSEMBLY__ #define INIT_THREAD_INFO(tsk) \ @@ -60,7 +58,7 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .flags = 0, \ .cpu = 0, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ .restart_block = { \ .fn = do_no_restart_syscall, \ diff --git a/arch/frv/include/asm/thread_info.h b/arch/frv/include/asm/thread_info.h index e8a5ed7be021..e608e056bb53 100644 --- a/arch/frv/include/asm/thread_info.h +++ b/arch/frv/include/asm/thread_info.h @@ -56,8 +56,6 @@ struct thread_info { /* * macros/functions for gaining access to the thread information structure - * - * preempt_count needs to be 1 initially, until the scheduler is functional. */ #ifndef __ASSEMBLY__ @@ -67,7 +65,7 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .flags = 0, \ .cpu = 0, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ .restart_block = { \ .fn = do_no_restart_syscall, \ diff --git a/arch/h8300/include/asm/thread_info.h b/arch/h8300/include/asm/thread_info.h index 700014d2155f..8bbc8b0ee45d 100644 --- a/arch/h8300/include/asm/thread_info.h +++ b/arch/h8300/include/asm/thread_info.h @@ -36,7 +36,7 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .flags = 0, \ .cpu = 0, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .restart_block = { \ .fn = do_no_restart_syscall, \ }, \ diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h index ae6922626bf4..8ce2e388e37c 100644 --- a/arch/ia64/include/asm/thread_info.h +++ b/arch/ia64/include/asm/thread_info.h @@ -48,7 +48,7 @@ struct thread_info { .flags = 0, \ .cpu = 0, \ .addr_limit = KERNEL_DS, \ - .preempt_count = 0, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .restart_block = { \ .fn = do_no_restart_syscall, \ }, \ diff --git a/arch/m32r/include/asm/thread_info.h b/arch/m32r/include/asm/thread_info.h index 8589d462df27..07bb5bd00e2a 100644 --- a/arch/m32r/include/asm/thread_info.h +++ b/arch/m32r/include/asm/thread_info.h @@ -57,8 +57,6 @@ struct thread_info { /* * macros/functions for gaining access to the thread information structure - * - * preempt_count needs to be 1 initially, until the scheduler is functional. */ #ifndef __ASSEMBLY__ @@ -68,7 +66,7 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .flags = 0, \ .cpu = 0, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ .restart_block = { \ .fn = do_no_restart_syscall, \ diff --git a/arch/m68k/include/asm/thread_info_mm.h b/arch/m68k/include/asm/thread_info_mm.h index af0fda46e94b..6ea5c33b3c56 100644 --- a/arch/m68k/include/asm/thread_info_mm.h +++ b/arch/m68k/include/asm/thread_info_mm.h @@ -19,6 +19,7 @@ struct thread_info { { \ .task = &tsk, \ .exec_domain = &default_exec_domain, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .restart_block = { \ .fn = do_no_restart_syscall, \ }, \ diff --git a/arch/m68k/include/asm/thread_info_no.h b/arch/m68k/include/asm/thread_info_no.h index 82529f424ea3..c2bde5e24b0b 100644 --- a/arch/m68k/include/asm/thread_info_no.h +++ b/arch/m68k/include/asm/thread_info_no.h @@ -49,6 +49,7 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .flags = 0, \ .cpu = 0, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .restart_block = { \ .fn = do_no_restart_syscall, \ }, \ diff --git a/arch/microblaze/include/asm/thread_info.h b/arch/microblaze/include/asm/thread_info.h index 7fac44498445..6e92885d381a 100644 --- a/arch/microblaze/include/asm/thread_info.h +++ b/arch/microblaze/include/asm/thread_info.h @@ -75,8 +75,6 @@ struct thread_info { /* * macros/functions for gaining access to the thread information structure - * - * preempt_count needs to be 1 initially, until the scheduler is functional. */ #define INIT_THREAD_INFO(tsk) \ { \ @@ -84,7 +82,7 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .flags = 0, \ .cpu = 0, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ .restart_block = { \ .fn = do_no_restart_syscall, \ diff --git a/arch/mips/include/asm/thread_info.h b/arch/mips/include/asm/thread_info.h index 143a48136a4b..f9df720d2e40 100644 --- a/arch/mips/include/asm/thread_info.h +++ b/arch/mips/include/asm/thread_info.h @@ -39,8 +39,6 @@ struct thread_info { /* * macros/functions for gaining access to the thread information structure - * - * preempt_count needs to be 1 initially, until the scheduler is functional. */ #define INIT_THREAD_INFO(tsk) \ { \ @@ -48,7 +46,7 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .flags = _TIF_FIXADE, \ .cpu = 0, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ .restart_block = { \ .fn = do_no_restart_syscall, \ diff --git a/arch/mn10300/include/asm/thread_info.h b/arch/mn10300/include/asm/thread_info.h index 78a3881f3c12..58d64f8b2cc3 100644 --- a/arch/mn10300/include/asm/thread_info.h +++ b/arch/mn10300/include/asm/thread_info.h @@ -65,8 +65,6 @@ struct thread_info { /* * macros/functions for gaining access to the thread information structure - * - * preempt_count needs to be 1 initially, until the scheduler is functional. */ #ifndef __ASSEMBLY__ @@ -76,7 +74,7 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .flags = 0, \ .cpu = 0, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ .restart_block = { \ .fn = do_no_restart_syscall, \ diff --git a/arch/parisc/include/asm/thread_info.h b/arch/parisc/include/asm/thread_info.h index 0407959da489..4ce0edfbe969 100644 --- a/arch/parisc/include/asm/thread_info.h +++ b/arch/parisc/include/asm/thread_info.h @@ -23,7 +23,7 @@ struct thread_info { .flags = 0, \ .cpu = 0, \ .addr_limit = KERNEL_DS, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .restart_block = { \ .fn = do_no_restart_syscall \ } \ diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index 9aba5a38a7c4..c8b329255678 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -46,15 +46,13 @@ struct thread_info { /* * macros/functions for gaining access to the thread information structure - * - * preempt_count needs to be 1 initially, until the scheduler is functional. */ #define INIT_THREAD_INFO(tsk) \ { \ .task = &tsk, \ .exec_domain = &default_exec_domain, \ .cpu = 0, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .restart_block = { \ .fn = do_no_restart_syscall, \ }, \ diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index 925bcc649035..ba1cab9fc1f9 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -61,7 +61,7 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .flags = 0, \ .cpu = 0, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .restart_block = { \ .fn = do_no_restart_syscall, \ }, \ diff --git a/arch/sh/include/asm/thread_info.h b/arch/sh/include/asm/thread_info.h index f09ac4806294..d570ac2e5cb9 100644 --- a/arch/sh/include/asm/thread_info.h +++ b/arch/sh/include/asm/thread_info.h @@ -51,7 +51,7 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .flags = 0, \ .cpu = 0, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ .restart_block = { \ .fn = do_no_restart_syscall, \ diff --git a/arch/sparc/include/asm/thread_info_32.h b/arch/sparc/include/asm/thread_info_32.h index 0f7b0e5fb1c7..844d73a0340c 100644 --- a/arch/sparc/include/asm/thread_info_32.h +++ b/arch/sparc/include/asm/thread_info_32.h @@ -54,8 +54,6 @@ struct thread_info { /* * macros/functions for gaining access to the thread information structure - * - * preempt_count needs to be 1 initially, until the scheduler is functional. */ #define INIT_THREAD_INFO(tsk) \ { \ @@ -64,7 +62,7 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .flags = 0, \ .cpu = 0, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .restart_block = { \ .fn = do_no_restart_syscall, \ }, \ diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h index 65865726b283..1b45a7bbe407 100644 --- a/arch/sparc/include/asm/thread_info_64.h +++ b/arch/sparc/include/asm/thread_info_64.h @@ -125,8 +125,6 @@ struct thread_info { /* * macros/functions for gaining access to the thread information structure - * - * preempt_count needs to be 1 initially, until the scheduler is functional. */ #ifndef __ASSEMBLY__ @@ -135,7 +133,7 @@ struct thread_info { .task = &tsk, \ .flags = ((unsigned long)ASI_P) << TI_FLAG_CURRENT_DS_SHIFT, \ .exec_domain = &default_exec_domain, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .restart_block = { \ .fn = do_no_restart_syscall, \ }, \ diff --git a/arch/um/include/asm/thread_info.h b/arch/um/include/asm/thread_info.h index 62274ab9471f..fd911f855367 100644 --- a/arch/um/include/asm/thread_info.h +++ b/arch/um/include/asm/thread_info.h @@ -32,7 +32,7 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .flags = 0, \ .cpu = 0, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ .restart_block = { \ .fn = do_no_restart_syscall, \ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index b0783520988b..fad7d40b75f8 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -49,7 +49,7 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .flags = 0, \ .cpu = 0, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ .restart_block = { \ .fn = do_no_restart_syscall, \ diff --git a/arch/xtensa/include/asm/thread_info.h b/arch/xtensa/include/asm/thread_info.h index 0f4fe1faf9ba..13165641cc51 100644 --- a/arch/xtensa/include/asm/thread_info.h +++ b/arch/xtensa/include/asm/thread_info.h @@ -80,8 +80,6 @@ struct thread_info { /* * macros/functions for gaining access to the thread information structure - * - * preempt_count needs to be 1 initially, until the scheduler is functional. */ #ifndef __ASSEMBLY__ @@ -92,7 +90,7 @@ struct thread_info { .exec_domain = &default_exec_domain, \ .flags = 0, \ .cpu = 0, \ - .preempt_count = 1, \ + .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ .restart_block = { \ .fn = do_no_restart_syscall, \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 0085d758d645..2a99f1c15cf8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -498,6 +498,12 @@ struct task_cputime { .sum_exec_runtime = 0, \ } +/* + * Disable preemption until the scheduler is running. + * Reset by start_kernel()->sched_init()->init_idle(). + */ +#define INIT_PREEMPT_COUNT (1) + /** * struct thread_group_cputimer - thread group interval timer counts * @cputime: thread group interval timers. -- cgit v1.2.3-59-g8ed1b From d86ee4809d0329d4aa0d0f2c76c2295a16862799 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 10 Jul 2009 14:57:57 +0200 Subject: sched: optimize cond_resched() Optimize cond_resched() by removing one conditional. Currently cond_resched() checks system_state == SYSTEM_RUNNING in order to avoid scheduling before the scheduler is running. We can however, as per suggestion of Matt, use PREEMPT_ACTIVE to accomplish that very same. Suggested-by: Matt Mackall Signed-off-by: Peter Zijlstra Acked-by: Matt Mackall Signed-off-by: Linus Torvalds --- include/linux/sched.h | 5 ++++- kernel/sched.c | 14 +++++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 2a99f1c15cf8..16a982e389fb 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -501,8 +501,11 @@ struct task_cputime { /* * Disable preemption until the scheduler is running. * Reset by start_kernel()->sched_init()->init_idle(). + * + * We include PREEMPT_ACTIVE to avoid cond_resched() from working + * before the scheduler is active -- see should_resched(). */ -#define INIT_PREEMPT_COUNT (1) +#define INIT_PREEMPT_COUNT (1 + PREEMPT_ACTIVE) /** * struct thread_group_cputimer - thread group interval timer counts diff --git a/kernel/sched.c b/kernel/sched.c index 7c9098d186e6..01f55ada3598 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6541,6 +6541,11 @@ SYSCALL_DEFINE0(sched_yield) return 0; } +static inline int should_resched(void) +{ + return need_resched() && !(preempt_count() & PREEMPT_ACTIVE); +} + static void __cond_resched(void) { #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP @@ -6560,8 +6565,7 @@ static void __cond_resched(void) int __sched _cond_resched(void) { - if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && - system_state == SYSTEM_RUNNING) { + if (should_resched()) { __cond_resched(); return 1; } @@ -6579,12 +6583,12 @@ EXPORT_SYMBOL(_cond_resched); */ int cond_resched_lock(spinlock_t *lock) { - int resched = need_resched() && system_state == SYSTEM_RUNNING; + int resched = should_resched(); int ret = 0; if (spin_needbreak(lock) || resched) { spin_unlock(lock); - if (resched && need_resched()) + if (resched) __cond_resched(); else cpu_relax(); @@ -6599,7 +6603,7 @@ int __sched cond_resched_softirq(void) { BUG_ON(!in_softirq()); - if (need_resched() && system_state == SYSTEM_RUNNING) { + if (should_resched()) { local_bh_enable(); __cond_resched(); local_bh_disable(); -- cgit v1.2.3-59-g8ed1b From 24a15a62dcb1968bf4ffdae55c88fa934d971180 Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Thu, 9 Jul 2009 13:36:22 +0100 Subject: tty: Fix USB kref leak The sysrq code acquired a kref leak. Fix it by passing the tty separately from the caller (thus effectively using the callers kref which all the callers hold anyway) Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- drivers/usb/serial/ftdi_sio.c | 2 +- drivers/usb/serial/generic.c | 7 ++++--- drivers/usb/serial/pl2303.c | 2 +- include/linux/usb/serial.h | 3 ++- 4 files changed, 8 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c index 3dc3768ca71c..5f08702f672f 100644 --- a/drivers/usb/serial/ftdi_sio.c +++ b/drivers/usb/serial/ftdi_sio.c @@ -2121,7 +2121,7 @@ static void ftdi_process_read(struct work_struct *work) /* Note that the error flag is duplicated for every character received since we don't know which character it applied to */ - if (!usb_serial_handle_sysrq_char(port, + if (!usb_serial_handle_sysrq_char(tty, port, data[packet_offset + i])) tty_insert_flip_char(tty, data[packet_offset + i], diff --git a/drivers/usb/serial/generic.c b/drivers/usb/serial/generic.c index 3d8dc5671bea..ce57f6a32bdf 100644 --- a/drivers/usb/serial/generic.c +++ b/drivers/usb/serial/generic.c @@ -432,7 +432,7 @@ static void flush_and_resubmit_read_urb(struct usb_serial_port *port) else { /* Push data to tty */ for (i = 0; i < urb->actual_length; i++, ch++) { - if (!usb_serial_handle_sysrq_char(port, *ch)) + if (!usb_serial_handle_sysrq_char(tty, port, *ch)) tty_insert_flip_char(tty, *ch, TTY_NORMAL); } } @@ -534,11 +534,12 @@ void usb_serial_generic_unthrottle(struct tty_struct *tty) } } -int usb_serial_handle_sysrq_char(struct usb_serial_port *port, unsigned int ch) +int usb_serial_handle_sysrq_char(struct tty_struct *tty, + struct usb_serial_port *port, unsigned int ch) { if (port->sysrq && port->console) { if (ch && time_before(jiffies, port->sysrq)) { - handle_sysrq(ch, tty_port_tty_get(&port->port)); + handle_sysrq(ch, tty); port->sysrq = 0; return 1; } diff --git a/drivers/usb/serial/pl2303.c b/drivers/usb/serial/pl2303.c index ec6c132a25b5..8835802067f7 100644 --- a/drivers/usb/serial/pl2303.c +++ b/drivers/usb/serial/pl2303.c @@ -1038,7 +1038,7 @@ static void pl2303_read_bulk_callback(struct urb *urb) if (line_status & UART_OVERRUN_ERROR) tty_insert_flip_char(tty, 0, TTY_OVERRUN); for (i = 0; i < urb->actual_length; ++i) - if (!usb_serial_handle_sysrq_char(port, data[i])) + if (!usb_serial_handle_sysrq_char(tty, port, data[i])) tty_insert_flip_char(tty, data[i], tty_flag); tty_flip_buffer_push(tty); } diff --git a/include/linux/usb/serial.h b/include/linux/usb/serial.h index 44801d26a37a..0ec50ba62139 100644 --- a/include/linux/usb/serial.h +++ b/include/linux/usb/serial.h @@ -317,7 +317,8 @@ extern int usb_serial_generic_register(int debug); extern void usb_serial_generic_deregister(void); extern void usb_serial_generic_resubmit_read_urb(struct usb_serial_port *port, gfp_t mem_flags); -extern int usb_serial_handle_sysrq_char(struct usb_serial_port *port, +extern int usb_serial_handle_sysrq_char(struct tty_struct *tty, + struct usb_serial_port *port, unsigned int ch); extern int usb_serial_handle_break(struct usb_serial_port *port); -- cgit v1.2.3-59-g8ed1b From 373c0a7ed3ea3b34efedb7c83ffb521adff7c894 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 11 Jul 2009 10:06:54 -0400 Subject: Fix compile error due to congestion_wait() changes Move the definition of BLK_RW_ASYNC/BLK_RW_SYNC into linux/backing-dev.h so that it is available to all callers of set/clear_bdi_congested(). This replaces commit 097041e576ee3a50d92dd643ee8ca65bf6a62e21 ("fuse: Fix build error"), which will be reverted. Signed-off-by: Trond Myklebust Acked-by: Larry Finger Cc: Jens Axboe Cc: Miklos Szeredi Signed-off-by: Linus Torvalds --- include/linux/backing-dev.h | 5 +++++ include/linux/blkdev.h | 5 ----- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 3a52a63c1351..1d52425a6118 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -229,6 +229,11 @@ static inline int bdi_rw_congested(struct backing_dev_info *bdi) (1 << BDI_async_congested)); } +enum { + BLK_RW_ASYNC = 0, + BLK_RW_SYNC = 1, +}; + void clear_bdi_congested(struct backing_dev_info *bdi, int sync); void set_bdi_congested(struct backing_dev_info *bdi, int sync); long congestion_wait(int sync, long timeout); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 0146e0fecf1a..e7cb5dbf6c26 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -70,11 +70,6 @@ enum rq_cmd_type_bits { REQ_TYPE_ATA_PC, }; -enum { - BLK_RW_ASYNC = 0, - BLK_RW_SYNC = 1, -}; - /* * For request of type REQ_TYPE_LINUX_BLOCK, rq->cmd[0] is the opcode being * sent down (similar to how REQ_TYPE_BLOCK_PC means that ->cmd[] holds a -- cgit v1.2.3-59-g8ed1b From f9fabcb58a6d26d6efde842d1703ac7cfa9427b6 Mon Sep 17 00:00:00 2001 From: Julien Tinnes Date: Fri, 26 Jun 2009 20:27:40 +0200 Subject: personality: fix PER_CLEAR_ON_SETID We have found that the current PER_CLEAR_ON_SETID mask on Linux doesn't include neither ADDR_COMPAT_LAYOUT, nor MMAP_PAGE_ZERO. The current mask is READ_IMPLIES_EXEC|ADDR_NO_RANDOMIZE. We believe it is important to add MMAP_PAGE_ZERO, because by using this personality it is possible to have the first page mapped inside a process running as setuid root. This could be used in those scenarios: - Exploiting a NULL pointer dereference issue in a setuid root binary - Bypassing the mmap_min_addr restrictions of the Linux kernel: by running a setuid binary that would drop privileges before giving us control back (for instance by loading a user-supplied library), we could get the first page mapped in a process we control. By further using mremap and mprotect on this mapping, we can then completely bypass the mmap_min_addr restrictions. Less importantly, we believe ADDR_COMPAT_LAYOUT should also be added since on x86 32bits it will in practice disable most of the address space layout randomization (only the stack will remain randomized). Signed-off-by: Julien Tinnes Signed-off-by: Tavis Ormandy Cc: stable@kernel.org Acked-by: Christoph Hellwig Acked-by: Kees Cook Acked-by: Eugene Teo [ Shortened lines and fixed whitespace as per Christophs' suggestion ] Signed-off-by: Linus Torvalds --- include/linux/personality.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/personality.h b/include/linux/personality.h index a84e9ff9b27e..126120819a0d 100644 --- a/include/linux/personality.h +++ b/include/linux/personality.h @@ -40,7 +40,10 @@ enum { * Security-relevant compatibility flags that must be * cleared upon setuid or setgid exec: */ -#define PER_CLEAR_ON_SETID (READ_IMPLIES_EXEC|ADDR_NO_RANDOMIZE) +#define PER_CLEAR_ON_SETID (READ_IMPLIES_EXEC | \ + ADDR_NO_RANDOMIZE | \ + ADDR_COMPAT_LAYOUT | \ + MMAP_PAGE_ZERO) /* * Personality types. -- cgit v1.2.3-59-g8ed1b From 405f55712dfe464b3240d7816cc4fe4174831be2 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 11 Jul 2009 22:08:37 +0400 Subject: headers: smp_lock.h redux * Remove smp_lock.h from files which don't need it (including some headers!) * Add smp_lock.h to files which do need it * Make smp_lock.h include conditional in hardirq.h It's needed only for one kernel_locked() usage which is under CONFIG_PREEMPT This will make hardirq.h inclusion cheaper for every PREEMPT=n config (which includes allmodconfig/allyesconfig, BTW) Signed-off-by: Alexey Dobriyan Signed-off-by: Linus Torvalds --- arch/alpha/kernel/ptrace.c | 1 - arch/blackfin/kernel/ptrace.c | 1 - arch/blackfin/kernel/sys_bfin.c | 1 - arch/cris/kernel/sys_cris.c | 1 - arch/ia64/kernel/ptrace.c | 1 - arch/m32r/kernel/ptrace.c | 1 - arch/microblaze/kernel/ptrace.c | 1 - arch/microblaze/kernel/signal.c | 1 - arch/microblaze/kernel/sys_microblaze.c | 1 - arch/mips/kernel/ptrace32.c | 1 - arch/mips/mm/hugetlbpage.c | 1 - arch/mn10300/kernel/ptrace.c | 1 - arch/mn10300/kernel/signal.c | 1 - arch/mn10300/kernel/sys_mn10300.c | 1 - arch/mn10300/kernel/traps.c | 1 - arch/mn10300/mm/fault.c | 1 - arch/mn10300/mm/misalignment.c | 1 - arch/powerpc/kernel/ptrace32.c | 1 - arch/s390/kernel/dis.c | 1 - arch/s390/kernel/ptrace.c | 1 - arch/s390/mm/fault.c | 1 - arch/sh/mm/tlb-sh3.c | 1 - arch/sparc/kernel/ptrace_32.c | 1 - arch/sparc/kernel/ptrace_64.c | 1 - arch/sparc/kernel/time_64.c | 1 - arch/sparc/kernel/traps_32.c | 1 - drivers/block/DAC960.c | 1 + drivers/block/cciss.c | 1 + drivers/block/loop.c | 1 - drivers/bluetooth/hci_vhci.c | 1 - drivers/char/amiserial.c | 1 + drivers/char/cyclades.c | 1 + drivers/char/epca.c | 1 + drivers/char/isicom.c | 1 + drivers/char/istallion.c | 1 + drivers/char/moxa.c | 1 + drivers/char/mxser.c | 1 + drivers/char/n_hdlc.c | 1 + drivers/char/n_r3964.c | 1 + drivers/char/pty.c | 1 + drivers/char/rio/rio_linux.c | 1 + drivers/char/riscom8.c | 1 + drivers/char/rocket.c | 1 + drivers/char/serial167.c | 1 + drivers/char/specialix.c | 1 + drivers/char/sx.c | 1 + drivers/char/synclink.c | 1 + drivers/char/synclink_gt.c | 1 + drivers/char/synclinkmp.c | 1 + drivers/char/tpm/tpm.c | 1 - drivers/char/tty_ioctl.c | 1 - drivers/char/tty_ldisc.c | 1 - drivers/char/vt.c | 1 + drivers/char/vt_ioctl.c | 1 + drivers/gpio/vr41xx_giu.c | 1 - drivers/hid/usbhid/hid-core.c | 1 - drivers/isdn/hisax/hfc_usb.c | 1 - drivers/isdn/i4l/isdn_tty.c | 1 + drivers/isdn/mISDN/stack.c | 1 + drivers/media/dvb/bt8xx/dst_ca.c | 1 + drivers/media/dvb/dvb-core/dvbdev.h | 1 - drivers/media/dvb/ttpci/av7110.c | 1 - drivers/media/radio/radio-mr800.c | 1 + drivers/media/radio/radio-si470x.c | 1 + drivers/media/video/bt8xx/bttv-driver.c | 1 + drivers/media/video/cx23885/cx23885-417.c | 1 + drivers/media/video/cx23885/cx23885-video.c | 1 + drivers/media/video/cx88/cx88-blackbird.c | 1 + drivers/media/video/cx88/cx88-video.c | 1 + drivers/media/video/dabusb.c | 1 + drivers/media/video/pwc/pwc-if.c | 1 + drivers/media/video/pwc/pwc.h | 1 - drivers/media/video/s2255drv.c | 1 + drivers/media/video/saa5246a.c | 1 - drivers/media/video/saa5249.c | 1 - drivers/media/video/saa7134/saa7134-empress.c | 1 + drivers/media/video/se401.c | 1 + drivers/media/video/stk-webcam.c | 1 + drivers/media/video/stradis.c | 1 + drivers/media/video/stv680.c | 1 + drivers/media/video/usbvideo/vicam.c | 1 + drivers/media/video/usbvision/usbvision-video.c | 1 + drivers/media/video/v4l2-dev.c | 1 - drivers/media/video/zoran/zoran_driver.c | 1 + drivers/misc/sgi-gru/grufile.c | 1 - drivers/misc/sgi-gru/grukservices.c | 1 - drivers/net/irda/irtty-sir.c | 1 - drivers/pci/hotplug/cpci_hotplug_core.c | 1 - drivers/pci/hotplug/cpqphp_ctrl.c | 1 - drivers/pci/hotplug/cpqphp_sysfs.c | 1 + drivers/pci/hotplug/pciehp_ctrl.c | 1 - drivers/pci/syscall.c | 1 - drivers/s390/block/dasd_ioctl.c | 1 + drivers/scsi/qla2xxx/qla_mid.c | 1 - drivers/telephony/ixj.c | 1 + drivers/telephony/phonedev.c | 1 - drivers/usb/class/cdc-wdm.c | 1 - drivers/usb/gadget/amd5536udc.c | 1 - drivers/usb/gadget/langwell_udc.c | 1 - drivers/usb/gadget/s3c2410_udc.c | 1 - drivers/usb/host/r8a66597-hcd.c | 1 - drivers/usb/misc/iowarrior.c | 1 + drivers/usb/misc/rio500.c | 1 + drivers/usb/misc/usblcd.c | 1 + drivers/usb/musb/cppi_dma.h | 1 - drivers/usb/musb/musb_core.h | 1 - drivers/usb/serial/ftdi_sio.c | 1 + drivers/usb/serial/mos7840.c | 1 + drivers/usb/serial/usb-serial.c | 1 + drivers/video/fbmem.c | 1 - fs/adfs/super.c | 1 + fs/afs/super.c | 1 + fs/autofs4/dev-ioctl.c | 1 - fs/bfs/dir.c | 1 - fs/bfs/file.c | 1 - fs/btrfs/compression.c | 1 - fs/btrfs/file.c | 1 - fs/btrfs/inode.c | 1 - fs/btrfs/ioctl.c | 1 - fs/btrfs/super.c | 1 - fs/char_dev.c | 1 - fs/compat.c | 1 - fs/compat_ioctl.c | 1 + fs/exofs/super.c | 1 + fs/ext2/ioctl.c | 1 - fs/ext4/ioctl.c | 1 - fs/fat/dir.c | 1 - fs/fat/namei_msdos.c | 1 - fs/fat/namei_vfat.c | 1 - fs/fcntl.c | 1 - fs/freevxfs/vxfs_super.c | 1 + fs/hfs/super.c | 1 + fs/hfsplus/super.c | 1 + fs/hpfs/dir.c | 1 + fs/hpfs/file.c | 1 + fs/hpfs/hpfs_fn.h | 1 - fs/hpfs/inode.c | 1 + fs/hpfs/namei.c | 1 + fs/jffs2/super.c | 1 + fs/lockd/clntproc.c | 1 + fs/lockd/svc4proc.c | 1 + fs/lockd/svcproc.c | 1 + fs/nfs/delegation.c | 1 + fs/nfs/dir.c | 1 - fs/nfs/file.c | 1 - fs/nfs/inode.c | 1 - fs/nfs/nfs4proc.c | 1 - fs/nfs/read.c | 1 - fs/nfsd/nfsctl.c | 1 - fs/nfsd/nfssvc.c | 1 - fs/nilfs2/dir.c | 1 - fs/ocfs2/ioctl.c | 1 - fs/reiserfs/xattr.c | 1 - fs/squashfs/super.c | 1 + fs/ubifs/ioctl.c | 1 - fs/xfs/linux-2.6/xfs_file.c | 1 - include/linux/crash_dump.h | 1 - include/linux/hardirq.h | 2 ++ include/linux/quotaops.h | 1 - include/linux/sunrpc/xdr.h | 1 - kernel/power/user.c | 1 - kernel/trace/blktrace.c | 1 + kernel/trace/trace.c | 1 + net/appletalk/ddp.c | 1 + net/ipx/af_ipx.c | 1 + net/irda/af_irda.c | 1 + net/irda/irnet/irnet.h | 1 - net/irda/irnet/irnet_ppp.c | 1 + net/sunrpc/clnt.c | 1 - net/sunrpc/sched.c | 1 - net/sunrpc/svc_xprt.c | 1 + net/wanrouter/wanmain.c | 1 + net/x25/af_x25.c | 1 + 173 files changed, 81 insertions(+), 93 deletions(-) (limited to 'include') diff --git a/arch/alpha/kernel/ptrace.c b/arch/alpha/kernel/ptrace.c index 1e9ad52c460e..e072041d19f8 100644 --- a/arch/alpha/kernel/ptrace.c +++ b/arch/alpha/kernel/ptrace.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/blackfin/kernel/ptrace.c b/arch/blackfin/kernel/ptrace.c index d76618db50df..6a387eec6b65 100644 --- a/arch/blackfin/kernel/ptrace.c +++ b/arch/blackfin/kernel/ptrace.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/blackfin/kernel/sys_bfin.c b/arch/blackfin/kernel/sys_bfin.c index a8f1329c15a4..3da60fb13ce4 100644 --- a/arch/blackfin/kernel/sys_bfin.c +++ b/arch/blackfin/kernel/sys_bfin.c @@ -29,7 +29,6 @@ * 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ -#include #include #include #include diff --git a/arch/cris/kernel/sys_cris.c b/arch/cris/kernel/sys_cris.c index a79fbd87021b..2ad962c7e88e 100644 --- a/arch/cris/kernel/sys_cris.c +++ b/arch/cris/kernel/sys_cris.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index 92c9689b7d97..9daa87fdb018 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/m32r/kernel/ptrace.c b/arch/m32r/kernel/ptrace.c index bf0abe9e1f73..98b8feb12ed8 100644 --- a/arch/m32r/kernel/ptrace.c +++ b/arch/m32r/kernel/ptrace.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/microblaze/kernel/ptrace.c b/arch/microblaze/kernel/ptrace.c index b86aa623e36d..53ff39af6a5c 100644 --- a/arch/microblaze/kernel/ptrace.c +++ b/arch/microblaze/kernel/ptrace.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include diff --git a/arch/microblaze/kernel/signal.c b/arch/microblaze/kernel/signal.c index 493819c25fba..1c80e4fc40ce 100644 --- a/arch/microblaze/kernel/signal.c +++ b/arch/microblaze/kernel/signal.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/microblaze/kernel/sys_microblaze.c b/arch/microblaze/kernel/sys_microblaze.c index 8c9ebac5da10..e000bce09b2b 100644 --- a/arch/microblaze/kernel/sys_microblaze.c +++ b/arch/microblaze/kernel/sys_microblaze.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/mips/kernel/ptrace32.c b/arch/mips/kernel/ptrace32.c index c4f9ac17474a..32644b4a0714 100644 --- a/arch/mips/kernel/ptrace32.c +++ b/arch/mips/kernel/ptrace32.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include diff --git a/arch/mips/mm/hugetlbpage.c b/arch/mips/mm/hugetlbpage.c index 471c09aa1614..8c2834f5919d 100644 --- a/arch/mips/mm/hugetlbpage.c +++ b/arch/mips/mm/hugetlbpage.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/mn10300/kernel/ptrace.c b/arch/mn10300/kernel/ptrace.c index e143339ad28e..cf847dabc1bd 100644 --- a/arch/mn10300/kernel/ptrace.c +++ b/arch/mn10300/kernel/ptrace.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/mn10300/kernel/signal.c b/arch/mn10300/kernel/signal.c index 9f7572a0f578..feb2f2e810db 100644 --- a/arch/mn10300/kernel/signal.c +++ b/arch/mn10300/kernel/signal.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/mn10300/kernel/sys_mn10300.c b/arch/mn10300/kernel/sys_mn10300.c index bca5a84dc72c..29d196b83d25 100644 --- a/arch/mn10300/kernel/sys_mn10300.c +++ b/arch/mn10300/kernel/sys_mn10300.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/mn10300/kernel/traps.c b/arch/mn10300/kernel/traps.c index 0dfdc5001124..91365adba4f5 100644 --- a/arch/mn10300/kernel/traps.c +++ b/arch/mn10300/kernel/traps.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/mn10300/mm/fault.c b/arch/mn10300/mm/fault.c index a62e1e138bc1..53bb17d0f068 100644 --- a/arch/mn10300/mm/fault.c +++ b/arch/mn10300/mm/fault.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include /* For unblank_screen() */ diff --git a/arch/mn10300/mm/misalignment.c b/arch/mn10300/mm/misalignment.c index 94c4a4358065..30016251f658 100644 --- a/arch/mn10300/mm/misalignment.c +++ b/arch/mn10300/mm/misalignment.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/powerpc/kernel/ptrace32.c b/arch/powerpc/kernel/ptrace32.c index 297632cba047..8a6daf4129f6 100644 --- a/arch/powerpc/kernel/ptrace32.c +++ b/arch/powerpc/kernel/ptrace32.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c index d2f270c995d9..db943a7ec513 100644 --- a/arch/s390/kernel/dis.c +++ b/arch/s390/kernel/dis.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 490b39934d65..43acd73105b7 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 74eb26bf1970..e5e119fe03b2 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/sh/mm/tlb-sh3.c b/arch/sh/mm/tlb-sh3.c index 7fbfd5a11ffa..17cb7c3adf22 100644 --- a/arch/sh/mm/tlb-sh3.c +++ b/arch/sh/mm/tlb-sh3.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include diff --git a/arch/sparc/kernel/ptrace_32.c b/arch/sparc/kernel/ptrace_32.c index 8ce6285a06d5..7e3dfd9bb97e 100644 --- a/arch/sparc/kernel/ptrace_32.c +++ b/arch/sparc/kernel/ptrace_32.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/sparc/kernel/ptrace_64.c b/arch/sparc/kernel/ptrace_64.c index a941c610e7ce..4ae91dc2feb9 100644 --- a/arch/sparc/kernel/ptrace_64.c +++ b/arch/sparc/kernel/ptrace_64.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/sparc/kernel/time_64.c b/arch/sparc/kernel/time_64.c index 5c12e79b4bdf..da1218e8ee87 100644 --- a/arch/sparc/kernel/time_64.c +++ b/arch/sparc/kernel/time_64.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/sparc/kernel/traps_32.c b/arch/sparc/kernel/traps_32.c index 358283341b47..c0490c7bbde0 100644 --- a/arch/sparc/kernel/traps_32.c +++ b/arch/sparc/kernel/traps_32.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index 668dc234b8e2..1e6b7c14f697 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 65a0655e7fc8..a52cc7fe45ea 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 801f4ab83302..5757188cd1fb 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -61,7 +61,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/bluetooth/hci_vhci.c b/drivers/bluetooth/hci_vhci.c index 1df9dda2e377..d5cde6d86f89 100644 --- a/drivers/bluetooth/hci_vhci.c +++ b/drivers/bluetooth/hci_vhci.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/char/amiserial.c b/drivers/char/amiserial.c index 72429b6b2fa8..6c32fbf07164 100644 --- a/drivers/char/amiserial.c +++ b/drivers/char/amiserial.c @@ -81,6 +81,7 @@ static char *serial_version = "4.30"; #include #include #include +#include #include #include diff --git a/drivers/char/cyclades.c b/drivers/char/cyclades.c index f3366d3f06cf..2dafc2da0648 100644 --- a/drivers/char/cyclades.c +++ b/drivers/char/cyclades.c @@ -633,6 +633,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/epca.c b/drivers/char/epca.c index abef1f7d84fe..ff647ca1c489 100644 --- a/drivers/char/epca.c +++ b/drivers/char/epca.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/isicom.c b/drivers/char/isicom.c index 621d1184673c..4f1f4cd670da 100644 --- a/drivers/char/isicom.c +++ b/drivers/char/isicom.c @@ -122,6 +122,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/istallion.c b/drivers/char/istallion.c index 0c999f5bb3db..ab2f3349c5c4 100644 --- a/drivers/char/istallion.c +++ b/drivers/char/istallion.c @@ -20,6 +20,7 @@ #include #include +#include #include #include #include diff --git a/drivers/char/moxa.c b/drivers/char/moxa.c index 65b6ff2442c6..dd0083bbb64a 100644 --- a/drivers/char/moxa.c +++ b/drivers/char/moxa.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/mxser.c b/drivers/char/mxser.c index 52d953eb30c3..dbf8d52f31d0 100644 --- a/drivers/char/mxser.c +++ b/drivers/char/mxser.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/n_hdlc.c b/drivers/char/n_hdlc.c index 1c43c8cdee25..c68118efad84 100644 --- a/drivers/char/n_hdlc.c +++ b/drivers/char/n_hdlc.c @@ -97,6 +97,7 @@ #include #include #include +#include #include /* used in new tty drivers */ #include /* used in new tty drivers */ #include diff --git a/drivers/char/n_r3964.c b/drivers/char/n_r3964.c index 2e99158ebb8a..6934025a1ac1 100644 --- a/drivers/char/n_r3964.c +++ b/drivers/char/n_r3964.c @@ -58,6 +58,7 @@ #include #include #include +#include #include #include #include /* used in new tty drivers */ diff --git a/drivers/char/pty.c b/drivers/char/pty.c index 9d1b4f548f67..6e6942c45f5b 100644 --- a/drivers/char/pty.c +++ b/drivers/char/pty.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/rio/rio_linux.c b/drivers/char/rio/rio_linux.c index ce81da5b2da9..d58c2eb07f07 100644 --- a/drivers/char/rio/rio_linux.c +++ b/drivers/char/rio/rio_linux.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include diff --git a/drivers/char/riscom8.c b/drivers/char/riscom8.c index 217660451237..171711acf5cd 100644 --- a/drivers/char/riscom8.c +++ b/drivers/char/riscom8.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include diff --git a/drivers/char/rocket.c b/drivers/char/rocket.c index 63d5b628477a..0e29a23ec4c5 100644 --- a/drivers/char/rocket.c +++ b/drivers/char/rocket.c @@ -73,6 +73,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/serial167.c b/drivers/char/serial167.c index f1f24f0ee26f..51e7a46787be 100644 --- a/drivers/char/serial167.c +++ b/drivers/char/serial167.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/specialix.c b/drivers/char/specialix.c index e72be4190a44..bfe4cdb2febb 100644 --- a/drivers/char/specialix.c +++ b/drivers/char/specialix.c @@ -87,6 +87,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/sx.c b/drivers/char/sx.c index 518f2a25d91e..a81ec4fcf6ff 100644 --- a/drivers/char/sx.c +++ b/drivers/char/sx.c @@ -216,6 +216,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/synclink.c b/drivers/char/synclink.c index afded3a2379c..813552f14884 100644 --- a/drivers/char/synclink.c +++ b/drivers/char/synclink.c @@ -81,6 +81,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/synclink_gt.c b/drivers/char/synclink_gt.c index a2e67e6df3a1..91f20a92fddf 100644 --- a/drivers/char/synclink_gt.c +++ b/drivers/char/synclink_gt.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/synclinkmp.c b/drivers/char/synclinkmp.c index 6f727e3c53ad..8d4a2a8a0a70 100644 --- a/drivers/char/synclinkmp.c +++ b/drivers/char/synclinkmp.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/tpm/tpm.c b/drivers/char/tpm/tpm.c index ccdd828adcef..b0603b2e5684 100644 --- a/drivers/char/tpm/tpm.c +++ b/drivers/char/tpm/tpm.c @@ -26,7 +26,6 @@ #include #include #include -#include #include "tpm.h" diff --git a/drivers/char/tty_ioctl.c b/drivers/char/tty_ioctl.c index b24f6c6a1ea3..ad6ba4ed2808 100644 --- a/drivers/char/tty_ioctl.c +++ b/drivers/char/tty_ioctl.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include diff --git a/drivers/char/tty_ldisc.c b/drivers/char/tty_ldisc.c index 913aa8d3f1c5..0ef0dc97ba20 100644 --- a/drivers/char/tty_ldisc.c +++ b/drivers/char/tty_ldisc.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/char/vt.c b/drivers/char/vt.c index d9113b4c76e3..7947bd1b4cf7 100644 --- a/drivers/char/vt.c +++ b/drivers/char/vt.c @@ -89,6 +89,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/char/vt_ioctl.c b/drivers/char/vt_ioctl.c index 7539bed0f7e0..95189f288f8c 100644 --- a/drivers/char/vt_ioctl.c +++ b/drivers/char/vt_ioctl.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include diff --git a/drivers/gpio/vr41xx_giu.c b/drivers/gpio/vr41xx_giu.c index b70e06133e78..b16c9a8c03f5 100644 --- a/drivers/gpio/vr41xx_giu.c +++ b/drivers/gpio/vr41xx_giu.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #include diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c index 76c4bbe9dccb..3c1fcb7640ab 100644 --- a/drivers/hid/usbhid/hid-core.c +++ b/drivers/hid/usbhid/hid-core.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/isdn/hisax/hfc_usb.c b/drivers/isdn/hisax/hfc_usb.c index 8df889b0c1a9..9de54202c90c 100644 --- a/drivers/isdn/hisax/hfc_usb.c +++ b/drivers/isdn/hisax/hfc_usb.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include #include "hisax.h" diff --git a/drivers/isdn/i4l/isdn_tty.c b/drivers/isdn/i4l/isdn_tty.c index b4d4522e5071..2881a66c1aa9 100644 --- a/drivers/isdn/i4l/isdn_tty.c +++ b/drivers/isdn/i4l/isdn_tty.c @@ -13,6 +13,7 @@ #include #include +#include #include "isdn_common.h" #include "isdn_tty.h" #ifdef CONFIG_ISDN_AUDIO diff --git a/drivers/isdn/mISDN/stack.c b/drivers/isdn/mISDN/stack.c index e2f45019ebf0..3e1532a180ff 100644 --- a/drivers/isdn/mISDN/stack.c +++ b/drivers/isdn/mISDN/stack.c @@ -17,6 +17,7 @@ #include #include +#include #include "core.h" static u_int *debug; diff --git a/drivers/media/dvb/bt8xx/dst_ca.c b/drivers/media/dvb/bt8xx/dst_ca.c index 4601b059b2b2..0e246eaad05a 100644 --- a/drivers/media/dvb/bt8xx/dst_ca.c +++ b/drivers/media/dvb/bt8xx/dst_ca.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include "dvbdev.h" diff --git a/drivers/media/dvb/dvb-core/dvbdev.h b/drivers/media/dvb/dvb-core/dvbdev.h index 79927305e84d..487919bea7ae 100644 --- a/drivers/media/dvb/dvb-core/dvbdev.h +++ b/drivers/media/dvb/dvb-core/dvbdev.h @@ -27,7 +27,6 @@ #include #include #include -#include #define DVB_MAJOR 212 diff --git a/drivers/media/dvb/ttpci/av7110.c b/drivers/media/dvb/ttpci/av7110.c index d1d959ed37b7..8d65c652ba50 100644 --- a/drivers/media/dvb/ttpci/av7110.c +++ b/drivers/media/dvb/ttpci/av7110.c @@ -36,7 +36,6 @@ #include #include #include -#include #include #include diff --git a/drivers/media/radio/radio-mr800.c b/drivers/media/radio/radio-mr800.c index 837467f93805..575bf9d89419 100644 --- a/drivers/media/radio/radio-mr800.c +++ b/drivers/media/radio/radio-mr800.c @@ -58,6 +58,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/media/radio/radio-si470x.c b/drivers/media/radio/radio-si470x.c index 46d216329611..e85f318b4d2b 100644 --- a/drivers/media/radio/radio-si470x.c +++ b/drivers/media/radio/radio-si470x.c @@ -127,6 +127,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/media/video/bt8xx/bttv-driver.c b/drivers/media/video/bt8xx/bttv-driver.c index 5eb1464af670..d147d29bb0d3 100644 --- a/drivers/media/video/bt8xx/bttv-driver.c +++ b/drivers/media/video/bt8xx/bttv-driver.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include "bttvp.h" diff --git a/drivers/media/video/cx23885/cx23885-417.c b/drivers/media/video/cx23885/cx23885-417.c index 2943bfd32a94..428f0c45e6b7 100644 --- a/drivers/media/video/cx23885/cx23885-417.c +++ b/drivers/media/video/cx23885/cx23885-417.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/media/video/cx23885/cx23885-video.c b/drivers/media/video/cx23885/cx23885-video.c index 70836af3ab48..5d6093336300 100644 --- a/drivers/media/video/cx23885/cx23885-video.c +++ b/drivers/media/video/cx23885/cx23885-video.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/media/video/cx88/cx88-blackbird.c b/drivers/media/video/cx88/cx88-blackbird.c index 44eacfb0d0d6..356d6896da3f 100644 --- a/drivers/media/video/cx88/cx88-blackbird.c +++ b/drivers/media/video/cx88/cx88-blackbird.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/media/video/cx88/cx88-video.c b/drivers/media/video/cx88/cx88-video.c index b12770848c00..2bb54c3ef5cd 100644 --- a/drivers/media/video/cx88/cx88-video.c +++ b/drivers/media/video/cx88/cx88-video.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/media/video/dabusb.c b/drivers/media/video/dabusb.c index ec2f45dde164..0664d111085f 100644 --- a/drivers/media/video/dabusb.c +++ b/drivers/media/video/dabusb.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/media/video/pwc/pwc-if.c b/drivers/media/video/pwc/pwc-if.c index db25c3034c11..8d17cf613306 100644 --- a/drivers/media/video/pwc/pwc-if.c +++ b/drivers/media/video/pwc/pwc-if.c @@ -62,6 +62,7 @@ #include #include #include +#include #ifdef CONFIG_USB_PWC_INPUT_EVDEV #include #endif diff --git a/drivers/media/video/pwc/pwc.h b/drivers/media/video/pwc/pwc.h index 0be6f814f539..0b658dee05a4 100644 --- a/drivers/media/video/pwc/pwc.h +++ b/drivers/media/video/pwc/pwc.h @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/media/video/s2255drv.c b/drivers/media/video/s2255drv.c index 6be845ccc7d7..9e3262c0ba37 100644 --- a/drivers/media/video/s2255drv.c +++ b/drivers/media/video/s2255drv.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/media/video/saa5246a.c b/drivers/media/video/saa5246a.c index 155804b061e9..b624a4c01fdc 100644 --- a/drivers/media/video/saa5246a.c +++ b/drivers/media/video/saa5246a.c @@ -43,7 +43,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/media/video/saa5249.c b/drivers/media/video/saa5249.c index 271d6e931b75..12835fb82c95 100644 --- a/drivers/media/video/saa5249.c +++ b/drivers/media/video/saa5249.c @@ -46,7 +46,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/media/video/saa7134/saa7134-empress.c b/drivers/media/video/saa7134/saa7134-empress.c index add1757f8930..296788c3bf0e 100644 --- a/drivers/media/video/saa7134/saa7134-empress.c +++ b/drivers/media/video/saa7134/saa7134-empress.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include "saa7134-reg.h" diff --git a/drivers/media/video/se401.c b/drivers/media/video/se401.c index c8f05297d0f0..85ffc2cba039 100644 --- a/drivers/media/video/se401.c +++ b/drivers/media/video/se401.c @@ -31,6 +31,7 @@ static const char version[] = "0.24"; #include #include #include +#include #include #include #include "se401.h" diff --git a/drivers/media/video/stk-webcam.c b/drivers/media/video/stk-webcam.c index 2e5937047278..4d6785e63455 100644 --- a/drivers/media/video/stk-webcam.c +++ b/drivers/media/video/stk-webcam.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include diff --git a/drivers/media/video/stradis.c b/drivers/media/video/stradis.c index 0eb313082c97..eaada39c76fd 100644 --- a/drivers/media/video/stradis.c +++ b/drivers/media/video/stradis.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/media/video/stv680.c b/drivers/media/video/stv680.c index 75f286f7a2e9..8b4e7dafce7b 100644 --- a/drivers/media/video/stv680.c +++ b/drivers/media/video/stv680.c @@ -62,6 +62,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/media/video/usbvideo/vicam.c b/drivers/media/video/usbvideo/vicam.c index 8d73979596f9..45fce39ec9ad 100644 --- a/drivers/media/video/usbvideo/vicam.c +++ b/drivers/media/video/usbvideo/vicam.c @@ -43,6 +43,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/media/video/usbvision/usbvision-video.c b/drivers/media/video/usbvision/usbvision-video.c index 90b58914f984..90d9b5c0e9a7 100644 --- a/drivers/media/video/usbvision/usbvision-video.c +++ b/drivers/media/video/usbvision/usbvision-video.c @@ -50,6 +50,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/media/video/v4l2-dev.c b/drivers/media/video/v4l2-dev.c index 31eac66411d7..a7f1b69a7dab 100644 --- a/drivers/media/video/v4l2-dev.c +++ b/drivers/media/video/v4l2-dev.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include diff --git a/drivers/media/video/zoran/zoran_driver.c b/drivers/media/video/zoran/zoran_driver.c index 3d7df32a3d87..bcdefb1bcb3d 100644 --- a/drivers/media/video/zoran/zoran_driver.c +++ b/drivers/media/video/zoran/zoran_driver.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/misc/sgi-gru/grufile.c b/drivers/misc/sgi-gru/grufile.c index fa2d93a9fb8d..aed609832bc2 100644 --- a/drivers/misc/sgi-gru/grufile.c +++ b/drivers/misc/sgi-gru/grufile.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/misc/sgi-gru/grukservices.c b/drivers/misc/sgi-gru/grukservices.c index eedbf9c32760..79689b10f937 100644 --- a/drivers/misc/sgi-gru/grukservices.c +++ b/drivers/misc/sgi-gru/grukservices.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/net/irda/irtty-sir.c b/drivers/net/irda/irtty-sir.c index d53aa9582137..20f9bc626688 100644 --- a/drivers/net/irda/irtty-sir.c +++ b/drivers/net/irda/irtty-sir.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include diff --git a/drivers/pci/hotplug/cpci_hotplug_core.c b/drivers/pci/hotplug/cpci_hotplug_core.c index a5b9f6ae507b..d703e73fffa7 100644 --- a/drivers/pci/hotplug/cpci_hotplug_core.c +++ b/drivers/pci/hotplug/cpci_hotplug_core.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/pci/hotplug/cpqphp_ctrl.c b/drivers/pci/hotplug/cpqphp_ctrl.c index 2fa47af992a8..0ff689afa757 100644 --- a/drivers/pci/hotplug/cpqphp_ctrl.c +++ b/drivers/pci/hotplug/cpqphp_ctrl.c @@ -34,7 +34,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/pci/hotplug/cpqphp_sysfs.c b/drivers/pci/hotplug/cpqphp_sysfs.c index 8450f4a6568a..e6089bdb6e5b 100644 --- a/drivers/pci/hotplug/cpqphp_sysfs.c +++ b/drivers/pci/hotplug/cpqphp_sysfs.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include "cpqphp.h" diff --git a/drivers/pci/hotplug/pciehp_ctrl.c b/drivers/pci/hotplug/pciehp_ctrl.c index ff4034502d24..8aab8edf123e 100644 --- a/drivers/pci/hotplug/pciehp_ctrl.c +++ b/drivers/pci/hotplug/pciehp_ctrl.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include "../pci.h" diff --git a/drivers/pci/syscall.c b/drivers/pci/syscall.c index ec22284eed30..e1c1ec540893 100644 --- a/drivers/pci/syscall.c +++ b/drivers/pci/syscall.c @@ -9,7 +9,6 @@ #include #include -#include #include #include #include "pci.h" diff --git a/drivers/s390/block/dasd_ioctl.c b/drivers/s390/block/dasd_ioctl.c index 4ce3f72ee1c1..df918ef27965 100644 --- a/drivers/s390/block/dasd_ioctl.c +++ b/drivers/s390/block/dasd_ioctl.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include diff --git a/drivers/scsi/qla2xxx/qla_mid.c b/drivers/scsi/qla2xxx/qla_mid.c index 650bcef08f2a..cd78c501803a 100644 --- a/drivers/scsi/qla2xxx/qla_mid.c +++ b/drivers/scsi/qla2xxx/qla_mid.c @@ -9,7 +9,6 @@ #include #include -#include #include #include diff --git a/drivers/telephony/ixj.c b/drivers/telephony/ixj.c index a913efc69669..40de151f2789 100644 --- a/drivers/telephony/ixj.c +++ b/drivers/telephony/ixj.c @@ -257,6 +257,7 @@ #include /* everything... */ #include /* error codes */ #include +#include #include #include #include diff --git a/drivers/telephony/phonedev.c b/drivers/telephony/phonedev.c index b52cc830c0b4..f3873f650bb4 100644 --- a/drivers/telephony/phonedev.c +++ b/drivers/telephony/phonedev.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c index 0fe434505ac4..ba589d4ca8bc 100644 --- a/drivers/usb/class/cdc-wdm.c +++ b/drivers/usb/class/cdc-wdm.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/usb/gadget/amd5536udc.c b/drivers/usb/gadget/amd5536udc.c index 826f3adde5d8..77352ccc245e 100644 --- a/drivers/usb/gadget/amd5536udc.c +++ b/drivers/usb/gadget/amd5536udc.c @@ -48,7 +48,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/usb/gadget/langwell_udc.c b/drivers/usb/gadget/langwell_udc.c index 6829d5961359..a3913519fd58 100644 --- a/drivers/usb/gadget/langwell_udc.c +++ b/drivers/usb/gadget/langwell_udc.c @@ -34,7 +34,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/usb/gadget/s3c2410_udc.c b/drivers/usb/gadget/s3c2410_udc.c index 9a2b8920532d..a9b452fe6221 100644 --- a/drivers/usb/gadget/s3c2410_udc.c +++ b/drivers/usb/gadget/s3c2410_udc.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/usb/host/r8a66597-hcd.c b/drivers/usb/host/r8a66597-hcd.c index 56976cc0352a..e18f74946e68 100644 --- a/drivers/usb/host/r8a66597-hcd.c +++ b/drivers/usb/host/r8a66597-hcd.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/usb/misc/iowarrior.c b/drivers/usb/misc/iowarrior.c index 3c5fe5cee05a..90e1a8dedfa9 100644 --- a/drivers/usb/misc/iowarrior.c +++ b/drivers/usb/misc/iowarrior.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include diff --git a/drivers/usb/misc/rio500.c b/drivers/usb/misc/rio500.c index deb95bb49fd1..d645f3899fe1 100644 --- a/drivers/usb/misc/rio500.c +++ b/drivers/usb/misc/rio500.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/usb/misc/usblcd.c b/drivers/usb/misc/usblcd.c index e0ff9ccd866b..29092b8e59ce 100644 --- a/drivers/usb/misc/usblcd.c +++ b/drivers/usb/misc/usblcd.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/usb/musb/cppi_dma.h b/drivers/usb/musb/cppi_dma.h index 8a39de3e6e47..59bf949e589b 100644 --- a/drivers/usb/musb/cppi_dma.h +++ b/drivers/usb/musb/cppi_dma.h @@ -5,7 +5,6 @@ #include #include -#include #include #include diff --git a/drivers/usb/musb/musb_core.h b/drivers/usb/musb/musb_core.h index f3772ca3b2cf..381d648a36b8 100644 --- a/drivers/usb/musb/musb_core.h +++ b/drivers/usb/musb/musb_core.h @@ -38,7 +38,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c index 5f08702f672f..5a8ae274d258 100644 --- a/drivers/usb/serial/ftdi_sio.c +++ b/drivers/usb/serial/ftdi_sio.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/usb/serial/mos7840.c b/drivers/usb/serial/mos7840.c index c40f95c1951c..c31940a307f8 100644 --- a/drivers/usb/serial/mos7840.c +++ b/drivers/usb/serial/mos7840.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/usb/serial/usb-serial.c b/drivers/usb/serial/usb-serial.c index a84216464ca0..0c39b55aeef4 100644 --- a/drivers/usb/serial/usb-serial.c +++ b/drivers/usb/serial/usb-serial.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/video/fbmem.c b/drivers/video/fbmem.c index 53ea05645ff8..a85c818be945 100644 --- a/drivers/video/fbmem.c +++ b/drivers/video/fbmem.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/adfs/super.c b/fs/adfs/super.c index aad92f0a1048..6910a98bd73c 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include "adfs.h" #include "dir_f.h" diff --git a/fs/afs/super.c b/fs/afs/super.c index ad0514d0115f..e1ea1c240b6a 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index f3da2eb51f56..00bf8fcb245f 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index 54bd07d44e68..1e41aadb1068 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include "bfs.h" diff --git a/fs/bfs/file.c b/fs/bfs/file.c index 6a021265f018..88b9a3ff44e4 100644 --- a/fs/bfs/file.c +++ b/fs/bfs/file.c @@ -11,7 +11,6 @@ #include #include -#include #include "bfs.h" #undef DEBUG diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index de1e2fd32080..9d8ba4d54a37 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 7c3cd248d8d6..4b833972273a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7ffa3d34ea19..791eab19e330 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 9f4db848db10..bd88f25889f7 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9f179d4832d5..6d6d06cb6dfc 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/char_dev.c b/fs/char_dev.c index b7c9d5187a75..a173551e19d7 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include diff --git a/fs/compat.c b/fs/compat.c index fbadb947727b..94502dab972a 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 626c7483b4de..f28f070a60fc 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/exofs/super.c b/fs/exofs/super.c index a343b4ea62f6..5ab10c3bbebe 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -31,6 +31,7 @@ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ +#include #include #include #include diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index 7cb4badef927..e7431309bdca 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index bb415408fdb6..24a6abb2aef5 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 38ff75a0fe22..530b4ca01510 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index 82f88733b681..bbc94ae4fd77 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -9,7 +9,6 @@ #include #include #include -#include #include "fat.h" /* Characters that are undesirable in an MS-DOS file name */ diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 73471b7ecc8c..cb6e83557112 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include "fat.h" diff --git a/fs/fcntl.c b/fs/fcntl.c index a040b764f8e3..ae413086db97 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include diff --git a/fs/freevxfs/vxfs_super.c b/fs/freevxfs/vxfs_super.c index cdbd1654e4cd..1e8af939b3e4 100644 --- a/fs/freevxfs/vxfs_super.c +++ b/fs/freevxfs/vxfs_super.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 6f833dc8e910..f7fcbe49da72 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include "hfs_fs.h" diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 9fc3af0c0dab..c0759fe0855b 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index 6916c41d7017..8865c94f55f6 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -6,6 +6,7 @@ * directory VFS functions */ +#include #include "hpfs_fn.h" static int hpfs_dir_release(struct inode *inode, struct file *filp) diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 64ab52259204..3efabff00367 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -6,6 +6,7 @@ * file VFS functions */ +#include #include "hpfs_fn.h" #define BLOCKS(size) (((size) + 511) >> 9) diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index c2ea31bae313..701ca54c0867 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -13,7 +13,6 @@ #include #include #include -#include #include "hpfs.h" diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 39a1bfbea312..fe703ae46bc7 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -6,6 +6,7 @@ * inode VFS functions */ +#include #include "hpfs_fn.h" void hpfs_init_inode(struct inode *i) diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index b649232dde97..82b9c4ba9ed0 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -6,6 +6,7 @@ * adding & removing files & directories */ #include +#include #include "hpfs_fn.h" static int hpfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 07a22caf2687..0035c021395a 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index f2fdcbce143e..4336adba952a 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -7,6 +7,7 @@ */ #include +#include #include #include #include diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 1725037374c5..bd173a6ca3b1 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 3688e55901fc..e1d28ddd2169 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index af05b918cb5b..6dd48a4405b4 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 89f98e9a024b..38d42c29fb92 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -29,7 +29,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 0055b813ec2c..05062329b678 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -26,7 +26,6 @@ #include #include #include -#include #include #include diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 64f87194d390..bd7938eda6a8 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 92ce43517814..ff0c080db59b 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -45,7 +45,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 96c4ebfa46f4..73ea5e8d66ce 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -18,7 +18,6 @@ #include #include #include -#include #include diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 1250fb978ac1..6d0847562d87 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -25,7 +25,6 @@ #include #include #include -#include #include #include diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index d4c9884cd54b..492c79b7800b 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -18,7 +18,6 @@ #include #include #include -#include #include #include #include diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 54100acc1102..1a4fa04cf071 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -43,7 +43,6 @@ */ #include -#include #include "nilfs.h" #include "page.h" diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 9fcd36dcc9a0..467b413bec21 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -7,7 +7,6 @@ #include #include -#include #define MLOG_MASK_PREFIX ML_INODE #include diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index f3d47d856848..6925b835a43b 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -46,7 +46,6 @@ #include #include #include -#include #include #include diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 3b52770f46ff..cb5fc57e370b 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 6db7a6be6c97..8aacd64957a2 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -25,7 +25,6 @@ /* This file implements EXT2-compatible extended attribute ioctl() calls */ #include -#include #include #include "ubifs.h" diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index f4e255441574..0542fd507649 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -41,7 +41,6 @@ #include "xfs_ioctl.h" #include -#include static struct vm_operations_struct xfs_file_vm_ops; diff --git a/include/linux/crash_dump.h b/include/linux/crash_dump.h index 2dac064d8359..0026f267da20 100644 --- a/include/linux/crash_dump.h +++ b/include/linux/crash_dump.h @@ -3,7 +3,6 @@ #ifdef CONFIG_CRASH_DUMP #include -#include #include #include diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 45257475623c..8246c697863d 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -2,7 +2,9 @@ #define LINUX_HARDIRQ_H #include +#ifdef CONFIG_PREEMPT #include +#endif #include #include #include diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h index 7bc457593684..26361c4c037a 100644 --- a/include/linux/quotaops.h +++ b/include/linux/quotaops.h @@ -7,7 +7,6 @@ #ifndef _LINUX_QUOTAOPS_ #define _LINUX_QUOTAOPS_ -#include #include static inline struct quota_info *sb_dqopt(struct super_block *sb) diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index d8910b68e1bd..b99c625fddfe 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -12,7 +12,6 @@ #include #include #include -#include /* * Buffer adjustment diff --git a/kernel/power/user.c b/kernel/power/user.c index ed97375daae9..bf0014d6a5f0 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 39af8af6fc30..1090b0aed9ba 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 3aa0a0dfdfa8..8bc8d8afea6a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 590b83963622..bfbe13786bb4 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -54,6 +54,7 @@ #include #include #include +#include #include /* For TIOCOUTQ/INQ */ #include #include diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 417b0e309495..f1118d92a191 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index cb762c8723ea..80cf29aae096 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include #include diff --git a/net/irda/irnet/irnet.h b/net/irda/irnet/irnet.h index bccf4d0059f0..b001c361ad30 100644 --- a/net/irda/irnet/irnet.h +++ b/net/irda/irnet/irnet.h @@ -241,7 +241,6 @@ #include #include -#include #include #include #include diff --git a/net/irda/irnet/irnet_ppp.c b/net/irda/irnet/irnet_ppp.c index 6d8ae03c14f5..68cbcb19cbd8 100644 --- a/net/irda/irnet/irnet_ppp.c +++ b/net/irda/irnet/irnet_ppp.c @@ -13,6 +13,7 @@ * 2) as a control channel (write commands, read events) */ +#include #include "irnet_ppp.h" /* Private header */ /* Please put other headers in irnet.h - Thanks */ diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 5bc2f45bddf0..ebfcf9b89909 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 1102ce1251f7..8f459abe97cf 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 6f33d33cc064..27d44332f017 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -5,6 +5,7 @@ */ #include +#include #include #include #include diff --git a/net/wanrouter/wanmain.c b/net/wanrouter/wanmain.c index 466e2d22d256..258daa80ad92 100644 --- a/net/wanrouter/wanmain.c +++ b/net/wanrouter/wanmain.c @@ -48,6 +48,7 @@ #include #include /* support for loadable modules */ #include /* kmalloc(), kfree() */ +#include #include #include /* inline mem*, str* functions */ diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 21cdc872004e..5e6c072c64d3 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -40,6 +40,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3-59-g8ed1b From 4ead0a2b6b3aa0b527871ec978c3ef12aafb6bfb Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Thu, 2 Jul 2009 23:25:44 +0200 Subject: Driver Core: remove BUS_ID_SIZE The name size limit is gone from the driver-core, this is the removal of the last left-over. Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- include/linux/device.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/device.h b/include/linux/device.h index ed4e39f2c423..aebb81036db2 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -25,8 +25,6 @@ #include #include -#define BUS_ID_SIZE 20 - struct device; struct device_private; struct device_driver; -- cgit v1.2.3-59-g8ed1b From 1a74826fa1cd6c2e382f927403b4440675f0f55a Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 23 Jun 2009 15:58:48 -0700 Subject: Revert "USB: Add Intel Langwell USB OTG Transceiver Drive" This reverts commit 453f77558810ffa669ed5a510a7173ec49def396. The driver should not have been accepted as the MSRT code is not in the main kernel yet, which this depends on. Cc: Alan Cox Cc: Hao Wu Signed-off-by: Greg Kroah-Hartman --- drivers/usb/otg/Kconfig | 14 - drivers/usb/otg/Makefile | 1 - drivers/usb/otg/langwell_otg.c | 1915 -------------------------------------- include/linux/usb/langwell_otg.h | 177 ---- 4 files changed, 2107 deletions(-) delete mode 100644 drivers/usb/otg/langwell_otg.c delete mode 100644 include/linux/usb/langwell_otg.h (limited to 'include') diff --git a/drivers/usb/otg/Kconfig b/drivers/usb/otg/Kconfig index 69feeec1628c..aa884d072f0b 100644 --- a/drivers/usb/otg/Kconfig +++ b/drivers/usb/otg/Kconfig @@ -59,18 +59,4 @@ config NOP_USB_XCEIV built-in with usb ip or which are autonomous and doesn't require any phy programming such as ISP1x04 etc. -config USB_LANGWELL_OTG - tristate "Intel Langwell USB OTG dual-role support" - depends on USB && MRST - select USB_OTG - select USB_OTG_UTILS - help - Say Y here if you want to build Intel Langwell USB OTG - transciever driver in kernel. This driver implements role - switch between EHCI host driver and Langwell USB OTG - client driver. - - To compile this driver as a module, choose M here: the - module will be called langwell_otg. - endif # USB || OTG diff --git a/drivers/usb/otg/Makefile b/drivers/usb/otg/Makefile index 6d1abdd3c0ac..208167856529 100644 --- a/drivers/usb/otg/Makefile +++ b/drivers/usb/otg/Makefile @@ -9,7 +9,6 @@ obj-$(CONFIG_USB_OTG_UTILS) += otg.o obj-$(CONFIG_USB_GPIO_VBUS) += gpio_vbus.o obj-$(CONFIG_ISP1301_OMAP) += isp1301_omap.o obj-$(CONFIG_TWL4030_USB) += twl4030-usb.o -obj-$(CONFIG_USB_LANGWELL_OTG) += langwell_otg.o obj-$(CONFIG_NOP_USB_XCEIV) += nop-usb-xceiv.o ccflags-$(CONFIG_USB_DEBUG) += -DDEBUG diff --git a/drivers/usb/otg/langwell_otg.c b/drivers/usb/otg/langwell_otg.c deleted file mode 100644 index 6f628d0e9f39..000000000000 --- a/drivers/usb/otg/langwell_otg.c +++ /dev/null @@ -1,1915 +0,0 @@ -/* - * Intel Langwell USB OTG transceiver driver - * Copyright (C) 2008 - 2009, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. - * - */ -/* This driver helps to switch Langwell OTG controller function between host - * and peripheral. It works with EHCI driver and Langwell client controller - * driver together. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "../core/hcd.h" - -#include - -#define DRIVER_DESC "Intel Langwell USB OTG transceiver driver" -#define DRIVER_VERSION "3.0.0.32L.0002" - -MODULE_DESCRIPTION(DRIVER_DESC); -MODULE_AUTHOR("Henry Yuan , Hao Wu "); -MODULE_VERSION(DRIVER_VERSION); -MODULE_LICENSE("GPL"); - -static const char driver_name[] = "langwell_otg"; - -static int langwell_otg_probe(struct pci_dev *pdev, - const struct pci_device_id *id); -static void langwell_otg_remove(struct pci_dev *pdev); -static int langwell_otg_suspend(struct pci_dev *pdev, pm_message_t message); -static int langwell_otg_resume(struct pci_dev *pdev); - -static int langwell_otg_set_host(struct otg_transceiver *otg, - struct usb_bus *host); -static int langwell_otg_set_peripheral(struct otg_transceiver *otg, - struct usb_gadget *gadget); -static int langwell_otg_start_srp(struct otg_transceiver *otg); - -static const struct pci_device_id pci_ids[] = {{ - .class = ((PCI_CLASS_SERIAL_USB << 8) | 0xfe), - .class_mask = ~0, - .vendor = 0x8086, - .device = 0x0811, - .subvendor = PCI_ANY_ID, - .subdevice = PCI_ANY_ID, -}, { /* end: all zeroes */ } -}; - -static struct pci_driver otg_pci_driver = { - .name = (char *) driver_name, - .id_table = pci_ids, - - .probe = langwell_otg_probe, - .remove = langwell_otg_remove, - - .suspend = langwell_otg_suspend, - .resume = langwell_otg_resume, -}; - -static const char *state_string(enum usb_otg_state state) -{ - switch (state) { - case OTG_STATE_A_IDLE: - return "a_idle"; - case OTG_STATE_A_WAIT_VRISE: - return "a_wait_vrise"; - case OTG_STATE_A_WAIT_BCON: - return "a_wait_bcon"; - case OTG_STATE_A_HOST: - return "a_host"; - case OTG_STATE_A_SUSPEND: - return "a_suspend"; - case OTG_STATE_A_PERIPHERAL: - return "a_peripheral"; - case OTG_STATE_A_WAIT_VFALL: - return "a_wait_vfall"; - case OTG_STATE_A_VBUS_ERR: - return "a_vbus_err"; - case OTG_STATE_B_IDLE: - return "b_idle"; - case OTG_STATE_B_SRP_INIT: - return "b_srp_init"; - case OTG_STATE_B_PERIPHERAL: - return "b_peripheral"; - case OTG_STATE_B_WAIT_ACON: - return "b_wait_acon"; - case OTG_STATE_B_HOST: - return "b_host"; - default: - return "UNDEFINED"; - } -} - -/* HSM timers */ -static inline struct langwell_otg_timer *otg_timer_initializer -(void (*function)(unsigned long), unsigned long expires, unsigned long data) -{ - struct langwell_otg_timer *timer; - timer = kmalloc(sizeof(struct langwell_otg_timer), GFP_KERNEL); - timer->function = function; - timer->expires = expires; - timer->data = data; - return timer; -} - -static struct langwell_otg_timer *a_wait_vrise_tmr, *a_wait_bcon_tmr, - *a_aidl_bdis_tmr, *b_ase0_brst_tmr, *b_se0_srp_tmr, *b_srp_res_tmr, - *b_bus_suspend_tmr; - -static struct list_head active_timers; - -static struct langwell_otg *the_transceiver; - -/* host/client notify transceiver when event affects HNP state */ -void langwell_update_transceiver() -{ - otg_dbg("transceiver driver is notified\n"); - queue_work(the_transceiver->qwork, &the_transceiver->work); -} -EXPORT_SYMBOL(langwell_update_transceiver); - -static int langwell_otg_set_host(struct otg_transceiver *otg, - struct usb_bus *host) -{ - otg->host = host; - - return 0; -} - -static int langwell_otg_set_peripheral(struct otg_transceiver *otg, - struct usb_gadget *gadget) -{ - otg->gadget = gadget; - - return 0; -} - -static int langwell_otg_set_power(struct otg_transceiver *otg, - unsigned mA) -{ - return 0; -} - -/* A-device drives vbus, controlled through PMIC CHRGCNTL register*/ -static void langwell_otg_drv_vbus(int on) -{ - struct ipc_pmic_reg_data pmic_data = {0}; - struct ipc_pmic_reg_data battery_data; - - /* Check if battery is attached or not */ - battery_data.pmic_reg_data[0].register_address = 0xd2; - battery_data.ioc = 0; - battery_data.num_entries = 1; - if (ipc_pmic_register_read(&battery_data)) { - otg_dbg("Failed to read PMIC register 0xd2.\n"); - return; - } - - if ((battery_data.pmic_reg_data[0].value & 0x20) == 0) { - otg_dbg("no battery attached\n"); - return; - } - - /* Workaround for battery attachment issue */ - if (battery_data.pmic_reg_data[0].value == 0x34) { - otg_dbg("battery \n"); - return; - } - - otg_dbg("battery attached\n"); - - pmic_data.ioc = 0; - pmic_data.pmic_reg_data[0].register_address = 0xD4; - pmic_data.num_entries = 1; - if (on) - pmic_data.pmic_reg_data[0].value = 0x20; - else - pmic_data.pmic_reg_data[0].value = 0xc0; - - if (ipc_pmic_register_write(&pmic_data, TRUE)) - otg_dbg("Failed to write PMIC.\n"); - -} - -/* charge vbus or discharge vbus through a resistor to ground */ -static void langwell_otg_chrg_vbus(int on) -{ - - u32 val; - - val = readl(the_transceiver->regs + CI_OTGSC); - - if (on) - writel((val & ~OTGSC_INTSTS_MASK) | OTGSC_VC, - the_transceiver->regs + CI_OTGSC); - else - writel((val & ~OTGSC_INTSTS_MASK) | OTGSC_VD, - the_transceiver->regs + CI_OTGSC); - -} - -/* Start SRP */ -static int langwell_otg_start_srp(struct otg_transceiver *otg) -{ - u32 val; - - otg_dbg("Start SRP ->\n"); - - val = readl(the_transceiver->regs + CI_OTGSC); - - writel((val & ~OTGSC_INTSTS_MASK) | OTGSC_HADP, - the_transceiver->regs + CI_OTGSC); - - /* Check if the data plus is finished or not */ - msleep(8); - val = readl(the_transceiver->regs + CI_OTGSC); - if (val & (OTGSC_HADP | OTGSC_DP)) - otg_dbg("DataLine SRP Error\n"); - - /* FIXME: VBus SRP */ - - return 0; -} - - -/* stop SOF via bus_suspend */ -static void langwell_otg_loc_sof(int on) -{ - struct usb_hcd *hcd; - int err; - - otg_dbg("loc_sof -> %d\n", on); - - hcd = bus_to_hcd(the_transceiver->otg.host); - if (on) - err = hcd->driver->bus_resume(hcd); - else - err = hcd->driver->bus_suspend(hcd); - - if (err) - otg_dbg("Failed to resume/suspend bus - %d\n", err); -} - -static void langwell_otg_phy_low_power(int on) -{ - u32 val; - - otg_dbg("phy low power mode-> %d\n", on); - - val = readl(the_transceiver->regs + CI_HOSTPC1); - if (on) - writel(val | HOSTPC1_PHCD, the_transceiver->regs + CI_HOSTPC1); - else - writel(val & ~HOSTPC1_PHCD, the_transceiver->regs + CI_HOSTPC1); -} - -/* Enable/Disable OTG interrupt */ -static void langwell_otg_intr(int on) -{ - u32 val; - - otg_dbg("interrupt -> %d\n", on); - - val = readl(the_transceiver->regs + CI_OTGSC); - if (on) { - val = val | (OTGSC_INTEN_MASK | OTGSC_IDPU); - writel(val, the_transceiver->regs + CI_OTGSC); - } else { - val = val & ~(OTGSC_INTEN_MASK | OTGSC_IDPU); - writel(val, the_transceiver->regs + CI_OTGSC); - } -} - -/* set HAAR: Hardware Assist Auto-Reset */ -static void langwell_otg_HAAR(int on) -{ - u32 val; - - otg_dbg("HAAR -> %d\n", on); - - val = readl(the_transceiver->regs + CI_OTGSC); - if (on) - writel((val & ~OTGSC_INTSTS_MASK) | OTGSC_HAAR, - the_transceiver->regs + CI_OTGSC); - else - writel((val & ~OTGSC_INTSTS_MASK) & ~OTGSC_HAAR, - the_transceiver->regs + CI_OTGSC); -} - -/* set HABA: Hardware Assist B-Disconnect to A-Connect */ -static void langwell_otg_HABA(int on) -{ - u32 val; - - otg_dbg("HABA -> %d\n", on); - - val = readl(the_transceiver->regs + CI_OTGSC); - if (on) - writel((val & ~OTGSC_INTSTS_MASK) | OTGSC_HABA, - the_transceiver->regs + CI_OTGSC); - else - writel((val & ~OTGSC_INTSTS_MASK) & ~OTGSC_HABA, - the_transceiver->regs + CI_OTGSC); -} - -static int langwell_otg_check_se0_srp(int on) -{ - u32 val; - - int delay_time = TB_SE0_SRP * 10; /* step is 100us */ - - otg_dbg("check_se0_srp -> \n"); - - do { - udelay(100); - if (!delay_time--) - break; - val = readl(the_transceiver->regs + CI_PORTSC1); - val &= PORTSC_LS; - } while (!val); - - otg_dbg("check_se0_srp <- \n"); - return val; -} - -/* The timeout callback function to set time out bit */ -static void set_tmout(unsigned long indicator) -{ - *(int *)indicator = 1; -} - -void langwell_otg_nsf_msg(unsigned long indicator) -{ - switch (indicator) { - case 2: - case 4: - case 6: - case 7: - printk(KERN_ERR "OTG:NSF-%lu - deivce not responding\n", - indicator); - break; - case 3: - printk(KERN_ERR "OTG:NSF-%lu - deivce not supported\n", - indicator); - break; - default: - printk(KERN_ERR "Do not have this kind of NSF\n"); - break; - } -} - -/* Initialize timers */ -static void langwell_otg_init_timers(struct otg_hsm *hsm) -{ - /* HSM used timers */ - a_wait_vrise_tmr = otg_timer_initializer(&set_tmout, TA_WAIT_VRISE, - (unsigned long)&hsm->a_wait_vrise_tmout); - a_wait_bcon_tmr = otg_timer_initializer(&set_tmout, TA_WAIT_BCON, - (unsigned long)&hsm->a_wait_bcon_tmout); - a_aidl_bdis_tmr = otg_timer_initializer(&set_tmout, TA_AIDL_BDIS, - (unsigned long)&hsm->a_aidl_bdis_tmout); - b_ase0_brst_tmr = otg_timer_initializer(&set_tmout, TB_ASE0_BRST, - (unsigned long)&hsm->b_ase0_brst_tmout); - b_se0_srp_tmr = otg_timer_initializer(&set_tmout, TB_SE0_SRP, - (unsigned long)&hsm->b_se0_srp); - b_srp_res_tmr = otg_timer_initializer(&set_tmout, TB_SRP_RES, - (unsigned long)&hsm->b_srp_res_tmout); - b_bus_suspend_tmr = otg_timer_initializer(&set_tmout, TB_BUS_SUSPEND, - (unsigned long)&hsm->b_bus_suspend_tmout); -} - -/* Free timers */ -static void langwell_otg_free_timers(void) -{ - kfree(a_wait_vrise_tmr); - kfree(a_wait_bcon_tmr); - kfree(a_aidl_bdis_tmr); - kfree(b_ase0_brst_tmr); - kfree(b_se0_srp_tmr); - kfree(b_srp_res_tmr); - kfree(b_bus_suspend_tmr); -} - -/* Add timer to timer list */ -static void langwell_otg_add_timer(void *gtimer) -{ - struct langwell_otg_timer *timer = (struct langwell_otg_timer *)gtimer; - struct langwell_otg_timer *tmp_timer; - u32 val32; - - /* Check if the timer is already in the active list, - * if so update timer count - */ - list_for_each_entry(tmp_timer, &active_timers, list) - if (tmp_timer == timer) { - timer->count = timer->expires; - return; - } - timer->count = timer->expires; - - if (list_empty(&active_timers)) { - val32 = readl(the_transceiver->regs + CI_OTGSC); - writel(val32 | OTGSC_1MSE, the_transceiver->regs + CI_OTGSC); - } - - list_add_tail(&timer->list, &active_timers); -} - -/* Remove timer from the timer list; clear timeout status */ -static void langwell_otg_del_timer(void *gtimer) -{ - struct langwell_otg_timer *timer = (struct langwell_otg_timer *)gtimer; - struct langwell_otg_timer *tmp_timer, *del_tmp; - u32 val32; - - list_for_each_entry_safe(tmp_timer, del_tmp, &active_timers, list) - if (tmp_timer == timer) - list_del(&timer->list); - - if (list_empty(&active_timers)) { - val32 = readl(the_transceiver->regs + CI_OTGSC); - writel(val32 & ~OTGSC_1MSE, the_transceiver->regs + CI_OTGSC); - } -} - -/* Reduce timer count by 1, and find timeout conditions.*/ -static int langwell_otg_tick_timer(u32 *int_sts) -{ - struct langwell_otg_timer *tmp_timer, *del_tmp; - int expired = 0; - - list_for_each_entry_safe(tmp_timer, del_tmp, &active_timers, list) { - tmp_timer->count--; - /* check if timer expires */ - if (!tmp_timer->count) { - list_del(&tmp_timer->list); - tmp_timer->function(tmp_timer->data); - expired = 1; - } - } - - if (list_empty(&active_timers)) { - otg_dbg("tick timer: disable 1ms int\n"); - *int_sts = *int_sts & ~OTGSC_1MSE; - } - return expired; -} - -static void reset_otg(void) -{ - u32 val; - int delay_time = 1000; - - otg_dbg("reseting OTG controller ...\n"); - val = readl(the_transceiver->regs + CI_USBCMD); - writel(val | USBCMD_RST, the_transceiver->regs + CI_USBCMD); - do { - udelay(100); - if (!delay_time--) - otg_dbg("reset timeout\n"); - val = readl(the_transceiver->regs + CI_USBCMD); - val &= USBCMD_RST; - } while (val != 0); - otg_dbg("reset done.\n"); -} - -static void set_host_mode(void) -{ - u32 val; - - reset_otg(); - val = readl(the_transceiver->regs + CI_USBMODE); - val = (val & (~USBMODE_CM)) | USBMODE_HOST; - writel(val, the_transceiver->regs + CI_USBMODE); -} - -static void set_client_mode(void) -{ - u32 val; - - reset_otg(); - val = readl(the_transceiver->regs + CI_USBMODE); - val = (val & (~USBMODE_CM)) | USBMODE_DEVICE; - writel(val, the_transceiver->regs + CI_USBMODE); -} - -static void init_hsm(void) -{ - struct langwell_otg *langwell = the_transceiver; - u32 val32; - - /* read OTGSC after reset */ - val32 = readl(langwell->regs + CI_OTGSC); - otg_dbg("%s: OTGSC init value = 0x%x\n", __func__, val32); - - /* set init state */ - if (val32 & OTGSC_ID) { - langwell->hsm.id = 1; - langwell->otg.default_a = 0; - set_client_mode(); - langwell->otg.state = OTG_STATE_B_IDLE; - langwell_otg_drv_vbus(0); - } else { - langwell->hsm.id = 0; - langwell->otg.default_a = 1; - set_host_mode(); - langwell->otg.state = OTG_STATE_A_IDLE; - } - - /* set session indicator */ - if (val32 & OTGSC_BSE) - langwell->hsm.b_sess_end = 1; - if (val32 & OTGSC_BSV) - langwell->hsm.b_sess_vld = 1; - if (val32 & OTGSC_ASV) - langwell->hsm.a_sess_vld = 1; - if (val32 & OTGSC_AVV) - langwell->hsm.a_vbus_vld = 1; - - /* defautly power the bus */ - langwell->hsm.a_bus_req = 1; - langwell->hsm.a_bus_drop = 0; - /* defautly don't request bus as B device */ - langwell->hsm.b_bus_req = 0; - /* no system error */ - langwell->hsm.a_clr_err = 0; -} - -static irqreturn_t otg_dummy_irq(int irq, void *_dev) -{ - void __iomem *reg_base = _dev; - u32 val; - u32 int_mask = 0; - - val = readl(reg_base + CI_USBMODE); - if ((val & USBMODE_CM) != USBMODE_DEVICE) - return IRQ_NONE; - - val = readl(reg_base + CI_USBSTS); - int_mask = val & INTR_DUMMY_MASK; - - if (int_mask == 0) - return IRQ_NONE; - - /* clear hsm.b_conn here since host driver can't detect it - * otg_dummy_irq called means B-disconnect happened. - */ - if (the_transceiver->hsm.b_conn) { - the_transceiver->hsm.b_conn = 0; - if (spin_trylock(&the_transceiver->wq_lock)) { - queue_work(the_transceiver->qwork, - &the_transceiver->work); - spin_unlock(&the_transceiver->wq_lock); - } - } - /* Clear interrupts */ - writel(int_mask, reg_base + CI_USBSTS); - return IRQ_HANDLED; -} - -static irqreturn_t otg_irq(int irq, void *_dev) -{ - struct langwell_otg *langwell = _dev; - u32 int_sts, int_en; - u32 int_mask = 0; - int flag = 0; - - int_sts = readl(langwell->regs + CI_OTGSC); - int_en = (int_sts & OTGSC_INTEN_MASK) >> 8; - int_mask = int_sts & int_en; - if (int_mask == 0) - return IRQ_NONE; - - if (int_mask & OTGSC_IDIS) { - otg_dbg("%s: id change int\n", __func__); - langwell->hsm.id = (int_sts & OTGSC_ID) ? 1 : 0; - flag = 1; - } - if (int_mask & OTGSC_DPIS) { - otg_dbg("%s: data pulse int\n", __func__); - langwell->hsm.a_srp_det = (int_sts & OTGSC_DPS) ? 1 : 0; - flag = 1; - } - if (int_mask & OTGSC_BSEIS) { - otg_dbg("%s: b session end int\n", __func__); - langwell->hsm.b_sess_end = (int_sts & OTGSC_BSE) ? 1 : 0; - flag = 1; - } - if (int_mask & OTGSC_BSVIS) { - otg_dbg("%s: b session valid int\n", __func__); - langwell->hsm.b_sess_vld = (int_sts & OTGSC_BSV) ? 1 : 0; - flag = 1; - } - if (int_mask & OTGSC_ASVIS) { - otg_dbg("%s: a session valid int\n", __func__); - langwell->hsm.a_sess_vld = (int_sts & OTGSC_ASV) ? 1 : 0; - flag = 1; - } - if (int_mask & OTGSC_AVVIS) { - otg_dbg("%s: a vbus valid int\n", __func__); - langwell->hsm.a_vbus_vld = (int_sts & OTGSC_AVV) ? 1 : 0; - flag = 1; - } - - if (int_mask & OTGSC_1MSS) { - /* need to schedule otg_work if any timer is expired */ - if (langwell_otg_tick_timer(&int_sts)) - flag = 1; - } - - writel((int_sts & ~OTGSC_INTSTS_MASK) | int_mask, - langwell->regs + CI_OTGSC); - if (flag) - queue_work(langwell->qwork, &langwell->work); - - return IRQ_HANDLED; -} - -static void langwell_otg_work(struct work_struct *work) -{ - struct langwell_otg *langwell = container_of(work, - struct langwell_otg, work); - int retval; - - otg_dbg("%s: old state = %s\n", __func__, - state_string(langwell->otg.state)); - - switch (langwell->otg.state) { - case OTG_STATE_UNDEFINED: - case OTG_STATE_B_IDLE: - if (!langwell->hsm.id) { - langwell_otg_del_timer(b_srp_res_tmr); - langwell->otg.default_a = 1; - langwell->hsm.a_srp_det = 0; - - langwell_otg_chrg_vbus(0); - langwell_otg_drv_vbus(0); - - set_host_mode(); - langwell->otg.state = OTG_STATE_A_IDLE; - queue_work(langwell->qwork, &langwell->work); - } else if (langwell->hsm.b_srp_res_tmout) { - langwell->hsm.b_srp_res_tmout = 0; - langwell->hsm.b_bus_req = 0; - langwell_otg_nsf_msg(6); - } else if (langwell->hsm.b_sess_vld) { - langwell_otg_del_timer(b_srp_res_tmr); - langwell->hsm.b_sess_end = 0; - langwell->hsm.a_bus_suspend = 0; - - langwell_otg_chrg_vbus(0); - if (langwell->client_ops) { - langwell->client_ops->resume(langwell->pdev); - langwell->otg.state = OTG_STATE_B_PERIPHERAL; - } else - otg_dbg("client driver not loaded.\n"); - - } else if (langwell->hsm.b_bus_req && - (langwell->hsm.b_sess_end)) { - /* workaround for b_se0_srp detection */ - retval = langwell_otg_check_se0_srp(0); - if (retval) { - langwell->hsm.b_bus_req = 0; - otg_dbg("LS is not SE0, try again later\n"); - } else { - /* Start SRP */ - langwell_otg_start_srp(&langwell->otg); - langwell_otg_add_timer(b_srp_res_tmr); - } - } - break; - case OTG_STATE_B_SRP_INIT: - if (!langwell->hsm.id) { - langwell->otg.default_a = 1; - langwell->hsm.a_srp_det = 0; - - langwell_otg_drv_vbus(0); - langwell_otg_chrg_vbus(0); - - langwell->otg.state = OTG_STATE_A_IDLE; - queue_work(langwell->qwork, &langwell->work); - } else if (langwell->hsm.b_sess_vld) { - langwell_otg_chrg_vbus(0); - if (langwell->client_ops) { - langwell->client_ops->resume(langwell->pdev); - langwell->otg.state = OTG_STATE_B_PERIPHERAL; - } else - otg_dbg("client driver not loaded.\n"); - } - break; - case OTG_STATE_B_PERIPHERAL: - if (!langwell->hsm.id) { - langwell->otg.default_a = 1; - langwell->hsm.a_srp_det = 0; - - langwell_otg_drv_vbus(0); - langwell_otg_chrg_vbus(0); - set_host_mode(); - - if (langwell->client_ops) { - langwell->client_ops->suspend(langwell->pdev, - PMSG_FREEZE); - } else - otg_dbg("client driver has been removed.\n"); - - langwell->otg.state = OTG_STATE_A_IDLE; - queue_work(langwell->qwork, &langwell->work); - } else if (!langwell->hsm.b_sess_vld) { - langwell->hsm.b_hnp_enable = 0; - - if (langwell->client_ops) { - langwell->client_ops->suspend(langwell->pdev, - PMSG_FREEZE); - } else - otg_dbg("client driver has been removed.\n"); - - langwell->otg.state = OTG_STATE_B_IDLE; - } else if (langwell->hsm.b_bus_req && langwell->hsm.b_hnp_enable - && langwell->hsm.a_bus_suspend) { - - if (langwell->client_ops) { - langwell->client_ops->suspend(langwell->pdev, - PMSG_FREEZE); - } else - otg_dbg("client driver has been removed.\n"); - - langwell_otg_HAAR(1); - langwell->hsm.a_conn = 0; - - if (langwell->host_ops) { - langwell->host_ops->probe(langwell->pdev, - langwell->host_ops->id_table); - langwell->otg.state = OTG_STATE_B_WAIT_ACON; - } else - otg_dbg("host driver not loaded.\n"); - - langwell->hsm.a_bus_resume = 0; - langwell->hsm.b_ase0_brst_tmout = 0; - langwell_otg_add_timer(b_ase0_brst_tmr); - } - break; - - case OTG_STATE_B_WAIT_ACON: - if (!langwell->hsm.id) { - langwell_otg_del_timer(b_ase0_brst_tmr); - langwell->otg.default_a = 1; - langwell->hsm.a_srp_det = 0; - - langwell_otg_drv_vbus(0); - langwell_otg_chrg_vbus(0); - set_host_mode(); - - langwell_otg_HAAR(0); - if (langwell->host_ops) - langwell->host_ops->remove(langwell->pdev); - else - otg_dbg("host driver has been removed.\n"); - langwell->otg.state = OTG_STATE_A_IDLE; - queue_work(langwell->qwork, &langwell->work); - } else if (!langwell->hsm.b_sess_vld) { - langwell_otg_del_timer(b_ase0_brst_tmr); - langwell->hsm.b_hnp_enable = 0; - langwell->hsm.b_bus_req = 0; - langwell_otg_chrg_vbus(0); - langwell_otg_HAAR(0); - - if (langwell->host_ops) - langwell->host_ops->remove(langwell->pdev); - else - otg_dbg("host driver has been removed.\n"); - langwell->otg.state = OTG_STATE_B_IDLE; - } else if (langwell->hsm.a_conn) { - langwell_otg_del_timer(b_ase0_brst_tmr); - langwell_otg_HAAR(0); - langwell->otg.state = OTG_STATE_B_HOST; - queue_work(langwell->qwork, &langwell->work); - } else if (langwell->hsm.a_bus_resume || - langwell->hsm.b_ase0_brst_tmout) { - langwell_otg_del_timer(b_ase0_brst_tmr); - langwell_otg_HAAR(0); - langwell_otg_nsf_msg(7); - - if (langwell->host_ops) - langwell->host_ops->remove(langwell->pdev); - else - otg_dbg("host driver has been removed.\n"); - - langwell->hsm.a_bus_suspend = 0; - langwell->hsm.b_bus_req = 0; - - if (langwell->client_ops) - langwell->client_ops->resume(langwell->pdev); - else - otg_dbg("client driver not loaded.\n"); - - langwell->otg.state = OTG_STATE_B_PERIPHERAL; - } - break; - - case OTG_STATE_B_HOST: - if (!langwell->hsm.id) { - langwell->otg.default_a = 1; - langwell->hsm.a_srp_det = 0; - - langwell_otg_drv_vbus(0); - langwell_otg_chrg_vbus(0); - set_host_mode(); - if (langwell->host_ops) - langwell->host_ops->remove(langwell->pdev); - else - otg_dbg("host driver has been removed.\n"); - langwell->otg.state = OTG_STATE_A_IDLE; - queue_work(langwell->qwork, &langwell->work); - } else if (!langwell->hsm.b_sess_vld) { - langwell->hsm.b_hnp_enable = 0; - langwell->hsm.b_bus_req = 0; - langwell_otg_chrg_vbus(0); - if (langwell->host_ops) - langwell->host_ops->remove(langwell->pdev); - else - otg_dbg("host driver has been removed.\n"); - langwell->otg.state = OTG_STATE_B_IDLE; - } else if ((!langwell->hsm.b_bus_req) || - (!langwell->hsm.a_conn)) { - langwell->hsm.b_bus_req = 0; - langwell_otg_loc_sof(0); - if (langwell->host_ops) - langwell->host_ops->remove(langwell->pdev); - else - otg_dbg("host driver has been removed.\n"); - - langwell->hsm.a_bus_suspend = 0; - - if (langwell->client_ops) - langwell->client_ops->resume(langwell->pdev); - else - otg_dbg("client driver not loaded.\n"); - - langwell->otg.state = OTG_STATE_B_PERIPHERAL; - } - break; - - case OTG_STATE_A_IDLE: - langwell->otg.default_a = 1; - if (langwell->hsm.id) { - langwell->otg.default_a = 0; - langwell->hsm.b_bus_req = 0; - langwell_otg_drv_vbus(0); - langwell_otg_chrg_vbus(0); - - langwell->otg.state = OTG_STATE_B_IDLE; - queue_work(langwell->qwork, &langwell->work); - } else if (langwell->hsm.a_sess_vld) { - langwell_otg_drv_vbus(1); - langwell->hsm.a_srp_det = 1; - langwell->hsm.a_wait_vrise_tmout = 0; - langwell_otg_add_timer(a_wait_vrise_tmr); - langwell->otg.state = OTG_STATE_A_WAIT_VRISE; - queue_work(langwell->qwork, &langwell->work); - } else if (!langwell->hsm.a_bus_drop && - (langwell->hsm.a_srp_det || langwell->hsm.a_bus_req)) { - langwell_otg_drv_vbus(1); - langwell->hsm.a_wait_vrise_tmout = 0; - langwell_otg_add_timer(a_wait_vrise_tmr); - langwell->otg.state = OTG_STATE_A_WAIT_VRISE; - queue_work(langwell->qwork, &langwell->work); - } - break; - case OTG_STATE_A_WAIT_VRISE: - if (langwell->hsm.id) { - langwell_otg_del_timer(a_wait_vrise_tmr); - langwell->hsm.b_bus_req = 0; - langwell->otg.default_a = 0; - langwell_otg_drv_vbus(0); - langwell->otg.state = OTG_STATE_B_IDLE; - } else if (langwell->hsm.a_vbus_vld) { - langwell_otg_del_timer(a_wait_vrise_tmr); - if (langwell->host_ops) - langwell->host_ops->probe(langwell->pdev, - langwell->host_ops->id_table); - else - otg_dbg("host driver not loaded.\n"); - langwell->hsm.b_conn = 0; - langwell->hsm.a_set_b_hnp_en = 0; - langwell->hsm.a_wait_bcon_tmout = 0; - langwell_otg_add_timer(a_wait_bcon_tmr); - langwell->otg.state = OTG_STATE_A_WAIT_BCON; - } else if (langwell->hsm.a_wait_vrise_tmout) { - if (langwell->hsm.a_vbus_vld) { - if (langwell->host_ops) - langwell->host_ops->probe( - langwell->pdev, - langwell->host_ops->id_table); - else - otg_dbg("host driver not loaded.\n"); - langwell->hsm.b_conn = 0; - langwell->hsm.a_set_b_hnp_en = 0; - langwell->hsm.a_wait_bcon_tmout = 0; - langwell_otg_add_timer(a_wait_bcon_tmr); - langwell->otg.state = OTG_STATE_A_WAIT_BCON; - } else { - langwell_otg_drv_vbus(0); - langwell->otg.state = OTG_STATE_A_VBUS_ERR; - } - } - break; - case OTG_STATE_A_WAIT_BCON: - if (langwell->hsm.id) { - langwell_otg_del_timer(a_wait_bcon_tmr); - - langwell->otg.default_a = 0; - langwell->hsm.b_bus_req = 0; - if (langwell->host_ops) - langwell->host_ops->remove(langwell->pdev); - else - otg_dbg("host driver has been removed.\n"); - langwell_otg_drv_vbus(0); - langwell->otg.state = OTG_STATE_B_IDLE; - queue_work(langwell->qwork, &langwell->work); - } else if (!langwell->hsm.a_vbus_vld) { - langwell_otg_del_timer(a_wait_bcon_tmr); - - if (langwell->host_ops) - langwell->host_ops->remove(langwell->pdev); - else - otg_dbg("host driver has been removed.\n"); - langwell_otg_drv_vbus(0); - langwell->otg.state = OTG_STATE_A_VBUS_ERR; - } else if (langwell->hsm.a_bus_drop || - (langwell->hsm.a_wait_bcon_tmout && - !langwell->hsm.a_bus_req)) { - langwell_otg_del_timer(a_wait_bcon_tmr); - - if (langwell->host_ops) - langwell->host_ops->remove(langwell->pdev); - else - otg_dbg("host driver has been removed.\n"); - langwell_otg_drv_vbus(0); - langwell->otg.state = OTG_STATE_A_WAIT_VFALL; - } else if (langwell->hsm.b_conn) { - langwell_otg_del_timer(a_wait_bcon_tmr); - - langwell->hsm.a_suspend_req = 0; - langwell->otg.state = OTG_STATE_A_HOST; - if (!langwell->hsm.a_bus_req && - langwell->hsm.a_set_b_hnp_en) { - /* It is not safe enough to do a fast - * transistion from A_WAIT_BCON to - * A_SUSPEND */ - msleep(10000); - if (langwell->hsm.a_bus_req) - break; - - if (request_irq(langwell->pdev->irq, - otg_dummy_irq, IRQF_SHARED, - driver_name, langwell->regs) != 0) { - otg_dbg("request interrupt %d fail\n", - langwell->pdev->irq); - } - - langwell_otg_HABA(1); - langwell->hsm.b_bus_resume = 0; - langwell->hsm.a_aidl_bdis_tmout = 0; - langwell_otg_add_timer(a_aidl_bdis_tmr); - - langwell_otg_loc_sof(0); - langwell->otg.state = OTG_STATE_A_SUSPEND; - } else if (!langwell->hsm.a_bus_req && - !langwell->hsm.a_set_b_hnp_en) { - struct pci_dev *pdev = langwell->pdev; - if (langwell->host_ops) - langwell->host_ops->remove(pdev); - else - otg_dbg("host driver removed.\n"); - langwell_otg_drv_vbus(0); - langwell->otg.state = OTG_STATE_A_WAIT_VFALL; - } - } - break; - case OTG_STATE_A_HOST: - if (langwell->hsm.id) { - langwell->otg.default_a = 0; - langwell->hsm.b_bus_req = 0; - if (langwell->host_ops) - langwell->host_ops->remove(langwell->pdev); - else - otg_dbg("host driver has been removed.\n"); - langwell_otg_drv_vbus(0); - langwell->otg.state = OTG_STATE_B_IDLE; - queue_work(langwell->qwork, &langwell->work); - } else if (langwell->hsm.a_bus_drop || - (!langwell->hsm.a_set_b_hnp_en && !langwell->hsm.a_bus_req)) { - if (langwell->host_ops) - langwell->host_ops->remove(langwell->pdev); - else - otg_dbg("host driver has been removed.\n"); - langwell_otg_drv_vbus(0); - langwell->otg.state = OTG_STATE_A_WAIT_VFALL; - } else if (!langwell->hsm.a_vbus_vld) { - if (langwell->host_ops) - langwell->host_ops->remove(langwell->pdev); - else - otg_dbg("host driver has been removed.\n"); - langwell_otg_drv_vbus(0); - langwell->otg.state = OTG_STATE_A_VBUS_ERR; - } else if (langwell->hsm.a_set_b_hnp_en - && !langwell->hsm.a_bus_req) { - /* Set HABA to enable hardware assistance to signal - * A-connect after receiver B-disconnect. Hardware - * will then set client mode and enable URE, SLE and - * PCE after the assistance. otg_dummy_irq is used to - * clean these ints when client driver is not resumed. - */ - if (request_irq(langwell->pdev->irq, - otg_dummy_irq, IRQF_SHARED, driver_name, - langwell->regs) != 0) { - otg_dbg("request interrupt %d failed\n", - langwell->pdev->irq); - } - - /* set HABA */ - langwell_otg_HABA(1); - langwell->hsm.b_bus_resume = 0; - langwell->hsm.a_aidl_bdis_tmout = 0; - langwell_otg_add_timer(a_aidl_bdis_tmr); - langwell_otg_loc_sof(0); - langwell->otg.state = OTG_STATE_A_SUSPEND; - } else if (!langwell->hsm.b_conn || !langwell->hsm.a_bus_req) { - langwell->hsm.a_wait_bcon_tmout = 0; - langwell->hsm.a_set_b_hnp_en = 0; - langwell_otg_add_timer(a_wait_bcon_tmr); - langwell->otg.state = OTG_STATE_A_WAIT_BCON; - } - break; - case OTG_STATE_A_SUSPEND: - if (langwell->hsm.id) { - langwell_otg_del_timer(a_aidl_bdis_tmr); - langwell_otg_HABA(0); - free_irq(langwell->pdev->irq, langwell->regs); - langwell->otg.default_a = 0; - langwell->hsm.b_bus_req = 0; - if (langwell->host_ops) - langwell->host_ops->remove(langwell->pdev); - else - otg_dbg("host driver has been removed.\n"); - langwell_otg_drv_vbus(0); - langwell->otg.state = OTG_STATE_B_IDLE; - queue_work(langwell->qwork, &langwell->work); - } else if (langwell->hsm.a_bus_req || - langwell->hsm.b_bus_resume) { - langwell_otg_del_timer(a_aidl_bdis_tmr); - langwell_otg_HABA(0); - free_irq(langwell->pdev->irq, langwell->regs); - langwell->hsm.a_suspend_req = 0; - langwell_otg_loc_sof(1); - langwell->otg.state = OTG_STATE_A_HOST; - } else if (langwell->hsm.a_aidl_bdis_tmout || - langwell->hsm.a_bus_drop) { - langwell_otg_del_timer(a_aidl_bdis_tmr); - langwell_otg_HABA(0); - free_irq(langwell->pdev->irq, langwell->regs); - if (langwell->host_ops) - langwell->host_ops->remove(langwell->pdev); - else - otg_dbg("host driver has been removed.\n"); - langwell_otg_drv_vbus(0); - langwell->otg.state = OTG_STATE_A_WAIT_VFALL; - } else if (!langwell->hsm.b_conn && - langwell->hsm.a_set_b_hnp_en) { - langwell_otg_del_timer(a_aidl_bdis_tmr); - langwell_otg_HABA(0); - free_irq(langwell->pdev->irq, langwell->regs); - - if (langwell->host_ops) - langwell->host_ops->remove(langwell->pdev); - else - otg_dbg("host driver has been removed.\n"); - - langwell->hsm.b_bus_suspend = 0; - langwell->hsm.b_bus_suspend_vld = 0; - langwell->hsm.b_bus_suspend_tmout = 0; - - /* msleep(200); */ - if (langwell->client_ops) - langwell->client_ops->resume(langwell->pdev); - else - otg_dbg("client driver not loaded.\n"); - - langwell_otg_add_timer(b_bus_suspend_tmr); - langwell->otg.state = OTG_STATE_A_PERIPHERAL; - break; - } else if (!langwell->hsm.a_vbus_vld) { - langwell_otg_del_timer(a_aidl_bdis_tmr); - langwell_otg_HABA(0); - free_irq(langwell->pdev->irq, langwell->regs); - if (langwell->host_ops) - langwell->host_ops->remove(langwell->pdev); - else - otg_dbg("host driver has been removed.\n"); - langwell_otg_drv_vbus(0); - langwell->otg.state = OTG_STATE_A_VBUS_ERR; - } - break; - case OTG_STATE_A_PERIPHERAL: - if (langwell->hsm.id) { - langwell_otg_del_timer(b_bus_suspend_tmr); - langwell->otg.default_a = 0; - langwell->hsm.b_bus_req = 0; - if (langwell->client_ops) - langwell->client_ops->suspend(langwell->pdev, - PMSG_FREEZE); - else - otg_dbg("client driver has been removed.\n"); - langwell_otg_drv_vbus(0); - langwell->otg.state = OTG_STATE_B_IDLE; - queue_work(langwell->qwork, &langwell->work); - } else if (!langwell->hsm.a_vbus_vld) { - langwell_otg_del_timer(b_bus_suspend_tmr); - if (langwell->client_ops) - langwell->client_ops->suspend(langwell->pdev, - PMSG_FREEZE); - else - otg_dbg("client driver has been removed.\n"); - langwell_otg_drv_vbus(0); - langwell->otg.state = OTG_STATE_A_VBUS_ERR; - } else if (langwell->hsm.a_bus_drop) { - langwell_otg_del_timer(b_bus_suspend_tmr); - if (langwell->client_ops) - langwell->client_ops->suspend(langwell->pdev, - PMSG_FREEZE); - else - otg_dbg("client driver has been removed.\n"); - langwell_otg_drv_vbus(0); - langwell->otg.state = OTG_STATE_A_WAIT_VFALL; - } else if (langwell->hsm.b_bus_suspend) { - langwell_otg_del_timer(b_bus_suspend_tmr); - if (langwell->client_ops) - langwell->client_ops->suspend(langwell->pdev, - PMSG_FREEZE); - else - otg_dbg("client driver has been removed.\n"); - - if (langwell->host_ops) - langwell->host_ops->probe(langwell->pdev, - langwell->host_ops->id_table); - else - otg_dbg("host driver not loaded.\n"); - langwell->hsm.a_set_b_hnp_en = 0; - langwell->hsm.a_wait_bcon_tmout = 0; - langwell_otg_add_timer(a_wait_bcon_tmr); - langwell->otg.state = OTG_STATE_A_WAIT_BCON; - } else if (langwell->hsm.b_bus_suspend_tmout) { - u32 val; - val = readl(langwell->regs + CI_PORTSC1); - if (!(val & PORTSC_SUSP)) - break; - if (langwell->client_ops) - langwell->client_ops->suspend(langwell->pdev, - PMSG_FREEZE); - else - otg_dbg("client driver has been removed.\n"); - if (langwell->host_ops) - langwell->host_ops->probe(langwell->pdev, - langwell->host_ops->id_table); - else - otg_dbg("host driver not loaded.\n"); - langwell->hsm.a_set_b_hnp_en = 0; - langwell->hsm.a_wait_bcon_tmout = 0; - langwell_otg_add_timer(a_wait_bcon_tmr); - langwell->otg.state = OTG_STATE_A_WAIT_BCON; - } - break; - case OTG_STATE_A_VBUS_ERR: - if (langwell->hsm.id) { - langwell->otg.default_a = 0; - langwell->hsm.a_clr_err = 0; - langwell->hsm.a_srp_det = 0; - langwell->otg.state = OTG_STATE_B_IDLE; - queue_work(langwell->qwork, &langwell->work); - } else if (langwell->hsm.a_clr_err) { - langwell->hsm.a_clr_err = 0; - langwell->hsm.a_srp_det = 0; - reset_otg(); - init_hsm(); - if (langwell->otg.state == OTG_STATE_A_IDLE) - queue_work(langwell->qwork, &langwell->work); - } - break; - case OTG_STATE_A_WAIT_VFALL: - if (langwell->hsm.id) { - langwell->otg.default_a = 0; - langwell->otg.state = OTG_STATE_B_IDLE; - queue_work(langwell->qwork, &langwell->work); - } else if (langwell->hsm.a_bus_req) { - langwell_otg_drv_vbus(1); - langwell->hsm.a_wait_vrise_tmout = 0; - langwell_otg_add_timer(a_wait_vrise_tmr); - langwell->otg.state = OTG_STATE_A_WAIT_VRISE; - } else if (!langwell->hsm.a_sess_vld) { - langwell->hsm.a_srp_det = 0; - langwell_otg_drv_vbus(0); - set_host_mode(); - langwell->otg.state = OTG_STATE_A_IDLE; - } - break; - default: - ; - } - - otg_dbg("%s: new state = %s\n", __func__, - state_string(langwell->otg.state)); -} - - static ssize_t -show_registers(struct device *_dev, struct device_attribute *attr, char *buf) -{ - struct langwell_otg *langwell; - char *next; - unsigned size; - unsigned t; - - langwell = the_transceiver; - next = buf; - size = PAGE_SIZE; - - t = scnprintf(next, size, - "\n" - "USBCMD = 0x%08x \n" - "USBSTS = 0x%08x \n" - "USBINTR = 0x%08x \n" - "ASYNCLISTADDR = 0x%08x \n" - "PORTSC1 = 0x%08x \n" - "HOSTPC1 = 0x%08x \n" - "OTGSC = 0x%08x \n" - "USBMODE = 0x%08x \n", - readl(langwell->regs + 0x30), - readl(langwell->regs + 0x34), - readl(langwell->regs + 0x38), - readl(langwell->regs + 0x48), - readl(langwell->regs + 0x74), - readl(langwell->regs + 0xb4), - readl(langwell->regs + 0xf4), - readl(langwell->regs + 0xf8) - ); - size -= t; - next += t; - - return PAGE_SIZE - size; -} -static DEVICE_ATTR(registers, S_IRUGO, show_registers, NULL); - -static ssize_t -show_hsm(struct device *_dev, struct device_attribute *attr, char *buf) -{ - struct langwell_otg *langwell; - char *next; - unsigned size; - unsigned t; - - langwell = the_transceiver; - next = buf; - size = PAGE_SIZE; - - t = scnprintf(next, size, - "\n" - "current state = %s\n" - "a_bus_resume = \t%d\n" - "a_bus_suspend = \t%d\n" - "a_conn = \t%d\n" - "a_sess_vld = \t%d\n" - "a_srp_det = \t%d\n" - "a_vbus_vld = \t%d\n" - "b_bus_resume = \t%d\n" - "b_bus_suspend = \t%d\n" - "b_conn = \t%d\n" - "b_se0_srp = \t%d\n" - "b_sess_end = \t%d\n" - "b_sess_vld = \t%d\n" - "id = \t%d\n" - "a_set_b_hnp_en = \t%d\n" - "b_srp_done = \t%d\n" - "b_hnp_enable = \t%d\n" - "a_wait_vrise_tmout = \t%d\n" - "a_wait_bcon_tmout = \t%d\n" - "a_aidl_bdis_tmout = \t%d\n" - "b_ase0_brst_tmout = \t%d\n" - "a_bus_drop = \t%d\n" - "a_bus_req = \t%d\n" - "a_clr_err = \t%d\n" - "a_suspend_req = \t%d\n" - "b_bus_req = \t%d\n" - "b_bus_suspend_tmout = \t%d\n" - "b_bus_suspend_vld = \t%d\n", - state_string(langwell->otg.state), - langwell->hsm.a_bus_resume, - langwell->hsm.a_bus_suspend, - langwell->hsm.a_conn, - langwell->hsm.a_sess_vld, - langwell->hsm.a_srp_det, - langwell->hsm.a_vbus_vld, - langwell->hsm.b_bus_resume, - langwell->hsm.b_bus_suspend, - langwell->hsm.b_conn, - langwell->hsm.b_se0_srp, - langwell->hsm.b_sess_end, - langwell->hsm.b_sess_vld, - langwell->hsm.id, - langwell->hsm.a_set_b_hnp_en, - langwell->hsm.b_srp_done, - langwell->hsm.b_hnp_enable, - langwell->hsm.a_wait_vrise_tmout, - langwell->hsm.a_wait_bcon_tmout, - langwell->hsm.a_aidl_bdis_tmout, - langwell->hsm.b_ase0_brst_tmout, - langwell->hsm.a_bus_drop, - langwell->hsm.a_bus_req, - langwell->hsm.a_clr_err, - langwell->hsm.a_suspend_req, - langwell->hsm.b_bus_req, - langwell->hsm.b_bus_suspend_tmout, - langwell->hsm.b_bus_suspend_vld - ); - size -= t; - next += t; - - return PAGE_SIZE - size; -} -static DEVICE_ATTR(hsm, S_IRUGO, show_hsm, NULL); - -static ssize_t -get_a_bus_req(struct device *dev, struct device_attribute *attr, char *buf) -{ - struct langwell_otg *langwell; - char *next; - unsigned size; - unsigned t; - - langwell = the_transceiver; - next = buf; - size = PAGE_SIZE; - - t = scnprintf(next, size, "%d", langwell->hsm.a_bus_req); - size -= t; - next += t; - - return PAGE_SIZE - size; -} - -static ssize_t -set_a_bus_req(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct langwell_otg *langwell; - langwell = the_transceiver; - if (!langwell->otg.default_a) - return -1; - if (count > 2) - return -1; - - if (buf[0] == '0') { - langwell->hsm.a_bus_req = 0; - otg_dbg("a_bus_req = 0\n"); - } else if (buf[0] == '1') { - /* If a_bus_drop is TRUE, a_bus_req can't be set */ - if (langwell->hsm.a_bus_drop) - return -1; - langwell->hsm.a_bus_req = 1; - otg_dbg("a_bus_req = 1\n"); - } - if (spin_trylock(&langwell->wq_lock)) { - queue_work(langwell->qwork, &langwell->work); - spin_unlock(&langwell->wq_lock); - } - return count; -} -static DEVICE_ATTR(a_bus_req, S_IRUGO | S_IWUGO, get_a_bus_req, set_a_bus_req); - -static ssize_t -get_a_bus_drop(struct device *dev, struct device_attribute *attr, char *buf) -{ - struct langwell_otg *langwell; - char *next; - unsigned size; - unsigned t; - - langwell = the_transceiver; - next = buf; - size = PAGE_SIZE; - - t = scnprintf(next, size, "%d", langwell->hsm.a_bus_drop); - size -= t; - next += t; - - return PAGE_SIZE - size; -} - -static ssize_t -set_a_bus_drop(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct langwell_otg *langwell; - langwell = the_transceiver; - if (!langwell->otg.default_a) - return -1; - if (count > 2) - return -1; - - if (buf[0] == '0') { - langwell->hsm.a_bus_drop = 0; - otg_dbg("a_bus_drop = 0\n"); - } else if (buf[0] == '1') { - langwell->hsm.a_bus_drop = 1; - langwell->hsm.a_bus_req = 0; - otg_dbg("a_bus_drop = 1, then a_bus_req = 0\n"); - } - if (spin_trylock(&langwell->wq_lock)) { - queue_work(langwell->qwork, &langwell->work); - spin_unlock(&langwell->wq_lock); - } - return count; -} -static DEVICE_ATTR(a_bus_drop, S_IRUGO | S_IWUGO, - get_a_bus_drop, set_a_bus_drop); - -static ssize_t -get_b_bus_req(struct device *dev, struct device_attribute *attr, char *buf) -{ - struct langwell_otg *langwell; - char *next; - unsigned size; - unsigned t; - - langwell = the_transceiver; - next = buf; - size = PAGE_SIZE; - - t = scnprintf(next, size, "%d", langwell->hsm.b_bus_req); - size -= t; - next += t; - - return PAGE_SIZE - size; -} - -static ssize_t -set_b_bus_req(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct langwell_otg *langwell; - langwell = the_transceiver; - - if (langwell->otg.default_a) - return -1; - - if (count > 2) - return -1; - - if (buf[0] == '0') { - langwell->hsm.b_bus_req = 0; - otg_dbg("b_bus_req = 0\n"); - } else if (buf[0] == '1') { - langwell->hsm.b_bus_req = 1; - otg_dbg("b_bus_req = 1\n"); - } - if (spin_trylock(&langwell->wq_lock)) { - queue_work(langwell->qwork, &langwell->work); - spin_unlock(&langwell->wq_lock); - } - return count; -} -static DEVICE_ATTR(b_bus_req, S_IRUGO | S_IWUGO, get_b_bus_req, set_b_bus_req); - -static ssize_t -set_a_clr_err(struct device *dev, struct device_attribute *attr, - const char *buf, size_t count) -{ - struct langwell_otg *langwell; - langwell = the_transceiver; - - if (!langwell->otg.default_a) - return -1; - if (count > 2) - return -1; - - if (buf[0] == '1') { - langwell->hsm.a_clr_err = 1; - otg_dbg("a_clr_err = 1\n"); - } - if (spin_trylock(&langwell->wq_lock)) { - queue_work(langwell->qwork, &langwell->work); - spin_unlock(&langwell->wq_lock); - } - return count; -} -static DEVICE_ATTR(a_clr_err, S_IWUGO, NULL, set_a_clr_err); - -static struct attribute *inputs_attrs[] = { - &dev_attr_a_bus_req.attr, - &dev_attr_a_bus_drop.attr, - &dev_attr_b_bus_req.attr, - &dev_attr_a_clr_err.attr, - NULL, -}; - -static struct attribute_group debug_dev_attr_group = { - .name = "inputs", - .attrs = inputs_attrs, -}; - -int langwell_register_host(struct pci_driver *host_driver) -{ - int ret = 0; - - the_transceiver->host_ops = host_driver; - queue_work(the_transceiver->qwork, &the_transceiver->work); - otg_dbg("host controller driver is registered\n"); - - return ret; -} -EXPORT_SYMBOL(langwell_register_host); - -void langwell_unregister_host(struct pci_driver *host_driver) -{ - if (the_transceiver->host_ops) - the_transceiver->host_ops->remove(the_transceiver->pdev); - the_transceiver->host_ops = NULL; - the_transceiver->hsm.a_bus_drop = 1; - queue_work(the_transceiver->qwork, &the_transceiver->work); - otg_dbg("host controller driver is unregistered\n"); -} -EXPORT_SYMBOL(langwell_unregister_host); - -int langwell_register_peripheral(struct pci_driver *client_driver) -{ - int ret = 0; - - if (client_driver) - ret = client_driver->probe(the_transceiver->pdev, - client_driver->id_table); - if (!ret) { - the_transceiver->client_ops = client_driver; - queue_work(the_transceiver->qwork, &the_transceiver->work); - otg_dbg("client controller driver is registered\n"); - } - - return ret; -} -EXPORT_SYMBOL(langwell_register_peripheral); - -void langwell_unregister_peripheral(struct pci_driver *client_driver) -{ - if (the_transceiver->client_ops) - the_transceiver->client_ops->remove(the_transceiver->pdev); - the_transceiver->client_ops = NULL; - the_transceiver->hsm.b_bus_req = 0; - queue_work(the_transceiver->qwork, &the_transceiver->work); - otg_dbg("client controller driver is unregistered\n"); -} -EXPORT_SYMBOL(langwell_unregister_peripheral); - -static int langwell_otg_probe(struct pci_dev *pdev, - const struct pci_device_id *id) -{ - unsigned long resource, len; - void __iomem *base = NULL; - int retval; - u32 val32; - struct langwell_otg *langwell; - char qname[] = "langwell_otg_queue"; - - retval = 0; - otg_dbg("\notg controller is detected.\n"); - if (pci_enable_device(pdev) < 0) { - retval = -ENODEV; - goto done; - } - - langwell = kzalloc(sizeof *langwell, GFP_KERNEL); - if (langwell == NULL) { - retval = -ENOMEM; - goto done; - } - the_transceiver = langwell; - - /* control register: BAR 0 */ - resource = pci_resource_start(pdev, 0); - len = pci_resource_len(pdev, 0); - if (!request_mem_region(resource, len, driver_name)) { - retval = -EBUSY; - goto err; - } - langwell->region = 1; - - base = ioremap_nocache(resource, len); - if (base == NULL) { - retval = -EFAULT; - goto err; - } - langwell->regs = base; - - if (!pdev->irq) { - otg_dbg("No IRQ.\n"); - retval = -ENODEV; - goto err; - } - - langwell->qwork = create_workqueue(qname); - if (!langwell->qwork) { - otg_dbg("cannot create workqueue %s\n", qname); - retval = -ENOMEM; - goto err; - } - INIT_WORK(&langwell->work, langwell_otg_work); - - /* OTG common part */ - langwell->pdev = pdev; - langwell->otg.dev = &pdev->dev; - langwell->otg.label = driver_name; - langwell->otg.set_host = langwell_otg_set_host; - langwell->otg.set_peripheral = langwell_otg_set_peripheral; - langwell->otg.set_power = langwell_otg_set_power; - langwell->otg.start_srp = langwell_otg_start_srp; - langwell->otg.state = OTG_STATE_UNDEFINED; - if (otg_set_transceiver(&langwell->otg)) { - otg_dbg("can't set transceiver\n"); - retval = -EBUSY; - goto err; - } - - reset_otg(); - init_hsm(); - - spin_lock_init(&langwell->lock); - spin_lock_init(&langwell->wq_lock); - INIT_LIST_HEAD(&active_timers); - langwell_otg_init_timers(&langwell->hsm); - - if (request_irq(pdev->irq, otg_irq, IRQF_SHARED, - driver_name, langwell) != 0) { - otg_dbg("request interrupt %d failed\n", pdev->irq); - retval = -EBUSY; - goto err; - } - - /* enable OTGSC int */ - val32 = OTGSC_DPIE | OTGSC_BSEIE | OTGSC_BSVIE | - OTGSC_ASVIE | OTGSC_AVVIE | OTGSC_IDIE | OTGSC_IDPU; - writel(val32, langwell->regs + CI_OTGSC); - - retval = device_create_file(&pdev->dev, &dev_attr_registers); - if (retval < 0) { - otg_dbg("Can't register sysfs attribute: %d\n", retval); - goto err; - } - - retval = device_create_file(&pdev->dev, &dev_attr_hsm); - if (retval < 0) { - otg_dbg("Can't hsm sysfs attribute: %d\n", retval); - goto err; - } - - retval = sysfs_create_group(&pdev->dev.kobj, &debug_dev_attr_group); - if (retval < 0) { - otg_dbg("Can't register sysfs attr group: %d\n", retval); - goto err; - } - - if (langwell->otg.state == OTG_STATE_A_IDLE) - queue_work(langwell->qwork, &langwell->work); - - return 0; - -err: - if (the_transceiver) - langwell_otg_remove(pdev); -done: - return retval; -} - -static void langwell_otg_remove(struct pci_dev *pdev) -{ - struct langwell_otg *langwell; - - langwell = the_transceiver; - - if (langwell->qwork) { - flush_workqueue(langwell->qwork); - destroy_workqueue(langwell->qwork); - } - langwell_otg_free_timers(); - - /* disable OTGSC interrupt as OTGSC doesn't change in reset */ - writel(0, langwell->regs + CI_OTGSC); - - if (pdev->irq) - free_irq(pdev->irq, langwell); - if (langwell->regs) - iounmap(langwell->regs); - if (langwell->region) - release_mem_region(pci_resource_start(pdev, 0), - pci_resource_len(pdev, 0)); - - otg_set_transceiver(NULL); - pci_disable_device(pdev); - sysfs_remove_group(&pdev->dev.kobj, &debug_dev_attr_group); - device_remove_file(&pdev->dev, &dev_attr_hsm); - device_remove_file(&pdev->dev, &dev_attr_registers); - kfree(langwell); - langwell = NULL; -} - -static void transceiver_suspend(struct pci_dev *pdev) -{ - pci_save_state(pdev); - pci_set_power_state(pdev, PCI_D3hot); - langwell_otg_phy_low_power(1); -} - -static int langwell_otg_suspend(struct pci_dev *pdev, pm_message_t message) -{ - int ret = 0; - struct langwell_otg *langwell; - - langwell = the_transceiver; - - /* Disbale OTG interrupts */ - langwell_otg_intr(0); - - if (pdev->irq) - free_irq(pdev->irq, langwell); - - /* Prevent more otg_work */ - flush_workqueue(langwell->qwork); - spin_lock(&langwell->wq_lock); - - /* start actions */ - switch (langwell->otg.state) { - case OTG_STATE_A_IDLE: - case OTG_STATE_B_IDLE: - case OTG_STATE_A_WAIT_VFALL: - case OTG_STATE_A_VBUS_ERR: - transceiver_suspend(pdev); - break; - case OTG_STATE_A_WAIT_VRISE: - langwell_otg_del_timer(a_wait_vrise_tmr); - langwell->hsm.a_srp_det = 0; - langwell_otg_drv_vbus(0); - langwell->otg.state = OTG_STATE_A_IDLE; - transceiver_suspend(pdev); - break; - case OTG_STATE_A_WAIT_BCON: - langwell_otg_del_timer(a_wait_bcon_tmr); - if (langwell->host_ops) - ret = langwell->host_ops->suspend(pdev, message); - langwell_otg_drv_vbus(0); - break; - case OTG_STATE_A_HOST: - if (langwell->host_ops) - ret = langwell->host_ops->suspend(pdev, message); - langwell_otg_drv_vbus(0); - langwell_otg_phy_low_power(1); - break; - case OTG_STATE_A_SUSPEND: - langwell_otg_del_timer(a_aidl_bdis_tmr); - langwell_otg_HABA(0); - if (langwell->host_ops) - langwell->host_ops->remove(pdev); - else - otg_dbg("host driver has been removed.\n"); - langwell_otg_drv_vbus(0); - transceiver_suspend(pdev); - langwell->otg.state = OTG_STATE_A_WAIT_VFALL; - break; - case OTG_STATE_A_PERIPHERAL: - if (langwell->client_ops) - ret = langwell->client_ops->suspend(pdev, message); - else - otg_dbg("client driver has been removed.\n"); - langwell_otg_drv_vbus(0); - transceiver_suspend(pdev); - langwell->otg.state = OTG_STATE_A_WAIT_VFALL; - break; - case OTG_STATE_B_HOST: - if (langwell->host_ops) - langwell->host_ops->remove(pdev); - else - otg_dbg("host driver has been removed.\n"); - langwell->hsm.b_bus_req = 0; - transceiver_suspend(pdev); - langwell->otg.state = OTG_STATE_B_IDLE; - break; - case OTG_STATE_B_PERIPHERAL: - if (langwell->client_ops) - ret = langwell->client_ops->suspend(pdev, message); - else - otg_dbg("client driver has been removed.\n"); - break; - case OTG_STATE_B_WAIT_ACON: - langwell_otg_del_timer(b_ase0_brst_tmr); - langwell_otg_HAAR(0); - if (langwell->host_ops) - langwell->host_ops->remove(pdev); - else - otg_dbg("host driver has been removed.\n"); - langwell->hsm.b_bus_req = 0; - langwell->otg.state = OTG_STATE_B_IDLE; - transceiver_suspend(pdev); - break; - default: - otg_dbg("error state before suspend\n "); - break; - } - spin_unlock(&langwell->wq_lock); - - return ret; -} - -static void transceiver_resume(struct pci_dev *pdev) -{ - pci_restore_state(pdev); - pci_set_power_state(pdev, PCI_D0); - langwell_otg_phy_low_power(0); -} - -static int langwell_otg_resume(struct pci_dev *pdev) -{ - int ret = 0; - struct langwell_otg *langwell; - - langwell = the_transceiver; - - spin_lock(&langwell->wq_lock); - - switch (langwell->otg.state) { - case OTG_STATE_A_IDLE: - case OTG_STATE_B_IDLE: - case OTG_STATE_A_WAIT_VFALL: - case OTG_STATE_A_VBUS_ERR: - transceiver_resume(pdev); - break; - case OTG_STATE_A_WAIT_BCON: - langwell_otg_add_timer(a_wait_bcon_tmr); - langwell_otg_drv_vbus(1); - if (langwell->host_ops) - ret = langwell->host_ops->resume(pdev); - break; - case OTG_STATE_A_HOST: - langwell_otg_drv_vbus(1); - langwell_otg_phy_low_power(0); - if (langwell->host_ops) - ret = langwell->host_ops->resume(pdev); - break; - case OTG_STATE_B_PERIPHERAL: - if (langwell->client_ops) - ret = langwell->client_ops->resume(pdev); - else - otg_dbg("client driver not loaded.\n"); - break; - default: - otg_dbg("error state before suspend\n "); - break; - } - - if (request_irq(pdev->irq, otg_irq, IRQF_SHARED, - driver_name, the_transceiver) != 0) { - otg_dbg("request interrupt %d failed\n", pdev->irq); - ret = -EBUSY; - } - - /* enable OTG interrupts */ - langwell_otg_intr(1); - - spin_unlock(&langwell->wq_lock); - - queue_work(langwell->qwork, &langwell->work); - - - return ret; -} - -static int __init langwell_otg_init(void) -{ - return pci_register_driver(&otg_pci_driver); -} -module_init(langwell_otg_init); - -static void __exit langwell_otg_cleanup(void) -{ - pci_unregister_driver(&otg_pci_driver); -} -module_exit(langwell_otg_cleanup); diff --git a/include/linux/usb/langwell_otg.h b/include/linux/usb/langwell_otg.h deleted file mode 100644 index e115ae6df1da..000000000000 --- a/include/linux/usb/langwell_otg.h +++ /dev/null @@ -1,177 +0,0 @@ -/* - * Intel Langwell USB OTG transceiver driver - * Copyright (C) 2008, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. - * - */ - -#ifndef __LANGWELL_OTG_H__ -#define __LANGWELL_OTG_H__ - -/* notify transceiver driver about OTG events */ -extern void langwell_update_transceiver(void); -/* HCD register bus driver */ -extern int langwell_register_host(struct pci_driver *host_driver); -/* HCD unregister bus driver */ -extern void langwell_unregister_host(struct pci_driver *host_driver); -/* DCD register bus driver */ -extern int langwell_register_peripheral(struct pci_driver *client_driver); -/* DCD unregister bus driver */ -extern void langwell_unregister_peripheral(struct pci_driver *client_driver); -/* No silent failure, output warning message */ -extern void langwell_otg_nsf_msg(unsigned long message); - -#define CI_USBCMD 0x30 -# define USBCMD_RST BIT(1) -# define USBCMD_RS BIT(0) -#define CI_USBSTS 0x34 -# define USBSTS_SLI BIT(8) -# define USBSTS_URI BIT(6) -# define USBSTS_PCI BIT(2) -#define CI_PORTSC1 0x74 -# define PORTSC_PP BIT(12) -# define PORTSC_LS (BIT(11) | BIT(10)) -# define PORTSC_SUSP BIT(7) -# define PORTSC_CCS BIT(0) -#define CI_HOSTPC1 0xb4 -# define HOSTPC1_PHCD BIT(22) -#define CI_OTGSC 0xf4 -# define OTGSC_DPIE BIT(30) -# define OTGSC_1MSE BIT(29) -# define OTGSC_BSEIE BIT(28) -# define OTGSC_BSVIE BIT(27) -# define OTGSC_ASVIE BIT(26) -# define OTGSC_AVVIE BIT(25) -# define OTGSC_IDIE BIT(24) -# define OTGSC_DPIS BIT(22) -# define OTGSC_1MSS BIT(21) -# define OTGSC_BSEIS BIT(20) -# define OTGSC_BSVIS BIT(19) -# define OTGSC_ASVIS BIT(18) -# define OTGSC_AVVIS BIT(17) -# define OTGSC_IDIS BIT(16) -# define OTGSC_DPS BIT(14) -# define OTGSC_1MST BIT(13) -# define OTGSC_BSE BIT(12) -# define OTGSC_BSV BIT(11) -# define OTGSC_ASV BIT(10) -# define OTGSC_AVV BIT(9) -# define OTGSC_ID BIT(8) -# define OTGSC_HABA BIT(7) -# define OTGSC_HADP BIT(6) -# define OTGSC_IDPU BIT(5) -# define OTGSC_DP BIT(4) -# define OTGSC_OT BIT(3) -# define OTGSC_HAAR BIT(2) -# define OTGSC_VC BIT(1) -# define OTGSC_VD BIT(0) -# define OTGSC_INTEN_MASK (0x7f << 24) -# define OTGSC_INTSTS_MASK (0x7f << 16) -#define CI_USBMODE 0xf8 -# define USBMODE_CM (BIT(1) | BIT(0)) -# define USBMODE_IDLE 0 -# define USBMODE_DEVICE 0x2 -# define USBMODE_HOST 0x3 - -#define INTR_DUMMY_MASK (USBSTS_SLI | USBSTS_URI | USBSTS_PCI) - -struct otg_hsm { - /* Input */ - int a_bus_resume; - int a_bus_suspend; - int a_conn; - int a_sess_vld; - int a_srp_det; - int a_vbus_vld; - int b_bus_resume; - int b_bus_suspend; - int b_conn; - int b_se0_srp; - int b_sess_end; - int b_sess_vld; - int id; - - /* Internal variables */ - int a_set_b_hnp_en; - int b_srp_done; - int b_hnp_enable; - - /* Timeout indicator for timers */ - int a_wait_vrise_tmout; - int a_wait_bcon_tmout; - int a_aidl_bdis_tmout; - int b_ase0_brst_tmout; - int b_bus_suspend_tmout; - int b_srp_res_tmout; - - /* Informative variables */ - int a_bus_drop; - int a_bus_req; - int a_clr_err; - int a_suspend_req; - int b_bus_req; - - /* Output */ - int drv_vbus; - int loc_conn; - int loc_sof; - - /* Others */ - int b_bus_suspend_vld; -}; - -#define TA_WAIT_VRISE 100 -#define TA_WAIT_BCON 30000 -#define TA_AIDL_BDIS 15000 -#define TB_ASE0_BRST 5000 -#define TB_SE0_SRP 2 -#define TB_SRP_RES 100 -#define TB_BUS_SUSPEND 500 - -struct langwell_otg_timer { - unsigned long expires; /* Number of count increase to timeout */ - unsigned long count; /* Tick counter */ - void (*function)(unsigned long); /* Timeout function */ - unsigned long data; /* Data passed to function */ - struct list_head list; -}; - -struct langwell_otg { - struct otg_transceiver otg; - struct otg_hsm hsm; - void __iomem *regs; - unsigned region; - struct pci_driver *host_ops; - struct pci_driver *client_ops; - struct pci_dev *pdev; - struct work_struct work; - struct workqueue_struct *qwork; - spinlock_t lock; - spinlock_t wq_lock; -}; - -static inline struct langwell_otg *otg_to_langwell(struct otg_transceiver *otg) -{ - return container_of(otg, struct langwell_otg, otg); -} - -#ifdef DEBUG -#define otg_dbg(fmt, args...) \ - printk(KERN_DEBUG fmt , ## args) -#else -#define otg_dbg(fmt, args...) \ - do { } while (0) -#endif /* DEBUG */ -#endif /* __LANGWELL_OTG_H__ */ -- cgit v1.2.3-59-g8ed1b From e376bbbb6a82cf119c93bde66937f66c72cba27b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 18 Jun 2009 10:39:11 -0700 Subject: USB: usb.h: fix kernel-doc notation Fix usb.h kernel-doc warnings: Warning(include/linux/usb.h:918): Excess struct/union/enum/typedef member 'nodename' description in 'usb_device_driver' Warning(include/linux/usb.h:939): No description found for parameter 'nodename' Warning(include/linux/usb.h:1219): No description found for parameter 'sg' Warning(include/linux/usb.h:1219): No description found for parameter 'num_sgs' Signed-off-by: Randy Dunlap Signed-off-by: Greg Kroah-Hartman --- include/linux/usb.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/usb.h b/include/linux/usb.h index 84929e914034..b1e3c2fbfe11 100644 --- a/include/linux/usb.h +++ b/include/linux/usb.h @@ -888,8 +888,6 @@ struct usb_driver { * struct usb_device_driver - identifies USB device driver to usbcore * @name: The driver name should be unique among USB drivers, * and should normally be the same as the module name. - * @nodename: Callback to provide a naming hint for a possible - * device node to create. * @probe: Called to see if the driver is willing to manage a particular * device. If it is, probe returns zero and uses dev_set_drvdata() * to associate driver-specific data with the device. If unwilling @@ -924,6 +922,8 @@ extern struct bus_type usb_bus_type; /** * struct usb_class_driver - identifies a USB driver that wants to use the USB major number * @name: the usb class device name for this driver. Will show up in sysfs. + * @nodename: Callback to provide a naming hint for a possible + * device node to create. * @fops: pointer to the struct file_operations of this driver. * @minor_base: the start of the minor range for this driver. * @@ -1046,6 +1046,8 @@ typedef void (*usb_complete_t)(struct urb *); * the device driver is saying that it provided this DMA address, * which the host controller driver should use in preference to the * transfer_buffer. + * @sg: scatter gather buffer list + * @num_sgs: number of entries in the sg list * @transfer_buffer_length: How big is transfer_buffer. The transfer may * be broken up into chunks according to the current maximum packet * size for the endpoint, which is a function of the configuration -- cgit v1.2.3-59-g8ed1b From d0b6e04a4cd8360e3c9c419f7c30a3081a0c142a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 13 Jul 2009 10:33:21 +0800 Subject: tracing/events: Move TRACE_SYSTEM outside of include guard If TRACE_INCLDUE_FILE is defined, will be included and compiled, otherwise it will be So TRACE_SYSTEM should be defined outside of #if proctection, just like TRACE_INCLUDE_FILE. Imaging this scenario: #include -> TRACE_SYSTEM == foo ... #include -> TRACE_SYSTEM == bar ... #define CREATE_TRACE_POINTS #include -> TRACE_SYSTEM == bar !!! and then bar.h will be included and compiled. Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A5A9CF1.2010007@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- fs/gfs2/trace_gfs2.h | 8 +++---- include/trace/events/block.h | 6 ++--- include/trace/events/ext4.h | 6 ++--- include/trace/events/irq.h | 6 ++--- include/trace/events/jbd2.h | 6 ++--- include/trace/events/kmem.h | 6 ++--- include/trace/events/lockdep.h | 6 ++--- include/trace/events/sched.h | 6 ++--- include/trace/events/skb.h | 6 ++--- include/trace/events/workqueue.h | 6 ++--- samples/trace_events/trace-events-sample.h | 37 ++++++++++++++++-------------- 11 files changed, 51 insertions(+), 48 deletions(-) (limited to 'include') diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index 98d6ef1c1dc0..148d55c14171 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -1,12 +1,11 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM gfs2 + #if !defined(_TRACE_GFS2_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_GFS2_H #include -#undef TRACE_SYSTEM -#define TRACE_SYSTEM gfs2 -#define TRACE_INCLUDE_FILE trace_gfs2 - #include #include #include @@ -403,5 +402,6 @@ TRACE_EVENT(gfs2_block_alloc, /* This part must be outside protection */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . +#define TRACE_INCLUDE_FILE trace_gfs2 #include diff --git a/include/trace/events/block.h b/include/trace/events/block.h index d6b05f42dd44..9a74b468a229 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -1,3 +1,6 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM block + #if !defined(_TRACE_BLOCK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_BLOCK_H @@ -5,9 +8,6 @@ #include #include -#undef TRACE_SYSTEM -#define TRACE_SYSTEM block - TRACE_EVENT(block_rq_abort, TP_PROTO(struct request_queue *q, struct request *rq), diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index acf4cc9cd36d..8eb82cc8d149 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -1,9 +1,9 @@ -#if !defined(_TRACE_EXT4_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_EXT4_H - #undef TRACE_SYSTEM #define TRACE_SYSTEM ext4 +#if !defined(_TRACE_EXT4_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_EXT4_H + #include #include "../../../fs/ext4/ext4.h" #include "../../../fs/ext4/mballoc.h" diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h index b0c7ede55eb1..1cb0c3aa11e6 100644 --- a/include/trace/events/irq.h +++ b/include/trace/events/irq.h @@ -1,12 +1,12 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM irq + #if !defined(_TRACE_IRQ_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_IRQ_H #include #include -#undef TRACE_SYSTEM -#define TRACE_SYSTEM irq - #define softirq_name(sirq) { sirq##_SOFTIRQ, #sirq } #define show_softirq_name(val) \ __print_symbolic(val, \ diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h index 845b0b4b48fd..10813fa0c8d0 100644 --- a/include/trace/events/jbd2.h +++ b/include/trace/events/jbd2.h @@ -1,12 +1,12 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM jbd2 + #if !defined(_TRACE_JBD2_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_JBD2_H #include #include -#undef TRACE_SYSTEM -#define TRACE_SYSTEM jbd2 - TRACE_EVENT(jbd2_checkpoint, TP_PROTO(journal_t *journal, int result), diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 9baba50d6512..1493c541f9c4 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -1,12 +1,12 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kmem + #if !defined(_TRACE_KMEM_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_KMEM_H #include #include -#undef TRACE_SYSTEM -#define TRACE_SYSTEM kmem - /* * The order of these masks is important. Matching masks will be seen * first and the left over flags will end up showing by themselves. diff --git a/include/trace/events/lockdep.h b/include/trace/events/lockdep.h index 0e956c9dfd7e..bcf1d209a00d 100644 --- a/include/trace/events/lockdep.h +++ b/include/trace/events/lockdep.h @@ -1,12 +1,12 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM lockdep + #if !defined(_TRACE_LOCKDEP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_LOCKDEP_H #include #include -#undef TRACE_SYSTEM -#define TRACE_SYSTEM lockdep - #ifdef CONFIG_LOCKDEP TRACE_EVENT(lock_acquire, diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index 24ab5bcff7b2..8949bb7eb082 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -1,12 +1,12 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM sched + #if !defined(_TRACE_SCHED_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SCHED_H #include #include -#undef TRACE_SYSTEM -#define TRACE_SYSTEM sched - /* * Tracepoint for calling kthread_stop, performed to end a kthread: */ diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h index 1e8fabb57c06..e499863b9669 100644 --- a/include/trace/events/skb.h +++ b/include/trace/events/skb.h @@ -1,12 +1,12 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM skb + #if !defined(_TRACE_SKB_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SKB_H #include #include -#undef TRACE_SYSTEM -#define TRACE_SYSTEM skb - /* * Tracepoint for free an sk_buff: */ diff --git a/include/trace/events/workqueue.h b/include/trace/events/workqueue.h index 035f1bff288e..fcfd9a1e4b96 100644 --- a/include/trace/events/workqueue.h +++ b/include/trace/events/workqueue.h @@ -1,3 +1,6 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM workqueue + #if !defined(_TRACE_WORKQUEUE_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_WORKQUEUE_H @@ -5,9 +8,6 @@ #include #include -#undef TRACE_SYSTEM -#define TRACE_SYSTEM workqueue - TRACE_EVENT(workqueue_insertion, TP_PROTO(struct task_struct *wq_thread, struct work_struct *work), diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h index 9977a756fb32..f24ae370e514 100644 --- a/samples/trace_events/trace-events-sample.h +++ b/samples/trace_events/trace-events-sample.h @@ -1,20 +1,3 @@ -/* - * Notice that this file is not protected like a normal header. - * We also must allow for rereading of this file. The - * - * || defined(TRACE_HEADER_MULTI_READ) - * - * serves this purpose. - */ -#if !defined(_TRACE_EVENT_SAMPLE_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_EVENT_SAMPLE_H - -/* - * All trace headers should include tracepoint.h, until we finally - * make it into a standard header. - */ -#include - /* * If TRACE_SYSTEM is defined, that will be the directory created * in the ftrace directory under /debugfs/tracing/events/ @@ -34,10 +17,30 @@ * #define TRACE_INCLUDE_FILE trace-events-sample * * As we do an the bottom of this file. + * + * Notice that TRACE_SYSTEM should be defined outside of #if + * protection, just like TRACE_INCLUDE_FILE. */ #undef TRACE_SYSTEM #define TRACE_SYSTEM sample +/* + * Notice that this file is not protected like a normal header. + * We also must allow for rereading of this file. The + * + * || defined(TRACE_HEADER_MULTI_READ) + * + * serves this purpose. + */ +#if !defined(_TRACE_EVENT_SAMPLE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_EVENT_SAMPLE_H + +/* + * All trace headers should include tracepoint.h, until we finally + * make it into a standard header. + */ +#include + /* * The TRACE_EVENT macro is broken up into 5 parts. * -- cgit v1.2.3-59-g8ed1b From 8660c1240ec6016522b882c88751cb4ce40bf0e8 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Mon, 13 Jul 2009 22:48:16 +0000 Subject: skbuff.h: Fix comment for NET_IP_ALIGN Use the correct function call for skb_reserve in the comment for NET_IP_ALIGN. Signed-off-by: Tobias Klauser Signed-off-by: David S. Miller --- include/linux/skbuff.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b47b3f039d14..f2c69a2cca17 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1342,12 +1342,12 @@ static inline int skb_network_offset(const struct sk_buff *skb) * shifting the start of the packet by 2 bytes. Drivers should do this * with: * - * skb_reserve(NET_IP_ALIGN); + * skb_reserve(skb, NET_IP_ALIGN); * * The downside to this alignment of the IP header is that the DMA is now * unaligned. On some architectures the cost of an unaligned DMA is high * and this cost outweighs the gains made by aligning the IP header. - * + * * Since this trade off varies between architectures, we allow NET_IP_ALIGN * to be overridden. */ -- cgit v1.2.3-59-g8ed1b From d0cb43b35d64877b2944bd37719708be5d7bbf99 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 9 Jul 2009 09:27:50 +0900 Subject: libata: implement and use HORKAGE_NOSETXFER, take#2 PIONEER DVD-RW DVRTD08 times out SETXFER if no media is present. The device is SATA and simply skipping SETXFER works around the problem. Implement ATA_HORKAGE_NOSETXFER and apply it to the device. Reported by Moritz Rigler in the following thread. http://thread.gmane.org/gmane.linux.ide/36790 and by Lars in bko#9540. Updated to whine and ignore NOSETXFER if PATA component is detected as suggested by Alan Cox. Signed-off-by: Tejun Heo Reported-by: Moritz Rigler Reported-by: Lars Cc: Alan Cox Signed-off-by: Jeff Garzik --- drivers/ata/libata-core.c | 20 ++++++++++++++++++-- include/linux/libata.h | 1 + 2 files changed, 19 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 045a486a09ea..2c6aedaef718 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -3392,17 +3392,27 @@ int ata_down_xfermask_limit(struct ata_device *dev, unsigned int sel) static int ata_dev_set_mode(struct ata_device *dev) { + struct ata_port *ap = dev->link->ap; struct ata_eh_context *ehc = &dev->link->eh_context; + const bool nosetxfer = dev->horkage & ATA_HORKAGE_NOSETXFER; const char *dev_err_whine = ""; int ign_dev_err = 0; - unsigned int err_mask; + unsigned int err_mask = 0; int rc; dev->flags &= ~ATA_DFLAG_PIO; if (dev->xfer_shift == ATA_SHIFT_PIO) dev->flags |= ATA_DFLAG_PIO; - err_mask = ata_dev_set_xfermode(dev); + if (nosetxfer && ap->flags & ATA_FLAG_SATA && ata_id_is_sata(dev->id)) + dev_err_whine = " (SET_XFERMODE skipped)"; + else { + if (nosetxfer) + ata_dev_printk(dev, KERN_WARNING, + "NOSETXFER but PATA detected - can't " + "skip SETXFER, might malfunction\n"); + err_mask = ata_dev_set_xfermode(dev); + } if (err_mask & ~AC_ERR_DEV) goto fail; @@ -4297,6 +4307,12 @@ static const struct ata_blacklist_entry ata_device_blacklist [] = { /* Devices which aren't very happy with higher link speeds */ { "WD My Book", NULL, ATA_HORKAGE_1_5_GBPS, }, + /* + * Devices which choke on SETXFER. Applies only if both the + * device and controller are SATA. + */ + { "PIONEER DVD-RW DVRTD08", "1.00", ATA_HORKAGE_NOSETXFER }, + /* End Marker */ { } }; diff --git a/include/linux/libata.h b/include/linux/libata.h index 3d501db36a26..79b6d7fd4ac2 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -385,6 +385,7 @@ enum { not multiple of 16 bytes */ ATA_HORKAGE_FIRMWARE_WARN = (1 << 12), /* firmware update warning */ ATA_HORKAGE_1_5_GBPS = (1 << 13), /* force 1.5 Gbps */ + ATA_HORKAGE_NOSETXFER = (1 << 14), /* skip SETXFER, SATA only */ /* DMA mask for user DMA control: User visible values; DO NOT renumber */ -- cgit v1.2.3-59-g8ed1b From 069a9dce384e211784ce6fdfaf1f13921327480d Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 1 Jul 2009 13:03:52 -0400 Subject: drm/radeon: add some missing pci ids Also, fix ordering for a couple others Signed-off-by: Alex Deucher Signed-off-by: Dave Airlie --- include/drm/drm_pciids.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/drm/drm_pciids.h b/include/drm/drm_pciids.h index 45c18672b093..7174818c2c13 100644 --- a/include/drm/drm_pciids.h +++ b/include/drm/drm_pciids.h @@ -43,6 +43,7 @@ {0x1002, 0x4A4F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_R420|RADEON_NEW_MEMMAP}, \ {0x1002, 0x4A50, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_R420|RADEON_NEW_MEMMAP}, \ {0x1002, 0x4A54, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_R420|RADEON_NEW_MEMMAP}, \ + {0x1002, 0x4B48, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_R420|RADEON_NEW_MEMMAP}, \ {0x1002, 0x4B49, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_R420|RADEON_NEW_MEMMAP}, \ {0x1002, 0x4B4A, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_R420|RADEON_NEW_MEMMAP}, \ {0x1002, 0x4B4B, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_R420|RADEON_NEW_MEMMAP}, \ @@ -262,6 +263,7 @@ {0x1002, 0x9440, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV770|RADEON_NEW_MEMMAP}, \ {0x1002, 0x9441, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV770|RADEON_NEW_MEMMAP}, \ {0x1002, 0x9442, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV770|RADEON_NEW_MEMMAP}, \ + {0x1002, 0x9443, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV770|RADEON_NEW_MEMMAP}, \ {0x1002, 0x9444, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV770|RADEON_NEW_MEMMAP}, \ {0x1002, 0x9446, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV770|RADEON_NEW_MEMMAP}, \ {0x1002, 0x944A, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV770|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ @@ -346,12 +348,12 @@ {0x1002, 0x9599, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV635|RADEON_NEW_MEMMAP}, \ {0x1002, 0x959B, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV635|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ {0x1002, 0x95C0, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV620|RADEON_NEW_MEMMAP}, \ + {0x1002, 0x95C2, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV620|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ + {0x1002, 0x95C4, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV620|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ {0x1002, 0x95C5, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV620|RADEON_NEW_MEMMAP}, \ {0x1002, 0x95C6, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV620|RADEON_NEW_MEMMAP}, \ {0x1002, 0x95C7, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV620|RADEON_NEW_MEMMAP}, \ {0x1002, 0x95C9, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV620|RADEON_NEW_MEMMAP}, \ - {0x1002, 0x95C2, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV620|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ - {0x1002, 0x95C4, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV620|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ {0x1002, 0x95CC, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV620|RADEON_NEW_MEMMAP}, \ {0x1002, 0x95CD, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV620|RADEON_NEW_MEMMAP}, \ {0x1002, 0x95CE, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV620|RADEON_NEW_MEMMAP}, \ -- cgit v1.2.3-59-g8ed1b From d1724078d6a01177c1db4ea0b75fda1ca8a73d57 Mon Sep 17 00:00:00 2001 From: Thomas Hellstrom Date: Wed, 24 Jun 2009 19:57:35 +0200 Subject: ttm: Make messages more readable. Signed-off-by: Thomas Hellstrom Signed-off-by: Dave Airlie --- include/drm/ttm/ttm_module.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/drm/ttm/ttm_module.h b/include/drm/ttm/ttm_module.h index 889a4c7958ae..d1d433834e4f 100644 --- a/include/drm/ttm/ttm_module.h +++ b/include/drm/ttm/ttm_module.h @@ -33,7 +33,7 @@ #include -#define TTM_PFX "[TTM]" +#define TTM_PFX "[TTM] " enum ttm_global_types { TTM_GLOBAL_TTM_MEM = 0, -- cgit v1.2.3-59-g8ed1b From ad49f501867cba87e1e45e5ebae0b12435d68bf1 Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Fri, 10 Jul 2009 22:36:26 +1000 Subject: drm/ttm/radeon: add dma32 support. This add support for using dma32 memory on gpus that really need it. Currently IGPs are left without DMA32 but we might need to change that unless we can fix rs690. Signed-off-by: Dave Airlie --- drivers/gpu/drm/radeon/radeon.h | 1 + drivers/gpu/drm/radeon/radeon_device.c | 17 +++++++++++++++-- drivers/gpu/drm/radeon/radeon_ttm.c | 3 ++- drivers/gpu/drm/ttm/ttm_bo.c | 7 ++++++- drivers/gpu/drm/ttm/ttm_tt.c | 9 +++++++-- include/drm/ttm/ttm_bo_driver.h | 5 ++++- 6 files changed, 35 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index e7662ba9abfb..3060ce14071e 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -624,6 +624,7 @@ struct radeon_device { bool gpu_lockup; bool shutdown; bool suspend; + bool need_dma32; }; int radeon_device_init(struct radeon_device *rdev, diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c index 332911267ebe..27a5ac969953 100644 --- a/drivers/gpu/drm/radeon/radeon_device.c +++ b/drivers/gpu/drm/radeon/radeon_device.c @@ -450,6 +450,7 @@ int radeon_device_init(struct radeon_device *rdev, uint32_t flags) { int r, ret; + int dma_bits; DRM_INFO("radeon: Initializing kernel modesetting.\n"); rdev->shutdown = false; @@ -492,8 +493,20 @@ int radeon_device_init(struct radeon_device *rdev, return r; } - /* Report DMA addressing limitation */ - r = pci_set_dma_mask(rdev->pdev, DMA_BIT_MASK(32)); + /* set DMA mask + need_dma32 flags. + * PCIE - can handle 40-bits. + * IGP - can handle 40-bits (in theory) + * AGP - generally dma32 is safest + * PCI - only dma32 + */ + rdev->need_dma32 = false; + if (rdev->flags & RADEON_IS_AGP) + rdev->need_dma32 = true; + if (rdev->flags & RADEON_IS_PCI) + rdev->need_dma32 = true; + + dma_bits = rdev->need_dma32 ? 32 : 40; + r = pci_set_dma_mask(rdev->pdev, DMA_BIT_MASK(dma_bits)); if (r) { printk(KERN_WARNING "radeon: No suitable DMA available.\n"); } diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c index 1227a97f5169..4ca9aa9203d0 100644 --- a/drivers/gpu/drm/radeon/radeon_ttm.c +++ b/drivers/gpu/drm/radeon/radeon_ttm.c @@ -442,7 +442,8 @@ int radeon_ttm_init(struct radeon_device *rdev) /* No others user of address space so set it to 0 */ r = ttm_bo_device_init(&rdev->mman.bdev, rdev->mman.mem_global_ref.object, - &radeon_bo_driver, DRM_FILE_PAGE_OFFSET); + &radeon_bo_driver, DRM_FILE_PAGE_OFFSET, + rdev->need_dma32); if (r) { DRM_ERROR("failed initializing buffer object driver(%d).\n", r); return r; diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index a753598a5e35..e55e7972c897 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -224,6 +224,9 @@ static int ttm_bo_add_ttm(struct ttm_buffer_object *bo, bool zero_alloc) TTM_ASSERT_LOCKED(&bo->mutex); bo->ttm = NULL; + if (bdev->need_dma32) + page_flags |= TTM_PAGE_FLAG_DMA32; + switch (bo->type) { case ttm_bo_type_device: if (zero_alloc) @@ -1332,7 +1335,8 @@ EXPORT_SYMBOL(ttm_bo_device_release); int ttm_bo_device_init(struct ttm_bo_device *bdev, struct ttm_mem_global *mem_glob, - struct ttm_bo_driver *driver, uint64_t file_page_offset) + struct ttm_bo_driver *driver, uint64_t file_page_offset, + bool need_dma32) { int ret = -EINVAL; @@ -1369,6 +1373,7 @@ int ttm_bo_device_init(struct ttm_bo_device *bdev, INIT_LIST_HEAD(&bdev->ddestroy); INIT_LIST_HEAD(&bdev->swap_lru); bdev->dev_mapping = NULL; + bdev->need_dma32 = need_dma32; ttm_mem_init_shrink(&bdev->shrink, ttm_bo_swapout); ret = ttm_mem_register_shrink(mem_glob, &bdev->shrink); if (unlikely(ret != 0)) { diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c index 75dc8bd24592..81ab81f030a3 100644 --- a/drivers/gpu/drm/ttm/ttm_tt.c +++ b/drivers/gpu/drm/ttm/ttm_tt.c @@ -131,10 +131,15 @@ static void ttm_tt_free_page_directory(struct ttm_tt *ttm) static struct page *ttm_tt_alloc_page(unsigned page_flags) { + gfp_t gfp_flags = GFP_HIGHUSER; + if (page_flags & TTM_PAGE_FLAG_ZERO_ALLOC) - return alloc_page(GFP_HIGHUSER | __GFP_ZERO); + gfp_flags |= __GFP_ZERO; + + if (page_flags & TTM_PAGE_FLAG_DMA32) + gfp_flags |= __GFP_DMA32; - return alloc_page(GFP_HIGHUSER); + return alloc_page(gfp_flags); } static void ttm_tt_free_user_pages(struct ttm_tt *ttm) diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h index 62ed733c52a2..ea83dd23a4d7 100644 --- a/include/drm/ttm/ttm_bo_driver.h +++ b/include/drm/ttm/ttm_bo_driver.h @@ -121,6 +121,7 @@ struct ttm_backend { #define TTM_PAGE_FLAG_SWAPPED (1 << 4) #define TTM_PAGE_FLAG_PERSISTANT_SWAP (1 << 5) #define TTM_PAGE_FLAG_ZERO_ALLOC (1 << 6) +#define TTM_PAGE_FLAG_DMA32 (1 << 7) enum ttm_caching_state { tt_uncached, @@ -429,6 +430,8 @@ struct ttm_bo_device { */ struct delayed_work wq; + + bool need_dma32; }; /** @@ -648,7 +651,7 @@ extern int ttm_bo_device_release(struct ttm_bo_device *bdev); extern int ttm_bo_device_init(struct ttm_bo_device *bdev, struct ttm_mem_global *mem_glob, struct ttm_bo_driver *driver, - uint64_t file_page_offset); + uint64_t file_page_offset, bool need_dma32); /** * ttm_bo_reserve: -- cgit v1.2.3-59-g8ed1b From 43237b5490e8f2f4679decd660064ff35ce490cc Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 20 May 2009 18:41:58 +0200 Subject: ext3: Get rid of extenddisksize parameter of ext3_get_blocks_handle() Get rid of extenddisksize parameter of ext3_get_blocks_handle(). This seems to be a relict from some old days and setting disksize in this function does not make much sence. Currently it was set only by ext3_getblk(). Since the parameter has some effect only if create == 1, it is easy to check that the three callers which end up calling ext3_getblk() with create == 1 (ext3_append, ext3_quota_write, ext3_mkdir) do the right thing and set disksize themselves. Signed-off-by: Jan Kara --- fs/ext3/dir.c | 3 +-- fs/ext3/inode.c | 13 +++---------- include/linux/ext3_fs.h | 2 +- 3 files changed, 5 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index 3d724a95882f..373fa90c796a 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c @@ -130,8 +130,7 @@ static int ext3_readdir(struct file * filp, struct buffer_head *bh = NULL; map_bh.b_state = 0; - err = ext3_get_blocks_handle(NULL, inode, blk, 1, - &map_bh, 0, 0); + err = ext3_get_blocks_handle(NULL, inode, blk, 1, &map_bh, 0); if (err > 0) { pgoff_t index = map_bh.b_blocknr >> (PAGE_CACHE_SHIFT - inode->i_blkbits); diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 4d7da6f6184b..b49908a167ae 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -788,7 +788,7 @@ err_out: int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, sector_t iblock, unsigned long maxblocks, struct buffer_head *bh_result, - int create, int extend_disksize) + int create) { int err = -EIO; int offsets[4]; @@ -911,13 +911,6 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, if (!err) err = ext3_splice_branch(handle, inode, iblock, partial, indirect_blks, count); - /* - * i_disksize growing is protected by truncate_mutex. Don't forget to - * protect it if you're about to implement concurrent - * ext3_get_block() -bzzz - */ - if (!err && extend_disksize && inode->i_size > ei->i_disksize) - ei->i_disksize = inode->i_size; mutex_unlock(&ei->truncate_mutex); if (err) goto cleanup; @@ -972,7 +965,7 @@ static int ext3_get_block(struct inode *inode, sector_t iblock, } ret = ext3_get_blocks_handle(handle, inode, iblock, - max_blocks, bh_result, create, 0); + max_blocks, bh_result, create); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); ret = 0; @@ -1005,7 +998,7 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode, dummy.b_blocknr = -1000; buffer_trace_init(&dummy.b_history); err = ext3_get_blocks_handle(handle, inode, block, 1, - &dummy, create, 1); + &dummy, create); /* * ext3_get_blocks_handle() returns number of blocks * mapped. 0 in case of a HOLE. diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index 634a5e5aba3e..7499b3667798 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -874,7 +874,7 @@ struct buffer_head * ext3_getblk (handle_t *, struct inode *, long, int, int *); struct buffer_head * ext3_bread (handle_t *, struct inode *, int, int, int *); int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, sector_t iblock, unsigned long maxblocks, struct buffer_head *bh_result, - int create, int extend_disksize); + int create); extern struct inode *ext3_iget(struct super_block *, unsigned long); extern int ext3_write_inode (struct inode *, int); -- cgit v1.2.3-59-g8ed1b From 5c9228f0cfb09a098a8a380116b42ae099e967b6 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 16 Jul 2009 16:06:09 +0100 Subject: vt: drop bootmem/slab memory distinction Bootmem is not used for the vt screen buffer anymore as slab is now available at the time the console is initialized. Get rid of the now superfluous distinction between slab and bootmem, it's always slab. This also fixes a kmalloc leak which Catalin described thusly: Commit a5f4f52e ("vt: use kzalloc() instead of the bootmem allocator") replaced the alloc_bootmem() with kzalloc() but didn't set vc_kmalloced to 1 and the memory block is later leaked. The corresponding kmemleak trace: unreferenced object 0xdf828000 (size 8192): comm "swapper", pid 0, jiffies 4294937296 backtrace: [] __save_stack_trace+0x17/0x1c [] log_early+0x55/0x84 [] kmemleak_alloc+0x33/0x3c [] __kmalloc+0xd7/0xe4 [] con_init+0xbf/0x1b8 [] console_init+0x11/0x20 [] start_kernel+0x137/0x1e4 Signed-off-by: Johannes Weiner Reviewed-by: Pekka Enberg Tested-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- drivers/char/vt.c | 12 +++--------- include/linux/console_struct.h | 1 - 2 files changed, 3 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/drivers/char/vt.c b/drivers/char/vt.c index 7947bd1b4cf7..404f4c1ee431 100644 --- a/drivers/char/vt.c +++ b/drivers/char/vt.c @@ -770,14 +770,12 @@ int vc_allocate(unsigned int currcons) /* return 0 on success */ visual_init(vc, currcons, 1); if (!*vc->vc_uni_pagedir_loc) con_set_default_unimap(vc); - if (!vc->vc_kmalloced) - vc->vc_screenbuf = kmalloc(vc->vc_screenbuf_size, GFP_KERNEL); + vc->vc_screenbuf = kmalloc(vc->vc_screenbuf_size, GFP_KERNEL); if (!vc->vc_screenbuf) { kfree(vc); vc_cons[currcons].d = NULL; return -ENOMEM; } - vc->vc_kmalloced = 1; vc_init(vc, vc->vc_rows, vc->vc_cols, 1); vcs_make_sysfs(currcons); atomic_notifier_call_chain(&vt_notifier_list, VT_ALLOCATE, ¶m); @@ -913,10 +911,8 @@ static int vc_do_resize(struct tty_struct *tty, struct vc_data *vc, if (new_scr_end > new_origin) scr_memsetw((void *)new_origin, vc->vc_video_erase_char, new_scr_end - new_origin); - if (vc->vc_kmalloced) - kfree(vc->vc_screenbuf); + kfree(vc->vc_screenbuf); vc->vc_screenbuf = newscreen; - vc->vc_kmalloced = 1; vc->vc_screenbuf_size = new_screen_size; set_origin(vc); @@ -995,8 +991,7 @@ void vc_deallocate(unsigned int currcons) vc->vc_sw->con_deinit(vc); put_pid(vc->vt_pid); module_put(vc->vc_sw->owner); - if (vc->vc_kmalloced) - kfree(vc->vc_screenbuf); + kfree(vc->vc_screenbuf); if (currcons >= MIN_NR_CONSOLES) kfree(vc); vc_cons[currcons].d = NULL; @@ -2881,7 +2876,6 @@ static int __init con_init(void) INIT_WORK(&vc_cons[currcons].SAK_work, vc_SAK); visual_init(vc, currcons, 1); vc->vc_screenbuf = kzalloc(vc->vc_screenbuf_size, GFP_NOWAIT); - vc->vc_kmalloced = 0; vc_init(vc, vc->vc_rows, vc->vc_cols, currcons || !vc->vc_sw->con_save_screen); } diff --git a/include/linux/console_struct.h b/include/linux/console_struct.h index d71f7c0f931b..38fe59dc89ae 100644 --- a/include/linux/console_struct.h +++ b/include/linux/console_struct.h @@ -89,7 +89,6 @@ struct vc_data { unsigned int vc_need_wrap : 1; unsigned int vc_can_do_color : 1; unsigned int vc_report_mouse : 2; - unsigned int vc_kmalloced : 1; unsigned char vc_utf : 1; /* Unicode UTF-8 encoding */ unsigned char vc_utf_count; int vc_utf_char; -- cgit v1.2.3-59-g8ed1b From 4dc6dc7162c08b9965163c9ab3f9375d4adff2c7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 15 Jul 2009 23:13:10 +0000 Subject: net: sock_copy() fixes Commit e912b1142be8f1e2c71c71001dc992c6e5eb2ec1 (net: sk_prot_alloc() should not blindly overwrite memory) took care of not zeroing whole new socket at allocation time. sock_copy() is another spot where we should be very careful. We should not set refcnt to a non null value, until we are sure other fields are correctly setup, or a lockless reader could catch this socket by mistake, while not fully (re)initialized. This patch puts sk_node & sk_refcnt to the very beginning of struct sock to ease sock_copy() & sk_prot_alloc() job. We add appropriate smp_wmb() before sk_refcnt initializations to match our RCU requirements (changes to sock keys should be committed to memory before sk_refcnt setting) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/sock.h | 32 +++++++++++++++++++------------- net/core/sock.c | 20 ++++++++++++++++++-- 2 files changed, 37 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/net/sock.h b/include/net/sock.h index 2c0da9239b95..950409dcec3d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -104,15 +104,15 @@ struct net; /** * struct sock_common - minimal network layer representation of sockets + * @skc_node: main hash linkage for various protocol lookup tables + * @skc_nulls_node: main hash linkage for UDP/UDP-Lite protocol + * @skc_refcnt: reference count + * @skc_hash: hash value used with various protocol lookup tables * @skc_family: network address family * @skc_state: Connection state * @skc_reuse: %SO_REUSEADDR setting * @skc_bound_dev_if: bound device index if != 0 - * @skc_node: main hash linkage for various protocol lookup tables - * @skc_nulls_node: main hash linkage for UDP/UDP-Lite protocol * @skc_bind_node: bind hash linkage for various protocol lookup tables - * @skc_refcnt: reference count - * @skc_hash: hash value used with various protocol lookup tables * @skc_prot: protocol handlers inside a network family * @skc_net: reference to the network namespace of this socket * @@ -120,17 +120,21 @@ struct net; * for struct sock and struct inet_timewait_sock. */ struct sock_common { - unsigned short skc_family; - volatile unsigned char skc_state; - unsigned char skc_reuse; - int skc_bound_dev_if; + /* + * first fields are not copied in sock_copy() + */ union { struct hlist_node skc_node; struct hlist_nulls_node skc_nulls_node; }; - struct hlist_node skc_bind_node; atomic_t skc_refcnt; + unsigned int skc_hash; + unsigned short skc_family; + volatile unsigned char skc_state; + unsigned char skc_reuse; + int skc_bound_dev_if; + struct hlist_node skc_bind_node; struct proto *skc_prot; #ifdef CONFIG_NET_NS struct net *skc_net; @@ -208,15 +212,17 @@ struct sock { * don't add nothing before this first member (__sk_common) --acme */ struct sock_common __sk_common; +#define sk_node __sk_common.skc_node +#define sk_nulls_node __sk_common.skc_nulls_node +#define sk_refcnt __sk_common.skc_refcnt + +#define sk_copy_start __sk_common.skc_hash +#define sk_hash __sk_common.skc_hash #define sk_family __sk_common.skc_family #define sk_state __sk_common.skc_state #define sk_reuse __sk_common.skc_reuse #define sk_bound_dev_if __sk_common.skc_bound_dev_if -#define sk_node __sk_common.skc_node -#define sk_nulls_node __sk_common.skc_nulls_node #define sk_bind_node __sk_common.skc_bind_node -#define sk_refcnt __sk_common.skc_refcnt -#define sk_hash __sk_common.skc_hash #define sk_prot __sk_common.skc_prot #define sk_net __sk_common.skc_net kmemcheck_bitfield_begin(flags); diff --git a/net/core/sock.c b/net/core/sock.c index ba5d2116aea1..d9eec153d531 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -919,13 +919,19 @@ static inline void sock_lock_init(struct sock *sk) af_family_keys + sk->sk_family); } +/* + * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet, + * even temporarly, because of RCU lookups. sk_node should also be left as is. + */ static void sock_copy(struct sock *nsk, const struct sock *osk) { #ifdef CONFIG_SECURITY_NETWORK void *sptr = nsk->sk_security; #endif - - memcpy(nsk, osk, osk->sk_prot->obj_size); + BUILD_BUG_ON(offsetof(struct sock, sk_copy_start) != + sizeof(osk->sk_node) + sizeof(osk->sk_refcnt)); + memcpy(&nsk->sk_copy_start, &osk->sk_copy_start, + osk->sk_prot->obj_size - offsetof(struct sock, sk_copy_start)); #ifdef CONFIG_SECURITY_NETWORK nsk->sk_security = sptr; security_sk_clone(osk, nsk); @@ -1140,6 +1146,11 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority) newsk->sk_err = 0; newsk->sk_priority = 0; + /* + * Before updating sk_refcnt, we must commit prior changes to memory + * (Documentation/RCU/rculist_nulls.txt for details) + */ + smp_wmb(); atomic_set(&newsk->sk_refcnt, 2); /* @@ -1855,6 +1866,11 @@ void sock_init_data(struct socket *sock, struct sock *sk) sk->sk_stamp = ktime_set(-1L, 0); + /* + * Before updating sk_refcnt, we must commit prior changes to memory + * (Documentation/RCU/rculist_nulls.txt for details) + */ + smp_wmb(); atomic_set(&sk->sk_refcnt, 1); atomic_set(&sk->sk_wmem_alloc, 1); atomic_set(&sk->sk_drops, 0); -- cgit v1.2.3-59-g8ed1b From 5780888bcac316508eb5f4dd23bbea8b5057647c Mon Sep 17 00:00:00 2001 From: Matias Zabaljauregui Date: Thu, 18 Jun 2009 11:44:06 -0300 Subject: lguest: fix journey fix: "make Guest" was complaining about duplicated G:032 Signed-off-by: Matias Zabaljauregui Signed-off-by: Rusty Russell --- arch/x86/include/asm/lguest_hcall.h | 2 +- arch/x86/lguest/boot.c | 2 +- include/linux/lguest.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index d31c4a684078..33600a66755f 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h @@ -30,7 +30,7 @@ #include #include -/*G:031 But first, how does our Guest contact the Host to ask for privileged +/*G:030 But first, how does our Guest contact the Host to ask for privileged * operations? There are two ways: the direct way is to make a "hypercall", * to make requests of the Host Itself. * diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 7bc65f0f62c4..0188fd37b6c0 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1079,7 +1079,7 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, return insn_len; } -/*G:030 Once we get to lguest_init(), we know we're a Guest. The various +/*G:029 Once we get to lguest_init(), we know we're a Guest. The various * pv_ops structures in the kernel provide points for (almost) every routine we * have to override to avoid privileged instructions. */ __init void lguest_init(void) diff --git a/include/linux/lguest.h b/include/linux/lguest.h index 7bc1440fc473..dbf2479e808e 100644 --- a/include/linux/lguest.h +++ b/include/linux/lguest.h @@ -11,7 +11,7 @@ #define LG_CLOCK_MIN_DELTA 100UL #define LG_CLOCK_MAX_DELTA ULONG_MAX -/*G:032 The second method of communicating with the Host is to via "struct +/*G:031 The second method of communicating with the Host is to via "struct * lguest_data". Once the Guest's initialization hypercall tells the Host where * this is, the Guest and Host both publish information in it. :*/ struct lguest_data -- cgit v1.2.3-59-g8ed1b From e79f07e2925b10f09a2621650c16f3d6ea778747 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 7 Jul 2009 08:47:10 -0600 Subject: virtio_net: Sync header with qemu Qemu added support for a few extra RX modes that Linux doesn't currently make use of. Sync the headers to maintain consistency. Signed-off-by: Alex Williamson Signed-off-by: Rusty Russell --- include/linux/virtio_net.h | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index cec79adbe3ea..9c543d6ac535 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -27,6 +27,7 @@ #define VIRTIO_NET_F_CTRL_VQ 17 /* Control channel available */ #define VIRTIO_NET_F_CTRL_RX 18 /* Control channel RX mode support */ #define VIRTIO_NET_F_CTRL_VLAN 19 /* Control channel VLAN filtering */ +#define VIRTIO_NET_F_CTRL_RX_EXTRA 20 /* Extra RX mode control support */ #define VIRTIO_NET_S_LINK_UP 1 /* Link is up */ @@ -81,14 +82,19 @@ typedef __u8 virtio_net_ctrl_ack; #define VIRTIO_NET_ERR 1 /* - * Control the RX mode, ie. promisucous and allmulti. PROMISC and - * ALLMULTI commands require an "out" sg entry containing a 1 byte - * state value, zero = disable, non-zero = enable. These commands - * are supported with the VIRTIO_NET_F_CTRL_RX feature. + * Control the RX mode, ie. promisucous, allmulti, etc... + * All commands require an "out" sg entry containing a 1 byte + * state value, zero = disable, non-zero = enable. Commands + * 0 and 1 are supported with the VIRTIO_NET_F_CTRL_RX feature. + * Commands 2-5 are added with VIRTIO_NET_F_CTRL_RX_EXTRA. */ #define VIRTIO_NET_CTRL_RX 0 #define VIRTIO_NET_CTRL_RX_PROMISC 0 #define VIRTIO_NET_CTRL_RX_ALLMULTI 1 + #define VIRTIO_NET_CTRL_RX_ALLUNI 2 + #define VIRTIO_NET_CTRL_RX_NOMULTI 3 + #define VIRTIO_NET_CTRL_RX_NOUNI 4 + #define VIRTIO_NET_CTRL_RX_NOBCAST 5 /* * Control the MAC filter table. -- cgit v1.2.3-59-g8ed1b From 04e448d9a386640a79a4aa71251aa1cdd314f662 Mon Sep 17 00:00:00 2001 From: Tim Abbott Date: Sun, 12 Jul 2009 18:23:33 -0400 Subject: vmlinux.lds.h: restructure BSS linker script macros The BSS section macros in vmlinux.lds.h currently place the .sbss input section outside the bounds of [__bss_start, __bss_end]. On all architectures except for microblaze that handle both .sbss and __bss_start/__bss_end, this is wrong: the .sbss input section is within the range [__bss_start, __bss_end]. Relatedly, the example code at the top of the file actually has __bss_start/__bss_end defined twice; I believe the right fix here is to define them in the BSS_SECTION macro but not in the BSS macro. Another problem with the current macros is that several architectures have an ALIGN(4) or some other small number just before __bss_stop in their linker scripts. The BSS_SECTION macro currently hardcodes this to 4; while it should really be an argument. It also ignores its sbss_align argument; fix that. mn10300 is the only user at present of any of the macros touched by this patch. It looks like mn10300 actually was incorrectly converted to use the new BSS() macro (the alignment of 4 prior to conversion was a __bss_stop alignment, but the argument to the BSS macro is a start alignment). So fix this as well. I'd like acks from Sam and David on this one. Also CCing Paul, since he has a patch from me which will need to be updated to use BSS_SECTION(0, PAGE_SIZE, 4) once this gets merged. Signed-off-by: Tim Abbott Cc: Paul Mundt Cc: David Howells Signed-off-by: Sam Ravnborg --- arch/mn10300/kernel/vmlinux.lds.S | 2 +- include/asm-generic/vmlinux.lds.h | 19 +++++++++---------- 2 files changed, 10 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/arch/mn10300/kernel/vmlinux.lds.S b/arch/mn10300/kernel/vmlinux.lds.S index c96ba3da95ac..f4aa07934654 100644 --- a/arch/mn10300/kernel/vmlinux.lds.S +++ b/arch/mn10300/kernel/vmlinux.lds.S @@ -107,7 +107,7 @@ SECTIONS __init_end = .; /* freed after init ends here */ - BSS(4) + BSS_SECTION(0, PAGE_SIZE, 4) _end = . ; diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index a553f1041cf1..6ad76bf5fb40 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -30,9 +30,7 @@ * EXCEPTION_TABLE(...) * NOTES * - * __bss_start = .; - * BSS_SECTION(0, 0) - * __bss_stop = .; + * BSS_SECTION(0, 0, 0) * _end = .; * * /DISCARD/ : { @@ -489,7 +487,8 @@ * bss (Block Started by Symbol) - uninitialized data * zeroed during startup */ -#define SBSS \ +#define SBSS(sbss_align) \ + . = ALIGN(sbss_align); \ .sbss : AT(ADDR(.sbss) - LOAD_OFFSET) { \ *(.sbss) \ *(.scommon) \ @@ -498,12 +497,10 @@ #define BSS(bss_align) \ . = ALIGN(bss_align); \ .bss : AT(ADDR(.bss) - LOAD_OFFSET) { \ - VMLINUX_SYMBOL(__bss_start) = .; \ *(.bss.page_aligned) \ *(.dynbss) \ *(.bss) \ *(COMMON) \ - VMLINUX_SYMBOL(__bss_stop) = .; \ } /* @@ -735,8 +732,10 @@ INIT_RAM_FS \ } -#define BSS_SECTION(sbss_align, bss_align) \ - SBSS \ +#define BSS_SECTION(sbss_align, bss_align, stop_align) \ + . = ALIGN(sbss_align); \ + VMLINUX_SYMBOL(__bss_start) = .; \ + SBSS(sbss_align) \ BSS(bss_align) \ - . = ALIGN(4); - + . = ALIGN(stop_align); \ + VMLINUX_SYMBOL(__bss_stop) = .; -- cgit v1.2.3-59-g8ed1b From 6301cb95c119ebf324bb96ee226fa9ddffad80a7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 17 Jul 2009 14:15:47 +0200 Subject: sched: fix nr_uninterruptible accounting of frozen tasks really commit e3c8ca8336 (sched: do not count frozen tasks toward load) broke the nr_uninterruptible accounting on freeze/thaw. On freeze the task is excluded from accounting with a check for (task->flags & PF_FROZEN), but that flag is cleared before the task is thawed. So while we prevent that the task with state TASK_UNINTERRUPTIBLE is accounted to nr_uninterruptible on freeze we decrement nr_uninterruptible on thaw. Use a separate flag which is handled by the freezing task itself. Set it before calling the scheduler with TASK_UNINTERRUPTIBLE state and clear it after we return from frozen state. Cc: Signed-off-by: Thomas Gleixner --- include/linux/sched.h | 3 ++- kernel/freezer.c | 7 +++++++ 2 files changed, 9 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 16a982e389fb..3ab08e4bb6b8 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -209,7 +209,7 @@ extern unsigned long long time_sync_thresh; ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0) #define task_contributes_to_load(task) \ ((task->state & TASK_UNINTERRUPTIBLE) != 0 && \ - (task->flags & PF_FROZEN) == 0) + (task->flags & PF_FREEZING) == 0) #define __set_task_state(tsk, state_value) \ do { (tsk)->state = (state_value); } while (0) @@ -1680,6 +1680,7 @@ extern cputime_t task_gtime(struct task_struct *p); #define PF_MEMALLOC 0x00000800 /* Allocating memory */ #define PF_FLUSHER 0x00001000 /* responsible for disk writeback */ #define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */ +#define PF_FREEZING 0x00004000 /* freeze in progress. do not account to load */ #define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */ #define PF_FROZEN 0x00010000 /* frozen for system suspend */ #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ diff --git a/kernel/freezer.c b/kernel/freezer.c index 2f4936cf7083..bd1d42b17cb2 100644 --- a/kernel/freezer.c +++ b/kernel/freezer.c @@ -44,12 +44,19 @@ void refrigerator(void) recalc_sigpending(); /* We sent fake signal, clean it up */ spin_unlock_irq(¤t->sighand->siglock); + /* prevent accounting of that task to load */ + current->flags |= PF_FREEZING; + for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (!frozen(current)) break; schedule(); } + + /* Remove the accounting blocker */ + current->flags &= ~PF_FREEZING; + pr_debug("%s left refrigerator\n", current->comm); __set_current_state(save); } -- cgit v1.2.3-59-g8ed1b From e3afe7b75ed8f809c1473ea9b39267487c187ccb Mon Sep 17 00:00:00 2001 From: John Dykstra Date: Thu, 16 Jul 2009 05:04:51 +0000 Subject: tcp: Fix MD5 signature checking on IPv4 mapped sockets Fix MD5 signature checking so that an IPv4 active open to an IPv6 socket can succeed. In particular, use the correct address family's signature generation function for the SYN/ACK. Reported-by: Stephen Hemminger Signed-off-by: John Dykstra Signed-off-by: David S. Miller --- include/net/tcp.h | 5 +++++ net/ipv4/tcp_ipv4.c | 1 + net/ipv4/tcp_output.c | 2 +- net/ipv6/tcp_ipv6.c | 1 + 4 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/tcp.h b/include/net/tcp.h index 19f4150f4d4d..88af84306471 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1425,6 +1425,11 @@ struct tcp_request_sock_ops { #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *(*md5_lookup) (struct sock *sk, struct request_sock *req); + int (*calc_md5_hash) (char *location, + struct tcp_md5sig_key *md5, + struct sock *sk, + struct request_sock *req, + struct sk_buff *skb); #endif }; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5a1ca2698c88..7c107eb876c8 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1160,6 +1160,7 @@ struct request_sock_ops tcp_request_sock_ops __read_mostly = { #ifdef CONFIG_TCP_MD5SIG static struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { .md5_lookup = tcp_v4_reqsk_md5_lookup, + .calc_md5_hash = tcp_v4_md5_hash_skb, }; #endif diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5bdf08d312d9..bd62712848fa 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2261,7 +2261,7 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, #ifdef CONFIG_TCP_MD5SIG /* Okay, we have all we need - do the md5 hash if needed */ if (md5) { - tp->af_specific->calc_md5_hash(md5_hash_location, + tcp_rsk(req)->af_specific->calc_md5_hash(md5_hash_location, md5, NULL, req, skb); } #endif diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 58810c65b635..ae3d65753562 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -896,6 +896,7 @@ struct request_sock_ops tcp6_request_sock_ops __read_mostly = { #ifdef CONFIG_TCP_MD5SIG static struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { .md5_lookup = tcp_v6_reqsk_md5_lookup, + .calc_md5_hash = tcp_v6_md5_hash_skb, }; #endif -- cgit v1.2.3-59-g8ed1b From 591d2fb02ea80472d846c0b8507007806bdd69cc Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 21 Jul 2009 11:09:39 +0200 Subject: genirq: Delegate irq affinity setting to the irq thread irq_set_thread_affinity() calls set_cpus_allowed_ptr() which might sleep, but irq_set_thread_affinity() is called with desc->lock held and can be called from hard interrupt context as well. The code has another bug as it does not hold a ref on the task struct as required by set_cpus_allowed_ptr(). Just set the IRQTF_AFFINITY bit in action->thread_flags. The next time the thread runs it migrates itself. Solves all of the above problems nicely. Add kerneldoc to irq_set_thread_affinity() while at it. Signed-off-by: Thomas Gleixner LKML-Reference: --- include/linux/interrupt.h | 2 ++ kernel/irq/internals.h | 3 +-- kernel/irq/manage.c | 50 +++++++++++++++++++++++++++++++++++++++++------ kernel/irq/migration.c | 2 +- 4 files changed, 48 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 2721f07e9354..88b056ac5629 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -64,11 +64,13 @@ * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run * IRQTF_DIED - handler thread died * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed + * IRQTF_AFFINITY - irq thread is requested to adjust affinity */ enum { IRQTF_RUNTHREAD, IRQTF_DIED, IRQTF_WARNED, + IRQTF_AFFINITY, }; typedef irqreturn_t (*irq_handler_t)(int, void *); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 73468253143b..e70ed5592eb9 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -42,8 +42,7 @@ static inline void unregister_handler_proc(unsigned int irq, extern int irq_select_affinity_usr(unsigned int irq); -extern void -irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask); +extern void irq_set_thread_affinity(struct irq_desc *desc); /* * Debugging printout: diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 50da67672901..f0de36f13a44 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -80,14 +80,22 @@ int irq_can_set_affinity(unsigned int irq) return 1; } -void -irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask) +/** + * irq_set_thread_affinity - Notify irq threads to adjust affinity + * @desc: irq descriptor which has affitnity changed + * + * We just set IRQTF_AFFINITY and delegate the affinity setting + * to the interrupt thread itself. We can not call + * set_cpus_allowed_ptr() here as we hold desc->lock and this + * code can be called from hard interrupt context. + */ +void irq_set_thread_affinity(struct irq_desc *desc) { struct irqaction *action = desc->action; while (action) { if (action->thread) - set_cpus_allowed_ptr(action->thread, cpumask); + set_bit(IRQTF_AFFINITY, &action->thread_flags); action = action->next; } } @@ -112,7 +120,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) if (desc->status & IRQ_MOVE_PCNTXT) { if (!desc->chip->set_affinity(irq, cpumask)) { cpumask_copy(desc->affinity, cpumask); - irq_set_thread_affinity(desc, cpumask); + irq_set_thread_affinity(desc); } } else { @@ -122,7 +130,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask) #else if (!desc->chip->set_affinity(irq, cpumask)) { cpumask_copy(desc->affinity, cpumask); - irq_set_thread_affinity(desc, cpumask); + irq_set_thread_affinity(desc); } #endif desc->status |= IRQ_AFFINITY_SET; @@ -176,7 +184,7 @@ int irq_select_affinity_usr(unsigned int irq) spin_lock_irqsave(&desc->lock, flags); ret = setup_affinity(irq, desc); if (!ret) - irq_set_thread_affinity(desc, desc->affinity); + irq_set_thread_affinity(desc); spin_unlock_irqrestore(&desc->lock, flags); return ret; @@ -443,6 +451,34 @@ static int irq_wait_for_interrupt(struct irqaction *action) return -1; } +/* + * Check whether we need to change the affinity of the interrupt thread. + */ +static void +irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) +{ + cpumask_var_t mask; + + if (!test_and_clear_bit(IRQTF_AFFINITY, &action->thread_flags)) + return; + + /* + * In case we are out of memory we set IRQTF_AFFINITY again and + * try again next time + */ + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) { + set_bit(IRQTF_AFFINITY, &action->thread_flags); + return; + } + + spin_lock_irq(&desc->lock); + cpumask_copy(mask, desc->affinity); + spin_unlock_irq(&desc->lock); + + set_cpus_allowed_ptr(current, mask); + free_cpumask_var(mask); +} + /* * Interrupt handler thread */ @@ -458,6 +494,8 @@ static int irq_thread(void *data) while (!irq_wait_for_interrupt(action)) { + irq_thread_check_affinity(desc, action); + atomic_inc(&desc->threads_active); spin_lock_irq(&desc->lock); diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c index cfe767ca1545..fcb6c96f2627 100644 --- a/kernel/irq/migration.c +++ b/kernel/irq/migration.c @@ -45,7 +45,7 @@ void move_masked_irq(int irq) < nr_cpu_ids)) if (!desc->chip->set_affinity(irq, desc->pending_mask)) { cpumask_copy(desc->affinity, desc->pending_mask); - irq_set_thread_affinity(desc, desc->pending_mask); + irq_set_thread_affinity(desc); } cpumask_clear(desc->pending_mask); -- cgit v1.2.3-59-g8ed1b From e56f0975360369347725c49654ecfe3792710429 Mon Sep 17 00:00:00 2001 From: Alan Jenkins Date: Sat, 18 Jul 2009 19:20:20 +0100 Subject: rfkill: remove too-strict __must_check Some drivers don't need the return value of rfkill_set_hw_state(), so it should not be marked as __must_check. Signed-off-by: Alan Jenkins Acked-by: Johannes Berg Signed-off-by: John W. Linville --- include/linux/rfkill.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/rfkill.h b/include/linux/rfkill.h index 2ce29831feb6..278777fa8a3a 100644 --- a/include/linux/rfkill.h +++ b/include/linux/rfkill.h @@ -224,7 +224,7 @@ void rfkill_destroy(struct rfkill *rfkill); * should be blocked) so that drivers need not keep track of the soft * block state -- which they might not be able to. */ -bool __must_check rfkill_set_hw_state(struct rfkill *rfkill, bool blocked); +bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked); /** * rfkill_set_sw_state - Set the internal rfkill software block state -- cgit v1.2.3-59-g8ed1b From f44aebcc566d1d6275f7191867b9633dc11de2ee Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 15 Jul 2009 15:49:52 -0400 Subject: inotify: use GFP_NOFS under potential memory pressure inotify can have a watchs removed under filesystem reclaim. ================================= [ INFO: inconsistent lock state ] 2.6.31-rc2 #16 --------------------------------- inconsistent {IN-RECLAIM_FS-W} -> {RECLAIM_FS-ON-W} usage. khubd/217 [HC0[0]:SC0[0]:HE1:SE1] takes: (iprune_mutex){+.+.?.}, at: [] invalidate_inodes+0x20/0xe3 {IN-RECLAIM_FS-W} state was registered at: [] __lock_acquire+0x2c9/0xac4 [] lock_acquire+0x9f/0xc2 [] __mutex_lock_common+0x2d/0x323 [] mutex_lock_nested+0x2e/0x36 [] shrink_icache_memory+0x38/0x1b2 [] shrink_slab+0xe2/0x13c [] kswapd+0x3d1/0x55d [] kthread+0x66/0x6b [] kernel_thread_helper+0x7/0x10 [] 0xffffffff Two things are needed to fix this. First we need a method to tell fsnotify_create_event() to use GFP_NOFS and second we need to stop using one global IN_IGNORED event and allocate them one at a time. This solves current issues with multiple IN_IGNORED on a queue having tail drop problems and simplifies the allocations since we don't have to worry about two tasks opperating on the IGNORED event concurrently. Signed-off-by: Eric Paris --- fs/notify/fsnotify.c | 4 +++- fs/notify/inotify/inotify_user.c | 18 ++++++++++++------ fs/notify/notification.c | 9 +++++---- include/linux/fsnotify_backend.h | 2 +- 4 files changed, 21 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index ec2f7bd76818..037e878e03fc 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -159,7 +159,9 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const if (!group->ops->should_send_event(group, to_tell, mask)) continue; if (!event) { - event = fsnotify_create_event(to_tell, mask, data, data_is, file_name, cookie); + event = fsnotify_create_event(to_tell, mask, data, + data_is, file_name, cookie, + GFP_KERNEL); /* shit, we OOM'd and now we can't tell, maybe * someday someone else will want to do something * here */ diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 726118a5845b..f30d9bbc2e1b 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -57,7 +57,6 @@ int inotify_max_user_watches __read_mostly; static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; struct kmem_cache *event_priv_cachep __read_mostly; -static struct fsnotify_event *inotify_ignored_event; /* * When inotify registers a new group it increments this and uses that @@ -384,12 +383,19 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, struct fsnotify_group *group) { struct inotify_inode_mark_entry *ientry; + struct fsnotify_event *ignored_event; struct inotify_event_private_data *event_priv; struct fsnotify_event_private_data *fsn_event_priv; + ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, + FSNOTIFY_EVENT_NONE, NULL, 0, + GFP_NOFS); + if (!ignored_event) + return; + ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); - event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); + event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS); if (unlikely(!event_priv)) goto skip_send_ignore; @@ -398,7 +404,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, fsn_event_priv->group = group; event_priv->wd = ientry->wd; - fsnotify_add_notify_event(group, inotify_ignored_event, fsn_event_priv); + fsnotify_add_notify_event(group, ignored_event, fsn_event_priv); /* did the private data get added? */ if (list_empty(&fsn_event_priv->event_list)) @@ -406,6 +412,9 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, skip_send_ignore: + /* matches the reference taken when the event was created */ + fsnotify_put_event(ignored_event); + /* remove this entry from the idr */ inotify_remove_from_idr(group, ientry); @@ -748,9 +757,6 @@ static int __init inotify_user_setup(void) inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC); event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC); - inotify_ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, FSNOTIFY_EVENT_NONE, NULL, 0); - if (!inotify_ignored_event) - panic("unable to allocate the inotify ignored event\n"); inotify_max_queued_events = 16384; inotify_max_user_instances = 128; diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 2b20feaf263a..521368574e97 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -153,7 +153,7 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new return true; break; case (FSNOTIFY_EVENT_NONE): - return true; + return false; }; } return false; @@ -345,18 +345,19 @@ static void initialize_event(struct fsnotify_event *event) * @name the filename, if available */ struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data, - int data_type, const char *name, u32 cookie) + int data_type, const char *name, u32 cookie, + gfp_t gfp) { struct fsnotify_event *event; - event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL); + event = kmem_cache_alloc(fsnotify_event_cachep, gfp); if (!event) return NULL; initialize_event(event); if (name) { - event->file_name = kstrdup(name, GFP_KERNEL); + event->file_name = kstrdup(name, gfp); if (!event->file_name) { kmem_cache_free(fsnotify_event_cachep, event); return NULL; diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 6c3de999fb34..4d6f47b51189 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -352,7 +352,7 @@ extern void fsnotify_unmount_inodes(struct list_head *list); /* put here because inotify does some weird stuff when destroying watches */ extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data, int data_is, const char *name, - u32 cookie); + u32 cookie, gfp_t gfp); #else -- cgit v1.2.3-59-g8ed1b From 9ba5f005c994ad28e266a0cd14ef29354be382c9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Jul 2009 14:18:35 +0200 Subject: softirq: introduce tasklet_hrtimer infrastructure commit ca109491f (hrtimer: removing all ur callback modes) moved all hrtimer callbacks into hard interrupt context when high resolution timers are active. That breaks code which relied on the assumption that the callback happens in softirq context. Provide a generic infrastructure which combines tasklets and hrtimers together to provide an in-softirq hrtimer experience. Signed-off-by: Peter Zijlstra Cc: torvalds@linux-foundation.org Cc: kaber@trash.net Cc: David Miller LKML-Reference: <1248265724.27058.1366.camel@twins> Signed-off-by: Thomas Gleixner --- include/linux/interrupt.h | 26 +++++++++++++++++++ kernel/softirq.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 89 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 2721f07e9354..fd4c9c63c757 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -517,6 +518,31 @@ extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu); extern void tasklet_init(struct tasklet_struct *t, void (*func)(unsigned long), unsigned long data); +struct tasklet_hrtimer { + struct hrtimer timer; + struct tasklet_struct tasklet; + enum hrtimer_restart (*function)(struct hrtimer *); +}; + +extern void +tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer, + enum hrtimer_restart (*function)(struct hrtimer *), + clockid_t which_clock, enum hrtimer_mode mode); + +static inline +int tasklet_hrtimer_start(struct tasklet_hrtimer *ttimer, ktime_t time, + const enum hrtimer_mode mode) +{ + return hrtimer_start(&ttimer->timer, time, mode); +} + +static inline +void tasklet_hrtimer_cancel(struct tasklet_hrtimer *ttimer) +{ + hrtimer_cancel(&ttimer->timer); + tasklet_kill(&ttimer->tasklet); +} + /* * Autoprobing for irqs: * diff --git a/kernel/softirq.c b/kernel/softirq.c index 3a94905fa5d2..eb5e131a0485 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -345,7 +345,9 @@ void open_softirq(int nr, void (*action)(struct softirq_action *)) softirq_vec[nr].action = action; } -/* Tasklets */ +/* + * Tasklets + */ struct tasklet_head { struct tasklet_struct *head; @@ -493,6 +495,66 @@ void tasklet_kill(struct tasklet_struct *t) EXPORT_SYMBOL(tasklet_kill); +/* + * tasklet_hrtimer + */ + +/* + * The trampoline is called when the hrtimer expires. If this is + * called from the hrtimer interrupt then we schedule the tasklet as + * the timer callback function expects to run in softirq context. If + * it's called in softirq context anyway (i.e. high resolution timers + * disabled) then the hrtimer callback is called right away. + */ +static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) +{ + struct tasklet_hrtimer *ttimer = + container_of(timer, struct tasklet_hrtimer, timer); + + if (hrtimer_is_hres_active(timer)) { + tasklet_hi_schedule(&ttimer->tasklet); + return HRTIMER_NORESTART; + } + return ttimer->function(timer); +} + +/* + * Helper function which calls the hrtimer callback from + * tasklet/softirq context + */ +static void __tasklet_hrtimer_trampoline(unsigned long data) +{ + struct tasklet_hrtimer *ttimer = (void *)data; + enum hrtimer_restart restart; + + restart = ttimer->function(&ttimer->timer); + if (restart != HRTIMER_NORESTART) + hrtimer_restart(&ttimer->timer); +} + +/** + * tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks + * @ttimer: tasklet_hrtimer which is initialized + * @function: hrtimer callback funtion which gets called from softirq context + * @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME) + * @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL) + */ +void tasklet_hrtimer_init(struct tasklet_hrtimer *ttimer, + enum hrtimer_restart (*function)(struct hrtimer *), + clockid_t which_clock, enum hrtimer_mode mode) +{ + hrtimer_init(&ttimer->timer, which_clock, mode); + ttimer->timer.function = __hrtimer_tasklet_trampoline; + tasklet_init(&ttimer->tasklet, __tasklet_hrtimer_trampoline, + (unsigned long)ttimer); + ttimer->function = function; +} +EXPORT_SYMBOL_GPL(tasklet_hrtimer_init); + +/* + * Remote softirq bits + */ + DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list); EXPORT_PER_CPU_SYMBOL(softirq_work_list); -- cgit v1.2.3-59-g8ed1b From 7f453c24b95a085fc7bd35d53b33abc4dc5a048b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Jul 2009 13:19:40 +0200 Subject: perf_counter: PERF_SAMPLE_ID and inherited counters Anton noted that for inherited counters the counter-id as provided by PERF_SAMPLE_ID isn't mappable to the id found through PERF_RECORD_ID because each inherited counter gets its own id. His suggestion was to always return the parent counter id, since that is the primary counter id as exposed. However, these inherited counters have a unique identifier so that events like PERF_EVENT_PERIOD and PERF_EVENT_THROTTLE can be specific about which counter gets modified, which is important when trying to normalize the sample streams. This patch removes PERF_EVENT_PERIOD in favour of PERF_SAMPLE_PERIOD, which is more useful anyway, since changing periods became a lot more common than initially thought -- rendering PERF_EVENT_PERIOD the less useful solution (also, PERF_SAMPLE_PERIOD reports the more accurate value, since it reports the value used to trigger the overflow, whereas PERF_EVENT_PERIOD simply reports the requested period changed, which might only take effect on the next cycle). This still leaves us PERF_EVENT_THROTTLE to consider, but since that _should_ be a rare occurrence, and linking it to a primary id is the most useful bit to diagnose the problem, we introduce a PERF_SAMPLE_STREAM_ID, for those few cases where the full reconstruction is important. [Does change the ABI a little, but I see no other way out] Suggested-by: Anton Blanchard Signed-off-by: Peter Zijlstra LKML-Reference: <1248095846.15751.8781.camel@twins> --- include/linux/perf_counter.h | 15 ++----- kernel/perf_counter.c | 92 +++++++++++++++---------------------------- tools/perf/builtin-annotate.c | 24 ----------- tools/perf/builtin-report.c | 24 ----------- 4 files changed, 35 insertions(+), 120 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 5e970c7d3fd5..bd15d7a5f5ce 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -120,8 +120,9 @@ enum perf_counter_sample_format { PERF_SAMPLE_ID = 1U << 6, PERF_SAMPLE_CPU = 1U << 7, PERF_SAMPLE_PERIOD = 1U << 8, + PERF_SAMPLE_STREAM_ID = 1U << 9, - PERF_SAMPLE_MAX = 1U << 9, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 10, /* non-ABI */ }; /* @@ -312,16 +313,7 @@ enum perf_event_type { * struct perf_event_header header; * u64 time; * u64 id; - * u64 sample_period; - * }; - */ - PERF_EVENT_PERIOD = 4, - - /* - * struct { - * struct perf_event_header header; - * u64 time; - * u64 id; + * u64 stream_id; * }; */ PERF_EVENT_THROTTLE = 5, @@ -356,6 +348,7 @@ enum perf_event_type { * { u64 time; } && PERF_SAMPLE_TIME * { u64 addr; } && PERF_SAMPLE_ADDR * { u64 id; } && PERF_SAMPLE_ID + * { u64 stream_id;} && PERF_SAMPLE_STREAM_ID * { u32 cpu, res; } && PERF_SAMPLE_CPU * { u64 period; } && PERF_SAMPLE_PERIOD * diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index e1d6a3aa1333..7530588fa5c5 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -154,6 +154,20 @@ static void unclone_ctx(struct perf_counter_context *ctx) } } +/* + * If we inherit counters we want to return the parent counter id + * to userspace. + */ +static u64 primary_counter_id(struct perf_counter *counter) +{ + u64 id = counter->id; + + if (counter->parent) + id = counter->parent->id; + + return id; +} + /* * Get the perf_counter_context for a task and lock it. * This has to cope with with the fact that until it is locked, @@ -1296,7 +1310,6 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) #define MAX_INTERRUPTS (~0ULL) static void perf_log_throttle(struct perf_counter *counter, int enable); -static void perf_log_period(struct perf_counter *counter, u64 period); static void perf_adjust_period(struct perf_counter *counter, u64 events) { @@ -1315,8 +1328,6 @@ static void perf_adjust_period(struct perf_counter *counter, u64 events) if (!sample_period) sample_period = 1; - perf_log_period(counter, sample_period); - hwc->sample_period = sample_period; } @@ -1705,7 +1716,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) values[n++] = counter->total_time_running + atomic64_read(&counter->child_total_time_running); if (counter->attr.read_format & PERF_FORMAT_ID) - values[n++] = counter->id; + values[n++] = primary_counter_id(counter); mutex_unlock(&counter->child_mutex); if (count < n * sizeof(u64)) @@ -1812,8 +1823,6 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) counter->attr.sample_freq = value; } else { - perf_log_period(counter, value); - counter->attr.sample_period = value; counter->hw.sample_period = value; } @@ -2662,6 +2671,9 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, if (sample_type & PERF_SAMPLE_ID) header.size += sizeof(u64); + if (sample_type & PERF_SAMPLE_STREAM_ID) + header.size += sizeof(u64); + if (sample_type & PERF_SAMPLE_CPU) { header.size += sizeof(cpu_entry); @@ -2705,7 +2717,13 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, if (sample_type & PERF_SAMPLE_ADDR) perf_output_put(&handle, data->addr); - if (sample_type & PERF_SAMPLE_ID) + if (sample_type & PERF_SAMPLE_ID) { + u64 id = primary_counter_id(counter); + + perf_output_put(&handle, id); + } + + if (sample_type & PERF_SAMPLE_STREAM_ID) perf_output_put(&handle, counter->id); if (sample_type & PERF_SAMPLE_CPU) @@ -2728,7 +2746,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, if (sub != counter) sub->pmu->read(sub); - group_entry.id = sub->id; + group_entry.id = primary_counter_id(sub); group_entry.counter = atomic64_read(&sub->count); perf_output_put(&handle, group_entry); @@ -2788,15 +2806,8 @@ perf_counter_read_event(struct perf_counter *counter, } if (counter->attr.read_format & PERF_FORMAT_ID) { - u64 id; - event.header.size += sizeof(u64); - if (counter->parent) - id = counter->parent->id; - else - id = counter->id; - - event.format[i++] = id; + event.format[i++] = primary_counter_id(counter); } ret = perf_output_begin(&handle, counter, event.header.size, 0, 0); @@ -3190,49 +3201,6 @@ void __perf_counter_mmap(struct vm_area_struct *vma) perf_counter_mmap_event(&mmap_event); } -/* - * Log sample_period changes so that analyzing tools can re-normalize the - * event flow. - */ - -struct freq_event { - struct perf_event_header header; - u64 time; - u64 id; - u64 period; -}; - -static void perf_log_period(struct perf_counter *counter, u64 period) -{ - struct perf_output_handle handle; - struct freq_event event; - int ret; - - if (counter->hw.sample_period == period) - return; - - if (counter->attr.sample_type & PERF_SAMPLE_PERIOD) - return; - - event = (struct freq_event) { - .header = { - .type = PERF_EVENT_PERIOD, - .misc = 0, - .size = sizeof(event), - }, - .time = sched_clock(), - .id = counter->id, - .period = period, - }; - - ret = perf_output_begin(&handle, counter, sizeof(event), 1, 0); - if (ret) - return; - - perf_output_put(&handle, event); - perf_output_end(&handle); -} - /* * IRQ throttle logging */ @@ -3246,14 +3214,16 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) struct perf_event_header header; u64 time; u64 id; + u64 stream_id; } throttle_event = { .header = { .type = PERF_EVENT_THROTTLE + 1, .misc = 0, .size = sizeof(throttle_event), }, - .time = sched_clock(), - .id = counter->id, + .time = sched_clock(), + .id = primary_counter_id(counter), + .stream_id = counter->id, }; ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0); diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 5f9eefecc574..1dba568e1941 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -74,20 +74,12 @@ struct fork_event { u32 pid, ppid; }; -struct period_event { - struct perf_event_header header; - u64 time; - u64 id; - u64 sample_period; -}; - typedef union event_union { struct perf_event_header header; struct ip_event ip; struct mmap_event mmap; struct comm_event comm; struct fork_event fork; - struct period_event period; } event_t; @@ -997,19 +989,6 @@ process_fork_event(event_t *event, unsigned long offset, unsigned long head) return 0; } -static int -process_period_event(event_t *event, unsigned long offset, unsigned long head) -{ - dprintf("%p [%p]: PERF_EVENT_PERIOD: time:%Ld, id:%Ld: period:%Ld\n", - (void *)(offset + head), - (void *)(long)(event->header.size), - event->period.time, - event->period.id, - event->period.sample_period); - - return 0; -} - static int process_event(event_t *event, unsigned long offset, unsigned long head) { @@ -1025,9 +1004,6 @@ process_event(event_t *event, unsigned long offset, unsigned long head) case PERF_EVENT_FORK: return process_fork_event(event, offset, head); - - case PERF_EVENT_PERIOD: - return process_period_event(event, offset, head); /* * We dont process them right now but they are fine: */ diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index a118bc77286d..b20a4b6e31b7 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -101,13 +101,6 @@ struct fork_event { u32 pid, ppid; }; -struct period_event { - struct perf_event_header header; - u64 time; - u64 id; - u64 sample_period; -}; - struct lost_event { struct perf_event_header header; u64 id; @@ -127,7 +120,6 @@ typedef union event_union { struct mmap_event mmap; struct comm_event comm; struct fork_event fork; - struct period_event period; struct lost_event lost; struct read_event read; } event_t; @@ -1635,19 +1627,6 @@ process_fork_event(event_t *event, unsigned long offset, unsigned long head) return 0; } -static int -process_period_event(event_t *event, unsigned long offset, unsigned long head) -{ - dprintf("%p [%p]: PERF_EVENT_PERIOD: time:%Ld, id:%Ld: period:%Ld\n", - (void *)(offset + head), - (void *)(long)(event->header.size), - event->period.time, - event->period.id, - event->period.sample_period); - - return 0; -} - static int process_lost_event(event_t *event, unsigned long offset, unsigned long head) { @@ -1729,9 +1708,6 @@ process_event(event_t *event, unsigned long offset, unsigned long head) case PERF_EVENT_FORK: return process_fork_event(event, offset, head); - case PERF_EVENT_PERIOD: - return process_period_event(event, offset, head); - case PERF_EVENT_LOST: return process_lost_event(event, offset, head); -- cgit v1.2.3-59-g8ed1b From 24c30dbbcdda9aeccb23b4eecb6bb8e538742ea4 Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Thu, 16 Jul 2009 21:31:31 +0000 Subject: of/mdio: Add support function for Ethernet fixed-link property Fixed-link support is broken for the ucc_eth, gianfar, and fs_enet device drivers. The "OF MDIO rework" patches removed most of the support. Instead of re-adding fixed-link stuff to the drivers, this patch adds a support function for parsing the fixed-link property and obtaining a dummy phy to match. Note: the dummy phy handling in arch/powerpc is a bit of a hack and needs to be reworked. This function is being added now to solve the regression in the Ethernet drivers, but it should be considered a temporary measure until the fixed link handling can be reworked. Signed-off-by: Anton Vorontsov Signed-off-by: Grant Likely Signed-off-by: David S. Miller --- drivers/of/of_mdio.c | 42 ++++++++++++++++++++++++++++++++++++++++++ include/linux/of_mdio.h | 3 +++ 2 files changed, 45 insertions(+) (limited to 'include') diff --git a/drivers/of/of_mdio.c b/drivers/of/of_mdio.c index aee967d7f760..bacaa536fd51 100644 --- a/drivers/of/of_mdio.c +++ b/drivers/of/of_mdio.c @@ -9,6 +9,10 @@ * out of the OpenFirmware device tree and using it to populate an mii_bus. */ +#include +#include +#include +#include #include #include #include @@ -137,3 +141,41 @@ struct phy_device *of_phy_connect(struct net_device *dev, return phy_connect_direct(dev, phy, hndlr, flags, iface) ? NULL : phy; } EXPORT_SYMBOL(of_phy_connect); + +/** + * of_phy_connect_fixed_link - Parse fixed-link property and return a dummy phy + * @dev: pointer to net_device claiming the phy + * @hndlr: Link state callback for the network device + * @iface: PHY data interface type + * + * This function is a temporary stop-gap and will be removed soon. It is + * only to support the fs_enet, ucc_geth and gianfar Ethernet drivers. Do + * not call this function from new drivers. + */ +struct phy_device *of_phy_connect_fixed_link(struct net_device *dev, + void (*hndlr)(struct net_device *), + phy_interface_t iface) +{ + struct device_node *net_np; + char bus_id[MII_BUS_ID_SIZE + 3]; + struct phy_device *phy; + const u32 *phy_id; + int sz; + + if (!dev->dev.parent) + return NULL; + + net_np = dev_archdata_get_node(&dev->dev.parent->archdata); + if (!net_np) + return NULL; + + phy_id = of_get_property(net_np, "fixed-link", &sz); + if (!phy_id || sz < sizeof(*phy_id)) + return NULL; + + sprintf(bus_id, PHY_ID_FMT, "0", phy_id[0]); + + phy = phy_connect(dev, bus_id, hndlr, 0, iface); + return IS_ERR(phy) ? NULL : phy; +} +EXPORT_SYMBOL(of_phy_connect_fixed_link); diff --git a/include/linux/of_mdio.h b/include/linux/of_mdio.h index c9663c690303..53b94e025c7c 100644 --- a/include/linux/of_mdio.h +++ b/include/linux/of_mdio.h @@ -18,5 +18,8 @@ extern struct phy_device *of_phy_connect(struct net_device *dev, struct device_node *phy_np, void (*hndlr)(struct net_device *), u32 flags, phy_interface_t iface); +extern struct phy_device *of_phy_connect_fixed_link(struct net_device *dev, + void (*hndlr)(struct net_device *), + phy_interface_t iface); #endif /* __LINUX_OF_MDIO_H */ -- cgit v1.2.3-59-g8ed1b From 5dea271b6d87bd1d79a59c1d5baac2596a841c37 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 23 Jul 2009 20:30:42 +0100 Subject: dm table: pass correct dev area size to device_area_is_valid Incorrect device area lengths are being passed to device_area_is_valid(). The regression appeared in 2.6.31-rc1 through commit 754c5fc7ebb417b23601a6222a6005cc2e7f2913. With the dm-stripe target, the size of the target (ti->len) was used instead of the stripe_width (ti->len/#stripes). An example of a consequent incorrect error message is: device-mapper: table: 254:0: sdb too small for target Signed-off-by: Mike Snitzer Signed-off-by: Alasdair G Kergon --- drivers/md/dm-crypt.c | 2 +- drivers/md/dm-delay.c | 4 ++-- drivers/md/dm-linear.c | 2 +- drivers/md/dm-mpath.c | 2 +- drivers/md/dm-raid1.c | 2 +- drivers/md/dm-stripe.c | 7 ++++--- drivers/md/dm-table.c | 10 +++++----- include/linux/device-mapper.h | 4 ++-- 8 files changed, 17 insertions(+), 16 deletions(-) (limited to 'include') diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 529e2ba505c3..ed1038164019 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1318,7 +1318,7 @@ static int crypt_iterate_devices(struct dm_target *ti, { struct crypt_config *cc = ti->private; - return fn(ti, cc->dev, cc->start, data); + return fn(ti, cc->dev, cc->start, ti->len, data); } static struct target_type crypt_target = { diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 4e5b843cd4d7..ebe7381f47c8 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -324,12 +324,12 @@ static int delay_iterate_devices(struct dm_target *ti, struct delay_c *dc = ti->private; int ret = 0; - ret = fn(ti, dc->dev_read, dc->start_read, data); + ret = fn(ti, dc->dev_read, dc->start_read, ti->len, data); if (ret) goto out; if (dc->dev_write) - ret = fn(ti, dc->dev_write, dc->start_write, data); + ret = fn(ti, dc->dev_write, dc->start_write, ti->len, data); out: return ret; diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 9184b6deb868..82f7d6e6b1ea 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c @@ -139,7 +139,7 @@ static int linear_iterate_devices(struct dm_target *ti, { struct linear_c *lc = ti->private; - return fn(ti, lc->dev, lc->start, data); + return fn(ti, lc->dev, lc->start, ti->len, data); } static struct target_type linear_target = { diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index c70604a20897..6f0d90d4a541 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c @@ -1453,7 +1453,7 @@ static int multipath_iterate_devices(struct dm_target *ti, list_for_each_entry(pg, &m->priority_groups, list) { list_for_each_entry(p, &pg->pgpaths, list) { - ret = fn(ti, p->path.dev, ti->begin, data); + ret = fn(ti, p->path.dev, ti->begin, ti->len, data); if (ret) goto out; } diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 42a3e4341d60..9726577cde49 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -1293,7 +1293,7 @@ static int mirror_iterate_devices(struct dm_target *ti, for (i = 0; !ret && i < ms->nr_mirrors; i++) ret = fn(ti, ms->mirror[i].dev, - ms->mirror[i].offset, data); + ms->mirror[i].offset, ti->len, data); return ret; } diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index b240e85ae39a..4e0e5937e42a 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -320,10 +320,11 @@ static int stripe_iterate_devices(struct dm_target *ti, int ret = 0; unsigned i = 0; - do + do { ret = fn(ti, sc->stripe[i].dev, - sc->stripe[i].physical_start, data); - while (!ret && ++i < sc->stripes); + sc->stripe[i].physical_start, + sc->stripe_width, data); + } while (!ret && ++i < sc->stripes); return ret; } diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 5d16d06613aa..d952b3441913 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -346,7 +346,7 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md) * If possible, this checks an area of a destination device is valid. */ static int device_area_is_valid(struct dm_target *ti, struct dm_dev *dev, - sector_t start, void *data) + sector_t start, sector_t len, void *data) { struct queue_limits *limits = data; struct block_device *bdev = dev->bdev; @@ -359,7 +359,7 @@ static int device_area_is_valid(struct dm_target *ti, struct dm_dev *dev, if (!dev_size) return 1; - if ((start >= dev_size) || (start + ti->len > dev_size)) { + if ((start >= dev_size) || (start + len > dev_size)) { DMWARN("%s: %s too small for target", dm_device_name(ti->table->md), bdevname(bdev, b)); return 0; @@ -377,11 +377,11 @@ static int device_area_is_valid(struct dm_target *ti, struct dm_dev *dev, return 0; } - if (ti->len & (logical_block_size_sectors - 1)) { + if (len & (logical_block_size_sectors - 1)) { DMWARN("%s: len=%llu not aligned to h/w " "logical block size %hu of %s", dm_device_name(ti->table->md), - (unsigned long long)ti->len, + (unsigned long long)len, limits->logical_block_size, bdevname(bdev, b)); return 0; } @@ -482,7 +482,7 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti, #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, - sector_t start, void *data) + sector_t start, sector_t len, void *data) { struct queue_limits *limits = data; struct block_device *bdev = dev->bdev; diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index 0d6310657f32..655e7721580a 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -84,7 +84,7 @@ typedef int (*dm_merge_fn) (struct dm_target *ti, struct bvec_merge_data *bvm, typedef int (*iterate_devices_callout_fn) (struct dm_target *ti, struct dm_dev *dev, - sector_t physical_start, + sector_t start, sector_t len, void *data); typedef int (*dm_iterate_devices_fn) (struct dm_target *ti, @@ -104,7 +104,7 @@ void dm_error(const char *message); * Combine device limits. */ int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev, - sector_t start, void *data); + sector_t start, sector_t len, void *data); struct dm_dev { struct block_device *bdev; -- cgit v1.2.3-59-g8ed1b From 26e744b6b61066203fd57de0d3962353621e06f8 Mon Sep 17 00:00:00 2001 From: Brian Johnson Date: Sun, 19 Jul 2009 05:52:58 -0300 Subject: V4L/DVB (12283): gspca - sn9c20x: New subdriver for sn9c201 and sn9c202 bridges. Signed-off-by: Brian Johnson Signed-off-by: Jean-Francois Moine Signed-off-by: Mauro Carvalho Chehab --- Documentation/video4linux/gspca.txt | 32 + drivers/media/video/gspca/Kconfig | 16 + drivers/media/video/gspca/Makefile | 2 + drivers/media/video/gspca/sn9c20x.c | 2433 +++++++++++++++++++++++++++++++++++ include/linux/videodev2.h | 1 + include/media/v4l2-chip-ident.h | 12 + 6 files changed, 2496 insertions(+) create mode 100644 drivers/media/video/gspca/sn9c20x.c (limited to 'include') diff --git a/Documentation/video4linux/gspca.txt b/Documentation/video4linux/gspca.txt index 2bcf78896e22..573f95b58807 100644 --- a/Documentation/video4linux/gspca.txt +++ b/Documentation/video4linux/gspca.txt @@ -44,7 +44,9 @@ zc3xx 0458:7007 Genius VideoCam V2 zc3xx 0458:700c Genius VideoCam V3 zc3xx 0458:700f Genius VideoCam Web V2 sonixj 0458:7025 Genius Eye 311Q +sn9c20x 0458:7029 Genius Look 320s sonixj 0458:702e Genius Slim 310 NB +sn9c20x 045e:00f4 LifeCam VX-6000 (SN9C20x + OV9650) sonixj 045e:00f5 MicroSoft VX3000 sonixj 045e:00f7 MicroSoft VX1000 ov519 045e:028c Micro$oft xbox cam @@ -282,6 +284,28 @@ sonixj 0c45:613a Microdia Sonix PC Camera sonixj 0c45:613b Surfer SN-206 sonixj 0c45:613c Sonix Pccam168 sonixj 0c45:6143 Sonix Pccam168 +sn9c20x 0c45:6240 PC Camera (SN9C201 + MT9M001) +sn9c20x 0c45:6242 PC Camera (SN9C201 + MT9M111) +sn9c20x 0c45:6248 PC Camera (SN9C201 + OV9655) +sn9c20x 0c45:624e PC Camera (SN9C201 + SOI968) +sn9c20x 0c45:624f PC Camera (SN9C201 + OV9650) +sn9c20x 0c45:6251 PC Camera (SN9C201 + OV9650) +sn9c20x 0c45:6253 PC Camera (SN9C201 + OV9650) +sn9c20x 0c45:6260 PC Camera (SN9C201 + OV7670) +sn9c20x 0c45:6270 PC Camera (SN9C201 + MT9V011/MT9V111/MT9V112) +sn9c20x 0c45:627b PC Camera (SN9C201 + OV7660) +sn9c20x 0c45:627c PC Camera (SN9C201 + HV7131R) +sn9c20x 0c45:627f PC Camera (SN9C201 + OV9650) +sn9c20x 0c45:6280 PC Camera (SN9C202 + MT9M001) +sn9c20x 0c45:6282 PC Camera (SN9C202 + MT9M111) +sn9c20x 0c45:6288 PC Camera (SN9C202 + OV9655) +sn9c20x 0c45:628e PC Camera (SN9C202 + SOI968) +sn9c20x 0c45:628f PC Camera (SN9C202 + OV9650) +sn9c20x 0c45:62a0 PC Camera (SN9C202 + OV7670) +sn9c20x 0c45:62b0 PC Camera (SN9C202 + MT9V011/MT9V111/MT9V112) +sn9c20x 0c45:62b3 PC Camera (SN9C202 + OV9655) +sn9c20x 0c45:62bb PC Camera (SN9C202 + OV7660) +sn9c20x 0c45:62bc PC Camera (SN9C202 + HV7131R) sunplus 0d64:0303 Sunplus FashionCam DXG etoms 102c:6151 Qcam Sangha CIF etoms 102c:6251 Qcam xxxxxx VGA @@ -290,6 +314,7 @@ spca561 10fd:7e50 FlyCam Usb 100 zc3xx 10fd:8050 Typhoon Webshot II USB 300k ov534 1415:2000 Sony HD Eye for PS3 (SLEH 00201) pac207 145f:013a Trust WB-1300N +sn9c20x 145f:013d Trust WB-3600R vc032x 15b8:6001 HP 2.0 Megapixel vc032x 15b8:6002 HP 2.0 Megapixel rz406aa spca501 1776:501c Arowana 300K CMOS Camera @@ -300,4 +325,11 @@ spca500 2899:012c Toptro Industrial spca508 8086:0110 Intel Easy PC Camera spca500 8086:0630 Intel Pocket PC Camera spca506 99fa:8988 Grandtec V.cap +sn9c20x a168:0610 Dino-Lite Digital Microscope (SN9C201 + HV7131R) +sn9c20x a168:0611 Dino-Lite Digital Microscope (SN9C201 + HV7131R) +sn9c20x a168:0613 Dino-Lite Digital Microscope (SN9C201 + HV7131R) +sn9c20x a168:0618 Dino-Lite Digital Microscope (SN9C201 + HV7131R) +sn9c20x a168:0614 Dino-Lite Digital Microscope (SN9C201 + MT9M111) +sn9c20x a168:0615 Dino-Lite Digital Microscope (SN9C201 + MT9M111) +sn9c20x a168:0617 Dino-Lite Digital Microscope (SN9C201 + MT9M111) spca561 abcd:cdee Petcam diff --git a/drivers/media/video/gspca/Kconfig b/drivers/media/video/gspca/Kconfig index 578dc4ffc965..34f46f2bc040 100644 --- a/drivers/media/video/gspca/Kconfig +++ b/drivers/media/video/gspca/Kconfig @@ -102,6 +102,22 @@ config USB_GSPCA_PAC7311 To compile this driver as a module, choose M here: the module will be called gspca_pac7311. +config USB_GSPCA_SN9C20X + tristate "SN9C20X USB Camera Driver" + depends on VIDEO_V4L2 && USB_GSPCA + help + Say Y here if you want support for cameras based on the + sn9c20x chips (SN9C201 and SN9C202). + + To compile this driver as a module, choose M here: the + module will be called gspca_sn9c20x. + +config USB_GSPCA_SN9C20X_EVDEV + bool "Enable evdev support" + depends on USB_GSPCA_SN9C20X + ---help--- + Say Y here in order to enable evdev support for sn9c20x webcam button. + config USB_GSPCA_SONIXB tristate "SONIX Bayer USB Camera Driver" depends on VIDEO_V4L2 && USB_GSPCA diff --git a/drivers/media/video/gspca/Makefile b/drivers/media/video/gspca/Makefile index 8a6643e8eb96..f6d3b86e9ad5 100644 --- a/drivers/media/video/gspca/Makefile +++ b/drivers/media/video/gspca/Makefile @@ -8,6 +8,7 @@ obj-$(CONFIG_USB_GSPCA_OV519) += gspca_ov519.o obj-$(CONFIG_USB_GSPCA_OV534) += gspca_ov534.o obj-$(CONFIG_USB_GSPCA_PAC207) += gspca_pac207.o obj-$(CONFIG_USB_GSPCA_PAC7311) += gspca_pac7311.o +obj-$(CONFIG_USB_GSPCA_SN9C20X) += gspca_sn9c20x.o obj-$(CONFIG_USB_GSPCA_SONIXB) += gspca_sonixb.o obj-$(CONFIG_USB_GSPCA_SONIXJ) += gspca_sonixj.o obj-$(CONFIG_USB_GSPCA_SPCA500) += gspca_spca500.o @@ -35,6 +36,7 @@ gspca_ov519-objs := ov519.o gspca_ov534-objs := ov534.o gspca_pac207-objs := pac207.o gspca_pac7311-objs := pac7311.o +gspca_sn9c20x-objs := sn9c20x.o gspca_sonixb-objs := sonixb.o gspca_sonixj-objs := sonixj.o gspca_spca500-objs := spca500.o diff --git a/drivers/media/video/gspca/sn9c20x.c b/drivers/media/video/gspca/sn9c20x.c new file mode 100644 index 000000000000..78ab26ceb90e --- /dev/null +++ b/drivers/media/video/gspca/sn9c20x.c @@ -0,0 +1,2433 @@ +/* + * Sonix sn9c201 sn9c202 library + * Copyright (C) 2008-2009 microdia project + * Copyright (C) 2009 Brian Johnson + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "gspca.h" +#include "jpeg.h" + +#include +#ifdef CONFIG_USB_GSPCA_SN9C20X_EVDEV +#include +#include +#include +#include +#endif + +MODULE_AUTHOR("Brian Johnson , " + "microdia project "); +MODULE_DESCRIPTION("GSPCA/SN9C20X USB Camera Driver"); +MODULE_LICENSE("GPL"); + +#define MODULE_NAME "sn9c20x" + +#define MODE_RAW 0x10 +#define MODE_JPEG 0x20 +#define MODE_SXGA 0x80 + +#define SENSOR_OV9650 0 +#define SENSOR_OV9655 1 +#define SENSOR_SOI968 2 +#define SENSOR_OV7660 3 +#define SENSOR_OV7670 4 +#define SENSOR_MT9V011 5 +#define SENSOR_MT9V111 6 +#define SENSOR_MT9V112 7 +#define SENSOR_MT9M001 8 +#define SENSOR_MT9M111 9 +#define SENSOR_HV7131R 10 +#define SENSOR_MT9VPRB 20 + +/* specific webcam descriptor */ +struct sd { + struct gspca_dev gspca_dev; + +#define MIN_AVG_LUM 80 +#define MAX_AVG_LUM 130 + atomic_t avg_lum; + u8 old_step; + u8 older_step; + u8 exposure_step; + + u8 brightness; + u8 contrast; + u8 saturation; + s16 hue; + u8 gamma; + u8 red; + u8 blue; + + u8 hflip; + u8 vflip; + u8 gain; + u16 exposure; + u8 auto_exposure; + + u8 i2c_addr; + u8 sensor; + u8 hstart; + u8 vstart; + + u8 *jpeg_hdr; + u8 quality; + +#ifdef CONFIG_USB_GSPCA_SN9C20X_EVDEV + struct input_dev *input_dev; + u8 input_gpio; + struct task_struct *input_task; +#endif +}; + +static int sd_setbrightness(struct gspca_dev *gspca_dev, s32 val); +static int sd_getbrightness(struct gspca_dev *gspca_dev, s32 *val); +static int sd_setcontrast(struct gspca_dev *gspca_dev, s32 val); +static int sd_getcontrast(struct gspca_dev *gspca_dev, s32 *val); +static int sd_setsaturation(struct gspca_dev *gspca_dev, s32 val); +static int sd_getsaturation(struct gspca_dev *gspca_dev, s32 *val); +static int sd_sethue(struct gspca_dev *gspca_dev, s32 val); +static int sd_gethue(struct gspca_dev *gspca_dev, s32 *val); +static int sd_setgamma(struct gspca_dev *gspca_dev, s32 val); +static int sd_getgamma(struct gspca_dev *gspca_dev, s32 *val); +static int sd_setredbalance(struct gspca_dev *gspca_dev, s32 val); +static int sd_getredbalance(struct gspca_dev *gspca_dev, s32 *val); +static int sd_setbluebalance(struct gspca_dev *gspca_dev, s32 val); +static int sd_getbluebalance(struct gspca_dev *gspca_dev, s32 *val); +static int sd_setvflip(struct gspca_dev *gspca_dev, s32 val); +static int sd_getvflip(struct gspca_dev *gspca_dev, s32 *val); +static int sd_sethflip(struct gspca_dev *gspca_dev, s32 val); +static int sd_gethflip(struct gspca_dev *gspca_dev, s32 *val); +static int sd_setgain(struct gspca_dev *gspca_dev, s32 val); +static int sd_getgain(struct gspca_dev *gspca_dev, s32 *val); +static int sd_setexposure(struct gspca_dev *gspca_dev, s32 val); +static int sd_getexposure(struct gspca_dev *gspca_dev, s32 *val); +static int sd_setautoexposure(struct gspca_dev *gspca_dev, s32 val); +static int sd_getautoexposure(struct gspca_dev *gspca_dev, s32 *val); + +static struct ctrl sd_ctrls[] = { + { +#define BRIGHTNESS_IDX 0 + { + .id = V4L2_CID_BRIGHTNESS, + .type = V4L2_CTRL_TYPE_INTEGER, + .name = "Brightness", + .minimum = 0, + .maximum = 0xff, + .step = 1, +#define BRIGHTNESS_DEFAULT 0x7f + .default_value = BRIGHTNESS_DEFAULT, + }, + .set = sd_setbrightness, + .get = sd_getbrightness, + }, + { +#define CONTRAST_IDX 1 + { + .id = V4L2_CID_CONTRAST, + .type = V4L2_CTRL_TYPE_INTEGER, + .name = "Contrast", + .minimum = 0, + .maximum = 0xff, + .step = 1, +#define CONTRAST_DEFAULT 0x7f + .default_value = CONTRAST_DEFAULT, + }, + .set = sd_setcontrast, + .get = sd_getcontrast, + }, + { +#define SATURATION_IDX 2 + { + .id = V4L2_CID_SATURATION, + .type = V4L2_CTRL_TYPE_INTEGER, + .name = "Saturation", + .minimum = 0, + .maximum = 0xff, + .step = 1, +#define SATURATION_DEFAULT 0x7f + .default_value = SATURATION_DEFAULT, + }, + .set = sd_setsaturation, + .get = sd_getsaturation, + }, + { +#define HUE_IDX 3 + { + .id = V4L2_CID_HUE, + .type = V4L2_CTRL_TYPE_INTEGER, + .name = "Hue", + .minimum = -180, + .maximum = 180, + .step = 1, +#define HUE_DEFAULT 0 + .default_value = HUE_DEFAULT, + }, + .set = sd_sethue, + .get = sd_gethue, + }, + { +#define GAMMA_IDX 4 + { + .id = V4L2_CID_GAMMA, + .type = V4L2_CTRL_TYPE_INTEGER, + .name = "Gamma", + .minimum = 0, + .maximum = 0xff, + .step = 1, +#define GAMMA_DEFAULT 0x10 + .default_value = GAMMA_DEFAULT, + }, + .set = sd_setgamma, + .get = sd_getgamma, + }, + { +#define BLUE_IDX 5 + { + .id = V4L2_CID_BLUE_BALANCE, + .type = V4L2_CTRL_TYPE_INTEGER, + .name = "Blue Balance", + .minimum = 0, + .maximum = 0x7f, + .step = 1, +#define BLUE_DEFAULT 0x28 + .default_value = BLUE_DEFAULT, + }, + .set = sd_setbluebalance, + .get = sd_getbluebalance, + }, + { +#define RED_IDX 6 + { + .id = V4L2_CID_RED_BALANCE, + .type = V4L2_CTRL_TYPE_INTEGER, + .name = "Red Balance", + .minimum = 0, + .maximum = 0x7f, + .step = 1, +#define RED_DEFAULT 0x28 + .default_value = RED_DEFAULT, + }, + .set = sd_setredbalance, + .get = sd_getredbalance, + }, + { +#define HFLIP_IDX 7 + { + .id = V4L2_CID_HFLIP, + .type = V4L2_CTRL_TYPE_BOOLEAN, + .name = "Horizontal Flip", + .minimum = 0, + .maximum = 1, + .step = 1, +#define HFLIP_DEFAULT 0 + .default_value = HFLIP_DEFAULT, + }, + .set = sd_sethflip, + .get = sd_gethflip, + }, + { +#define VFLIP_IDX 8 + { + .id = V4L2_CID_VFLIP, + .type = V4L2_CTRL_TYPE_BOOLEAN, + .name = "Vertical Flip", + .minimum = 0, + .maximum = 1, + .step = 1, +#define VFLIP_DEFAULT 0 + .default_value = VFLIP_DEFAULT, + }, + .set = sd_setvflip, + .get = sd_getvflip, + }, + { +#define EXPOSURE_IDX 9 + { + .id = V4L2_CID_EXPOSURE, + .type = V4L2_CTRL_TYPE_INTEGER, + .name = "Exposure", + .minimum = 0, + .maximum = 0x1780, + .step = 1, +#define EXPOSURE_DEFAULT 0x33 + .default_value = EXPOSURE_DEFAULT, + }, + .set = sd_setexposure, + .get = sd_getexposure, + }, + { +#define GAIN_IDX 10 + { + .id = V4L2_CID_GAIN, + .type = V4L2_CTRL_TYPE_INTEGER, + .name = "Gain", + .minimum = 0, + .maximum = 28, + .step = 1, +#define GAIN_DEFAULT 0x00 + .default_value = GAIN_DEFAULT, + }, + .set = sd_setgain, + .get = sd_getgain, + }, + { +#define AUTOGAIN_IDX 11 + { + .id = V4L2_CID_AUTOGAIN, + .type = V4L2_CTRL_TYPE_BOOLEAN, + .name = "Auto Exposure", + .minimum = 0, + .maximum = 1, + .step = 1, +#define AUTO_EXPOSURE_DEFAULT 1 + .default_value = AUTO_EXPOSURE_DEFAULT, + }, + .set = sd_setautoexposure, + .get = sd_getautoexposure, + }, +}; + +static const struct v4l2_pix_format vga_mode[] = { + {160, 120, V4L2_PIX_FMT_JPEG, V4L2_FIELD_NONE, + .bytesperline = 240, + .sizeimage = 240 * 120, + .colorspace = V4L2_COLORSPACE_JPEG, + .priv = 0 | MODE_JPEG}, + {160, 120, V4L2_PIX_FMT_SBGGR8, V4L2_FIELD_NONE, + .bytesperline = 160, + .sizeimage = 160 * 120, + .colorspace = V4L2_COLORSPACE_SRGB, + .priv = 0 | MODE_RAW}, + {160, 120, V4L2_PIX_FMT_SN9C20X_I420, V4L2_FIELD_NONE, + .bytesperline = 240, + .sizeimage = 240 * 120, + .colorspace = V4L2_COLORSPACE_SRGB, + .priv = 0}, + {320, 240, V4L2_PIX_FMT_JPEG, V4L2_FIELD_NONE, + .bytesperline = 480, + .sizeimage = 480 * 240 , + .colorspace = V4L2_COLORSPACE_JPEG, + .priv = 1 | MODE_JPEG}, + {320, 240, V4L2_PIX_FMT_SBGGR8, V4L2_FIELD_NONE, + .bytesperline = 320, + .sizeimage = 320 * 240 , + .colorspace = V4L2_COLORSPACE_SRGB, + .priv = 1 | MODE_RAW}, + {320, 240, V4L2_PIX_FMT_SN9C20X_I420, V4L2_FIELD_NONE, + .bytesperline = 480, + .sizeimage = 480 * 240 , + .colorspace = V4L2_COLORSPACE_SRGB, + .priv = 1}, + {640, 480, V4L2_PIX_FMT_JPEG, V4L2_FIELD_NONE, + .bytesperline = 960, + .sizeimage = 960 * 480, + .colorspace = V4L2_COLORSPACE_JPEG, + .priv = 2 | MODE_JPEG}, + {640, 480, V4L2_PIX_FMT_SBGGR8, V4L2_FIELD_NONE, + .bytesperline = 640, + .sizeimage = 640 * 480, + .colorspace = V4L2_COLORSPACE_SRGB, + .priv = 2 | MODE_RAW}, + {640, 480, V4L2_PIX_FMT_SN9C20X_I420, V4L2_FIELD_NONE, + .bytesperline = 960, + .sizeimage = 960 * 480, + .colorspace = V4L2_COLORSPACE_SRGB, + .priv = 2}, +}; + +static const struct v4l2_pix_format sxga_mode[] = { + {160, 120, V4L2_PIX_FMT_JPEG, V4L2_FIELD_NONE, + .bytesperline = 240, + .sizeimage = 240 * 120, + .colorspace = V4L2_COLORSPACE_JPEG, + .priv = 0 | MODE_JPEG}, + {160, 120, V4L2_PIX_FMT_SBGGR8, V4L2_FIELD_NONE, + .bytesperline = 160, + .sizeimage = 160 * 120, + .colorspace = V4L2_COLORSPACE_SRGB, + .priv = 0 | MODE_RAW}, + {160, 120, V4L2_PIX_FMT_SN9C20X_I420, V4L2_FIELD_NONE, + .bytesperline = 240, + .sizeimage = 240 * 120, + .colorspace = V4L2_COLORSPACE_SRGB, + .priv = 0}, + {320, 240, V4L2_PIX_FMT_JPEG, V4L2_FIELD_NONE, + .bytesperline = 480, + .sizeimage = 480 * 240 , + .colorspace = V4L2_COLORSPACE_JPEG, + .priv = 1 | MODE_JPEG}, + {320, 240, V4L2_PIX_FMT_SBGGR8, V4L2_FIELD_NONE, + .bytesperline = 320, + .sizeimage = 320 * 240 , + .colorspace = V4L2_COLORSPACE_SRGB, + .priv = 1 | MODE_RAW}, + {320, 240, V4L2_PIX_FMT_SN9C20X_I420, V4L2_FIELD_NONE, + .bytesperline = 480, + .sizeimage = 480 * 240 , + .colorspace = V4L2_COLORSPACE_SRGB, + .priv = 1}, + {640, 480, V4L2_PIX_FMT_JPEG, V4L2_FIELD_NONE, + .bytesperline = 960, + .sizeimage = 960 * 480, + .colorspace = V4L2_COLORSPACE_JPEG, + .priv = 2 | MODE_JPEG}, + {640, 480, V4L2_PIX_FMT_SBGGR8, V4L2_FIELD_NONE, + .bytesperline = 640, + .sizeimage = 640 * 480, + .colorspace = V4L2_COLORSPACE_SRGB, + .priv = 2 | MODE_RAW}, + {640, 480, V4L2_PIX_FMT_SN9C20X_I420, V4L2_FIELD_NONE, + .bytesperline = 960, + .sizeimage = 960 * 480, + .colorspace = V4L2_COLORSPACE_SRGB, + .priv = 2}, + {1280, 1024, V4L2_PIX_FMT_SBGGR8, V4L2_FIELD_NONE, + .bytesperline = 1280, + .sizeimage = (1280 * 1024) + 64, + .colorspace = V4L2_COLORSPACE_SRGB, + .priv = 3 | MODE_RAW | MODE_SXGA}, +}; + +static const int hsv_red_x[] = { + 41, 44, 46, 48, 50, 52, 54, 56, + 58, 60, 62, 64, 66, 68, 70, 72, + 74, 76, 78, 80, 81, 83, 85, 87, + 88, 90, 92, 93, 95, 97, 98, 100, + 101, 102, 104, 105, 107, 108, 109, 110, + 112, 113, 114, 115, 116, 117, 118, 119, + 120, 121, 122, 123, 123, 124, 125, 125, + 126, 127, 127, 128, 128, 129, 129, 129, + 130, 130, 130, 130, 131, 131, 131, 131, + 131, 131, 131, 131, 130, 130, 130, 130, + 129, 129, 129, 128, 128, 127, 127, 126, + 125, 125, 124, 123, 122, 122, 121, 120, + 119, 118, 117, 116, 115, 114, 112, 111, + 110, 109, 107, 106, 105, 103, 102, 101, + 99, 98, 96, 94, 93, 91, 90, 88, + 86, 84, 83, 81, 79, 77, 75, 74, + 72, 70, 68, 66, 64, 62, 60, 58, + 56, 54, 52, 49, 47, 45, 43, 41, + 39, 36, 34, 32, 30, 28, 25, 23, + 21, 19, 16, 14, 12, 9, 7, 5, + 3, 0, -1, -3, -6, -8, -10, -12, + -15, -17, -19, -22, -24, -26, -28, -30, + -33, -35, -37, -39, -41, -44, -46, -48, + -50, -52, -54, -56, -58, -60, -62, -64, + -66, -68, -70, -72, -74, -76, -78, -80, + -81, -83, -85, -87, -88, -90, -92, -93, + -95, -97, -98, -100, -101, -102, -104, -105, + -107, -108, -109, -110, -112, -113, -114, -115, + -116, -117, -118, -119, -120, -121, -122, -123, + -123, -124, -125, -125, -126, -127, -127, -128, + -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, + -128, -127, -127, -126, -125, -125, -124, -123, + -122, -122, -121, -120, -119, -118, -117, -116, + -115, -114, -112, -111, -110, -109, -107, -106, + -105, -103, -102, -101, -99, -98, -96, -94, + -93, -91, -90, -88, -86, -84, -83, -81, + -79, -77, -75, -74, -72, -70, -68, -66, + -64, -62, -60, -58, -56, -54, -52, -49, + -47, -45, -43, -41, -39, -36, -34, -32, + -30, -28, -25, -23, -21, -19, -16, -14, + -12, -9, -7, -5, -3, 0, 1, 3, + 6, 8, 10, 12, 15, 17, 19, 22, + 24, 26, 28, 30, 33, 35, 37, 39, 41 +}; + +static const int hsv_red_y[] = { + 82, 80, 78, 76, 74, 73, 71, 69, + 67, 65, 63, 61, 58, 56, 54, 52, + 50, 48, 46, 44, 41, 39, 37, 35, + 32, 30, 28, 26, 23, 21, 19, 16, + 14, 12, 10, 7, 5, 3, 0, -1, + -3, -6, -8, -10, -13, -15, -17, -19, + -22, -24, -26, -29, -31, -33, -35, -38, + -40, -42, -44, -46, -48, -51, -53, -55, + -57, -59, -61, -63, -65, -67, -69, -71, + -73, -75, -77, -79, -81, -82, -84, -86, + -88, -89, -91, -93, -94, -96, -98, -99, + -101, -102, -104, -105, -106, -108, -109, -110, + -112, -113, -114, -115, -116, -117, -119, -120, + -120, -121, -122, -123, -124, -125, -126, -126, + -127, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, + -128, -128, -128, -128, -128, -128, -128, -128, + -127, -127, -126, -125, -125, -124, -123, -122, + -121, -120, -119, -118, -117, -116, -115, -114, + -113, -111, -110, -109, -107, -106, -105, -103, + -102, -100, -99, -97, -96, -94, -92, -91, + -89, -87, -85, -84, -82, -80, -78, -76, + -74, -73, -71, -69, -67, -65, -63, -61, + -58, -56, -54, -52, -50, -48, -46, -44, + -41, -39, -37, -35, -32, -30, -28, -26, + -23, -21, -19, -16, -14, -12, -10, -7, + -5, -3, 0, 1, 3, 6, 8, 10, + 13, 15, 17, 19, 22, 24, 26, 29, + 31, 33, 35, 38, 40, 42, 44, 46, + 48, 51, 53, 55, 57, 59, 61, 63, + 65, 67, 69, 71, 73, 75, 77, 79, + 81, 82, 84, 86, 88, 89, 91, 93, + 94, 96, 98, 99, 101, 102, 104, 105, + 106, 108, 109, 110, 112, 113, 114, 115, + 116, 117, 119, 120, 120, 121, 122, 123, + 124, 125, 126, 126, 127, 128, 128, 129, + 129, 130, 130, 131, 131, 131, 131, 132, + 132, 132, 132, 132, 132, 132, 132, 132, + 132, 132, 132, 131, 131, 131, 130, 130, + 130, 129, 129, 128, 127, 127, 126, 125, + 125, 124, 123, 122, 121, 120, 119, 118, + 117, 116, 115, 114, 113, 111, 110, 109, + 107, 106, 105, 103, 102, 100, 99, 97, + 96, 94, 92, 91, 89, 87, 85, 84, 82 +}; + +static const int hsv_green_x[] = { + -124, -124, -125, -125, -125, -125, -125, -125, + -125, -126, -126, -125, -125, -125, -125, -125, + -125, -124, -124, -124, -123, -123, -122, -122, + -121, -121, -120, -120, -119, -118, -117, -117, + -116, -115, -114, -113, -112, -111, -110, -109, + -108, -107, -105, -104, -103, -102, -100, -99, + -98, -96, -95, -93, -92, -91, -89, -87, + -86, -84, -83, -81, -79, -77, -76, -74, + -72, -70, -69, -67, -65, -63, -61, -59, + -57, -55, -53, -51, -49, -47, -45, -43, + -41, -39, -37, -35, -33, -30, -28, -26, + -24, -22, -20, -18, -15, -13, -11, -9, + -7, -4, -2, 0, 1, 3, 6, 8, + 10, 12, 14, 17, 19, 21, 23, 25, + 27, 29, 32, 34, 36, 38, 40, 42, + 44, 46, 48, 50, 52, 54, 56, 58, + 60, 62, 64, 66, 68, 70, 71, 73, + 75, 77, 78, 80, 82, 83, 85, 87, + 88, 90, 91, 93, 94, 96, 97, 98, + 100, 101, 102, 104, 105, 106, 107, 108, + 109, 111, 112, 113, 113, 114, 115, 116, + 117, 118, 118, 119, 120, 120, 121, 122, + 122, 123, 123, 124, 124, 124, 125, 125, + 125, 125, 125, 125, 125, 126, 126, 125, + 125, 125, 125, 125, 125, 124, 124, 124, + 123, 123, 122, 122, 121, 121, 120, 120, + 119, 118, 117, 117, 116, 115, 114, 113, + 112, 111, 110, 109, 108, 107, 105, 104, + 103, 102, 100, 99, 98, 96, 95, 93, + 92, 91, 89, 87, 86, 84, 83, 81, + 79, 77, 76, 74, 72, 70, 69, 67, + 65, 63, 61, 59, 57, 55, 53, 51, + 49, 47, 45, 43, 41, 39, 37, 35, + 33, 30, 28, 26, 24, 22, 20, 18, + 15, 13, 11, 9, 7, 4, 2, 0, + -1, -3, -6, -8, -10, -12, -14, -17, + -19, -21, -23, -25, -27, -29, -32, -34, + -36, -38, -40, -42, -44, -46, -48, -50, + -52, -54, -56, -58, -60, -62, -64, -66, + -68, -70, -71, -73, -75, -77, -78, -80, + -82, -83, -85, -87, -88, -90, -91, -93, + -94, -96, -97, -98, -100, -101, -102, -104, + -105, -106, -107, -108, -109, -111, -112, -113, + -113, -114, -115, -116, -117, -118, -118, -119, + -120, -120, -121, -122, -122, -123, -123, -124, -124 +}; + +static const int hsv_green_y[] = { + -100, -99, -98, -97, -95, -94, -93, -91, + -90, -89, -87, -86, -84, -83, -81, -80, + -78, -76, -75, -73, -71, -70, -68, -66, + -64, -63, -61, -59, -57, -55, -53, -51, + -49, -48, -46, -44, -42, -40, -38, -36, + -34, -32, -30, -27, -25, -23, -21, -19, + -17, -15, -13, -11, -9, -7, -4, -2, + 0, 1, 3, 5, 7, 9, 11, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, + 48, 50, 52, 54, 56, 58, 59, 61, + 63, 65, 67, 68, 70, 72, 74, 75, + 77, 78, 80, 82, 83, 85, 86, 88, + 89, 90, 92, 93, 95, 96, 97, 98, + 100, 101, 102, 103, 104, 105, 106, 107, + 108, 109, 110, 111, 112, 112, 113, 114, + 115, 115, 116, 116, 117, 117, 118, 118, + 119, 119, 119, 120, 120, 120, 120, 120, + 121, 121, 121, 121, 121, 121, 120, 120, + 120, 120, 120, 119, 119, 119, 118, 118, + 117, 117, 116, 116, 115, 114, 114, 113, + 112, 111, 111, 110, 109, 108, 107, 106, + 105, 104, 103, 102, 100, 99, 98, 97, + 95, 94, 93, 91, 90, 89, 87, 86, + 84, 83, 81, 80, 78, 76, 75, 73, + 71, 70, 68, 66, 64, 63, 61, 59, + 57, 55, 53, 51, 49, 48, 46, 44, + 42, 40, 38, 36, 34, 32, 30, 27, + 25, 23, 21, 19, 17, 15, 13, 11, + 9, 7, 4, 2, 0, -1, -3, -5, + -7, -9, -11, -14, -16, -18, -20, -22, + -24, -26, -28, -30, -32, -34, -36, -38, + -40, -42, -44, -46, -48, -50, -52, -54, + -56, -58, -59, -61, -63, -65, -67, -68, + -70, -72, -74, -75, -77, -78, -80, -82, + -83, -85, -86, -88, -89, -90, -92, -93, + -95, -96, -97, -98, -100, -101, -102, -103, + -104, -105, -106, -107, -108, -109, -110, -111, + -112, -112, -113, -114, -115, -115, -116, -116, + -117, -117, -118, -118, -119, -119, -119, -120, + -120, -120, -120, -120, -121, -121, -121, -121, + -121, -121, -120, -120, -120, -120, -120, -119, + -119, -119, -118, -118, -117, -117, -116, -116, + -115, -114, -114, -113, -112, -111, -111, -110, + -109, -108, -107, -106, -105, -104, -103, -102, -100 +}; + +static const int hsv_blue_x[] = { + 112, 113, 114, 114, 115, 116, 117, 117, + 118, 118, 119, 119, 120, 120, 120, 121, + 121, 121, 122, 122, 122, 122, 122, 122, + 122, 122, 122, 122, 122, 122, 121, 121, + 121, 120, 120, 120, 119, 119, 118, 118, + 117, 116, 116, 115, 114, 113, 113, 112, + 111, 110, 109, 108, 107, 106, 105, 104, + 103, 102, 100, 99, 98, 97, 95, 94, + 93, 91, 90, 88, 87, 85, 84, 82, + 80, 79, 77, 76, 74, 72, 70, 69, + 67, 65, 63, 61, 60, 58, 56, 54, + 52, 50, 48, 46, 44, 42, 40, 38, + 36, 34, 32, 30, 28, 26, 24, 22, + 19, 17, 15, 13, 11, 9, 7, 5, + 2, 0, -1, -3, -5, -7, -9, -12, + -14, -16, -18, -20, -22, -24, -26, -28, + -31, -33, -35, -37, -39, -41, -43, -45, + -47, -49, -51, -53, -54, -56, -58, -60, + -62, -64, -66, -67, -69, -71, -73, -74, + -76, -78, -79, -81, -83, -84, -86, -87, + -89, -90, -92, -93, -94, -96, -97, -98, + -99, -101, -102, -103, -104, -105, -106, -107, + -108, -109, -110, -111, -112, -113, -114, -114, + -115, -116, -117, -117, -118, -118, -119, -119, + -120, -120, -120, -121, -121, -121, -122, -122, + -122, -122, -122, -122, -122, -122, -122, -122, + -122, -122, -121, -121, -121, -120, -120, -120, + -119, -119, -118, -118, -117, -116, -116, -115, + -114, -113, -113, -112, -111, -110, -109, -108, + -107, -106, -105, -104, -103, -102, -100, -99, + -98, -97, -95, -94, -93, -91, -90, -88, + -87, -85, -84, -82, -80, -79, -77, -76, + -74, -72, -70, -69, -67, -65, -63, -61, + -60, -58, -56, -54, -52, -50, -48, -46, + -44, -42, -40, -38, -36, -34, -32, -30, + -28, -26, -24, -22, -19, -17, -15, -13, + -11, -9, -7, -5, -2, 0, 1, 3, + 5, 7, 9, 12, 14, 16, 18, 20, + 22, 24, 26, 28, 31, 33, 35, 37, + 39, 41, 43, 45, 47, 49, 51, 53, + 54, 56, 58, 60, 62, 64, 66, 67, + 69, 71, 73, 74, 76, 78, 79, 81, + 83, 84, 86, 87, 89, 90, 92, 93, + 94, 96, 97, 98, 99, 101, 102, 103, + 104, 105, 106, 107, 108, 109, 110, 111, 112 +}; + +static const int hsv_blue_y[] = { + -11, -13, -15, -17, -19, -21, -23, -25, + -27, -29, -31, -33, -35, -37, -39, -41, + -43, -45, -46, -48, -50, -52, -54, -55, + -57, -59, -61, -62, -64, -66, -67, -69, + -71, -72, -74, -75, -77, -78, -80, -81, + -83, -84, -86, -87, -88, -90, -91, -92, + -93, -95, -96, -97, -98, -99, -100, -101, + -102, -103, -104, -105, -106, -106, -107, -108, + -109, -109, -110, -111, -111, -112, -112, -113, + -113, -114, -114, -114, -115, -115, -115, -115, + -116, -116, -116, -116, -116, -116, -116, -116, + -116, -115, -115, -115, -115, -114, -114, -114, + -113, -113, -112, -112, -111, -111, -110, -110, + -109, -108, -108, -107, -106, -105, -104, -103, + -102, -101, -100, -99, -98, -97, -96, -95, + -94, -93, -91, -90, -89, -88, -86, -85, + -84, -82, -81, -79, -78, -76, -75, -73, + -71, -70, -68, -67, -65, -63, -62, -60, + -58, -56, -55, -53, -51, -49, -47, -45, + -44, -42, -40, -38, -36, -34, -32, -30, + -28, -26, -24, -22, -20, -18, -16, -14, + -12, -10, -8, -6, -4, -2, 0, 1, + 3, 5, 7, 9, 11, 13, 15, 17, + 19, 21, 23, 25, 27, 29, 31, 33, + 35, 37, 39, 41, 43, 45, 46, 48, + 50, 52, 54, 55, 57, 59, 61, 62, + 64, 66, 67, 69, 71, 72, 74, 75, + 77, 78, 80, 81, 83, 84, 86, 87, + 88, 90, 91, 92, 93, 95, 96, 97, + 98, 99, 100, 101, 102, 103, 104, 105, + 106, 106, 107, 108, 109, 109, 110, 111, + 111, 112, 112, 113, 113, 114, 114, 114, + 115, 115, 115, 115, 116, 116, 116, 116, + 116, 116, 116, 116, 116, 115, 115, 115, + 115, 114, 114, 114, 113, 113, 112, 112, + 111, 111, 110, 110, 109, 108, 108, 107, + 106, 105, 104, 103, 102, 101, 100, 99, + 98, 97, 96, 95, 94, 93, 91, 90, + 89, 88, 86, 85, 84, 82, 81, 79, + 78, 76, 75, 73, 71, 70, 68, 67, + 65, 63, 62, 60, 58, 56, 55, 53, + 51, 49, 47, 45, 44, 42, 40, 38, + 36, 34, 32, 30, 28, 26, 24, 22, + 20, 18, 16, 14, 12, 10, 8, 6, + 4, 2, 0, -1, -3, -5, -7, -9, -11 +}; + +static u16 i2c_ident[] = { + V4L2_IDENT_OV9650, + V4L2_IDENT_OV9655, + V4L2_IDENT_SOI968, + V4L2_IDENT_OV7660, + V4L2_IDENT_OV7670, + V4L2_IDENT_MT9V011, + V4L2_IDENT_MT9V111, + V4L2_IDENT_MT9V112, + V4L2_IDENT_MT9M001C12ST, + V4L2_IDENT_MT9M111, + V4L2_IDENT_HV7131R, +}; + +static u16 bridge_init[][2] = { + {0x1000, 0x78}, {0x1001, 0x40}, {0x1002, 0x1c}, + {0x1020, 0x80}, {0x1061, 0x01}, {0x1067, 0x40}, + {0x1068, 0x30}, {0x1069, 0x20}, {0x106a, 0x10}, + {0x106b, 0x08}, {0x1188, 0x87}, {0x11a1, 0x00}, + {0x11a2, 0x00}, {0x11a3, 0x6a}, {0x11a4, 0x50}, + {0x11ab, 0x00}, {0x11ac, 0x00}, {0x11ad, 0x50}, + {0x11ae, 0x3c}, {0x118a, 0x04}, {0x0395, 0x04}, + {0x11b8, 0x3a}, {0x118b, 0x0e}, {0x10f7, 0x05}, + {0x10f8, 0x14}, {0x10fa, 0xff}, {0x10f9, 0x00}, + {0x11ba, 0x0a}, {0x11a5, 0x2d}, {0x11a6, 0x2d}, + {0x11a7, 0x3a}, {0x11a8, 0x05}, {0x11a9, 0x04}, + {0x11aa, 0x3f}, {0x11af, 0x28}, {0x11b0, 0xd8}, + {0x11b1, 0x14}, {0x11b2, 0xec}, {0x11b3, 0x32}, + {0x11b4, 0xdd}, {0x11b5, 0x32}, {0x11b6, 0xdd}, + {0x10e0, 0x2c}, {0x11bc, 0x40}, {0x11bd, 0x01}, + {0x11be, 0xf0}, {0x11bf, 0x00}, {0x118c, 0x1f}, + {0x118d, 0x1f}, {0x118e, 0x1f}, {0x118f, 0x1f}, + {0x1180, 0x01}, {0x1181, 0x00}, {0x1182, 0x01}, + {0x1183, 0x00}, {0x1184, 0x50}, {0x1185, 0x80} +}; + +/* Gain = (bit[3:0] / 16 + 1) * (bit[4] + 1) * (bit[5] + 1) * (bit[6] + 1) */ +static u8 ov_gain[] = { + 0x00 /* 1x */, 0x04 /* 1.25x */, 0x08 /* 1.5x */, 0x0c /* 1.75x */, + 0x10 /* 2x */, 0x12 /* 2.25x */, 0x14 /* 2.5x */, 0x16 /* 2.75x */, + 0x18 /* 3x */, 0x1a /* 3.25x */, 0x1c /* 3.5x */, 0x1e /* 3.75x */, + 0x30 /* 4x */, 0x31 /* 4.25x */, 0x32 /* 4.5x */, 0x33 /* 4.75x */, + 0x34 /* 5x */, 0x35 /* 5.25x */, 0x36 /* 5.5x */, 0x37 /* 5.75x */, + 0x38 /* 6x */, 0x39 /* 6.25x */, 0x3a /* 6.5x */, 0x3b /* 6.75x */, + 0x3c /* 7x */, 0x3d /* 7.25x */, 0x3e /* 7.5x */, 0x3f /* 7.75x */, + 0x70 /* 8x */ +}; + +/* Gain = (bit[8] + 1) * (bit[7] + 1) * (bit[6:0] * 0.03125) */ +static u16 micron1_gain[] = { + /* 1x 1.25x 1.5x 1.75x */ + 0x0020, 0x0028, 0x0030, 0x0038, + /* 2x 2.25x 2.5x 2.75x */ + 0x00a0, 0x00a4, 0x00a8, 0x00ac, + /* 3x 3.25x 3.5x 3.75x */ + 0x00b0, 0x00b4, 0x00b8, 0x00bc, + /* 4x 4.25x 4.5x 4.75x */ + 0x00c0, 0x00c4, 0x00c8, 0x00cc, + /* 5x 5.25x 5.5x 5.75x */ + 0x00d0, 0x00d4, 0x00d8, 0x00dc, + /* 6x 6.25x 6.5x 6.75x */ + 0x00e0, 0x00e4, 0x00e8, 0x00ec, + /* 7x 7.25x 7.5x 7.75x */ + 0x00f0, 0x00f4, 0x00f8, 0x00fc, + /* 8x */ + 0x01c0 +}; + +/* mt9m001 sensor uses a different gain formula then other micron sensors */ +/* Gain = (bit[6] + 1) * (bit[5-0] * 0.125) */ +static u16 micron2_gain[] = { + /* 1x 1.25x 1.5x 1.75x */ + 0x0008, 0x000a, 0x000c, 0x000e, + /* 2x 2.25x 2.5x 2.75x */ + 0x0010, 0x0012, 0x0014, 0x0016, + /* 3x 3.25x 3.5x 3.75x */ + 0x0018, 0x001a, 0x001c, 0x001e, + /* 4x 4.25x 4.5x 4.75x */ + 0x0020, 0x0051, 0x0052, 0x0053, + /* 5x 5.25x 5.5x 5.75x */ + 0x0054, 0x0055, 0x0056, 0x0057, + /* 6x 6.25x 6.5x 6.75x */ + 0x0058, 0x0059, 0x005a, 0x005b, + /* 7x 7.25x 7.5x 7.75x */ + 0x005c, 0x005d, 0x005e, 0x005f, + /* 8x */ + 0x0060 +}; + +/* Gain = .5 + bit[7:0] / 16 */ +static u8 hv7131r_gain[] = { + 0x08 /* 1x */, 0x0c /* 1.25x */, 0x10 /* 1.5x */, 0x14 /* 1.75x */, + 0x18 /* 2x */, 0x1c /* 2.25x */, 0x20 /* 2.5x */, 0x24 /* 2.75x */, + 0x28 /* 3x */, 0x2c /* 3.25x */, 0x30 /* 3.5x */, 0x34 /* 3.75x */, + 0x38 /* 4x */, 0x3c /* 4.25x */, 0x40 /* 4.5x */, 0x44 /* 4.75x */, + 0x48 /* 5x */, 0x4c /* 5.25x */, 0x50 /* 5.5x */, 0x54 /* 5.75x */, + 0x58 /* 6x */, 0x5c /* 6.25x */, 0x60 /* 6.5x */, 0x64 /* 6.75x */, + 0x68 /* 7x */, 0x6c /* 7.25x */, 0x70 /* 7.5x */, 0x74 /* 7.75x */, + 0x78 /* 8x */ +}; + +static u8 soi968_init[][2] = { + {0x12, 0x80}, {0x0c, 0x00}, {0x0f, 0x1f}, + {0x11, 0x80}, {0x38, 0x52}, {0x1e, 0x00}, + {0x33, 0x08}, {0x35, 0x8c}, {0x36, 0x0c}, + {0x37, 0x04}, {0x45, 0x04}, {0x47, 0xff}, + {0x3e, 0x00}, {0x3f, 0x00}, {0x3b, 0x20}, + {0x3a, 0x96}, {0x3d, 0x0a}, {0x14, 0x8e}, + {0x13, 0x8a}, {0x12, 0x40}, {0x17, 0x13}, + {0x18, 0x63}, {0x19, 0x01}, {0x1a, 0x79}, + {0x32, 0x24}, {0x03, 0x00}, {0x11, 0x40}, + {0x2a, 0x10}, {0x2b, 0xe0}, {0x10, 0x32}, + {0x00, 0x00}, {0x01, 0x80}, {0x02, 0x80}, +}; + +static u8 ov7660_init[][2] = { + {0x0e, 0x80}, {0x0d, 0x08}, {0x0f, 0xc3}, + {0x04, 0xc3}, {0x10, 0x40}, {0x11, 0x40}, + {0x12, 0x05}, {0x13, 0xba}, {0x14, 0x2a}, + {0x37, 0x0f}, {0x38, 0x02}, {0x39, 0x43}, + {0x3a, 0x00}, {0x69, 0x90}, {0x2d, 0xf6}, + {0x2e, 0x0b}, {0x01, 0x78}, {0x02, 0x50}, +}; + +static u8 ov7670_init[][2] = { + {0x12, 0x80}, {0x11, 0x80}, {0x3a, 0x04}, {0x12, 0x01}, + {0x32, 0xb6}, {0x03, 0x0a}, {0x0c, 0x00}, {0x3e, 0x00}, + {0x70, 0x3a}, {0x71, 0x35}, {0x72, 0x11}, {0x73, 0xf0}, + {0xa2, 0x02}, {0x13, 0xe0}, {0x00, 0x00}, {0x10, 0x00}, + {0x0d, 0x40}, {0x14, 0x28}, {0xa5, 0x05}, {0xab, 0x07}, + {0x24, 0x95}, {0x25, 0x33}, {0x26, 0xe3}, {0x9f, 0x75}, + {0xa0, 0x65}, {0xa1, 0x0b}, {0xa6, 0xd8}, {0xa7, 0xd8}, + {0xa8, 0xf0}, {0xa9, 0x90}, {0xaa, 0x94}, {0x13, 0xe5}, + {0x0e, 0x61}, {0x0f, 0x4b}, {0x16, 0x02}, {0x1e, 0x27}, + {0x21, 0x02}, {0x22, 0x91}, {0x29, 0x07}, {0x33, 0x0b}, + {0x35, 0x0b}, {0x37, 0x1d}, {0x38, 0x71}, {0x39, 0x2a}, + {0x3c, 0x78}, {0x4d, 0x40}, {0x4e, 0x20}, {0x69, 0x00}, + {0x74, 0x19}, {0x8d, 0x4f}, {0x8e, 0x00}, {0x8f, 0x00}, + {0x90, 0x00}, {0x91, 0x00}, {0x96, 0x00}, {0x9a, 0x80}, + {0xb0, 0x84}, {0xb1, 0x0c}, {0xb2, 0x0e}, {0xb3, 0x82}, + {0xb8, 0x0a}, {0x43, 0x0a}, {0x44, 0xf0}, {0x45, 0x20}, + {0x46, 0x7d}, {0x47, 0x29}, {0x48, 0x4a}, {0x59, 0x8c}, + {0x5a, 0xa5}, {0x5b, 0xde}, {0x5c, 0x96}, {0x5d, 0x66}, + {0x5e, 0x10}, {0x6c, 0x0a}, {0x6d, 0x55}, {0x6e, 0x11}, + {0x6f, 0x9e}, {0x6a, 0x40}, {0x01, 0x40}, {0x02, 0x40}, + {0x13, 0xe7}, {0x4f, 0x6e}, {0x50, 0x70}, {0x51, 0x02}, + {0x52, 0x1d}, {0x53, 0x56}, {0x54, 0x73}, {0x55, 0x0a}, + {0x56, 0x55}, {0x57, 0x80}, {0x58, 0x9e}, {0x41, 0x08}, + {0x3f, 0x02}, {0x75, 0x03}, {0x76, 0x63}, {0x4c, 0x04}, + {0x77, 0x06}, {0x3d, 0x02}, {0x4b, 0x09}, {0xc9, 0x30}, + {0x41, 0x08}, {0x56, 0x48}, {0x34, 0x11}, {0xa4, 0x88}, + {0x96, 0x00}, {0x97, 0x30}, {0x98, 0x20}, {0x99, 0x30}, + {0x9a, 0x84}, {0x9b, 0x29}, {0x9c, 0x03}, {0x9d, 0x99}, + {0x9e, 0x7f}, {0x78, 0x04}, {0x79, 0x01}, {0xc8, 0xf0}, + {0x79, 0x0f}, {0xc8, 0x00}, {0x79, 0x10}, {0xc8, 0x7e}, + {0x79, 0x0a}, {0xc8, 0x80}, {0x79, 0x0b}, {0xc8, 0x01}, + {0x79, 0x0c}, {0xc8, 0x0f}, {0x79, 0x0d}, {0xc8, 0x20}, + {0x79, 0x09}, {0xc8, 0x80}, {0x79, 0x02}, {0xc8, 0xc0}, + {0x79, 0x03}, {0xc8, 0x40}, {0x79, 0x05}, {0xc8, 0x30}, + {0x79, 0x26}, {0x62, 0x20}, {0x63, 0x00}, {0x64, 0x06}, + {0x65, 0x00}, {0x66, 0x05}, {0x94, 0x05}, {0x95, 0x0a}, + {0x17, 0x13}, {0x18, 0x01}, {0x19, 0x02}, {0x1a, 0x7a}, + {0x46, 0x59}, {0x47, 0x30}, {0x58, 0x9a}, {0x59, 0x84}, + {0x5a, 0x91}, {0x5b, 0x57}, {0x5c, 0x75}, {0x5d, 0x6d}, + {0x5e, 0x13}, {0x64, 0x07}, {0x94, 0x07}, {0x95, 0x0d}, + {0xa6, 0xdf}, {0xa7, 0xdf}, {0x48, 0x4d}, {0x51, 0x00}, + {0x6b, 0x0a}, {0x11, 0x80}, {0x2a, 0x00}, {0x2b, 0x00}, + {0x92, 0x00}, {0x93, 0x00}, {0x55, 0x0a}, {0x56, 0x60}, + {0x4f, 0x6e}, {0x50, 0x70}, {0x51, 0x00}, {0x52, 0x1d}, + {0x53, 0x56}, {0x54, 0x73}, {0x58, 0x9a}, {0x4f, 0x6e}, + {0x50, 0x70}, {0x51, 0x00}, {0x52, 0x1d}, {0x53, 0x56}, + {0x54, 0x73}, {0x58, 0x9a}, {0x3f, 0x01}, {0x7b, 0x03}, + {0x7c, 0x09}, {0x7d, 0x16}, {0x7e, 0x38}, {0x7f, 0x47}, + {0x80, 0x53}, {0x81, 0x5e}, {0x82, 0x6a}, {0x83, 0x74}, + {0x84, 0x80}, {0x85, 0x8c}, {0x86, 0x9b}, {0x87, 0xb2}, + {0x88, 0xcc}, {0x89, 0xe5}, {0x7a, 0x24}, {0x3b, 0x00}, + {0x9f, 0x76}, {0xa0, 0x65}, {0x13, 0xe2}, {0x6b, 0x0a}, + {0x11, 0x80}, {0x2a, 0x00}, {0x2b, 0x00}, {0x92, 0x00}, + {0x93, 0x00}, +}; + +static u8 ov9650_init[][2] = { + {0x12, 0x80}, {0x00, 0x00}, {0x01, 0x78}, + {0x02, 0x78}, {0x03, 0x36}, {0x04, 0x03}, + {0x05, 0x00}, {0x06, 0x00}, {0x08, 0x00}, + {0x09, 0x01}, {0x0c, 0x00}, {0x0d, 0x00}, + {0x0e, 0xa0}, {0x0f, 0x52}, {0x10, 0x7c}, + {0x11, 0x80}, {0x12, 0x45}, {0x13, 0xc2}, + {0x14, 0x2e}, {0x15, 0x00}, {0x16, 0x07}, + {0x17, 0x24}, {0x18, 0xc5}, {0x19, 0x00}, + {0x1a, 0x3c}, {0x1b, 0x00}, {0x1e, 0x04}, + {0x1f, 0x00}, {0x24, 0x78}, {0x25, 0x68}, + {0x26, 0xd4}, {0x27, 0x80}, {0x28, 0x80}, + {0x29, 0x30}, {0x2a, 0x00}, {0x2b, 0x00}, + {0x2c, 0x80}, {0x2d, 0x00}, {0x2e, 0x00}, + {0x2f, 0x00}, {0x30, 0x08}, {0x31, 0x30}, + {0x32, 0x84}, {0x33, 0xe2}, {0x34, 0xbf}, + {0x35, 0x81}, {0x36, 0xf9}, {0x37, 0x00}, + {0x38, 0x93}, {0x39, 0x50}, {0x3a, 0x01}, + {0x3b, 0x01}, {0x3c, 0x73}, {0x3d, 0x19}, + {0x3e, 0x0b}, {0x3f, 0x80}, {0x40, 0xc1}, + {0x41, 0x00}, {0x42, 0x08}, {0x67, 0x80}, + {0x68, 0x80}, {0x69, 0x40}, {0x6a, 0x00}, + {0x6b, 0x0a}, {0x8b, 0x06}, {0x8c, 0x20}, + {0x8d, 0x00}, {0x8e, 0x00}, {0x8f, 0xdf}, + {0x92, 0x00}, {0x93, 0x00}, {0x94, 0x88}, + {0x95, 0x88}, {0x96, 0x04}, {0xa1, 0x00}, + {0xa5, 0x80}, {0xa8, 0x80}, {0xa9, 0xb8}, + {0xaa, 0x92}, {0xab, 0x0a}, +}; + +static u8 ov9655_init[][2] = { + {0x12, 0x80}, {0x12, 0x01}, {0x0d, 0x00}, {0x0e, 0x61}, + {0x11, 0x80}, {0x13, 0xba}, {0x14, 0x2e}, {0x16, 0x24}, + {0x1e, 0x04}, {0x1e, 0x04}, {0x1e, 0x04}, {0x27, 0x08}, + {0x28, 0x08}, {0x29, 0x15}, {0x2c, 0x08}, {0x32, 0xbf}, + {0x34, 0x3d}, {0x35, 0x00}, {0x36, 0xf8}, {0x38, 0x12}, + {0x39, 0x57}, {0x3a, 0x00}, {0x3b, 0xcc}, {0x3c, 0x0c}, + {0x3d, 0x19}, {0x3e, 0x0c}, {0x3f, 0x01}, {0x41, 0x40}, + {0x42, 0x80}, {0x45, 0x46}, {0x46, 0x62}, {0x47, 0x2a}, + {0x48, 0x3c}, {0x4a, 0xf0}, {0x4b, 0xdc}, {0x4c, 0xdc}, + {0x4d, 0xdc}, {0x4e, 0xdc}, {0x69, 0x02}, {0x6c, 0x04}, + {0x6f, 0x9e}, {0x70, 0x05}, {0x71, 0x78}, {0x77, 0x02}, + {0x8a, 0x23}, {0x8c, 0x0d}, {0x90, 0x7e}, {0x91, 0x7c}, + {0x9f, 0x6e}, {0xa0, 0x6e}, {0xa5, 0x68}, {0xa6, 0x60}, + {0xa8, 0xc1}, {0xa9, 0xfa}, {0xaa, 0x92}, {0xab, 0x04}, + {0xac, 0x80}, {0xad, 0x80}, {0xae, 0x80}, {0xaf, 0x80}, + {0xb2, 0xf2}, {0xb3, 0x20}, {0xb5, 0x00}, {0xb6, 0xaf}, + {0xbb, 0xae}, {0xbc, 0x44}, {0xbd, 0x44}, {0xbe, 0x3b}, + {0xbf, 0x3a}, {0xc0, 0xe2}, {0xc1, 0xc8}, {0xc2, 0x01}, + {0xc4, 0x00}, {0xc6, 0x85}, {0xc7, 0x81}, {0xc9, 0xe0}, + {0xca, 0xe8}, {0xcc, 0xd8}, {0xcd, 0x93}, {0x12, 0x61}, + {0x36, 0xfa}, {0x8c, 0x8d}, {0xc0, 0xaa}, {0x69, 0x0a}, + {0x03, 0x12}, {0x17, 0x14}, {0x18, 0x00}, {0x19, 0x01}, + {0x1a, 0x3d}, {0x32, 0xbf}, {0x11, 0x80}, {0x2a, 0x10}, + {0x2b, 0x0a}, {0x92, 0x00}, {0x93, 0x00}, {0x1e, 0x04}, + {0x1e, 0x04}, {0x10, 0x7c}, {0x04, 0x03}, {0xa1, 0x00}, + {0x2d, 0x00}, {0x2e, 0x00}, {0x00, 0x00}, {0x01, 0x80}, + {0x02, 0x80}, {0x12, 0x61}, {0x36, 0xfa}, {0x8c, 0x8d}, + {0xc0, 0xaa}, {0x69, 0x0a}, {0x03, 0x12}, {0x17, 0x14}, + {0x18, 0x00}, {0x19, 0x01}, {0x1a, 0x3d}, {0x32, 0xbf}, + {0x11, 0x80}, {0x2a, 0x10}, {0x2b, 0x0a}, {0x92, 0x00}, + {0x93, 0x00}, {0x04, 0x01}, {0x10, 0x1f}, {0xa1, 0x00}, + {0x00, 0x0a}, {0xa1, 0x00}, {0x10, 0x5d}, {0x04, 0x03}, + {0x00, 0x01}, {0xa1, 0x00}, {0x10, 0x7c}, {0x04, 0x03}, + {0x00, 0x03}, {0x00, 0x0a}, {0x00, 0x10}, {0x00, 0x13}, +}; + +static u16 mt9v112_init[][2] = { + {0xf0, 0x0000}, {0x0d, 0x0021}, {0x0d, 0x0020}, + {0x34, 0xc019}, {0x0a, 0x0011}, {0x0b, 0x000b}, + {0x20, 0x0703}, {0x35, 0x2022}, {0xf0, 0x0001}, + {0x05, 0x0000}, {0x06, 0x340c}, {0x3b, 0x042a}, + {0x3c, 0x0400}, {0xf0, 0x0002}, {0x2e, 0x0c58}, + {0x5b, 0x0001}, {0xc8, 0x9f0b}, {0xf0, 0x0001}, + {0x9b, 0x5300}, {0xf0, 0x0000}, {0x2b, 0x0020}, + {0x2c, 0x002a}, {0x2d, 0x0032}, {0x2e, 0x0020}, + {0x09, 0x01dc}, {0x01, 0x000c}, {0x02, 0x0020}, + {0x03, 0x01e0}, {0x04, 0x0280}, {0x06, 0x000c}, + {0x05, 0x0098}, {0x20, 0x0703}, {0x09, 0x01f2}, + {0x2b, 0x00a0}, {0x2c, 0x00a0}, {0x2d, 0x00a0}, + {0x2e, 0x00a0}, {0x01, 0x000c}, {0x02, 0x0020}, + {0x03, 0x01e0}, {0x04, 0x0280}, {0x06, 0x000c}, + {0x05, 0x0098}, {0x09, 0x01c1}, {0x2b, 0x00ae}, + {0x2c, 0x00ae}, {0x2d, 0x00ae}, {0x2e, 0x00ae}, +}; + +static u16 mt9v111_init[][2] = { + {0x01, 0x0004}, {0x0d, 0x0001}, {0x0d, 0x0000}, + {0x01, 0x0001}, {0x02, 0x0016}, {0x03, 0x01e1}, + {0x04, 0x0281}, {0x05, 0x0004}, {0x07, 0x3002}, + {0x21, 0x0000}, {0x25, 0x4024}, {0x26, 0xff03}, + {0x27, 0xff10}, {0x2b, 0x7828}, {0x2c, 0xb43c}, + {0x2d, 0xf0a0}, {0x2e, 0x0c64}, {0x2f, 0x0064}, + {0x67, 0x4010}, {0x06, 0x301e}, {0x08, 0x0480}, + {0x01, 0x0004}, {0x02, 0x0016}, {0x03, 0x01e6}, + {0x04, 0x0286}, {0x05, 0x0004}, {0x06, 0x0000}, + {0x07, 0x3002}, {0x08, 0x0008}, {0x0c, 0x0000}, + {0x0d, 0x0000}, {0x0e, 0x0000}, {0x0f, 0x0000}, + {0x10, 0x0000}, {0x11, 0x0000}, {0x12, 0x00b0}, + {0x13, 0x007c}, {0x14, 0x0000}, {0x15, 0x0000}, + {0x16, 0x0000}, {0x17, 0x0000}, {0x18, 0x0000}, + {0x19, 0x0000}, {0x1a, 0x0000}, {0x1b, 0x0000}, + {0x1c, 0x0000}, {0x1d, 0x0000}, {0x30, 0x0000}, + {0x30, 0x0005}, {0x31, 0x0000}, {0x02, 0x0016}, + {0x03, 0x01e1}, {0x04, 0x0281}, {0x05, 0x0004}, + {0x06, 0x0000}, {0x07, 0x3002}, {0x06, 0x002d}, + {0x05, 0x0004}, {0x09, 0x0064}, {0x2b, 0x00a0}, + {0x2c, 0x00a0}, {0x2d, 0x00a0}, {0x2e, 0x00a0}, + {0x02, 0x0016}, {0x03, 0x01e1}, {0x04, 0x0281}, + {0x05, 0x0004}, {0x06, 0x002d}, {0x07, 0x3002}, + {0x0e, 0x0008}, {0x06, 0x002d}, {0x05, 0x0004}, +}; + +static u16 mt9v011_init[][2] = { + {0x07, 0x0002}, {0x0d, 0x0001}, {0x0d, 0x0000}, + {0x01, 0x0008}, {0x02, 0x0016}, {0x03, 0x01e1}, + {0x04, 0x0281}, {0x05, 0x0083}, {0x06, 0x0006}, + {0x0d, 0x0002}, {0x0a, 0x0000}, {0x0b, 0x0000}, + {0x0c, 0x0000}, {0x0d, 0x0000}, {0x0e, 0x0000}, + {0x0f, 0x0000}, {0x10, 0x0000}, {0x11, 0x0000}, + {0x12, 0x0000}, {0x13, 0x0000}, {0x14, 0x0000}, + {0x15, 0x0000}, {0x16, 0x0000}, {0x17, 0x0000}, + {0x18, 0x0000}, {0x19, 0x0000}, {0x1a, 0x0000}, + {0x1b, 0x0000}, {0x1c, 0x0000}, {0x1d, 0x0000}, + {0x32, 0x0000}, {0x20, 0x1101}, {0x21, 0x0000}, + {0x22, 0x0000}, {0x23, 0x0000}, {0x24, 0x0000}, + {0x25, 0x0000}, {0x26, 0x0000}, {0x27, 0x0024}, + {0x2f, 0xf7b0}, {0x30, 0x0005}, {0x31, 0x0000}, + {0x32, 0x0000}, {0x33, 0x0000}, {0x34, 0x0100}, + {0x3d, 0x068f}, {0x40, 0x01e0}, {0x41, 0x00d1}, + {0x44, 0x0082}, {0x5a, 0x0000}, {0x5b, 0x0000}, + {0x5c, 0x0000}, {0x5d, 0x0000}, {0x5e, 0x0000}, + {0x5f, 0xa31d}, {0x62, 0x0611}, {0x0a, 0x0000}, + {0x06, 0x0029}, {0x05, 0x0009}, {0x20, 0x1101}, + {0x20, 0x1101}, {0x09, 0x0064}, {0x07, 0x0003}, + {0x2b, 0x0033}, {0x2c, 0x00a0}, {0x2d, 0x00a0}, + {0x2e, 0x0033}, {0x07, 0x0002}, {0x06, 0x0000}, + {0x06, 0x0029}, {0x05, 0x0009}, +}; + +static u16 mt9m001_init[][2] = { + {0x0d, 0x0001}, {0x0d, 0x0000}, {0x01, 0x000e}, + {0x02, 0x0014}, {0x03, 0x03c1}, {0x04, 0x0501}, + {0x05, 0x0083}, {0x06, 0x0006}, {0x0d, 0x0002}, + {0x0a, 0x0000}, {0x0c, 0x0000}, {0x11, 0x0000}, + {0x1e, 0x8000}, {0x5f, 0x8904}, {0x60, 0x0000}, + {0x61, 0x0000}, {0x62, 0x0498}, {0x63, 0x0000}, + {0x64, 0x0000}, {0x20, 0x111d}, {0x06, 0x00f2}, + {0x05, 0x0013}, {0x09, 0x10f2}, {0x07, 0x0003}, + {0x2b, 0x002a}, {0x2d, 0x002a}, {0x2c, 0x002a}, + {0x2e, 0x0029}, {0x07, 0x0002}, +}; + +static u16 mt9m111_init[][2] = { + {0xf0, 0x0000}, {0x0d, 0x0008}, {0x0d, 0x0009}, + {0x0d, 0x0008}, {0xf0, 0x0001}, {0x3a, 0x4300}, + {0x9b, 0x4300}, {0xa1, 0x0280}, {0xa4, 0x0200}, + {0x06, 0x308e}, {0xf0, 0x0000}, +}; + +static u8 hv7131r_init[][2] = { + {0x02, 0x08}, {0x02, 0x00}, {0x01, 0x08}, + {0x02, 0x00}, {0x20, 0x00}, {0x21, 0xd0}, + {0x22, 0x00}, {0x23, 0x09}, {0x01, 0x08}, + {0x01, 0x08}, {0x01, 0x08}, {0x25, 0x07}, + {0x26, 0xc3}, {0x27, 0x50}, {0x30, 0x62}, + {0x31, 0x10}, {0x32, 0x06}, {0x33, 0x10}, + {0x20, 0x00}, {0x21, 0xd0}, {0x22, 0x00}, + {0x23, 0x09}, {0x01, 0x08}, +}; + +int reg_r(struct gspca_dev *gspca_dev, u16 reg, u16 length) +{ + struct usb_device *dev = gspca_dev->dev; + int result; + result = usb_control_msg(dev, usb_rcvctrlpipe(dev, 0), + 0x00, + USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_INTERFACE, + reg, + 0x00, + gspca_dev->usb_buf, + length, + 500); + if (unlikely(result < 0 || result != length)) { + err("Read register failed 0x%02X", reg); + return -EIO; + } + return 0; +} + +int reg_w(struct gspca_dev *gspca_dev, u16 reg, const u8 *buffer, int length) +{ + struct usb_device *dev = gspca_dev->dev; + int result; + memcpy(gspca_dev->usb_buf, buffer, length); + result = usb_control_msg(dev, usb_sndctrlpipe(dev, 0), + 0x08, + USB_DIR_OUT | USB_TYPE_VENDOR | USB_RECIP_INTERFACE, + reg, + 0x00, + gspca_dev->usb_buf, + length, + 500); + if (unlikely(result < 0 || result != length)) { + err("Write register failed index 0x%02X", reg); + return -EIO; + } + return 0; +} + +int reg_w1(struct gspca_dev *gspca_dev, u16 reg, const u8 value) +{ + u8 data[1] = {value}; + return reg_w(gspca_dev, reg, data, 1); +} + +int i2c_w(struct gspca_dev *gspca_dev, const u8 *buffer) +{ + int i; + reg_w(gspca_dev, 0x10c0, buffer, 8); + for (i = 0; i < 5; i++) { + reg_r(gspca_dev, 0x10c0, 1); + if (gspca_dev->usb_buf[0] & 0x04) { + if (gspca_dev->usb_buf[0] & 0x08) + return -1; + return 0; + } + msleep(1); + } + return -1; +} + +int i2c_w1(struct gspca_dev *gspca_dev, u8 reg, u8 val) +{ + struct sd *sd = (struct sd *) gspca_dev; + + u8 row[8]; + + /* + * from the point of view of the bridge, the length + * includes the address + */ + row[0] = 0x81 | (2 << 4); + row[1] = sd->i2c_addr; + row[2] = reg; + row[3] = val; + row[4] = 0x00; + row[5] = 0x00; + row[6] = 0x00; + row[7] = 0x10; + + return i2c_w(gspca_dev, row); +} + +int i2c_w2(struct gspca_dev *gspca_dev, u8 reg, u16 val) +{ + struct sd *sd = (struct sd *) gspca_dev; + u8 row[8]; + + /* + * from the point of view of the bridge, the length + * includes the address + */ + row[0] = 0x81 | (3 << 4); + row[1] = sd->i2c_addr; + row[2] = reg; + row[3] = (val >> 8) & 0xff; + row[4] = val & 0xff; + row[5] = 0x00; + row[6] = 0x00; + row[7] = 0x10; + + return i2c_w(gspca_dev, row); +} + +int i2c_r1(struct gspca_dev *gspca_dev, u8 reg, u8 *val) +{ + struct sd *sd = (struct sd *) gspca_dev; + u8 row[8]; + + row[0] = 0x81 | 0x10; + row[1] = sd->i2c_addr; + row[2] = reg; + row[3] = 0; + row[4] = 0; + row[5] = 0; + row[6] = 0; + row[7] = 0x10; + reg_w(gspca_dev, 0x10c0, row, 8); + msleep(1); + row[0] = 0x81 | (2 << 4) | 0x02; + row[2] = 0; + reg_w(gspca_dev, 0x10c0, row, 8); + msleep(1); + reg_r(gspca_dev, 0x10c2, 5); + *val = gspca_dev->usb_buf[3]; + return 0; +} + +int i2c_r2(struct gspca_dev *gspca_dev, u8 reg, u16 *val) +{ + struct sd *sd = (struct sd *) gspca_dev; + u8 row[8]; + + row[0] = 0x81 | 0x10; + row[1] = sd->i2c_addr; + row[2] = reg; + row[3] = 0; + row[4] = 0; + row[5] = 0; + row[6] = 0; + row[7] = 0x10; + reg_w(gspca_dev, 0x10c0, row, 8); + msleep(1); + row[0] = 0x81 | (3 << 4) | 0x02; + row[2] = 0; + reg_w(gspca_dev, 0x10c0, row, 8); + msleep(1); + reg_r(gspca_dev, 0x10c2, 5); + *val = (gspca_dev->usb_buf[2] << 8) | gspca_dev->usb_buf[3]; + return 0; +} + +static int ov9650_init_sensor(struct gspca_dev *gspca_dev) +{ + int i; + struct sd *sd = (struct sd *) gspca_dev; + + for (i = 0; i < ARRAY_SIZE(ov9650_init); i++) { + if (i2c_w1(gspca_dev, ov9650_init[i][0], + ov9650_init[i][1]) < 0) { + err("OV9650 sensor initialization failed"); + return -ENODEV; + } + } + sd->hstart = 1; + sd->vstart = 7; + return 0; +} + +static int ov9655_init_sensor(struct gspca_dev *gspca_dev) +{ + int i; + struct sd *sd = (struct sd *) gspca_dev; + + for (i = 0; i < ARRAY_SIZE(ov9655_init); i++) { + if (i2c_w1(gspca_dev, ov9655_init[i][0], + ov9655_init[i][1]) < 0) { + err("OV9655 sensor initialization failed"); + return -ENODEV; + } + } + /* disable hflip and vflip */ + gspca_dev->ctrl_dis = (1 << HFLIP_IDX) | (1 << VFLIP_IDX); + sd->hstart = 0; + sd->vstart = 7; + return 0; +} + +static int soi968_init_sensor(struct gspca_dev *gspca_dev) +{ + int i; + struct sd *sd = (struct sd *) gspca_dev; + + for (i = 0; i < ARRAY_SIZE(soi968_init); i++) { + if (i2c_w1(gspca_dev, soi968_init[i][0], + soi968_init[i][1]) < 0) { + err("SOI968 sensor initialization failed"); + return -ENODEV; + } + } + /* disable hflip and vflip */ + gspca_dev->ctrl_dis = (1 << HFLIP_IDX) | (1 << VFLIP_IDX); + sd->hstart = 60; + sd->vstart = 11; + return 0; +} + +static int ov7660_init_sensor(struct gspca_dev *gspca_dev) +{ + int i; + struct sd *sd = (struct sd *) gspca_dev; + + for (i = 0; i < ARRAY_SIZE(ov7660_init); i++) { + if (i2c_w1(gspca_dev, ov7660_init[i][0], + ov7660_init[i][1]) < 0) { + err("OV7660 sensor initialization failed"); + return -ENODEV; + } + } + /* disable hflip and vflip */ + gspca_dev->ctrl_dis = (1 << HFLIP_IDX) | (1 << VFLIP_IDX); + sd->hstart = 1; + sd->vstart = 1; + return 0; +} + +static int ov7670_init_sensor(struct gspca_dev *gspca_dev) +{ + int i; + struct sd *sd = (struct sd *) gspca_dev; + + for (i = 0; i < ARRAY_SIZE(ov7670_init); i++) { + if (i2c_w1(gspca_dev, ov7670_init[i][0], + ov7670_init[i][1]) < 0) { + err("OV7670 sensor initialization failed"); + return -ENODEV; + } + } + /* disable hflip and vflip */ + gspca_dev->ctrl_dis = (1 << HFLIP_IDX) | (1 << VFLIP_IDX); + sd->hstart = 0; + sd->vstart = 1; + return 0; +} + +static int mt9v_init_sensor(struct gspca_dev *gspca_dev) +{ + struct sd *sd = (struct sd *) gspca_dev; + int i; + u16 value; + int ret; + + sd->i2c_addr = 0x5d; + ret = i2c_r2(gspca_dev, 0xff, &value); + if ((ret == 0) && (value == 0x8243)) { + for (i = 0; i < ARRAY_SIZE(mt9v011_init); i++) { + if (i2c_w2(gspca_dev, mt9v011_init[i][0], + mt9v011_init[i][1]) < 0) { + err("MT9V011 sensor initialization failed"); + return -ENODEV; + } + } + sd->hstart = 2; + sd->vstart = 2; + sd->sensor = SENSOR_MT9V011; + info("MT9V011 sensor detected"); + return 0; + } + + sd->i2c_addr = 0x5c; + i2c_w2(gspca_dev, 0x01, 0x0004); + ret = i2c_r2(gspca_dev, 0xff, &value); + if ((ret == 0) && (value == 0x823a)) { + for (i = 0; i < ARRAY_SIZE(mt9v111_init); i++) { + if (i2c_w2(gspca_dev, mt9v111_init[i][0], + mt9v111_init[i][1]) < 0) { + err("MT9V111 sensor initialization failed"); + return -ENODEV; + } + } + sd->hstart = 2; + sd->vstart = 2; + sd->sensor = SENSOR_MT9V111; + info("MT9V111 sensor detected"); + return 0; + } + + sd->i2c_addr = 0x5d; + ret = i2c_w2(gspca_dev, 0xf0, 0x0000); + if (ret < 0) { + sd->i2c_addr = 0x48; + i2c_w2(gspca_dev, 0xf0, 0x0000); + } + ret = i2c_r2(gspca_dev, 0x00, &value); + if ((ret == 0) && (value == 0x1229)) { + for (i = 0; i < ARRAY_SIZE(mt9v112_init); i++) { + if (i2c_w2(gspca_dev, mt9v112_init[i][0], + mt9v112_init[i][1]) < 0) { + err("MT9V112 sensor initialization failed"); + return -ENODEV; + } + } + sd->hstart = 6; + sd->vstart = 2; + sd->sensor = SENSOR_MT9V112; + info("MT9V112 sensor detected"); + return 0; + } + + return -ENODEV; +} + +static int mt9m111_init_sensor(struct gspca_dev *gspca_dev) +{ + struct sd *sd = (struct sd *) gspca_dev; + int i; + for (i = 0; i < ARRAY_SIZE(mt9m111_init); i++) { + if (i2c_w2(gspca_dev, mt9m111_init[i][0], + mt9m111_init[i][1]) < 0) { + err("MT9M111 sensor initialization failed"); + return -ENODEV; + } + } + sd->hstart = 0; + sd->vstart = 2; + return 0; +} + +static int mt9m001_init_sensor(struct gspca_dev *gspca_dev) +{ + struct sd *sd = (struct sd *) gspca_dev; + int i; + for (i = 0; i < ARRAY_SIZE(mt9m001_init); i++) { + if (i2c_w2(gspca_dev, mt9m001_init[i][0], + mt9m001_init[i][1]) < 0) { + err("MT9M001 sensor initialization failed"); + return -ENODEV; + } + } + /* disable hflip and vflip */ + gspca_dev->ctrl_dis = (1 << HFLIP_IDX) | (1 << VFLIP_IDX); + sd->hstart = 2; + sd->vstart = 2; + return 0; +} + +static int hv7131r_init_sensor(struct gspca_dev *gspca_dev) +{ + int i; + struct sd *sd = (struct sd *) gspca_dev; + + for (i = 0; i < ARRAY_SIZE(hv7131r_init); i++) { + if (i2c_w1(gspca_dev, hv7131r_init[i][0], + hv7131r_init[i][1]) < 0) { + err("HV7131R Sensor initialization failed"); + return -ENODEV; + } + } + sd->hstart = 0; + sd->vstart = 1; + return 0; +} + +#ifdef CONFIG_USB_GSPCA_SN9C20X_EVDEV +static int input_kthread(void *data) +{ + struct gspca_dev *gspca_dev = (struct gspca_dev *)data; + struct sd *sd = (struct sd *) gspca_dev; + + DECLARE_WAIT_QUEUE_HEAD(wait); + set_freezable(); + for (;;) { + if (kthread_should_stop()) + break; + + if (reg_r(gspca_dev, 0x1005, 1) < 0) + continue; + + input_report_key(sd->input_dev, + KEY_CAMERA, + gspca_dev->usb_buf[0] & sd->input_gpio); + input_sync(sd->input_dev); + + wait_event_freezable_timeout(wait, + kthread_should_stop(), + msecs_to_jiffies(100)); + } + return 0; +} + + +static int sn9c20x_input_init(struct gspca_dev *gspca_dev) +{ + struct sd *sd = (struct sd *) gspca_dev; + if (sd->input_gpio == 0) + return 0; + + sd->input_dev = input_allocate_device(); + if (!sd->input_dev) + return -ENOMEM; + + sd->input_dev->name = "SN9C20X Webcam"; + + sd->input_dev->phys = kasprintf(GFP_KERNEL, "usb-%s-%s", + gspca_dev->dev->bus->bus_name, + gspca_dev->dev->devpath); + + if (!sd->input_dev->phys) + return -ENOMEM; + + usb_to_input_id(gspca_dev->dev, &sd->input_dev->id); + sd->input_dev->dev.parent = &gspca_dev->dev->dev; + + set_bit(EV_KEY, sd->input_dev->evbit); + set_bit(KEY_CAMERA, sd->input_dev->keybit); + + if (input_register_device(sd->input_dev)) + return -EINVAL; + + sd->input_task = kthread_run(input_kthread, gspca_dev, "sn9c20x/%d", + gspca_dev->vdev.minor); + + if (IS_ERR(sd->input_task)) + return -EINVAL; + + return 0; +} + +static void sn9c20x_input_cleanup(struct gspca_dev *gspca_dev) +{ + struct sd *sd = (struct sd *) gspca_dev; + if (sd->input_task != NULL && !IS_ERR(sd->input_task)) + kthread_stop(sd->input_task); + + if (sd->input_dev != NULL) { + input_unregister_device(sd->input_dev); + kfree(sd->input_dev->phys); + input_free_device(sd->input_dev); + sd->input_dev = NULL; + } +} +#endif + +static int set_cmatrix(struct gspca_dev *gspca_dev) +{ + struct sd *sd = (struct sd *) gspca_dev; + s32 hue_coord, hue_index = 180 + sd->hue; + u8 cmatrix[21]; + memset(cmatrix, 0, 21); + + cmatrix[2] = (sd->contrast * 0x25 / 0x100) + 0x26; + cmatrix[0] = 0x13 + (cmatrix[2] - 0x26) * 0x13 / 0x25; + cmatrix[4] = 0x07 + (cmatrix[2] - 0x26) * 0x07 / 0x25; + cmatrix[18] = sd->brightness - 0x80; + + hue_coord = (hsv_red_x[hue_index] * sd->saturation) >> 8; + cmatrix[6] = (unsigned char)(hue_coord & 0xff); + cmatrix[7] = (unsigned char)((hue_coord >> 8) & 0x0f); + + hue_coord = (hsv_red_y[hue_index] * sd->saturation) >> 8; + cmatrix[8] = (unsigned char)(hue_coord & 0xff); + cmatrix[9] = (unsigned char)((hue_coord >> 8) & 0x0f); + + hue_coord = (hsv_green_x[hue_index] * sd->saturation) >> 8; + cmatrix[10] = (unsigned char)(hue_coord & 0xff); + cmatrix[11] = (unsigned char)((hue_coord >> 8) & 0x0f); + + hue_coord = (hsv_green_y[hue_index] * sd->saturation) >> 8; + cmatrix[12] = (unsigned char)(hue_coord & 0xff); + cmatrix[13] = (unsigned char)((hue_coord >> 8) & 0x0f); + + hue_coord = (hsv_blue_x[hue_index] * sd->saturation) >> 8; + cmatrix[14] = (unsigned char)(hue_coord & 0xff); + cmatrix[15] = (unsigned char)((hue_coord >> 8) & 0x0f); + + hue_coord = (hsv_blue_y[hue_index] * sd->saturation) >> 8; + cmatrix[16] = (unsigned char)(hue_coord & 0xff); + cmatrix[17] = (unsigned char)((hue_coord >> 8) & 0x0f); + + return reg_w(gspca_dev, 0x10e1, cmatrix, 21); +} + +static int set_gamma(struct gspca_dev *gspca_dev) +{ + struct sd *sd = (struct sd *) gspca_dev; + u8 gamma[17]; + u8 gval = sd->gamma * 0xb8 / 0x100; + + + gamma[0] = 0x0a; + gamma[1] = 0x13 + (gval * (0xcb - 0x13) / 0xb8); + gamma[2] = 0x25 + (gval * (0xee - 0x25) / 0xb8); + gamma[3] = 0x37 + (gval * (0xfa - 0x37) / 0xb8); + gamma[4] = 0x45 + (gval * (0xfc - 0x45) / 0xb8); + gamma[5] = 0x55 + (gval * (0xfb - 0x55) / 0xb8); + gamma[6] = 0x65 + (gval * (0xfc - 0x65) / 0xb8); + gamma[7] = 0x74 + (gval * (0xfd - 0x74) / 0xb8); + gamma[8] = 0x83 + (gval * (0xfe - 0x83) / 0xb8); + gamma[9] = 0x92 + (gval * (0xfc - 0x92) / 0xb8); + gamma[10] = 0xa1 + (gval * (0xfc - 0xa1) / 0xb8); + gamma[11] = 0xb0 + (gval * (0xfc - 0xb0) / 0xb8); + gamma[12] = 0xbf + (gval * (0xfb - 0xbf) / 0xb8); + gamma[13] = 0xce + (gval * (0xfb - 0xce) / 0xb8); + gamma[14] = 0xdf + (gval * (0xfd - 0xdf) / 0xb8); + gamma[15] = 0xea + (gval * (0xf9 - 0xea) / 0xb8); + gamma[16] = 0xf5; + + return reg_w(gspca_dev, 0x1190, gamma, 17); +} + +static int set_redblue(struct gspca_dev *gspca_dev) +{ + struct sd *sd = (struct sd *) gspca_dev; + reg_w1(gspca_dev, 0x118c, sd->red); + reg_w1(gspca_dev, 0x118f, sd->blue); + return 0; +} + +static int set_hvflip(struct gspca_dev *gspca_dev) +{ + u8 value, tslb; + u16 value2; + struct sd *sd = (struct sd *) gspca_dev; + switch (sd->sensor) { + case SENSOR_OV9650: + i2c_r1(gspca_dev, 0x1e, &value); + value &= ~0x30; + tslb = 0x01; + if (sd->hflip) + value |= 0x20; + if (sd->vflip) { + value |= 0x10; + tslb = 0x49; + } + i2c_w1(gspca_dev, 0x1e, value); + i2c_w1(gspca_dev, 0x3a, tslb); + break; + case SENSOR_MT9V111: + case SENSOR_MT9V011: + i2c_r2(gspca_dev, 0x20, &value2); + value2 &= ~0xc0a0; + if (sd->hflip) + value2 |= 0x8080; + if (sd->vflip) + value2 |= 0x4020; + i2c_w2(gspca_dev, 0x20, value2); + break; + case SENSOR_MT9M111: + case SENSOR_MT9V112: + i2c_r2(gspca_dev, 0x20, &value2); + value2 &= ~0x0003; + if (sd->hflip) + value2 |= 0x0002; + if (sd->vflip) + value2 |= 0x0001; + i2c_w2(gspca_dev, 0x20, value2); + break; + case SENSOR_HV7131R: + i2c_r1(gspca_dev, 0x01, &value); + value &= ~0x03; + if (sd->vflip) + value |= 0x01; + if (sd->hflip) + value |= 0x02; + i2c_w1(gspca_dev, 0x01, value); + break; + } + return 0; +} + +static int set_exposure(struct gspca_dev *gspca_dev) +{ + struct sd *sd = (struct sd *) gspca_dev; + u8 exp[8] = {0x81, sd->i2c_addr, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1e}; + switch (sd->sensor) { + case SENSOR_OV7660: + case SENSOR_OV7670: + case SENSOR_SOI968: + case SENSOR_OV9655: + case SENSOR_OV9650: + exp[0] |= (3 << 4); + exp[2] = 0x2d; + exp[3] = sd->exposure & 0xff; + exp[4] = sd->exposure >> 8; + break; + case SENSOR_MT9M001: + case SENSOR_MT9M111: + case SENSOR_MT9V112: + case SENSOR_MT9V111: + case SENSOR_MT9V011: + exp[0] |= (3 << 4); + exp[2] = 0x09; + exp[3] = sd->exposure >> 8; + exp[4] = sd->exposure & 0xff; + break; + case SENSOR_HV7131R: + exp[0] |= (4 << 4); + exp[2] = 0x25; + exp[3] = ((sd->exposure * 0xffffff) / 0xffff) >> 16; + exp[4] = ((sd->exposure * 0xffffff) / 0xffff) >> 8; + exp[5] = ((sd->exposure * 0xffffff) / 0xffff) & 0xff; + break; + } + i2c_w(gspca_dev, exp); + return 0; +} + +static int set_gain(struct gspca_dev *gspca_dev) +{ + struct sd *sd = (struct sd *) gspca_dev; + u8 gain[8] = {0x81, sd->i2c_addr, 0x00, 0x00, 0x00, 0x00, 0x00, 0x1d}; + switch (sd->sensor) { + case SENSOR_OV7660: + case SENSOR_OV7670: + case SENSOR_SOI968: + case SENSOR_OV9655: + case SENSOR_OV9650: + gain[0] |= (2 << 4); + gain[3] = ov_gain[sd->gain]; + break; + case SENSOR_MT9V011: + case SENSOR_MT9V111: + gain[0] |= (3 << 4); + gain[2] = 0x35; + gain[3] = micron1_gain[sd->gain] >> 8; + gain[4] = micron1_gain[sd->gain] & 0xff; + break; + case SENSOR_MT9V112: + case SENSOR_MT9M111: + gain[0] |= (3 << 4); + gain[2] = 0x2f; + gain[3] = micron1_gain[sd->gain] >> 8; + gain[4] = micron1_gain[sd->gain] & 0xff; + break; + case SENSOR_MT9M001: + gain[0] |= (3 << 4); + gain[2] = 0x2f; + gain[3] = micron2_gain[sd->gain] >> 8; + gain[4] = micron2_gain[sd->gain] & 0xff; + break; + case SENSOR_HV7131R: + gain[0] |= (2 << 4); + gain[2] = 0x30; + gain[3] = hv7131r_gain[sd->gain]; + break; + } + i2c_w(gspca_dev, gain); + return 0; +} + +static int sd_setbrightness(struct gspca_dev *gspca_dev, s32 val) +{ + struct sd *sd = (struct sd *) gspca_dev; + + sd->brightness = val; + if (gspca_dev->streaming) + return set_cmatrix(gspca_dev); + return 0; +} + +static int sd_getbrightness(struct gspca_dev *gspca_dev, s32 *val) +{ + struct sd *sd = (struct sd *) gspca_dev; + *val = sd->brightness; + return 0; +} + + +static int sd_setcontrast(struct gspca_dev *gspca_dev, s32 val) +{ + struct sd *sd = (struct sd *) gspca_dev; + + sd->contrast = val; + if (gspca_dev->streaming) + return set_cmatrix(gspca_dev); + return 0; +} + +static int sd_getcontrast(struct gspca_dev *gspca_dev, s32 *val) +{ + struct sd *sd = (struct sd *) gspca_dev; + *val = sd->contrast; + return 0; +} + +static int sd_setsaturation(struct gspca_dev *gspca_dev, s32 val) +{ + struct sd *sd = (struct sd *) gspca_dev; + + sd->saturation = val; + if (gspca_dev->streaming) + return set_cmatrix(gspca_dev); + return 0; +} + +static int sd_getsaturation(struct gspca_dev *gspca_dev, s32 *val) +{ + struct sd *sd = (struct sd *) gspca_dev; + *val = sd->saturation; + return 0; +} + +static int sd_sethue(struct gspca_dev *gspca_dev, s32 val) +{ + struct sd *sd = (struct sd *) gspca_dev; + + sd->hue = val; + if (gspca_dev->streaming) + return set_cmatrix(gspca_dev); + return 0; +} + +static int sd_gethue(struct gspca_dev *gspca_dev, s32 *val) +{ + struct sd *sd = (struct sd *) gspca_dev; + *val = sd->hue; + return 0; +} + +static int sd_setgamma(struct gspca_dev *gspca_dev, s32 val) +{ + struct sd *sd = (struct sd *) gspca_dev; + + sd->gamma = val; + if (gspca_dev->streaming) + return set_gamma(gspca_dev); + return 0; +} + +static int sd_getgamma(struct gspca_dev *gspca_dev, s32 *val) +{ + struct sd *sd = (struct sd *) gspca_dev; + *val = sd->gamma; + return 0; +} + +static int sd_setredbalance(struct gspca_dev *gspca_dev, s32 val) +{ + struct sd *sd = (struct sd *) gspca_dev; + + sd->red = val; + if (gspca_dev->streaming) + return set_redblue(gspca_dev); + return 0; +} + +static int sd_getredbalance(struct gspca_dev *gspca_dev, s32 *val) +{ + struct sd *sd = (struct sd *) gspca_dev; + *val = sd->red; + return 0; +} + +static int sd_setbluebalance(struct gspca_dev *gspca_dev, s32 val) +{ + struct sd *sd = (struct sd *) gspca_dev; + + sd->blue = val; + if (gspca_dev->streaming) + return set_redblue(gspca_dev); + return 0; +} + +static int sd_getbluebalance(struct gspca_dev *gspca_dev, s32 *val) +{ + struct sd *sd = (struct sd *) gspca_dev; + *val = sd->blue; + return 0; +} + +static int sd_sethflip(struct gspca_dev *gspca_dev, s32 val) +{ + struct sd *sd = (struct sd *) gspca_dev; + + sd->hflip = val; + if (gspca_dev->streaming) + return set_hvflip(gspca_dev); + return 0; +} + +static int sd_gethflip(struct gspca_dev *gspca_dev, s32 *val) +{ + struct sd *sd = (struct sd *) gspca_dev; + *val = sd->hflip; + return 0; +} + +static int sd_setvflip(struct gspca_dev *gspca_dev, s32 val) +{ + struct sd *sd = (struct sd *) gspca_dev; + + sd->vflip = val; + if (gspca_dev->streaming) + return set_hvflip(gspca_dev); + return 0; +} + +static int sd_getvflip(struct gspca_dev *gspca_dev, s32 *val) +{ + struct sd *sd = (struct sd *) gspca_dev; + *val = sd->vflip; + return 0; +} + +static int sd_setexposure(struct gspca_dev *gspca_dev, s32 val) +{ + struct sd *sd = (struct sd *) gspca_dev; + + sd->exposure = val; + if (gspca_dev->streaming) + return set_exposure(gspca_dev); + return 0; +} + +static int sd_getexposure(struct gspca_dev *gspca_dev, s32 *val) +{ + struct sd *sd = (struct sd *) gspca_dev; + *val = sd->exposure; + return 0; +} + +static int sd_setgain(struct gspca_dev *gspca_dev, s32 val) +{ + struct sd *sd = (struct sd *) gspca_dev; + + sd->gain = val; + if (gspca_dev->streaming) + return set_gain(gspca_dev); + return 0; +} + +static int sd_getgain(struct gspca_dev *gspca_dev, s32 *val) +{ + struct sd *sd = (struct sd *) gspca_dev; + *val = sd->gain; + return 0; +} + +static int sd_setautoexposure(struct gspca_dev *gspca_dev, s32 val) +{ + struct sd *sd = (struct sd *) gspca_dev; + sd->auto_exposure = val; + return 0; +} + +static int sd_getautoexposure(struct gspca_dev *gspca_dev, s32 *val) +{ + struct sd *sd = (struct sd *) gspca_dev; + *val = sd->auto_exposure; + return 0; +} + +#ifdef CONFIG_VIDEO_ADV_DEBUG +static int sd_dbg_g_register(struct gspca_dev *gspca_dev, + struct v4l2_dbg_register *reg) +{ + struct sd *sd = (struct sd *) gspca_dev; + switch (reg->match.type) { + case V4L2_CHIP_MATCH_HOST: + if (reg->match.addr != 0) + return -EINVAL; + if (reg->reg < 0x1000 || reg->reg > 0x11ff) + return -EINVAL; + if (reg_r(gspca_dev, reg->reg, 1) < 0) + return -EINVAL; + reg->val = gspca_dev->usb_buf[0]; + return 0; + case V4L2_CHIP_MATCH_I2C_ADDR: + if (reg->match.addr != sd->i2c_addr) + return -EINVAL; + if (sd->sensor >= SENSOR_MT9V011 && + sd->sensor <= SENSOR_MT9M111) { + if (i2c_r2(gspca_dev, reg->reg, (u16 *)®->val) < 0) + return -EINVAL; + } else { + if (i2c_r1(gspca_dev, reg->reg, (u8 *)®->val) < 0) + return -EINVAL; + } + return 0; + } + return -EINVAL; +} + +static int sd_dbg_s_register(struct gspca_dev *gspca_dev, + struct v4l2_dbg_register *reg) +{ + struct sd *sd = (struct sd *) gspca_dev; + switch (reg->match.type) { + case V4L2_CHIP_MATCH_HOST: + if (reg->match.addr != 0) + return -EINVAL; + if (reg->reg < 0x1000 || reg->reg > 0x11ff) + return -EINVAL; + if (reg_w1(gspca_dev, reg->reg, reg->val) < 0) + return -EINVAL; + return 0; + case V4L2_CHIP_MATCH_I2C_ADDR: + if (reg->match.addr != sd->i2c_addr) + return -EINVAL; + if (sd->sensor >= SENSOR_MT9V011 && + sd->sensor <= SENSOR_MT9M111) { + if (i2c_w2(gspca_dev, reg->reg, reg->val) < 0) + return -EINVAL; + } else { + if (i2c_w1(gspca_dev, reg->reg, reg->val) < 0) + return -EINVAL; + } + return 0; + } + return -EINVAL; +} +#endif + +static int sd_chip_ident(struct gspca_dev *gspca_dev, + struct v4l2_dbg_chip_ident *chip) +{ + struct sd *sd = (struct sd *) gspca_dev; + + switch (chip->match.type) { + case V4L2_CHIP_MATCH_HOST: + if (chip->match.addr != 0) + return -EINVAL; + chip->revision = 0; + chip->ident = V4L2_IDENT_SN9C20X; + return 0; + case V4L2_CHIP_MATCH_I2C_ADDR: + if (chip->match.addr != sd->i2c_addr) + return -EINVAL; + chip->revision = 0; + chip->ident = i2c_ident[sd->sensor]; + return 0; + } + return -EINVAL; +} + +static int sd_config(struct gspca_dev *gspca_dev, + const struct usb_device_id *id) +{ + struct sd *sd = (struct sd *) gspca_dev; + struct cam *cam; + + cam = &gspca_dev->cam; + + sd->sensor = (id->driver_info >> 8) & 0xff; + sd->i2c_addr = id->driver_info & 0xff; + + switch (sd->sensor) { + case SENSOR_OV9650: + cam->cam_mode = sxga_mode; + cam->nmodes = ARRAY_SIZE(sxga_mode); + break; + default: + cam->cam_mode = vga_mode; + cam->nmodes = ARRAY_SIZE(vga_mode); + } + + sd->old_step = 0; + sd->older_step = 0; + sd->exposure_step = 16; + + sd->brightness = BRIGHTNESS_DEFAULT; + sd->contrast = CONTRAST_DEFAULT; + sd->saturation = SATURATION_DEFAULT; + sd->hue = HUE_DEFAULT; + sd->gamma = GAMMA_DEFAULT; + sd->red = RED_DEFAULT; + sd->blue = BLUE_DEFAULT; + + sd->hflip = HFLIP_DEFAULT; + sd->vflip = VFLIP_DEFAULT; + sd->exposure = EXPOSURE_DEFAULT; + sd->gain = GAIN_DEFAULT; + sd->auto_exposure = AUTO_EXPOSURE_DEFAULT; + + sd->quality = 95; + +#ifdef CONFIG_USB_GSPCA_SN9C20X_EVDEV + sd->input_gpio = (id->driver_info >> 16) & 0xff; + if (sn9c20x_input_init(gspca_dev) < 0) + return -ENODEV; +#endif + return 0; +} + +static int sd_init(struct gspca_dev *gspca_dev) +{ + struct sd *sd = (struct sd *) gspca_dev; + int i; + u8 value; + u8 i2c_init[9] = + {0x80, sd->i2c_addr, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03}; + + for (i = 0; i < ARRAY_SIZE(bridge_init); i++) { + value = bridge_init[i][1]; + if (reg_w(gspca_dev, bridge_init[i][0], &value, 1) < 0) { + err("Device initialization failed"); + return -ENODEV; + } + } + + if (reg_w(gspca_dev, 0x10c0, i2c_init, 9) < 0) { + err("Device initialization failed"); + return -ENODEV; + } + + switch (sd->sensor) { + case SENSOR_OV9650: + if (ov9650_init_sensor(gspca_dev) < 0) + return -ENODEV; + info("OV9650 sensor detected"); + break; + case SENSOR_OV9655: + if (ov9655_init_sensor(gspca_dev) < 0) + return -ENODEV; + info("OV9655 sensor detected"); + break; + case SENSOR_SOI968: + if (soi968_init_sensor(gspca_dev) < 0) + return -ENODEV; + info("SOI968 sensor detected"); + break; + case SENSOR_OV7660: + if (ov7660_init_sensor(gspca_dev) < 0) + return -ENODEV; + info("OV7660 sensor detected"); + break; + case SENSOR_OV7670: + if (ov7670_init_sensor(gspca_dev) < 0) + return -ENODEV; + info("OV7670 sensor detected"); + break; + case SENSOR_MT9VPRB: + if (mt9v_init_sensor(gspca_dev) < 0) + return -ENODEV; + break; + case SENSOR_MT9M111: + if (mt9m111_init_sensor(gspca_dev) < 0) + return -ENODEV; + info("MT9M111 sensor detected"); + break; + case SENSOR_MT9M001: + if (mt9m001_init_sensor(gspca_dev) < 0) + return -ENODEV; + info("MT9M001 sensor detected"); + break; + case SENSOR_HV7131R: + if (hv7131r_init_sensor(gspca_dev) < 0) + return -ENODEV; + info("HV7131R sensor detected"); + break; + default: + info("Unsupported Sensor"); + return -ENODEV; + } + + return 0; +} + +static void configure_sensor_output(struct gspca_dev *gspca_dev, int mode) +{ + struct sd *sd = (struct sd *) gspca_dev; + u8 value; + switch (sd->sensor) { + case SENSOR_OV9650: + if (mode & MODE_SXGA) { + i2c_w1(gspca_dev, 0x17, 0x1b); + i2c_w1(gspca_dev, 0x18, 0xbc); + i2c_w1(gspca_dev, 0x19, 0x01); + i2c_w1(gspca_dev, 0x1a, 0x82); + i2c_r1(gspca_dev, 0x12, &value); + i2c_w1(gspca_dev, 0x12, value & 0x07); + } else { + i2c_w1(gspca_dev, 0x17, 0x24); + i2c_w1(gspca_dev, 0x18, 0xc5); + i2c_w1(gspca_dev, 0x19, 0x00); + i2c_w1(gspca_dev, 0x1a, 0x3c); + i2c_r1(gspca_dev, 0x12, &value); + i2c_w1(gspca_dev, 0x12, (value & 0x7) | 0x40); + } + break; + } +} + +#define HW_WIN(mode, hstart, vstart) \ +((const u8 []){hstart & 0xff, hstart >> 8, \ +vstart & 0xff, vstart >> 8, \ +(mode & MODE_SXGA ? 1280 >> 4 : 640 >> 4), \ +(mode & MODE_SXGA ? 1024 >> 3 : 480 >> 3)}) + +#define CLR_WIN(width, height) \ +((const u8 [])\ +{0, width >> 2, 0, height >> 1,\ +((width >> 10) & 0x01) | ((height >> 8) & 0x6)}) + +static int sd_start(struct gspca_dev *gspca_dev) +{ + struct sd *sd = (struct sd *) gspca_dev; + int mode = gspca_dev->cam.cam_mode[(int) gspca_dev->curr_mode].priv; + int width = gspca_dev->width; + int height = gspca_dev->height; + u8 fmt, scale = 0; + + sd->jpeg_hdr = kmalloc(JPEG_HDR_SZ, GFP_KERNEL); + if (sd->jpeg_hdr == NULL) + return -ENOMEM; + + jpeg_define(sd->jpeg_hdr, height, width, + 0x21); + jpeg_set_qual(sd->jpeg_hdr, sd->quality); + + if (mode & MODE_RAW) + fmt = 0x2d; + else if (mode & MODE_JPEG) + fmt = 0x2c; + else + fmt = 0x2f; + + switch (mode & 0x0f) { + case 3: + scale = 0xc0; + info("Set 1280x1024"); + break; + case 2: + scale = 0x80; + info("Set 640x480"); + break; + case 1: + scale = 0x90; + info("Set 320x240"); + break; + case 0: + scale = 0xa0; + info("Set 160x120"); + break; + } + + configure_sensor_output(gspca_dev, mode); + reg_w(gspca_dev, 0x1100, sd->jpeg_hdr + JPEG_QT0_OFFSET, 64); + reg_w(gspca_dev, 0x1140, sd->jpeg_hdr + JPEG_QT1_OFFSET, 64); + reg_w(gspca_dev, 0x10fb, CLR_WIN(width, height), 5); + reg_w(gspca_dev, 0x1180, HW_WIN(mode, sd->hstart, sd->vstart), 6); + reg_w1(gspca_dev, 0x1189, scale); + reg_w1(gspca_dev, 0x10e0, fmt); + + set_cmatrix(gspca_dev); + set_gamma(gspca_dev); + set_redblue(gspca_dev); + set_gain(gspca_dev); + set_exposure(gspca_dev); + set_hvflip(gspca_dev); + + reg_r(gspca_dev, 0x1061, 1); + reg_w1(gspca_dev, 0x1061, gspca_dev->usb_buf[0] | 0x02); + return 0; +} + +static void sd_stopN(struct gspca_dev *gspca_dev) +{ + reg_r(gspca_dev, 0x1061, 1); + reg_w1(gspca_dev, 0x1061, gspca_dev->usb_buf[0] & ~0x02); +} + +static void sd_stop0(struct gspca_dev *gspca_dev) +{ + struct sd *sd = (struct sd *) gspca_dev; + kfree(sd->jpeg_hdr); +} + +static void do_autoexposure(struct gspca_dev *gspca_dev) +{ + struct sd *sd = (struct sd *) gspca_dev; + int avg_lum, new_exp; + + if (!sd->auto_exposure) + return; + + avg_lum = atomic_read(&sd->avg_lum); + + /* + * some hardcoded values are present + * like those for maximal/minimal exposure + * and exposure steps + */ + if (avg_lum < MIN_AVG_LUM) { + if (sd->exposure > 0x1770) + return; + + new_exp = sd->exposure + sd->exposure_step; + if (new_exp > 0x1770) + new_exp = 0x1770; + if (new_exp < 0x10) + new_exp = 0x10; + sd->exposure = new_exp; + set_exposure(gspca_dev); + + sd->older_step = sd->old_step; + sd->old_step = 1; + + if (sd->old_step ^ sd->older_step) + sd->exposure_step /= 2; + else + sd->exposure_step += 2; + } + if (avg_lum > MAX_AVG_LUM) { + if (sd->exposure < 0x10) + return; + new_exp = sd->exposure - sd->exposure_step; + if (new_exp > 0x1700) + new_exp = 0x1770; + if (new_exp < 0x10) + new_exp = 0x10; + sd->exposure = new_exp; + set_exposure(gspca_dev); + sd->older_step = sd->old_step; + sd->old_step = 0; + + if (sd->old_step ^ sd->older_step) + sd->exposure_step /= 2; + else + sd->exposure_step += 2; + } +} + +static void sd_pkt_scan(struct gspca_dev *gspca_dev, + struct gspca_frame *frame, /* target */ + u8 *data, /* isoc packet */ + int len) /* iso packet length */ +{ + struct sd *sd = (struct sd *) gspca_dev; + int avg_lum; + static unsigned char frame_header[] = + {0xff, 0xff, 0x00, 0xc4, 0xc4, 0x96}; + if (len == 64 && memcmp(data, frame_header, 6) == 0) { + avg_lum = ((data[35] >> 2) & 3) | + (data[20] << 2) | + (data[19] << 10); + avg_lum += ((data[35] >> 4) & 3) | + (data[22] << 2) | + (data[21] << 10); + avg_lum += ((data[35] >> 6) & 3) | + (data[24] << 2) | + (data[23] << 10); + avg_lum += (data[36] & 3) | + (data[26] << 2) | + (data[25] << 10); + avg_lum += ((data[36] >> 2) & 3) | + (data[28] << 2) | + (data[27] << 10); + avg_lum += ((data[36] >> 4) & 3) | + (data[30] << 2) | + (data[29] << 10); + avg_lum += ((data[36] >> 6) & 3) | + (data[32] << 2) | + (data[31] << 10); + avg_lum += ((data[44] >> 4) & 3) | + (data[34] << 2) | + (data[33] << 10); + avg_lum >>= 9; + atomic_set(&sd->avg_lum, avg_lum); + gspca_frame_add(gspca_dev, LAST_PACKET, + frame, data, len); + return; + } + if (gspca_dev->last_packet_type == LAST_PACKET) { + if (gspca_dev->cam.cam_mode[(int) gspca_dev->curr_mode].priv + & MODE_JPEG) { + gspca_frame_add(gspca_dev, FIRST_PACKET, frame, + sd->jpeg_hdr, JPEG_HDR_SZ); + gspca_frame_add(gspca_dev, INTER_PACKET, frame, + data, len); + } else { + gspca_frame_add(gspca_dev, FIRST_PACKET, frame, + data, len); + } + } else { + gspca_frame_add(gspca_dev, INTER_PACKET, frame, data, len); + } +} + +/* sub-driver description */ +static const struct sd_desc sd_desc = { + .name = MODULE_NAME, + .ctrls = sd_ctrls, + .nctrls = ARRAY_SIZE(sd_ctrls), + .config = sd_config, + .init = sd_init, + .start = sd_start, + .stopN = sd_stopN, + .stop0 = sd_stop0, + .pkt_scan = sd_pkt_scan, + .dq_callback = do_autoexposure, +#ifdef CONFIG_VIDEO_ADV_DEBUG + .set_register = sd_dbg_s_register, + .get_register = sd_dbg_g_register, +#endif + .get_chip_ident = sd_chip_ident, +}; + +#define SN9C20X(sensor, i2c_addr, button_mask) \ + .driver_info = (button_mask << 16) \ + | (SENSOR_ ## sensor << 8) \ + | (i2c_addr) + +static const __devinitdata struct usb_device_id device_table[] = { + {USB_DEVICE(0x0c45, 0x6240), SN9C20X(MT9M001, 0x5d, 0)}, + {USB_DEVICE(0x0c45, 0x6242), SN9C20X(MT9M111, 0x5d, 0)}, + {USB_DEVICE(0x0c45, 0x6248), SN9C20X(OV9655, 0x30, 0)}, + {USB_DEVICE(0x0c45, 0x624e), SN9C20X(SOI968, 0x30, 0x10)}, + {USB_DEVICE(0x0c45, 0x624f), SN9C20X(OV9650, 0x30, 0)}, + {USB_DEVICE(0x0c45, 0x6251), SN9C20X(OV9650, 0x30, 0)}, + {USB_DEVICE(0x0c45, 0x6253), SN9C20X(OV9650, 0x30, 0)}, + {USB_DEVICE(0x0c45, 0x6260), SN9C20X(OV7670, 0x21, 0)}, + {USB_DEVICE(0x0c45, 0x6270), SN9C20X(MT9VPRB, 0x00, 0)}, + {USB_DEVICE(0x0c45, 0x627b), SN9C20X(OV7660, 0x21, 0)}, + {USB_DEVICE(0x0c45, 0x627c), SN9C20X(HV7131R, 0x11, 0)}, + {USB_DEVICE(0x0c45, 0x627f), SN9C20X(OV9650, 0x30, 0)}, + {USB_DEVICE(0x0c45, 0x6280), SN9C20X(MT9M001, 0x5d, 0)}, + {USB_DEVICE(0x0c45, 0x6282), SN9C20X(MT9M111, 0x5d, 0)}, + {USB_DEVICE(0x0c45, 0x6288), SN9C20X(OV9655, 0x30, 0)}, + {USB_DEVICE(0x0c45, 0x628e), SN9C20X(SOI968, 0x30, 0)}, + {USB_DEVICE(0x0c45, 0x628f), SN9C20X(OV9650, 0x30, 0)}, + {USB_DEVICE(0x0c45, 0x62a0), SN9C20X(OV7670, 0x21, 0)}, + {USB_DEVICE(0x0c45, 0x62b0), SN9C20X(MT9VPRB, 0x00, 0)}, + {USB_DEVICE(0x0c45, 0x62b3), SN9C20X(OV9655, 0x30, 0)}, + {USB_DEVICE(0x0c45, 0x62bb), SN9C20X(OV7660, 0x21, 0)}, + {USB_DEVICE(0x0c45, 0x62bc), SN9C20X(HV7131R, 0x11, 0)}, + {USB_DEVICE(0x045e, 0x00f4), SN9C20X(OV9650, 0x30, 0)}, + {USB_DEVICE(0x145f, 0x013d), SN9C20X(OV7660, 0x21, 0)}, + {USB_DEVICE(0x0458, 0x7029), SN9C20X(HV7131R, 0x11, 0)}, + {USB_DEVICE(0xa168, 0x0610), SN9C20X(HV7131R, 0x11, 0)}, + {USB_DEVICE(0xa168, 0x0611), SN9C20X(HV7131R, 0x11, 0)}, + {USB_DEVICE(0xa168, 0x0613), SN9C20X(HV7131R, 0x11, 0)}, + {USB_DEVICE(0xa168, 0x0618), SN9C20X(HV7131R, 0x11, 0)}, + {USB_DEVICE(0xa168, 0x0614), SN9C20X(MT9M111, 0x5d, 0)}, + {USB_DEVICE(0xa168, 0x0615), SN9C20X(MT9M111, 0x5d, 0)}, + {USB_DEVICE(0xa168, 0x0617), SN9C20X(MT9M111, 0x5d, 0)}, + {} +}; +MODULE_DEVICE_TABLE(usb, device_table); + +/* -- device connect -- */ +static int sd_probe(struct usb_interface *intf, + const struct usb_device_id *id) +{ + return gspca_dev_probe(intf, id, &sd_desc, sizeof(struct sd), + THIS_MODULE); +} + +static void sd_disconnect(struct usb_interface *intf) +{ +#ifdef CONFIG_USB_GSPCA_SN9C20X_EVDEV + struct gspca_dev *gspca_dev = usb_get_intfdata(intf); + + sn9c20x_input_cleanup(gspca_dev); +#endif + + gspca_disconnect(intf); +} + +static struct usb_driver sd_driver = { + .name = MODULE_NAME, + .id_table = device_table, + .probe = sd_probe, + .disconnect = sd_disconnect, +#ifdef CONFIG_PM + .suspend = gspca_suspend, + .resume = gspca_resume, + .reset_resume = gspca_resume, +#endif +}; + +/* -- module insert / remove -- */ +static int __init sd_mod_init(void) +{ + int ret; + ret = usb_register(&sd_driver); + if (ret < 0) + return ret; + info("registered"); + return 0; +} +static void __exit sd_mod_exit(void) +{ + usb_deregister(&sd_driver); + info("deregistered"); +} + +module_init(sd_mod_init); +module_exit(sd_mod_exit); diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h index 95846d988011..74f16876f38d 100644 --- a/include/linux/videodev2.h +++ b/include/linux/videodev2.h @@ -338,6 +338,7 @@ struct v4l2_pix_format { /* Vendor-specific formats */ #define V4L2_PIX_FMT_WNVA v4l2_fourcc('W', 'N', 'V', 'A') /* Winnov hw compress */ #define V4L2_PIX_FMT_SN9C10X v4l2_fourcc('S', '9', '1', '0') /* SN9C10x compression */ +#define V4L2_PIX_FMT_SN9C20X_I420 v4l2_fourcc('S', '9', '2', '0') /* SN9C20x YUV 4:2:0 */ #define V4L2_PIX_FMT_PWC1 v4l2_fourcc('P', 'W', 'C', '1') /* pwc older webcam */ #define V4L2_PIX_FMT_PWC2 v4l2_fourcc('P', 'W', 'C', '2') /* pwc newer webcam */ #define V4L2_PIX_FMT_ET61X251 v4l2_fourcc('E', '6', '2', '5') /* ET61X251 compression */ diff --git a/include/media/v4l2-chip-ident.h b/include/media/v4l2-chip-ident.h index 11a4a2d3e364..94e908c0d7a0 100644 --- a/include/media/v4l2-chip-ident.h +++ b/include/media/v4l2-chip-ident.h @@ -60,6 +60,10 @@ enum { V4L2_IDENT_OV7670 = 250, V4L2_IDENT_OV7720 = 251, V4L2_IDENT_OV7725 = 252, + V4L2_IDENT_OV7660 = 253, + V4L2_IDENT_OV9650 = 254, + V4L2_IDENT_OV9655 = 255, + V4L2_IDENT_SOI968 = 256, /* module saa7146: reserved range 300-309 */ V4L2_IDENT_SAA7146 = 300, @@ -161,6 +165,9 @@ enum { /* module tw9910: just ident 9910 */ V4L2_IDENT_TW9910 = 9910, + /* module sn9c20x: just ident 10000 */ + V4L2_IDENT_SN9C20X = 10000, + /* module msp3400: reserved range 34000-34999 and 44000-44999 */ V4L2_IDENT_MSPX4XX = 34000, /* generic MSPX4XX identifier, only use internally (tveeprom.c). */ @@ -237,6 +244,11 @@ enum { V4L2_IDENT_MT9V022IX7ATC = 45010, /* No way to detect "normal" I77ATx */ V4L2_IDENT_MT9V022IX7ATM = 45015, /* and "lead free" IA7ATx chips */ V4L2_IDENT_MT9T031 = 45020, + V4L2_IDENT_MT9V111 = 45031, + V4L2_IDENT_MT9V112 = 45032, + + /* HV7131R CMOS sensor: just ident 46000 */ + V4L2_IDENT_HV7131R = 46000, /* module cs53132a: just ident 53132 */ V4L2_IDENT_CS53l32A = 53132, -- cgit v1.2.3-59-g8ed1b From dcf777f6ed9799c5ac90ac17a5c369e6b73ca92e Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Sun, 26 Jul 2009 19:11:14 -0700 Subject: NET: ROSE: Don't use static buffer. The use of a static buffer in rose2asc() to return its result is not threadproof and can result in corruption if multiple threads are trying to use one of the procfs files based on rose2asc(). Signed-off-by: Ralf Baechle Signed-off-by: David S. Miller --- include/net/rose.h | 2 +- net/rose/af_rose.c | 18 ++++++++---------- net/rose/rose_route.c | 23 ++++++++++++----------- 3 files changed, 21 insertions(+), 22 deletions(-) (limited to 'include') diff --git a/include/net/rose.h b/include/net/rose.h index cbd5364b2c8a..5ba9f02731eb 100644 --- a/include/net/rose.h +++ b/include/net/rose.h @@ -156,7 +156,7 @@ extern int sysctl_rose_maximum_vcs; extern int sysctl_rose_window_size; extern int rosecmp(rose_address *, rose_address *); extern int rosecmpm(rose_address *, rose_address *, unsigned short); -extern const char *rose2asc(const rose_address *); +extern char *rose2asc(char *buf, const rose_address *); extern struct sock *rose_find_socket(unsigned int, struct rose_neigh *); extern void rose_kill_by_neigh(struct rose_neigh *); extern unsigned int rose_new_lci(struct rose_neigh *); diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 6bd8e93869ed..f0a76f6bca71 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -92,23 +92,21 @@ static void rose_set_lockdep_key(struct net_device *dev) /* * Convert a ROSE address into text. */ -const char *rose2asc(const rose_address *addr) +char *rose2asc(char *buf, const rose_address *addr) { - static char buffer[11]; - if (addr->rose_addr[0] == 0x00 && addr->rose_addr[1] == 0x00 && addr->rose_addr[2] == 0x00 && addr->rose_addr[3] == 0x00 && addr->rose_addr[4] == 0x00) { - strcpy(buffer, "*"); + strcpy(buf, "*"); } else { - sprintf(buffer, "%02X%02X%02X%02X%02X", addr->rose_addr[0] & 0xFF, + sprintf(buf, "%02X%02X%02X%02X%02X", addr->rose_addr[0] & 0xFF, addr->rose_addr[1] & 0xFF, addr->rose_addr[2] & 0xFF, addr->rose_addr[3] & 0xFF, addr->rose_addr[4] & 0xFF); } - return buffer; + return buf; } /* @@ -1437,7 +1435,7 @@ static void rose_info_stop(struct seq_file *seq, void *v) static int rose_info_show(struct seq_file *seq, void *v) { - char buf[11]; + char buf[11], rsbuf[11]; if (v == SEQ_START_TOKEN) seq_puts(seq, @@ -1455,8 +1453,8 @@ static int rose_info_show(struct seq_file *seq, void *v) devname = dev->name; seq_printf(seq, "%-10s %-9s ", - rose2asc(&rose->dest_addr), - ax2asc(buf, &rose->dest_call)); + rose2asc(rsbuf, &rose->dest_addr), + ax2asc(buf, &rose->dest_call)); if (ax25cmp(&rose->source_call, &null_ax25_address) == 0) callsign = "??????-?"; @@ -1465,7 +1463,7 @@ static int rose_info_show(struct seq_file *seq, void *v) seq_printf(seq, "%-10s %-9s %-5s %3.3X %05d %d %d %d %d %3lu %3lu %3lu %3lu %3lu %3lu/%03lu %5d %5d %ld\n", - rose2asc(&rose->source_addr), + rose2asc(rsbuf, &rose->source_addr), callsign, devname, rose->lci & 0x0FFF, diff --git a/net/rose/rose_route.c b/net/rose/rose_route.c index a81066a1010a..9478d9b3d977 100644 --- a/net/rose/rose_route.c +++ b/net/rose/rose_route.c @@ -1104,6 +1104,7 @@ static void rose_node_stop(struct seq_file *seq, void *v) static int rose_node_show(struct seq_file *seq, void *v) { + char rsbuf[11]; int i; if (v == SEQ_START_TOKEN) @@ -1112,13 +1113,13 @@ static int rose_node_show(struct seq_file *seq, void *v) const struct rose_node *rose_node = v; /* if (rose_node->loopback) { seq_printf(seq, "%-10s %04d 1 loopback\n", - rose2asc(&rose_node->address), - rose_node->mask); + rose2asc(rsbuf, &rose_node->address), + rose_node->mask); } else { */ seq_printf(seq, "%-10s %04d %d", - rose2asc(&rose_node->address), - rose_node->mask, - rose_node->count); + rose2asc(rsbuf, &rose_node->address), + rose_node->mask, + rose_node->count); for (i = 0; i < rose_node->count; i++) seq_printf(seq, " %05d", @@ -1267,7 +1268,7 @@ static void rose_route_stop(struct seq_file *seq, void *v) static int rose_route_show(struct seq_file *seq, void *v) { - char buf[11]; + char buf[11], rsbuf[11]; if (v == SEQ_START_TOKEN) seq_puts(seq, @@ -1279,7 +1280,7 @@ static int rose_route_show(struct seq_file *seq, void *v) seq_printf(seq, "%3.3X %-10s %-9s %05d ", rose_route->lci1, - rose2asc(&rose_route->src_addr), + rose2asc(rsbuf, &rose_route->src_addr), ax2asc(buf, &rose_route->src_call), rose_route->neigh1->number); else @@ -1289,10 +1290,10 @@ static int rose_route_show(struct seq_file *seq, void *v) if (rose_route->neigh2) seq_printf(seq, "%3.3X %-10s %-9s %05d\n", - rose_route->lci2, - rose2asc(&rose_route->dest_addr), - ax2asc(buf, &rose_route->dest_call), - rose_route->neigh2->number); + rose_route->lci2, + rose2asc(rsbuf, &rose_route->dest_addr), + ax2asc(buf, &rose_route->dest_call), + rose_route->neigh2->number); else seq_puts(seq, "000 * * 00000\n"); -- cgit v1.2.3-59-g8ed1b From 9e1b32caa525cb236e80e9c671e179bcecccc657 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 22 Jul 2009 15:44:28 +1000 Subject: mm: Pass virtual address to [__]p{te,ud,md}_free_tlb() mm: Pass virtual address to [__]p{te,ud,md}_free_tlb() Upcoming paches to support the new 64-bit "BookE" powerpc architecture will need to have the virtual address corresponding to PTE page when freeing it, due to the way the HW table walker works. Basically, the TLB can be loaded with "large" pages that cover the whole virtual space (well, sort-of, half of it actually) represented by a PTE page, and which contain an "indirect" bit indicating that this TLB entry RPN points to an array of PTEs from which the TLB can then create direct entries. Thus, in order to invalidate those when PTE pages are deleted, we need the virtual address to pass to tlbilx or tlbivax instructions. The old trick of sticking it somewhere in the PTE page struct page sucks too much, the address is almost readily available in all call sites and almost everybody implemets these as macros, so we may as well add the argument everywhere. I added it to the pmd and pud variants for consistency. Signed-off-by: Benjamin Herrenschmidt Acked-by: David Howells [MN10300 & FRV] Acked-by: Nick Piggin Acked-by: Martin Schwidefsky [s390] Signed-off-by: Linus Torvalds --- arch/alpha/include/asm/tlb.h | 4 ++-- arch/arm/include/asm/tlb.h | 4 ++-- arch/avr32/include/asm/pgalloc.h | 2 +- arch/cris/include/asm/pgalloc.h | 2 +- arch/frv/include/asm/pgalloc.h | 4 ++-- arch/frv/include/asm/pgtable.h | 2 +- arch/ia64/include/asm/pgalloc.h | 6 +++--- arch/ia64/include/asm/tlb.h | 12 ++++++------ arch/m32r/include/asm/pgalloc.h | 4 ++-- arch/m68k/include/asm/motorola_pgalloc.h | 6 ++++-- arch/m68k/include/asm/sun3_pgalloc.h | 4 ++-- arch/microblaze/include/asm/pgalloc.h | 4 ++-- arch/mips/include/asm/pgalloc.h | 6 +++--- arch/mn10300/include/asm/pgalloc.h | 2 +- arch/parisc/include/asm/tlb.h | 4 ++-- arch/powerpc/include/asm/pgalloc-32.h | 2 +- arch/powerpc/include/asm/pgalloc-64.h | 4 ++-- arch/powerpc/include/asm/pgalloc.h | 6 +++--- arch/powerpc/mm/hugetlbpage.c | 4 ++-- arch/s390/include/asm/tlb.h | 9 ++++++--- arch/sh/include/asm/pgalloc.h | 4 ++-- arch/sh/include/asm/tlb.h | 6 +++--- arch/sparc/include/asm/pgalloc_32.h | 8 ++++---- arch/sparc/include/asm/tlb_64.h | 6 +++--- arch/um/include/asm/pgalloc.h | 4 ++-- arch/um/include/asm/tlb.h | 6 +++--- arch/x86/include/asm/pgalloc.h | 25 ++++++++++++++++++++++--- arch/x86/mm/pgtable.c | 6 +++--- arch/xtensa/include/asm/tlb.h | 2 +- include/asm-generic/4level-fixup.h | 4 ++-- include/asm-generic/pgtable-nopmd.h | 2 +- include/asm-generic/pgtable-nopud.h | 2 +- include/asm-generic/tlb.h | 12 ++++++------ mm/memory.c | 11 ++++++----- 34 files changed, 107 insertions(+), 82 deletions(-) (limited to 'include') diff --git a/arch/alpha/include/asm/tlb.h b/arch/alpha/include/asm/tlb.h index c13636575fba..42866759f3fa 100644 --- a/arch/alpha/include/asm/tlb.h +++ b/arch/alpha/include/asm/tlb.h @@ -9,7 +9,7 @@ #include -#define __pte_free_tlb(tlb, pte) pte_free((tlb)->mm, pte) -#define __pmd_free_tlb(tlb, pmd) pmd_free((tlb)->mm, pmd) +#define __pte_free_tlb(tlb, pte, address) pte_free((tlb)->mm, pte) +#define __pmd_free_tlb(tlb, pmd, address) pmd_free((tlb)->mm, pmd) #endif diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h index 321c83e43a1e..f41a6f57cd12 100644 --- a/arch/arm/include/asm/tlb.h +++ b/arch/arm/include/asm/tlb.h @@ -102,8 +102,8 @@ tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) } #define tlb_remove_page(tlb,page) free_page_and_swap_cache(page) -#define pte_free_tlb(tlb, ptep) pte_free((tlb)->mm, ptep) -#define pmd_free_tlb(tlb, pmdp) pmd_free((tlb)->mm, pmdp) +#define pte_free_tlb(tlb, ptep, addr) pte_free((tlb)->mm, ptep) +#define pmd_free_tlb(tlb, pmdp, addr) pmd_free((tlb)->mm, pmdp) #define tlb_migrate_finish(mm) do { } while (0) diff --git a/arch/avr32/include/asm/pgalloc.h b/arch/avr32/include/asm/pgalloc.h index 640821323943..92ecd8446ef8 100644 --- a/arch/avr32/include/asm/pgalloc.h +++ b/arch/avr32/include/asm/pgalloc.h @@ -83,7 +83,7 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t pte) quicklist_free_page(QUICK_PT, NULL, pte); } -#define __pte_free_tlb(tlb,pte) \ +#define __pte_free_tlb(tlb,pte,addr) \ do { \ pgtable_page_dtor(pte); \ tlb_remove_page((tlb), pte); \ diff --git a/arch/cris/include/asm/pgalloc.h b/arch/cris/include/asm/pgalloc.h index a1ba761d0573..6da975db112f 100644 --- a/arch/cris/include/asm/pgalloc.h +++ b/arch/cris/include/asm/pgalloc.h @@ -47,7 +47,7 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t pte) __free_page(pte); } -#define __pte_free_tlb(tlb,pte) \ +#define __pte_free_tlb(tlb,pte,address) \ do { \ pgtable_page_dtor(pte); \ tlb_remove_page((tlb), pte); \ diff --git a/arch/frv/include/asm/pgalloc.h b/arch/frv/include/asm/pgalloc.h index 971e6addb009..416d19a632f2 100644 --- a/arch/frv/include/asm/pgalloc.h +++ b/arch/frv/include/asm/pgalloc.h @@ -49,7 +49,7 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t pte) __free_page(pte); } -#define __pte_free_tlb(tlb,pte) \ +#define __pte_free_tlb(tlb,pte,address) \ do { \ pgtable_page_dtor(pte); \ tlb_remove_page((tlb),(pte)); \ @@ -62,7 +62,7 @@ do { \ */ #define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *) 2); }) #define pmd_free(mm, x) do { } while (0) -#define __pmd_free_tlb(tlb,x) do { } while (0) +#define __pmd_free_tlb(tlb,x,a) do { } while (0) #endif /* CONFIG_MMU */ diff --git a/arch/frv/include/asm/pgtable.h b/arch/frv/include/asm/pgtable.h index 33233011b1c1..22c60692b551 100644 --- a/arch/frv/include/asm/pgtable.h +++ b/arch/frv/include/asm/pgtable.h @@ -225,7 +225,7 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address) */ #define pud_alloc_one(mm, address) NULL #define pud_free(mm, x) do { } while (0) -#define __pud_free_tlb(tlb, x) do { } while (0) +#define __pud_free_tlb(tlb, x, address) do { } while (0) /* * The "pud_xxx()" functions here are trivial for a folded two-level diff --git a/arch/ia64/include/asm/pgalloc.h b/arch/ia64/include/asm/pgalloc.h index b9ac1a6fc216..96a8d927db28 100644 --- a/arch/ia64/include/asm/pgalloc.h +++ b/arch/ia64/include/asm/pgalloc.h @@ -48,7 +48,7 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) { quicklist_free(0, NULL, pud); } -#define __pud_free_tlb(tlb, pud) pud_free((tlb)->mm, pud) +#define __pud_free_tlb(tlb, pud, address) pud_free((tlb)->mm, pud) #endif /* CONFIG_PGTABLE_4 */ static inline void @@ -67,7 +67,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) quicklist_free(0, NULL, pmd); } -#define __pmd_free_tlb(tlb, pmd) pmd_free((tlb)->mm, pmd) +#define __pmd_free_tlb(tlb, pmd, address) pmd_free((tlb)->mm, pmd) static inline void pmd_populate(struct mm_struct *mm, pmd_t * pmd_entry, pgtable_t pte) @@ -117,6 +117,6 @@ static inline void check_pgt_cache(void) quicklist_trim(0, NULL, 25, 16); } -#define __pte_free_tlb(tlb, pte) pte_free((tlb)->mm, pte) +#define __pte_free_tlb(tlb, pte, address) pte_free((tlb)->mm, pte) #endif /* _ASM_IA64_PGALLOC_H */ diff --git a/arch/ia64/include/asm/tlb.h b/arch/ia64/include/asm/tlb.h index 20d8a39680c2..85d965cb19a0 100644 --- a/arch/ia64/include/asm/tlb.h +++ b/arch/ia64/include/asm/tlb.h @@ -236,22 +236,22 @@ do { \ __tlb_remove_tlb_entry(tlb, ptep, addr); \ } while (0) -#define pte_free_tlb(tlb, ptep) \ +#define pte_free_tlb(tlb, ptep, address) \ do { \ tlb->need_flush = 1; \ - __pte_free_tlb(tlb, ptep); \ + __pte_free_tlb(tlb, ptep, address); \ } while (0) -#define pmd_free_tlb(tlb, ptep) \ +#define pmd_free_tlb(tlb, ptep, address) \ do { \ tlb->need_flush = 1; \ - __pmd_free_tlb(tlb, ptep); \ + __pmd_free_tlb(tlb, ptep, address); \ } while (0) -#define pud_free_tlb(tlb, pudp) \ +#define pud_free_tlb(tlb, pudp, address) \ do { \ tlb->need_flush = 1; \ - __pud_free_tlb(tlb, pudp); \ + __pud_free_tlb(tlb, pudp, address); \ } while (0) #endif /* _ASM_IA64_TLB_H */ diff --git a/arch/m32r/include/asm/pgalloc.h b/arch/m32r/include/asm/pgalloc.h index f11a2b909cdb..0fc736198979 100644 --- a/arch/m32r/include/asm/pgalloc.h +++ b/arch/m32r/include/asm/pgalloc.h @@ -58,7 +58,7 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t pte) __free_page(pte); } -#define __pte_free_tlb(tlb, pte) pte_free((tlb)->mm, (pte)) +#define __pte_free_tlb(tlb, pte, addr) pte_free((tlb)->mm, (pte)) /* * allocating and freeing a pmd is trivial: the 1-entry pmd is @@ -68,7 +68,7 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t pte) #define pmd_alloc_one(mm, addr) ({ BUG(); ((pmd_t *)2); }) #define pmd_free(mm, x) do { } while (0) -#define __pmd_free_tlb(tlb, x) do { } while (0) +#define __pmd_free_tlb(tlb, x, addr) do { } while (0) #define pgd_populate(mm, pmd, pte) BUG() #define check_pgt_cache() do { } while (0) diff --git a/arch/m68k/include/asm/motorola_pgalloc.h b/arch/m68k/include/asm/motorola_pgalloc.h index d08bf6261df8..15ee4c74a9f0 100644 --- a/arch/m68k/include/asm/motorola_pgalloc.h +++ b/arch/m68k/include/asm/motorola_pgalloc.h @@ -54,7 +54,8 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t page) __free_page(page); } -static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t page) +static inline void __pte_free_tlb(struct mmu_gather *tlb, pgtable_t page, + unsigned long address) { pgtable_page_dtor(page); cache_page(kmap(page)); @@ -73,7 +74,8 @@ static inline int pmd_free(struct mm_struct *mm, pmd_t *pmd) return free_pointer_table(pmd); } -static inline int __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) +static inline int __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, + unsigned long address) { return free_pointer_table(pmd); } diff --git a/arch/m68k/include/asm/sun3_pgalloc.h b/arch/m68k/include/asm/sun3_pgalloc.h index d4c83f143816..48d80d5a666f 100644 --- a/arch/m68k/include/asm/sun3_pgalloc.h +++ b/arch/m68k/include/asm/sun3_pgalloc.h @@ -32,7 +32,7 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t page) __free_page(page); } -#define __pte_free_tlb(tlb,pte) \ +#define __pte_free_tlb(tlb,pte,addr) \ do { \ pgtable_page_dtor(pte); \ tlb_remove_page((tlb), pte); \ @@ -80,7 +80,7 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, pgtable_t page * inside the pgd, so has no extra memory associated with it. */ #define pmd_free(mm, x) do { } while (0) -#define __pmd_free_tlb(tlb, x) do { } while (0) +#define __pmd_free_tlb(tlb, x, addr) do { } while (0) static inline void pgd_free(struct mm_struct *mm, pgd_t *pgd) { diff --git a/arch/microblaze/include/asm/pgalloc.h b/arch/microblaze/include/asm/pgalloc.h index 59a757e46ba5..b0131da1387b 100644 --- a/arch/microblaze/include/asm/pgalloc.h +++ b/arch/microblaze/include/asm/pgalloc.h @@ -180,7 +180,7 @@ extern inline void pte_free(struct mm_struct *mm, struct page *ptepage) __free_page(ptepage); } -#define __pte_free_tlb(tlb, pte) pte_free((tlb)->mm, (pte)) +#define __pte_free_tlb(tlb, pte, addr) pte_free((tlb)->mm, (pte)) #define pmd_populate(mm, pmd, pte) (pmd_val(*(pmd)) = page_address(pte)) @@ -193,7 +193,7 @@ extern inline void pte_free(struct mm_struct *mm, struct page *ptepage) */ #define pmd_alloc_one(mm, address) ({ BUG(); ((pmd_t *)2); }) /*#define pmd_free(mm, x) do { } while (0)*/ -#define __pmd_free_tlb(tlb, x) do { } while (0) +#define __pmd_free_tlb(tlb, x, addr) do { } while (0) #define pgd_populate(mm, pmd, pte) BUG() extern int do_check_pgt_cache(int, int); diff --git a/arch/mips/include/asm/pgalloc.h b/arch/mips/include/asm/pgalloc.h index 1275831dda29..f705735feefc 100644 --- a/arch/mips/include/asm/pgalloc.h +++ b/arch/mips/include/asm/pgalloc.h @@ -98,7 +98,7 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t pte) __free_pages(pte, PTE_ORDER); } -#define __pte_free_tlb(tlb,pte) \ +#define __pte_free_tlb(tlb,pte,address) \ do { \ pgtable_page_dtor(pte); \ tlb_remove_page((tlb), pte); \ @@ -111,7 +111,7 @@ do { \ * inside the pgd, so has no extra memory associated with it. */ #define pmd_free(mm, x) do { } while (0) -#define __pmd_free_tlb(tlb, x) do { } while (0) +#define __pmd_free_tlb(tlb, x, addr) do { } while (0) #endif @@ -132,7 +132,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) free_pages((unsigned long)pmd, PMD_ORDER); } -#define __pmd_free_tlb(tlb, x) pmd_free((tlb)->mm, x) +#define __pmd_free_tlb(tlb, x, addr) pmd_free((tlb)->mm, x) #endif diff --git a/arch/mn10300/include/asm/pgalloc.h b/arch/mn10300/include/asm/pgalloc.h index ec057e1bd4cf..a19f11327cd8 100644 --- a/arch/mn10300/include/asm/pgalloc.h +++ b/arch/mn10300/include/asm/pgalloc.h @@ -51,6 +51,6 @@ static inline void pte_free(struct mm_struct *mm, struct page *pte) } -#define __pte_free_tlb(tlb, pte) tlb_remove_page((tlb), (pte)) +#define __pte_free_tlb(tlb, pte, addr) tlb_remove_page((tlb), (pte)) #endif /* _ASM_PGALLOC_H */ diff --git a/arch/parisc/include/asm/tlb.h b/arch/parisc/include/asm/tlb.h index 383b1db310ee..07924903989e 100644 --- a/arch/parisc/include/asm/tlb.h +++ b/arch/parisc/include/asm/tlb.h @@ -21,7 +21,7 @@ do { if (!(tlb)->fullmm) \ #include -#define __pmd_free_tlb(tlb, pmd) pmd_free((tlb)->mm, pmd) -#define __pte_free_tlb(tlb, pte) pte_free((tlb)->mm, pte) +#define __pmd_free_tlb(tlb, pmd, addr) pmd_free((tlb)->mm, pmd) +#define __pte_free_tlb(tlb, pte, addr) pte_free((tlb)->mm, pte) #endif diff --git a/arch/powerpc/include/asm/pgalloc-32.h b/arch/powerpc/include/asm/pgalloc-32.h index 0815eb40acae..c9500d666a1d 100644 --- a/arch/powerpc/include/asm/pgalloc-32.h +++ b/arch/powerpc/include/asm/pgalloc-32.h @@ -16,7 +16,7 @@ extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); */ /* #define pmd_alloc_one(mm,address) ({ BUG(); ((pmd_t *)2); }) */ #define pmd_free(mm, x) do { } while (0) -#define __pmd_free_tlb(tlb,x) do { } while (0) +#define __pmd_free_tlb(tlb,x,a) do { } while (0) /* #define pgd_populate(mm, pmd, pte) BUG() */ #ifndef CONFIG_BOOKE diff --git a/arch/powerpc/include/asm/pgalloc-64.h b/arch/powerpc/include/asm/pgalloc-64.h index afda2bdd860f..e6f069c4f713 100644 --- a/arch/powerpc/include/asm/pgalloc-64.h +++ b/arch/powerpc/include/asm/pgalloc-64.h @@ -118,11 +118,11 @@ static inline void pgtable_free(pgtable_free_t pgf) kmem_cache_free(pgtable_cache[cachenum], p); } -#define __pmd_free_tlb(tlb, pmd) \ +#define __pmd_free_tlb(tlb, pmd,addr) \ pgtable_free_tlb(tlb, pgtable_free_cache(pmd, \ PMD_CACHE_NUM, PMD_TABLE_SIZE-1)) #ifndef CONFIG_PPC_64K_PAGES -#define __pud_free_tlb(tlb, pud) \ +#define __pud_free_tlb(tlb, pud, addr) \ pgtable_free_tlb(tlb, pgtable_free_cache(pud, \ PUD_CACHE_NUM, PUD_TABLE_SIZE-1)) #endif /* CONFIG_PPC_64K_PAGES */ diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h index 5d8480265a77..1730e5e298d6 100644 --- a/arch/powerpc/include/asm/pgalloc.h +++ b/arch/powerpc/include/asm/pgalloc.h @@ -38,14 +38,14 @@ static inline pgtable_free_t pgtable_free_cache(void *p, int cachenum, extern void pgtable_free_tlb(struct mmu_gather *tlb, pgtable_free_t pgf); #ifdef CONFIG_SMP -#define __pte_free_tlb(tlb,ptepage) \ +#define __pte_free_tlb(tlb,ptepage,address) \ do { \ pgtable_page_dtor(ptepage); \ pgtable_free_tlb(tlb, pgtable_free_cache(page_address(ptepage), \ - PTE_NONCACHE_NUM, PTE_TABLE_SIZE-1)); \ + PTE_NONCACHE_NUM, PTE_TABLE_SIZE-1)); \ } while (0) #else -#define __pte_free_tlb(tlb, pte) pte_free((tlb)->mm, (pte)) +#define __pte_free_tlb(tlb, pte, address) pte_free((tlb)->mm, (pte)) #endif diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 9920d6a7cf29..c46ef2ffa3d9 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -305,7 +305,7 @@ static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, pmd = pmd_offset(pud, start); pud_clear(pud); - pmd_free_tlb(tlb, pmd); + pmd_free_tlb(tlb, pmd, start); } static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, @@ -348,7 +348,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, pud = pud_offset(pgd, start); pgd_clear(pgd); - pud_free_tlb(tlb, pud); + pud_free_tlb(tlb, pud, start); } /* diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 3d8a96d39d9d..81150b053689 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -96,7 +96,8 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) * pte_free_tlb frees a pte table and clears the CRSTE for the * page table from the tlb. */ -static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte) +static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte, + unsigned long address) { if (!tlb->fullmm) { tlb->array[tlb->nr_ptes++] = pte; @@ -113,7 +114,8 @@ static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte) * as the pgd. pmd_free_tlb checks the asce_limit against 2GB * to avoid the double free of the pmd in this case. */ -static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) +static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, + unsigned long address) { #ifdef __s390x__ if (tlb->mm->context.asce_limit <= (1UL << 31)) @@ -134,7 +136,8 @@ static inline void pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) * as the pgd. pud_free_tlb checks the asce_limit against 4TB * to avoid the double free of the pud in this case. */ -static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) +static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, + unsigned long address) { #ifdef __s390x__ if (tlb->mm->context.asce_limit <= (1UL << 42)) diff --git a/arch/sh/include/asm/pgalloc.h b/arch/sh/include/asm/pgalloc.h index 84dd2db7104c..89a482750a5b 100644 --- a/arch/sh/include/asm/pgalloc.h +++ b/arch/sh/include/asm/pgalloc.h @@ -73,7 +73,7 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t pte) quicklist_free_page(QUICK_PT, NULL, pte); } -#define __pte_free_tlb(tlb,pte) \ +#define __pte_free_tlb(tlb,pte,addr) \ do { \ pgtable_page_dtor(pte); \ tlb_remove_page((tlb), (pte)); \ @@ -85,7 +85,7 @@ do { \ */ #define pmd_free(mm, x) do { } while (0) -#define __pmd_free_tlb(tlb,x) do { } while (0) +#define __pmd_free_tlb(tlb,x,addr) do { } while (0) static inline void check_pgt_cache(void) { diff --git a/arch/sh/include/asm/tlb.h b/arch/sh/include/asm/tlb.h index 9c16f737074a..da8fe7ab8728 100644 --- a/arch/sh/include/asm/tlb.h +++ b/arch/sh/include/asm/tlb.h @@ -91,9 +91,9 @@ tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) } #define tlb_remove_page(tlb,page) free_page_and_swap_cache(page) -#define pte_free_tlb(tlb, ptep) pte_free((tlb)->mm, ptep) -#define pmd_free_tlb(tlb, pmdp) pmd_free((tlb)->mm, pmdp) -#define pud_free_tlb(tlb, pudp) pud_free((tlb)->mm, pudp) +#define pte_free_tlb(tlb, ptep, addr) pte_free((tlb)->mm, ptep) +#define pmd_free_tlb(tlb, pmdp, addr) pmd_free((tlb)->mm, pmdp) +#define pud_free_tlb(tlb, pudp, addr) pud_free((tlb)->mm, pudp) #define tlb_migrate_finish(mm) do { } while (0) diff --git a/arch/sparc/include/asm/pgalloc_32.h b/arch/sparc/include/asm/pgalloc_32.h index 681582d26969..ca2b34456c4b 100644 --- a/arch/sparc/include/asm/pgalloc_32.h +++ b/arch/sparc/include/asm/pgalloc_32.h @@ -44,8 +44,8 @@ BTFIXUPDEF_CALL(pmd_t *, pmd_alloc_one, struct mm_struct *, unsigned long) BTFIXUPDEF_CALL(void, free_pmd_fast, pmd_t *) #define free_pmd_fast(pmd) BTFIXUP_CALL(free_pmd_fast)(pmd) -#define pmd_free(mm, pmd) free_pmd_fast(pmd) -#define __pmd_free_tlb(tlb, pmd) pmd_free((tlb)->mm, pmd) +#define pmd_free(mm, pmd) free_pmd_fast(pmd) +#define __pmd_free_tlb(tlb, pmd, addr) pmd_free((tlb)->mm, pmd) BTFIXUPDEF_CALL(void, pmd_populate, pmd_t *, struct page *) #define pmd_populate(MM, PMD, PTE) BTFIXUP_CALL(pmd_populate)(PMD, PTE) @@ -62,7 +62,7 @@ BTFIXUPDEF_CALL(void, free_pte_fast, pte_t *) #define pte_free_kernel(mm, pte) BTFIXUP_CALL(free_pte_fast)(pte) BTFIXUPDEF_CALL(void, pte_free, pgtable_t ) -#define pte_free(mm, pte) BTFIXUP_CALL(pte_free)(pte) -#define __pte_free_tlb(tlb, pte) pte_free((tlb)->mm, pte) +#define pte_free(mm, pte) BTFIXUP_CALL(pte_free)(pte) +#define __pte_free_tlb(tlb, pte, addr) pte_free((tlb)->mm, pte) #endif /* _SPARC_PGALLOC_H */ diff --git a/arch/sparc/include/asm/tlb_64.h b/arch/sparc/include/asm/tlb_64.h index ee38e731bfa6..dca406b9b6fc 100644 --- a/arch/sparc/include/asm/tlb_64.h +++ b/arch/sparc/include/asm/tlb_64.h @@ -100,9 +100,9 @@ static inline void tlb_remove_page(struct mmu_gather *mp, struct page *page) } #define tlb_remove_tlb_entry(mp,ptep,addr) do { } while (0) -#define pte_free_tlb(mp, ptepage) pte_free((mp)->mm, ptepage) -#define pmd_free_tlb(mp, pmdp) pmd_free((mp)->mm, pmdp) -#define pud_free_tlb(tlb,pudp) __pud_free_tlb(tlb,pudp) +#define pte_free_tlb(mp, ptepage, addr) pte_free((mp)->mm, ptepage) +#define pmd_free_tlb(mp, pmdp, addr) pmd_free((mp)->mm, pmdp) +#define pud_free_tlb(tlb,pudp, addr) __pud_free_tlb(tlb,pudp,addr) #define tlb_migrate_finish(mm) do { } while (0) #define tlb_start_vma(tlb, vma) do { } while (0) diff --git a/arch/um/include/asm/pgalloc.h b/arch/um/include/asm/pgalloc.h index 718984359f8c..32c8ce4e1515 100644 --- a/arch/um/include/asm/pgalloc.h +++ b/arch/um/include/asm/pgalloc.h @@ -40,7 +40,7 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t pte) __free_page(pte); } -#define __pte_free_tlb(tlb,pte) \ +#define __pte_free_tlb(tlb,pte, address) \ do { \ pgtable_page_dtor(pte); \ tlb_remove_page((tlb),(pte)); \ @@ -53,7 +53,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) free_page((unsigned long)pmd); } -#define __pmd_free_tlb(tlb,x) tlb_remove_page((tlb),virt_to_page(x)) +#define __pmd_free_tlb(tlb,x, address) tlb_remove_page((tlb),virt_to_page(x)) #endif #define check_pgt_cache() do { } while (0) diff --git a/arch/um/include/asm/tlb.h b/arch/um/include/asm/tlb.h index 5240fa1c5e08..660caedac9eb 100644 --- a/arch/um/include/asm/tlb.h +++ b/arch/um/include/asm/tlb.h @@ -116,11 +116,11 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) __tlb_remove_tlb_entry(tlb, ptep, address); \ } while (0) -#define pte_free_tlb(tlb, ptep) __pte_free_tlb(tlb, ptep) +#define pte_free_tlb(tlb, ptep, addr) __pte_free_tlb(tlb, ptep, addr) -#define pud_free_tlb(tlb, pudp) __pud_free_tlb(tlb, pudp) +#define pud_free_tlb(tlb, pudp, addr) __pud_free_tlb(tlb, pudp, addr) -#define pmd_free_tlb(tlb, pmdp) __pmd_free_tlb(tlb, pmdp) +#define pmd_free_tlb(tlb, pmdp, addr) __pmd_free_tlb(tlb, pmdp, addr) #define tlb_migrate_finish(mm) do {} while (0) diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index dd14c54ac718..0e8c2a0fd922 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -46,7 +46,13 @@ static inline void pte_free(struct mm_struct *mm, struct page *pte) __free_page(pte); } -extern void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte); +extern void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte); + +static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte, + unsigned long address) +{ + ___pte_free_tlb(tlb, pte); +} static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) @@ -78,7 +84,13 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) free_page((unsigned long)pmd); } -extern void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); +extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); + +static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, + unsigned long adddress) +{ + ___pmd_free_tlb(tlb, pmd); +} #ifdef CONFIG_X86_PAE extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd); @@ -108,7 +120,14 @@ static inline void pud_free(struct mm_struct *mm, pud_t *pud) free_page((unsigned long)pud); } -extern void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud); +extern void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud); + +static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, + unsigned long address) +{ + ___pud_free_tlb(tlb, pud); +} + #endif /* PAGETABLE_LEVELS > 3 */ #endif /* PAGETABLE_LEVELS > 2 */ diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 8e43bdd45456..af8f9650058c 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -25,7 +25,7 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) return pte; } -void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) +void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) { pgtable_page_dtor(pte); paravirt_release_pte(page_to_pfn(pte)); @@ -33,14 +33,14 @@ void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte) } #if PAGETABLE_LEVELS > 2 -void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) +void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd) { paravirt_release_pmd(__pa(pmd) >> PAGE_SHIFT); tlb_remove_page(tlb, virt_to_page(pmd)); } #if PAGETABLE_LEVELS > 3 -void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) +void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) { paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); tlb_remove_page(tlb, virt_to_page(pud)); diff --git a/arch/xtensa/include/asm/tlb.h b/arch/xtensa/include/asm/tlb.h index 31c220faca02..0d766f9c1083 100644 --- a/arch/xtensa/include/asm/tlb.h +++ b/arch/xtensa/include/asm/tlb.h @@ -42,6 +42,6 @@ #include -#define __pte_free_tlb(tlb, pte) pte_free((tlb)->mm, pte) +#define __pte_free_tlb(tlb, pte, address) pte_free((tlb)->mm, pte) #endif /* _XTENSA_TLB_H */ diff --git a/include/asm-generic/4level-fixup.h b/include/asm-generic/4level-fixup.h index 9d40e879f99e..77ff547730af 100644 --- a/include/asm-generic/4level-fixup.h +++ b/include/asm-generic/4level-fixup.h @@ -27,9 +27,9 @@ #define pud_page_vaddr(pud) pgd_page_vaddr(pud) #undef pud_free_tlb -#define pud_free_tlb(tlb, x) do { } while (0) +#define pud_free_tlb(tlb, x, addr) do { } while (0) #define pud_free(mm, x) do { } while (0) -#define __pud_free_tlb(tlb, x) do { } while (0) +#define __pud_free_tlb(tlb, x, addr) do { } while (0) #undef pud_addr_end #define pud_addr_end(addr, end) (end) diff --git a/include/asm-generic/pgtable-nopmd.h b/include/asm-generic/pgtable-nopmd.h index a7cdc48e8b78..725612b793ce 100644 --- a/include/asm-generic/pgtable-nopmd.h +++ b/include/asm-generic/pgtable-nopmd.h @@ -59,7 +59,7 @@ static inline pmd_t * pmd_offset(pud_t * pud, unsigned long address) static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) { } -#define __pmd_free_tlb(tlb, x) do { } while (0) +#define __pmd_free_tlb(tlb, x, a) do { } while (0) #undef pmd_addr_end #define pmd_addr_end(addr, end) (end) diff --git a/include/asm-generic/pgtable-nopud.h b/include/asm-generic/pgtable-nopud.h index 87cf449a6df3..810431d8351b 100644 --- a/include/asm-generic/pgtable-nopud.h +++ b/include/asm-generic/pgtable-nopud.h @@ -52,7 +52,7 @@ static inline pud_t * pud_offset(pgd_t * pgd, unsigned long address) */ #define pud_alloc_one(mm, address) NULL #define pud_free(mm, x) do { } while (0) -#define __pud_free_tlb(tlb, x) do { } while (0) +#define __pud_free_tlb(tlb, x, a) do { } while (0) #undef pud_addr_end #define pud_addr_end(addr, end) (end) diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index f490e43a90b9..e43f9766259f 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -123,24 +123,24 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) __tlb_remove_tlb_entry(tlb, ptep, address); \ } while (0) -#define pte_free_tlb(tlb, ptep) \ +#define pte_free_tlb(tlb, ptep, address) \ do { \ tlb->need_flush = 1; \ - __pte_free_tlb(tlb, ptep); \ + __pte_free_tlb(tlb, ptep, address); \ } while (0) #ifndef __ARCH_HAS_4LEVEL_HACK -#define pud_free_tlb(tlb, pudp) \ +#define pud_free_tlb(tlb, pudp, address) \ do { \ tlb->need_flush = 1; \ - __pud_free_tlb(tlb, pudp); \ + __pud_free_tlb(tlb, pudp, address); \ } while (0) #endif -#define pmd_free_tlb(tlb, pmdp) \ +#define pmd_free_tlb(tlb, pmdp, address) \ do { \ tlb->need_flush = 1; \ - __pmd_free_tlb(tlb, pmdp); \ + __pmd_free_tlb(tlb, pmdp, address); \ } while (0) #define tlb_migrate_finish(mm) do {} while (0) diff --git a/mm/memory.c b/mm/memory.c index 65216194eb8d..aede2ce3aba4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -135,11 +135,12 @@ void pmd_clear_bad(pmd_t *pmd) * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. */ -static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) +static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, + unsigned long addr) { pgtable_t token = pmd_pgtable(*pmd); pmd_clear(pmd); - pte_free_tlb(tlb, token); + pte_free_tlb(tlb, token, addr); tlb->mm->nr_ptes--; } @@ -157,7 +158,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; - free_pte_range(tlb, pmd); + free_pte_range(tlb, pmd, addr); } while (pmd++, addr = next, addr != end); start &= PUD_MASK; @@ -173,7 +174,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, pmd = pmd_offset(pud, start); pud_clear(pud); - pmd_free_tlb(tlb, pmd); + pmd_free_tlb(tlb, pmd, start); } static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, @@ -206,7 +207,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, pud = pud_offset(pgd, start); pgd_clear(pgd); - pud_free_tlb(tlb, pud); + pud_free_tlb(tlb, pud, start); } /* -- cgit v1.2.3-59-g8ed1b From 7cb7f45c7feef43c8f71f5cfedfc0b19be2142f7 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Mon, 27 Jul 2009 18:42:38 -0400 Subject: Revert "ACPICA: Remove obsolete acpi_os_validate_address interface" This reverts commit f9ca058430333c9a24c5ca926aa445125f88df18. which caused a regression: http://bugzilla.kernel.org/show_bug.cgi?id=13620 Signed-off-by: Lin Ming Signed-off-by: Len Brown --- drivers/acpi/acpica/acobject.h | 1 + drivers/acpi/acpica/dsopcode.c | 24 ++++++++++++++++++++++++ drivers/acpi/acpica/exfldio.c | 6 ++++++ include/acpi/acpiosxf.h | 4 ++++ 4 files changed, 35 insertions(+) (limited to 'include') diff --git a/drivers/acpi/acpica/acobject.h b/drivers/acpi/acpica/acobject.h index 544dcf834922..eb6f038b03d9 100644 --- a/drivers/acpi/acpica/acobject.h +++ b/drivers/acpi/acpica/acobject.h @@ -97,6 +97,7 @@ #define AOPOBJ_OBJECT_INITIALIZED 0x08 #define AOPOBJ_SETUP_COMPLETE 0x10 #define AOPOBJ_SINGLE_DATUM 0x20 +#define AOPOBJ_INVALID 0x40 /* Used if host OS won't allow an op_region address */ /****************************************************************************** * diff --git a/drivers/acpi/acpica/dsopcode.c b/drivers/acpi/acpica/dsopcode.c index 584d766e6f12..b79978f7bc71 100644 --- a/drivers/acpi/acpica/dsopcode.c +++ b/drivers/acpi/acpica/dsopcode.c @@ -397,6 +397,30 @@ acpi_status acpi_ds_get_region_arguments(union acpi_operand_object *obj_desc) status = acpi_ds_execute_arguments(node, acpi_ns_get_parent_node(node), extra_desc->extra.aml_length, extra_desc->extra.aml_start); + if (ACPI_FAILURE(status)) { + return_ACPI_STATUS(status); + } + + /* Validate the region address/length via the host OS */ + + status = acpi_os_validate_address(obj_desc->region.space_id, + obj_desc->region.address, + (acpi_size) obj_desc->region.length, + acpi_ut_get_node_name(node)); + + if (ACPI_FAILURE(status)) { + /* + * Invalid address/length. We will emit an error message and mark + * the region as invalid, so that it will cause an additional error if + * it is ever used. Then return AE_OK. + */ + ACPI_EXCEPTION((AE_INFO, status, + "During address validation of OpRegion [%4.4s]", + node->name.ascii)); + obj_desc->common.flags |= AOPOBJ_INVALID; + status = AE_OK; + } + return_ACPI_STATUS(status); } diff --git a/drivers/acpi/acpica/exfldio.c b/drivers/acpi/acpica/exfldio.c index d4075b821021..6687be167f5f 100644 --- a/drivers/acpi/acpica/exfldio.c +++ b/drivers/acpi/acpica/exfldio.c @@ -113,6 +113,12 @@ acpi_ex_setup_region(union acpi_operand_object *obj_desc, } } + /* Exit if Address/Length have been disallowed by the host OS */ + + if (rgn_desc->common.flags & AOPOBJ_INVALID) { + return_ACPI_STATUS(AE_AML_ILLEGAL_ADDRESS); + } + /* * Exit now for SMBus address space, it has a non-linear address space * and the request cannot be directly validated diff --git a/include/acpi/acpiosxf.h b/include/acpi/acpiosxf.h index 3e798593b17b..ab0b85cf21f3 100644 --- a/include/acpi/acpiosxf.h +++ b/include/acpi/acpiosxf.h @@ -242,6 +242,10 @@ acpi_os_derive_pci_id(acpi_handle rhandle, acpi_status acpi_os_validate_interface(char *interface); acpi_status acpi_osi_invalidate(char* interface); +acpi_status +acpi_os_validate_address(u8 space_id, acpi_physical_address address, + acpi_size length, char *name); + u64 acpi_os_get_timer(void); acpi_status acpi_os_signal(u32 function, void *info); -- cgit v1.2.3-59-g8ed1b From 27fed4175acf81ddd91d9a4ee2fd298981f60295 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 27 Jul 2009 18:39:45 -0700 Subject: ip: fix logic of reverse path filter sysctl Even though reverse path filter was changed from simple boolean to trinary control, the loose mode only works if both all and device are configured because of this logic error. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/inetdevice.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h index acef2a770b6b..ad27c7da8798 100644 --- a/include/linux/inetdevice.h +++ b/include/linux/inetdevice.h @@ -82,7 +82,7 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev) #define IN_DEV_FORWARD(in_dev) IN_DEV_CONF_GET((in_dev), FORWARDING) #define IN_DEV_MFORWARD(in_dev) IN_DEV_ANDCONF((in_dev), MC_FORWARDING) -#define IN_DEV_RPFILTER(in_dev) IN_DEV_ANDCONF((in_dev), RP_FILTER) +#define IN_DEV_RPFILTER(in_dev) IN_DEV_MAXCONF((in_dev), RP_FILTER) #define IN_DEV_SOURCE_ROUTE(in_dev) IN_DEV_ANDCONF((in_dev), \ ACCEPT_SOURCE_ROUTE) #define IN_DEV_BOOTP_RELAY(in_dev) IN_DEV_ANDCONF((in_dev), BOOTP_RELAY) -- cgit v1.2.3-59-g8ed1b From 5920dadfb4aec6c1372c5570e71bcd3b4837e63c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 15 Jul 2009 17:11:41 +0900 Subject: libata: accept late unlocking of HPA On certain configurations, HPA isn't or can't be unlocked during probing but it somehow ends up unlocked afterwards. In the following thread, the problem can be reliably reproduced after resuming from STR. The BIOS turns on HPA during boot but forgets to do it during resume. http://thread.gmane.org/gmane.linux.kernel/858310 This patch updates libata revalidation such that it considers native n_sectors. If the device size has increased to match native n_sectors, it's assumed that HPA has been unlocked involuntarily and the device is recognized as the same one. This should be fairly safe while nicely working around the problem. Signed-off-by: Tejun Heo Reported-by: Christof Warlich Signed-off-by: Jeff Garzik --- drivers/ata/libata-core.c | 30 +++++++++++++++++++++++------- include/linux/libata.h | 1 + 2 files changed, 24 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 2c6aedaef718..8ac98ff16d7d 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -1515,6 +1515,7 @@ static int ata_hpa_resize(struct ata_device *dev) return rc; } + dev->n_native_sectors = native_sectors; /* nothing to do? */ if (native_sectors <= sectors || !ata_ignore_hpa) { @@ -4099,6 +4100,7 @@ int ata_dev_revalidate(struct ata_device *dev, unsigned int new_class, unsigned int readid_flags) { u64 n_sectors = dev->n_sectors; + u64 n_native_sectors = dev->n_native_sectors; int rc; if (!ata_dev_enabled(dev)) @@ -4128,16 +4130,30 @@ int ata_dev_revalidate(struct ata_device *dev, unsigned int new_class, /* verify n_sectors hasn't changed */ if (dev->class == ATA_DEV_ATA && n_sectors && dev->n_sectors != n_sectors) { - ata_dev_printk(dev, KERN_INFO, "n_sectors mismatch " + ata_dev_printk(dev, KERN_WARNING, "n_sectors mismatch " "%llu != %llu\n", (unsigned long long)n_sectors, (unsigned long long)dev->n_sectors); - - /* restore original n_sectors */ - dev->n_sectors = n_sectors; - - rc = -ENODEV; - goto fail; + /* + * Something could have caused HPA to be unlocked + * involuntarily. If n_native_sectors hasn't changed + * and the new size matches it, keep the device. + */ + if (dev->n_native_sectors == n_native_sectors && + dev->n_sectors > n_sectors && + dev->n_sectors == n_native_sectors) { + ata_dev_printk(dev, KERN_WARNING, + "new n_sectors matches native, probably " + "late HPA unlock, continuing\n"); + /* keep using the old n_sectors */ + dev->n_sectors = n_sectors; + } else { + /* restore original n_[native]_sectors and fail */ + dev->n_native_sectors = n_native_sectors; + dev->n_sectors = n_sectors; + rc = -ENODEV; + goto fail; + } } return 0; diff --git a/include/linux/libata.h b/include/linux/libata.h index 79b6d7fd4ac2..e5b6e33c6571 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -589,6 +589,7 @@ struct ata_device { #endif /* n_sector is CLEAR_BEGIN, read comment above CLEAR_BEGIN */ u64 n_sectors; /* size of device, if ATA */ + u64 n_native_sectors; /* native size, if ATA */ unsigned int class; /* ATA_DEV_xxx */ unsigned long unpark_deadline; -- cgit v1.2.3-59-g8ed1b From e024e11070a0a0dc7163ce1ec2da354a638bdbed Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Wed, 24 Jun 2009 09:48:08 +1000 Subject: drm/radeon/kms: add initial colortiling support. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds new set/get tiling interfaces where the pitch and macro/micro tiling enables can be set. Along with a flag to decide if this object should have a surface when mapped. The only thing we need to allocate with a mapped surface should be the frontbuffer. Note rotate scanout shouldn't require one, and back/depth shouldn't either, though mesa needs some fixes. It fixes the TTM interfaces along Thomas's suggestions, and I've tested the surface stealing code with two X servers and not seen any lockdep issues. I've stopped tiling the fbcon frontbuffer, as I don't see there being any advantage other than testing, I've left the testing commands in there, just flip the fb_tiled to true in radeon_fb.c Open: Can we integrate endian swapping in with this? Future features: texture tiling - need to relocate texture registers TXOFFSET* with tiling info. This also merges Michel's cleanup surfaces regs at init time patch even though it makes sense on its own, this patch really relies on it. Some PowerMac firmwares set up a tiling surface at the beginning of VRAM which messes us up otherwise. that patch is: Signed-off-by: Michel Dänzer Signed-off-by: Dave Airlie --- drivers/gpu/drm/radeon/atombios_crtc.c | 11 ++- drivers/gpu/drm/radeon/r100.c | 79 ++++++++++++++++- drivers/gpu/drm/radeon/r300.c | 51 ++++++++++- drivers/gpu/drm/radeon/r300_reg.h | 4 +- drivers/gpu/drm/radeon/radeon.h | 32 ++++++- drivers/gpu/drm/radeon/radeon_asic.h | 19 ++++ drivers/gpu/drm/radeon/radeon_device.c | 2 + drivers/gpu/drm/radeon/radeon_fb.c | 12 ++- drivers/gpu/drm/radeon/radeon_gem.c | 41 +++++++++ drivers/gpu/drm/radeon/radeon_kms.c | 2 + drivers/gpu/drm/radeon/radeon_legacy_crtc.c | 15 ++-- drivers/gpu/drm/radeon/radeon_object.c | 130 ++++++++++++++++++++++++++++ drivers/gpu/drm/radeon/radeon_ttm.c | 2 + drivers/gpu/drm/ttm/ttm_bo.c | 5 +- drivers/gpu/drm/ttm/ttm_bo_vm.c | 3 + include/drm/radeon_drm.h | 23 ++++- include/drm/ttm/ttm_bo_driver.h | 15 ++++ 17 files changed, 427 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/radeon/atombios_crtc.c b/drivers/gpu/drm/radeon/atombios_crtc.c index e64a199b5ee1..eac26cdb5dae 100644 --- a/drivers/gpu/drm/radeon/atombios_crtc.c +++ b/drivers/gpu/drm/radeon/atombios_crtc.c @@ -327,7 +327,7 @@ int atombios_crtc_set_base(struct drm_crtc *crtc, int x, int y, struct drm_gem_object *obj; struct drm_radeon_gem_object *obj_priv; uint64_t fb_location; - uint32_t fb_format, fb_pitch_pixels; + uint32_t fb_format, fb_pitch_pixels, tiling_flags; if (!crtc->fb) return -EINVAL; @@ -364,7 +364,14 @@ int atombios_crtc_set_base(struct drm_crtc *crtc, int x, int y, return -EINVAL; } - /* TODO tiling */ + radeon_object_get_tiling_flags(obj->driver_private, + &tiling_flags, NULL); + if (tiling_flags & RADEON_TILING_MACRO) + fb_format |= AVIVO_D1GRPH_MACRO_ADDRESS_MODE; + + if (tiling_flags & RADEON_TILING_MICRO) + fb_format |= AVIVO_D1GRPH_TILED; + if (radeon_crtc->crtc_id == 0) WREG32(AVIVO_D1VGA_CONTROL, 0); else diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c index 0d05909f03f6..69bd7cb59972 100644 --- a/drivers/gpu/drm/radeon/r100.c +++ b/drivers/gpu/drm/radeon/r100.c @@ -909,6 +909,7 @@ static int r100_packet0_check(struct radeon_cs_parser *p, unsigned idx; bool onereg; int r; + u32 tile_flags = 0; ib = p->ib->ptr; ib_chunk = &p->chunks[p->chunk_ib_idx]; @@ -942,7 +943,20 @@ static int r100_packet0_check(struct radeon_cs_parser *p, } tmp = ib_chunk->kdata[idx] & 0x003fffff; tmp += (((u32)reloc->lobj.gpu_offset) >> 10); - ib[idx] = (ib_chunk->kdata[idx] & 0xffc00000) | tmp; + + if (reloc->lobj.tiling_flags & RADEON_TILING_MACRO) + tile_flags |= RADEON_DST_TILE_MACRO; + if (reloc->lobj.tiling_flags & RADEON_TILING_MICRO) { + if (reg == RADEON_SRC_PITCH_OFFSET) { + DRM_ERROR("Cannot src blit from microtiled surface\n"); + r100_cs_dump_packet(p, pkt); + return -EINVAL; + } + tile_flags |= RADEON_DST_TILE_MICRO; + } + + tmp |= tile_flags; + ib[idx] = (ib_chunk->kdata[idx] & 0x3fc00000) | tmp; break; case RADEON_RB3D_DEPTHOFFSET: case RADEON_RB3D_COLOROFFSET: @@ -987,6 +1001,25 @@ static int r100_packet0_check(struct radeon_cs_parser *p, } ib[idx] = ib_chunk->kdata[idx] + ((u32)reloc->lobj.gpu_offset); break; + case R300_RB3D_COLORPITCH0: + case RADEON_RB3D_COLORPITCH: + r = r100_cs_packet_next_reloc(p, &reloc); + if (r) { + DRM_ERROR("No reloc for ib[%d]=0x%04X\n", + idx, reg); + r100_cs_dump_packet(p, pkt); + return r; + } + + if (reloc->lobj.tiling_flags & RADEON_TILING_MACRO) + tile_flags |= RADEON_COLOR_TILE_ENABLE; + if (reloc->lobj.tiling_flags & RADEON_TILING_MICRO) + tile_flags |= RADEON_COLOR_MICROTILE_ENABLE; + + tmp = ib_chunk->kdata[idx] & ~(0x7 << 16); + tmp |= tile_flags; + ib[idx] = tmp; + break; default: /* FIXME: we don't want to allow anyothers packet */ break; @@ -1707,3 +1740,47 @@ int r100_debugfs_mc_info_init(struct radeon_device *rdev) return 0; #endif } + +int r100_set_surface_reg(struct radeon_device *rdev, int reg, + uint32_t tiling_flags, uint32_t pitch, + uint32_t offset, uint32_t obj_size) +{ + int surf_index = reg * 16; + int flags = 0; + + /* r100/r200 divide by 16 */ + if (rdev->family < CHIP_R300) + flags = pitch / 16; + else + flags = pitch / 8; + + if (rdev->family <= CHIP_RS200) { + if ((tiling_flags & (RADEON_TILING_MACRO|RADEON_TILING_MICRO)) + == (RADEON_TILING_MACRO|RADEON_TILING_MICRO)) + flags |= RADEON_SURF_TILE_COLOR_BOTH; + if (tiling_flags & RADEON_TILING_MACRO) + flags |= RADEON_SURF_TILE_COLOR_MACRO; + } else if (rdev->family <= CHIP_RV280) { + if (tiling_flags & (RADEON_TILING_MACRO)) + flags |= R200_SURF_TILE_COLOR_MACRO; + if (tiling_flags & RADEON_TILING_MICRO) + flags |= R200_SURF_TILE_COLOR_MICRO; + } else { + if (tiling_flags & RADEON_TILING_MACRO) + flags |= R300_SURF_TILE_MACRO; + if (tiling_flags & RADEON_TILING_MICRO) + flags |= R300_SURF_TILE_MICRO; + } + + DRM_DEBUG("writing surface %d %d %x %x\n", reg, flags, offset, offset+obj_size-1); + WREG32(RADEON_SURFACE0_INFO + surf_index, flags); + WREG32(RADEON_SURFACE0_LOWER_BOUND + surf_index, offset); + WREG32(RADEON_SURFACE0_UPPER_BOUND + surf_index, offset + obj_size - 1); + return 0; +} + +void r100_clear_surface_reg(struct radeon_device *rdev, int reg) +{ + int surf_index = reg * 16; + WREG32(RADEON_SURFACE0_INFO + surf_index, 0); +} diff --git a/drivers/gpu/drm/radeon/r300.c b/drivers/gpu/drm/radeon/r300.c index 0e0e094da503..28e5777658be 100644 --- a/drivers/gpu/drm/radeon/r300.c +++ b/drivers/gpu/drm/radeon/r300.c @@ -30,6 +30,7 @@ #include "drm.h" #include "radeon_reg.h" #include "radeon.h" +#include "radeon_drm.h" /* r300,r350,rv350,rv370,rv380 depends on : */ void r100_hdp_reset(struct radeon_device *rdev); @@ -1023,7 +1024,7 @@ static int r300_packet0_check(struct radeon_cs_parser *p, struct radeon_cs_reloc *reloc; struct r300_cs_track *track; volatile uint32_t *ib; - uint32_t tmp; + uint32_t tmp, tile_flags = 0; unsigned i; int r; @@ -1052,7 +1053,19 @@ static int r300_packet0_check(struct radeon_cs_parser *p, } tmp = ib_chunk->kdata[idx] & 0x003fffff; tmp += (((u32)reloc->lobj.gpu_offset) >> 10); - ib[idx] = (ib_chunk->kdata[idx] & 0xffc00000) | tmp; + + if (reloc->lobj.tiling_flags & RADEON_TILING_MACRO) + tile_flags |= RADEON_DST_TILE_MACRO; + if (reloc->lobj.tiling_flags & RADEON_TILING_MICRO) { + if (reg == RADEON_SRC_PITCH_OFFSET) { + DRM_ERROR("Cannot src blit from microtiled surface\n"); + r100_cs_dump_packet(p, pkt); + return -EINVAL; + } + tile_flags |= RADEON_DST_TILE_MICRO; + } + tmp |= tile_flags; + ib[idx] = (ib_chunk->kdata[idx] & 0x3fc00000) | tmp; break; case R300_RB3D_COLOROFFSET0: case R300_RB3D_COLOROFFSET1: @@ -1141,6 +1154,23 @@ static int r300_packet0_check(struct radeon_cs_parser *p, /* RB3D_COLORPITCH1 */ /* RB3D_COLORPITCH2 */ /* RB3D_COLORPITCH3 */ + r = r100_cs_packet_next_reloc(p, &reloc); + if (r) { + DRM_ERROR("No reloc for ib[%d]=0x%04X\n", + idx, reg); + r100_cs_dump_packet(p, pkt); + return r; + } + + if (reloc->lobj.tiling_flags & RADEON_TILING_MACRO) + tile_flags |= R300_COLOR_TILE_ENABLE; + if (reloc->lobj.tiling_flags & RADEON_TILING_MICRO) + tile_flags |= R300_COLOR_MICROTILE_ENABLE; + + tmp = ib_chunk->kdata[idx] & ~(0x7 << 16); + tmp |= tile_flags; + ib[idx] = tmp; + i = (reg - 0x4E38) >> 2; track->cb[i].pitch = ib_chunk->kdata[idx] & 0x3FFE; switch (((ib_chunk->kdata[idx] >> 21) & 0xF)) { @@ -1196,6 +1226,23 @@ static int r300_packet0_check(struct radeon_cs_parser *p, break; case 0x4F24: /* ZB_DEPTHPITCH */ + r = r100_cs_packet_next_reloc(p, &reloc); + if (r) { + DRM_ERROR("No reloc for ib[%d]=0x%04X\n", + idx, reg); + r100_cs_dump_packet(p, pkt); + return r; + } + + if (reloc->lobj.tiling_flags & RADEON_TILING_MACRO) + tile_flags |= R300_DEPTHMACROTILE_ENABLE; + if (reloc->lobj.tiling_flags & RADEON_TILING_MICRO) + tile_flags |= R300_DEPTHMICROTILE_TILED;; + + tmp = ib_chunk->kdata[idx] & ~(0x7 << 16); + tmp |= tile_flags; + ib[idx] = tmp; + track->zb.pitch = ib_chunk->kdata[idx] & 0x3FFC; break; case 0x4104: diff --git a/drivers/gpu/drm/radeon/r300_reg.h b/drivers/gpu/drm/radeon/r300_reg.h index 70f48609515e..4b7afef35a65 100644 --- a/drivers/gpu/drm/radeon/r300_reg.h +++ b/drivers/gpu/drm/radeon/r300_reg.h @@ -27,7 +27,9 @@ #ifndef _R300_REG_H_ #define _R300_REG_H_ - +#define R300_SURF_TILE_MACRO (1<<16) +#define R300_SURF_TILE_MICRO (2<<16) +#define R300_SURF_TILE_BOTH (3<<16) #define R300_MC_INIT_MISC_LAT_TIMER 0x180 diff --git a/drivers/gpu/drm/radeon/radeon.h b/drivers/gpu/drm/radeon/radeon.h index 7f007185e7f7..af12a2fe3221 100644 --- a/drivers/gpu/drm/radeon/radeon.h +++ b/drivers/gpu/drm/radeon/radeon.h @@ -201,6 +201,14 @@ int radeon_fence_wait_last(struct radeon_device *rdev); struct radeon_fence *radeon_fence_ref(struct radeon_fence *fence); void radeon_fence_unref(struct radeon_fence **fence); +/* + * Tiling registers + */ +struct radeon_surface_reg { + struct radeon_object *robj; +}; + +#define RADEON_GEM_MAX_SURFACES 8 /* * Radeon buffer. @@ -213,6 +221,7 @@ struct radeon_object_list { uint64_t gpu_offset; unsigned rdomain; unsigned wdomain; + uint32_t tiling_flags; }; int radeon_object_init(struct radeon_device *rdev); @@ -242,8 +251,15 @@ void radeon_object_list_clean(struct list_head *head); int radeon_object_fbdev_mmap(struct radeon_object *robj, struct vm_area_struct *vma); unsigned long radeon_object_size(struct radeon_object *robj); - - +void radeon_object_clear_surface_reg(struct radeon_object *robj); +int radeon_object_check_tiling(struct radeon_object *robj, bool has_moved, + bool force_drop); +void radeon_object_set_tiling_flags(struct radeon_object *robj, + uint32_t tiling_flags, uint32_t pitch); +void radeon_object_get_tiling_flags(struct radeon_object *robj, uint32_t *tiling_flags, uint32_t *pitch); +void radeon_bo_move_notify(struct ttm_buffer_object *bo, + struct ttm_mem_reg *mem); +void radeon_bo_fault_reserve_notify(struct ttm_buffer_object *bo); /* * GEM objects. */ @@ -535,6 +551,11 @@ struct radeon_asic { void (*set_memory_clock)(struct radeon_device *rdev, uint32_t mem_clock); void (*set_pcie_lanes)(struct radeon_device *rdev, int lanes); void (*set_clock_gating)(struct radeon_device *rdev, int enable); + + int (*set_surface_reg)(struct radeon_device *rdev, int reg, + uint32_t tiling_flags, uint32_t pitch, + uint32_t offset, uint32_t obj_size); + int (*clear_surface_reg)(struct radeon_device *rdev, int reg); }; union radeon_asic_config { @@ -568,6 +589,10 @@ int radeon_gem_busy_ioctl(struct drm_device *dev, void *data, int radeon_gem_wait_idle_ioctl(struct drm_device *dev, void *data, struct drm_file *filp); int radeon_cs_ioctl(struct drm_device *dev, void *data, struct drm_file *filp); +int radeon_gem_set_tiling_ioctl(struct drm_device *dev, void *data, + struct drm_file *filp); +int radeon_gem_get_tiling_ioctl(struct drm_device *dev, void *data, + struct drm_file *filp); /* @@ -627,6 +652,7 @@ struct radeon_device { bool shutdown; bool suspend; bool need_dma32; + struct radeon_surface_reg surface_regs[RADEON_GEM_MAX_SURFACES]; }; int radeon_device_init(struct radeon_device *rdev, @@ -801,5 +827,7 @@ static inline void radeon_ring_write(struct radeon_device *rdev, uint32_t v) #define radeon_set_memory_clock(rdev, e) (rdev)->asic->set_engine_clock((rdev), (e)) #define radeon_set_pcie_lanes(rdev, l) (rdev)->asic->set_pcie_lanes((rdev), (l)) #define radeon_set_clock_gating(rdev, e) (rdev)->asic->set_clock_gating((rdev), (e)) +#define radeon_set_surface_reg(rdev, r, f, p, o, s) ((rdev)->asic->set_surface_reg((rdev), (r), (f), (p), (o), (s))) +#define radeon_clear_surface_reg(rdev, r) ((rdev)->asic->clear_surface_reg((rdev), (r))) #endif diff --git a/drivers/gpu/drm/radeon/radeon_asic.h b/drivers/gpu/drm/radeon/radeon_asic.h index e2e567395df8..dd903d329406 100644 --- a/drivers/gpu/drm/radeon/radeon_asic.h +++ b/drivers/gpu/drm/radeon/radeon_asic.h @@ -71,6 +71,10 @@ int r100_copy_blit(struct radeon_device *rdev, uint64_t dst_offset, unsigned num_pages, struct radeon_fence *fence); +int r100_set_surface_reg(struct radeon_device *rdev, int reg, + uint32_t tiling_flags, uint32_t pitch, + uint32_t offset, uint32_t obj_size); +int r100_clear_surface_reg(struct radeon_device *rdev, int reg); static struct radeon_asic r100_asic = { .init = &r100_init, @@ -100,6 +104,8 @@ static struct radeon_asic r100_asic = { .set_memory_clock = NULL, .set_pcie_lanes = NULL, .set_clock_gating = &radeon_legacy_set_clock_gating, + .set_surface_reg = r100_set_surface_reg, + .clear_surface_reg = r100_clear_surface_reg, }; @@ -128,6 +134,7 @@ int r300_copy_dma(struct radeon_device *rdev, uint64_t dst_offset, unsigned num_pages, struct radeon_fence *fence); + static struct radeon_asic r300_asic = { .init = &r300_init, .errata = &r300_errata, @@ -156,6 +163,8 @@ static struct radeon_asic r300_asic = { .set_memory_clock = NULL, .set_pcie_lanes = &rv370_set_pcie_lanes, .set_clock_gating = &radeon_legacy_set_clock_gating, + .set_surface_reg = r100_set_surface_reg, + .clear_surface_reg = r100_clear_surface_reg, }; /* @@ -193,6 +202,8 @@ static struct radeon_asic r420_asic = { .set_memory_clock = &radeon_atom_set_memory_clock, .set_pcie_lanes = &rv370_set_pcie_lanes, .set_clock_gating = &radeon_atom_set_clock_gating, + .set_surface_reg = r100_set_surface_reg, + .clear_surface_reg = r100_clear_surface_reg, }; @@ -237,6 +248,8 @@ static struct radeon_asic rs400_asic = { .set_memory_clock = NULL, .set_pcie_lanes = NULL, .set_clock_gating = &radeon_legacy_set_clock_gating, + .set_surface_reg = r100_set_surface_reg, + .clear_surface_reg = r100_clear_surface_reg, }; @@ -322,6 +335,8 @@ static struct radeon_asic rs690_asic = { .set_memory_clock = &radeon_atom_set_memory_clock, .set_pcie_lanes = NULL, .set_clock_gating = &radeon_atom_set_clock_gating, + .set_surface_reg = r100_set_surface_reg, + .clear_surface_reg = r100_clear_surface_reg, }; @@ -367,6 +382,8 @@ static struct radeon_asic rv515_asic = { .set_memory_clock = &radeon_atom_set_memory_clock, .set_pcie_lanes = &rv370_set_pcie_lanes, .set_clock_gating = &radeon_atom_set_clock_gating, + .set_surface_reg = r100_set_surface_reg, + .clear_surface_reg = r100_clear_surface_reg, }; @@ -405,6 +422,8 @@ static struct radeon_asic r520_asic = { .set_memory_clock = &radeon_atom_set_memory_clock, .set_pcie_lanes = &rv370_set_pcie_lanes, .set_clock_gating = &radeon_atom_set_clock_gating, + .set_surface_reg = r100_set_surface_reg, + .clear_surface_reg = r100_clear_surface_reg, }; /* diff --git a/drivers/gpu/drm/radeon/radeon_device.c b/drivers/gpu/drm/radeon/radeon_device.c index cdef6eb01baf..f23083bbba3f 100644 --- a/drivers/gpu/drm/radeon/radeon_device.c +++ b/drivers/gpu/drm/radeon/radeon_device.c @@ -48,6 +48,8 @@ static void radeon_surface_init(struct radeon_device *rdev) i * (RADEON_SURFACE1_INFO - RADEON_SURFACE0_INFO), 0); } + /* enable surfaces */ + WREG32(RADEON_SURFACE_CNTL, 0); } } diff --git a/drivers/gpu/drm/radeon/radeon_fb.c b/drivers/gpu/drm/radeon/radeon_fb.c index 260870a29d83..36d2f5588f20 100644 --- a/drivers/gpu/drm/radeon/radeon_fb.c +++ b/drivers/gpu/drm/radeon/radeon_fb.c @@ -471,10 +471,10 @@ static struct notifier_block paniced = { .notifier_call = radeonfb_panic, }; -static int radeon_align_pitch(struct radeon_device *rdev, int width, int bpp) +static int radeon_align_pitch(struct radeon_device *rdev, int width, int bpp, bool tiled) { int aligned = width; - int align_large = (ASIC_IS_AVIVO(rdev)); + int align_large = (ASIC_IS_AVIVO(rdev)) || tiled; int pitch_mask = 0; switch (bpp / 8) { @@ -512,12 +512,13 @@ int radeonfb_create(struct radeon_device *rdev, u64 fb_gpuaddr; void *fbptr = NULL; unsigned long tmp; + bool fb_tiled = false; /* useful for testing */ mode_cmd.width = surface_width; mode_cmd.height = surface_height; mode_cmd.bpp = 32; /* need to align pitch with crtc limits */ - mode_cmd.pitch = radeon_align_pitch(rdev, mode_cmd.width, mode_cmd.bpp) * ((mode_cmd.bpp + 1) / 8); + mode_cmd.pitch = radeon_align_pitch(rdev, mode_cmd.width, mode_cmd.bpp, fb_tiled) * ((mode_cmd.bpp + 1) / 8); mode_cmd.depth = 24; size = mode_cmd.pitch * mode_cmd.height; @@ -535,6 +536,8 @@ int radeonfb_create(struct radeon_device *rdev, } robj = gobj->driver_private; + if (fb_tiled) + radeon_object_set_tiling_flags(robj, RADEON_TILING_MACRO|RADEON_TILING_SURFACE, mode_cmd.pitch); mutex_lock(&rdev->ddev->struct_mutex); fb = radeon_framebuffer_create(rdev->ddev, &mode_cmd, gobj); if (fb == NULL) { @@ -563,6 +566,9 @@ int radeonfb_create(struct radeon_device *rdev, } rfbdev = info->par; + if (fb_tiled) + radeon_object_check_tiling(robj, 0, 0); + ret = radeon_object_kmap(robj, &fbptr); if (ret) { goto out_unref; diff --git a/drivers/gpu/drm/radeon/radeon_gem.c b/drivers/gpu/drm/radeon/radeon_gem.c index eb516034235d..12542087b298 100644 --- a/drivers/gpu/drm/radeon/radeon_gem.c +++ b/drivers/gpu/drm/radeon/radeon_gem.c @@ -285,3 +285,44 @@ int radeon_gem_wait_idle_ioctl(struct drm_device *dev, void *data, mutex_unlock(&dev->struct_mutex); return r; } + +int radeon_gem_set_tiling_ioctl(struct drm_device *dev, void *data, + struct drm_file *filp) +{ + struct drm_radeon_gem_set_tiling *args = data; + struct drm_gem_object *gobj; + struct radeon_object *robj; + int r = 0; + + DRM_DEBUG("%d \n", args->handle); + gobj = drm_gem_object_lookup(dev, filp, args->handle); + if (gobj == NULL) + return -EINVAL; + robj = gobj->driver_private; + radeon_object_set_tiling_flags(robj, args->tiling_flags, args->pitch); + mutex_lock(&dev->struct_mutex); + drm_gem_object_unreference(gobj); + mutex_unlock(&dev->struct_mutex); + return r; +} + +int radeon_gem_get_tiling_ioctl(struct drm_device *dev, void *data, + struct drm_file *filp) +{ + struct drm_radeon_gem_get_tiling *args = data; + struct drm_gem_object *gobj; + struct radeon_object *robj; + int r = 0; + + DRM_DEBUG("\n"); + gobj = drm_gem_object_lookup(dev, filp, args->handle); + if (gobj == NULL) + return -EINVAL; + robj = gobj->driver_private; + radeon_object_get_tiling_flags(robj, &args->tiling_flags, + &args->pitch); + mutex_lock(&dev->struct_mutex); + drm_gem_object_unreference(gobj); + mutex_unlock(&dev->struct_mutex); + return r; +} diff --git a/drivers/gpu/drm/radeon/radeon_kms.c b/drivers/gpu/drm/radeon/radeon_kms.c index 4612a7c146d1..937a2f1cdb46 100644 --- a/drivers/gpu/drm/radeon/radeon_kms.c +++ b/drivers/gpu/drm/radeon/radeon_kms.c @@ -291,5 +291,7 @@ struct drm_ioctl_desc radeon_ioctls_kms[] = { DRM_IOCTL_DEF(DRM_RADEON_GEM_WAIT_IDLE, radeon_gem_wait_idle_ioctl, DRM_AUTH), DRM_IOCTL_DEF(DRM_RADEON_CS, radeon_cs_ioctl, DRM_AUTH), DRM_IOCTL_DEF(DRM_RADEON_INFO, radeon_info_ioctl, DRM_AUTH), + DRM_IOCTL_DEF(DRM_RADEON_GEM_SET_TILING, radeon_gem_set_tiling_ioctl, DRM_AUTH), + DRM_IOCTL_DEF(DRM_RADEON_GEM_GET_TILING, radeon_gem_get_tiling_ioctl, DRM_AUTH), }; int radeon_max_kms_ioctl = DRM_ARRAY_SIZE(radeon_ioctls_kms); diff --git a/drivers/gpu/drm/radeon/radeon_legacy_crtc.c b/drivers/gpu/drm/radeon/radeon_legacy_crtc.c index 14c1a5107fc9..0613790e2a5b 100644 --- a/drivers/gpu/drm/radeon/radeon_legacy_crtc.c +++ b/drivers/gpu/drm/radeon/radeon_legacy_crtc.c @@ -235,6 +235,7 @@ int radeon_crtc_set_base(struct drm_crtc *crtc, int x, int y, uint64_t base; uint32_t crtc_offset, crtc_offset_cntl, crtc_tile_x0_y0 = 0; uint32_t crtc_pitch, pitch_pixels; + uint32_t tiling_flags; DRM_DEBUG("\n"); @@ -258,8 +259,12 @@ int radeon_crtc_set_base(struct drm_crtc *crtc, int x, int y, (crtc->fb->bits_per_pixel * 8)); crtc_pitch |= crtc_pitch << 16; - /* TODO tiling */ - if (0) { + radeon_object_get_tiling_flags(obj->driver_private, + &tiling_flags, NULL); + if (tiling_flags & RADEON_TILING_MICRO) + DRM_ERROR("trying to scanout microtiled buffer\n"); + + if (tiling_flags & RADEON_TILING_MACRO) { if (ASIC_IS_R300(rdev)) crtc_offset_cntl |= (R300_CRTC_X_Y_MODE_EN | R300_CRTC_MICRO_TILE_BUFFER_DIS | @@ -275,15 +280,13 @@ int radeon_crtc_set_base(struct drm_crtc *crtc, int x, int y, crtc_offset_cntl &= ~RADEON_CRTC_TILE_EN; } - - /* TODO more tiling */ - if (0) { + if (tiling_flags & RADEON_TILING_MACRO) { if (ASIC_IS_R300(rdev)) { crtc_tile_x0_y0 = x | (y << 16); base &= ~0x7ff; } else { int byteshift = crtc->fb->bits_per_pixel >> 4; - int tile_addr = (((y >> 3) * crtc->fb->width + x) >> (8 - byteshift)) << 11; + int tile_addr = (((y >> 3) * pitch_pixels + x) >> (8 - byteshift)) << 11; base += tile_addr + ((x << byteshift) % 256) + ((y % 8) << 8); crtc_offset_cntl |= (y % 16); } diff --git a/drivers/gpu/drm/radeon/radeon_object.c b/drivers/gpu/drm/radeon/radeon_object.c index bac0d06c52ac..d5b1fd562d88 100644 --- a/drivers/gpu/drm/radeon/radeon_object.c +++ b/drivers/gpu/drm/radeon/radeon_object.c @@ -44,6 +44,9 @@ struct radeon_object { uint64_t gpu_addr; void *kptr; bool is_iomem; + uint32_t tiling_flags; + uint32_t pitch; + int surface_reg; }; int radeon_ttm_init(struct radeon_device *rdev); @@ -70,6 +73,7 @@ static void radeon_ttm_object_object_destroy(struct ttm_buffer_object *tobj) robj = container_of(tobj, struct radeon_object, tobj); list_del_init(&robj->list); + radeon_object_clear_surface_reg(robj); kfree(robj); } @@ -141,6 +145,7 @@ int radeon_object_create(struct radeon_device *rdev, } robj->rdev = rdev; robj->gobj = gobj; + robj->surface_reg = -1; INIT_LIST_HEAD(&robj->list); flags = radeon_object_flags_from_domain(domain); @@ -435,6 +440,7 @@ int radeon_object_list_validate(struct list_head *head, void *fence) radeon_object_gpu_addr(robj); } lobj->gpu_offset = robj->gpu_addr; + lobj->tiling_flags = robj->tiling_flags; if (fence) { old_fence = (struct radeon_fence *)robj->tobj.sync_obj; robj->tobj.sync_obj = radeon_fence_ref(fence); @@ -479,3 +485,127 @@ unsigned long radeon_object_size(struct radeon_object *robj) { return robj->tobj.num_pages << PAGE_SHIFT; } + +int radeon_object_get_surface_reg(struct radeon_object *robj) +{ + struct radeon_device *rdev = robj->rdev; + struct radeon_surface_reg *reg; + struct radeon_object *old_object; + int steal; + int i; + + if (!robj->tiling_flags) + return 0; + + if (robj->surface_reg >= 0) { + reg = &rdev->surface_regs[robj->surface_reg]; + i = robj->surface_reg; + goto out; + } + + steal = -1; + for (i = 0; i < RADEON_GEM_MAX_SURFACES; i++) { + + reg = &rdev->surface_regs[i]; + if (!reg->robj) + break; + + old_object = reg->robj; + if (old_object->pin_count == 0) + steal = i; + } + + /* if we are all out */ + if (i == RADEON_GEM_MAX_SURFACES) { + if (steal == -1) + return -ENOMEM; + /* find someone with a surface reg and nuke their BO */ + reg = &rdev->surface_regs[steal]; + old_object = reg->robj; + /* blow away the mapping */ + DRM_DEBUG("stealing surface reg %d from %p\n", steal, old_object); + ttm_bo_unmap_virtual(&old_object->tobj); + old_object->surface_reg = -1; + i = steal; + } + + robj->surface_reg = i; + reg->robj = robj; + +out: + radeon_set_surface_reg(rdev, i, robj->tiling_flags, robj->pitch, + robj->tobj.mem.mm_node->start << PAGE_SHIFT, + robj->tobj.num_pages << PAGE_SHIFT); + return 0; +} + +void radeon_object_clear_surface_reg(struct radeon_object *robj) +{ + struct radeon_device *rdev = robj->rdev; + struct radeon_surface_reg *reg; + + if (robj->surface_reg == -1) + return; + + reg = &rdev->surface_regs[robj->surface_reg]; + radeon_clear_surface_reg(rdev, robj->surface_reg); + + reg->robj = NULL; + robj->surface_reg = -1; +} + +void radeon_object_set_tiling_flags(struct radeon_object *robj, + uint32_t tiling_flags, uint32_t pitch) +{ + robj->tiling_flags = tiling_flags; + robj->pitch = pitch; +} + +void radeon_object_get_tiling_flags(struct radeon_object *robj, + uint32_t *tiling_flags, + uint32_t *pitch) +{ + if (tiling_flags) + *tiling_flags = robj->tiling_flags; + if (pitch) + *pitch = robj->pitch; +} + +int radeon_object_check_tiling(struct radeon_object *robj, bool has_moved, + bool force_drop) +{ + if (!(robj->tiling_flags & RADEON_TILING_SURFACE)) + return 0; + + if (force_drop) { + radeon_object_clear_surface_reg(robj); + return 0; + } + + if (robj->tobj.mem.mem_type != TTM_PL_VRAM) { + if (!has_moved) + return 0; + + if (robj->surface_reg >= 0) + radeon_object_clear_surface_reg(robj); + return 0; + } + + if ((robj->surface_reg >= 0) && !has_moved) + return 0; + + return radeon_object_get_surface_reg(robj); +} + +void radeon_bo_move_notify(struct ttm_buffer_object *bo, + struct ttm_mem_reg *mem) +{ + struct radeon_object *robj = container_of(bo, struct radeon_object, tobj); + radeon_object_check_tiling(robj, 0, 1); +} + +void radeon_bo_fault_reserve_notify(struct ttm_buffer_object *bo) +{ + struct radeon_object *robj = container_of(bo, struct radeon_object, tobj); + radeon_object_check_tiling(robj, 0, 0); +} diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c index 4ca9aa9203d0..37e1cbcce3a9 100644 --- a/drivers/gpu/drm/radeon/radeon_ttm.c +++ b/drivers/gpu/drm/radeon/radeon_ttm.c @@ -429,6 +429,8 @@ static struct ttm_bo_driver radeon_bo_driver = { .sync_obj_flush = &radeon_sync_obj_flush, .sync_obj_unref = &radeon_sync_obj_unref, .sync_obj_ref = &radeon_sync_obj_ref, + .move_notify = &radeon_bo_move_notify, + .fault_reserve_notify = &radeon_bo_fault_reserve_notify, }; int radeon_ttm_init(struct radeon_device *rdev) diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index e55e7972c897..6538d4236989 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -43,7 +43,6 @@ #define TTM_BO_HASH_ORDER 13 static int ttm_bo_setup_vm(struct ttm_buffer_object *bo); -static void ttm_bo_unmap_virtual(struct ttm_buffer_object *bo); static int ttm_bo_swapout(struct ttm_mem_shrink *shrink); static inline uint32_t ttm_bo_type_flags(unsigned type) @@ -307,6 +306,9 @@ static int ttm_bo_handle_move_mem(struct ttm_buffer_object *bo, } + if (bdev->driver->move_notify) + bdev->driver->move_notify(bo, mem); + if (!(old_man->flags & TTM_MEMTYPE_FLAG_FIXED) && !(new_man->flags & TTM_MEMTYPE_FLAG_FIXED)) ret = ttm_bo_move_ttm(bo, evict, no_wait, mem); @@ -1451,6 +1453,7 @@ void ttm_bo_unmap_virtual(struct ttm_buffer_object *bo) unmap_mapping_range(bdev->dev_mapping, offset, holelen, 1); } +EXPORT_SYMBOL(ttm_bo_unmap_virtual); static void ttm_bo_vm_insert_rb(struct ttm_buffer_object *bo) { diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c index 40b75032ea47..41c907f6c560 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c @@ -101,6 +101,9 @@ static int ttm_bo_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return VM_FAULT_NOPAGE; } + if (bdev->driver->fault_reserve_notify) + bdev->driver->fault_reserve_notify(bo); + /* * Wait for buffer data in transit, due to a pipelined * move. diff --git a/include/drm/radeon_drm.h b/include/drm/radeon_drm.h index 41862e9a4c20..af4b4826997e 100644 --- a/include/drm/radeon_drm.h +++ b/include/drm/radeon_drm.h @@ -506,6 +506,8 @@ typedef struct { #define DRM_RADEON_GEM_WAIT_IDLE 0x24 #define DRM_RADEON_CS 0x26 #define DRM_RADEON_INFO 0x27 +#define DRM_RADEON_GEM_SET_TILING 0x28 +#define DRM_RADEON_GEM_GET_TILING 0x29 #define DRM_IOCTL_RADEON_CP_INIT DRM_IOW( DRM_COMMAND_BASE + DRM_RADEON_CP_INIT, drm_radeon_init_t) #define DRM_IOCTL_RADEON_CP_START DRM_IO( DRM_COMMAND_BASE + DRM_RADEON_CP_START) @@ -544,7 +546,8 @@ typedef struct { #define DRM_IOCTL_RADEON_GEM_WAIT_IDLE DRM_IOW(DRM_COMMAND_BASE + DRM_RADEON_GEM_WAIT_IDLE, struct drm_radeon_gem_wait_idle) #define DRM_IOCTL_RADEON_CS DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_CS, struct drm_radeon_cs) #define DRM_IOCTL_RADEON_INFO DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_INFO, struct drm_radeon_info) - +#define DRM_IOCTL_RADEON_SET_TILING DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_SET_TILING, struct drm_radeon_gem_set_tiling) +#define DRM_IOCTL_RADEON_GET_TILING DRM_IOWR(DRM_COMMAND_BASE + DRM_RADEON_GEM_GET_TILING, struct drm_radeon_gem_get_tiling) typedef struct drm_radeon_init { enum { @@ -796,6 +799,24 @@ struct drm_radeon_gem_create { uint32_t flags; }; +#define RADEON_TILING_MACRO 0x1 +#define RADEON_TILING_MICRO 0x2 +#define RADEON_TILING_SWAP 0x4 +#define RADEON_TILING_SURFACE 0x8 /* this object requires a surface + * when mapped - i.e. front buffer */ + +struct drm_radeon_gem_set_tiling { + uint32_t handle; + uint32_t tiling_flags; + uint32_t pitch; +}; + +struct drm_radeon_gem_get_tiling { + uint32_t handle; + uint32_t tiling_flags; + uint32_t pitch; +}; + struct drm_radeon_gem_mmap { uint32_t handle; uint32_t pad; diff --git a/include/drm/ttm/ttm_bo_driver.h b/include/drm/ttm/ttm_bo_driver.h index ea83dd23a4d7..a68829db381a 100644 --- a/include/drm/ttm/ttm_bo_driver.h +++ b/include/drm/ttm/ttm_bo_driver.h @@ -354,6 +354,14 @@ struct ttm_bo_driver { int (*sync_obj_flush) (void *sync_obj, void *sync_arg); void (*sync_obj_unref) (void **sync_obj); void *(*sync_obj_ref) (void *sync_obj); + + /* hook to notify driver about a driver move so it + * can do tiling things */ + void (*move_notify)(struct ttm_buffer_object *bo, + struct ttm_mem_reg *new_mem); + /* notify the driver we are taking a fault on this BO + * and have reserved it */ + void (*fault_reserve_notify)(struct ttm_buffer_object *bo); }; #define TTM_NUM_MEM_TYPES 8 @@ -653,6 +661,13 @@ extern int ttm_bo_device_init(struct ttm_bo_device *bdev, struct ttm_bo_driver *driver, uint64_t file_page_offset, bool need_dma32); +/** + * ttm_bo_unmap_virtual + * + * @bo: tear down the virtual mappings for this BO + */ +extern void ttm_bo_unmap_virtual(struct ttm_buffer_object *bo); + /** * ttm_bo_reserve: * -- cgit v1.2.3-59-g8ed1b From dddac6a7b445de95515f64fdf82fe5dc36c02f26 Mon Sep 17 00:00:00 2001 From: Alan Jenkins Date: Wed, 29 Jul 2009 21:07:55 +0200 Subject: PM / Hibernate: Replace bdget call with simple atomic_inc of i_count Create bdgrab(). This function copies an existing reference to a block_device. It is safe to call from any context. Hibernation code wishes to copy a reference to the active swap device. Right now it calls bdget() under a spinlock, but this is wrong because bdget() can sleep. It doesn't need a full bdget() because we already hold a reference to active swap devices (and the spinlock protects against swapoff). Fixes http://bugzilla.kernel.org/show_bug.cgi?id=13827 Signed-off-by: Alan Jenkins Signed-off-by: Rafael J. Wysocki --- fs/block_dev.c | 10 ++++++++++ include/linux/fs.h | 1 + mm/swapfile.c | 4 ++-- 3 files changed, 13 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/fs/block_dev.c b/fs/block_dev.c index 3a6d4fb2a329..94dfda24c06e 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -564,6 +564,16 @@ struct block_device *bdget(dev_t dev) EXPORT_SYMBOL(bdget); +/** + * bdgrab -- Grab a reference to an already referenced block device + * @bdev: Block device to grab a reference to. + */ +struct block_device *bdgrab(struct block_device *bdev) +{ + atomic_inc(&bdev->bd_inode->i_count); + return bdev; +} + long nr_blockdev_pages(void) { struct block_device *bdev; diff --git a/include/linux/fs.h b/include/linux/fs.h index 0872372184fe..a36ffa5a77a4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1946,6 +1946,7 @@ extern void putname(const char *name); extern int register_blkdev(unsigned int, const char *); extern void unregister_blkdev(unsigned int, const char *); extern struct block_device *bdget(dev_t); +extern struct block_device *bdgrab(struct block_device *bdev); extern void bd_set_size(struct block_device *, loff_t size); extern void bd_forget(struct inode *inode); extern void bdput(struct block_device *); diff --git a/mm/swapfile.c b/mm/swapfile.c index d1ade1a48ee7..8ffdc0d23c53 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -753,7 +753,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) if (!bdev) { if (bdev_p) - *bdev_p = bdget(sis->bdev->bd_dev); + *bdev_p = bdgrab(sis->bdev); spin_unlock(&swap_lock); return i; @@ -765,7 +765,7 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) struct swap_extent, list); if (se->start_block == offset) { if (bdev_p) - *bdev_p = bdget(sis->bdev->bd_dev); + *bdev_p = bdgrab(sis->bdev); spin_unlock(&swap_lock); bdput(bdev); -- cgit v1.2.3-59-g8ed1b From e043e42bdb66885b3ac10d27a01ccb9972e2b0a3 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Wed, 29 Jul 2009 12:15:56 -0700 Subject: pty: avoid forcing 'low_latency' tty flag We really don't want to mark the pty as a low-latency device, because as Alan points out, the ->write method can be called from an IRQ (ppp?), and that means we can't use ->low_latency=1 as we take mutexes in the low_latency case. So rather than using low_latency to force the written data to be pushed to the ldisc handling at 'write()' time, just make the reader side (or the poll function) do the flush when it checks whether there is data to be had. This also fixes the problem with lost data in an emacs compile buffer (bugzilla 13815), and we can thus revert the low_latency pty hack (commit 3a54297478e6578f96fd54bf4daa1751130aca86: "pty: quickfix for the pty ENXIO timing problems"). Signed-off-by: OGAWA Hirofumi Tested-by: Aneesh Kumar K.V [ Modified to do the tty_flush_to_ldisc() inside input_available_p() so that it triggers for both read and poll() - Linus] Signed-off-by: Linus Torvalds --- drivers/char/n_tty.c | 1 + drivers/char/pty.c | 2 -- drivers/char/tty_buffer.c | 13 +++++++++++++ include/linux/tty.h | 1 + 4 files changed, 15 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/drivers/char/n_tty.c b/drivers/char/n_tty.c index ff47907ff1bf..973be2f44195 100644 --- a/drivers/char/n_tty.c +++ b/drivers/char/n_tty.c @@ -1583,6 +1583,7 @@ static int n_tty_open(struct tty_struct *tty) static inline int input_available_p(struct tty_struct *tty, int amt) { + tty_flush_to_ldisc(tty); if (tty->icanon) { if (tty->canon_data) return 1; diff --git a/drivers/char/pty.c b/drivers/char/pty.c index 3850a68f265a..6e6942c45f5b 100644 --- a/drivers/char/pty.c +++ b/drivers/char/pty.c @@ -52,7 +52,6 @@ static void pty_close(struct tty_struct *tty, struct file *filp) return; tty->link->packet = 0; set_bit(TTY_OTHER_CLOSED, &tty->link->flags); - tty_flip_buffer_push(tty->link); wake_up_interruptible(&tty->link->read_wait); wake_up_interruptible(&tty->link->write_wait); if (tty->driver->subtype == PTY_TYPE_MASTER) { @@ -208,7 +207,6 @@ static int pty_open(struct tty_struct *tty, struct file *filp) clear_bit(TTY_OTHER_CLOSED, &tty->link->flags); set_bit(TTY_THROTTLED, &tty->flags); retval = 0; - tty->low_latency = 1; out: return retval; } diff --git a/drivers/char/tty_buffer.c b/drivers/char/tty_buffer.c index 810ee25d66a4..3108991c5c8b 100644 --- a/drivers/char/tty_buffer.c +++ b/drivers/char/tty_buffer.c @@ -461,6 +461,19 @@ static void flush_to_ldisc(struct work_struct *work) tty_ldisc_deref(disc); } +/** + * tty_flush_to_ldisc + * @tty: tty to push + * + * Push the terminal flip buffers to the line discipline. + * + * Must not be called from IRQ context. + */ +void tty_flush_to_ldisc(struct tty_struct *tty) +{ + flush_to_ldisc(&tty->buf.work.work); +} + /** * tty_flip_buffer_push - terminal * @tty: tty to push diff --git a/include/linux/tty.h b/include/linux/tty.h index 1488d8c81aac..e8c6c9136c97 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -394,6 +394,7 @@ extern void __do_SAK(struct tty_struct *tty); extern void disassociate_ctty(int priv); extern void no_tty(void); extern void tty_flip_buffer_push(struct tty_struct *tty); +extern void tty_flush_to_ldisc(struct tty_struct *tty); extern void tty_buffer_free_all(struct tty_struct *tty); extern void tty_buffer_flush(struct tty_struct *tty); extern void tty_buffer_init(struct tty_struct *tty); -- cgit v1.2.3-59-g8ed1b From 096b7fe012d66ed55e98bc8022405ede0cc80e96 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 29 Jul 2009 15:04:04 -0700 Subject: cgroups: fix pid namespace bug The bug was introduced by commit cc31edceee04a7b87f2be48f9489ebb72d264844 ("cgroups: convert tasks file to use a seq_file with shared pid array"). We cache a pid array for all threads that are opening the same "tasks" file, but the pids in the array are always from the namespace of the last process that opened the file, so all other threads will read pids from that namespace instead of their own namespaces. To fix it, we maintain a list of pid arrays, which is keyed by pid_ns. The list will be of length 1 at most time. Reported-by: Paul Menage Idea-by: Paul Menage Signed-off-by: Li Zefan Reviewed-by: Serge Hallyn Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 11 +++--- kernel/cgroup.c | 96 +++++++++++++++++++++++++++++++++++++------------- 2 files changed, 76 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 665fa70e4094..20411d2876f8 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -179,14 +179,11 @@ struct cgroup { */ struct list_head release_list; - /* pids_mutex protects the fields below */ + /* pids_mutex protects pids_list and cached pid arrays. */ struct rw_semaphore pids_mutex; - /* Array of process ids in the cgroup */ - pid_t *tasks_pids; - /* How many files are using the current tasks_pids array */ - int pids_use_count; - /* Length of the current tasks_pids array */ - int pids_length; + + /* Linked list of struct cgroup_pids */ + struct list_head pids_list; /* For RCU-protected deletion */ struct rcu_head rcu_head; diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3737a682cdf5..250dac05680f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -47,6 +47,7 @@ #include #include #include +#include #include @@ -960,6 +961,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->children); INIT_LIST_HEAD(&cgrp->css_sets); INIT_LIST_HEAD(&cgrp->release_list); + INIT_LIST_HEAD(&cgrp->pids_list); init_rwsem(&cgrp->pids_mutex); } static void init_cgroup_root(struct cgroupfs_root *root) @@ -2201,12 +2203,30 @@ err: return ret; } +/* + * Cache pids for all threads in the same pid namespace that are + * opening the same "tasks" file. + */ +struct cgroup_pids { + /* The node in cgrp->pids_list */ + struct list_head list; + /* The cgroup those pids belong to */ + struct cgroup *cgrp; + /* The namepsace those pids belong to */ + struct pid_namespace *ns; + /* Array of process ids in the cgroup */ + pid_t *tasks_pids; + /* How many files are using the this tasks_pids array */ + int use_count; + /* Length of the current tasks_pids array */ + int length; +}; + static int cmppid(const void *a, const void *b) { return *(pid_t *)a - *(pid_t *)b; } - /* * seq_file methods for the "tasks" file. The seq_file position is the * next pid to display; the seq_file iterator is a pointer to the pid @@ -2221,45 +2241,47 @@ static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos) * after a seek to the start). Use a binary-search to find the * next pid to display, if any */ - struct cgroup *cgrp = s->private; + struct cgroup_pids *cp = s->private; + struct cgroup *cgrp = cp->cgrp; int index = 0, pid = *pos; int *iter; down_read(&cgrp->pids_mutex); if (pid) { - int end = cgrp->pids_length; + int end = cp->length; while (index < end) { int mid = (index + end) / 2; - if (cgrp->tasks_pids[mid] == pid) { + if (cp->tasks_pids[mid] == pid) { index = mid; break; - } else if (cgrp->tasks_pids[mid] <= pid) + } else if (cp->tasks_pids[mid] <= pid) index = mid + 1; else end = mid; } } /* If we're off the end of the array, we're done */ - if (index >= cgrp->pids_length) + if (index >= cp->length) return NULL; /* Update the abstract position to be the actual pid that we found */ - iter = cgrp->tasks_pids + index; + iter = cp->tasks_pids + index; *pos = *iter; return iter; } static void cgroup_tasks_stop(struct seq_file *s, void *v) { - struct cgroup *cgrp = s->private; + struct cgroup_pids *cp = s->private; + struct cgroup *cgrp = cp->cgrp; up_read(&cgrp->pids_mutex); } static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos) { - struct cgroup *cgrp = s->private; + struct cgroup_pids *cp = s->private; int *p = v; - int *end = cgrp->tasks_pids + cgrp->pids_length; + int *end = cp->tasks_pids + cp->length; /* * Advance to the next pid in the array. If this goes off the @@ -2286,26 +2308,33 @@ static struct seq_operations cgroup_tasks_seq_operations = { .show = cgroup_tasks_show, }; -static void release_cgroup_pid_array(struct cgroup *cgrp) +static void release_cgroup_pid_array(struct cgroup_pids *cp) { + struct cgroup *cgrp = cp->cgrp; + down_write(&cgrp->pids_mutex); - BUG_ON(!cgrp->pids_use_count); - if (!--cgrp->pids_use_count) { - kfree(cgrp->tasks_pids); - cgrp->tasks_pids = NULL; - cgrp->pids_length = 0; + BUG_ON(!cp->use_count); + if (!--cp->use_count) { + list_del(&cp->list); + put_pid_ns(cp->ns); + kfree(cp->tasks_pids); + kfree(cp); } up_write(&cgrp->pids_mutex); } static int cgroup_tasks_release(struct inode *inode, struct file *file) { - struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); + struct seq_file *seq; + struct cgroup_pids *cp; if (!(file->f_mode & FMODE_READ)) return 0; - release_cgroup_pid_array(cgrp); + seq = file->private_data; + cp = seq->private; + + release_cgroup_pid_array(cp); return seq_release(inode, file); } @@ -2324,6 +2353,8 @@ static struct file_operations cgroup_tasks_operations = { static int cgroup_tasks_open(struct inode *unused, struct file *file) { struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); + struct pid_namespace *ns = current->nsproxy->pid_ns; + struct cgroup_pids *cp; pid_t *pidarray; int npids; int retval; @@ -2350,20 +2381,37 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file) * array if necessary */ down_write(&cgrp->pids_mutex); - kfree(cgrp->tasks_pids); - cgrp->tasks_pids = pidarray; - cgrp->pids_length = npids; - cgrp->pids_use_count++; + + list_for_each_entry(cp, &cgrp->pids_list, list) { + if (ns == cp->ns) + goto found; + } + + cp = kzalloc(sizeof(*cp), GFP_KERNEL); + if (!cp) { + up_write(&cgrp->pids_mutex); + kfree(pidarray); + return -ENOMEM; + } + cp->cgrp = cgrp; + cp->ns = ns; + get_pid_ns(ns); + list_add(&cp->list, &cgrp->pids_list); +found: + kfree(cp->tasks_pids); + cp->tasks_pids = pidarray; + cp->length = npids; + cp->use_count++; up_write(&cgrp->pids_mutex); file->f_op = &cgroup_tasks_operations; retval = seq_open(file, &cgroup_tasks_seq_operations); if (retval) { - release_cgroup_pid_array(cgrp); + release_cgroup_pid_array(cp); return retval; } - ((struct seq_file *)file->private_data)->private = cgrp; + ((struct seq_file *)file->private_data)->private = cp; return 0; } -- cgit v1.2.3-59-g8ed1b From 887032670d47366a8c8f25396ea7c14b7b2cc620 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Wed, 29 Jul 2009 15:04:06 -0700 Subject: cgroup avoid permanent sleep at rmdir After commit ec64f51545fffbc4cb968f0cea56341a4b07e85a ("cgroup: fix frequent -EBUSY at rmdir"), cgroup's rmdir (especially against memcg) doesn't return -EBUSY by temporary ref counts. That commit expects all refs after pre_destroy() is temporary but...it wasn't. Then, rmdir can wait permanently. This patch tries to fix that and change followings. - set CGRP_WAIT_ON_RMDIR flag before pre_destroy(). - clear CGRP_WAIT_ON_RMDIR flag when the subsys finds racy case. if there are sleeping ones, wakes them up. - rmdir() sleeps only when CGRP_WAIT_ON_RMDIR flag is set. Tested-by: Daisuke Nishimura Reported-by: Daisuke Nishimura Reviewed-by: Paul Menage Acked-by: Balbir Sigh Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/cgroup.h | 17 ++++++++++++++++ kernel/cgroup.c | 55 ++++++++++++++++++++++++++++++++++---------------- mm/memcontrol.c | 23 ++++++++++++++++++--- 3 files changed, 75 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 20411d2876f8..90bba9e62286 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -362,6 +362,23 @@ int cgroup_task_count(const struct cgroup *cgrp); /* Return true if cgrp is a descendant of the task's cgroup */ int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task); +/* + * When the subsys has to access css and may add permanent refcnt to css, + * it should take care of racy conditions with rmdir(). Following set of + * functions, is for stop/restart rmdir if necessary. + * Because these will call css_get/put, "css" should be alive css. + * + * cgroup_exclude_rmdir(); + * ...do some jobs which may access arbitrary empty cgroup + * cgroup_release_and_wakeup_rmdir(); + * + * When someone removes a cgroup while cgroup_exclude_rmdir() holds it, + * it sleeps and cgroup_release_and_wakeup_rmdir() will wake him up. + */ + +void cgroup_exclude_rmdir(struct cgroup_subsys_state *css); +void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css); + /* * Control Group subsystem type. * See Documentation/cgroups/cgroups.txt for details diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 250dac05680f..b6eadfe30e7b 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -735,16 +735,28 @@ static void cgroup_d_remove_dir(struct dentry *dentry) * reference to css->refcnt. In general, this refcnt is expected to goes down * to zero, soon. * - * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex; + * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; */ DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); -static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp) +static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) { - if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) + if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) wake_up_all(&cgroup_rmdir_waitq); } +void cgroup_exclude_rmdir(struct cgroup_subsys_state *css) +{ + css_get(css); +} + +void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) +{ + cgroup_wakeup_rmdir_waiter(css->cgroup); + css_put(css); +} + + static int rebind_subsystems(struct cgroupfs_root *root, unsigned long final_bits) { @@ -1359,7 +1371,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) * wake up rmdir() waiter. the rmdir should fail since the cgroup * is no longer empty. */ - cgroup_wakeup_rmdir_waiters(cgrp); + cgroup_wakeup_rmdir_waiter(cgrp); return 0; } @@ -2743,34 +2755,43 @@ again: } mutex_unlock(&cgroup_mutex); + /* + * In general, subsystem has no css->refcnt after pre_destroy(). But + * in racy cases, subsystem may have to get css->refcnt after + * pre_destroy() and it makes rmdir return with -EBUSY. This sometimes + * make rmdir return -EBUSY too often. To avoid that, we use waitqueue + * for cgroup's rmdir. CGRP_WAIT_ON_RMDIR is for synchronizing rmdir + * and subsystem's reference count handling. Please see css_get/put + * and css_tryget() and cgroup_wakeup_rmdir_waiter() implementation. + */ + set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); + /* * Call pre_destroy handlers of subsys. Notify subsystems * that rmdir() request comes. */ ret = cgroup_call_pre_destroy(cgrp); - if (ret) + if (ret) { + clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); return ret; + } mutex_lock(&cgroup_mutex); parent = cgrp->parent; if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { + clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); mutex_unlock(&cgroup_mutex); return -EBUSY; } - /* - * css_put/get is provided for subsys to grab refcnt to css. In typical - * case, subsystem has no reference after pre_destroy(). But, under - * hierarchy management, some *temporal* refcnt can be hold. - * To avoid returning -EBUSY to a user, waitqueue is used. If subsys - * is really busy, it should return -EBUSY at pre_destroy(). wake_up - * is called when css_put() is called and refcnt goes down to 0. - */ - set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE); - if (!cgroup_clear_css_refs(cgrp)) { mutex_unlock(&cgroup_mutex); - schedule(); + /* + * Because someone may call cgroup_wakeup_rmdir_waiter() before + * prepare_to_wait(), we need to check this flag. + */ + if (test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)) + schedule(); finish_wait(&cgroup_rmdir_waitq, &wait); clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); if (signal_pending(current)) @@ -3342,7 +3363,7 @@ void __css_put(struct cgroup_subsys_state *css) set_bit(CGRP_RELEASABLE, &cgrp->flags); check_for_release(cgrp); } - cgroup_wakeup_rmdir_waiters(cgrp); + cgroup_wakeup_rmdir_waiter(cgrp); } rcu_read_unlock(); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e717964cb5a0..fd4529d86de5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1207,6 +1207,12 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, ret = 0; out: unlock_page_cgroup(pc); + /* + * We charges against "to" which may not have any tasks. Then, "to" + * can be under rmdir(). But in current implementation, caller of + * this function is just force_empty() and it's garanteed that + * "to" is never removed. So, we don't check rmdir status here. + */ return ret; } @@ -1428,6 +1434,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, return; if (!ptr) return; + cgroup_exclude_rmdir(&ptr->css); pc = lookup_page_cgroup(page); mem_cgroup_lru_del_before_commit_swapcache(page); __mem_cgroup_commit_charge(ptr, pc, ctype); @@ -1457,8 +1464,12 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, } rcu_read_unlock(); } - /* add this page(page_cgroup) to the LRU we want. */ - + /* + * At swapin, we may charge account against cgroup which has no tasks. + * So, rmdir()->pre_destroy() can be called while we do this charge. + * In that case, we need to call pre_destroy() again. check it here. + */ + cgroup_release_and_wakeup_rmdir(&ptr->css); } void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) @@ -1664,7 +1675,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, if (!mem) return; - + cgroup_exclude_rmdir(&mem->css); /* at migration success, oldpage->mapping is NULL. */ if (oldpage->mapping) { target = oldpage; @@ -1704,6 +1715,12 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, */ if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) mem_cgroup_uncharge_page(target); + /* + * At migration, we may charge account against cgroup which has no tasks + * So, rmdir()->pre_destroy() can be called while we do this charge. + * In that case, we need to call pre_destroy() again. check it here. + */ + cgroup_release_and_wakeup_rmdir(&mem->css); } /* -- cgit v1.2.3-59-g8ed1b From f5a55efa140f5e9c9dd0f398fef54f20cdb74ec9 Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Wed, 29 Jul 2009 15:04:12 -0700 Subject: pps.h needs Found with make headers_check /usr/include/linux/pps.h:52: found __[us]{8,16,32,64} type without #include Signed-off-by: Dave Jones Cc: Rodolfo Giometti Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pps.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/pps.h b/include/linux/pps.h index cfe5c7214ec6..0194ab06177b 100644 --- a/include/linux/pps.h +++ b/include/linux/pps.h @@ -22,6 +22,8 @@ #ifndef _PPS_H_ #define _PPS_H_ +#include + #define PPS_VERSION "5.3.6" #define PPS_MAX_SOURCES 16 /* should be enough... */ -- cgit v1.2.3-59-g8ed1b From 534acc057b5a08ec33fa57cdd2f5a09ef124e7f2 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 29 Jul 2009 15:04:18 -0700 Subject: lib: flexible array implementation Once a structure goes over PAGE_SIZE*2, we see occasional allocation failures. Some people have chosen to switch over to things like vmalloc() that will let them keep array-like access to such a large structures. But, vmalloc() has plenty of downsides. Here's an alternative. I think it's what Andrew was suggesting here: http://lkml.org/lkml/2009/7/2/518 I call it a flexible array. It does all of its work in PAGE_SIZE bits, so never does an order>0 allocation. The base level has PAGE_SIZE-2*sizeof(int) bytes of storage for pointers to the second level. So, with a 32-bit arch, you get about 4MB (4183112 bytes) of total storage when the objects pack nicely into a page. It is half that on 64-bit because the pointers are twice the size. There's a table detailing this in the code. There are kerneldocs for the functions, but here's an overview: flex_array_alloc() - dynamically allocate a base structure flex_array_free() - free the array and all of the second-level pages flex_array_free_parts() - free the second-level pages, but not the base (for static bases) flex_array_put() - copy into the array at the given index flex_array_get() - copy out of the array at the given index flex_array_prealloc() - preallocate the second-level pages between the given indexes to guarantee no allocs will occur at put() time. We could also potentially just pass the "element_size" into each of the API functions instead of storing it internally. That would get us one more base pointer on 32-bit. I've been testing this by running it in userspace. The header and patch that I've been using are here, as well as the little script I'm using to generate the size table which goes in the kerneldocs. http://sr71.net/~dave/linux/flexarray/ [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Dave Hansen Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/flex_array.h | 47 ++++++++ lib/Makefile | 2 +- lib/flex_array.c | 269 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 317 insertions(+), 1 deletion(-) create mode 100644 include/linux/flex_array.h create mode 100644 lib/flex_array.c (limited to 'include') diff --git a/include/linux/flex_array.h b/include/linux/flex_array.h new file mode 100644 index 000000000000..23c1ec79a31b --- /dev/null +++ b/include/linux/flex_array.h @@ -0,0 +1,47 @@ +#ifndef _FLEX_ARRAY_H +#define _FLEX_ARRAY_H + +#include +#include + +#define FLEX_ARRAY_PART_SIZE PAGE_SIZE +#define FLEX_ARRAY_BASE_SIZE PAGE_SIZE + +struct flex_array_part; + +/* + * This is meant to replace cases where an array-like + * structure has gotten too big to fit into kmalloc() + * and the developer is getting tempted to use + * vmalloc(). + */ + +struct flex_array { + union { + struct { + int element_size; + int total_nr_elements; + struct flex_array_part *parts[0]; + }; + /* + * This little trick makes sure that + * sizeof(flex_array) == PAGE_SIZE + */ + char padding[FLEX_ARRAY_BASE_SIZE]; + }; +}; + +#define FLEX_ARRAY_INIT(size, total) { { {\ + .element_size = (size), \ + .total_nr_elements = (total), \ +} } } + +struct flex_array *flex_array_alloc(int element_size, int total, gfp_t flags); +int flex_array_prealloc(struct flex_array *fa, int start, int end, gfp_t flags); +void flex_array_free(struct flex_array *fa); +void flex_array_free_parts(struct flex_array *fa); +int flex_array_put(struct flex_array *fa, int element_nr, void *src, + gfp_t flags); +void *flex_array_get(struct flex_array *fa, int element_nr); + +#endif /* _FLEX_ARRAY_H */ diff --git a/lib/Makefile b/lib/Makefile index b6d1857bbf08..2e78277eff9d 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -12,7 +12,7 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \ idr.o int_sqrt.o extable.o prio_tree.o \ sha1.o irq_regs.o reciprocal_div.o argv_split.o \ proportions.o prio_heap.o ratelimit.o show_mem.o \ - is_single_threaded.o plist.o decompress.o + is_single_threaded.o plist.o decompress.o flex_array.o lib-$(CONFIG_MMU) += ioremap.o lib-$(CONFIG_SMP) += cpumask.o diff --git a/lib/flex_array.c b/lib/flex_array.c new file mode 100644 index 000000000000..0e7894ce8882 --- /dev/null +++ b/lib/flex_array.c @@ -0,0 +1,269 @@ +/* + * Flexible array managed in PAGE_SIZE parts + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright IBM Corporation, 2009 + * + * Author: Dave Hansen + */ + +#include +#include +#include + +struct flex_array_part { + char elements[FLEX_ARRAY_PART_SIZE]; +}; + +static inline int __elements_per_part(int element_size) +{ + return FLEX_ARRAY_PART_SIZE / element_size; +} + +static inline int bytes_left_in_base(void) +{ + int element_offset = offsetof(struct flex_array, parts); + int bytes_left = FLEX_ARRAY_BASE_SIZE - element_offset; + return bytes_left; +} + +static inline int nr_base_part_ptrs(void) +{ + return bytes_left_in_base() / sizeof(struct flex_array_part *); +} + +/* + * If a user requests an allocation which is small + * enough, we may simply use the space in the + * flex_array->parts[] array to store the user + * data. + */ +static inline int elements_fit_in_base(struct flex_array *fa) +{ + int data_size = fa->element_size * fa->total_nr_elements; + if (data_size <= bytes_left_in_base()) + return 1; + return 0; +} + +/** + * flex_array_alloc - allocate a new flexible array + * @element_size: the size of individual elements in the array + * @total: total number of elements that this should hold + * + * Note: all locking must be provided by the caller. + * + * @total is used to size internal structures. If the user ever + * accesses any array indexes >=@total, it will produce errors. + * + * The maximum number of elements is defined as: the number of + * elements that can be stored in a page times the number of + * page pointers that we can fit in the base structure or (using + * integer math): + * + * (PAGE_SIZE/element_size) * (PAGE_SIZE-8)/sizeof(void *) + * + * Here's a table showing example capacities. Note that the maximum + * index that the get/put() functions is just nr_objects-1. This + * basically means that you get 4MB of storage on 32-bit and 2MB on + * 64-bit. + * + * + * Element size | Objects | Objects | + * PAGE_SIZE=4k | 32-bit | 64-bit | + * ---------------------------------| + * 1 bytes | 4186112 | 2093056 | + * 2 bytes | 2093056 | 1046528 | + * 3 bytes | 1395030 | 697515 | + * 4 bytes | 1046528 | 523264 | + * 32 bytes | 130816 | 65408 | + * 33 bytes | 126728 | 63364 | + * 2048 bytes | 2044 | 1022 | + * 2049 bytes | 1022 | 511 | + * void * | 1046528 | 261632 | + * + * Since 64-bit pointers are twice the size, we lose half the + * capacity in the base structure. Also note that no effort is made + * to efficiently pack objects across page boundaries. + */ +struct flex_array *flex_array_alloc(int element_size, int total, gfp_t flags) +{ + struct flex_array *ret; + int max_size = nr_base_part_ptrs() * __elements_per_part(element_size); + + /* max_size will end up 0 if element_size > PAGE_SIZE */ + if (total > max_size) + return NULL; + ret = kzalloc(sizeof(struct flex_array), flags); + if (!ret) + return NULL; + ret->element_size = element_size; + ret->total_nr_elements = total; + return ret; +} + +static int fa_element_to_part_nr(struct flex_array *fa, int element_nr) +{ + return element_nr / __elements_per_part(fa->element_size); +} + +/** + * flex_array_free_parts - just free the second-level pages + * @src: address of data to copy into the array + * @element_nr: index of the position in which to insert + * the new element. + * + * This is to be used in cases where the base 'struct flex_array' + * has been statically allocated and should not be free. + */ +void flex_array_free_parts(struct flex_array *fa) +{ + int part_nr; + int max_part = nr_base_part_ptrs(); + + if (elements_fit_in_base(fa)) + return; + for (part_nr = 0; part_nr < max_part; part_nr++) + kfree(fa->parts[part_nr]); +} + +void flex_array_free(struct flex_array *fa) +{ + flex_array_free_parts(fa); + kfree(fa); +} + +static int fa_index_inside_part(struct flex_array *fa, int element_nr) +{ + return element_nr % __elements_per_part(fa->element_size); +} + +static int index_inside_part(struct flex_array *fa, int element_nr) +{ + int part_offset = fa_index_inside_part(fa, element_nr); + return part_offset * fa->element_size; +} + +static struct flex_array_part * +__fa_get_part(struct flex_array *fa, int part_nr, gfp_t flags) +{ + struct flex_array_part *part = fa->parts[part_nr]; + if (!part) { + /* + * This leaves the part pages uninitialized + * and with potentially random data, just + * as if the user had kmalloc()'d the whole. + * __GFP_ZERO can be used to zero it. + */ + part = kmalloc(FLEX_ARRAY_PART_SIZE, flags); + if (!part) + return NULL; + fa->parts[part_nr] = part; + } + return part; +} + +/** + * flex_array_put - copy data into the array at @element_nr + * @src: address of data to copy into the array + * @element_nr: index of the position in which to insert + * the new element. + * + * Note that this *copies* the contents of @src into + * the array. If you are trying to store an array of + * pointers, make sure to pass in &ptr instead of ptr. + * + * Locking must be provided by the caller. + */ +int flex_array_put(struct flex_array *fa, int element_nr, void *src, gfp_t flags) +{ + int part_nr = fa_element_to_part_nr(fa, element_nr); + struct flex_array_part *part; + void *dst; + + if (element_nr >= fa->total_nr_elements) + return -ENOSPC; + if (elements_fit_in_base(fa)) + part = (struct flex_array_part *)&fa->parts[0]; + else + part = __fa_get_part(fa, part_nr, flags); + if (!part) + return -ENOMEM; + dst = &part->elements[index_inside_part(fa, element_nr)]; + memcpy(dst, src, fa->element_size); + return 0; +} + +/** + * flex_array_prealloc - guarantee that array space exists + * @start: index of first array element for which space is allocated + * @end: index of last (inclusive) element for which space is allocated + * + * This will guarantee that no future calls to flex_array_put() + * will allocate memory. It can be used if you are expecting to + * be holding a lock or in some atomic context while writing + * data into the array. + * + * Locking must be provided by the caller. + */ +int flex_array_prealloc(struct flex_array *fa, int start, int end, gfp_t flags) +{ + int start_part; + int end_part; + int part_nr; + struct flex_array_part *part; + + if (start >= fa->total_nr_elements || end >= fa->total_nr_elements) + return -ENOSPC; + if (elements_fit_in_base(fa)) + return 0; + start_part = fa_element_to_part_nr(fa, start); + end_part = fa_element_to_part_nr(fa, end); + for (part_nr = start_part; part_nr <= end_part; part_nr++) { + part = __fa_get_part(fa, part_nr, flags); + if (!part) + return -ENOMEM; + } + return 0; +} + +/** + * flex_array_get - pull data back out of the array + * @element_nr: index of the element to fetch from the array + * + * Returns a pointer to the data at index @element_nr. Note + * that this is a copy of the data that was passed in. If you + * are using this to store pointers, you'll get back &ptr. + * + * Locking must be provided by the caller. + */ +void *flex_array_get(struct flex_array *fa, int element_nr) +{ + int part_nr = fa_element_to_part_nr(fa, element_nr); + struct flex_array_part *part; + int index; + + if (element_nr >= fa->total_nr_elements) + return NULL; + if (!fa->parts[part_nr]) + return NULL; + if (elements_fit_in_base(fa)) + part = (struct flex_array_part *)&fa->parts[0]; + else + part = fa->parts[part_nr]; + index = index_inside_part(fa, element_nr); + return &part->elements[index_inside_part(fa, element_nr)]; +} -- cgit v1.2.3-59-g8ed1b From 812ed032cdc8138b7546eecc996879756b92d801 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 29 Jul 2009 15:04:19 -0700 Subject: uio: mark uio.h functions __KERNEL__ only To avoid userspace build failures such as: .../linux/uio.h:37: error: expected `=', `,', `;', `asm' or `__attribute__' before `iov_length' .../linux/uio.h:47: error: expected declaration specifiers or `...' before `size_t' move uio functions inside a __KERNEL__ block. Signed-off-by: Jiri Slaby Acked-by: Sam Ravnborg Cc: Alexander Viro Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/uio.h | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/uio.h b/include/linux/uio.h index b7fe13883bdb..98c114323a8b 100644 --- a/include/linux/uio.h +++ b/include/linux/uio.h @@ -19,15 +19,6 @@ struct iovec __kernel_size_t iov_len; /* Must be size_t (1003.1g) */ }; -#ifdef __KERNEL__ - -struct kvec { - void *iov_base; /* and that should *never* hold a userland pointer */ - size_t iov_len; -}; - -#endif - /* * UIO_MAXIOV shall be at least 16 1003.1g (5.4.1.1) */ @@ -35,6 +26,13 @@ struct kvec { #define UIO_FASTIOV 8 #define UIO_MAXIOV 1024 +#ifdef __KERNEL__ + +struct kvec { + void *iov_base; /* and that should *never* hold a userland pointer */ + size_t iov_len; +}; + /* * Total number of bytes covered by an iovec. * @@ -53,5 +51,6 @@ static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs) } unsigned long iov_shorten(struct iovec *iov, unsigned long nr_segs, size_t to); +#endif #endif -- cgit v1.2.3-59-g8ed1b From 2e04ef76916d1e29a077ea9d0f2003c8fd86724d Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 30 Jul 2009 16:03:45 -0600 Subject: lguest: fix comment style I don't really notice it (except to begrudge the extra vertical space), but Ingo does. And he pointed out that one excuse of lguest is as a teaching tool, it should set a good example. Signed-off-by: Rusty Russell Cc: Ingo Molnar --- Documentation/lguest/lguest.c | 540 ++++++++++++++++++++++------------ arch/x86/include/asm/lguest.h | 3 +- arch/x86/include/asm/lguest_hcall.h | 10 +- arch/x86/lguest/boot.c | 428 +++++++++++++++++---------- arch/x86/lguest/i386_head.S | 110 ++++--- drivers/lguest/core.c | 114 ++++--- drivers/lguest/hypercalls.c | 141 ++++++--- drivers/lguest/interrupts_and_traps.c | 288 ++++++++++++------ drivers/lguest/lg.h | 23 +- drivers/lguest/lguest_device.c | 150 ++++++---- drivers/lguest/lguest_user.c | 137 ++++++--- drivers/lguest/page_tables.c | 427 ++++++++++++++++++--------- drivers/lguest/segments.c | 106 ++++--- drivers/lguest/x86/core.c | 372 +++++++++++++++-------- drivers/lguest/x86/switcher_32.S | 18 +- include/linux/lguest.h | 36 ++- include/linux/lguest_launcher.h | 18 +- 17 files changed, 1906 insertions(+), 1015 deletions(-) (limited to 'include') diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 45d7d6dcae7a..aa66a52b73e9 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -1,7 +1,9 @@ -/*P:100 This is the Launcher code, a simple program which lays out the - * "physical" memory for the new Guest by mapping the kernel image and - * the virtual devices, then opens /dev/lguest to tell the kernel - * about the Guest and control it. :*/ +/*P:100 + * This is the Launcher code, a simple program which lays out the "physical" + * memory for the new Guest by mapping the kernel image and the virtual + * devices, then opens /dev/lguest to tell the kernel about the Guest and + * control it. +:*/ #define _LARGEFILE64_SOURCE #define _GNU_SOURCE #include @@ -46,13 +48,15 @@ #include "linux/virtio_rng.h" #include "linux/virtio_ring.h" #include "asm/bootparam.h" -/*L:110 We can ignore the 39 include files we need for this program, but I do - * want to draw attention to the use of kernel-style types. +/*L:110 + * We can ignore the 39 include files we need for this program, but I do want + * to draw attention to the use of kernel-style types. * * As Linus said, "C is a Spartan language, and so should your naming be." I * like these abbreviations, so we define them here. Note that u64 is always * unsigned long long, which works on all Linux systems: this means that we can - * use %llu in printf for any u64. */ + * use %llu in printf for any u64. + */ typedef unsigned long long u64; typedef uint32_t u32; typedef uint16_t u16; @@ -69,8 +73,10 @@ typedef uint8_t u8; /* This will occupy 3 pages: it must be a power of 2. */ #define VIRTQUEUE_NUM 256 -/*L:120 verbose is both a global flag and a macro. The C preprocessor allows - * this, and although I wouldn't recommend it, it works quite nicely here. */ +/*L:120 + * verbose is both a global flag and a macro. The C preprocessor allows + * this, and although I wouldn't recommend it, it works quite nicely here. + */ static bool verbose; #define verbose(args...) \ do { if (verbose) printf(args); } while(0) @@ -100,8 +106,7 @@ struct device_list /* A single linked list of devices. */ struct device *dev; - /* And a pointer to the last device for easy append and also for - * configuration appending. */ + /* And a pointer to the last device for easy append. */ struct device *lastdev; }; @@ -168,20 +173,24 @@ static char **main_args; /* The original tty settings to restore on exit. */ static struct termios orig_term; -/* We have to be careful with barriers: our devices are all run in separate +/* + * We have to be careful with barriers: our devices are all run in separate * threads and so we need to make sure that changes visible to the Guest happen - * in precise order. */ + * in precise order. + */ #define wmb() __asm__ __volatile__("" : : : "memory") #define mb() __asm__ __volatile__("" : : : "memory") -/* Convert an iovec element to the given type. +/* + * Convert an iovec element to the given type. * * This is a fairly ugly trick: we need to know the size of the type and * alignment requirement to check the pointer is kosher. It's also nice to * have the name of the type in case we report failure. * * Typing those three things all the time is cumbersome and error prone, so we - * have a macro which sets them all up and passes to the real function. */ + * have a macro which sets them all up and passes to the real function. + */ #define convert(iov, type) \ ((type *)_convert((iov), sizeof(type), __alignof__(type), #type)) @@ -198,8 +207,10 @@ static void *_convert(struct iovec *iov, size_t size, size_t align, /* Wrapper for the last available index. Makes it easier to change. */ #define lg_last_avail(vq) ((vq)->last_avail_idx) -/* The virtio configuration space is defined to be little-endian. x86 is - * little-endian too, but it's nice to be explicit so we have these helpers. */ +/* + * The virtio configuration space is defined to be little-endian. x86 is + * little-endian too, but it's nice to be explicit so we have these helpers. + */ #define cpu_to_le16(v16) (v16) #define cpu_to_le32(v32) (v32) #define cpu_to_le64(v64) (v64) @@ -241,11 +252,12 @@ static u8 *get_feature_bits(struct device *dev) + dev->num_vq * sizeof(struct lguest_vqconfig); } -/*L:100 The Launcher code itself takes us out into userspace, that scary place - * where pointers run wild and free! Unfortunately, like most userspace - * programs, it's quite boring (which is why everyone likes to hack on the - * kernel!). Perhaps if you make up an Lguest Drinking Game at this point, it - * will get you through this section. Or, maybe not. +/*L:100 + * The Launcher code itself takes us out into userspace, that scary place where + * pointers run wild and free! Unfortunately, like most userspace programs, + * it's quite boring (which is why everyone likes to hack on the kernel!). + * Perhaps if you make up an Lguest Drinking Game at this point, it will get + * you through this section. Or, maybe not. * * The Launcher sets up a big chunk of memory to be the Guest's "physical" * memory and stores it in "guest_base". In other words, Guest physical == @@ -253,7 +265,8 @@ static u8 *get_feature_bits(struct device *dev) * * This can be tough to get your head around, but usually it just means that we * use these trivial conversion functions when the Guest gives us it's - * "physical" addresses: */ + * "physical" addresses: + */ static void *from_guest_phys(unsigned long addr) { return guest_base + addr; @@ -268,7 +281,8 @@ static unsigned long to_guest_phys(const void *addr) * Loading the Kernel. * * We start with couple of simple helper routines. open_or_die() avoids - * error-checking code cluttering the callers: */ + * error-checking code cluttering the callers: + */ static int open_or_die(const char *name, int flags) { int fd = open(name, flags); @@ -283,8 +297,10 @@ static void *map_zeroed_pages(unsigned int num) int fd = open_or_die("/dev/zero", O_RDONLY); void *addr; - /* We use a private mapping (ie. if we write to the page, it will be - * copied). */ + /* + * We use a private mapping (ie. if we write to the page, it will be + * copied). + */ addr = mmap(NULL, getpagesize() * num, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0); if (addr == MAP_FAILED) @@ -305,20 +321,24 @@ static void *get_pages(unsigned int num) return addr; } -/* This routine is used to load the kernel or initrd. It tries mmap, but if +/* + * This routine is used to load the kernel or initrd. It tries mmap, but if * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries), - * it falls back to reading the memory in. */ + * it falls back to reading the memory in. + */ static void map_at(int fd, void *addr, unsigned long offset, unsigned long len) { ssize_t r; - /* We map writable even though for some segments are marked read-only. + /* + * We map writable even though for some segments are marked read-only. * The kernel really wants to be writable: it patches its own * instructions. * * MAP_PRIVATE means that the page won't be copied until a write is * done to it. This allows us to share untouched memory between - * Guests. */ + * Guests. + */ if (mmap(addr, len, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED) return; @@ -329,7 +349,8 @@ static void map_at(int fd, void *addr, unsigned long offset, unsigned long len) err(1, "Reading offset %lu len %lu gave %zi", offset, len, r); } -/* This routine takes an open vmlinux image, which is in ELF, and maps it into +/* + * This routine takes an open vmlinux image, which is in ELF, and maps it into * the Guest memory. ELF = Embedded Linking Format, which is the format used * by all modern binaries on Linux including the kernel. * @@ -337,23 +358,28 @@ static void map_at(int fd, void *addr, unsigned long offset, unsigned long len) * address. We use the physical address; the Guest will map itself to the * virtual address. * - * We return the starting address. */ + * We return the starting address. + */ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr) { Elf32_Phdr phdr[ehdr->e_phnum]; unsigned int i; - /* Sanity checks on the main ELF header: an x86 executable with a - * reasonable number of correctly-sized program headers. */ + /* + * Sanity checks on the main ELF header: an x86 executable with a + * reasonable number of correctly-sized program headers. + */ if (ehdr->e_type != ET_EXEC || ehdr->e_machine != EM_386 || ehdr->e_phentsize != sizeof(Elf32_Phdr) || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr)) errx(1, "Malformed elf header"); - /* An ELF executable contains an ELF header and a number of "program" + /* + * An ELF executable contains an ELF header and a number of "program" * headers which indicate which parts ("segments") of the program to - * load where. */ + * load where. + */ /* We read in all the program headers at once: */ if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0) @@ -361,8 +387,10 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr) if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr)) err(1, "Reading program headers"); - /* Try all the headers: there are usually only three. A read-only one, - * a read-write one, and a "note" section which we don't load. */ + /* + * Try all the headers: there are usually only three. A read-only one, + * a read-write one, and a "note" section which we don't load. + */ for (i = 0; i < ehdr->e_phnum; i++) { /* If this isn't a loadable segment, we ignore it */ if (phdr[i].p_type != PT_LOAD) @@ -380,13 +408,15 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr) return ehdr->e_entry; } -/*L:150 A bzImage, unlike an ELF file, is not meant to be loaded. You're - * supposed to jump into it and it will unpack itself. We used to have to - * perform some hairy magic because the unpacking code scared me. +/*L:150 + * A bzImage, unlike an ELF file, is not meant to be loaded. You're supposed + * to jump into it and it will unpack itself. We used to have to perform some + * hairy magic because the unpacking code scared me. * * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote * a small patch to jump over the tricky bits in the Guest, so now we just read - * the funky header so we know where in the file to load, and away we go! */ + * the funky header so we know where in the file to load, and away we go! + */ static unsigned long load_bzimage(int fd) { struct boot_params boot; @@ -394,8 +424,10 @@ static unsigned long load_bzimage(int fd) /* Modern bzImages get loaded at 1M. */ void *p = from_guest_phys(0x100000); - /* Go back to the start of the file and read the header. It should be - * a Linux boot header (see Documentation/x86/i386/boot.txt) */ + /* + * Go back to the start of the file and read the header. It should be + * a Linux boot header (see Documentation/x86/i386/boot.txt) + */ lseek(fd, 0, SEEK_SET); read(fd, &boot, sizeof(boot)); @@ -414,9 +446,11 @@ static unsigned long load_bzimage(int fd) return boot.hdr.code32_start; } -/*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels +/*L:140 + * Loading the kernel is easy when it's a "vmlinux", but most kernels * come wrapped up in the self-decompressing "bzImage" format. With a little - * work, we can load those, too. */ + * work, we can load those, too. + */ static unsigned long load_kernel(int fd) { Elf32_Ehdr hdr; @@ -433,24 +467,28 @@ static unsigned long load_kernel(int fd) return load_bzimage(fd); } -/* This is a trivial little helper to align pages. Andi Kleen hated it because +/* + * This is a trivial little helper to align pages. Andi Kleen hated it because * it calls getpagesize() twice: "it's dumb code." * * Kernel guys get really het up about optimization, even when it's not - * necessary. I leave this code as a reaction against that. */ + * necessary. I leave this code as a reaction against that. + */ static inline unsigned long page_align(unsigned long addr) { /* Add upwards and truncate downwards. */ return ((addr + getpagesize()-1) & ~(getpagesize()-1)); } -/*L:180 An "initial ram disk" is a disk image loaded into memory along with - * the kernel which the kernel can use to boot from without needing any - * drivers. Most distributions now use this as standard: the initrd contains - * the code to load the appropriate driver modules for the current machine. +/*L:180 + * An "initial ram disk" is a disk image loaded into memory along with the + * kernel which the kernel can use to boot from without needing any drivers. + * Most distributions now use this as standard: the initrd contains the code to + * load the appropriate driver modules for the current machine. * * Importantly, James Morris works for RedHat, and Fedora uses initrds for its - * kernels. He sent me this (and tells me when I break it). */ + * kernels. He sent me this (and tells me when I break it). + */ static unsigned long load_initrd(const char *name, unsigned long mem) { int ifd; @@ -462,12 +500,16 @@ static unsigned long load_initrd(const char *name, unsigned long mem) if (fstat(ifd, &st) < 0) err(1, "fstat() on initrd '%s'", name); - /* We map the initrd at the top of memory, but mmap wants it to be - * page-aligned, so we round the size up for that. */ + /* + * We map the initrd at the top of memory, but mmap wants it to be + * page-aligned, so we round the size up for that. + */ len = page_align(st.st_size); map_at(ifd, from_guest_phys(mem - len), 0, st.st_size); - /* Once a file is mapped, you can close the file descriptor. It's a - * little odd, but quite useful. */ + /* + * Once a file is mapped, you can close the file descriptor. It's a + * little odd, but quite useful. + */ close(ifd); verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len); @@ -476,8 +518,10 @@ static unsigned long load_initrd(const char *name, unsigned long mem) } /*:*/ -/* Simple routine to roll all the commandline arguments together with spaces - * between them. */ +/* + * Simple routine to roll all the commandline arguments together with spaces + * between them. + */ static void concat(char *dst, char *args[]) { unsigned int i, len = 0; @@ -494,10 +538,12 @@ static void concat(char *dst, char *args[]) dst[len] = '\0'; } -/*L:185 This is where we actually tell the kernel to initialize the Guest. We +/*L:185 + * This is where we actually tell the kernel to initialize the Guest. We * saw the arguments it expects when we looked at initialize() in lguest_user.c: * the base of Guest "physical" memory, the top physical page to allow and the - * entry point for the Guest. */ + * entry point for the Guest. + */ static void tell_kernel(unsigned long start) { unsigned long args[] = { LHREQ_INITIALIZE, @@ -522,20 +568,26 @@ static void tell_kernel(unsigned long start) static void *_check_pointer(unsigned long addr, unsigned int size, unsigned int line) { - /* We have to separately check addr and addr+size, because size could - * be huge and addr + size might wrap around. */ + /* + * We have to separately check addr and addr+size, because size could + * be huge and addr + size might wrap around. + */ if (addr >= guest_limit || addr + size >= guest_limit) errx(1, "%s:%i: Invalid address %#lx", __FILE__, line, addr); - /* We return a pointer for the caller's convenience, now we know it's - * safe to use. */ + /* + * We return a pointer for the caller's convenience, now we know it's + * safe to use. + */ return from_guest_phys(addr); } /* A macro which transparently hands the line number to the real function. */ #define check_pointer(addr,size) _check_pointer(addr, size, __LINE__) -/* Each buffer in the virtqueues is actually a chain of descriptors. This +/* + * Each buffer in the virtqueues is actually a chain of descriptors. This * function returns the next descriptor in the chain, or vq->vring.num if we're - * at the end. */ + * at the end. + */ static unsigned next_desc(struct vring_desc *desc, unsigned int i, unsigned int max) { @@ -576,12 +628,14 @@ static void trigger_irq(struct virtqueue *vq) err(1, "Triggering irq %i", vq->config.irq); } -/* This looks in the virtqueue and for the first available buffer, and converts +/* + * This looks in the virtqueue and for the first available buffer, and converts * it to an iovec for convenient access. Since descriptors consist of some * number of output then some number of input descriptors, it's actually two * iovecs, but we pack them into one and note how many of each there were. * - * This function returns the descriptor number found. */ + * This function returns the descriptor number found. + */ static unsigned wait_for_vq_desc(struct virtqueue *vq, struct iovec iov[], unsigned int *out_num, unsigned int *in_num) @@ -599,8 +653,10 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq, /* OK, now we need to know about added descriptors. */ vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; - /* They could have slipped one in as we were doing that: make - * sure it's written, then check again. */ + /* + * They could have slipped one in as we were doing that: make + * sure it's written, then check again. + */ mb(); if (last_avail != vq->vring.avail->idx) { vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; @@ -620,8 +676,10 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq, errx(1, "Guest moved used index from %u to %u", last_avail, vq->vring.avail->idx); - /* Grab the next descriptor number they're advertising, and increment - * the index we've seen. */ + /* + * Grab the next descriptor number they're advertising, and increment + * the index we've seen. + */ head = vq->vring.avail->ring[last_avail % vq->vring.num]; lg_last_avail(vq)++; @@ -636,8 +694,10 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq, desc = vq->vring.desc; i = head; - /* If this is an indirect entry, then this buffer contains a descriptor - * table which we handle as if it's any normal descriptor chain. */ + /* + * If this is an indirect entry, then this buffer contains a descriptor + * table which we handle as if it's any normal descriptor chain. + */ if (desc[i].flags & VRING_DESC_F_INDIRECT) { if (desc[i].len % sizeof(struct vring_desc)) errx(1, "Invalid size for indirect buffer table"); @@ -656,8 +716,10 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq, if (desc[i].flags & VRING_DESC_F_WRITE) (*in_num)++; else { - /* If it's an output descriptor, they're all supposed - * to come before any input descriptors. */ + /* + * If it's an output descriptor, they're all supposed + * to come before any input descriptors. + */ if (*in_num) errx(1, "Descriptor has out after in"); (*out_num)++; @@ -671,14 +733,18 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq, return head; } -/* After we've used one of their buffers, we tell them about it. We'll then - * want to send them an interrupt, using trigger_irq(). */ +/* + * After we've used one of their buffers, we tell them about it. We'll then + * want to send them an interrupt, using trigger_irq(). + */ static void add_used(struct virtqueue *vq, unsigned int head, int len) { struct vring_used_elem *used; - /* The virtqueue contains a ring of used buffers. Get a pointer to the - * next entry in that used ring. */ + /* + * The virtqueue contains a ring of used buffers. Get a pointer to the + * next entry in that used ring. + */ used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num]; used->id = head; used->len = len; @@ -698,7 +764,8 @@ static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len) /* * The Console * - * We associate some data with the console for our exit hack. */ + * We associate some data with the console for our exit hack. + */ struct console_abort { /* How many times have they hit ^C? */ @@ -725,20 +792,24 @@ static void console_input(struct virtqueue *vq) if (len <= 0) { /* Ran out of input? */ warnx("Failed to get console input, ignoring console."); - /* For simplicity, dying threads kill the whole Launcher. So - * just nap here. */ + /* + * For simplicity, dying threads kill the whole Launcher. So + * just nap here. + */ for (;;) pause(); } add_used_and_trigger(vq, head, len); - /* Three ^C within one second? Exit. + /* + * Three ^C within one second? Exit. * * This is such a hack, but works surprisingly well. Each ^C has to * be in a buffer by itself, so they can't be too fast. But we check * that we get three within about a second, so they can't be too - * slow. */ + * slow. + */ if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) { abort->count = 0; return; @@ -809,8 +880,7 @@ static bool will_block(int fd) return select(fd+1, &fdset, NULL, NULL, &zero) != 1; } -/* This is where we handle packets coming in from the tun device to our - * Guest. */ +/* This handles packets coming in from the tun device to our Guest. */ static void net_input(struct virtqueue *vq) { int len; @@ -842,8 +912,10 @@ static int do_thread(void *_vq) return 0; } -/* When a child dies, we kill our entire process group with SIGTERM. This - * also has the side effect that the shell restores the console for us! */ +/* + * When a child dies, we kill our entire process group with SIGTERM. This + * also has the side effect that the shell restores the console for us! + */ static void kill_launcher(int signal) { kill(0, SIGTERM); @@ -880,9 +952,10 @@ static void reset_device(struct device *dev) static void create_thread(struct virtqueue *vq) { - /* Create stack for thread and run it. Since stack grows - * upwards, we point the stack pointer to the end of this - * region. */ + /* + * Create stack for thread and run it. Since the stack grows upwards, + * we point the stack pointer to the end of this region. + */ char *stack = malloc(32768); unsigned long args[] = { LHREQ_EVENTFD, vq->config.pfn*getpagesize(), 0 }; @@ -981,8 +1054,11 @@ static void handle_output(unsigned long addr) } } - /* Early console write is done using notify on a nul-terminated string - * in Guest memory. */ + /* + * Early console write is done using notify on a nul-terminated string + * in Guest memory. It's also great for hacking debugging messages + * into a Guest. + */ if (addr >= guest_limit) errx(1, "Bad NOTIFY %#lx", addr); @@ -998,10 +1074,12 @@ static void handle_output(unsigned long addr) * routines to allocate and manage them. */ -/* The layout of the device page is a "struct lguest_device_desc" followed by a +/* + * The layout of the device page is a "struct lguest_device_desc" followed by a * number of virtqueue descriptors, then two sets of feature bits, then an * array of configuration bytes. This routine returns the configuration - * pointer. */ + * pointer. + */ static u8 *device_config(const struct device *dev) { return (void *)(dev->desc + 1) @@ -1009,9 +1087,11 @@ static u8 *device_config(const struct device *dev) + dev->feature_len * 2; } -/* This routine allocates a new "struct lguest_device_desc" from descriptor +/* + * This routine allocates a new "struct lguest_device_desc" from descriptor * table page just above the Guest's normal memory. It returns a pointer to - * that descriptor. */ + * that descriptor. + */ static struct lguest_device_desc *new_dev_desc(u16 type) { struct lguest_device_desc d = { .type = type }; @@ -1032,8 +1112,10 @@ static struct lguest_device_desc *new_dev_desc(u16 type) return memcpy(p, &d, sizeof(d)); } -/* Each device descriptor is followed by the description of its virtqueues. We - * specify how many descriptors the virtqueue is to have. */ +/* + * Each device descriptor is followed by the description of its virtqueues. We + * specify how many descriptors the virtqueue is to have. + */ static void add_virtqueue(struct device *dev, unsigned int num_descs, void (*service)(struct virtqueue *)) { @@ -1061,10 +1143,12 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, /* Initialize the vring. */ vring_init(&vq->vring, num_descs, p, LGUEST_VRING_ALIGN); - /* Append virtqueue to this device's descriptor. We use + /* + * Append virtqueue to this device's descriptor. We use * device_config() to get the end of the device's current virtqueues; * we check that we haven't added any config or feature information - * yet, otherwise we'd be overwriting them. */ + * yet, otherwise we'd be overwriting them. + */ assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0); memcpy(device_config(dev), &vq->config, sizeof(vq->config)); dev->num_vq++; @@ -1072,14 +1156,18 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, verbose("Virtqueue page %#lx\n", to_guest_phys(p)); - /* Add to tail of list, so dev->vq is first vq, dev->vq->next is - * second. */ + /* + * Add to tail of list, so dev->vq is first vq, dev->vq->next is + * second. + */ for (i = &dev->vq; *i; i = &(*i)->next); *i = vq; } -/* The first half of the feature bitmask is for us to advertise features. The - * second half is for the Guest to accept features. */ +/* + * The first half of the feature bitmask is for us to advertise features. The + * second half is for the Guest to accept features. + */ static void add_feature(struct device *dev, unsigned bit) { u8 *features = get_feature_bits(dev); @@ -1093,9 +1181,11 @@ static void add_feature(struct device *dev, unsigned bit) features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT)); } -/* This routine sets the configuration fields for an existing device's +/* + * This routine sets the configuration fields for an existing device's * descriptor. It only works for the last device, but that's OK because that's - * how we use it. */ + * how we use it. + */ static void set_config(struct device *dev, unsigned len, const void *conf) { /* Check we haven't overflowed our single page. */ @@ -1110,10 +1200,12 @@ static void set_config(struct device *dev, unsigned len, const void *conf) assert(dev->desc->config_len == len); } -/* This routine does all the creation and setup of a new device, including +/* + * This routine does all the creation and setup of a new device, including * calling new_dev_desc() to allocate the descriptor and device memory. * - * See what I mean about userspace being boring? */ + * See what I mean about userspace being boring? + */ static struct device *new_device(const char *name, u16 type) { struct device *dev = malloc(sizeof(*dev)); @@ -1126,10 +1218,12 @@ static struct device *new_device(const char *name, u16 type) dev->num_vq = 0; dev->running = false; - /* Append to device list. Prepending to a single-linked list is + /* + * Append to device list. Prepending to a single-linked list is * easier, but the user expects the devices to be arranged on the bus * in command-line order. The first network device on the command line - * is eth0, the first block device /dev/vda, etc. */ + * is eth0, the first block device /dev/vda, etc. + */ if (devices.lastdev) devices.lastdev->next = dev; else @@ -1139,8 +1233,10 @@ static struct device *new_device(const char *name, u16 type) return dev; } -/* Our first setup routine is the console. It's a fairly simple device, but - * UNIX tty handling makes it uglier than it could be. */ +/* + * Our first setup routine is the console. It's a fairly simple device, but + * UNIX tty handling makes it uglier than it could be. + */ static void setup_console(void) { struct device *dev; @@ -1148,8 +1244,10 @@ static void setup_console(void) /* If we can save the initial standard input settings... */ if (tcgetattr(STDIN_FILENO, &orig_term) == 0) { struct termios term = orig_term; - /* Then we turn off echo, line buffering and ^C etc. We want a - * raw input stream to the Guest. */ + /* + * Then we turn off echo, line buffering and ^C etc: We want a + * raw input stream to the Guest. + */ term.c_lflag &= ~(ISIG|ICANON|ECHO); tcsetattr(STDIN_FILENO, TCSANOW, &term); } @@ -1160,10 +1258,12 @@ static void setup_console(void) dev->priv = malloc(sizeof(struct console_abort)); ((struct console_abort *)dev->priv)->count = 0; - /* The console needs two virtqueues: the input then the output. When + /* + * The console needs two virtqueues: the input then the output. When * they put something the input queue, we make sure we're listening to * stdin. When they put something in the output queue, we write it to - * stdout. */ + * stdout. + */ add_virtqueue(dev, VIRTQUEUE_NUM, console_input); add_virtqueue(dev, VIRTQUEUE_NUM, console_output); @@ -1171,7 +1271,8 @@ static void setup_console(void) } /*:*/ -/*M:010 Inter-guest networking is an interesting area. Simplest is to have a +/*M:010 + * Inter-guest networking is an interesting area. Simplest is to have a * --sharenet= option which opens or creates a named pipe. This can be * used to send packets to another guest in a 1:1 manner. * @@ -1185,7 +1286,8 @@ static void setup_console(void) * multiple inter-guest channels behind one interface, although it would * require some manner of hotplugging new virtio channels. * - * Finally, we could implement a virtio network switch in the kernel. :*/ + * Finally, we could implement a virtio network switch in the kernel. +:*/ static u32 str2ip(const char *ipaddr) { @@ -1210,11 +1312,13 @@ static void str2mac(const char *macaddr, unsigned char mac[6]) mac[5] = m[5]; } -/* This code is "adapted" from libbridge: it attaches the Host end of the +/* + * This code is "adapted" from libbridge: it attaches the Host end of the * network device to the bridge device specified by the command line. * * This is yet another James Morris contribution (I'm an IP-level guy, so I - * dislike bridging), and I just try not to break it. */ + * dislike bridging), and I just try not to break it. + */ static void add_to_bridge(int fd, const char *if_name, const char *br_name) { int ifidx; @@ -1234,9 +1338,11 @@ static void add_to_bridge(int fd, const char *if_name, const char *br_name) err(1, "can't add %s to bridge %s", if_name, br_name); } -/* This sets up the Host end of the network device with an IP address, brings +/* + * This sets up the Host end of the network device with an IP address, brings * it up so packets will flow, the copies the MAC address into the hwaddr - * pointer. */ + * pointer. + */ static void configure_device(int fd, const char *tapif, u32 ipaddr) { struct ifreq ifr; @@ -1263,10 +1369,12 @@ static int get_tun_device(char tapif[IFNAMSIZ]) /* Start with this zeroed. Messy but sure. */ memset(&ifr, 0, sizeof(ifr)); - /* We open the /dev/net/tun device and tell it we want a tap device. A + /* + * We open the /dev/net/tun device and tell it we want a tap device. A * tap device is like a tun device, only somehow different. To tell * the truth, I completely blundered my way through this code, but it - * works now! */ + * works now! + */ netfd = open_or_die("/dev/net/tun", O_RDWR); ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR; strcpy(ifr.ifr_name, "tap%d"); @@ -1277,18 +1385,22 @@ static int get_tun_device(char tapif[IFNAMSIZ]) TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0) err(1, "Could not set features for tun device"); - /* We don't need checksums calculated for packets coming in this - * device: trust us! */ + /* + * We don't need checksums calculated for packets coming in this + * device: trust us! + */ ioctl(netfd, TUNSETNOCSUM, 1); memcpy(tapif, ifr.ifr_name, IFNAMSIZ); return netfd; } -/*L:195 Our network is a Host<->Guest network. This can either use bridging or +/*L:195 + * Our network is a Host<->Guest network. This can either use bridging or * routing, but the principle is the same: it uses the "tun" device to inject * packets into the Host as if they came in from a normal network card. We - * just shunt packets between the Guest and the tun device. */ + * just shunt packets between the Guest and the tun device. + */ static void setup_tun_net(char *arg) { struct device *dev; @@ -1305,13 +1417,14 @@ static void setup_tun_net(char *arg) dev = new_device("net", VIRTIO_ID_NET); dev->priv = net_info; - /* Network devices need a receive and a send queue, just like - * console. */ + /* Network devices need a recv and a send queue, just like console. */ add_virtqueue(dev, VIRTQUEUE_NUM, net_input); add_virtqueue(dev, VIRTQUEUE_NUM, net_output); - /* We need a socket to perform the magic network ioctls to bring up the - * tap interface, connect to the bridge etc. Any socket will do! */ + /* + * We need a socket to perform the magic network ioctls to bring up the + * tap interface, connect to the bridge etc. Any socket will do! + */ ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); if (ipfd < 0) err(1, "opening IP socket"); @@ -1366,7 +1479,8 @@ static void setup_tun_net(char *arg) devices.device_num, tapif, arg); } -/* Our block (disk) device should be really simple: the Guest asks for a block +/* + * Our block (disk) device should be really simple: the Guest asks for a block * number and we read or write that position in the file. Unfortunately, that * was amazingly slow: the Guest waits until the read is finished before * running anything else, even if it could have been doing useful work. @@ -1374,7 +1488,9 @@ static void setup_tun_net(char *arg) * We could use async I/O, except it's reputed to suck so hard that characters * actually go missing from your code when you try to use it. * - * So we farm the I/O out to thread, and communicate with it via a pipe. */ + * So this was one reason why lguest now does all virtqueue servicing in + * separate threads: it's more efficient and more like a real device. + */ /* This hangs off device->priv. */ struct vblk_info @@ -1412,9 +1528,11 @@ static void blk_request(struct virtqueue *vq) /* Get the next request. */ head = wait_for_vq_desc(vq, iov, &out_num, &in_num); - /* Every block request should contain at least one output buffer + /* + * Every block request should contain at least one output buffer * (detailing the location on disk and the type of request) and one - * input buffer (to hold the result). */ + * input buffer (to hold the result). + */ if (out_num == 0 || in_num == 0) errx(1, "Bad virtblk cmd %u out=%u in=%u", head, out_num, in_num); @@ -1423,33 +1541,41 @@ static void blk_request(struct virtqueue *vq) in = convert(&iov[out_num+in_num-1], u8); off = out->sector * 512; - /* The block device implements "barriers", where the Guest indicates + /* + * The block device implements "barriers", where the Guest indicates * that it wants all previous writes to occur before this write. We * don't have a way of asking our kernel to do a barrier, so we just - * synchronize all the data in the file. Pretty poor, no? */ + * synchronize all the data in the file. Pretty poor, no? + */ if (out->type & VIRTIO_BLK_T_BARRIER) fdatasync(vblk->fd); - /* In general the virtio block driver is allowed to try SCSI commands. - * It'd be nice if we supported eject, for example, but we don't. */ + /* + * In general the virtio block driver is allowed to try SCSI commands. + * It'd be nice if we supported eject, for example, but we don't. + */ if (out->type & VIRTIO_BLK_T_SCSI_CMD) { fprintf(stderr, "Scsi commands unsupported\n"); *in = VIRTIO_BLK_S_UNSUPP; wlen = sizeof(*in); } else if (out->type & VIRTIO_BLK_T_OUT) { - /* Write */ - - /* Move to the right location in the block file. This can fail - * if they try to write past end. */ + /* + * Write + * + * Move to the right location in the block file. This can fail + * if they try to write past end. + */ if (lseek64(vblk->fd, off, SEEK_SET) != off) err(1, "Bad seek to sector %llu", out->sector); ret = writev(vblk->fd, iov+1, out_num-1); verbose("WRITE to sector %llu: %i\n", out->sector, ret); - /* Grr... Now we know how long the descriptor they sent was, we + /* + * Grr... Now we know how long the descriptor they sent was, we * make sure they didn't try to write over the end of the block - * file (possibly extending it). */ + * file (possibly extending it). + */ if (ret > 0 && off + ret > vblk->len) { /* Trim it back to the correct length */ ftruncate64(vblk->fd, vblk->len); @@ -1459,10 +1585,12 @@ static void blk_request(struct virtqueue *vq) wlen = sizeof(*in); *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); } else { - /* Read */ - - /* Move to the right location in the block file. This can fail - * if they try to read past end. */ + /* + * Read + * + * Move to the right location in the block file. This can fail + * if they try to read past end. + */ if (lseek64(vblk->fd, off, SEEK_SET) != off) err(1, "Bad seek to sector %llu", out->sector); @@ -1477,10 +1605,12 @@ static void blk_request(struct virtqueue *vq) } } - /* OK, so we noted that it was pretty poor to use an fdatasync as a + /* + * OK, so we noted that it was pretty poor to use an fdatasync as a * barrier. But Christoph Hellwig points out that we need a sync * *afterwards* as well: "Barriers specify no reordering to the front - * or the back." And Jens Axboe confirmed it, so here we are: */ + * or the back." And Jens Axboe confirmed it, so here we are: + */ if (out->type & VIRTIO_BLK_T_BARRIER) fdatasync(vblk->fd); @@ -1494,7 +1624,7 @@ static void setup_block_file(const char *filename) struct vblk_info *vblk; struct virtio_blk_config conf; - /* The device responds to return from I/O thread. */ + /* Creat the device. */ dev = new_device("block", VIRTIO_ID_BLOCK); /* The device has one virtqueue, where the Guest places requests. */ @@ -1513,8 +1643,10 @@ static void setup_block_file(const char *filename) /* Tell Guest how many sectors this device has. */ conf.capacity = cpu_to_le64(vblk->len / 512); - /* Tell Guest not to put in too many descriptors at once: two are used - * for the in and out elements. */ + /* + * Tell Guest not to put in too many descriptors at once: two are used + * for the in and out elements. + */ add_feature(dev, VIRTIO_BLK_F_SEG_MAX); conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2); @@ -1525,16 +1657,18 @@ static void setup_block_file(const char *filename) ++devices.device_num, le64_to_cpu(conf.capacity)); } -struct rng_info { - int rfd; -}; - -/* Our random number generator device reads from /dev/random into the Guest's +/*L:211 + * Our random number generator device reads from /dev/random into the Guest's * input buffers. The usual case is that the Guest doesn't want random numbers * and so has no buffers although /dev/random is still readable, whereas * console is the reverse. * - * The same logic applies, however. */ + * The same logic applies, however. + */ +struct rng_info { + int rfd; +}; + static void rng_input(struct virtqueue *vq) { int len; @@ -1547,9 +1681,11 @@ static void rng_input(struct virtqueue *vq) if (out_num) errx(1, "Output buffers in rng?"); - /* This is why we convert to iovecs: the readv() call uses them, and so + /* + * This is why we convert to iovecs: the readv() call uses them, and so * it reads straight into the Guest's buffer. We loop to make sure we - * fill it. */ + * fill it. + */ while (!iov_empty(iov, in_num)) { len = readv(rng_info->rfd, iov, in_num); if (len <= 0) @@ -1562,15 +1698,18 @@ static void rng_input(struct virtqueue *vq) add_used(vq, head, totlen); } -/* And this creates a "hardware" random number device for the Guest. */ +/*L:199 + * This creates a "hardware" random number device for the Guest. + */ static void setup_rng(void) { struct device *dev; struct rng_info *rng_info = malloc(sizeof(*rng_info)); + /* Our device's privat info simply contains the /dev/random fd. */ rng_info->rfd = open_or_die("/dev/random", O_RDONLY); - /* The device responds to return from I/O thread. */ + /* Create the new device. */ dev = new_device("rng", VIRTIO_ID_RNG); dev->priv = rng_info; @@ -1586,8 +1725,10 @@ static void __attribute__((noreturn)) restart_guest(void) { unsigned int i; - /* Since we don't track all open fds, we simply close everything beyond - * stderr. */ + /* + * Since we don't track all open fds, we simply close everything beyond + * stderr. + */ for (i = 3; i < FD_SETSIZE; i++) close(i); @@ -1598,8 +1739,10 @@ static void __attribute__((noreturn)) restart_guest(void) err(1, "Could not exec %s", main_args[0]); } -/*L:220 Finally we reach the core of the Launcher which runs the Guest, serves - * its input and output, and finally, lays it to rest. */ +/*L:220 + * Finally we reach the core of the Launcher which runs the Guest, serves + * its input and output, and finally, lays it to rest. + */ static void __attribute__((noreturn)) run_guest(void) { for (;;) { @@ -1634,7 +1777,7 @@ static void __attribute__((noreturn)) run_guest(void) * * Are you ready? Take a deep breath and join me in the core of the Host, in * "make Host". - :*/ +:*/ static struct option opts[] = { { "verbose", 0, NULL, 'v' }, @@ -1655,8 +1798,7 @@ static void usage(void) /*L:105 The main routine is where the real work begins: */ int main(int argc, char *argv[]) { - /* Memory, top-level pagetable, code startpoint and size of the - * (optional) initrd. */ + /* Memory, code startpoint and size of the (optional) initrd. */ unsigned long mem = 0, start, initrd_size = 0; /* Two temporaries. */ int i, c; @@ -1668,24 +1810,30 @@ int main(int argc, char *argv[]) /* Save the args: we "reboot" by execing ourselves again. */ main_args = argv; - /* First we initialize the device list. We keep a pointer to the last + /* + * First we initialize the device list. We keep a pointer to the last * device, and the next interrupt number to use for devices (1: - * remember that 0 is used by the timer). */ + * remember that 0 is used by the timer). + */ devices.lastdev = NULL; devices.next_irq = 1; cpu_id = 0; - /* We need to know how much memory so we can set up the device + /* + * We need to know how much memory so we can set up the device * descriptor and memory pages for the devices as we parse the command * line. So we quickly look through the arguments to find the amount - * of memory now. */ + * of memory now. + */ for (i = 1; i < argc; i++) { if (argv[i][0] != '-') { mem = atoi(argv[i]) * 1024 * 1024; - /* We start by mapping anonymous pages over all of + /* + * We start by mapping anonymous pages over all of * guest-physical memory range. This fills it with 0, * and ensures that the Guest won't be killed when it - * tries to access it. */ + * tries to access it. + */ guest_base = map_zeroed_pages(mem / getpagesize() + DEVICE_PAGES); guest_limit = mem; @@ -1718,8 +1866,10 @@ int main(int argc, char *argv[]) usage(); } } - /* After the other arguments we expect memory and kernel image name, - * followed by command line arguments for the kernel. */ + /* + * After the other arguments we expect memory and kernel image name, + * followed by command line arguments for the kernel. + */ if (optind + 2 > argc) usage(); @@ -1737,20 +1887,26 @@ int main(int argc, char *argv[]) /* Map the initrd image if requested (at top of physical memory) */ if (initrd_name) { initrd_size = load_initrd(initrd_name, mem); - /* These are the location in the Linux boot header where the - * start and size of the initrd are expected to be found. */ + /* + * These are the location in the Linux boot header where the + * start and size of the initrd are expected to be found. + */ boot->hdr.ramdisk_image = mem - initrd_size; boot->hdr.ramdisk_size = initrd_size; /* The bootloader type 0xFF means "unknown"; that's OK. */ boot->hdr.type_of_loader = 0xFF; } - /* The Linux boot header contains an "E820" memory map: ours is a - * simple, single region. */ + /* + * The Linux boot header contains an "E820" memory map: ours is a + * simple, single region. + */ boot->e820_entries = 1; boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM }); - /* The boot header contains a command line pointer: we put the command - * line after the boot header. */ + /* + * The boot header contains a command line pointer: we put the command + * line after the boot header. + */ boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1); /* We use a simple helper to copy the arguments separated by spaces. */ concat((char *)(boot + 1), argv+optind+2); @@ -1764,8 +1920,10 @@ int main(int argc, char *argv[]) /* Tell the entry path not to try to reload segment registers. */ boot->hdr.loadflags |= KEEP_SEGMENTS; - /* We tell the kernel to initialize the Guest: this returns the open - * /dev/lguest file descriptor. */ + /* + * We tell the kernel to initialize the Guest: this returns the open + * /dev/lguest file descriptor. + */ tell_kernel(start); /* Ensure that we terminate if a child dies. */ diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h index 313389cd50d2..5136dad57cbb 100644 --- a/arch/x86/include/asm/lguest.h +++ b/arch/x86/include/asm/lguest.h @@ -17,8 +17,7 @@ /* Pages for switcher itself, then two pages per cpu */ #define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids) -/* We map at -4M (-2M when PAE is activated) for ease of mapping - * into the guest (one PTE page). */ +/* We map at -4M (-2M for PAE) for ease of mapping (one PTE page). */ #ifdef CONFIG_X86_PAE #define SWITCHER_ADDR 0xFFE00000 #else diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index 33600a66755f..cceb73e12e50 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h @@ -30,7 +30,8 @@ #include #include -/*G:030 But first, how does our Guest contact the Host to ask for privileged +/*G:030 + * But first, how does our Guest contact the Host to ask for privileged * operations? There are two ways: the direct way is to make a "hypercall", * to make requests of the Host Itself. * @@ -41,16 +42,15 @@ * * Grossly invalid calls result in Sudden Death at the hands of the vengeful * Host, rather than returning failure. This reflects Winston Churchill's - * definition of a gentleman: "someone who is only rude intentionally". */ -/*:*/ + * definition of a gentleman: "someone who is only rude intentionally". +:*/ /* Can't use our min() macro here: needs to be a constant */ #define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) #define LHCALL_RING_SIZE 64 struct hcall_args { - /* These map directly onto eax, ebx, ecx, edx and esi - * in struct lguest_regs */ + /* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */ unsigned long arg0, arg1, arg2, arg3, arg4; }; diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index f2bf1f73d468..025c04d18f2b 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -22,7 +22,8 @@ * * So how does the kernel know it's a Guest? We'll see that later, but let's * just say that we end up here where we replace the native functions various - * "paravirt" structures with our Guest versions, then boot like normal. :*/ + * "paravirt" structures with our Guest versions, then boot like normal. +:*/ /* * Copyright (C) 2006, Rusty Russell IBM Corporation. @@ -74,7 +75,8 @@ * * The Guest in our tale is a simple creature: identical to the Host but * behaving in simplified but equivalent ways. In particular, the Guest is the - * same kernel as the Host (or at least, built from the same source code). :*/ + * same kernel as the Host (or at least, built from the same source code). +:*/ struct lguest_data lguest_data = { .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, @@ -85,7 +87,8 @@ struct lguest_data lguest_data = { .syscall_vec = SYSCALL_VECTOR, }; -/*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a +/*G:037 + * async_hcall() is pretty simple: I'm quite proud of it really. We have a * ring buffer of stored hypercalls which the Host will run though next time we * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall * arguments, and a "hcall_status" word which is 0 if the call is ready to go, @@ -94,7 +97,8 @@ struct lguest_data lguest_data = { * If we come around to a slot which hasn't been finished, then the table is * full and we just make the hypercall directly. This has the nice side * effect of causing the Host to run all the stored calls in the ring buffer - * which empties it for next time! */ + * which empties it for next time! + */ static void async_hcall(unsigned long call, unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4) @@ -103,9 +107,11 @@ static void async_hcall(unsigned long call, unsigned long arg1, static unsigned int next_call; unsigned long flags; - /* Disable interrupts if not already disabled: we don't want an + /* + * Disable interrupts if not already disabled: we don't want an * interrupt handler making a hypercall while we're already doing - * one! */ + * one! + */ local_irq_save(flags); if (lguest_data.hcall_status[next_call] != 0xFF) { /* Table full, so do normal hcall which will flush table. */ @@ -125,8 +131,9 @@ static void async_hcall(unsigned long call, unsigned long arg1, local_irq_restore(flags); } -/*G:035 Notice the lazy_hcall() above, rather than hcall(). This is our first - * real optimization trick! +/*G:035 + * Notice the lazy_hcall() above, rather than hcall(). This is our first real + * optimization trick! * * When lazy_mode is set, it means we're allowed to defer all hypercalls and do * them as a batch when lazy_mode is eventually turned off. Because hypercalls @@ -136,7 +143,8 @@ static void async_hcall(unsigned long call, unsigned long arg1, * lguest_leave_lazy_mode(). * * So, when we're in lazy mode, we call async_hcall() to store the call for - * future processing: */ + * future processing: + */ static void lazy_hcall1(unsigned long call, unsigned long arg1) { @@ -208,9 +216,11 @@ static void lguest_end_context_switch(struct task_struct *next) * check there before it tries to deliver an interrupt. */ -/* save_flags() is expected to return the processor state (ie. "flags"). The +/* + * save_flags() is expected to return the processor state (ie. "flags"). The * flags word contains all kind of stuff, but in practice Linux only cares - * about the interrupt flag. Our "save_flags()" just returns that. */ + * about the interrupt flag. Our "save_flags()" just returns that. + */ static unsigned long save_fl(void) { return lguest_data.irq_enabled; @@ -222,13 +232,15 @@ static void irq_disable(void) lguest_data.irq_enabled = 0; } -/* Let's pause a moment. Remember how I said these are called so often? +/* + * Let's pause a moment. Remember how I said these are called so often? * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to * break some rules. In particular, these functions are assumed to save their * own registers if they need to: normal C functions assume they can trash the * eax register. To use normal C functions, we use * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the - * C function, then restores it. */ + * C function, then restores it. + */ PV_CALLEE_SAVE_REGS_THUNK(save_fl); PV_CALLEE_SAVE_REGS_THUNK(irq_disable); /*:*/ @@ -237,18 +249,20 @@ PV_CALLEE_SAVE_REGS_THUNK(irq_disable); extern void lg_irq_enable(void); extern void lg_restore_fl(unsigned long flags); -/*M:003 Note that we don't check for outstanding interrupts when we re-enable - * them (or when we unmask an interrupt). This seems to work for the moment, - * since interrupts are rare and we'll just get the interrupt on the next timer - * tick, but now we can run with CONFIG_NO_HZ, we should revisit this. One way - * would be to put the "irq_enabled" field in a page by itself, and have the - * Host write-protect it when an interrupt comes in when irqs are disabled. - * There will then be a page fault as soon as interrupts are re-enabled. +/*M:003 + * Note that we don't check for outstanding interrupts when we re-enable them + * (or when we unmask an interrupt). This seems to work for the moment, since + * interrupts are rare and we'll just get the interrupt on the next timer tick, + * but now we can run with CONFIG_NO_HZ, we should revisit this. One way would + * be to put the "irq_enabled" field in a page by itself, and have the Host + * write-protect it when an interrupt comes in when irqs are disabled. There + * will then be a page fault as soon as interrupts are re-enabled. * * A better method is to implement soft interrupt disable generally for x86: * instead of disabling interrupts, we set a flag. If an interrupt does come * in, we then disable them for real. This is uncommon, so we could simply use - * a hypercall for interrupt control and not worry about efficiency. :*/ + * a hypercall for interrupt control and not worry about efficiency. +:*/ /*G:034 * The Interrupt Descriptor Table (IDT). @@ -261,10 +275,12 @@ extern void lg_restore_fl(unsigned long flags); static void lguest_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g) { - /* The gate_desc structure is 8 bytes long: we hand it to the Host in + /* + * The gate_desc structure is 8 bytes long: we hand it to the Host in * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors * around like this; typesafety wasn't a big concern in Linux's early - * years. */ + * years. + */ u32 *desc = (u32 *)g; /* Keep the local copy up to date. */ native_write_idt_entry(dt, entrynum, g); @@ -272,9 +288,11 @@ static void lguest_write_idt_entry(gate_desc *dt, kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]); } -/* Changing to a different IDT is very rare: we keep the IDT up-to-date every +/* + * Changing to a different IDT is very rare: we keep the IDT up-to-date every * time it is written, so we can simply loop through all entries and tell the - * Host about them. */ + * Host about them. + */ static void lguest_load_idt(const struct desc_ptr *desc) { unsigned int i; @@ -305,9 +323,11 @@ static void lguest_load_gdt(const struct desc_ptr *desc) kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b); } -/* For a single GDT entry which changes, we do the lazy thing: alter our GDT, +/* + * For a single GDT entry which changes, we do the lazy thing: alter our GDT, * then tell the Host to reload the entire thing. This operation is so rare - * that this naive implementation is reasonable. */ + * that this naive implementation is reasonable. + */ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, const void *desc, int type) { @@ -317,29 +337,36 @@ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, dt[entrynum].a, dt[entrynum].b); } -/* OK, I lied. There are three "thread local storage" GDT entries which change +/* + * OK, I lied. There are three "thread local storage" GDT entries which change * on every context switch (these three entries are how glibc implements - * __thread variables). So we have a hypercall specifically for this case. */ + * __thread variables). So we have a hypercall specifically for this case. + */ static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) { - /* There's one problem which normal hardware doesn't have: the Host + /* + * There's one problem which normal hardware doesn't have: the Host * can't handle us removing entries we're currently using. So we clear - * the GS register here: if it's needed it'll be reloaded anyway. */ + * the GS register here: if it's needed it'll be reloaded anyway. + */ lazy_load_gs(0); lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu); } -/*G:038 That's enough excitement for now, back to ploughing through each of - * the different pv_ops structures (we're about 1/3 of the way through). +/*G:038 + * That's enough excitement for now, back to ploughing through each of the + * different pv_ops structures (we're about 1/3 of the way through). * * This is the Local Descriptor Table, another weird Intel thingy. Linux only * uses this for some strange applications like Wine. We don't do anything - * here, so they'll get an informative and friendly Segmentation Fault. */ + * here, so they'll get an informative and friendly Segmentation Fault. + */ static void lguest_set_ldt(const void *addr, unsigned entries) { } -/* This loads a GDT entry into the "Task Register": that entry points to a +/* + * This loads a GDT entry into the "Task Register": that entry points to a * structure called the Task State Segment. Some comments scattered though the * kernel code indicate that this used for task switching in ages past, along * with blood sacrifice and astrology. @@ -347,19 +374,21 @@ static void lguest_set_ldt(const void *addr, unsigned entries) * Now there's nothing interesting in here that we don't get told elsewhere. * But the native version uses the "ltr" instruction, which makes the Host * complain to the Guest about a Segmentation Fault and it'll oops. So we - * override the native version with a do-nothing version. */ + * override the native version with a do-nothing version. + */ static void lguest_load_tr_desc(void) { } -/* The "cpuid" instruction is a way of querying both the CPU identity +/* + * The "cpuid" instruction is a way of querying both the CPU identity * (manufacturer, model, etc) and its features. It was introduced before the * Pentium in 1993 and keeps getting extended by both Intel, AMD and others. * As you might imagine, after a decade and a half this treatment, it is now a * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. * * This instruction even it has its own Wikipedia entry. The Wikipedia entry - * has been translated into 4 languages. I am not making this up! + * has been translated into 5 languages. I am not making this up! * * We could get funky here and identify ourselves as "GenuineLguest", but * instead we just use the real "cpuid" instruction. Then I pretty much turned @@ -371,7 +400,8 @@ static void lguest_load_tr_desc(void) * Replacing the cpuid so we can turn features off is great for the kernel, but * anyone (including userspace) can just use the raw "cpuid" instruction and * the Host won't even notice since it isn't privileged. So we try not to get - * too worked up about it. */ + * too worked up about it. + */ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, unsigned int *cx, unsigned int *dx) { @@ -379,43 +409,63 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, native_cpuid(ax, bx, cx, dx); switch (function) { - case 0: /* ID and highest CPUID. Futureproof a little by sticking to - * older ones. */ + /* + * CPUID 0 gives the highest legal CPUID number (and the ID string). + * We futureproof our code a little by sticking to known CPUID values. + */ + case 0: if (*ax > 5) *ax = 5; break; - case 1: /* Basic feature request. */ - /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ + + /* + * CPUID 1 is a basic feature request. + * + * CX: we only allow kernel to see SSE3, CMPXCHG16B and SSSE3 + * DX: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU and PAE. + */ + case 1: *cx &= 0x00002201; - /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU, PAE. */ *dx &= 0x07808151; - /* The Host can do a nice optimization if it knows that the + /* + * The Host can do a nice optimization if it knows that the * kernel mappings (addresses above 0xC0000000 or whatever * PAGE_OFFSET is set to) haven't changed. But Linux calls * flush_tlb_user() for both user and kernel mappings unless - * the Page Global Enable (PGE) feature bit is set. */ + * the Page Global Enable (PGE) feature bit is set. + */ *dx |= 0x00002000; - /* We also lie, and say we're family id 5. 6 or greater + /* + * We also lie, and say we're family id 5. 6 or greater * leads to a rdmsr in early_init_intel which we can't handle. - * Family ID is returned as bits 8-12 in ax. */ + * Family ID is returned as bits 8-12 in ax. + */ *ax &= 0xFFFFF0FF; *ax |= 0x00000500; break; + /* + * 0x80000000 returns the highest Extended Function, so we futureproof + * like we do above by limiting it to known fields. + */ case 0x80000000: - /* Futureproof this a little: if they ask how much extended - * processor information there is, limit it to known fields. */ if (*ax > 0x80000008) *ax = 0x80000008; break; + + /* + * PAE systems can mark pages as non-executable. Linux calls this the + * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced + * Virus Protection). We just switch turn if off here, since we don't + * support it. + */ case 0x80000001: - /* Here we should fix nx cap depending on host. */ - /* For this version of PAE, we just clear NX bit. */ *dx &= ~(1 << 20); break; } } -/* Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. +/* + * Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother * it. The Host needs to know when the Guest wants to change them, so we have * a whole series of functions like read_cr0() and write_cr0(). @@ -430,7 +480,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, * name like "FPUTRAP bit" be a little less cryptic? * * We store cr0 locally because the Host never changes it. The Guest sometimes - * wants to read it and we'd prefer not to bother the Host unnecessarily. */ + * wants to read it and we'd prefer not to bother the Host unnecessarily. + */ static unsigned long current_cr0; static void lguest_write_cr0(unsigned long val) { @@ -443,18 +494,22 @@ static unsigned long lguest_read_cr0(void) return current_cr0; } -/* Intel provided a special instruction to clear the TS bit for people too cool +/* + * Intel provided a special instruction to clear the TS bit for people too cool * to use write_cr0() to do it. This "clts" instruction is faster, because all - * the vowels have been optimized out. */ + * the vowels have been optimized out. + */ static void lguest_clts(void) { lazy_hcall1(LHCALL_TS, 0); current_cr0 &= ~X86_CR0_TS; } -/* cr2 is the virtual address of the last page fault, which the Guest only ever +/* + * cr2 is the virtual address of the last page fault, which the Guest only ever * reads. The Host kindly writes this into our "struct lguest_data", so we - * just read it out of there. */ + * just read it out of there. + */ static unsigned long lguest_read_cr2(void) { return lguest_data.cr2; @@ -463,10 +518,12 @@ static unsigned long lguest_read_cr2(void) /* See lguest_set_pte() below. */ static bool cr3_changed = false; -/* cr3 is the current toplevel pagetable page: the principle is the same as +/* + * cr3 is the current toplevel pagetable page: the principle is the same as * cr0. Keep a local copy, and tell the Host when it changes. The only * difference is that our local copy is in lguest_data because the Host needs - * to set it upon our initial hypercall. */ + * to set it upon our initial hypercall. + */ static void lguest_write_cr3(unsigned long cr3) { lguest_data.pgdir = cr3; @@ -538,10 +595,12 @@ static void lguest_write_cr4(unsigned long val) * the real page tables based on the Guests'. */ -/* The Guest calls this to set a second-level entry (pte), ie. to map a page +/* + * The Guest calls this to set a second-level entry (pte), ie. to map a page * into a process' address space. We set the entry then tell the Host the * toplevel and address this corresponds to. The Guest uses one pagetable per - * process, so we need to tell the Host which one we're changing (mm->pgd). */ + * process, so we need to tell the Host which one we're changing (mm->pgd). + */ static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { @@ -560,10 +619,13 @@ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, lguest_pte_update(mm, addr, ptep); } -/* The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd +/* + * The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd * to set a middle-level entry when PAE is activated. + * * Again, we set the entry then tell the Host which page we changed, - * and the index of the entry we changed. */ + * and the index of the entry we changed. + */ #ifdef CONFIG_X86_PAE static void lguest_set_pud(pud_t *pudp, pud_t pudval) { @@ -582,8 +644,7 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) } #else -/* The Guest calls lguest_set_pmd to set a top-level entry when PAE is not - * activated. */ +/* The Guest calls lguest_set_pmd to set a top-level entry when !PAE. */ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) { native_set_pmd(pmdp, pmdval); @@ -592,7 +653,8 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) } #endif -/* There are a couple of legacy places where the kernel sets a PTE, but we +/* + * There are a couple of legacy places where the kernel sets a PTE, but we * don't know the top level any more. This is useless for us, since we don't * know which pagetable is changing or what address, so we just tell the Host * to forget all of them. Fortunately, this is very rare. @@ -600,7 +662,8 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) * ... except in early boot when the kernel sets up the initial pagetables, * which makes booting astonishingly slow: 1.83 seconds! So we don't even tell * the Host anything changed until we've done the first page table switch, - * which brings boot back to 0.25 seconds. */ + * which brings boot back to 0.25 seconds. + */ static void lguest_set_pte(pte_t *ptep, pte_t pteval) { native_set_pte(ptep, pteval); @@ -628,7 +691,8 @@ void lguest_pmd_clear(pmd_t *pmdp) } #endif -/* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on +/* + * Unfortunately for Lguest, the pv_mmu_ops for page tables were based on * native page table operations. On native hardware you can set a new page * table entry whenever you want, but if you want to remove one you have to do * a TLB flush (a TLB is a little cache of page table entries kept by the CPU). @@ -637,24 +701,29 @@ void lguest_pmd_clear(pmd_t *pmdp) * called when a valid entry is written, not when it's removed (ie. marked not * present). Instead, this is where we come when the Guest wants to remove a * page table entry: we tell the Host to set that entry to 0 (ie. the present - * bit is zero). */ + * bit is zero). + */ static void lguest_flush_tlb_single(unsigned long addr) { /* Simply set it to zero: if it was not, it will fault back in. */ lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0); } -/* This is what happens after the Guest has removed a large number of entries. +/* + * This is what happens after the Guest has removed a large number of entries. * This tells the Host that any of the page table entries for userspace might - * have changed, ie. virtual addresses below PAGE_OFFSET. */ + * have changed, ie. virtual addresses below PAGE_OFFSET. + */ static void lguest_flush_tlb_user(void) { lazy_hcall1(LHCALL_FLUSH_TLB, 0); } -/* This is called when the kernel page tables have changed. That's not very +/* + * This is called when the kernel page tables have changed. That's not very * common (unless the Guest is using highmem, which makes the Guest extremely - * slow), so it's worth separating this from the user flushing above. */ + * slow), so it's worth separating this from the user flushing above. + */ static void lguest_flush_tlb_kernel(void) { lazy_hcall1(LHCALL_FLUSH_TLB, 1); @@ -691,23 +760,27 @@ static struct irq_chip lguest_irq_controller = { .unmask = enable_lguest_irq, }; -/* This sets up the Interrupt Descriptor Table (IDT) entry for each hardware +/* + * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware * interrupt (except 128, which is used for system calls), and then tells the * Linux infrastructure that each interrupt is controlled by our level-based - * lguest interrupt controller. */ + * lguest interrupt controller. + */ static void __init lguest_init_IRQ(void) { unsigned int i; for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { - /* Some systems map "vectors" to interrupts weirdly. Lguest has - * a straightforward 1 to 1 mapping, so force that here. */ + /* Some systems map "vectors" to interrupts weirdly. Not us! */ __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR; if (i != SYSCALL_VECTOR) set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); } - /* This call is required to set up for 4k stacks, where we have - * separate stacks for hard and soft interrupts. */ + + /* + * This call is required to set up for 4k stacks, where we have + * separate stacks for hard and soft interrupts. + */ irq_ctx_init(smp_processor_id()); } @@ -729,31 +802,39 @@ static unsigned long lguest_get_wallclock(void) return lguest_data.time.tv_sec; } -/* The TSC is an Intel thing called the Time Stamp Counter. The Host tells us +/* + * The TSC is an Intel thing called the Time Stamp Counter. The Host tells us * what speed it runs at, or 0 if it's unusable as a reliable clock source. * This matches what we want here: if we return 0 from this function, the x86 - * TSC clock will give up and not register itself. */ + * TSC clock will give up and not register itself. + */ static unsigned long lguest_tsc_khz(void) { return lguest_data.tsc_khz; } -/* If we can't use the TSC, the kernel falls back to our lower-priority - * "lguest_clock", where we read the time value given to us by the Host. */ +/* + * If we can't use the TSC, the kernel falls back to our lower-priority + * "lguest_clock", where we read the time value given to us by the Host. + */ static cycle_t lguest_clock_read(struct clocksource *cs) { unsigned long sec, nsec; - /* Since the time is in two parts (seconds and nanoseconds), we risk + /* + * Since the time is in two parts (seconds and nanoseconds), we risk * reading it just as it's changing from 99 & 0.999999999 to 100 and 0, * and getting 99 and 0. As Linux tends to come apart under the stress - * of time travel, we must be careful: */ + * of time travel, we must be careful: + */ do { /* First we read the seconds part. */ sec = lguest_data.time.tv_sec; - /* This read memory barrier tells the compiler and the CPU that + /* + * This read memory barrier tells the compiler and the CPU that * this can't be reordered: we have to complete the above - * before going on. */ + * before going on. + */ rmb(); /* Now we read the nanoseconds part. */ nsec = lguest_data.time.tv_nsec; @@ -777,9 +858,11 @@ static struct clocksource lguest_clock = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; -/* We also need a "struct clock_event_device": Linux asks us to set it to go +/* + * We also need a "struct clock_event_device": Linux asks us to set it to go * off some time in the future. Actually, James Morris figured all this out, I - * just applied the patch. */ + * just applied the patch. + */ static int lguest_clockevent_set_next_event(unsigned long delta, struct clock_event_device *evt) { @@ -829,8 +912,10 @@ static struct clock_event_device lguest_clockevent = { .max_delta_ns = LG_CLOCK_MAX_DELTA, }; -/* This is the Guest timer interrupt handler (hardware interrupt 0). We just - * call the clockevent infrastructure and it does whatever needs doing. */ +/* + * This is the Guest timer interrupt handler (hardware interrupt 0). We just + * call the clockevent infrastructure and it does whatever needs doing. + */ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) { unsigned long flags; @@ -841,10 +926,12 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) local_irq_restore(flags); } -/* At some point in the boot process, we get asked to set up our timing +/* + * At some point in the boot process, we get asked to set up our timing * infrastructure. The kernel doesn't expect timer interrupts before this, but * we cleverly initialized the "blocked_interrupts" field of "struct - * lguest_data" so that timer interrupts were blocked until now. */ + * lguest_data" so that timer interrupts were blocked until now. + */ static void lguest_time_init(void) { /* Set up the timer interrupt (0) to go to our simple timer routine */ @@ -868,14 +955,16 @@ static void lguest_time_init(void) * to work. They're pretty simple. */ -/* The Guest needs to tell the Host what stack it expects traps to use. For +/* + * The Guest needs to tell the Host what stack it expects traps to use. For * native hardware, this is part of the Task State Segment mentioned above in * lguest_load_tr_desc(), but to help hypervisors there's this special call. * * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data * segment), the privilege level (we're privilege level 1, the Host is 0 and * will not tolerate us trying to use that), the stack pointer, and the number - * of pages in the stack. */ + * of pages in the stack. + */ static void lguest_load_sp0(struct tss_struct *tss, struct thread_struct *thread) { @@ -889,7 +978,8 @@ static void lguest_set_debugreg(int regno, unsigned long value) /* FIXME: Implement */ } -/* There are times when the kernel wants to make sure that no memory writes are +/* + * There are times when the kernel wants to make sure that no memory writes are * caught in the cache (that they've all reached real hardware devices). This * doesn't matter for the Guest which has virtual hardware. * @@ -903,11 +993,13 @@ static void lguest_wbinvd(void) { } -/* If the Guest expects to have an Advanced Programmable Interrupt Controller, +/* + * If the Guest expects to have an Advanced Programmable Interrupt Controller, * we play dumb by ignoring writes and returning 0 for reads. So it's no * longer Programmable nor Controlling anything, and I don't think 8 lines of * code qualifies for Advanced. It will also never interrupt anything. It - * does, however, allow us to get through the Linux boot code. */ + * does, however, allow us to get through the Linux boot code. + */ #ifdef CONFIG_X86_LOCAL_APIC static void lguest_apic_write(u32 reg, u32 v) { @@ -956,11 +1048,13 @@ static void lguest_safe_halt(void) kvm_hypercall0(LHCALL_HALT); } -/* The SHUTDOWN hypercall takes a string to describe what's happening, and +/* + * The SHUTDOWN hypercall takes a string to describe what's happening, and * an argument which says whether this to restart (reboot) the Guest or not. * * Note that the Host always prefers that the Guest speak in physical addresses - * rather than virtual addresses, so we use __pa() here. */ + * rather than virtual addresses, so we use __pa() here. + */ static void lguest_power_off(void) { kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"), @@ -991,8 +1085,10 @@ static __init char *lguest_memory_setup(void) * nice to move it back to lguest_init. Patch welcome... */ atomic_notifier_chain_register(&panic_notifier_list, &paniced); - /* The Linux bootloader header contains an "e820" memory map: the - * Launcher populated the first entry with our memory limit. */ + /* + *The Linux bootloader header contains an "e820" memory map: the + * Launcher populated the first entry with our memory limit. + */ e820_add_region(boot_params.e820_map[0].addr, boot_params.e820_map[0].size, boot_params.e820_map[0].type); @@ -1001,16 +1097,17 @@ static __init char *lguest_memory_setup(void) return "LGUEST"; } -/* We will eventually use the virtio console device to produce console output, +/* + * We will eventually use the virtio console device to produce console output, * but before that is set up we use LHCALL_NOTIFY on normal memory to produce - * console output. */ + * console output. + */ static __init int early_put_chars(u32 vtermno, const char *buf, int count) { char scratch[17]; unsigned int len = count; - /* We use a nul-terminated string, so we have to make a copy. Icky, - * huh? */ + /* We use a nul-terminated string, so we make a copy. Icky, huh? */ if (len > sizeof(scratch) - 1) len = sizeof(scratch) - 1; scratch[len] = '\0'; @@ -1021,8 +1118,10 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count) return len; } -/* Rebooting also tells the Host we're finished, but the RESTART flag tells the - * Launcher to reboot us. */ +/* + * Rebooting also tells the Host we're finished, but the RESTART flag tells the + * Launcher to reboot us. + */ static void lguest_restart(char *reason) { kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART); @@ -1049,7 +1148,8 @@ static void lguest_restart(char *reason) * fit comfortably. * * First we need assembly templates of each of the patchable Guest operations, - * and these are in i386_head.S. */ + * and these are in i386_head.S. + */ /*G:060 We construct a table from the assembler templates: */ static const struct lguest_insns @@ -1060,9 +1160,11 @@ static const struct lguest_insns [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, }; -/* Now our patch routine is fairly simple (based on the native one in +/* + * Now our patch routine is fairly simple (based on the native one in * paravirt.c). If we have a replacement, we copy it in and return how much of - * the available space we used. */ + * the available space we used. + */ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, unsigned long addr, unsigned len) { @@ -1074,8 +1176,7 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, insn_len = lguest_insns[type].end - lguest_insns[type].start; - /* Similarly if we can't fit replacement (shouldn't happen, but let's - * be thorough). */ + /* Similarly if it can't fit (doesn't happen, but let's be thorough). */ if (len < insn_len) return paravirt_patch_default(type, clobber, ibuf, addr, len); @@ -1084,22 +1185,28 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, return insn_len; } -/*G:029 Once we get to lguest_init(), we know we're a Guest. The various +/*G:029 + * Once we get to lguest_init(), we know we're a Guest. The various * pv_ops structures in the kernel provide points for (almost) every routine we - * have to override to avoid privileged instructions. */ + * have to override to avoid privileged instructions. + */ __init void lguest_init(void) { - /* We're under lguest, paravirt is enabled, and we're running at - * privilege level 1, not 0 as normal. */ + /* We're under lguest. */ pv_info.name = "lguest"; + /* Paravirt is enabled. */ pv_info.paravirt_enabled = 1; + /* We're running at privilege level 1, not 0 as normal. */ pv_info.kernel_rpl = 1; + /* Everyone except Xen runs with this set. */ pv_info.shared_kernel_pmd = 1; - /* We set up all the lguest overrides for sensitive operations. These - * are detailed with the operations themselves. */ + /* + * We set up all the lguest overrides for sensitive operations. These + * are detailed with the operations themselves. + */ - /* interrupt-related operations */ + /* Interrupt-related operations */ pv_irq_ops.init_IRQ = lguest_init_IRQ; pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); @@ -1107,11 +1214,11 @@ __init void lguest_init(void) pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable); pv_irq_ops.safe_halt = lguest_safe_halt; - /* init-time operations */ + /* Setup operations */ pv_init_ops.memory_setup = lguest_memory_setup; pv_init_ops.patch = lguest_patch; - /* Intercepts of various cpu instructions */ + /* Intercepts of various CPU instructions */ pv_cpu_ops.load_gdt = lguest_load_gdt; pv_cpu_ops.cpuid = lguest_cpuid; pv_cpu_ops.load_idt = lguest_load_idt; @@ -1132,7 +1239,7 @@ __init void lguest_init(void) pv_cpu_ops.start_context_switch = paravirt_start_context_switch; pv_cpu_ops.end_context_switch = lguest_end_context_switch; - /* pagetable management */ + /* Pagetable management */ pv_mmu_ops.write_cr3 = lguest_write_cr3; pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user; pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single; @@ -1154,54 +1261,71 @@ __init void lguest_init(void) pv_mmu_ops.pte_update_defer = lguest_pte_update; #ifdef CONFIG_X86_LOCAL_APIC - /* apic read/write intercepts */ + /* APIC read/write intercepts */ set_lguest_basic_apic_ops(); #endif - /* time operations */ + /* Time operations */ pv_time_ops.get_wallclock = lguest_get_wallclock; pv_time_ops.time_init = lguest_time_init; pv_time_ops.get_tsc_khz = lguest_tsc_khz; - /* Now is a good time to look at the implementations of these functions - * before returning to the rest of lguest_init(). */ + /* + * Now is a good time to look at the implementations of these functions + * before returning to the rest of lguest_init(). + */ - /*G:070 Now we've seen all the paravirt_ops, we return to + /*G:070 + * Now we've seen all the paravirt_ops, we return to * lguest_init() where the rest of the fairly chaotic boot setup - * occurs. */ + * occurs. + */ - /* The stack protector is a weird thing where gcc places a canary + /* + * The stack protector is a weird thing where gcc places a canary * value on the stack and then checks it on return. This file is * compiled with -fno-stack-protector it, so we got this far without * problems. The value of the canary is kept at offset 20 from the * %gs register, so we need to set that up before calling C functions - * in other files. */ + * in other files. + */ setup_stack_canary_segment(0); - /* We could just call load_stack_canary_segment(), but we might as - * call switch_to_new_gdt() which loads the whole table and sets up - * the per-cpu segment descriptor register %fs as well. */ + + /* + * We could just call load_stack_canary_segment(), but we might as well + * call switch_to_new_gdt() which loads the whole table and sets up the + * per-cpu segment descriptor register %fs as well. + */ switch_to_new_gdt(0); /* As described in head_32.S, we map the first 128M of memory. */ max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; - /* The Host<->Guest Switcher lives at the top of our address space, and + /* + * The Host<->Guest Switcher lives at the top of our address space, and * the Host told us how big it is when we made LGUEST_INIT hypercall: - * it put the answer in lguest_data.reserve_mem */ + * it put the answer in lguest_data.reserve_mem + */ reserve_top_address(lguest_data.reserve_mem); - /* If we don't initialize the lock dependency checker now, it crashes - * paravirt_disable_iospace. */ + /* + * If we don't initialize the lock dependency checker now, it crashes + * paravirt_disable_iospace. + */ lockdep_init(); - /* The IDE code spends about 3 seconds probing for disks: if we reserve + /* + * The IDE code spends about 3 seconds probing for disks: if we reserve * all the I/O ports up front it can't get them and so doesn't probe. * Other device drivers are similar (but less severe). This cuts the - * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. */ + * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. + */ paravirt_disable_iospace(); - /* This is messy CPU setup stuff which the native boot code does before - * start_kernel, so we have to do, too: */ + /* + * This is messy CPU setup stuff which the native boot code does before + * start_kernel, so we have to do, too: + */ cpu_detect(&new_cpu_data); /* head.S usually sets up the first capability word, so do it here. */ new_cpu_data.x86_capability[0] = cpuid_edx(1); @@ -1218,22 +1342,28 @@ __init void lguest_init(void) acpi_ht = 0; #endif - /* We set the preferred console to "hvc". This is the "hypervisor + /* + * We set the preferred console to "hvc". This is the "hypervisor * virtual console" driver written by the PowerPC people, which we also - * adapted for lguest's use. */ + * adapted for lguest's use. + */ add_preferred_console("hvc", 0, NULL); /* Register our very early console. */ virtio_cons_early_init(early_put_chars); - /* Last of all, we set the power management poweroff hook to point to + /* + * Last of all, we set the power management poweroff hook to point to * the Guest routine to power off, and the reboot hook to our restart - * routine. */ + * routine. + */ pm_power_off = lguest_power_off; machine_ops.restart = lguest_restart; - /* Now we're set up, call i386_start_kernel() in head32.c and we proceed - * to boot as normal. It never returns. */ + /* + * Now we're set up, call i386_start_kernel() in head32.c and we proceed + * to boot as normal. It never returns. + */ i386_start_kernel(); } /* diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index a9c8cfe61cd4..db6aa95eb054 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -5,7 +5,8 @@ #include #include -/*G:020 Our story starts with the kernel booting into startup_32 in +/*G:020 + * Our story starts with the kernel booting into startup_32 in * arch/x86/kernel/head_32.S. It expects a boot header, which is created by * the bootloader (the Launcher in our case). * @@ -21,11 +22,14 @@ * data without remembering to subtract __PAGE_OFFSET! * * The .section line puts this code in .init.text so it will be discarded after - * boot. */ + * boot. + */ .section .init.text, "ax", @progbits ENTRY(lguest_entry) - /* We make the "initialization" hypercall now to tell the Host about - * us, and also find out where it put our page tables. */ + /* + * We make the "initialization" hypercall now to tell the Host about + * us, and also find out where it put our page tables. + */ movl $LHCALL_LGUEST_INIT, %eax movl $lguest_data - __PAGE_OFFSET, %ebx .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ @@ -33,13 +37,14 @@ ENTRY(lguest_entry) /* Set up the initial stack so we can run C code. */ movl $(init_thread_union+THREAD_SIZE),%esp - /* Jumps are relative, and we're running __PAGE_OFFSET too low at the - * moment. */ + /* Jumps are relative: we're running __PAGE_OFFSET too low. */ jmp lguest_init+__PAGE_OFFSET -/*G:055 We create a macro which puts the assembler code between lgstart_ and - * lgend_ markers. These templates are put in the .text section: they can't be - * discarded after boot as we may need to patch modules, too. */ +/*G:055 + * We create a macro which puts the assembler code between lgstart_ and lgend_ + * markers. These templates are put in the .text section: they can't be + * discarded after boot as we may need to patch modules, too. + */ .text #define LGUEST_PATCH(name, insns...) \ lgstart_##name: insns; lgend_##name:; \ @@ -48,58 +53,74 @@ ENTRY(lguest_entry) LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) -/*G:033 But using those wrappers is inefficient (we'll see why that doesn't - * matter for save_fl and irq_disable later). If we write our routines - * carefully in assembler, we can avoid clobbering any registers and avoid - * jumping through the wrapper functions. +/*G:033 + * But using those wrappers is inefficient (we'll see why that doesn't matter + * for save_fl and irq_disable later). If we write our routines carefully in + * assembler, we can avoid clobbering any registers and avoid jumping through + * the wrapper functions. * * I skipped over our first piece of assembler, but this one is worth studying - * in a bit more detail so I'll describe in easy stages. First, the routine - * to enable interrupts: */ + * in a bit more detail so I'll describe in easy stages. First, the routine to + * enable interrupts: + */ ENTRY(lg_irq_enable) - /* The reverse of irq_disable, this sets lguest_data.irq_enabled to - * X86_EFLAGS_IF (ie. "Interrupts enabled"). */ + /* + * The reverse of irq_disable, this sets lguest_data.irq_enabled to + * X86_EFLAGS_IF (ie. "Interrupts enabled"). + */ movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled - /* But now we need to check if the Host wants to know: there might have + /* + * But now we need to check if the Host wants to know: there might have * been interrupts waiting to be delivered, in which case it will have * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we - * jump to send_interrupts, otherwise we're done. */ + * jump to send_interrupts, otherwise we're done. + */ testl $0, lguest_data+LGUEST_DATA_irq_pending jnz send_interrupts - /* One cool thing about x86 is that you can do many things without using + /* + * One cool thing about x86 is that you can do many things without using * a register. In this case, the normal path hasn't needed to save or - * restore any registers at all! */ + * restore any registers at all! + */ ret send_interrupts: - /* OK, now we need a register: eax is used for the hypercall number, + /* + * OK, now we need a register: eax is used for the hypercall number, * which is LHCALL_SEND_INTERRUPTS. * * We used not to bother with this pending detection at all, which was * much simpler. Sooner or later the Host would realize it had to * send us an interrupt. But that turns out to make performance 7 * times worse on a simple tcp benchmark. So now we do this the hard - * way. */ + * way. + */ pushl %eax movl $LHCALL_SEND_INTERRUPTS, %eax - /* This is a vmcall instruction (same thing that KVM uses). Older + /* + * This is a vmcall instruction (same thing that KVM uses). Older * assembler versions might not know the "vmcall" instruction, so we - * create one manually here. */ + * create one manually here. + */ .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ popl %eax ret -/* Finally, the "popf" or "restore flags" routine. The %eax register holds the +/* + * Finally, the "popf" or "restore flags" routine. The %eax register holds the * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're - * enabling interrupts again, if it's 0 we're leaving them off. */ + * enabling interrupts again, if it's 0 we're leaving them off. + */ ENTRY(lg_restore_fl) /* This is just "lguest_data.irq_enabled = flags;" */ movl %eax, lguest_data+LGUEST_DATA_irq_enabled - /* Now, if the %eax value has enabled interrupts and + /* + * Now, if the %eax value has enabled interrupts and * lguest_data.irq_pending is set, we want to tell the Host so it can * deliver any outstanding interrupts. Fortunately, both values will * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl" * instruction will AND them together for us. If both are set, we - * jump to send_interrupts. */ + * jump to send_interrupts. + */ testl lguest_data+LGUEST_DATA_irq_pending, %eax jnz send_interrupts /* Again, the normal path has used no extra registers. Clever, huh? */ @@ -109,22 +130,24 @@ ENTRY(lg_restore_fl) .global lguest_noirq_start .global lguest_noirq_end -/*M:004 When the Host reflects a trap or injects an interrupt into the Guest, - * it sets the eflags interrupt bit on the stack based on - * lguest_data.irq_enabled, so the Guest iret logic does the right thing when - * restoring it. However, when the Host sets the Guest up for direct traps, - * such as system calls, the processor is the one to push eflags onto the - * stack, and the interrupt bit will be 1 (in reality, interrupts are always - * enabled in the Guest). +/*M:004 + * When the Host reflects a trap or injects an interrupt into the Guest, it + * sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled, + * so the Guest iret logic does the right thing when restoring it. However, + * when the Host sets the Guest up for direct traps, such as system calls, the + * processor is the one to push eflags onto the stack, and the interrupt bit + * will be 1 (in reality, interrupts are always enabled in the Guest). * * This turns out to be harmless: the only trap which should happen under Linux * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc * regions), which has to be reflected through the Host anyway. If another * trap *does* go off when interrupts are disabled, the Guest will panic, and - * we'll never get to this iret! :*/ + * we'll never get to this iret! +:*/ -/*G:045 There is one final paravirt_op that the Guest implements, and glancing - * at it you can see why I left it to last. It's *cool*! It's in *assembler*! +/*G:045 + * There is one final paravirt_op that the Guest implements, and glancing at it + * you can see why I left it to last. It's *cool*! It's in *assembler*! * * The "iret" instruction is used to return from an interrupt or trap. The * stack looks like this: @@ -148,15 +171,18 @@ ENTRY(lg_restore_fl) * return to userspace or wherever. Our solution to this is to surround the * code with lguest_noirq_start: and lguest_noirq_end: labels. We tell the * Host that it is *never* to interrupt us there, even if interrupts seem to be - * enabled. */ + * enabled. + */ ENTRY(lguest_iret) pushl %eax movl 12(%esp), %eax lguest_noirq_start: - /* Note the %ss: segment prefix here. Normal data accesses use the + /* + * Note the %ss: segment prefix here. Normal data accesses use the * "ds" segment, but that will have already been restored for whatever * we're returning to (such as userspace): we can't trust it. The %ss: - * prefix makes sure we use the stack segment, which is still valid. */ + * prefix makes sure we use the stack segment, which is still valid. + */ movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled popl %eax iret diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index a6974e9b8ebf..cd058bc903ff 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -1,6 +1,8 @@ -/*P:400 This contains run_guest() which actually calls into the Host<->Guest +/*P:400 + * This contains run_guest() which actually calls into the Host<->Guest * Switcher and analyzes the return, such as determining if the Guest wants the - * Host to do something. This file also contains useful helper routines. :*/ + * Host to do something. This file also contains useful helper routines. +:*/ #include #include #include @@ -24,7 +26,8 @@ static struct page **switcher_page; /* This One Big lock protects all inter-guest data structures. */ DEFINE_MUTEX(lguest_lock); -/*H:010 We need to set up the Switcher at a high virtual address. Remember the +/*H:010 + * We need to set up the Switcher at a high virtual address. Remember the * Switcher is a few hundred bytes of assembler code which actually changes the * CPU to run the Guest, and then changes back to the Host when a trap or * interrupt happens. @@ -33,7 +36,8 @@ DEFINE_MUTEX(lguest_lock); * Host since it will be running as the switchover occurs. * * Trying to map memory at a particular address is an unusual thing to do, so - * it's not a simple one-liner. */ + * it's not a simple one-liner. + */ static __init int map_switcher(void) { int i, err; @@ -47,8 +51,10 @@ static __init int map_switcher(void) * easy. */ - /* We allocate an array of struct page pointers. map_vm_area() wants - * this, rather than just an array of pages. */ + /* + * We allocate an array of struct page pointers. map_vm_area() wants + * this, rather than just an array of pages. + */ switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES, GFP_KERNEL); if (!switcher_page) { @@ -56,8 +62,10 @@ static __init int map_switcher(void) goto out; } - /* Now we actually allocate the pages. The Guest will see these pages, - * so we make sure they're zeroed. */ + /* + * Now we actually allocate the pages. The Guest will see these pages, + * so we make sure they're zeroed. + */ for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { unsigned long addr = get_zeroed_page(GFP_KERNEL); if (!addr) { @@ -67,19 +75,23 @@ static __init int map_switcher(void) switcher_page[i] = virt_to_page(addr); } - /* First we check that the Switcher won't overlap the fixmap area at + /* + * First we check that the Switcher won't overlap the fixmap area at * the top of memory. It's currently nowhere near, but it could have - * very strange effects if it ever happened. */ + * very strange effects if it ever happened. + */ if (SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE > FIXADDR_START){ err = -ENOMEM; printk("lguest: mapping switcher would thwack fixmap\n"); goto free_pages; } - /* Now we reserve the "virtual memory area" we want: 0xFFC00000 + /* + * Now we reserve the "virtual memory area" we want: 0xFFC00000 * (SWITCHER_ADDR). We might not get it in theory, but in practice * it's worked so far. The end address needs +1 because __get_vm_area - * allocates an extra guard page, so we need space for that. */ + * allocates an extra guard page, so we need space for that. + */ switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE, VM_ALLOC, SWITCHER_ADDR, SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE); @@ -89,11 +101,13 @@ static __init int map_switcher(void) goto free_pages; } - /* This code actually sets up the pages we've allocated to appear at + /* + * This code actually sets up the pages we've allocated to appear at * SWITCHER_ADDR. map_vm_area() takes the vma we allocated above, the * kind of pages we're mapping (kernel pages), and a pointer to our * array of struct pages. It increments that pointer, but we don't - * care. */ + * care. + */ pagep = switcher_page; err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep); if (err) { @@ -101,8 +115,10 @@ static __init int map_switcher(void) goto free_vma; } - /* Now the Switcher is mapped at the right address, we can't fail! - * Copy in the compiled-in Switcher code (from _switcher.S). */ + /* + * Now the Switcher is mapped at the right address, we can't fail! + * Copy in the compiled-in Switcher code (from _switcher.S). + */ memcpy(switcher_vma->addr, start_switcher_text, end_switcher_text - start_switcher_text); @@ -124,8 +140,7 @@ out: } /*:*/ -/* Cleaning up the mapping when the module is unloaded is almost... - * too easy. */ +/* Cleaning up the mapping when the module is unloaded is almost... too easy. */ static void unmap_switcher(void) { unsigned int i; @@ -151,16 +166,19 @@ static void unmap_switcher(void) * But we can't trust the Guest: it might be trying to access the Launcher * code. We have to check that the range is below the pfn_limit the Launcher * gave us. We have to make sure that addr + len doesn't give us a false - * positive by overflowing, too. */ + * positive by overflowing, too. + */ bool lguest_address_ok(const struct lguest *lg, unsigned long addr, unsigned long len) { return (addr+len) / PAGE_SIZE < lg->pfn_limit && (addr+len >= addr); } -/* This routine copies memory from the Guest. Here we can see how useful the +/* + * This routine copies memory from the Guest. Here we can see how useful the * kill_lguest() routine we met in the Launcher can be: we return a random - * value (all zeroes) instead of needing to return an error. */ + * value (all zeroes) instead of needing to return an error. + */ void __lgread(struct lg_cpu *cpu, void *b, unsigned long addr, unsigned bytes) { if (!lguest_address_ok(cpu->lg, addr, bytes) @@ -181,9 +199,11 @@ void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b, } /*:*/ -/*H:030 Let's jump straight to the the main loop which runs the Guest. +/*H:030 + * Let's jump straight to the the main loop which runs the Guest. * Remember, this is called by the Launcher reading /dev/lguest, and we keep - * going around and around until something interesting happens. */ + * going around and around until something interesting happens. + */ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) { /* We stop running once the Guest is dead. */ @@ -195,8 +215,10 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) if (cpu->hcall) do_hypercalls(cpu); - /* It's possible the Guest did a NOTIFY hypercall to the - * Launcher, in which case we return from the read() now. */ + /* + * It's possible the Guest did a NOTIFY hypercall to the + * Launcher, in which case we return from the read() now. + */ if (cpu->pending_notify) { if (!send_notify_to_eventfd(cpu)) { if (put_user(cpu->pending_notify, user)) @@ -209,29 +231,39 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) if (signal_pending(current)) return -ERESTARTSYS; - /* Check if there are any interrupts which can be delivered now: + /* + * Check if there are any interrupts which can be delivered now: * if so, this sets up the hander to be executed when we next - * run the Guest. */ + * run the Guest. + */ irq = interrupt_pending(cpu, &more); if (irq < LGUEST_IRQS) try_deliver_interrupt(cpu, irq, more); - /* All long-lived kernel loops need to check with this horrible + /* + * All long-lived kernel loops need to check with this horrible * thing called the freezer. If the Host is trying to suspend, - * it stops us. */ + * it stops us. + */ try_to_freeze(); - /* Just make absolutely sure the Guest is still alive. One of - * those hypercalls could have been fatal, for example. */ + /* + * Just make absolutely sure the Guest is still alive. One of + * those hypercalls could have been fatal, for example. + */ if (cpu->lg->dead) break; - /* If the Guest asked to be stopped, we sleep. The Guest's - * clock timer will wake us. */ + /* + * If the Guest asked to be stopped, we sleep. The Guest's + * clock timer will wake us. + */ if (cpu->halted) { set_current_state(TASK_INTERRUPTIBLE); - /* Just before we sleep, make sure no interrupt snuck in - * which we should be doing. */ + /* + * Just before we sleep, make sure no interrupt snuck in + * which we should be doing. + */ if (interrupt_pending(cpu, &more) < LGUEST_IRQS) set_current_state(TASK_RUNNING); else @@ -239,8 +271,10 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) continue; } - /* OK, now we're ready to jump into the Guest. First we put up - * the "Do Not Disturb" sign: */ + /* + * OK, now we're ready to jump into the Guest. First we put up + * the "Do Not Disturb" sign: + */ local_irq_disable(); /* Actually run the Guest until something happens. */ @@ -327,8 +361,10 @@ static void __exit fini(void) } /*:*/ -/* The Host side of lguest can be a module. This is a nice way for people to - * play with it. */ +/* + * The Host side of lguest can be a module. This is a nice way for people to + * play with it. + */ module_init(init); module_exit(fini); MODULE_LICENSE("GPL"); diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index c29ffa19cb74..787ab4bc09f0 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c @@ -1,8 +1,10 @@ -/*P:500 Just as userspace programs request kernel operations through a system +/*P:500 + * Just as userspace programs request kernel operations through a system * call, the Guest requests Host operations through a "hypercall". You might * notice this nomenclature doesn't really follow any logic, but the name has * been around for long enough that we're stuck with it. As you'd expect, this - * code is basically a one big switch statement. :*/ + * code is basically a one big switch statement. +:*/ /* Copyright (C) 2006 Rusty Russell IBM Corporation @@ -28,30 +30,41 @@ #include #include "lg.h" -/*H:120 This is the core hypercall routine: where the Guest gets what it wants. - * Or gets killed. Or, in the case of LHCALL_SHUTDOWN, both. */ +/*H:120 + * This is the core hypercall routine: where the Guest gets what it wants. + * Or gets killed. Or, in the case of LHCALL_SHUTDOWN, both. + */ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) { switch (args->arg0) { case LHCALL_FLUSH_ASYNC: - /* This call does nothing, except by breaking out of the Guest - * it makes us process all the asynchronous hypercalls. */ + /* + * This call does nothing, except by breaking out of the Guest + * it makes us process all the asynchronous hypercalls. + */ break; case LHCALL_SEND_INTERRUPTS: - /* This call does nothing too, but by breaking out of the Guest - * it makes us process any pending interrupts. */ + /* + * This call does nothing too, but by breaking out of the Guest + * it makes us process any pending interrupts. + */ break; case LHCALL_LGUEST_INIT: - /* You can't get here unless you're already initialized. Don't - * do that. */ + /* + * You can't get here unless you're already initialized. Don't + * do that. + */ kill_guest(cpu, "already have lguest_data"); break; case LHCALL_SHUTDOWN: { - /* Shutdown is such a trivial hypercall that we do it in four - * lines right here. */ char msg[128]; - /* If the lgread fails, it will call kill_guest() itself; the - * kill_guest() with the message will be ignored. */ + /* + * Shutdown is such a trivial hypercall that we do it in four + * lines right here. + * + * If the lgread fails, it will call kill_guest() itself; the + * kill_guest() with the message will be ignored. + */ __lgread(cpu, msg, args->arg1, sizeof(msg)); msg[sizeof(msg)-1] = '\0'; kill_guest(cpu, "CRASH: %s", msg); @@ -60,16 +73,17 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) break; } case LHCALL_FLUSH_TLB: - /* FLUSH_TLB comes in two flavors, depending on the - * argument: */ + /* FLUSH_TLB comes in two flavors, depending on the argument: */ if (args->arg1) guest_pagetable_clear_all(cpu); else guest_pagetable_flush_user(cpu); break; - /* All these calls simply pass the arguments through to the right - * routines. */ + /* + * All these calls simply pass the arguments through to the right + * routines. + */ case LHCALL_NEW_PGTABLE: guest_new_pagetable(cpu, args->arg1); break; @@ -112,15 +126,16 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) kill_guest(cpu, "Bad hypercall %li\n", args->arg0); } } -/*:*/ -/*H:124 Asynchronous hypercalls are easy: we just look in the array in the +/*H:124 + * Asynchronous hypercalls are easy: we just look in the array in the * Guest's "struct lguest_data" to see if any new ones are marked "ready". * * We are careful to do these in order: obviously we respect the order the * Guest put them in the ring, but we also promise the Guest that they will * happen before any normal hypercall (which is why we check this before - * checking for a normal hcall). */ + * checking for a normal hcall). + */ static void do_async_hcalls(struct lg_cpu *cpu) { unsigned int i; @@ -133,22 +148,28 @@ static void do_async_hcalls(struct lg_cpu *cpu) /* We process "struct lguest_data"s hcalls[] ring once. */ for (i = 0; i < ARRAY_SIZE(st); i++) { struct hcall_args args; - /* We remember where we were up to from last time. This makes + /* + * We remember where we were up to from last time. This makes * sure that the hypercalls are done in the order the Guest - * places them in the ring. */ + * places them in the ring. + */ unsigned int n = cpu->next_hcall; /* 0xFF means there's no call here (yet). */ if (st[n] == 0xFF) break; - /* OK, we have hypercall. Increment the "next_hcall" cursor, - * and wrap back to 0 if we reach the end. */ + /* + * OK, we have hypercall. Increment the "next_hcall" cursor, + * and wrap back to 0 if we reach the end. + */ if (++cpu->next_hcall == LHCALL_RING_SIZE) cpu->next_hcall = 0; - /* Copy the hypercall arguments into a local copy of - * the hcall_args struct. */ + /* + * Copy the hypercall arguments into a local copy of the + * hcall_args struct. + */ if (copy_from_user(&args, &cpu->lg->lguest_data->hcalls[n], sizeof(struct hcall_args))) { kill_guest(cpu, "Fetching async hypercalls"); @@ -164,19 +185,25 @@ static void do_async_hcalls(struct lg_cpu *cpu) break; } - /* Stop doing hypercalls if they want to notify the Launcher: - * it needs to service this first. */ + /* + * Stop doing hypercalls if they want to notify the Launcher: + * it needs to service this first. + */ if (cpu->pending_notify) break; } } -/* Last of all, we look at what happens first of all. The very first time the - * Guest makes a hypercall, we end up here to set things up: */ +/* + * Last of all, we look at what happens first of all. The very first time the + * Guest makes a hypercall, we end up here to set things up: + */ static void initialize(struct lg_cpu *cpu) { - /* You can't do anything until you're initialized. The Guest knows the - * rules, so we're unforgiving here. */ + /* + * You can't do anything until you're initialized. The Guest knows the + * rules, so we're unforgiving here. + */ if (cpu->hcall->arg0 != LHCALL_LGUEST_INIT) { kill_guest(cpu, "hypercall %li before INIT", cpu->hcall->arg0); return; @@ -185,32 +212,40 @@ static void initialize(struct lg_cpu *cpu) if (lguest_arch_init_hypercalls(cpu)) kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); - /* The Guest tells us where we're not to deliver interrupts by putting - * the range of addresses into "struct lguest_data". */ + /* + * The Guest tells us where we're not to deliver interrupts by putting + * the range of addresses into "struct lguest_data". + */ if (get_user(cpu->lg->noirq_start, &cpu->lg->lguest_data->noirq_start) || get_user(cpu->lg->noirq_end, &cpu->lg->lguest_data->noirq_end)) kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); - /* We write the current time into the Guest's data page once so it can - * set its clock. */ + /* + * We write the current time into the Guest's data page once so it can + * set its clock. + */ write_timestamp(cpu); /* page_tables.c will also do some setup. */ page_table_guest_data_init(cpu); - /* This is the one case where the above accesses might have been the + /* + * This is the one case where the above accesses might have been the * first write to a Guest page. This may have caused a copy-on-write * fault, but the old page might be (read-only) in the Guest - * pagetable. */ + * pagetable. + */ guest_pagetable_clear_all(cpu); } /*:*/ -/*M:013 If a Guest reads from a page (so creates a mapping) that it has never +/*M:013 + * If a Guest reads from a page (so creates a mapping) that it has never * written to, and then the Launcher writes to it (ie. the output of a virtual * device), the Guest will still see the old page. In practice, this never * happens: why would the Guest read a page which it has never written to? But - * a similar scenario might one day bite us, so it's worth mentioning. :*/ + * a similar scenario might one day bite us, so it's worth mentioning. +:*/ /*H:100 * Hypercalls @@ -229,17 +264,22 @@ void do_hypercalls(struct lg_cpu *cpu) return; } - /* The Guest has initialized. + /* + * The Guest has initialized. * - * Look in the hypercall ring for the async hypercalls: */ + * Look in the hypercall ring for the async hypercalls: + */ do_async_hcalls(cpu); - /* If we stopped reading the hypercall ring because the Guest did a + /* + * If we stopped reading the hypercall ring because the Guest did a * NOTIFY to the Launcher, we want to return now. Otherwise we do - * the hypercall. */ + * the hypercall. + */ if (!cpu->pending_notify) { do_hcall(cpu, cpu->hcall); - /* Tricky point: we reset the hcall pointer to mark the + /* + * Tricky point: we reset the hcall pointer to mark the * hypercall as "done". We use the hcall pointer rather than * the trap number to indicate a hypercall is pending. * Normally it doesn't matter: the Guest will run again and @@ -248,13 +288,16 @@ void do_hypercalls(struct lg_cpu *cpu) * However, if we are signalled or the Guest sends I/O to the * Launcher, the run_guest() loop will exit without running the * Guest. When it comes back it would try to re-run the - * hypercall. Finding that bug sucked. */ + * hypercall. Finding that bug sucked. + */ cpu->hcall = NULL; } } -/* This routine supplies the Guest with time: it's used for wallclock time at - * initial boot and as a rough time source if the TSC isn't available. */ +/* + * This routine supplies the Guest with time: it's used for wallclock time at + * initial boot and as a rough time source if the TSC isn't available. + */ void write_timestamp(struct lg_cpu *cpu) { struct timespec now; diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c index 0e9067b0d507..18648180db02 100644 --- a/drivers/lguest/interrupts_and_traps.c +++ b/drivers/lguest/interrupts_and_traps.c @@ -1,4 +1,5 @@ -/*P:800 Interrupts (traps) are complicated enough to earn their own file. +/*P:800 + * Interrupts (traps) are complicated enough to earn their own file. * There are three classes of interrupts: * * 1) Real hardware interrupts which occur while we're running the Guest, @@ -10,7 +11,8 @@ * just like real hardware would deliver them. Traps from the Guest can be set * up to go directly back into the Guest, but sometimes the Host wants to see * them first, so we also have a way of "reflecting" them into the Guest as if - * they had been delivered to it directly. :*/ + * they had been delivered to it directly. +:*/ #include #include #include @@ -26,8 +28,10 @@ static unsigned long idt_address(u32 lo, u32 hi) return (lo & 0x0000FFFF) | (hi & 0xFFFF0000); } -/* The "type" of the interrupt handler is a 4 bit field: we only support a - * couple of types. */ +/* + * The "type" of the interrupt handler is a 4 bit field: we only support a + * couple of types. + */ static int idt_type(u32 lo, u32 hi) { return (hi >> 8) & 0xF; @@ -39,8 +43,10 @@ static bool idt_present(u32 lo, u32 hi) return (hi & 0x8000); } -/* We need a helper to "push" a value onto the Guest's stack, since that's a - * big part of what delivering an interrupt does. */ +/* + * We need a helper to "push" a value onto the Guest's stack, since that's a + * big part of what delivering an interrupt does. + */ static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val) { /* Stack grows upwards: move stack then write value. */ @@ -48,7 +54,8 @@ static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val) lgwrite(cpu, *gstack, u32, val); } -/*H:210 The set_guest_interrupt() routine actually delivers the interrupt or +/*H:210 + * The set_guest_interrupt() routine actually delivers the interrupt or * trap. The mechanics of delivering traps and interrupts to the Guest are the * same, except some traps have an "error code" which gets pushed onto the * stack as well: the caller tells us if this is one. @@ -59,7 +66,8 @@ static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val) * * We set up the stack just like the CPU does for a real interrupt, so it's * identical for the Guest (and the standard "iret" instruction will undo - * it). */ + * it). + */ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, bool has_err) { @@ -67,20 +75,26 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, u32 eflags, ss, irq_enable; unsigned long virtstack; - /* There are two cases for interrupts: one where the Guest is already + /* + * There are two cases for interrupts: one where the Guest is already * in the kernel, and a more complex one where the Guest is in - * userspace. We check the privilege level to find out. */ + * userspace. We check the privilege level to find out. + */ if ((cpu->regs->ss&0x3) != GUEST_PL) { - /* The Guest told us their kernel stack with the SET_STACK - * hypercall: both the virtual address and the segment */ + /* + * The Guest told us their kernel stack with the SET_STACK + * hypercall: both the virtual address and the segment. + */ virtstack = cpu->esp1; ss = cpu->ss1; origstack = gstack = guest_pa(cpu, virtstack); - /* We push the old stack segment and pointer onto the new + /* + * We push the old stack segment and pointer onto the new * stack: when the Guest does an "iret" back from the interrupt * handler the CPU will notice they're dropping privilege - * levels and expect these here. */ + * levels and expect these here. + */ push_guest_stack(cpu, &gstack, cpu->regs->ss); push_guest_stack(cpu, &gstack, cpu->regs->esp); } else { @@ -91,18 +105,22 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, origstack = gstack = guest_pa(cpu, virtstack); } - /* Remember that we never let the Guest actually disable interrupts, so + /* + * Remember that we never let the Guest actually disable interrupts, so * the "Interrupt Flag" bit is always set. We copy that bit from the * Guest's "irq_enabled" field into the eflags word: we saw the Guest - * copy it back in "lguest_iret". */ + * copy it back in "lguest_iret". + */ eflags = cpu->regs->eflags; if (get_user(irq_enable, &cpu->lg->lguest_data->irq_enabled) == 0 && !(irq_enable & X86_EFLAGS_IF)) eflags &= ~X86_EFLAGS_IF; - /* An interrupt is expected to push three things on the stack: the old + /* + * An interrupt is expected to push three things on the stack: the old * "eflags" word, the old code segment, and the old instruction - * pointer. */ + * pointer. + */ push_guest_stack(cpu, &gstack, eflags); push_guest_stack(cpu, &gstack, cpu->regs->cs); push_guest_stack(cpu, &gstack, cpu->regs->eip); @@ -111,15 +129,19 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, if (has_err) push_guest_stack(cpu, &gstack, cpu->regs->errcode); - /* Now we've pushed all the old state, we change the stack, the code - * segment and the address to execute. */ + /* + * Now we've pushed all the old state, we change the stack, the code + * segment and the address to execute. + */ cpu->regs->ss = ss; cpu->regs->esp = virtstack + (gstack - origstack); cpu->regs->cs = (__KERNEL_CS|GUEST_PL); cpu->regs->eip = idt_address(lo, hi); - /* There are two kinds of interrupt handlers: 0xE is an "interrupt - * gate" which expects interrupts to be disabled on entry. */ + /* + * There are two kinds of interrupt handlers: 0xE is an "interrupt + * gate" which expects interrupts to be disabled on entry. + */ if (idt_type(lo, hi) == 0xE) if (put_user(0, &cpu->lg->lguest_data->irq_enabled)) kill_guest(cpu, "Disabling interrupts"); @@ -130,7 +152,8 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, * * interrupt_pending() returns the first pending interrupt which isn't blocked * by the Guest. It is called before every entry to the Guest, and just before - * we go to sleep when the Guest has halted itself. */ + * we go to sleep when the Guest has halted itself. + */ unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more) { unsigned int irq; @@ -140,8 +163,10 @@ unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more) if (!cpu->lg->lguest_data) return LGUEST_IRQS; - /* Take our "irqs_pending" array and remove any interrupts the Guest - * wants blocked: the result ends up in "blk". */ + /* + * Take our "irqs_pending" array and remove any interrupts the Guest + * wants blocked: the result ends up in "blk". + */ if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts, sizeof(blk))) return LGUEST_IRQS; @@ -154,16 +179,20 @@ unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more) return irq; } -/* This actually diverts the Guest to running an interrupt handler, once an - * interrupt has been identified by interrupt_pending(). */ +/* + * This actually diverts the Guest to running an interrupt handler, once an + * interrupt has been identified by interrupt_pending(). + */ void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) { struct desc_struct *idt; BUG_ON(irq >= LGUEST_IRQS); - /* They may be in the middle of an iret, where they asked us never to - * deliver interrupts. */ + /* + * They may be in the middle of an iret, where they asked us never to + * deliver interrupts. + */ if (cpu->regs->eip >= cpu->lg->noirq_start && (cpu->regs->eip < cpu->lg->noirq_end)) return; @@ -187,29 +216,37 @@ void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) } } - /* Look at the IDT entry the Guest gave us for this interrupt. The + /* + * Look at the IDT entry the Guest gave us for this interrupt. The * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip - * over them. */ + * over them. + */ idt = &cpu->arch.idt[FIRST_EXTERNAL_VECTOR+irq]; /* If they don't have a handler (yet?), we just ignore it */ if (idt_present(idt->a, idt->b)) { /* OK, mark it no longer pending and deliver it. */ clear_bit(irq, cpu->irqs_pending); - /* set_guest_interrupt() takes the interrupt descriptor and a + /* + * set_guest_interrupt() takes the interrupt descriptor and a * flag to say whether this interrupt pushes an error code onto - * the stack as well: virtual interrupts never do. */ + * the stack as well: virtual interrupts never do. + */ set_guest_interrupt(cpu, idt->a, idt->b, false); } - /* Every time we deliver an interrupt, we update the timestamp in the + /* + * Every time we deliver an interrupt, we update the timestamp in the * Guest's lguest_data struct. It would be better for the Guest if we * did this more often, but it can actually be quite slow: doing it * here is a compromise which means at least it gets updated every - * timer interrupt. */ + * timer interrupt. + */ write_timestamp(cpu); - /* If there are no other interrupts we want to deliver, clear - * the pending flag. */ + /* + * If there are no other interrupts we want to deliver, clear + * the pending flag. + */ if (!more) put_user(0, &cpu->lg->lguest_data->irq_pending); } @@ -217,24 +254,29 @@ void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) /* And this is the routine when we want to set an interrupt for the Guest. */ void set_interrupt(struct lg_cpu *cpu, unsigned int irq) { - /* Next time the Guest runs, the core code will see if it can deliver - * this interrupt. */ + /* + * Next time the Guest runs, the core code will see if it can deliver + * this interrupt. + */ set_bit(irq, cpu->irqs_pending); - /* Make sure it sees it; it might be asleep (eg. halted), or - * running the Guest right now, in which case kick_process() - * will knock it out. */ + /* + * Make sure it sees it; it might be asleep (eg. halted), or running + * the Guest right now, in which case kick_process() will knock it out. + */ if (!wake_up_process(cpu->tsk)) kick_process(cpu->tsk); } /*:*/ -/* Linux uses trap 128 for system calls. Plan9 uses 64, and Ron Minnich sent +/* + * Linux uses trap 128 for system calls. Plan9 uses 64, and Ron Minnich sent * me a patch, so we support that too. It'd be a big step for lguest if half * the Plan 9 user base were to start using it. * * Actually now I think of it, it's possible that Ron *is* half the Plan 9 - * userbase. Oh well. */ + * userbase. Oh well. + */ static bool could_be_syscall(unsigned int num) { /* Normal Linux SYSCALL_VECTOR or reserved vector? */ @@ -274,9 +316,11 @@ void free_interrupts(void) clear_bit(syscall_vector, used_vectors); } -/*H:220 Now we've got the routines to deliver interrupts, delivering traps like +/*H:220 + * Now we've got the routines to deliver interrupts, delivering traps like * page fault is easy. The only trick is that Intel decided that some traps - * should have error codes: */ + * should have error codes: + */ static bool has_err(unsigned int trap) { return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17); @@ -285,13 +329,17 @@ static bool has_err(unsigned int trap) /* deliver_trap() returns true if it could deliver the trap. */ bool deliver_trap(struct lg_cpu *cpu, unsigned int num) { - /* Trap numbers are always 8 bit, but we set an impossible trap number - * for traps inside the Switcher, so check that here. */ + /* + * Trap numbers are always 8 bit, but we set an impossible trap number + * for traps inside the Switcher, so check that here. + */ if (num >= ARRAY_SIZE(cpu->arch.idt)) return false; - /* Early on the Guest hasn't set the IDT entries (or maybe it put a - * bogus one in): if we fail here, the Guest will be killed. */ + /* + * Early on the Guest hasn't set the IDT entries (or maybe it put a + * bogus one in): if we fail here, the Guest will be killed. + */ if (!idt_present(cpu->arch.idt[num].a, cpu->arch.idt[num].b)) return false; set_guest_interrupt(cpu, cpu->arch.idt[num].a, @@ -299,7 +347,8 @@ bool deliver_trap(struct lg_cpu *cpu, unsigned int num) return true; } -/*H:250 Here's the hard part: returning to the Host every time a trap happens +/*H:250 + * Here's the hard part: returning to the Host every time a trap happens * and then calling deliver_trap() and re-entering the Guest is slow. * Particularly because Guest userspace system calls are traps (usually trap * 128). @@ -311,69 +360,87 @@ bool deliver_trap(struct lg_cpu *cpu, unsigned int num) * the other hypervisors would beat it up at lunchtime. * * This routine indicates if a particular trap number could be delivered - * directly. */ + * directly. + */ static bool direct_trap(unsigned int num) { - /* Hardware interrupts don't go to the Guest at all (except system - * call). */ + /* + * Hardware interrupts don't go to the Guest at all (except system + * call). + */ if (num >= FIRST_EXTERNAL_VECTOR && !could_be_syscall(num)) return false; - /* The Host needs to see page faults (for shadow paging and to save the + /* + * The Host needs to see page faults (for shadow paging and to save the * fault address), general protection faults (in/out emulation) and * device not available (TS handling), invalid opcode fault (kvm hcall), - * and of course, the hypercall trap. */ + * and of course, the hypercall trap. + */ return num != 14 && num != 13 && num != 7 && num != 6 && num != LGUEST_TRAP_ENTRY; } /*:*/ -/*M:005 The Guest has the ability to turn its interrupt gates into trap gates, +/*M:005 + * The Guest has the ability to turn its interrupt gates into trap gates, * if it is careful. The Host will let trap gates can go directly to the * Guest, but the Guest needs the interrupts atomically disabled for an * interrupt gate. It can do this by pointing the trap gate at instructions - * within noirq_start and noirq_end, where it can safely disable interrupts. */ + * within noirq_start and noirq_end, where it can safely disable interrupts. + */ -/*M:006 The Guests do not use the sysenter (fast system call) instruction, +/*M:006 + * The Guests do not use the sysenter (fast system call) instruction, * because it's hardcoded to enter privilege level 0 and so can't go direct. * It's about twice as fast as the older "int 0x80" system call, so it might * still be worthwhile to handle it in the Switcher and lcall down to the * Guest. The sysenter semantics are hairy tho: search for that keyword in - * entry.S :*/ + * entry.S +:*/ -/*H:260 When we make traps go directly into the Guest, we need to make sure +/*H:260 + * When we make traps go directly into the Guest, we need to make sure * the kernel stack is valid (ie. mapped in the page tables). Otherwise, the * CPU trying to deliver the trap will fault while trying to push the interrupt * words on the stack: this is called a double fault, and it forces us to kill * the Guest. * - * Which is deeply unfair, because (literally!) it wasn't the Guests' fault. */ + * Which is deeply unfair, because (literally!) it wasn't the Guests' fault. + */ void pin_stack_pages(struct lg_cpu *cpu) { unsigned int i; - /* Depending on the CONFIG_4KSTACKS option, the Guest can have one or - * two pages of stack space. */ + /* + * Depending on the CONFIG_4KSTACKS option, the Guest can have one or + * two pages of stack space. + */ for (i = 0; i < cpu->lg->stack_pages; i++) - /* The stack grows *upwards*, so the address we're given is the + /* + * The stack grows *upwards*, so the address we're given is the * start of the page after the kernel stack. Subtract one to * get back onto the first stack page, and keep subtracting to - * get to the rest of the stack pages. */ + * get to the rest of the stack pages. + */ pin_page(cpu, cpu->esp1 - 1 - i * PAGE_SIZE); } -/* Direct traps also mean that we need to know whenever the Guest wants to use +/* + * Direct traps also mean that we need to know whenever the Guest wants to use * a different kernel stack, so we can change the IDT entries to use that * stack. The IDT entries expect a virtual address, so unlike most addresses * the Guest gives us, the "esp" (stack pointer) value here is virtual, not * physical. * * In Linux each process has its own kernel stack, so this happens a lot: we - * change stacks on each context switch. */ + * change stacks on each context switch. + */ void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages) { - /* You are not allowed have a stack segment with privilege level 0: bad - * Guest! */ + /* + * You're not allowed a stack segment with privilege level 0: bad Guest! + */ if ((seg & 0x3) != GUEST_PL) kill_guest(cpu, "bad stack segment %i", seg); /* We only expect one or two stack pages. */ @@ -387,11 +454,15 @@ void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages) pin_stack_pages(cpu); } -/* All this reference to mapping stacks leads us neatly into the other complex - * part of the Host: page table handling. */ +/* + * All this reference to mapping stacks leads us neatly into the other complex + * part of the Host: page table handling. + */ -/*H:235 This is the routine which actually checks the Guest's IDT entry and - * transfers it into the entry in "struct lguest": */ +/*H:235 + * This is the routine which actually checks the Guest's IDT entry and + * transfers it into the entry in "struct lguest": + */ static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap, unsigned int num, u32 lo, u32 hi) { @@ -407,30 +478,38 @@ static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap, if (type != 0xE && type != 0xF) kill_guest(cpu, "bad IDT type %i", type); - /* We only copy the handler address, present bit, privilege level and + /* + * We only copy the handler address, present bit, privilege level and * type. The privilege level controls where the trap can be triggered * manually with an "int" instruction. This is usually GUEST_PL, - * except for system calls which userspace can use. */ + * except for system calls which userspace can use. + */ trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF); trap->b = (hi&0xFFFFEF00); } -/*H:230 While we're here, dealing with delivering traps and interrupts to the +/*H:230 + * While we're here, dealing with delivering traps and interrupts to the * Guest, we might as well complete the picture: how the Guest tells us where * it wants them to go. This would be simple, except making traps fast * requires some tricks. * * We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the - * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. */ + * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. + */ void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi) { - /* Guest never handles: NMI, doublefault, spurious interrupt or - * hypercall. We ignore when it tries to set them. */ + /* + * Guest never handles: NMI, doublefault, spurious interrupt or + * hypercall. We ignore when it tries to set them. + */ if (num == 2 || num == 8 || num == 15 || num == LGUEST_TRAP_ENTRY) return; - /* Mark the IDT as changed: next time the Guest runs we'll know we have - * to copy this again. */ + /* + * Mark the IDT as changed: next time the Guest runs we'll know we have + * to copy this again. + */ cpu->changed |= CHANGED_IDT; /* Check that the Guest doesn't try to step outside the bounds. */ @@ -440,9 +519,11 @@ void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi) set_trap(cpu, &cpu->arch.idt[num], num, lo, hi); } -/* The default entry for each interrupt points into the Switcher routines which +/* + * The default entry for each interrupt points into the Switcher routines which * simply return to the Host. The run_guest() loop will then call - * deliver_trap() to bounce it back into the Guest. */ + * deliver_trap() to bounce it back into the Guest. + */ static void default_idt_entry(struct desc_struct *idt, int trap, const unsigned long handler, @@ -451,13 +532,17 @@ static void default_idt_entry(struct desc_struct *idt, /* A present interrupt gate. */ u32 flags = 0x8e00; - /* Set the privilege level on the entry for the hypercall: this allows - * the Guest to use the "int" instruction to trigger it. */ + /* + * Set the privilege level on the entry for the hypercall: this allows + * the Guest to use the "int" instruction to trigger it. + */ if (trap == LGUEST_TRAP_ENTRY) flags |= (GUEST_PL << 13); else if (base) - /* Copy priv. level from what Guest asked for. This allows - * debug (int 3) traps from Guest userspace, for example. */ + /* + * Copy privilege level from what Guest asked for. This allows + * debug (int 3) traps from Guest userspace, for example. + */ flags |= (base->b & 0x6000); /* Now pack it into the IDT entry in its weird format. */ @@ -475,16 +560,20 @@ void setup_default_idt_entries(struct lguest_ro_state *state, default_idt_entry(&state->guest_idt[i], i, def[i], NULL); } -/*H:240 We don't use the IDT entries in the "struct lguest" directly, instead +/*H:240 + * We don't use the IDT entries in the "struct lguest" directly, instead * we copy them into the IDT which we've set up for Guests on this CPU, just - * before we run the Guest. This routine does that copy. */ + * before we run the Guest. This routine does that copy. + */ void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, const unsigned long *def) { unsigned int i; - /* We can simply copy the direct traps, otherwise we use the default - * ones in the Switcher: they will return to the Host. */ + /* + * We can simply copy the direct traps, otherwise we use the default + * ones in the Switcher: they will return to the Host. + */ for (i = 0; i < ARRAY_SIZE(cpu->arch.idt); i++) { const struct desc_struct *gidt = &cpu->arch.idt[i]; @@ -492,14 +581,16 @@ void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, if (!direct_trap(i)) continue; - /* Only trap gates (type 15) can go direct to the Guest. + /* + * Only trap gates (type 15) can go direct to the Guest. * Interrupt gates (type 14) disable interrupts as they are * entered, which we never let the Guest do. Not present * entries (type 0x0) also can't go direct, of course. * * If it can't go direct, we still need to copy the priv. level: * they might want to give userspace access to a software - * interrupt. */ + * interrupt. + */ if (idt_type(gidt->a, gidt->b) == 0xF) idt[i] = *gidt; else @@ -518,7 +609,8 @@ void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, * the next timer interrupt (in nanoseconds). We use the high-resolution timer * infrastructure to set a callback at that time. * - * 0 means "turn off the clock". */ + * 0 means "turn off the clock". + */ void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta) { ktime_t expires; @@ -529,9 +621,11 @@ void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta) return; } - /* We use wallclock time here, so the Guest might not be running for + /* + * We use wallclock time here, so the Guest might not be running for * all the time between now and the timer interrupt it asked for. This - * is almost always the right thing to do. */ + * is almost always the right thing to do. + */ expires = ktime_add_ns(ktime_get_real(), delta); hrtimer_start(&cpu->hrt, expires, HRTIMER_MODE_ABS); } diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 01c591923793..74c0db691b53 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -54,13 +54,13 @@ struct lg_cpu { unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */ - /* At end of a page shared mapped over lguest_pages in guest. */ + /* At end of a page shared mapped over lguest_pages in guest. */ unsigned long regs_page; struct lguest_regs *regs; struct lguest_pages *last_pages; - int cpu_pgd; /* which pgd this cpu is currently using */ + int cpu_pgd; /* Which pgd this cpu is currently using */ /* If a hypercall was asked for, this points to the arguments. */ struct hcall_args *hcall; @@ -96,8 +96,11 @@ struct lguest unsigned int nr_cpus; u32 pfn_limit; - /* This provides the offset to the base of guest-physical - * memory in the Launcher. */ + + /* + * This provides the offset to the base of guest-physical memory in the + * Launcher. + */ void __user *mem_base; unsigned long kernel_address; @@ -122,11 +125,13 @@ bool lguest_address_ok(const struct lguest *lg, void __lgread(struct lg_cpu *, void *, unsigned long, unsigned); void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned); -/*H:035 Using memory-copy operations like that is usually inconvient, so we +/*H:035 + * Using memory-copy operations like that is usually inconvient, so we * have the following helper macros which read and write a specific type (often * an unsigned long). * - * This reads into a variable of the given type then returns that. */ + * This reads into a variable of the given type then returns that. + */ #define lgread(cpu, addr, type) \ ({ type _v; __lgread((cpu), &_v, (addr), sizeof(_v)); _v; }) @@ -140,9 +145,11 @@ void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned); int run_guest(struct lg_cpu *cpu, unsigned long __user *user); -/* Helper macros to obtain the first 12 or the last 20 bits, this is only the +/* + * Helper macros to obtain the first 12 or the last 20 bits, this is only the * first step in the migration to the kernel types. pte_pfn is already defined - * in the kernel. */ + * in the kernel. + */ #define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK) #define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) #define pmd_flags(x) (pmd_val(x) & ~PAGE_MASK) diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c index e082cdac88b4..cc000e79c3d1 100644 --- a/drivers/lguest/lguest_device.c +++ b/drivers/lguest/lguest_device.c @@ -1,10 +1,12 @@ -/*P:050 Lguest guests use a very simple method to describe devices. It's a +/*P:050 + * Lguest guests use a very simple method to describe devices. It's a * series of device descriptors contained just above the top of normal Guest * memory. * * We use the standard "virtio" device infrastructure, which provides us with a * console, a network and a block driver. Each one expects some configuration - * information and a "virtqueue" or two to send and receive data. :*/ + * information and a "virtqueue" or two to send and receive data. +:*/ #include #include #include @@ -20,8 +22,10 @@ /* The pointer to our (page) of device descriptions. */ static void *lguest_devices; -/* For Guests, device memory can be used as normal memory, so we cast away the - * __iomem to quieten sparse. */ +/* + * For Guests, device memory can be used as normal memory, so we cast away the + * __iomem to quieten sparse. + */ static inline void *lguest_map(unsigned long phys_addr, unsigned long pages) { return (__force void *)ioremap_cache(phys_addr, PAGE_SIZE*pages); @@ -32,8 +36,10 @@ static inline void lguest_unmap(void *addr) iounmap((__force void __iomem *)addr); } -/*D:100 Each lguest device is just a virtio device plus a pointer to its entry - * in the lguest_devices page. */ +/*D:100 + * Each lguest device is just a virtio device plus a pointer to its entry + * in the lguest_devices page. + */ struct lguest_device { struct virtio_device vdev; @@ -41,9 +47,11 @@ struct lguest_device { struct lguest_device_desc *desc; }; -/* Since the virtio infrastructure hands us a pointer to the virtio_device all +/* + * Since the virtio infrastructure hands us a pointer to the virtio_device all * the time, it helps to have a curt macro to get a pointer to the struct - * lguest_device it's enclosed in. */ + * lguest_device it's enclosed in. + */ #define to_lgdev(vd) container_of(vd, struct lguest_device, vdev) /*D:130 @@ -55,7 +63,8 @@ struct lguest_device { * the driver will look at them during setup. * * A convenient routine to return the device's virtqueue config array: - * immediately after the descriptor. */ + * immediately after the descriptor. + */ static struct lguest_vqconfig *lg_vq(const struct lguest_device_desc *desc) { return (void *)(desc + 1); @@ -98,10 +107,12 @@ static u32 lg_get_features(struct virtio_device *vdev) return features; } -/* The virtio core takes the features the Host offers, and copies the - * ones supported by the driver into the vdev->features array. Once - * that's all sorted out, this routine is called so we can tell the - * Host which features we understand and accept. */ +/* + * The virtio core takes the features the Host offers, and copies the ones + * supported by the driver into the vdev->features array. Once that's all + * sorted out, this routine is called so we can tell the Host which features we + * understand and accept. + */ static void lg_finalize_features(struct virtio_device *vdev) { unsigned int i, bits; @@ -112,10 +123,11 @@ static void lg_finalize_features(struct virtio_device *vdev) /* Give virtio_ring a chance to accept features. */ vring_transport_features(vdev); - /* The vdev->feature array is a Linux bitmask: this isn't the - * same as a the simple array of bits used by lguest devices - * for features. So we do this slow, manual conversion which is - * completely general. */ + /* + * The vdev->feature array is a Linux bitmask: this isn't the same as a + * the simple array of bits used by lguest devices for features. So we + * do this slow, manual conversion which is completely general. + */ memset(out_features, 0, desc->feature_len); bits = min_t(unsigned, desc->feature_len, sizeof(vdev->features)) * 8; for (i = 0; i < bits; i++) { @@ -146,15 +158,19 @@ static void lg_set(struct virtio_device *vdev, unsigned int offset, memcpy(lg_config(desc) + offset, buf, len); } -/* The operations to get and set the status word just access the status field - * of the device descriptor. */ +/* + * The operations to get and set the status word just access the status field + * of the device descriptor. + */ static u8 lg_get_status(struct virtio_device *vdev) { return to_lgdev(vdev)->desc->status; } -/* To notify on status updates, we (ab)use the NOTIFY hypercall, with the - * descriptor address of the device. A zero status means "reset". */ +/* + * To notify on status updates, we (ab)use the NOTIFY hypercall, with the + * descriptor address of the device. A zero status means "reset". + */ static void set_status(struct virtio_device *vdev, u8 status) { unsigned long offset = (void *)to_lgdev(vdev)->desc - lguest_devices; @@ -200,13 +216,17 @@ struct lguest_vq_info void *pages; }; -/* When the virtio_ring code wants to prod the Host, it calls us here and we +/* + * When the virtio_ring code wants to prod the Host, it calls us here and we * make a hypercall. We hand the physical address of the virtqueue so the Host - * knows which virtqueue we're talking about. */ + * knows which virtqueue we're talking about. + */ static void lg_notify(struct virtqueue *vq) { - /* We store our virtqueue information in the "priv" pointer of the - * virtqueue structure. */ + /* + * We store our virtqueue information in the "priv" pointer of the + * virtqueue structure. + */ struct lguest_vq_info *lvq = vq->priv; kvm_hypercall1(LHCALL_NOTIFY, lvq->config.pfn << PAGE_SHIFT); @@ -215,7 +235,8 @@ static void lg_notify(struct virtqueue *vq) /* An extern declaration inside a C file is bad form. Don't do it. */ extern void lguest_setup_irq(unsigned int irq); -/* This routine finds the first virtqueue described in the configuration of +/* + * This routine finds the first virtqueue described in the configuration of * this device and sets it up. * * This is kind of an ugly duckling. It'd be nicer to have a standard @@ -225,7 +246,8 @@ extern void lguest_setup_irq(unsigned int irq); * simpler for the Host to simply tell us where the pages are. * * So we provide drivers with a "find the Nth virtqueue and set it up" - * function. */ + * function. + */ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, unsigned index, void (*callback)(struct virtqueue *vq), @@ -244,9 +266,11 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, if (!lvq) return ERR_PTR(-ENOMEM); - /* Make a copy of the "struct lguest_vqconfig" entry, which sits after + /* + * Make a copy of the "struct lguest_vqconfig" entry, which sits after * the descriptor. We need a copy because the config space might not - * be aligned correctly. */ + * be aligned correctly. + */ memcpy(&lvq->config, lg_vq(ldev->desc)+index, sizeof(lvq->config)); printk("Mapping virtqueue %i addr %lx\n", index, @@ -261,8 +285,10 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, goto free_lvq; } - /* OK, tell virtio_ring.c to set up a virtqueue now we know its size - * and we've got a pointer to its pages. */ + /* + * OK, tell virtio_ring.c to set up a virtqueue now we know its size + * and we've got a pointer to its pages. + */ vq = vring_new_virtqueue(lvq->config.num, LGUEST_VRING_ALIGN, vdev, lvq->pages, lg_notify, callback, name); if (!vq) { @@ -273,18 +299,23 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, /* Make sure the interrupt is allocated. */ lguest_setup_irq(lvq->config.irq); - /* Tell the interrupt for this virtqueue to go to the virtio_ring - * interrupt handler. */ - /* FIXME: We used to have a flag for the Host to tell us we could use + /* + * Tell the interrupt for this virtqueue to go to the virtio_ring + * interrupt handler. + * + * FIXME: We used to have a flag for the Host to tell us we could use * the interrupt as a source of randomness: it'd be nice to have that - * back.. */ + * back. + */ err = request_irq(lvq->config.irq, vring_interrupt, IRQF_SHARED, dev_name(&vdev->dev), vq); if (err) goto destroy_vring; - /* Last of all we hook up our 'struct lguest_vq_info" to the - * virtqueue's priv pointer. */ + /* + * Last of all we hook up our 'struct lguest_vq_info" to the + * virtqueue's priv pointer. + */ vq->priv = lvq; return vq; @@ -358,11 +389,14 @@ static struct virtio_config_ops lguest_config_ops = { .del_vqs = lg_del_vqs, }; -/* The root device for the lguest virtio devices. This makes them appear as - * /sys/devices/lguest/0,1,2 not /sys/devices/0,1,2. */ +/* + * The root device for the lguest virtio devices. This makes them appear as + * /sys/devices/lguest/0,1,2 not /sys/devices/0,1,2. + */ static struct device *lguest_root; -/*D:120 This is the core of the lguest bus: actually adding a new device. +/*D:120 + * This is the core of the lguest bus: actually adding a new device. * It's a separate function because it's neater that way, and because an * earlier version of the code supported hotplug and unplug. They were removed * early on because they were never used. @@ -371,14 +405,14 @@ static struct device *lguest_root; * * It's worth reading this carefully: we start with a pointer to the new device * descriptor in the "lguest_devices" page, and the offset into the device - * descriptor page so we can uniquely identify it if things go badly wrong. */ + * descriptor page so we can uniquely identify it if things go badly wrong. + */ static void add_lguest_device(struct lguest_device_desc *d, unsigned int offset) { struct lguest_device *ldev; - /* Start with zeroed memory; Linux's device layer seems to count on - * it. */ + /* Start with zeroed memory; Linux's device layer counts on it. */ ldev = kzalloc(sizeof(*ldev), GFP_KERNEL); if (!ldev) { printk(KERN_EMERG "Cannot allocate lguest dev %u type %u\n", @@ -390,15 +424,19 @@ static void add_lguest_device(struct lguest_device_desc *d, ldev->vdev.dev.parent = lguest_root; /* We have a unique device index thanks to the dev_index counter. */ ldev->vdev.id.device = d->type; - /* We have a simple set of routines for querying the device's - * configuration information and setting its status. */ + /* + * We have a simple set of routines for querying the device's + * configuration information and setting its status. + */ ldev->vdev.config = &lguest_config_ops; /* And we remember the device's descriptor for lguest_config_ops. */ ldev->desc = d; - /* register_virtio_device() sets up the generic fields for the struct + /* + * register_virtio_device() sets up the generic fields for the struct * virtio_device and calls device_register(). This makes the bus - * infrastructure look for a matching driver. */ + * infrastructure look for a matching driver. + */ if (register_virtio_device(&ldev->vdev) != 0) { printk(KERN_ERR "Failed to register lguest dev %u type %u\n", offset, d->type); @@ -406,8 +444,10 @@ static void add_lguest_device(struct lguest_device_desc *d, } } -/*D:110 scan_devices() simply iterates through the device page. The type 0 is - * reserved to mean "end of devices". */ +/*D:110 + * scan_devices() simply iterates through the device page. The type 0 is + * reserved to mean "end of devices". + */ static void scan_devices(void) { unsigned int i; @@ -426,7 +466,8 @@ static void scan_devices(void) } } -/*D:105 Fairly early in boot, lguest_devices_init() is called to set up the +/*D:105 + * Fairly early in boot, lguest_devices_init() is called to set up the * lguest device infrastructure. We check that we are a Guest by checking * pv_info.name: there are other ways of checking, but this seems most * obvious to me. @@ -437,7 +478,8 @@ static void scan_devices(void) * correct sysfs incantation). * * Finally we call scan_devices() which adds all the devices found in the - * lguest_devices page. */ + * lguest_devices page. + */ static int __init lguest_devices_init(void) { if (strcmp(pv_info.name, "lguest") != 0) @@ -456,11 +498,13 @@ static int __init lguest_devices_init(void) /* We do this after core stuff, but before the drivers. */ postcore_initcall(lguest_devices_init); -/*D:150 At this point in the journey we used to now wade through the lguest +/*D:150 + * At this point in the journey we used to now wade through the lguest * devices themselves: net, block and console. Since they're all now virtio * devices rather than lguest-specific, I've decided to ignore them. Mostly, * they're kind of boring. But this does mean you'll never experience the * thrill of reading the forbidden love scene buried deep in the block driver. * * "make Launcher" beckons, where we answer questions like "Where do Guests - * come from?", and "What do you do when someone asks for optimization?". */ + * come from?", and "What do you do when someone asks for optimization?". + */ diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index 407722a8e0c4..7e92017103dc 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -1,8 +1,10 @@ -/*P:200 This contains all the /dev/lguest code, whereby the userspace launcher +/*P:200 + * This contains all the /dev/lguest code, whereby the userspace launcher * controls and communicates with the Guest. For example, the first write will * tell us the Guest's memory layout, pagetable, entry point and kernel address * offset. A read will run the Guest until something happens, such as a signal - * or the Guest doing a NOTIFY out to the Launcher. :*/ + * or the Guest doing a NOTIFY out to the Launcher. +:*/ #include #include #include @@ -37,8 +39,10 @@ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) if (!addr) return -EINVAL; - /* Replace the old array with the new one, carefully: others can - * be accessing it at the same time */ + /* + * Replace the old array with the new one, carefully: others can + * be accessing it at the same time. + */ new = kmalloc(sizeof(*new) + sizeof(new->map[0]) * (old->num + 1), GFP_KERNEL); if (!new) @@ -61,8 +65,10 @@ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) /* Now put new one in place. */ rcu_assign_pointer(lg->eventfds, new); - /* We're not in a big hurry. Wait until noone's looking at old - * version, then delete it. */ + /* + * We're not in a big hurry. Wait until noone's looking at old + * version, then delete it. + */ synchronize_rcu(); kfree(old); @@ -87,8 +93,10 @@ static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) return err; } -/*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt - * number to /dev/lguest. */ +/*L:050 + * Sending an interrupt is done by writing LHREQ_IRQ and an interrupt + * number to /dev/lguest. + */ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) { unsigned long irq; @@ -102,8 +110,10 @@ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) return 0; } -/*L:040 Once our Guest is initialized, the Launcher makes it run by reading - * from /dev/lguest. */ +/*L:040 + * Once our Guest is initialized, the Launcher makes it run by reading + * from /dev/lguest. + */ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) { struct lguest *lg = file->private_data; @@ -139,8 +149,10 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) return len; } - /* If we returned from read() last time because the Guest sent I/O, - * clear the flag. */ + /* + * If we returned from read() last time because the Guest sent I/O, + * clear the flag. + */ if (cpu->pending_notify) cpu->pending_notify = 0; @@ -148,8 +160,10 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) return run_guest(cpu, (unsigned long __user *)user); } -/*L:025 This actually initializes a CPU. For the moment, a Guest is only - * uniprocessor, so "id" is always 0. */ +/*L:025 + * This actually initializes a CPU. For the moment, a Guest is only + * uniprocessor, so "id" is always 0. + */ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) { /* We have a limited number the number of CPUs in the lguest struct. */ @@ -164,8 +178,10 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) /* Each CPU has a timer it can set. */ init_clockdev(cpu); - /* We need a complete page for the Guest registers: they are accessible - * to the Guest and we can only grant it access to whole pages. */ + /* + * We need a complete page for the Guest registers: they are accessible + * to the Guest and we can only grant it access to whole pages. + */ cpu->regs_page = get_zeroed_page(GFP_KERNEL); if (!cpu->regs_page) return -ENOMEM; @@ -173,29 +189,38 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) /* We actually put the registers at the bottom of the page. */ cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs); - /* Now we initialize the Guest's registers, handing it the start - * address. */ + /* + * Now we initialize the Guest's registers, handing it the start + * address. + */ lguest_arch_setup_regs(cpu, start_ip); - /* We keep a pointer to the Launcher task (ie. current task) for when - * other Guests want to wake this one (eg. console input). */ + /* + * We keep a pointer to the Launcher task (ie. current task) for when + * other Guests want to wake this one (eg. console input). + */ cpu->tsk = current; - /* We need to keep a pointer to the Launcher's memory map, because if + /* + * We need to keep a pointer to the Launcher's memory map, because if * the Launcher dies we need to clean it up. If we don't keep a - * reference, it is destroyed before close() is called. */ + * reference, it is destroyed before close() is called. + */ cpu->mm = get_task_mm(cpu->tsk); - /* We remember which CPU's pages this Guest used last, for optimization - * when the same Guest runs on the same CPU twice. */ + /* + * We remember which CPU's pages this Guest used last, for optimization + * when the same Guest runs on the same CPU twice. + */ cpu->last_pages = NULL; /* No error == success. */ return 0; } -/*L:020 The initialization write supplies 3 pointer sized (32 or 64 bit) - * values (in addition to the LHREQ_INITIALIZE value). These are: +/*L:020 + * The initialization write supplies 3 pointer sized (32 or 64 bit) values (in + * addition to the LHREQ_INITIALIZE value). These are: * * base: The start of the Guest-physical memory inside the Launcher memory. * @@ -207,14 +232,15 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) */ static int initialize(struct file *file, const unsigned long __user *input) { - /* "struct lguest" contains everything we (the Host) know about a - * Guest. */ + /* "struct lguest" contains all we (the Host) know about a Guest. */ struct lguest *lg; int err; unsigned long args[3]; - /* We grab the Big Lguest lock, which protects against multiple - * simultaneous initializations. */ + /* + * We grab the Big Lguest lock, which protects against multiple + * simultaneous initializations. + */ mutex_lock(&lguest_lock); /* You can't initialize twice! Close the device and start again... */ if (file->private_data) { @@ -249,8 +275,10 @@ static int initialize(struct file *file, const unsigned long __user *input) if (err) goto free_eventfds; - /* Initialize the Guest's shadow page tables, using the toplevel - * address the Launcher gave us. This allocates memory, so can fail. */ + /* + * Initialize the Guest's shadow page tables, using the toplevel + * address the Launcher gave us. This allocates memory, so can fail. + */ err = init_guest_pagetable(lg); if (err) goto free_regs; @@ -275,7 +303,8 @@ unlock: return err; } -/*L:010 The first operation the Launcher does must be a write. All writes +/*L:010 + * The first operation the Launcher does must be a write. All writes * start with an unsigned long number: for the first write this must be * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use * writes of other values to send interrupts. @@ -283,12 +312,15 @@ unlock: * Note that we overload the "offset" in the /dev/lguest file to indicate what * CPU number we're dealing with. Currently this is always 0, since we only * support uniprocessor Guests, but you can see the beginnings of SMP support - * here. */ + * here. + */ static ssize_t write(struct file *file, const char __user *in, size_t size, loff_t *off) { - /* Once the Guest is initialized, we hold the "struct lguest" in the - * file private data. */ + /* + * Once the Guest is initialized, we hold the "struct lguest" in the + * file private data. + */ struct lguest *lg = file->private_data; const unsigned long __user *input = (const unsigned long __user *)in; unsigned long req; @@ -323,13 +355,15 @@ static ssize_t write(struct file *file, const char __user *in, } } -/*L:060 The final piece of interface code is the close() routine. It reverses +/*L:060 + * The final piece of interface code is the close() routine. It reverses * everything done in initialize(). This is usually called because the * Launcher exited. * * Note that the close routine returns 0 or a negative error number: it can't * really fail, but it can whine. I blame Sun for this wart, and K&R C for - * letting them do it. :*/ + * letting them do it. +:*/ static int close(struct inode *inode, struct file *file) { struct lguest *lg = file->private_data; @@ -339,8 +373,10 @@ static int close(struct inode *inode, struct file *file) if (!lg) return 0; - /* We need the big lock, to protect from inter-guest I/O and other - * Launchers initializing guests. */ + /* + * We need the big lock, to protect from inter-guest I/O and other + * Launchers initializing guests. + */ mutex_lock(&lguest_lock); /* Free up the shadow page tables for the Guest. */ @@ -351,8 +387,10 @@ static int close(struct inode *inode, struct file *file) hrtimer_cancel(&lg->cpus[i].hrt); /* We can free up the register page we allocated. */ free_page(lg->cpus[i].regs_page); - /* Now all the memory cleanups are done, it's safe to release - * the Launcher's memory management structure. */ + /* + * Now all the memory cleanups are done, it's safe to release + * the Launcher's memory management structure. + */ mmput(lg->cpus[i].mm); } @@ -361,8 +399,10 @@ static int close(struct inode *inode, struct file *file) eventfd_ctx_put(lg->eventfds->map[i].event); kfree(lg->eventfds); - /* If lg->dead doesn't contain an error code it will be NULL or a - * kmalloc()ed string, either of which is ok to hand to kfree(). */ + /* + * If lg->dead doesn't contain an error code it will be NULL or a + * kmalloc()ed string, either of which is ok to hand to kfree(). + */ if (!IS_ERR(lg->dead)) kfree(lg->dead); /* Free the memory allocated to the lguest_struct */ @@ -386,7 +426,8 @@ static int close(struct inode *inode, struct file *file) * * We begin our understanding with the Host kernel interface which the Launcher * uses: reading and writing a character device called /dev/lguest. All the - * work happens in the read(), write() and close() routines: */ + * work happens in the read(), write() and close() routines: + */ static struct file_operations lguest_fops = { .owner = THIS_MODULE, .release = close, @@ -394,8 +435,10 @@ static struct file_operations lguest_fops = { .read = read, }; -/* This is a textbook example of a "misc" character device. Populate a "struct - * miscdevice" and register it with misc_register(). */ +/* + * This is a textbook example of a "misc" character device. Populate a "struct + * miscdevice" and register it with misc_register(). + */ static struct miscdevice lguest_dev = { .minor = MISC_DYNAMIC_MINOR, .name = "lguest", diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index a6fe1abda240..3da902e4b4cb 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -1,9 +1,11 @@ -/*P:700 The pagetable code, on the other hand, still shows the scars of +/*P:700 + * The pagetable code, on the other hand, still shows the scars of * previous encounters. It's functional, and as neat as it can be in the * circumstances, but be wary, for these things are subtle and break easily. * The Guest provides a virtual to physical mapping, but we can neither trust * it nor use it: we verify and convert it here then point the CPU to the - * converted Guest pages when running the Guest. :*/ + * converted Guest pages when running the Guest. +:*/ /* Copyright (C) Rusty Russell IBM Corporation 2006. * GPL v2 and any later version */ @@ -17,10 +19,12 @@ #include #include "lg.h" -/*M:008 We hold reference to pages, which prevents them from being swapped. +/*M:008 + * We hold reference to pages, which prevents them from being swapped. * It'd be nice to have a callback in the "struct mm_struct" when Linux wants * to swap out. If we had this, and a shrinker callback to trim PTE pages, we - * could probably consider launching Guests as non-root. :*/ + * could probably consider launching Guests as non-root. +:*/ /*H:300 * The Page Table Code @@ -45,16 +49,19 @@ * (v) Flushing (throwing away) page tables, * (vi) Mapping the Switcher when the Guest is about to run, * (vii) Setting up the page tables initially. - :*/ +:*/ - -/* 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is +/* + * 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is * conveniently placed at the top 4MB, so it uses a separate, complete PTE - * page. */ + * page. + */ #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) -/* For PAE we need the PMD index as well. We use the last 2MB, so we - * will need the last pmd entry of the last pmd page. */ +/* + * For PAE we need the PMD index as well. We use the last 2MB, so we + * will need the last pmd entry of the last pmd page. + */ #ifdef CONFIG_X86_PAE #define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1) #define RESERVE_MEM 2U @@ -64,13 +71,16 @@ #define CHECK_GPGD_MASK _PAGE_TABLE #endif -/* We actually need a separate PTE page for each CPU. Remember that after the +/* + * We actually need a separate PTE page for each CPU. Remember that after the * Switcher code itself comes two pages for each CPU, and we don't want this - * CPU's guest to see the pages of any other CPU. */ + * CPU's guest to see the pages of any other CPU. + */ static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); #define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) -/*H:320 The page table code is curly enough to need helper functions to keep it +/*H:320 + * The page table code is curly enough to need helper functions to keep it * clear and clean. * * There are two functions which return pointers to the shadow (aka "real") @@ -79,7 +89,8 @@ static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); * spgd_addr() takes the virtual address and returns a pointer to the top-level * page directory entry (PGD) for that address. Since we keep track of several * page tables, the "i" argument tells us which one we're interested in (it's - * usually the current one). */ + * usually the current one). + */ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) { unsigned int index = pgd_index(vaddr); @@ -96,9 +107,11 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) } #ifdef CONFIG_X86_PAE -/* This routine then takes the PGD entry given above, which contains the +/* + * This routine then takes the PGD entry given above, which contains the * address of the PMD page. It then returns a pointer to the PMD entry for the - * given address. */ + * given address. + */ static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) { unsigned int index = pmd_index(vaddr); @@ -119,9 +132,11 @@ static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) } #endif -/* This routine then takes the page directory entry returned above, which +/* + * This routine then takes the page directory entry returned above, which * contains the address of the page table entry (PTE) page. It then returns a - * pointer to the PTE entry for the given address. */ + * pointer to the PTE entry for the given address. + */ static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) { #ifdef CONFIG_X86_PAE @@ -139,8 +154,10 @@ static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) return &page[pte_index(vaddr)]; } -/* These two functions just like the above two, except they access the Guest - * page tables. Hence they return a Guest address. */ +/* + * These two functions just like the above two, except they access the Guest + * page tables. Hence they return a Guest address. + */ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) { unsigned int index = vaddr >> (PGDIR_SHIFT); @@ -175,17 +192,21 @@ static unsigned long gpte_addr(struct lg_cpu *cpu, #endif /*:*/ -/*M:014 get_pfn is slow: we could probably try to grab batches of pages here as - * an optimization (ie. pre-faulting). :*/ +/*M:014 + * get_pfn is slow: we could probably try to grab batches of pages here as + * an optimization (ie. pre-faulting). +:*/ -/*H:350 This routine takes a page number given by the Guest and converts it to +/*H:350 + * This routine takes a page number given by the Guest and converts it to * an actual, physical page number. It can fail for several reasons: the * virtual address might not be mapped by the Launcher, the write flag is set * and the page is read-only, or the write flag was set and the page was * shared so had to be copied, but we ran out of memory. * * This holds a reference to the page, so release_pte() is careful to put that - * back. */ + * back. + */ static unsigned long get_pfn(unsigned long virtpfn, int write) { struct page *page; @@ -198,33 +219,41 @@ static unsigned long get_pfn(unsigned long virtpfn, int write) return -1UL; } -/*H:340 Converting a Guest page table entry to a shadow (ie. real) page table +/*H:340 + * Converting a Guest page table entry to a shadow (ie. real) page table * entry can be a little tricky. The flags are (almost) the same, but the * Guest PTE contains a virtual page number: the CPU needs the real page - * number. */ + * number. + */ static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write) { unsigned long pfn, base, flags; - /* The Guest sets the global flag, because it thinks that it is using + /* + * The Guest sets the global flag, because it thinks that it is using * PGE. We only told it to use PGE so it would tell us whether it was * flushing a kernel mapping or a userspace mapping. We don't actually - * use the global bit, so throw it away. */ + * use the global bit, so throw it away. + */ flags = (pte_flags(gpte) & ~_PAGE_GLOBAL); /* The Guest's pages are offset inside the Launcher. */ base = (unsigned long)cpu->lg->mem_base / PAGE_SIZE; - /* We need a temporary "unsigned long" variable to hold the answer from + /* + * We need a temporary "unsigned long" variable to hold the answer from * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't * fit in spte.pfn. get_pfn() finds the real physical number of the - * page, given the virtual number. */ + * page, given the virtual number. + */ pfn = get_pfn(base + pte_pfn(gpte), write); if (pfn == -1UL) { kill_guest(cpu, "failed to get page %lu", pte_pfn(gpte)); - /* When we destroy the Guest, we'll go through the shadow page + /* + * When we destroy the Guest, we'll go through the shadow page * tables and release_pte() them. Make sure we don't think - * this one is valid! */ + * this one is valid! + */ flags = 0; } /* Now we assemble our shadow PTE from the page number and flags. */ @@ -234,8 +263,10 @@ static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write) /*H:460 And to complete the chain, release_pte() looks like this: */ static void release_pte(pte_t pte) { - /* Remember that get_user_pages_fast() took a reference to the page, in - * get_pfn()? We have to put it back now. */ + /* + * Remember that get_user_pages_fast() took a reference to the page, in + * get_pfn()? We have to put it back now. + */ if (pte_flags(pte) & _PAGE_PRESENT) put_page(pte_page(pte)); } @@ -273,7 +304,8 @@ static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) * and return to the Guest without it knowing. * * If we fixed up the fault (ie. we mapped the address), this routine returns - * true. Otherwise, it was a real fault and we need to tell the Guest. */ + * true. Otherwise, it was a real fault and we need to tell the Guest. + */ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) { pgd_t gpgd; @@ -298,22 +330,26 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { /* No shadow entry: allocate a new shadow PTE page. */ unsigned long ptepage = get_zeroed_page(GFP_KERNEL); - /* This is not really the Guest's fault, but killing it is - * simple for this corner case. */ + /* + * This is not really the Guest's fault, but killing it is + * simple for this corner case. + */ if (!ptepage) { kill_guest(cpu, "out of memory allocating pte page"); return false; } /* We check that the Guest pgd is OK. */ check_gpgd(cpu, gpgd); - /* And we copy the flags to the shadow PGD entry. The page - * number in the shadow PGD is the page we just allocated. */ + /* + * And we copy the flags to the shadow PGD entry. The page + * number in the shadow PGD is the page we just allocated. + */ set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd))); } #ifdef CONFIG_X86_PAE gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); - /* middle level not present? We can't map it in. */ + /* Middle level not present? We can't map it in. */ if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) return false; @@ -324,8 +360,10 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) /* No shadow entry: allocate a new shadow PTE page. */ unsigned long ptepage = get_zeroed_page(GFP_KERNEL); - /* This is not really the Guest's fault, but killing it is - * simple for this corner case. */ + /* + * This is not really the Guest's fault, but killing it is + * simple for this corner case. + */ if (!ptepage) { kill_guest(cpu, "out of memory allocating pte page"); return false; @@ -334,17 +372,23 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) /* We check that the Guest pmd is OK. */ check_gpmd(cpu, gpmd); - /* And we copy the flags to the shadow PMD entry. The page - * number in the shadow PMD is the page we just allocated. */ + /* + * And we copy the flags to the shadow PMD entry. The page + * number in the shadow PMD is the page we just allocated. + */ native_set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd))); } - /* OK, now we look at the lower level in the Guest page table: keep its - * address, because we might update it later. */ + /* + * OK, now we look at the lower level in the Guest page table: keep its + * address, because we might update it later. + */ gpte_ptr = gpte_addr(cpu, gpmd, vaddr); #else - /* OK, now we look at the lower level in the Guest page table: keep its - * address, because we might update it later. */ + /* + * OK, now we look at the lower level in the Guest page table: keep its + * address, because we might update it later. + */ gpte_ptr = gpte_addr(cpu, gpgd, vaddr); #endif gpte = lgread(cpu, gpte_ptr, pte_t); @@ -353,8 +397,10 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) if (!(pte_flags(gpte) & _PAGE_PRESENT)) return false; - /* Check they're not trying to write to a page the Guest wants - * read-only (bit 2 of errcode == write). */ + /* + * Check they're not trying to write to a page the Guest wants + * read-only (bit 2 of errcode == write). + */ if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW)) return false; @@ -362,8 +408,10 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER)) return false; - /* Check that the Guest PTE flags are OK, and the page number is below - * the pfn_limit (ie. not mapping the Launcher binary). */ + /* + * Check that the Guest PTE flags are OK, and the page number is below + * the pfn_limit (ie. not mapping the Launcher binary). + */ check_gpte(cpu, gpte); /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ @@ -373,29 +421,40 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) /* Get the pointer to the shadow PTE entry we're going to set. */ spte = spte_addr(cpu, *spgd, vaddr); - /* If there was a valid shadow PTE entry here before, we release it. - * This can happen with a write to a previously read-only entry. */ + + /* + * If there was a valid shadow PTE entry here before, we release it. + * This can happen with a write to a previously read-only entry. + */ release_pte(*spte); - /* If this is a write, we insist that the Guest page is writable (the - * final arg to gpte_to_spte()). */ + /* + * If this is a write, we insist that the Guest page is writable (the + * final arg to gpte_to_spte()). + */ if (pte_dirty(gpte)) *spte = gpte_to_spte(cpu, gpte, 1); else - /* If this is a read, don't set the "writable" bit in the page + /* + * If this is a read, don't set the "writable" bit in the page * table entry, even if the Guest says it's writable. That way * we will come back here when a write does actually occur, so - * we can update the Guest's _PAGE_DIRTY flag. */ + * we can update the Guest's _PAGE_DIRTY flag. + */ native_set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0)); - /* Finally, we write the Guest PTE entry back: we've set the - * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ + /* + * Finally, we write the Guest PTE entry back: we've set the + * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. + */ lgwrite(cpu, gpte_ptr, pte_t, gpte); - /* The fault is fixed, the page table is populated, the mapping + /* + * The fault is fixed, the page table is populated, the mapping * manipulated, the result returned and the code complete. A small * delay and a trace of alliteration are the only indications the Guest - * has that a page fault occurred at all. */ + * has that a page fault occurred at all. + */ return true; } @@ -408,7 +467,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) * mapped, so it's overkill. * * This is a quick version which answers the question: is this virtual address - * mapped by the shadow page tables, and is it writable? */ + * mapped by the shadow page tables, and is it writable? + */ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) { pgd_t *spgd; @@ -428,16 +488,20 @@ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) return false; #endif - /* Check the flags on the pte entry itself: it must be present and - * writable. */ + /* + * Check the flags on the pte entry itself: it must be present and + * writable. + */ flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr))); return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); } -/* So, when pin_stack_pages() asks us to pin a page, we check if it's already +/* + * So, when pin_stack_pages() asks us to pin a page, we check if it's already * in the page tables, and if not, we call demand_page() with error code 2 - * (meaning "write"). */ + * (meaning "write"). + */ void pin_page(struct lg_cpu *cpu, unsigned long vaddr) { if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2)) @@ -485,9 +549,11 @@ static void release_pgd(pgd_t *spgd) /* If the entry's not present, there's nothing to release. */ if (pgd_flags(*spgd) & _PAGE_PRESENT) { unsigned int i; - /* Converting the pfn to find the actual PTE page is easy: turn + /* + * Converting the pfn to find the actual PTE page is easy: turn * the page number into a physical address, then convert to a - * virtual address (easy for kernel pages like this one). */ + * virtual address (easy for kernel pages like this one). + */ pte_t *ptepage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); /* For each entry in the page, we might need to release it. */ for (i = 0; i < PTRS_PER_PTE; i++) @@ -499,9 +565,12 @@ static void release_pgd(pgd_t *spgd) } } #endif -/*H:445 We saw flush_user_mappings() twice: once from the flush_user_mappings() + +/*H:445 + * We saw flush_user_mappings() twice: once from the flush_user_mappings() * hypercall and once in new_pgdir() when we re-used a top-level pgdir page. - * It simply releases every PTE page from 0 up to the Guest's kernel address. */ + * It simply releases every PTE page from 0 up to the Guest's kernel address. + */ static void flush_user_mappings(struct lguest *lg, int idx) { unsigned int i; @@ -510,10 +579,12 @@ static void flush_user_mappings(struct lguest *lg, int idx) release_pgd(lg->pgdirs[idx].pgdir + i); } -/*H:440 (v) Flushing (throwing away) page tables, +/*H:440 + * (v) Flushing (throwing away) page tables, * * The Guest has a hypercall to throw away the page tables: it's used when a - * large number of mappings have been changed. */ + * large number of mappings have been changed. + */ void guest_pagetable_flush_user(struct lg_cpu *cpu) { /* Drop the userspace part of the current page table. */ @@ -551,9 +622,11 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK); } -/* We keep several page tables. This is a simple routine to find the page +/* + * We keep several page tables. This is a simple routine to find the page * table (if any) corresponding to this top-level address the Guest has given - * us. */ + * us. + */ static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) { unsigned int i; @@ -563,9 +636,11 @@ static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) return i; } -/*H:435 And this is us, creating the new page directory. If we really do +/*H:435 + * And this is us, creating the new page directory. If we really do * allocate a new one (and so the kernel parts are not there), we set - * blank_pgdir. */ + * blank_pgdir. + */ static unsigned int new_pgdir(struct lg_cpu *cpu, unsigned long gpgdir, int *blank_pgdir) @@ -575,8 +650,10 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, pmd_t *pmd_table; #endif - /* We pick one entry at random to throw out. Choosing the Least - * Recently Used might be better, but this is easy. */ + /* + * We pick one entry at random to throw out. Choosing the Least + * Recently Used might be better, but this is easy. + */ next = random32() % ARRAY_SIZE(cpu->lg->pgdirs); /* If it's never been allocated at all before, try now. */ if (!cpu->lg->pgdirs[next].pgdir) { @@ -587,8 +664,10 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, next = cpu->cpu_pgd; else { #ifdef CONFIG_X86_PAE - /* In PAE mode, allocate a pmd page and populate the - * last pgd entry. */ + /* + * In PAE mode, allocate a pmd page and populate the + * last pgd entry. + */ pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL); if (!pmd_table) { free_page((long)cpu->lg->pgdirs[next].pgdir); @@ -598,8 +677,10 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, set_pgd(cpu->lg->pgdirs[next].pgdir + SWITCHER_PGD_INDEX, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - /* This is a blank page, so there are no kernel - * mappings: caller must map the stack! */ + /* + * This is a blank page, so there are no kernel + * mappings: caller must map the stack! + */ *blank_pgdir = 1; } #else @@ -615,19 +696,23 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, return next; } -/*H:430 (iv) Switching page tables +/*H:430 + * (iv) Switching page tables * * Now we've seen all the page table setting and manipulation, let's see * what happens when the Guest changes page tables (ie. changes the top-level - * pgdir). This occurs on almost every context switch. */ + * pgdir). This occurs on almost every context switch. + */ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) { int newpgdir, repin = 0; /* Look to see if we have this one already. */ newpgdir = find_pgdir(cpu->lg, pgtable); - /* If not, we allocate or mug an existing one: if it's a fresh one, - * repin gets set to 1. */ + /* + * If not, we allocate or mug an existing one: if it's a fresh one, + * repin gets set to 1. + */ if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs)) newpgdir = new_pgdir(cpu, pgtable, &repin); /* Change the current pgd index to the new one. */ @@ -637,9 +722,11 @@ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) pin_stack_pages(cpu); } -/*H:470 Finally, a routine which throws away everything: all PGD entries in all +/*H:470 + * Finally, a routine which throws away everything: all PGD entries in all * the shadow page tables, including the Guest's kernel mappings. This is used - * when we destroy the Guest. */ + * when we destroy the Guest. + */ static void release_all_pagetables(struct lguest *lg) { unsigned int i, j; @@ -656,8 +743,10 @@ static void release_all_pagetables(struct lguest *lg) spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX; pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); - /* And release the pmd entries of that pmd page, - * except for the switcher pmd. */ + /* + * And release the pmd entries of that pmd page, + * except for the switcher pmd. + */ for (k = 0; k < SWITCHER_PMD_INDEX; k++) release_pmd(&pmdpage[k]); #endif @@ -667,10 +756,12 @@ static void release_all_pagetables(struct lguest *lg) } } -/* We also throw away everything when a Guest tells us it's changed a kernel +/* + * We also throw away everything when a Guest tells us it's changed a kernel * mapping. Since kernel mappings are in every page table, it's easiest to * throw them all away. This traps the Guest in amber for a while as - * everything faults back in, but it's rare. */ + * everything faults back in, but it's rare. + */ void guest_pagetable_clear_all(struct lg_cpu *cpu) { release_all_pagetables(cpu->lg); @@ -678,15 +769,19 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu) pin_stack_pages(cpu); } /*:*/ -/*M:009 Since we throw away all mappings when a kernel mapping changes, our + +/*M:009 + * Since we throw away all mappings when a kernel mapping changes, our * performance sucks for guests using highmem. In fact, a guest with * PAGE_OFFSET 0xc0000000 (the default) and more than about 700MB of RAM is * usually slower than a Guest with less memory. * * This, of course, cannot be fixed. It would take some kind of... well, I - * don't know, but the term "puissant code-fu" comes to mind. :*/ + * don't know, but the term "puissant code-fu" comes to mind. +:*/ -/*H:420 This is the routine which actually sets the page table entry for then +/*H:420 + * This is the routine which actually sets the page table entry for then * "idx"'th shadow page table. * * Normally, we can just throw out the old entry and replace it with 0: if they @@ -715,31 +810,36 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, spmd = spmd_addr(cpu, *spgd, vaddr); if (pmd_flags(*spmd) & _PAGE_PRESENT) { #endif - /* Otherwise, we start by releasing - * the existing entry. */ + /* Otherwise, start by releasing the existing entry. */ pte_t *spte = spte_addr(cpu, *spgd, vaddr); release_pte(*spte); - /* If they're setting this entry as dirty or accessed, - * we might as well put that entry they've given us - * in now. This shaves 10% off a - * copy-on-write micro-benchmark. */ + /* + * If they're setting this entry as dirty or accessed, + * we might as well put that entry they've given us in + * now. This shaves 10% off a copy-on-write + * micro-benchmark. + */ if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { check_gpte(cpu, gpte); native_set_pte(spte, gpte_to_spte(cpu, gpte, pte_flags(gpte) & _PAGE_DIRTY)); - } else - /* Otherwise kill it and we can demand_page() - * it in later. */ + } else { + /* + * Otherwise kill it and we can demand_page() + * it in later. + */ native_set_pte(spte, __pte(0)); + } #ifdef CONFIG_X86_PAE } #endif } } -/*H:410 Updating a PTE entry is a little trickier. +/*H:410 + * Updating a PTE entry is a little trickier. * * We keep track of several different page tables (the Guest uses one for each * process, so it makes sense to cache at least a few). Each of these have @@ -748,12 +848,15 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, * all the page tables, not just the current one. This is rare. * * The benefit is that when we have to track a new page table, we can keep all - * the kernel mappings. This speeds up context switch immensely. */ + * the kernel mappings. This speeds up context switch immensely. + */ void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir, unsigned long vaddr, pte_t gpte) { - /* Kernel mappings must be changed on all top levels. Slow, but doesn't - * happen often. */ + /* + * Kernel mappings must be changed on all top levels. Slow, but doesn't + * happen often. + */ if (vaddr >= cpu->lg->kernel_address) { unsigned int i; for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++) @@ -802,12 +905,14 @@ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) } #endif -/* Once we know how much memory we have we can construct simple identity - * (which set virtual == physical) and linear mappings - * which will get the Guest far enough into the boot to create its own. +/* + * Once we know how much memory we have we can construct simple identity (which + * set virtual == physical) and linear mappings which will get the Guest far + * enough into the boot to create its own. * * We lay them out of the way, just below the initrd (which is why we need to - * know its size here). */ + * know its size here). + */ static unsigned long setup_pagetables(struct lguest *lg, unsigned long mem, unsigned long initrd_size) @@ -825,8 +930,10 @@ static unsigned long setup_pagetables(struct lguest *lg, unsigned int phys_linear; #endif - /* We have mapped_pages frames to map, so we need - * linear_pages page tables to map them. */ + /* + * We have mapped_pages frames to map, so we need linear_pages page + * tables to map them. + */ mapped_pages = mem / PAGE_SIZE; linear_pages = (mapped_pages + PTRS_PER_PTE - 1) / PTRS_PER_PTE; @@ -839,8 +946,10 @@ static unsigned long setup_pagetables(struct lguest *lg, #ifdef CONFIG_X86_PAE pmds = (void *)linear - PAGE_SIZE; #endif - /* Linear mapping is easy: put every page's address into the - * mapping in order. */ + /* + * Linear mapping is easy: put every page's address into the + * mapping in order. + */ for (i = 0; i < mapped_pages; i++) { pte_t pte; pte = pfn_pte(i, __pgprot(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER)); @@ -848,8 +957,10 @@ static unsigned long setup_pagetables(struct lguest *lg, return -EFAULT; } - /* The top level points to the linear page table pages above. - * We setup the identity and linear mappings here. */ + /* + * The top level points to the linear page table pages above. + * We setup the identity and linear mappings here. + */ #ifdef CONFIG_X86_PAE for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD; i += PTRS_PER_PTE, j++) { @@ -880,15 +991,19 @@ static unsigned long setup_pagetables(struct lguest *lg, } #endif - /* We return the top level (guest-physical) address: remember where - * this is. */ + /* + * We return the top level (guest-physical) address: remember where + * this is. + */ return (unsigned long)pgdir - mem_base; } -/*H:500 (vii) Setting up the page tables initially. +/*H:500 + * (vii) Setting up the page tables initially. * * When a Guest is first created, the Launcher tells us where the toplevel of - * its first page table is. We set some things up here: */ + * its first page table is. We set some things up here: + */ int init_guest_pagetable(struct lguest *lg) { u64 mem; @@ -898,14 +1013,18 @@ int init_guest_pagetable(struct lguest *lg) pgd_t *pgd; pmd_t *pmd_table; #endif - /* Get the Guest memory size and the ramdisk size from the boot header - * located at lg->mem_base (Guest address 0). */ + /* + * Get the Guest memory size and the ramdisk size from the boot header + * located at lg->mem_base (Guest address 0). + */ if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem)) || get_user(initrd_size, &boot->hdr.ramdisk_size)) return -EFAULT; - /* We start on the first shadow page table, and give it a blank PGD - * page. */ + /* + * We start on the first shadow page table, and give it a blank PGD + * page. + */ lg->pgdirs[0].gpgdir = setup_pagetables(lg, mem, initrd_size); if (IS_ERR_VALUE(lg->pgdirs[0].gpgdir)) return lg->pgdirs[0].gpgdir; @@ -931,17 +1050,21 @@ void page_table_guest_data_init(struct lg_cpu *cpu) /* We get the kernel address: above this is all kernel memory. */ if (get_user(cpu->lg->kernel_address, &cpu->lg->lguest_data->kernel_address) - /* We tell the Guest that it can't use the top 2 or 4 MB - * of virtual addresses used by the Switcher. */ + /* + * We tell the Guest that it can't use the top 2 or 4 MB + * of virtual addresses used by the Switcher. + */ || put_user(RESERVE_MEM * 1024 * 1024, &cpu->lg->lguest_data->reserve_mem) || put_user(cpu->lg->pgdirs[0].gpgdir, &cpu->lg->lguest_data->pgdir)) kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); - /* In flush_user_mappings() we loop from 0 to + /* + * In flush_user_mappings() we loop from 0 to * "pgd_index(lg->kernel_address)". This assumes it won't hit the - * Switcher mappings, so check that now. */ + * Switcher mappings, so check that now. + */ #ifdef CONFIG_X86_PAE if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX && pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX) @@ -964,12 +1087,14 @@ void free_guest_pagetable(struct lguest *lg) free_page((long)lg->pgdirs[i].pgdir); } -/*H:480 (vi) Mapping the Switcher when the Guest is about to run. +/*H:480 + * (vi) Mapping the Switcher when the Guest is about to run. * * The Switcher and the two pages for this CPU need to be visible in the * Guest (and not the pages for other CPUs). We have the appropriate PTE pages * for each CPU already set up, we just need to hook them in now we know which - * Guest is about to run on this CPU. */ + * Guest is about to run on this CPU. + */ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) { pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); @@ -990,20 +1115,24 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) #else pgd_t switcher_pgd; - /* Make the last PGD entry for this Guest point to the Switcher's PTE - * page for this CPU (with appropriate flags). */ + /* + * Make the last PGD entry for this Guest point to the Switcher's PTE + * page for this CPU (with appropriate flags). + */ switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC); cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; #endif - /* We also change the Switcher PTE page. When we're running the Guest, + /* + * We also change the Switcher PTE page. When we're running the Guest, * we want the Guest's "regs" page to appear where the first Switcher * page for this CPU is. This is an optimization: when the Switcher * saves the Guest registers, it saves them into the first page of this * CPU's "struct lguest_pages": if we make sure the Guest's register * page is already mapped there, we don't have to copy them out - * again. */ + * again. + */ pfn = __pa(cpu->regs_page) >> PAGE_SHIFT; native_set_pte(®s_pte, pfn_pte(pfn, PAGE_KERNEL)); native_set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], @@ -1019,10 +1148,12 @@ static void free_switcher_pte_pages(void) free_page((long)switcher_pte_page(i)); } -/*H:520 Setting up the Switcher PTE page for given CPU is fairly easy, given +/*H:520 + * Setting up the Switcher PTE page for given CPU is fairly easy, given * the CPU number and the "struct page"s for the Switcher code itself. * - * Currently the Switcher is less than a page long, so "pages" is always 1. */ + * Currently the Switcher is less than a page long, so "pages" is always 1. + */ static __init void populate_switcher_pte_page(unsigned int cpu, struct page *switcher_page[], unsigned int pages) @@ -1043,13 +1174,16 @@ static __init void populate_switcher_pte_page(unsigned int cpu, native_set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]), __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW))); - /* The second page contains the "struct lguest_ro_state", and is - * read-only. */ + /* + * The second page contains the "struct lguest_ro_state", and is + * read-only. + */ native_set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]), __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); } -/* We've made it through the page table code. Perhaps our tired brains are +/* + * We've made it through the page table code. Perhaps our tired brains are * still processing the details, or perhaps we're simply glad it's over. * * If nothing else, note that all this complexity in juggling shadow page tables @@ -1058,10 +1192,13 @@ static __init void populate_switcher_pte_page(unsigned int cpu, * uses exotic direct Guest pagetable manipulation, and why both Intel and AMD * have implemented shadow page table support directly into hardware. * - * There is just one file remaining in the Host. */ + * There is just one file remaining in the Host. + */ -/*H:510 At boot or module load time, init_pagetables() allocates and populates - * the Switcher PTE page for each CPU. */ +/*H:510 + * At boot or module load time, init_pagetables() allocates and populates + * the Switcher PTE page for each CPU. + */ __init int init_pagetables(struct page **switcher_page, unsigned int pages) { unsigned int i; diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c index 482ed5a18750..951c57b0a7e0 100644 --- a/drivers/lguest/segments.c +++ b/drivers/lguest/segments.c @@ -1,4 +1,5 @@ -/*P:600 The x86 architecture has segments, which involve a table of descriptors +/*P:600 + * The x86 architecture has segments, which involve a table of descriptors * which can be used to do funky things with virtual address interpretation. * We originally used to use segments so the Guest couldn't alter the * Guest<->Host Switcher, and then we had to trim Guest segments, and restore @@ -8,7 +9,8 @@ * * In these modern times, the segment handling code consists of simple sanity * checks, and the worst you'll experience reading this code is butterfly-rash - * from frolicking through its parklike serenity. :*/ + * from frolicking through its parklike serenity. +:*/ #include "lg.h" /*H:600 @@ -41,10 +43,12 @@ * begin. */ -/* There are several entries we don't let the Guest set. The TSS entry is the +/* + * There are several entries we don't let the Guest set. The TSS entry is the * "Task State Segment" which controls all kinds of delicate things. The * LGUEST_CS and LGUEST_DS entries are reserved for the Switcher, and the - * the Guest can't be trusted to deal with double faults. */ + * the Guest can't be trusted to deal with double faults. + */ static bool ignored_gdt(unsigned int num) { return (num == GDT_ENTRY_TSS @@ -53,42 +57,52 @@ static bool ignored_gdt(unsigned int num) || num == GDT_ENTRY_DOUBLEFAULT_TSS); } -/*H:630 Once the Guest gave us new GDT entries, we fix them up a little. We +/*H:630 + * Once the Guest gave us new GDT entries, we fix them up a little. We * don't care if they're invalid: the worst that can happen is a General * Protection Fault in the Switcher when it restores a Guest segment register * which tries to use that entry. Then we kill the Guest for causing such a - * mess: the message will be "unhandled trap 256". */ + * mess: the message will be "unhandled trap 256". + */ static void fixup_gdt_table(struct lg_cpu *cpu, unsigned start, unsigned end) { unsigned int i; for (i = start; i < end; i++) { - /* We never copy these ones to real GDT, so we don't care what - * they say */ + /* + * We never copy these ones to real GDT, so we don't care what + * they say + */ if (ignored_gdt(i)) continue; - /* Segment descriptors contain a privilege level: the Guest is + /* + * Segment descriptors contain a privilege level: the Guest is * sometimes careless and leaves this as 0, even though it's - * running at privilege level 1. If so, we fix it here. */ + * running at privilege level 1. If so, we fix it here. + */ if ((cpu->arch.gdt[i].b & 0x00006000) == 0) cpu->arch.gdt[i].b |= (GUEST_PL << 13); - /* Each descriptor has an "accessed" bit. If we don't set it + /* + * Each descriptor has an "accessed" bit. If we don't set it * now, the CPU will try to set it when the Guest first loads * that entry into a segment register. But the GDT isn't - * writable by the Guest, so bad things can happen. */ + * writable by the Guest, so bad things can happen. + */ cpu->arch.gdt[i].b |= 0x00000100; } } -/*H:610 Like the IDT, we never simply use the GDT the Guest gives us. We keep +/*H:610 + * Like the IDT, we never simply use the GDT the Guest gives us. We keep * a GDT for each CPU, and copy across the Guest's entries each time we want to * run the Guest on that CPU. * * This routine is called at boot or modprobe time for each CPU to set up the * constant GDT entries: the ones which are the same no matter what Guest we're - * running. */ + * running. + */ void setup_default_gdt_entries(struct lguest_ro_state *state) { struct desc_struct *gdt = state->guest_gdt; @@ -98,30 +112,37 @@ void setup_default_gdt_entries(struct lguest_ro_state *state) gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; - /* The TSS segment refers to the TSS entry for this particular CPU. + /* + * The TSS segment refers to the TSS entry for this particular CPU. * Forgive the magic flags: the 0x8900 means the entry is Present, it's * privilege level 0 Available 386 TSS system segment, and the 0x67 - * means Saturn is eclipsed by Mercury in the twelfth house. */ + * means Saturn is eclipsed by Mercury in the twelfth house. + */ gdt[GDT_ENTRY_TSS].a = 0x00000067 | (tss << 16); gdt[GDT_ENTRY_TSS].b = 0x00008900 | (tss & 0xFF000000) | ((tss >> 16) & 0x000000FF); } -/* This routine sets up the initial Guest GDT for booting. All entries start - * as 0 (unusable). */ +/* + * This routine sets up the initial Guest GDT for booting. All entries start + * as 0 (unusable). + */ void setup_guest_gdt(struct lg_cpu *cpu) { - /* Start with full 0-4G segments... */ + /* + * Start with full 0-4G segments...except the Guest is allowed to use + * them, so set the privilege level appropriately in the flags. + */ cpu->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; cpu->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; - /* ...except the Guest is allowed to use them, so set the privilege - * level appropriately in the flags. */ cpu->arch.gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13); cpu->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); } -/*H:650 An optimization of copy_gdt(), for just the three "thead-local storage" - * entries. */ +/*H:650 + * An optimization of copy_gdt(), for just the three "thead-local storage" + * entries. + */ void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt) { unsigned int i; @@ -130,26 +151,34 @@ void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt) gdt[i] = cpu->arch.gdt[i]; } -/*H:640 When the Guest is run on a different CPU, or the GDT entries have - * changed, copy_gdt() is called to copy the Guest's GDT entries across to this - * CPU's GDT. */ +/*H:640 + * When the Guest is run on a different CPU, or the GDT entries have changed, + * copy_gdt() is called to copy the Guest's GDT entries across to this CPU's + * GDT. + */ void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt) { unsigned int i; - /* The default entries from setup_default_gdt_entries() are not - * replaced. See ignored_gdt() above. */ + /* + * The default entries from setup_default_gdt_entries() are not + * replaced. See ignored_gdt() above. + */ for (i = 0; i < GDT_ENTRIES; i++) if (!ignored_gdt(i)) gdt[i] = cpu->arch.gdt[i]; } -/*H:620 This is where the Guest asks us to load a new GDT entry - * (LHCALL_LOAD_GDT_ENTRY). We tweak the entry and copy it in. */ +/*H:620 + * This is where the Guest asks us to load a new GDT entry + * (LHCALL_LOAD_GDT_ENTRY). We tweak the entry and copy it in. + */ void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi) { - /* We assume the Guest has the same number of GDT entries as the - * Host, otherwise we'd have to dynamically allocate the Guest GDT. */ + /* + * We assume the Guest has the same number of GDT entries as the + * Host, otherwise we'd have to dynamically allocate the Guest GDT. + */ if (num >= ARRAY_SIZE(cpu->arch.gdt)) kill_guest(cpu, "too many gdt entries %i", num); @@ -157,15 +186,19 @@ void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi) cpu->arch.gdt[num].a = lo; cpu->arch.gdt[num].b = hi; fixup_gdt_table(cpu, num, num+1); - /* Mark that the GDT changed so the core knows it has to copy it again, - * even if the Guest is run on the same CPU. */ + /* + * Mark that the GDT changed so the core knows it has to copy it again, + * even if the Guest is run on the same CPU. + */ cpu->changed |= CHANGED_GDT; } -/* This is the fast-track version for just changing the three TLS entries. +/* + * This is the fast-track version for just changing the three TLS entries. * Remember that this happens on every context switch, so it's worth * optimizing. But wouldn't it be neater to have a single hypercall to cover - * both cases? */ + * both cases? + */ void guest_load_tls(struct lg_cpu *cpu, unsigned long gtls) { struct desc_struct *tls = &cpu->arch.gdt[GDT_ENTRY_TLS_MIN]; @@ -175,7 +208,6 @@ void guest_load_tls(struct lg_cpu *cpu, unsigned long gtls) /* Note that just the TLS entries have changed. */ cpu->changed |= CHANGED_GDT_TLS; } -/*:*/ /*H:660 * With this, we have finished the Host. diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index eaf722fe309a..96f7d88ec7f8 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -17,13 +17,15 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -/*P:450 This file contains the x86-specific lguest code. It used to be all +/*P:450 + * This file contains the x86-specific lguest code. It used to be all * mixed in with drivers/lguest/core.c but several foolhardy code slashers * wrestled most of the dependencies out to here in preparation for porting * lguest to other architectures (see what I mean by foolhardy?). * * This also contains a couple of non-obvious setup and teardown pieces which - * were implemented after days of debugging pain. :*/ + * were implemented after days of debugging pain. +:*/ #include #include #include @@ -82,25 +84,33 @@ static DEFINE_PER_CPU(struct lg_cpu *, last_cpu); */ static void copy_in_guest_info(struct lg_cpu *cpu, struct lguest_pages *pages) { - /* Copying all this data can be quite expensive. We usually run the + /* + * Copying all this data can be quite expensive. We usually run the * same Guest we ran last time (and that Guest hasn't run anywhere else * meanwhile). If that's not the case, we pretend everything in the - * Guest has changed. */ + * Guest has changed. + */ if (__get_cpu_var(last_cpu) != cpu || cpu->last_pages != pages) { __get_cpu_var(last_cpu) = cpu; cpu->last_pages = pages; cpu->changed = CHANGED_ALL; } - /* These copies are pretty cheap, so we do them unconditionally: */ - /* Save the current Host top-level page directory. */ + /* + * These copies are pretty cheap, so we do them unconditionally: */ + /* Save the current Host top-level page directory. + */ pages->state.host_cr3 = __pa(current->mm->pgd); - /* Set up the Guest's page tables to see this CPU's pages (and no - * other CPU's pages). */ + /* + * Set up the Guest's page tables to see this CPU's pages (and no + * other CPU's pages). + */ map_switcher_in_guest(cpu, pages); - /* Set up the two "TSS" members which tell the CPU what stack to use + /* + * Set up the two "TSS" members which tell the CPU what stack to use * for traps which do directly into the Guest (ie. traps at privilege - * level 1). */ + * level 1). + */ pages->state.guest_tss.sp1 = cpu->esp1; pages->state.guest_tss.ss1 = cpu->ss1; @@ -125,40 +135,53 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) /* This is a dummy value we need for GCC's sake. */ unsigned int clobber; - /* Copy the guest-specific information into this CPU's "struct - * lguest_pages". */ + /* + * Copy the guest-specific information into this CPU's "struct + * lguest_pages". + */ copy_in_guest_info(cpu, pages); - /* Set the trap number to 256 (impossible value). If we fault while + /* + * Set the trap number to 256 (impossible value). If we fault while * switching to the Guest (bad segment registers or bug), this will - * cause us to abort the Guest. */ + * cause us to abort the Guest. + */ cpu->regs->trapnum = 256; - /* Now: we push the "eflags" register on the stack, then do an "lcall". + /* + * Now: we push the "eflags" register on the stack, then do an "lcall". * This is how we change from using the kernel code segment to using * the dedicated lguest code segment, as well as jumping into the * Switcher. * * The lcall also pushes the old code segment (KERNEL_CS) onto the * stack, then the address of this call. This stack layout happens to - * exactly match the stack layout created by an interrupt... */ + * exactly match the stack layout created by an interrupt... + */ asm volatile("pushf; lcall *lguest_entry" - /* This is how we tell GCC that %eax ("a") and %ebx ("b") - * are changed by this routine. The "=" means output. */ + /* + * This is how we tell GCC that %eax ("a") and %ebx ("b") + * are changed by this routine. The "=" means output. + */ : "=a"(clobber), "=b"(clobber) - /* %eax contains the pages pointer. ("0" refers to the + /* + * %eax contains the pages pointer. ("0" refers to the * 0-th argument above, ie "a"). %ebx contains the * physical address of the Guest's top-level page - * directory. */ + * directory. + */ : "0"(pages), "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir)) - /* We tell gcc that all these registers could change, + /* + * We tell gcc that all these registers could change, * which means we don't have to save and restore them in - * the Switcher. */ + * the Switcher. + */ : "memory", "%edx", "%ecx", "%edi", "%esi"); } /*:*/ -/*M:002 There are hooks in the scheduler which we can register to tell when we +/*M:002 + * There are hooks in the scheduler which we can register to tell when we * get kicked off the CPU (preempt_notifier_register()). This would allow us * to lazily disable SYSENTER which would regain some performance, and should * also simplify copy_in_guest_info(). Note that we'd still need to restore @@ -166,56 +189,72 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) * * We could also try using this hooks for PGE, but that might be too expensive. * - * The hooks were designed for KVM, but we can also put them to good use. :*/ + * The hooks were designed for KVM, but we can also put them to good use. +:*/ -/*H:040 This is the i386-specific code to setup and run the Guest. Interrupts - * are disabled: we own the CPU. */ +/*H:040 + * This is the i386-specific code to setup and run the Guest. Interrupts + * are disabled: we own the CPU. + */ void lguest_arch_run_guest(struct lg_cpu *cpu) { - /* Remember the awfully-named TS bit? If the Guest has asked to set it + /* + * Remember the awfully-named TS bit? If the Guest has asked to set it * we set it now, so we can trap and pass that trap to the Guest if it - * uses the FPU. */ + * uses the FPU. + */ if (cpu->ts) unlazy_fpu(current); - /* SYSENTER is an optimized way of doing system calls. We can't allow + /* + * SYSENTER is an optimized way of doing system calls. We can't allow * it because it always jumps to privilege level 0. A normal Guest * won't try it because we don't advertise it in CPUID, but a malicious * Guest (or malicious Guest userspace program) could, so we tell the - * CPU to disable it before running the Guest. */ + * CPU to disable it before running the Guest. + */ if (boot_cpu_has(X86_FEATURE_SEP)) wrmsr(MSR_IA32_SYSENTER_CS, 0, 0); - /* Now we actually run the Guest. It will return when something + /* + * Now we actually run the Guest. It will return when something * interesting happens, and we can examine its registers to see what it - * was doing. */ + * was doing. + */ run_guest_once(cpu, lguest_pages(raw_smp_processor_id())); - /* Note that the "regs" structure contains two extra entries which are + /* + * Note that the "regs" structure contains two extra entries which are * not really registers: a trap number which says what interrupt or * trap made the switcher code come back, and an error code which some - * traps set. */ + * traps set. + */ /* Restore SYSENTER if it's supposed to be on. */ if (boot_cpu_has(X86_FEATURE_SEP)) wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); - /* If the Guest page faulted, then the cr2 register will tell us the + /* + * If the Guest page faulted, then the cr2 register will tell us the * bad virtual address. We have to grab this now, because once we * re-enable interrupts an interrupt could fault and thus overwrite - * cr2, or we could even move off to a different CPU. */ + * cr2, or we could even move off to a different CPU. + */ if (cpu->regs->trapnum == 14) cpu->arch.last_pagefault = read_cr2(); - /* Similarly, if we took a trap because the Guest used the FPU, + /* + * Similarly, if we took a trap because the Guest used the FPU, * we have to restore the FPU it expects to see. * math_state_restore() may sleep and we may even move off to * a different CPU. So all the critical stuff should be done - * before this. */ + * before this. + */ else if (cpu->regs->trapnum == 7) math_state_restore(); } -/*H:130 Now we've examined the hypercall code; our Guest can make requests. +/*H:130 + * Now we've examined the hypercall code; our Guest can make requests. * Our Guest is usually so well behaved; it never tries to do things it isn't * allowed to, and uses hypercalls instead. Unfortunately, Linux's paravirtual * infrastructure isn't quite complete, because it doesn't contain replacements @@ -225,26 +264,33 @@ void lguest_arch_run_guest(struct lg_cpu *cpu) * * When the Guest uses one of these instructions, we get a trap (General * Protection Fault) and come here. We see if it's one of those troublesome - * instructions and skip over it. We return true if we did. */ + * instructions and skip over it. We return true if we did. + */ static int emulate_insn(struct lg_cpu *cpu) { u8 insn; unsigned int insnlen = 0, in = 0, shift = 0; - /* The eip contains the *virtual* address of the Guest's instruction: - * guest_pa just subtracts the Guest's page_offset. */ + /* + * The eip contains the *virtual* address of the Guest's instruction: + * guest_pa just subtracts the Guest's page_offset. + */ unsigned long physaddr = guest_pa(cpu, cpu->regs->eip); - /* This must be the Guest kernel trying to do something, not userspace! + /* + * This must be the Guest kernel trying to do something, not userspace! * The bottom two bits of the CS segment register are the privilege - * level. */ + * level. + */ if ((cpu->regs->cs & 3) != GUEST_PL) return 0; /* Decoding x86 instructions is icky. */ insn = lgread(cpu, physaddr, u8); - /* 0x66 is an "operand prefix". It means it's using the upper 16 bits - of the eax register. */ + /* + * 0x66 is an "operand prefix". It means it's using the upper 16 bits + * of the eax register. + */ if (insn == 0x66) { shift = 16; /* The instruction is 1 byte so far, read the next byte. */ @@ -252,8 +298,10 @@ static int emulate_insn(struct lg_cpu *cpu) insn = lgread(cpu, physaddr + insnlen, u8); } - /* We can ignore the lower bit for the moment and decode the 4 opcodes - * we need to emulate. */ + /* + * We can ignore the lower bit for the moment and decode the 4 opcodes + * we need to emulate. + */ switch (insn & 0xFE) { case 0xE4: /* in ,%al */ insnlen += 2; @@ -274,9 +322,11 @@ static int emulate_insn(struct lg_cpu *cpu) return 0; } - /* If it was an "IN" instruction, they expect the result to be read + /* + * If it was an "IN" instruction, they expect the result to be read * into %eax, so we change %eax. We always return all-ones, which - * traditionally means "there's nothing there". */ + * traditionally means "there's nothing there". + */ if (in) { /* Lower bit tells is whether it's a 16 or 32 bit access */ if (insn & 0x1) @@ -290,7 +340,8 @@ static int emulate_insn(struct lg_cpu *cpu) return 1; } -/* Our hypercalls mechanism used to be based on direct software interrupts. +/* + * Our hypercalls mechanism used to be based on direct software interrupts. * After Anthony's "Refactor hypercall infrastructure" kvm patch, we decided to * change over to using kvm hypercalls. * @@ -318,16 +369,20 @@ static int emulate_insn(struct lg_cpu *cpu) */ static void rewrite_hypercall(struct lg_cpu *cpu) { - /* This are the opcodes we use to patch the Guest. The opcode for "int + /* + * This are the opcodes we use to patch the Guest. The opcode for "int * $0x1f" is "0xcd 0x1f" but vmcall instruction is 3 bytes long, so we - * complete the sequence with a NOP (0x90). */ + * complete the sequence with a NOP (0x90). + */ u8 insn[3] = {0xcd, 0x1f, 0x90}; __lgwrite(cpu, guest_pa(cpu, cpu->regs->eip), insn, sizeof(insn)); - /* The above write might have caused a copy of that page to be made + /* + * The above write might have caused a copy of that page to be made * (if it was read-only). We need to make sure the Guest has * up-to-date pagetables. As this doesn't happen often, we can just - * drop them all. */ + * drop them all. + */ guest_pagetable_clear_all(cpu); } @@ -335,9 +390,11 @@ static bool is_hypercall(struct lg_cpu *cpu) { u8 insn[3]; - /* This must be the Guest kernel trying to do something. + /* + * This must be the Guest kernel trying to do something. * The bottom two bits of the CS segment register are the privilege - * level. */ + * level. + */ if ((cpu->regs->cs & 3) != GUEST_PL) return false; @@ -351,86 +408,105 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu) { switch (cpu->regs->trapnum) { case 13: /* We've intercepted a General Protection Fault. */ - /* Check if this was one of those annoying IN or OUT + /* + * Check if this was one of those annoying IN or OUT * instructions which we need to emulate. If so, we just go - * back into the Guest after we've done it. */ + * back into the Guest after we've done it. + */ if (cpu->regs->errcode == 0) { if (emulate_insn(cpu)) return; } - /* If KVM is active, the vmcall instruction triggers a - * General Protection Fault. Normally it triggers an - * invalid opcode fault (6): */ + /* + * If KVM is active, the vmcall instruction triggers a General + * Protection Fault. Normally it triggers an invalid opcode + * fault (6): + */ case 6: - /* We need to check if ring == GUEST_PL and - * faulting instruction == vmcall. */ + /* + * We need to check if ring == GUEST_PL and faulting + * instruction == vmcall. + */ if (is_hypercall(cpu)) { rewrite_hypercall(cpu); return; } break; case 14: /* We've intercepted a Page Fault. */ - /* The Guest accessed a virtual address that wasn't mapped. + /* + * The Guest accessed a virtual address that wasn't mapped. * This happens a lot: we don't actually set up most of the page * tables for the Guest at all when we start: as it runs it asks * for more and more, and we set them up as required. In this * case, we don't even tell the Guest that the fault happened. * * The errcode tells whether this was a read or a write, and - * whether kernel or userspace code. */ + * whether kernel or userspace code. + */ if (demand_page(cpu, cpu->arch.last_pagefault, cpu->regs->errcode)) return; - /* OK, it's really not there (or not OK): the Guest needs to + /* + * OK, it's really not there (or not OK): the Guest needs to * know. We write out the cr2 value so it knows where the * fault occurred. * * Note that if the Guest were really messed up, this could * happen before it's done the LHCALL_LGUEST_INIT hypercall, so - * lg->lguest_data could be NULL */ + * lg->lguest_data could be NULL + */ if (cpu->lg->lguest_data && put_user(cpu->arch.last_pagefault, &cpu->lg->lguest_data->cr2)) kill_guest(cpu, "Writing cr2"); break; case 7: /* We've intercepted a Device Not Available fault. */ - /* If the Guest doesn't want to know, we already restored the - * Floating Point Unit, so we just continue without telling - * it. */ + /* + * If the Guest doesn't want to know, we already restored the + * Floating Point Unit, so we just continue without telling it. + */ if (!cpu->ts) return; break; case 32 ... 255: - /* These values mean a real interrupt occurred, in which case + /* + * These values mean a real interrupt occurred, in which case * the Host handler has already been run. We just do a * friendly check if another process should now be run, then - * return to run the Guest again */ + * return to run the Guest again + */ cond_resched(); return; case LGUEST_TRAP_ENTRY: - /* Our 'struct hcall_args' maps directly over our regs: we set - * up the pointer now to indicate a hypercall is pending. */ + /* + * Our 'struct hcall_args' maps directly over our regs: we set + * up the pointer now to indicate a hypercall is pending. + */ cpu->hcall = (struct hcall_args *)cpu->regs; return; } /* We didn't handle the trap, so it needs to go to the Guest. */ if (!deliver_trap(cpu, cpu->regs->trapnum)) - /* If the Guest doesn't have a handler (either it hasn't + /* + * If the Guest doesn't have a handler (either it hasn't * registered any yet, or it's one of the faults we don't let - * it handle), it dies with this cryptic error message. */ + * it handle), it dies with this cryptic error message. + */ kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)", cpu->regs->trapnum, cpu->regs->eip, cpu->regs->trapnum == 14 ? cpu->arch.last_pagefault : cpu->regs->errcode); } -/* Now we can look at each of the routines this calls, in increasing order of +/* + * Now we can look at each of the routines this calls, in increasing order of * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(), * deliver_trap() and demand_page(). After all those, we'll be ready to * examine the Switcher, and our philosophical understanding of the Host/Guest - * duality will be complete. :*/ + * duality will be complete. +:*/ static void adjust_pge(void *on) { if (on) @@ -439,13 +515,16 @@ static void adjust_pge(void *on) write_cr4(read_cr4() & ~X86_CR4_PGE); } -/*H:020 Now the Switcher is mapped and every thing else is ready, we need to do - * some more i386-specific initialization. */ +/*H:020 + * Now the Switcher is mapped and every thing else is ready, we need to do + * some more i386-specific initialization. + */ void __init lguest_arch_host_init(void) { int i; - /* Most of the i386/switcher.S doesn't care that it's been moved; on + /* + * Most of the i386/switcher.S doesn't care that it's been moved; on * Intel, jumps are relative, and it doesn't access any references to * external code or data. * @@ -453,7 +532,8 @@ void __init lguest_arch_host_init(void) * addresses are placed in a table (default_idt_entries), so we need to * update the table with the new addresses. switcher_offset() is a * convenience function which returns the distance between the - * compiled-in switcher code and the high-mapped copy we just made. */ + * compiled-in switcher code and the high-mapped copy we just made. + */ for (i = 0; i < IDT_ENTRIES; i++) default_idt_entries[i] += switcher_offset(); @@ -468,63 +548,81 @@ void __init lguest_arch_host_init(void) for_each_possible_cpu(i) { /* lguest_pages() returns this CPU's two pages. */ struct lguest_pages *pages = lguest_pages(i); - /* This is a convenience pointer to make the code fit one - * statement to a line. */ + /* This is a convenience pointer to make the code neater. */ struct lguest_ro_state *state = &pages->state; - /* The Global Descriptor Table: the Host has a different one + /* + * The Global Descriptor Table: the Host has a different one * for each CPU. We keep a descriptor for the GDT which says * where it is and how big it is (the size is actually the last - * byte, not the size, hence the "-1"). */ + * byte, not the size, hence the "-1"). + */ state->host_gdt_desc.size = GDT_SIZE-1; state->host_gdt_desc.address = (long)get_cpu_gdt_table(i); - /* All CPUs on the Host use the same Interrupt Descriptor + /* + * All CPUs on the Host use the same Interrupt Descriptor * Table, so we just use store_idt(), which gets this CPU's IDT - * descriptor. */ + * descriptor. + */ store_idt(&state->host_idt_desc); - /* The descriptors for the Guest's GDT and IDT can be filled + /* + * The descriptors for the Guest's GDT and IDT can be filled * out now, too. We copy the GDT & IDT into ->guest_gdt and - * ->guest_idt before actually running the Guest. */ + * ->guest_idt before actually running the Guest. + */ state->guest_idt_desc.size = sizeof(state->guest_idt)-1; state->guest_idt_desc.address = (long)&state->guest_idt; state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1; state->guest_gdt_desc.address = (long)&state->guest_gdt; - /* We know where we want the stack to be when the Guest enters + /* + * We know where we want the stack to be when the Guest enters * the Switcher: in pages->regs. The stack grows upwards, so - * we start it at the end of that structure. */ + * we start it at the end of that structure. + */ state->guest_tss.sp0 = (long)(&pages->regs + 1); - /* And this is the GDT entry to use for the stack: we keep a - * couple of special LGUEST entries. */ + /* + * And this is the GDT entry to use for the stack: we keep a + * couple of special LGUEST entries. + */ state->guest_tss.ss0 = LGUEST_DS; - /* x86 can have a finegrained bitmap which indicates what I/O + /* + * x86 can have a finegrained bitmap which indicates what I/O * ports the process can use. We set it to the end of our - * structure, meaning "none". */ + * structure, meaning "none". + */ state->guest_tss.io_bitmap_base = sizeof(state->guest_tss); - /* Some GDT entries are the same across all Guests, so we can - * set them up now. */ + /* + * Some GDT entries are the same across all Guests, so we can + * set them up now. + */ setup_default_gdt_entries(state); /* Most IDT entries are the same for all Guests, too.*/ setup_default_idt_entries(state, default_idt_entries); - /* The Host needs to be able to use the LGUEST segments on this - * CPU, too, so put them in the Host GDT. */ + /* + * The Host needs to be able to use the LGUEST segments on this + * CPU, too, so put them in the Host GDT. + */ get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; } - /* In the Switcher, we want the %cs segment register to use the + /* + * In the Switcher, we want the %cs segment register to use the * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so * it will be undisturbed when we switch. To change %cs and jump we - * need this structure to feed to Intel's "lcall" instruction. */ + * need this structure to feed to Intel's "lcall" instruction. + */ lguest_entry.offset = (long)switch_to_guest + switcher_offset(); lguest_entry.segment = LGUEST_CS; - /* Finally, we need to turn off "Page Global Enable". PGE is an + /* + * Finally, we need to turn off "Page Global Enable". PGE is an * optimization where page table entries are specially marked to show * they never change. The Host kernel marks all the kernel pages this * way because it's always present, even when userspace is running. @@ -534,16 +632,21 @@ void __init lguest_arch_host_init(void) * you'll get really weird bugs that you'll chase for two days. * * I used to turn PGE off every time we switched to the Guest and back - * on when we return, but that slowed the Switcher down noticibly. */ + * on when we return, but that slowed the Switcher down noticibly. + */ - /* We don't need the complexity of CPUs coming and going while we're - * doing this. */ + /* + * We don't need the complexity of CPUs coming and going while we're + * doing this. + */ get_online_cpus(); if (cpu_has_pge) { /* We have a broader idea of "global". */ /* Remember that this was originally set (for cleanup). */ cpu_had_pge = 1; - /* adjust_pge is a helper function which sets or unsets the PGE - * bit on its CPU, depending on the argument (0 == unset). */ + /* + * adjust_pge is a helper function which sets or unsets the PGE + * bit on its CPU, depending on the argument (0 == unset). + */ on_each_cpu(adjust_pge, (void *)0, 1); /* Turn off the feature in the global feature set. */ clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE); @@ -590,26 +693,32 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu) { u32 tsc_speed; - /* The pointer to the Guest's "struct lguest_data" is the only argument. - * We check that address now. */ + /* + * The pointer to the Guest's "struct lguest_data" is the only argument. + * We check that address now. + */ if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1, sizeof(*cpu->lg->lguest_data))) return -EFAULT; - /* Having checked it, we simply set lg->lguest_data to point straight + /* + * Having checked it, we simply set lg->lguest_data to point straight * into the Launcher's memory at the right place and then use * copy_to_user/from_user from now on, instead of lgread/write. I put * this in to show that I'm not immune to writing stupid - * optimizations. */ + * optimizations. + */ cpu->lg->lguest_data = cpu->lg->mem_base + cpu->hcall->arg1; - /* We insist that the Time Stamp Counter exist and doesn't change with + /* + * We insist that the Time Stamp Counter exist and doesn't change with * cpu frequency. Some devious chip manufacturers decided that TSC * changes could be handled in software. I decided that time going * backwards might be good for benchmarks, but it's bad for users. * * We also insist that the TSC be stable: the kernel detects unreliable - * TSCs for its own purposes, and we use that here. */ + * TSCs for its own purposes, and we use that here. + */ if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable()) tsc_speed = tsc_khz; else @@ -625,38 +734,47 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu) } /*:*/ -/*L:030 lguest_arch_setup_regs() +/*L:030 + * lguest_arch_setup_regs() * * Most of the Guest's registers are left alone: we used get_zeroed_page() to - * allocate the structure, so they will be 0. */ + * allocate the structure, so they will be 0. + */ void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start) { struct lguest_regs *regs = cpu->regs; - /* There are four "segment" registers which the Guest needs to boot: + /* + * There are four "segment" registers which the Guest needs to boot: * The "code segment" register (cs) refers to the kernel code segment * __KERNEL_CS, and the "data", "extra" and "stack" segment registers * refer to the kernel data segment __KERNEL_DS. * * The privilege level is packed into the lower bits. The Guest runs - * at privilege level 1 (GUEST_PL).*/ + * at privilege level 1 (GUEST_PL). + */ regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL; regs->cs = __KERNEL_CS|GUEST_PL; - /* The "eflags" register contains miscellaneous flags. Bit 1 (0x002) + /* + * The "eflags" register contains miscellaneous flags. Bit 1 (0x002) * is supposed to always be "1". Bit 9 (0x200) controls whether * interrupts are enabled. We always leave interrupts enabled while - * running the Guest. */ + * running the Guest. + */ regs->eflags = X86_EFLAGS_IF | 0x2; - /* The "Extended Instruction Pointer" register says where the Guest is - * running. */ + /* + * The "Extended Instruction Pointer" register says where the Guest is + * running. + */ regs->eip = start; - /* %esi points to our boot information, at physical address 0, so don't - * touch it. */ + /* + * %esi points to our boot information, at physical address 0, so don't + * touch it. + */ - /* There are a couple of GDT entries the Guest expects when first - * booting. */ + /* There are a couple of GDT entries the Guest expects at boot. */ setup_guest_gdt(cpu); } diff --git a/drivers/lguest/x86/switcher_32.S b/drivers/lguest/x86/switcher_32.S index 3fc15318a80f..6dec09793836 100644 --- a/drivers/lguest/x86/switcher_32.S +++ b/drivers/lguest/x86/switcher_32.S @@ -1,12 +1,15 @@ -/*P:900 This is the Switcher: code which sits at 0xFFC00000 astride both the +/*P:900 + * This is the Switcher: code which sits at 0xFFC00000 astride both the * Host and Guest to do the low-level Guest<->Host switch. It is as simple as * it can be made, but it's naturally very specific to x86. * * You have now completed Preparation. If this has whet your appetite; if you * are feeling invigorated and refreshed then the next, more challenging stage - * can be found in "make Guest". :*/ + * can be found in "make Guest". + :*/ -/*M:012 Lguest is meant to be simple: my rule of thumb is that 1% more LOC must +/*M:012 + * Lguest is meant to be simple: my rule of thumb is that 1% more LOC must * gain at least 1% more performance. Since neither LOC nor performance can be * measured beforehand, it generally means implementing a feature then deciding * if it's worth it. And once it's implemented, who can say no? @@ -31,11 +34,14 @@ * Host (which is actually really easy). * * Two questions remain. Would the performance gain outweigh the complexity? - * And who would write the verse documenting it? :*/ + * And who would write the verse documenting it? +:*/ -/*M:011 Lguest64 handles NMI. This gave me NMI envy (until I looked at their +/*M:011 + * Lguest64 handles NMI. This gave me NMI envy (until I looked at their * code). It's worth doing though, since it would let us use oprofile in the - * Host when a Guest is running. :*/ + * Host when a Guest is running. +:*/ /*S:100 * Welcome to the Switcher itself! diff --git a/include/linux/lguest.h b/include/linux/lguest.h index dbf2479e808e..0a3a11afd64b 100644 --- a/include/linux/lguest.h +++ b/include/linux/lguest.h @@ -1,5 +1,7 @@ -/* Things the lguest guest needs to know. Note: like all lguest interfaces, - * this is subject to wild and random change between versions. */ +/* + * Things the lguest guest needs to know. Note: like all lguest interfaces, + * this is subject to wild and random change between versions. + */ #ifndef _LINUX_LGUEST_H #define _LINUX_LGUEST_H @@ -11,32 +13,42 @@ #define LG_CLOCK_MIN_DELTA 100UL #define LG_CLOCK_MAX_DELTA ULONG_MAX -/*G:031 The second method of communicating with the Host is to via "struct +/*G:031 + * The second method of communicating with the Host is to via "struct * lguest_data". Once the Guest's initialization hypercall tells the Host where - * this is, the Guest and Host both publish information in it. :*/ + * this is, the Guest and Host both publish information in it. +:*/ struct lguest_data { - /* 512 == enabled (same as eflags in normal hardware). The Guest - * changes interrupts so often that a hypercall is too slow. */ + /* + * 512 == enabled (same as eflags in normal hardware). The Guest + * changes interrupts so often that a hypercall is too slow. + */ unsigned int irq_enabled; /* Fine-grained interrupt disabling by the Guest */ DECLARE_BITMAP(blocked_interrupts, LGUEST_IRQS); - /* The Host writes the virtual address of the last page fault here, + /* + * The Host writes the virtual address of the last page fault here, * which saves the Guest a hypercall. CR2 is the native register where - * this address would normally be found. */ + * this address would normally be found. + */ unsigned long cr2; /* Wallclock time set by the Host. */ struct timespec time; - /* Interrupt pending set by the Host. The Guest should do a hypercall - * if it re-enables interrupts and sees this set (to X86_EFLAGS_IF). */ + /* + * Interrupt pending set by the Host. The Guest should do a hypercall + * if it re-enables interrupts and sees this set (to X86_EFLAGS_IF). + */ int irq_pending; - /* Async hypercall ring. Instead of directly making hypercalls, we can + /* + * Async hypercall ring. Instead of directly making hypercalls, we can * place them in here for processing the next time the Host wants. - * This batching can be quite efficient. */ + * This batching can be quite efficient. + */ /* 0xFF == done (set by Host), 0 == pending (set by Guest). */ u8 hcall_status[LHCALL_RING_SIZE]; diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h index bfefbdf7498a..495203ff221c 100644 --- a/include/linux/lguest_launcher.h +++ b/include/linux/lguest_launcher.h @@ -29,8 +29,10 @@ struct lguest_device_desc { __u8 type; /* The number of virtqueues (first in config array) */ __u8 num_vq; - /* The number of bytes of feature bits. Multiply by 2: one for host - * features and one for Guest acknowledgements. */ + /* + * The number of bytes of feature bits. Multiply by 2: one for host + * features and one for Guest acknowledgements. + */ __u8 feature_len; /* The number of bytes of the config array after virtqueues. */ __u8 config_len; @@ -39,8 +41,10 @@ struct lguest_device_desc { __u8 config[0]; }; -/*D:135 This is how we expect the device configuration field for a virtqueue - * to be laid out in config space. */ +/*D:135 + * This is how we expect the device configuration field for a virtqueue + * to be laid out in config space. + */ struct lguest_vqconfig { /* The number of entries in the virtio_ring */ __u16 num; @@ -61,7 +65,9 @@ enum lguest_req LHREQ_EVENTFD, /* + address, fd. */ }; -/* The alignment to use between consumer and producer parts of vring. - * x86 pagesize for historical reasons. */ +/* + * The alignment to use between consumer and producer parts of vring. + * x86 pagesize for historical reasons. + */ #define LGUEST_VRING_ALIGN 4096 #endif /* _LINUX_LGUEST_LAUNCHER */ -- cgit v1.2.3-59-g8ed1b From 1842f23c05b6a866be831aa60bc8a8731c58ddd0 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 30 Jul 2009 16:03:46 -0600 Subject: lguest and virtio: cleanup struct definitions to Linux style. I've been doing this for years, and akpm picked me up on it about 12 months ago. lguest partly serves as example code, so let's do it Right. Also, remove two unused fields in struct vblk_info in the example launcher. Signed-off-by: Rusty Russell Cc: Ingo Molnar --- Documentation/lguest/lguest.c | 21 +++++---------------- drivers/lguest/lg.h | 9 +++------ drivers/lguest/lguest_device.c | 3 +-- include/linux/lguest.h | 3 +-- include/linux/virtio_blk.h | 6 ++---- include/linux/virtio_config.h | 3 +-- include/linux/virtio_net.h | 6 ++---- include/linux/virtio_ring.h | 12 ++++-------- 8 files changed, 19 insertions(+), 44 deletions(-) (limited to 'include') diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 45163651b519..950cde6d6e58 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -93,8 +93,7 @@ static int lguest_fd; static unsigned int __thread cpu_id; /* This is our list of devices. */ -struct device_list -{ +struct device_list { /* Counter to assign interrupt numbers. */ unsigned int next_irq; @@ -114,8 +113,7 @@ struct device_list static struct device_list devices; /* The device structure describes a single device. */ -struct device -{ +struct device { /* The linked-list pointer. */ struct device *next; @@ -140,8 +138,7 @@ struct device }; /* The virtqueue structure describes a queue attached to a device. */ -struct virtqueue -{ +struct virtqueue { struct virtqueue *next; /* Which device owns me. */ @@ -779,8 +776,7 @@ static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len) * * We associate some data with the console for our exit hack. */ -struct console_abort -{ +struct console_abort { /* How many times have they hit ^C? */ int count; /* When did they start? */ @@ -1570,20 +1566,13 @@ static void setup_tun_net(char *arg) /*:*/ /* This hangs off device->priv. */ -struct vblk_info -{ +struct vblk_info { /* The size of the file. */ off64_t len; /* The file descriptor for the file. */ int fd; - /* IO thread listens on this file descriptor [0]. */ - int workpipe[2]; - - /* IO thread writes to this file descriptor to mark it done, then - * Launcher triggers interrupt to Guest. */ - int done_fd; }; /*L:210 diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 74c0db691b53..bc28745d05af 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -16,15 +16,13 @@ void free_pagetables(void); int init_pagetables(struct page **switcher_page, unsigned int pages); -struct pgdir -{ +struct pgdir { unsigned long gpgdir; pgd_t *pgdir; }; /* We have two pages shared with guests, per cpu. */ -struct lguest_pages -{ +struct lguest_pages { /* This is the stack page mapped rw in guest */ char spare[PAGE_SIZE - sizeof(struct lguest_regs)]; struct lguest_regs regs; @@ -89,8 +87,7 @@ struct lg_eventfd_map { }; /* The private info the thread maintains about the guest. */ -struct lguest -{ +struct lguest { struct lguest_data __user *lguest_data; struct lg_cpu cpus[NR_CPUS]; unsigned int nr_cpus; diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c index 1401c1ace1ec..b6200bc39b58 100644 --- a/drivers/lguest/lguest_device.c +++ b/drivers/lguest/lguest_device.c @@ -207,8 +207,7 @@ static void lg_reset(struct virtio_device *vdev) */ /*D:140 This is the information we remember about each virtqueue. */ -struct lguest_vq_info -{ +struct lguest_vq_info { /* A copy of the information contained in the device config. */ struct lguest_vqconfig config; diff --git a/include/linux/lguest.h b/include/linux/lguest.h index 0a3a11afd64b..2fb1dcbcb5aa 100644 --- a/include/linux/lguest.h +++ b/include/linux/lguest.h @@ -18,8 +18,7 @@ * lguest_data". Once the Guest's initialization hypercall tells the Host where * this is, the Guest and Host both publish information in it. :*/ -struct lguest_data -{ +struct lguest_data { /* * 512 == enabled (same as eflags in normal hardware). The Guest * changes interrupts so often that a hypercall is too slow. diff --git a/include/linux/virtio_blk.h b/include/linux/virtio_blk.h index be7d255fc7cf..8dab9f2b8832 100644 --- a/include/linux/virtio_blk.h +++ b/include/linux/virtio_blk.h @@ -20,8 +20,7 @@ #define VIRTIO_BLK_ID_BYTES (sizeof(__u16[256])) /* IDENTIFY DATA */ -struct virtio_blk_config -{ +struct virtio_blk_config { /* The capacity (in 512-byte sectors). */ __u64 capacity; /* The maximum segment size (if VIRTIO_BLK_F_SIZE_MAX) */ @@ -50,8 +49,7 @@ struct virtio_blk_config #define VIRTIO_BLK_T_BARRIER 0x80000000 /* This is the first element of the read scatter-gather list. */ -struct virtio_blk_outhdr -{ +struct virtio_blk_outhdr { /* VIRTIO_BLK_T* */ __u32 type; /* io priority. */ diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 99f514575f6a..e547e3c8ee9a 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -79,8 +79,7 @@ * the dev->feature bits if it wants. */ typedef void vq_callback_t(struct virtqueue *); -struct virtio_config_ops -{ +struct virtio_config_ops { void (*get)(struct virtio_device *vdev, unsigned offset, void *buf, unsigned len); void (*set)(struct virtio_device *vdev, unsigned offset, diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 9c543d6ac535..d8dd539c9f48 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -31,8 +31,7 @@ #define VIRTIO_NET_S_LINK_UP 1 /* Link is up */ -struct virtio_net_config -{ +struct virtio_net_config { /* The config defining mac address (if VIRTIO_NET_F_MAC) */ __u8 mac[6]; /* See VIRTIO_NET_F_STATUS and VIRTIO_NET_S_* above */ @@ -41,8 +40,7 @@ struct virtio_net_config /* This is the first element of the scatter-gather list. If you don't * specify GSO or CSUM features, you can simply ignore the header. */ -struct virtio_net_hdr -{ +struct virtio_net_hdr { #define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 // Use csum_start, csum_offset __u8 flags; #define VIRTIO_NET_HDR_GSO_NONE 0 // Not a GSO frame diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index 693e0ec5afa6..e4d144b132b5 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -30,8 +30,7 @@ #define VIRTIO_RING_F_INDIRECT_DESC 28 /* Virtio ring descriptors: 16 bytes. These can chain together via "next". */ -struct vring_desc -{ +struct vring_desc { /* Address (guest-physical). */ __u64 addr; /* Length. */ @@ -42,24 +41,21 @@ struct vring_desc __u16 next; }; -struct vring_avail -{ +struct vring_avail { __u16 flags; __u16 idx; __u16 ring[]; }; /* u32 is used here for ids for padding reasons. */ -struct vring_used_elem -{ +struct vring_used_elem { /* Index of start of used descriptor chain. */ __u32 id; /* Total length of the descriptor chain which was used (written to) */ __u32 len; }; -struct vring_used -{ +struct vring_used { __u16 flags; __u16 idx; struct vring_used_elem ring[]; -- cgit v1.2.3-59-g8ed1b From cbb4f2646d77b536ed2b1500ef6641083228ed8f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 31 Jul 2009 08:55:48 +0200 Subject: io context: fix ref counting Commit d9c7d394a8ebacb60097b192939ae9f15235225e ("block: prevent possible io_context->refcount overflow") mistakenly changed atomic_inc(&ioc->nr_tasks) to atomic_long_inc(&ioc->refcount). Signed-off-by: Li Zefan Acked-by: Nikanth Karthikesan Signed-off-by: Jens Axboe --- include/linux/iocontext.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index dd05434fa45f..4da4a75c3f1e 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -92,7 +92,7 @@ static inline struct io_context *ioc_task_link(struct io_context *ioc) * a race). */ if (ioc && atomic_long_inc_not_zero(&ioc->refcount)) { - atomic_long_inc(&ioc->refcount); + atomic_inc(&ioc->nr_tasks); return ioc; } -- cgit v1.2.3-59-g8ed1b From 6de7e356faf54aa75de5b624bbce28a5b776dfa8 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 18 Jun 2009 10:19:12 +0200 Subject: lib/scatterlist: add a flags to signalize mapping direction sg_miter_start() is currently unaware of the direction of the copy process (to or from the scatter list). It is important to know the direction because the page has to be flushed in case the data written is seen on a different mapping in user land on cache incoherent architectures. Signed-off-by: Sebastian Andrzej Siewior Acked-by: FUJITA Tomonori Acked-by: Tejun Heo Signed-off-by: Pierre Ossman --- include/linux/scatterlist.h | 2 ++ lib/scatterlist.c | 16 ++++++++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/scatterlist.h b/include/linux/scatterlist.h index e5996984ddd0..9aaf5bfdad1a 100644 --- a/include/linux/scatterlist.h +++ b/include/linux/scatterlist.h @@ -242,6 +242,8 @@ size_t sg_copy_to_buffer(struct scatterlist *sgl, unsigned int nents, */ #define SG_MITER_ATOMIC (1 << 0) /* use kmap_atomic */ +#define SG_MITER_TO_SG (1 << 1) /* flush back to phys on unmap */ +#define SG_MITER_FROM_SG (1 << 2) /* nop */ struct sg_mapping_iter { /* the following three fields can be accessed directly */ diff --git a/lib/scatterlist.c b/lib/scatterlist.c index a295e404e908..0d475d8167bf 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -314,6 +314,7 @@ void sg_miter_start(struct sg_mapping_iter *miter, struct scatterlist *sgl, miter->__sg = sgl; miter->__nents = nents; miter->__offset = 0; + WARN_ON(!(flags & (SG_MITER_TO_SG | SG_MITER_FROM_SG))); miter->__flags = flags; } EXPORT_SYMBOL(sg_miter_start); @@ -394,6 +395,9 @@ void sg_miter_stop(struct sg_mapping_iter *miter) if (miter->addr) { miter->__offset += miter->consumed; + if (miter->__flags & SG_MITER_TO_SG) + flush_kernel_dcache_page(miter->page); + if (miter->__flags & SG_MITER_ATOMIC) { WARN_ON(!irqs_disabled()); kunmap_atomic(miter->addr, KM_BIO_SRC_IRQ); @@ -426,8 +430,14 @@ static size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, unsigned int offset = 0; struct sg_mapping_iter miter; unsigned long flags; + unsigned int sg_flags = SG_MITER_ATOMIC; + + if (to_buffer) + sg_flags |= SG_MITER_FROM_SG; + else + sg_flags |= SG_MITER_TO_SG; - sg_miter_start(&miter, sgl, nents, SG_MITER_ATOMIC); + sg_miter_start(&miter, sgl, nents, sg_flags); local_irq_save(flags); @@ -438,10 +448,8 @@ static size_t sg_copy_buffer(struct scatterlist *sgl, unsigned int nents, if (to_buffer) memcpy(buf + offset, miter.addr, len); - else { + else memcpy(miter.addr, buf + offset, len); - flush_kernel_dcache_page(miter.page); - } offset += len; } -- cgit v1.2.3-59-g8ed1b From 4b2a108cd0d34880fe9d932258ca5b2ccebcd05e Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 22 Jun 2009 09:18:05 +0200 Subject: cb710: use SG_MITER_TO_SG/SG_MITER_FROM_SG MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit the code allready uses flush_kernel_dcache_page(). This patch updates the driver to the recent sg API changes which require that either SG_MITER_TO_SG or SG_MITER_FROM_SG is set. SG_MITER_TO_SG calls flush_kernel_dcache_page() in sg_mitter_stop() Signed-off-by: Sebastian Andrzej Siewior Acked-by: MichaÅ‚ MirosÅ‚aw Signed-off-by: Pierre Ossman --- drivers/misc/cb710/sgbuf2.c | 4 ---- drivers/mmc/host/cb710-mmc.c | 6 +++--- include/linux/cb710.h | 29 +++-------------------------- 3 files changed, 6 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/drivers/misc/cb710/sgbuf2.c b/drivers/misc/cb710/sgbuf2.c index d38a7acdb6ec..d019746551f3 100644 --- a/drivers/misc/cb710/sgbuf2.c +++ b/drivers/misc/cb710/sgbuf2.c @@ -114,7 +114,6 @@ static void sg_dwiter_write_slow(struct sg_mapping_iter *miter, uint32_t data) if (!left) return; addr += len; - flush_kernel_dcache_page(miter->page); } while (sg_dwiter_next(miter)); } @@ -142,9 +141,6 @@ void cb710_sg_dwiter_write_next_block(struct sg_mapping_iter *miter, uint32_t da return; } else sg_dwiter_write_slow(miter, data); - - if (miter->length == miter->consumed) - flush_kernel_dcache_page(miter->page); } EXPORT_SYMBOL_GPL(cb710_sg_dwiter_write_next_block); diff --git a/drivers/mmc/host/cb710-mmc.c b/drivers/mmc/host/cb710-mmc.c index 11efefb1af51..4e72964a7b43 100644 --- a/drivers/mmc/host/cb710-mmc.c +++ b/drivers/mmc/host/cb710-mmc.c @@ -278,7 +278,7 @@ static int cb710_mmc_receive(struct cb710_slot *slot, struct mmc_data *data) if (unlikely(data->blksz & 15 && (data->blocks != 1 || data->blksz != 8))) return -EINVAL; - sg_miter_start(&miter, data->sg, data->sg_len, 0); + sg_miter_start(&miter, data->sg, data->sg_len, SG_MITER_TO_SG); cb710_modify_port_8(slot, CB710_MMC_CONFIG2_PORT, 15, CB710_MMC_C2_READ_PIO_SIZE_MASK); @@ -307,7 +307,7 @@ static int cb710_mmc_receive(struct cb710_slot *slot, struct mmc_data *data) goto out; } out: - cb710_sg_miter_stop_writing(&miter); + sg_miter_stop(&miter); return err; } @@ -322,7 +322,7 @@ static int cb710_mmc_send(struct cb710_slot *slot, struct mmc_data *data) if (unlikely(data->blocks > 1 && data->blksz & 15)) return -EINVAL; - sg_miter_start(&miter, data->sg, data->sg_len, 0); + sg_miter_start(&miter, data->sg, data->sg_len, SG_MITER_FROM_SG); cb710_modify_port_8(slot, CB710_MMC_CONFIG2_PORT, 0, CB710_MMC_C2_READ_PIO_SIZE_MASK); diff --git a/include/linux/cb710.h b/include/linux/cb710.h index 63bc9a4d2926..8cc10411bab2 100644 --- a/include/linux/cb710.h +++ b/include/linux/cb710.h @@ -140,29 +140,6 @@ void cb710_dump_regs(struct cb710_chip *chip, unsigned dump); #include #include -/** - * cb710_sg_miter_stop_writing - stop mapping iteration after writing - * @miter: sg mapping iter to be stopped - * - * Description: - * Stops mapping iterator @miter. @miter should have been started - * started using sg_miter_start(). A stopped iteration can be - * resumed by calling sg_miter_next() on it. This is useful when - * resources (kmap) need to be released during iteration. - * - * This is a convenience wrapper that will be optimized out for arches - * that don't need flush_kernel_dcache_page(). - * - * Context: - * IRQ disabled if the SG_MITER_ATOMIC is set. Don't care otherwise. - */ -static inline void cb710_sg_miter_stop_writing(struct sg_mapping_iter *miter) -{ - if (miter->page) - flush_kernel_dcache_page(miter->page); - sg_miter_stop(miter); -} - /* * 32-bit PIO mapping sg iterator * @@ -171,12 +148,12 @@ static inline void cb710_sg_miter_stop_writing(struct sg_mapping_iter *miter) * without DMA support). * * Best-case reading (transfer from device): - * sg_miter_start(); + * sg_miter_start(, SG_MITER_TO_SG); * cb710_sg_dwiter_write_from_io(); - * cb710_sg_miter_stop_writing(); + * sg_miter_stop(); * * Best-case writing (transfer to device): - * sg_miter_start(); + * sg_miter_start(, SG_MITER_FROM_SG); * cb710_sg_dwiter_read_to_io(); * sg_miter_stop(); */ -- cgit v1.2.3-59-g8ed1b From c7121843685de2bf7f3afd3ae1d6a146010bf1fc Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 28 Jul 2009 14:09:55 -0700 Subject: clocksource: Save mult_orig in clocksource_disable() To fix the common case where ->enable() does not set up mult, make sure mult_orig is saved in mult on disable. Also add comments to explain why we do this. Signed-off-by: Magnus Damm Cc: johnstul@us.ibm.com Cc: lethal@linux-sh.org Cc: akpm@linux-foundation.org LKML-Reference: <20090618152432.10136.9932.sendpatchset@rx1.opensource.se> Signed-off-by: Thomas Gleixner --- include/linux/clocksource.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index c56457c8334e..1219be4fb42e 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -293,7 +293,12 @@ static inline int clocksource_enable(struct clocksource *cs) if (cs->enable) ret = cs->enable(cs); - /* save mult_orig on enable */ + /* + * The frequency may have changed while the clocksource + * was disabled. If so the code in ->enable() must update + * the mult value to reflect the new frequency. Make sure + * mult_orig follows this change. + */ cs->mult_orig = cs->mult; return ret; @@ -309,6 +314,13 @@ static inline int clocksource_enable(struct clocksource *cs) */ static inline void clocksource_disable(struct clocksource *cs) { + /* + * Save mult_orig in mult so clocksource_enable() can + * restore the value regardless if ->enable() updates + * the value of mult or not. + */ + cs->mult = cs->mult_orig; + if (cs->disable) cs->disable(cs); } -- cgit v1.2.3-59-g8ed1b From 7c958e32649e0c35801762878fb0b6da8c55a515 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 31 Jul 2009 11:49:11 -0400 Subject: block: Add a wrapper for setting minimum request size without a queue Introduce blk_limits_io_min() and make blk_queue_io_min() call it. Signed-off-by: Mike Snitzer Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-settings.c | 31 ++++++++++++++++++++++++------- include/linux/blkdev.h | 1 + 2 files changed, 25 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/block/blk-settings.c b/block/blk-settings.c index 8e86e2d2b147..1f7197434166 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -383,6 +383,29 @@ void blk_queue_alignment_offset(struct request_queue *q, unsigned int offset) } EXPORT_SYMBOL(blk_queue_alignment_offset); +/** + * blk_limits_io_min - set minimum request size for a device + * @limits: the queue limits + * @min: smallest I/O size in bytes + * + * Description: + * Some devices have an internal block size bigger than the reported + * hardware sector size. This function can be used to signal the + * smallest I/O the device can perform without incurring a performance + * penalty. + */ +void blk_limits_io_min(struct queue_limits *limits, unsigned int min) +{ + limits->io_min = min; + + if (limits->io_min < limits->logical_block_size) + limits->io_min = limits->logical_block_size; + + if (limits->io_min < limits->physical_block_size) + limits->io_min = limits->physical_block_size; +} +EXPORT_SYMBOL(blk_limits_io_min); + /** * blk_queue_io_min - set minimum request size for the queue * @q: the request queue for the device @@ -396,13 +419,7 @@ EXPORT_SYMBOL(blk_queue_alignment_offset); */ void blk_queue_io_min(struct request_queue *q, unsigned int min) { - q->limits.io_min = min; - - if (q->limits.io_min < q->limits.logical_block_size) - q->limits.io_min = q->limits.logical_block_size; - - if (q->limits.io_min < q->limits.physical_block_size) - q->limits.io_min = q->limits.physical_block_size; + blk_limits_io_min(&q->limits, min); } EXPORT_SYMBOL(blk_queue_io_min); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e7cb5dbf6c26..69103e053c92 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -913,6 +913,7 @@ extern void blk_queue_logical_block_size(struct request_queue *, unsigned short) extern void blk_queue_physical_block_size(struct request_queue *, unsigned short); extern void blk_queue_alignment_offset(struct request_queue *q, unsigned int alignment); +extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min); extern void blk_queue_io_min(struct request_queue *q, unsigned int min); extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt); extern void blk_set_default_limits(struct queue_limits *lim); -- cgit v1.2.3-59-g8ed1b From 9f498cc5be7e013d8d6e4c616980ed0ffc8680d2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 23 Jul 2009 14:46:33 +0200 Subject: perf_counter: Full task tracing In order to be able to distinguish between no samples due to inactivity and no samples due to task ended, Arjan asked for PERF_EVENT_EXIT events. This is useful to the boot delay instrumentation (bootchart) app. This patch changes the PERF_EVENT_FORK to be emitted on every clone, and adds PERF_EVENT_EXIT to be emitted on task exit, after the task's counters have been closed. This task tracing is controlled through: attr.comm || attr.mmap and through the new attr.task field. Suggested-by: Arjan van de Ven Cc: Paul Mackerras Cc: Anton Blanchard Signed-off-by: Peter Zijlstra [ cleaned up perf_counter.h a bit ] Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 13 ++++++- kernel/fork.c | 4 +- kernel/perf_counter.c | 87 +++++++++++++++++++++++++++++--------------- 3 files changed, 71 insertions(+), 33 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index bd15d7a5f5ce..e604e6ef72dd 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -181,8 +181,9 @@ struct perf_counter_attr { freq : 1, /* use freq, not period */ inherit_stat : 1, /* per task counts */ enable_on_exec : 1, /* next exec enables */ + task : 1, /* trace fork/exit */ - __reserved_1 : 51; + __reserved_1 : 50; __u32 wakeup_events; /* wakeup every n events */ __u32 __reserved_2; @@ -308,6 +309,15 @@ enum perf_event_type { */ PERF_EVENT_COMM = 3, + /* + * struct { + * struct perf_event_header header; + * u32 pid, ppid; + * u32 tid, ptid; + * }; + */ + PERF_EVENT_EXIT = 4, + /* * struct { * struct perf_event_header header; @@ -323,6 +333,7 @@ enum perf_event_type { * struct { * struct perf_event_header header; * u32 pid, ppid; + * u32 tid, ptid; * }; */ PERF_EVENT_FORK = 7, diff --git a/kernel/fork.c b/kernel/fork.c index 29b532e718f7..466531eb92cc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1269,6 +1269,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, write_unlock_irq(&tasklist_lock); proc_fork_connector(p); cgroup_post_fork(p); + perf_counter_fork(p); return p; bad_fork_free_pid: @@ -1410,9 +1411,6 @@ long do_fork(unsigned long clone_flags, init_completion(&vfork); } - if (!(clone_flags & CLONE_THREAD)) - perf_counter_fork(p); - audit_finish_fork(p); tracehook_report_clone(regs, clone_flags, nr, p); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 48471d75ae01..199ed4771315 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -42,6 +42,7 @@ static int perf_overcommit __read_mostly = 1; static atomic_t nr_counters __read_mostly; static atomic_t nr_mmap_counters __read_mostly; static atomic_t nr_comm_counters __read_mostly; +static atomic_t nr_task_counters __read_mostly; /* * perf counter paranoia level: @@ -1654,6 +1655,8 @@ static void free_counter(struct perf_counter *counter) atomic_dec(&nr_mmap_counters); if (counter->attr.comm) atomic_dec(&nr_comm_counters); + if (counter->attr.task) + atomic_dec(&nr_task_counters); } if (counter->destroy) @@ -2831,10 +2834,12 @@ perf_counter_read_event(struct perf_counter *counter, } /* - * fork tracking + * task tracking -- fork/exit + * + * enabled by: attr.comm | attr.mmap | attr.task */ -struct perf_fork_event { +struct perf_task_event { struct task_struct *task; struct { @@ -2842,37 +2847,42 @@ struct perf_fork_event { u32 pid; u32 ppid; + u32 tid; + u32 ptid; } event; }; -static void perf_counter_fork_output(struct perf_counter *counter, - struct perf_fork_event *fork_event) +static void perf_counter_task_output(struct perf_counter *counter, + struct perf_task_event *task_event) { struct perf_output_handle handle; - int size = fork_event->event.header.size; - struct task_struct *task = fork_event->task; + int size = task_event->event.header.size; + struct task_struct *task = task_event->task; int ret = perf_output_begin(&handle, counter, size, 0, 0); if (ret) return; - fork_event->event.pid = perf_counter_pid(counter, task); - fork_event->event.ppid = perf_counter_pid(counter, task->real_parent); + task_event->event.pid = perf_counter_pid(counter, task); + task_event->event.ppid = perf_counter_pid(counter, task->real_parent); - perf_output_put(&handle, fork_event->event); + task_event->event.tid = perf_counter_tid(counter, task); + task_event->event.ptid = perf_counter_tid(counter, task->real_parent); + + perf_output_put(&handle, task_event->event); perf_output_end(&handle); } -static int perf_counter_fork_match(struct perf_counter *counter) +static int perf_counter_task_match(struct perf_counter *counter) { - if (counter->attr.comm || counter->attr.mmap) + if (counter->attr.comm || counter->attr.mmap || counter->attr.task) return 1; return 0; } -static void perf_counter_fork_ctx(struct perf_counter_context *ctx, - struct perf_fork_event *fork_event) +static void perf_counter_task_ctx(struct perf_counter_context *ctx, + struct perf_task_event *task_event) { struct perf_counter *counter; @@ -2881,19 +2891,19 @@ static void perf_counter_fork_ctx(struct perf_counter_context *ctx, rcu_read_lock(); list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_counter_fork_match(counter)) - perf_counter_fork_output(counter, fork_event); + if (perf_counter_task_match(counter)) + perf_counter_task_output(counter, task_event); } rcu_read_unlock(); } -static void perf_counter_fork_event(struct perf_fork_event *fork_event) +static void perf_counter_task_event(struct perf_task_event *task_event) { struct perf_cpu_context *cpuctx; struct perf_counter_context *ctx; cpuctx = &get_cpu_var(perf_cpu_context); - perf_counter_fork_ctx(&cpuctx->ctx, fork_event); + perf_counter_task_ctx(&cpuctx->ctx, task_event); put_cpu_var(perf_cpu_context); rcu_read_lock(); @@ -2903,32 +2913,40 @@ static void perf_counter_fork_event(struct perf_fork_event *fork_event) */ ctx = rcu_dereference(current->perf_counter_ctxp); if (ctx) - perf_counter_fork_ctx(ctx, fork_event); + perf_counter_task_ctx(ctx, task_event); rcu_read_unlock(); } -void perf_counter_fork(struct task_struct *task) +static void perf_counter_task(struct task_struct *task, int new) { - struct perf_fork_event fork_event; + struct perf_task_event task_event; if (!atomic_read(&nr_comm_counters) && - !atomic_read(&nr_mmap_counters)) + !atomic_read(&nr_mmap_counters) && + !atomic_read(&nr_task_counters)) return; - fork_event = (struct perf_fork_event){ + task_event = (struct perf_task_event){ .task = task, .event = { .header = { - .type = PERF_EVENT_FORK, + .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT, .misc = 0, - .size = sizeof(fork_event.event), + .size = sizeof(task_event.event), }, /* .pid */ /* .ppid */ + /* .tid */ + /* .ptid */ }, }; - perf_counter_fork_event(&fork_event); + perf_counter_task_event(&task_event); +} + +void perf_counter_fork(struct task_struct *task) +{ + perf_counter_task(task, 1); } /* @@ -3887,6 +3905,8 @@ done: atomic_inc(&nr_mmap_counters); if (counter->attr.comm) atomic_inc(&nr_comm_counters); + if (counter->attr.task) + atomic_inc(&nr_task_counters); } return counter; @@ -4248,8 +4268,10 @@ void perf_counter_exit_task(struct task_struct *child) struct perf_counter_context *child_ctx; unsigned long flags; - if (likely(!child->perf_counter_ctxp)) + if (likely(!child->perf_counter_ctxp)) { + perf_counter_task(child, 0); return; + } local_irq_save(flags); /* @@ -4267,15 +4289,22 @@ void perf_counter_exit_task(struct task_struct *child) * incremented the context's refcount before we do put_ctx below. */ spin_lock(&child_ctx->lock); - child->perf_counter_ctxp = NULL; /* * If this context is a clone; unclone it so it can't get * swapped to another process while we're removing all * the counters from it. */ unclone_ctx(child_ctx); - spin_unlock(&child_ctx->lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&child_ctx->lock, flags); + + /* + * Report the task dead after unscheduling the counters so that we + * won't get any samples after PERF_EVENT_EXIT. We can however still + * get a few PERF_EVENT_READ events. + */ + perf_counter_task(child, 0); + + child->perf_counter_ctxp = NULL; /* * We can recurse on the same lock type through: -- cgit v1.2.3-59-g8ed1b From 7699ad35ed06044c4fc1be162553880f98658616 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Mon, 15 Jun 2009 01:10:18 -0400 Subject: mtd: let include/linux/mtd/partitions.h stand on its own When declaring static MTD partitions in board specific code, only including should suffice without gcc nagging us with: In file included from arch/arm/mach-kirkwood/sheevaplug-setup.c:14: include/linux/mtd/partitions.h:50: warning: 'struct mtd_info' declared inside parameter list include/linux/mtd/partitions.h:50: warning: its scope is only this definition or declaration, which is probably not what you want include/linux/mtd/partitions.h:51: warning: 'struct mtd_info' declared inside parameter list include/linux/mtd/partitions.h:61: warning: 'struct mtd_info' declared inside parameter list include/linux/mtd/partitions.h:67: warning: 'struct mtd_info' declared inside parameter list Signed-off-by: Nicolas Pitre Signed-off-by: Artem Bityutskiy Signed-off-by: David Woodhouse --- include/linux/mtd/partitions.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mtd/partitions.h b/include/linux/mtd/partitions.h index af6dcb992bc3..b70313d33ff8 100644 --- a/include/linux/mtd/partitions.h +++ b/include/linux/mtd/partitions.h @@ -47,6 +47,8 @@ struct mtd_partition { #define MTDPART_SIZ_FULL (0) +struct mtd_info; + int add_mtd_partitions(struct mtd_info *, const struct mtd_partition *, int); int del_mtd_partitions(struct mtd_info *); -- cgit v1.2.3-59-g8ed1b From 6afc4fdb3e94ba60cd566cb878b60c6c01979277 Mon Sep 17 00:00:00 2001 From: Saeed Bishara Date: Tue, 28 Jul 2009 04:56:43 -0700 Subject: mtd: fix the conversion from dev to mtd_info The patch fixes a bug when converting dev to mtd_info by using the drvdata of the dev, the previous code used container_of(dev, struct mtd_info, dev), but won't work for the mtdXro devices as they created without being contained inside mtd_info structure. Signed-off-by: Saeed Bishara Signed-off-by: David Woodhouse --- drivers/mtd/mtdcore.c | 7 ++++--- include/linux/mtd/mtd.h | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index fac54a3fa3f1..00ebf7af7467 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -65,8 +65,8 @@ static void mtd_release(struct device *dev) static int mtd_cls_suspend(struct device *dev, pm_message_t state) { struct mtd_info *mtd = dev_to_mtd(dev); - - if (mtd->suspend) + + if (mtd && mtd->suspend) return mtd->suspend(mtd); else return 0; @@ -76,7 +76,7 @@ static int mtd_cls_resume(struct device *dev) { struct mtd_info *mtd = dev_to_mtd(dev); - if (mtd->resume) + if (mtd && mtd->resume) mtd->resume(mtd); return 0; } @@ -298,6 +298,7 @@ int add_mtd_device(struct mtd_info *mtd) mtd->dev.class = &mtd_class; mtd->dev.devt = MTD_DEVT(i); dev_set_name(&mtd->dev, "mtd%d", i); + dev_set_drvdata(&mtd->dev, mtd); if (device_register(&mtd->dev) != 0) { mtd_table[i] = NULL; break; diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 5675b63a0631..0f32a9b6ff55 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -251,7 +251,7 @@ struct mtd_info { static inline struct mtd_info *dev_to_mtd(struct device *dev) { - return dev ? container_of(dev, struct mtd_info, dev) : NULL; + return dev ? dev_get_drvdata(dev) : NULL; } static inline uint32_t mtd_div_by_eb(uint64_t sz, struct mtd_info *mtd) -- cgit v1.2.3-59-g8ed1b From af0d3b103bcfa877343ee338de12002cd50c9ee5 Mon Sep 17 00:00:00 2001 From: Dave Young Date: Mon, 3 Aug 2009 04:26:16 +0000 Subject: bluetooth: rfcomm_init bug fix rfcomm tty may be used before rfcomm_tty_driver initilized, The problem is that now socket layer init before tty layer, if userspace program do socket callback right here then oops will happen. reporting in: http://marc.info/?l=linux-bluetooth&m=124404919324542&w=2 make 3 changes: 1. remove #ifdef in rfcomm/core.c, make it blank function when rfcomm tty not selected in rfcomm.h 2. tune the rfcomm_init error patch to ensure tty driver initilized before rfcomm socket usage. 3. remove __exit for rfcomm_cleanup_sockets because above change need call it in a __init function. Reported-by: Oliver Hartkopp Tested-by: Oliver Hartkopp Signed-off-by: Dave Young Signed-off-by: David S. Miller --- include/net/bluetooth/rfcomm.h | 12 +++++++++++- net/bluetooth/rfcomm/core.c | 27 +++++++++++++++++++-------- net/bluetooth/rfcomm/sock.c | 2 +- 3 files changed, 31 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/net/bluetooth/rfcomm.h b/include/net/bluetooth/rfcomm.h index 80072611d26a..c274993234e3 100644 --- a/include/net/bluetooth/rfcomm.h +++ b/include/net/bluetooth/rfcomm.h @@ -355,7 +355,17 @@ struct rfcomm_dev_list_req { }; int rfcomm_dev_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); + +#ifdef CONFIG_BT_RFCOMM_TTY int rfcomm_init_ttys(void); void rfcomm_cleanup_ttys(void); - +#else +static inline int rfcomm_init_ttys(void) +{ + return 0; +} +static inline void rfcomm_cleanup_ttys(void) +{ +} +#endif #endif /* __RFCOMM_H */ diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index e50566ebf9f9..94b3388c188b 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -2080,28 +2080,41 @@ static CLASS_ATTR(rfcomm_dlc, S_IRUGO, rfcomm_dlc_sysfs_show, NULL); /* ---- Initialization ---- */ static int __init rfcomm_init(void) { + int ret; + l2cap_load(); hci_register_cb(&rfcomm_cb); rfcomm_thread = kthread_run(rfcomm_run, NULL, "krfcommd"); if (IS_ERR(rfcomm_thread)) { - hci_unregister_cb(&rfcomm_cb); - return PTR_ERR(rfcomm_thread); + ret = PTR_ERR(rfcomm_thread); + goto out_thread; } if (class_create_file(bt_class, &class_attr_rfcomm_dlc) < 0) BT_ERR("Failed to create RFCOMM info file"); - rfcomm_init_sockets(); + ret = rfcomm_init_ttys(); + if (ret) + goto out_tty; -#ifdef CONFIG_BT_RFCOMM_TTY - rfcomm_init_ttys(); -#endif + ret = rfcomm_init_sockets(); + if (ret) + goto out_sock; BT_INFO("RFCOMM ver %s", VERSION); return 0; + +out_sock: + rfcomm_cleanup_ttys(); +out_tty: + kthread_stop(rfcomm_thread); +out_thread: + hci_unregister_cb(&rfcomm_cb); + + return ret; } static void __exit rfcomm_exit(void) @@ -2112,9 +2125,7 @@ static void __exit rfcomm_exit(void) kthread_stop(rfcomm_thread); -#ifdef CONFIG_BT_RFCOMM_TTY rfcomm_cleanup_ttys(); -#endif rfcomm_cleanup_sockets(); } diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 7f482784e9f7..0b85e8116859 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -1132,7 +1132,7 @@ error: return err; } -void __exit rfcomm_cleanup_sockets(void) +void rfcomm_cleanup_sockets(void) { class_remove_file(bt_class, &class_attr_rfcomm); -- cgit v1.2.3-59-g8ed1b From 371842448c05b42d11a4be1c8e4e81d62ecc7534 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Thu, 30 Jul 2009 17:43:48 -0700 Subject: cfg80211: fix regression on beacon world roaming feature A regression was added through patch a4ed90d6: "cfg80211: respect API on orig_flags on channel for beacon hint" We did indeed respect _orig flags but the intention was not clearly stated in the commit log. This patch fixes firmware issues picked up by iwlwifi when we lift passive scan of beaconing restrictions on channels its EEPROM has been configured to always enable. By doing so though we also disallowed beacon hints on devices registering their wiphy with custom world regulatory domains enabled, this happens to be currently ath5k, ath9k and ar9170. The passive scan and beacon restrictions on those devices would never be lifted even if we did find a beacon and the hardware did support such enhancements when world roaming. Since Johannes indicates iwlwifi firmware cannot be changed to allow beacon hinting we set up a flag now to specifically allow drivers to disable beacon hints for devices which cannot use them. We enable the flag on iwlwifi to disable beacon hints and by default enable it for all other drivers. It should be noted beacon hints lift passive scan flags and beacon restrictions when we receive a beacon from an AP on any 5 GHz non-DFS channels, and channels 12-14 on the 2.4 GHz band. We don't bother with channels 1-11 as those channels are allowed world wide. This should fix world roaming for ath5k, ath9k and ar9170, thereby improving scan time when we receive the first beacon from any AP, and also enabling beaconing operation (AP/IBSS/Mesh) on cards which would otherwise not be allowed to do so. Drivers not using custom regulatory stuff (wiphy_apply_custom_regulatory()) were not affected by this as the orig_flags for the channels would have been cleared upon wiphy registration. I tested this with a world roaming ath5k card. Cc: Jouni Malinen Signed-off-by: Luis R. Rodriguez Reviewed-by: Johannes Berg Signed-off-by: John W. Linville --- drivers/net/wireless/iwlwifi/iwl-core.c | 3 +++ drivers/net/wireless/iwlwifi/iwl3945-base.c | 3 +++ include/net/cfg80211.h | 5 +++++ net/wireless/reg.c | 9 +++++---- net/wireless/reg.h | 3 ++- 5 files changed, 18 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/net/wireless/iwlwifi/iwl-core.c b/drivers/net/wireless/iwlwifi/iwl-core.c index 6ab07165ea28..18b135f510e5 100644 --- a/drivers/net/wireless/iwlwifi/iwl-core.c +++ b/drivers/net/wireless/iwlwifi/iwl-core.c @@ -1332,6 +1332,9 @@ int iwl_setup_mac(struct iwl_priv *priv) hw->wiphy->custom_regulatory = true; + /* Firmware does not support this */ + hw->wiphy->disable_beacon_hints = true; + hw->wiphy->max_scan_ssids = PROBE_OPTION_MAX; /* we create the 802.11 header and a zero-length SSID element */ hw->wiphy->max_scan_ie_len = IWL_MAX_PROBE_REQUEST - 24 - 2; diff --git a/drivers/net/wireless/iwlwifi/iwl3945-base.c b/drivers/net/wireless/iwlwifi/iwl3945-base.c index 2f50ab60bfdf..523843369ca2 100644 --- a/drivers/net/wireless/iwlwifi/iwl3945-base.c +++ b/drivers/net/wireless/iwlwifi/iwl3945-base.c @@ -3968,6 +3968,9 @@ static int iwl3945_setup_mac(struct iwl_priv *priv) hw->wiphy->custom_regulatory = true; + /* Firmware does not support this */ + hw->wiphy->disable_beacon_hints = true; + hw->wiphy->max_scan_ssids = PROBE_OPTION_MAX_3945; /* we create the 802.11 header and a zero-length SSID element */ hw->wiphy->max_scan_ie_len = IWL_MAX_PROBE_REQUEST - 24 - 2; diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 1a21895b732b..d1892d66701a 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -979,6 +979,10 @@ struct cfg80211_ops { * channels at a later time. This can be used for devices which do not * have calibration information gauranteed for frequencies or settings * outside of its regulatory domain. + * @disable_beacon_hints: enable this if your driver needs to ensure that + * passive scan flags and beaconing flags may not be lifted by cfg80211 + * due to regulatory beacon hints. For more information on beacon + * hints read the documenation for regulatory_hint_found_beacon() * @reg_notifier: the driver's regulatory notification callback * @regd: the driver's regulatory domain, if one was requested via * the regulatory_hint() API. This can be used by the driver @@ -1004,6 +1008,7 @@ struct wiphy { bool custom_regulatory; bool strict_regulatory; + bool disable_beacon_hints; enum cfg80211_signal_type signal_type; diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 5e14371cda70..75a406d33619 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -1089,17 +1089,18 @@ static void handle_reg_beacon(struct wiphy *wiphy, chan->beacon_found = true; + if (wiphy->disable_beacon_hints) + return; + chan_before.center_freq = chan->center_freq; chan_before.flags = chan->flags; - if ((chan->flags & IEEE80211_CHAN_PASSIVE_SCAN) && - !(chan->orig_flags & IEEE80211_CHAN_PASSIVE_SCAN)) { + if (chan->flags & IEEE80211_CHAN_PASSIVE_SCAN) { chan->flags &= ~IEEE80211_CHAN_PASSIVE_SCAN; channel_changed = true; } - if ((chan->flags & IEEE80211_CHAN_NO_IBSS) && - !(chan->orig_flags & IEEE80211_CHAN_NO_IBSS)) { + if (chan->flags & IEEE80211_CHAN_NO_IBSS) { chan->flags &= ~IEEE80211_CHAN_NO_IBSS; channel_changed = true; } diff --git a/net/wireless/reg.h b/net/wireless/reg.h index e37829a49dc4..4e167a8e11be 100644 --- a/net/wireless/reg.h +++ b/net/wireless/reg.h @@ -30,7 +30,8 @@ int set_regdom(const struct ieee80211_regdomain *rd); * non-radar 5 GHz channels. * * Drivers do not need to call this, cfg80211 will do it for after a scan - * on a newly found BSS. + * on a newly found BSS. If you cannot make use of this feature you can + * set the wiphy->disable_beacon_hints to true. */ int regulatory_hint_found_beacon(struct wiphy *wiphy, struct ieee80211_channel *beacon_chan, -- cgit v1.2.3-59-g8ed1b From 7320700df1864b601cef5adbddce8654a0e3f78b Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 3 Aug 2009 17:01:53 -0400 Subject: drm/radeon: add some new r7xx pci ids Signed-off-by: Alex Deucher Signed-off-by: Dave Airlie --- include/drm/drm_pciids.h | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'include') diff --git a/include/drm/drm_pciids.h b/include/drm/drm_pciids.h index 7174818c2c13..9d4c00491547 100644 --- a/include/drm/drm_pciids.h +++ b/include/drm/drm_pciids.h @@ -257,9 +257,12 @@ {0x1002, 0x940F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_R600|RADEON_NEW_MEMMAP}, \ {0x1002, 0x94A0, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV740|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ {0x1002, 0x94A1, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV740|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ + {0x1002, 0x94A3, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV740|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ {0x1002, 0x94B1, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV740|RADEON_NEW_MEMMAP}, \ {0x1002, 0x94B3, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV740|RADEON_NEW_MEMMAP}, \ + {0x1002, 0x94B4, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV740|RADEON_NEW_MEMMAP}, \ {0x1002, 0x94B5, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV740|RADEON_NEW_MEMMAP}, \ + {0x1002, 0x94B9, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV740|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ {0x1002, 0x9440, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV770|RADEON_NEW_MEMMAP}, \ {0x1002, 0x9441, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV770|RADEON_NEW_MEMMAP}, \ {0x1002, 0x9442, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV770|RADEON_NEW_MEMMAP}, \ @@ -288,6 +291,7 @@ {0x1002, 0x948F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV730|RADEON_NEW_MEMMAP}, \ {0x1002, 0x9490, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV730|RADEON_NEW_MEMMAP}, \ {0x1002, 0x9491, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV730|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ + {0x1002, 0x9495, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV730|RADEON_NEW_MEMMAP}, \ {0x1002, 0x9498, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV730|RADEON_NEW_MEMMAP}, \ {0x1002, 0x949C, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV730|RADEON_NEW_MEMMAP}, \ {0x1002, 0x949E, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV730|RADEON_NEW_MEMMAP}, \ @@ -325,6 +329,7 @@ {0x1002, 0x9552, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV710|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ {0x1002, 0x9553, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV710|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ {0x1002, 0x9555, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV710|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ + {0x1002, 0x9557, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV710|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ {0x1002, 0x9580, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV630|RADEON_NEW_MEMMAP}, \ {0x1002, 0x9581, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV630|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ {0x1002, 0x9583, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RV630|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ -- cgit v1.2.3-59-g8ed1b From 18eac1cc100fa2afd5f39085aae6b694e417734b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 3 Aug 2009 10:58:29 -0700 Subject: tty-ldisc: make refcount be atomic_t 'users' count This is pure preparation of changing the ldisc reference counting to be a true refcount that defines the lifetime of the ldisc. But this is a purely syntactic change for now to make the next steps easier. This patch should make no semantic changes at all. But I wanted to make the ldisc refcount be an atomic (I will be touching it without locks soon enough), and I wanted to rename it so that there isn't quite as much confusion between 'ldo->refcount' (ldisk operations refcount) and 'ld->refcount' (ldisc refcount itself) in the same file. So it's now an atomic 'ld->users' count. It still starts at zero, despite having a reference from 'tty->ldisc', but that will change once we turn it into a _real_ refcount. Signed-off-by: Linus Torvalds Tested-by: OGAWA Hirofumi Tested-by: Sergey Senozhatsky Acked-by: Alan Cox Signed-off-by: Greg Kroah-Hartman --- drivers/char/tty_ldisc.c | 18 ++++++++---------- include/linux/tty_ldisc.h | 2 +- 2 files changed, 9 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/char/tty_ldisc.c b/drivers/char/tty_ldisc.c index acd76b767d4c..fd175e60aad5 100644 --- a/drivers/char/tty_ldisc.c +++ b/drivers/char/tty_ldisc.c @@ -142,7 +142,7 @@ static struct tty_ldisc *tty_ldisc_try_get(int disc) /* lock it */ ldops->refcount++; ld->ops = ldops; - ld->refcount = 0; + atomic_set(&ld->users, 0); err = 0; } } @@ -206,7 +206,7 @@ static void tty_ldisc_put(struct tty_ldisc *ld) ldo->refcount--; module_put(ldo->owner); spin_unlock_irqrestore(&tty_ldisc_lock, flags); - WARN_ON(ld->refcount); + WARN_ON(atomic_read(&ld->users)); kfree(ld); } @@ -297,7 +297,7 @@ static int tty_ldisc_try(struct tty_struct *tty) spin_lock_irqsave(&tty_ldisc_lock, flags); ld = tty->ldisc; if (test_bit(TTY_LDISC, &tty->flags)) { - ld->refcount++; + atomic_inc(&ld->users); ret = 1; } spin_unlock_irqrestore(&tty_ldisc_lock, flags); @@ -324,7 +324,7 @@ struct tty_ldisc *tty_ldisc_ref_wait(struct tty_struct *tty) { /* wait_event is a macro */ wait_event(tty_ldisc_wait, tty_ldisc_try(tty)); - WARN_ON(tty->ldisc->refcount == 0); + WARN_ON(atomic_read(&tty->ldisc->users) == 0); return tty->ldisc; } EXPORT_SYMBOL_GPL(tty_ldisc_ref_wait); @@ -365,11 +365,9 @@ void tty_ldisc_deref(struct tty_ldisc *ld) BUG_ON(ld == NULL); spin_lock_irqsave(&tty_ldisc_lock, flags); - if (ld->refcount == 0) + if (atomic_read(&ld->users) == 0) printk(KERN_ERR "tty_ldisc_deref: no references.\n"); - else - ld->refcount--; - if (ld->refcount == 0) + else if (atomic_dec_and_test(&ld->users)) wake_up(&tty_ldisc_wait); spin_unlock_irqrestore(&tty_ldisc_lock, flags); } @@ -536,10 +534,10 @@ static int tty_ldisc_wait_idle(struct tty_struct *tty) { unsigned long flags; spin_lock_irqsave(&tty_ldisc_lock, flags); - while (tty->ldisc->refcount) { + while (atomic_read(&tty->ldisc->users)) { spin_unlock_irqrestore(&tty_ldisc_lock, flags); if (wait_event_timeout(tty_ldisc_wait, - tty->ldisc->refcount == 0, 5 * HZ) == 0) + atomic_read(&tty->ldisc->users) == 0, 5 * HZ) == 0) return -EBUSY; spin_lock_irqsave(&tty_ldisc_lock, flags); } diff --git a/include/linux/tty_ldisc.h b/include/linux/tty_ldisc.h index 40f38d896777..0c4ee9b88f85 100644 --- a/include/linux/tty_ldisc.h +++ b/include/linux/tty_ldisc.h @@ -144,7 +144,7 @@ struct tty_ldisc_ops { struct tty_ldisc { struct tty_ldisc_ops *ops; - int refcount; + atomic_t users; }; #define TTY_LDISC_MAGIC 0x5403 -- cgit v1.2.3-59-g8ed1b From 6502fbfaf81b09b3f474bb7b3796257e9450273e Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 4 Aug 2009 11:24:24 -0400 Subject: drm/radeon: Add support for RS880 chips These are new AMD IGP chips Signed-off-by: Alex Deucher Signed-off-by: Dave Airlie --- drivers/gpu/drm/radeon/r600_cp.c | 22 +++++++++++++++------- drivers/gpu/drm/radeon/radeon_drv.h | 1 + include/drm/drm_pciids.h | 5 +++++ 3 files changed, 21 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/radeon/r600_cp.c b/drivers/gpu/drm/radeon/r600_cp.c index 146f3570af8e..20f17908b036 100644 --- a/drivers/gpu/drm/radeon/r600_cp.c +++ b/drivers/gpu/drm/radeon/r600_cp.c @@ -384,8 +384,9 @@ static void r600_cp_load_microcode(drm_radeon_private_t *dev_priv) DRM_INFO("Loading RV670 PFP Microcode\n"); for (i = 0; i < PFP_UCODE_SIZE; i++) RADEON_WRITE(R600_CP_PFP_UCODE_DATA, RV670_pfp_microcode[i]); - } else if (((dev_priv->flags & RADEON_FAMILY_MASK) == CHIP_RS780)) { - DRM_INFO("Loading RS780 CP Microcode\n"); + } else if (((dev_priv->flags & RADEON_FAMILY_MASK) == CHIP_RS780) || + ((dev_priv->flags & RADEON_FAMILY_MASK) == CHIP_RS880)) { + DRM_INFO("Loading RS780/RS880 CP Microcode\n"); for (i = 0; i < PM4_UCODE_SIZE; i++) { RADEON_WRITE(R600_CP_ME_RAM_DATA, RS780_cp_microcode[i][0]); @@ -396,7 +397,7 @@ static void r600_cp_load_microcode(drm_radeon_private_t *dev_priv) } RADEON_WRITE(R600_CP_PFP_UCODE_ADDR, 0); - DRM_INFO("Loading RS780 PFP Microcode\n"); + DRM_INFO("Loading RS780/RS880 PFP Microcode\n"); for (i = 0; i < PFP_UCODE_SIZE; i++) RADEON_WRITE(R600_CP_PFP_UCODE_DATA, RS780_pfp_microcode[i]); } @@ -783,6 +784,7 @@ static void r600_gfx_init(struct drm_device *dev, break; case CHIP_RV610: case CHIP_RS780: + case CHIP_RS880: case CHIP_RV620: dev_priv->r600_max_pipes = 1; dev_priv->r600_max_tile_pipes = 1; @@ -917,7 +919,8 @@ static void r600_gfx_init(struct drm_device *dev, ((dev_priv->flags & RADEON_FAMILY_MASK) == CHIP_RV630) || ((dev_priv->flags & RADEON_FAMILY_MASK) == CHIP_RV610) || ((dev_priv->flags & RADEON_FAMILY_MASK) == CHIP_RV620) || - ((dev_priv->flags & RADEON_FAMILY_MASK) == CHIP_RS780)) + ((dev_priv->flags & RADEON_FAMILY_MASK) == CHIP_RS780) || + ((dev_priv->flags & RADEON_FAMILY_MASK) == CHIP_RS880)) RADEON_WRITE(R600_DB_DEBUG, R600_PREZ_MUST_WAIT_FOR_POSTZ_DONE); else RADEON_WRITE(R600_DB_DEBUG, 0); @@ -935,7 +938,8 @@ static void r600_gfx_init(struct drm_device *dev, sq_ms_fifo_sizes = RADEON_READ(R600_SQ_MS_FIFO_SIZES); if (((dev_priv->flags & RADEON_FAMILY_MASK) == CHIP_RV610) || ((dev_priv->flags & RADEON_FAMILY_MASK) == CHIP_RV620) || - ((dev_priv->flags & RADEON_FAMILY_MASK) == CHIP_RS780)) { + ((dev_priv->flags & RADEON_FAMILY_MASK) == CHIP_RS780) || + ((dev_priv->flags & RADEON_FAMILY_MASK) == CHIP_RS880)) { sq_ms_fifo_sizes = (R600_CACHE_FIFO_SIZE(0xa) | R600_FETCH_FIFO_HIWATER(0xa) | R600_DONE_FIFO_HIWATER(0xe0) | @@ -978,7 +982,8 @@ static void r600_gfx_init(struct drm_device *dev, R600_NUM_ES_STACK_ENTRIES(0)); } else if (((dev_priv->flags & RADEON_FAMILY_MASK) == CHIP_RV610) || ((dev_priv->flags & RADEON_FAMILY_MASK) == CHIP_RV620) || - ((dev_priv->flags & RADEON_FAMILY_MASK) == CHIP_RS780)) { + ((dev_priv->flags & RADEON_FAMILY_MASK) == CHIP_RS780) || + ((dev_priv->flags & RADEON_FAMILY_MASK) == CHIP_RS880)) { /* no vertex cache */ sq_config &= ~R600_VC_ENABLE; @@ -1035,7 +1040,8 @@ static void r600_gfx_init(struct drm_device *dev, if (((dev_priv->flags & RADEON_FAMILY_MASK) == CHIP_RV610) || ((dev_priv->flags & RADEON_FAMILY_MASK) == CHIP_RV620) || - ((dev_priv->flags & RADEON_FAMILY_MASK) == CHIP_RS780)) + ((dev_priv->flags & RADEON_FAMILY_MASK) == CHIP_RS780) || + ((dev_priv->flags & RADEON_FAMILY_MASK) == CHIP_RS880)) RADEON_WRITE(R600_VGT_CACHE_INVALIDATION, R600_CACHE_INVALIDATION(R600_TC_ONLY)); else RADEON_WRITE(R600_VGT_CACHE_INVALIDATION, R600_CACHE_INVALIDATION(R600_VC_AND_TC)); @@ -1078,6 +1084,7 @@ static void r600_gfx_init(struct drm_device *dev, break; case CHIP_RV610: case CHIP_RS780: + case CHIP_RS880: case CHIP_RV620: gs_prim_buffer_depth = 32; break; @@ -1123,6 +1130,7 @@ static void r600_gfx_init(struct drm_device *dev, switch (dev_priv->flags & RADEON_FAMILY_MASK) { case CHIP_RV610: case CHIP_RS780: + case CHIP_RS880: case CHIP_RV620: tc_cntl = R600_TC_L2_SIZE(8); break; diff --git a/drivers/gpu/drm/radeon/radeon_drv.h b/drivers/gpu/drm/radeon/radeon_drv.h index 127d0456f628..3933f8216a34 100644 --- a/drivers/gpu/drm/radeon/radeon_drv.h +++ b/drivers/gpu/drm/radeon/radeon_drv.h @@ -143,6 +143,7 @@ enum radeon_family { CHIP_RV635, CHIP_RV670, CHIP_RS780, + CHIP_RS880, CHIP_RV770, CHIP_RV730, CHIP_RV710, diff --git a/include/drm/drm_pciids.h b/include/drm/drm_pciids.h index 9d4c00491547..853508499d20 100644 --- a/include/drm/drm_pciids.h +++ b/include/drm/drm_pciids.h @@ -370,6 +370,11 @@ {0x1002, 0x9614, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RS780|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x9615, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RS780|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x9616, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RS780|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ + {0x1002, 0x9710, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RS880|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ + {0x1002, 0x9711, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RS880|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ + {0x1002, 0x9712, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RS880|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ + {0x1002, 0x9713, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RS880|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ + {0x1002, 0x9714, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RS880|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0, 0, 0} #define r128_PCI_IDS \ -- cgit v1.2.3-59-g8ed1b From 5116d8f6b977970ebefc1932c0f313163a6ec91f Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Sun, 26 Jul 2009 17:10:01 +0300 Subject: KVM: fix ack not being delivered when msi present kvm_notify_acked_irq does not check irq type, so that it sometimes interprets msi vector as irq. As a result, ack notifiers are not called, which typially hangs the guest. The fix is to track and check irq type. Signed-off-by: Michael S. Tsirkin Signed-off-by: Avi Kivity --- include/linux/kvm_host.h | 1 + virt/kvm/irq_comm.c | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 16713dc672e4..3060bdc35ffe 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -110,6 +110,7 @@ struct kvm_memory_slot { struct kvm_kernel_irq_routing_entry { u32 gsi; + u32 type; int (*set)(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int level); union { diff --git a/virt/kvm/irq_comm.c b/virt/kvm/irq_comm.c index a8bd466d00cc..ddc17f0e2f35 100644 --- a/virt/kvm/irq_comm.c +++ b/virt/kvm/irq_comm.c @@ -160,7 +160,8 @@ void kvm_notify_acked_irq(struct kvm *kvm, unsigned irqchip, unsigned pin) unsigned gsi = pin; list_for_each_entry(e, &kvm->irq_routing, link) - if (e->irqchip.irqchip == irqchip && + if (e->type == KVM_IRQ_ROUTING_IRQCHIP && + e->irqchip.irqchip == irqchip && e->irqchip.pin == pin) { gsi = e->gsi; break; @@ -259,6 +260,7 @@ static int setup_routing_entry(struct kvm_kernel_irq_routing_entry *e, int delta; e->gsi = ue->gsi; + e->type = ue->type; switch (ue->type) { case KVM_IRQ_ROUTING_IRQCHIP: delta = 0; -- cgit v1.2.3-59-g8ed1b From af6af30c0fcd77e621638e53ef8b176bca8bd3b4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 5 Aug 2009 20:41:04 +0200 Subject: ftrace: Fix perf-tracepoint OOPS Not all tracepoints are created equal, in specific the ftrace tracepoints are created with TRACE_EVENT_FORMAT() which does not generate the needed bits to tie them into perf counters. For those events, don't create the 'id' file and fail ->profile_enable when their ID is specified through other means. Reported-by: Chris Mason Signed-off-by: Peter Zijlstra Cc: Steven Rostedt LKML-Reference: <1249497664.5890.4.camel@laptop> [ v2: fix build error in the !CONFIG_EVENT_PROFILE case ] Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 8 +++----- kernel/trace/trace_event_profile.c | 2 +- kernel/trace/trace_events.c | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 5c093ffc655b..d7cd193c2277 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -119,11 +119,9 @@ struct ftrace_event_call { void *filter; void *mod; -#ifdef CONFIG_EVENT_PROFILE - atomic_t profile_count; - int (*profile_enable)(struct ftrace_event_call *); - void (*profile_disable)(struct ftrace_event_call *); -#endif + atomic_t profile_count; + int (*profile_enable)(struct ftrace_event_call *); + void (*profile_disable)(struct ftrace_event_call *); }; #define MAX_FILTER_PRED 32 diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index 5b5895afecfe..11ba5bb4ed0a 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -14,7 +14,7 @@ int ftrace_profile_enable(int event_id) mutex_lock(&event_mutex); list_for_each_entry(event, &ftrace_events, list) { - if (event->id == event_id) { + if (event->id == event_id && event->profile_enable) { ret = event->profile_enable(event); break; } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 23d2972b22d6..e75276a49cf5 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -940,7 +940,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, entry = trace_create_file("enable", 0644, call->dir, call, enable); - if (call->id) + if (call->id && call->profile_enable) entry = trace_create_file("id", 0444, call->dir, call, id); -- cgit v1.2.3-59-g8ed1b From d82f1c35348cebe2fb2d4a4d31ce0ab0769e3d93 Mon Sep 17 00:00:00 2001 From: Eric Miao Date: Wed, 5 Aug 2009 01:24:41 -0700 Subject: Input: matrix_keypad - make matrix keymap size dynamic Remove assumption on the shift and size of rows/columns form matrix_keypad driver. Signed-off-by: Eric Miao Signed-off-by: Dmitry Torokhov --- drivers/input/keyboard/matrix_keypad.c | 18 +++++++++--------- include/linux/input/matrix_keypad.h | 13 +++++++------ 2 files changed, 16 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/drivers/input/keyboard/matrix_keypad.c b/drivers/input/keyboard/matrix_keypad.c index e9b2e7cb05be..541b981ff075 100644 --- a/drivers/input/keyboard/matrix_keypad.c +++ b/drivers/input/keyboard/matrix_keypad.c @@ -27,6 +27,7 @@ struct matrix_keypad { const struct matrix_keypad_platform_data *pdata; struct input_dev *input_dev; unsigned short *keycodes; + unsigned int row_shift; uint32_t last_key_state[MATRIX_MAX_COLS]; struct delayed_work work; @@ -136,7 +137,7 @@ static void matrix_keypad_scan(struct work_struct *work) if ((bits_changed & (1 << row)) == 0) continue; - code = (row << 4) + col; + code = MATRIX_SCAN_CODE(row, col, keypad->row_shift); input_event(input_dev, EV_MSC, MSC_SCAN, code); input_report_key(input_dev, keypad->keycodes[code], @@ -317,6 +318,7 @@ static int __devinit matrix_keypad_probe(struct platform_device *pdev) struct matrix_keypad *keypad; struct input_dev *input_dev; unsigned short *keycodes; + unsigned int row_shift; int i; int err; @@ -332,14 +334,11 @@ static int __devinit matrix_keypad_probe(struct platform_device *pdev) return -EINVAL; } - if (!keymap_data->max_keymap_size) { - dev_err(&pdev->dev, "invalid keymap data supplied\n"); - return -EINVAL; - } + row_shift = get_count_order(pdata->num_col_gpios); keypad = kzalloc(sizeof(struct matrix_keypad), GFP_KERNEL); - keycodes = kzalloc(keymap_data->max_keymap_size * - sizeof(keypad->keycodes), + keycodes = kzalloc((pdata->num_row_gpios << row_shift) * + sizeof(*keycodes), GFP_KERNEL); input_dev = input_allocate_device(); if (!keypad || !keycodes || !input_dev) { @@ -350,6 +349,7 @@ static int __devinit matrix_keypad_probe(struct platform_device *pdev) keypad->input_dev = input_dev; keypad->pdata = pdata; keypad->keycodes = keycodes; + keypad->row_shift = row_shift; keypad->stopped = true; INIT_DELAYED_WORK(&keypad->work, matrix_keypad_scan); spin_lock_init(&keypad->lock); @@ -363,7 +363,7 @@ static int __devinit matrix_keypad_probe(struct platform_device *pdev) input_dev->keycode = keycodes; input_dev->keycodesize = sizeof(*keycodes); - input_dev->keycodemax = keymap_data->max_keymap_size; + input_dev->keycodemax = pdata->num_row_gpios << keypad->row_shift; for (i = 0; i < keymap_data->keymap_size; i++) { unsigned int key = keymap_data->keymap[i]; @@ -371,7 +371,7 @@ static int __devinit matrix_keypad_probe(struct platform_device *pdev) unsigned int col = KEY_COL(key); unsigned short code = KEY_VAL(key); - keycodes[(row << 4) + col] = code; + keycodes[MATRIX_SCAN_CODE(row, col, row_shift)] = code; __set_bit(code, input_dev->keybit); } __clear_bit(KEY_RESERVED, input_dev->keybit); diff --git a/include/linux/input/matrix_keypad.h b/include/linux/input/matrix_keypad.h index 7964516c6954..15d5903af2dd 100644 --- a/include/linux/input/matrix_keypad.h +++ b/include/linux/input/matrix_keypad.h @@ -15,12 +15,13 @@ #define KEY_COL(k) (((k) >> 16) & 0xff) #define KEY_VAL(k) ((k) & 0xffff) +#define MATRIX_SCAN_CODE(row, col, row_shift) (((row) << (row_shift)) + (col)) + /** * struct matrix_keymap_data - keymap for matrix keyboards * @keymap: pointer to array of uint32 values encoded with KEY() macro * representing keymap * @keymap_size: number of entries (initialized) in this keymap - * @max_keymap_size: maximum size of keymap supported by the device * * This structure is supposed to be used by platform code to supply * keymaps to drivers that implement matrix-like keypads/keyboards. @@ -28,14 +29,13 @@ struct matrix_keymap_data { const uint32_t *keymap; unsigned int keymap_size; - unsigned int max_keymap_size; }; /** * struct matrix_keypad_platform_data - platform-dependent keypad data * @keymap_data: pointer to &matrix_keymap_data - * @row_gpios: array of gpio numbers reporesenting rows - * @col_gpios: array of gpio numbers reporesenting colums + * @row_gpios: pointer to array of gpio numbers representing rows + * @col_gpios: pointer to array of gpio numbers reporesenting colums * @num_row_gpios: actual number of row gpios used by device * @num_col_gpios: actual number of col gpios used by device * @col_scan_delay_us: delay, measured in microseconds, that is @@ -48,8 +48,9 @@ struct matrix_keymap_data { struct matrix_keypad_platform_data { const struct matrix_keymap_data *keymap_data; - unsigned int row_gpios[MATRIX_MAX_ROWS]; - unsigned int col_gpios[MATRIX_MAX_COLS]; + const unsigned int *row_gpios; + const unsigned int *col_gpios; + unsigned int num_row_gpios; unsigned int num_col_gpios; -- cgit v1.2.3-59-g8ed1b From 54e346215e4fe2ca8c94c54e546cc61902060510 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 7 Aug 2009 14:38:25 -0300 Subject: vfs: fix inode_init_always calling convention Currently inode_init_always calls into ->destroy_inode if the additional initialization fails. That's not only counter-intuitive because inode_init_always did not allocate the inode structure, but in case of XFS it's actively harmful as ->destroy_inode might delete the inode from a radix-tree that has never been added. This in turn might end up deleting the inode for the same inum that has been instanciated by another process and cause lots of cause subtile problems. Also in the case of re-initializing a reclaimable inode in XFS it would free an inode we still want to keep alive. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Sandeen --- fs/inode.c | 30 +++++++++++++++++------------- fs/xfs/xfs_iget.c | 17 +++++------------ include/linux/fs.h | 2 +- 3 files changed, 23 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/fs/inode.c b/fs/inode.c index 901bad1e5f12..af2c05235cc8 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -120,12 +120,11 @@ static void wake_up_inode(struct inode *inode) * These are initializations that need to be done on every inode * allocation as the fields are not initialised by slab allocation. */ -struct inode *inode_init_always(struct super_block *sb, struct inode *inode) +int inode_init_always(struct super_block *sb, struct inode *inode) { static const struct address_space_operations empty_aops; static struct inode_operations empty_iops; static const struct file_operations empty_fops; - struct address_space *const mapping = &inode->i_data; inode->i_sb = sb; @@ -152,7 +151,7 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode) inode->dirtied_when = 0; if (security_inode_alloc(inode)) - goto out_free_inode; + goto out; /* allocate and initialize an i_integrity */ if (ima_inode_alloc(inode)) @@ -198,16 +197,12 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode) inode->i_fsnotify_mask = 0; #endif - return inode; + return 0; out_free_security: security_inode_free(inode); -out_free_inode: - if (inode->i_sb->s_op->destroy_inode) - inode->i_sb->s_op->destroy_inode(inode); - else - kmem_cache_free(inode_cachep, (inode)); - return NULL; +out: + return -ENOMEM; } EXPORT_SYMBOL(inode_init_always); @@ -220,9 +215,18 @@ static struct inode *alloc_inode(struct super_block *sb) else inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL); - if (inode) - return inode_init_always(sb, inode); - return NULL; + if (!inode) + return NULL; + + if (unlikely(inode_init_always(sb, inode))) { + if (inode->i_sb->s_op->destroy_inode) + inode->i_sb->s_op->destroy_inode(inode); + else + kmem_cache_free(inode_cachep, inode); + return NULL; + } + + return inode; } void destroy_inode(struct inode *inode) diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 5fcec6f020a7..719c85b155f4 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -64,6 +64,10 @@ xfs_inode_alloc( ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP); if (!ip) return NULL; + if (inode_init_always(mp->m_super, VFS_I(ip))) { + kmem_zone_free(xfs_inode_zone, ip); + return NULL; + } ASSERT(atomic_read(&ip->i_iocount) == 0); ASSERT(atomic_read(&ip->i_pincount) == 0); @@ -105,17 +109,6 @@ xfs_inode_alloc( #ifdef XFS_DIR2_TRACE ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS); #endif - /* - * Now initialise the VFS inode. We do this after the xfs_inode - * initialisation as internal failures will result in ->destroy_inode - * being called and that will pass down through the reclaim path and - * free the XFS inode. This path requires the XFS inode to already be - * initialised. Hence if this call fails, the xfs_inode has already - * been freed and we should not reference it at all in the error - * handling. - */ - if (!inode_init_always(mp->m_super, VFS_I(ip))) - return NULL; /* prevent anyone from using this yet */ VFS_I(ip)->i_state = I_NEW|I_LOCK; @@ -167,7 +160,7 @@ xfs_iget_cache_hit( * errors cleanly, then tag it so it can be set up correctly * later. */ - if (!inode_init_always(mp->m_super, VFS_I(ip))) { + if (inode_init_always(mp->m_super, VFS_I(ip))) { error = ENOMEM; goto out_error; } diff --git a/include/linux/fs.h b/include/linux/fs.h index a36ffa5a77a4..0c3b5e58a986 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2137,7 +2137,7 @@ extern loff_t default_llseek(struct file *file, loff_t offset, int origin); extern loff_t vfs_llseek(struct file *file, loff_t offset, int origin); -extern struct inode * inode_init_always(struct super_block *, struct inode *); +extern int inode_init_always(struct super_block *, struct inode *); extern void inode_init_once(struct inode *); extern void inode_add_to_lists(struct super_block *, struct inode *); extern void iput(struct inode *); -- cgit v1.2.3-59-g8ed1b From 2e00c97e2c1d2ffc9e26252ca26b237678b0b772 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 7 Aug 2009 14:38:29 -0300 Subject: vfs: add __destroy_inode When we want to tear down an inode that lost the add to the cache race in XFS we must not call into ->destroy_inode because that would delete the inode that won the race from the inode cache radix tree. This patch provides the __destroy_inode helper needed to fix this, the actual fix will be in th next patch. As XFS was the only reason destroy_inode was exported we shift the export to the new __destroy_inode. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Sandeen --- fs/inode.c | 10 +++++++--- include/linux/fs.h | 1 + 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/fs/inode.c b/fs/inode.c index af2c05235cc8..ae7b67e48661 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -229,7 +229,7 @@ static struct inode *alloc_inode(struct super_block *sb) return inode; } -void destroy_inode(struct inode *inode) +void __destroy_inode(struct inode *inode) { BUG_ON(inode_has_buffers(inode)); ima_inode_free(inode); @@ -241,13 +241,17 @@ void destroy_inode(struct inode *inode) if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED) posix_acl_release(inode->i_default_acl); #endif +} +EXPORT_SYMBOL(__destroy_inode); + +void destroy_inode(struct inode *inode) +{ + __destroy_inode(inode); if (inode->i_sb->s_op->destroy_inode) inode->i_sb->s_op->destroy_inode(inode); else kmem_cache_free(inode_cachep, (inode)); } -EXPORT_SYMBOL(destroy_inode); - /* * These are initializations that only need to be done diff --git a/include/linux/fs.h b/include/linux/fs.h index 0c3b5e58a986..67888a9e0655 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2164,6 +2164,7 @@ extern void __iget(struct inode * inode); extern void iget_failed(struct inode *); extern void clear_inode(struct inode *); extern void destroy_inode(struct inode *); +extern void __destroy_inode(struct inode *); extern struct inode *new_inode(struct super_block *); extern int should_remove_suid(struct dentry *); extern int file_remove_suid(struct file *); -- cgit v1.2.3-59-g8ed1b From 4bfc44958e499af9a73f62201543b3a1f617cfeb Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 6 Aug 2009 15:07:33 -0700 Subject: mm: make set_mempolicy(MPOL_INTERLEAV) N_HIGH_MEMORY aware At first, init_task's mems_allowed is initialized as this. init_task->mems_allowed == node_state[N_POSSIBLE] And cpuset's top_cpuset mask is initialized as this top_cpuset->mems_allowed = node_state[N_HIGH_MEMORY] Before 2.6.29: policy's mems_allowed is initialized as this. 1. update tasks->mems_allowed by its cpuset->mems_allowed. 2. policy->mems_allowed = nodes_and(tasks->mems_allowed, user's mask) Updating task's mems_allowed in reference to top_cpuset's one. cpuset's mems_allowed is aware of N_HIGH_MEMORY, always. In 2.6.30: After commit 58568d2a8215cb6f55caf2332017d7bdff954e1c ("cpuset,mm: update tasks' mems_allowed in time"), policy's mems_allowed is initialized as this. 1. policy->mems_allowd = nodes_and(task->mems_allowed, user's mask) Here, if task is in top_cpuset, task->mems_allowed is not updated from init's one. Assume user excutes command as #numactrl --interleave=all ,.... policy->mems_allowd = nodes_and(N_POSSIBLE, ALL_SET_MASK) Then, policy's mems_allowd can includes a possible node, which has no pgdat. MPOL's INTERLEAVE just scans nodemask of task->mems_allowd and access this directly. NODE_DATA(nid)->zonelist even if NODE_DATA(nid)==NULL Then, what's we need is making policy->mems_allowed be aware of N_HIGH_MEMORY. This patch does that. But to do so, extra nodemask will be on statck. Because I know cpumask has a new interface of CPUMASK_ALLOC(), I added it to node. This patch stands on old behavior. But I feel this fix itself is just a Band-Aid. But to do fundametal fix, we have to take care of memory hotplug and it takes time. (task->mems_allowd should be N_HIGH_MEMORY, I think.) mpol_set_nodemask() should be aware of N_HIGH_MEMORY and policy's nodemask should be includes only online nodes. In old behavior, this is guaranteed by frequent reference to cpuset's code. Now, most of them are removed and mempolicy has to check it by itself. To do check, a few nodemask_t will be used for calculating nodemask. But, size of nodemask_t can be big and it's not good to allocate them on stack. Now, cpumask_t has CPUMASK_ALLOC/FREE an easy code for get scratch area. NODEMASK_ALLOC/FREE shoudl be there. [akpm@linux-foundation.org: cleanups & tweaks] Tested-by: KOSAKI Motohiro Signed-off-by: KAMEZAWA Hiroyuki Cc: Miao Xie Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Christoph Lameter Cc: Paul Menage Cc: Nick Piggin Cc: Yasunori Goto Cc: Pekka Enberg Cc: David Rientjes Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nodemask.h | 28 ++++++++++++++++ mm/mempolicy.c | 84 +++++++++++++++++++++++++++++++++--------------- 2 files changed, 86 insertions(+), 26 deletions(-) (limited to 'include') diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index 829b94b156f2..b359c4a9ec9e 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -82,6 +82,12 @@ * to generate slightly worse code. So use a simple one-line #define * for node_isset(), instead of wrapping an inline inside a macro, the * way we do the other calls. + * + * NODEMASK_SCRATCH + * When doing above logical AND, OR, XOR, Remap operations the callers tend to + * need temporary nodemask_t's on the stack. But if NODES_SHIFT is large, + * nodemask_t's consume too much stack space. NODEMASK_SCRATCH is a helper + * for such situations. See below and CPUMASK_ALLOC also. */ #include @@ -473,4 +479,26 @@ static inline int num_node_state(enum node_states state) #define for_each_node(node) for_each_node_state(node, N_POSSIBLE) #define for_each_online_node(node) for_each_node_state(node, N_ONLINE) +/* + * For nodemask scrach area.(See CPUMASK_ALLOC() in cpumask.h) + */ + +#if NODES_SHIFT > 8 /* nodemask_t > 64 bytes */ +#define NODEMASK_ALLOC(x, m) struct x *m = kmalloc(sizeof(*m), GFP_KERNEL) +#define NODEMASK_FREE(m) kfree(m) +#else +#define NODEMASK_ALLOC(x, m) struct x _m, *m = &_m +#define NODEMASK_FREE(m) +#endif + +/* A example struture for using NODEMASK_ALLOC, used in mempolicy. */ +struct nodemask_scratch { + nodemask_t mask1; + nodemask_t mask2; +}; + +#define NODEMASK_SCRATCH(x) NODEMASK_ALLOC(nodemask_scratch, x) +#define NODEMASK_SCRATCH_FREE(x) NODEMASK_FREE(x) + + #endif /* __LINUX_NODEMASK_H */ diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e08e2c4da63a..7dd9d9f80694 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -191,25 +191,27 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) * Must be called holding task's alloc_lock to protect task's mems_allowed * and mempolicy. May also be called holding the mmap_semaphore for write. */ -static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes) +static int mpol_set_nodemask(struct mempolicy *pol, + const nodemask_t *nodes, struct nodemask_scratch *nsc) { - nodemask_t cpuset_context_nmask; int ret; /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ if (pol == NULL) return 0; + /* Check N_HIGH_MEMORY */ + nodes_and(nsc->mask1, + cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]); VM_BUG_ON(!nodes); if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) nodes = NULL; /* explicit local allocation */ else { if (pol->flags & MPOL_F_RELATIVE_NODES) - mpol_relative_nodemask(&cpuset_context_nmask, nodes, - &cpuset_current_mems_allowed); + mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1); else - nodes_and(cpuset_context_nmask, *nodes, - cpuset_current_mems_allowed); + nodes_and(nsc->mask2, *nodes, nsc->mask1); + if (mpol_store_user_nodemask(pol)) pol->w.user_nodemask = *nodes; else @@ -217,8 +219,10 @@ static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes) cpuset_current_mems_allowed; } - ret = mpol_ops[pol->mode].create(pol, - nodes ? &cpuset_context_nmask : NULL); + if (nodes) + ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); + else + ret = mpol_ops[pol->mode].create(pol, NULL); return ret; } @@ -620,12 +624,17 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, { struct mempolicy *new, *old; struct mm_struct *mm = current->mm; + NODEMASK_SCRATCH(scratch); int ret; - new = mpol_new(mode, flags, nodes); - if (IS_ERR(new)) - return PTR_ERR(new); + if (!scratch) + return -ENOMEM; + new = mpol_new(mode, flags, nodes); + if (IS_ERR(new)) { + ret = PTR_ERR(new); + goto out; + } /* * prevent changing our mempolicy while show_numa_maps() * is using it. @@ -635,13 +644,13 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, if (mm) down_write(&mm->mmap_sem); task_lock(current); - ret = mpol_set_nodemask(new, nodes); + ret = mpol_set_nodemask(new, nodes, scratch); if (ret) { task_unlock(current); if (mm) up_write(&mm->mmap_sem); mpol_put(new); - return ret; + goto out; } old = current->mempolicy; current->mempolicy = new; @@ -654,7 +663,10 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, up_write(&mm->mmap_sem); mpol_put(old); - return 0; + ret = 0; +out: + NODEMASK_SCRATCH_FREE(scratch); + return ret; } /* @@ -1014,12 +1026,20 @@ static long do_mbind(unsigned long start, unsigned long len, if (err) return err; } - down_write(&mm->mmap_sem); - task_lock(current); - err = mpol_set_nodemask(new, nmask); - task_unlock(current); + { + NODEMASK_SCRATCH(scratch); + if (scratch) { + down_write(&mm->mmap_sem); + task_lock(current); + err = mpol_set_nodemask(new, nmask, scratch); + task_unlock(current); + if (err) + up_write(&mm->mmap_sem); + } else + err = -ENOMEM; + NODEMASK_SCRATCH_FREE(scratch); + } if (err) { - up_write(&mm->mmap_sem); mpol_put(new); return err; } @@ -1891,6 +1911,7 @@ restart: * Install non-NULL @mpol in inode's shared policy rb-tree. * On entry, the current task has a reference on a non-NULL @mpol. * This must be released on exit. + * This is called at get_inode() calls and we can use GFP_KERNEL. */ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) { @@ -1902,19 +1923,24 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) if (mpol) { struct vm_area_struct pvma; struct mempolicy *new; + NODEMASK_SCRATCH(scratch); + if (!scratch) + return; /* contextualize the tmpfs mount point mempolicy */ new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); if (IS_ERR(new)) { mpol_put(mpol); /* drop our ref on sb mpol */ + NODEMASK_SCRATCH_FREE(scratch); return; /* no valid nodemask intersection */ } task_lock(current); - ret = mpol_set_nodemask(new, &mpol->w.user_nodemask); + ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch); task_unlock(current); mpol_put(mpol); /* drop our ref on sb mpol */ if (ret) { + NODEMASK_SCRATCH_FREE(scratch); mpol_put(new); return; } @@ -1924,6 +1950,7 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) pvma.vm_end = TASK_SIZE; /* policy covers entire file */ mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ mpol_put(new); /* drop initial ref */ + NODEMASK_SCRATCH_FREE(scratch); } } @@ -2140,13 +2167,18 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) err = 1; else { int ret; - - task_lock(current); - ret = mpol_set_nodemask(new, &nodes); - task_unlock(current); - if (ret) + NODEMASK_SCRATCH(scratch); + if (scratch) { + task_lock(current); + ret = mpol_set_nodemask(new, &nodes, scratch); + task_unlock(current); + } else + ret = -ENOMEM; + NODEMASK_SCRATCH_FREE(scratch); + if (ret) { err = 1; - else if (no_context) { + mpol_put(new); + } else if (no_context) { /* save for contextualization */ new->w.user_nodemask = nodes; } -- cgit v1.2.3-59-g8ed1b From daeb6b6fbe27049f465c48a7d0ee5555c3b84064 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Thu, 6 Aug 2009 15:09:30 -0700 Subject: bzip2/lzma/gzip: fix comments describing decompressor API Fix and improve comments in decompress/generic.h that describe the decompressor API. Also remove an unused definition, and rename INBUF_LEN in lib/decompress_inflate.c to conform to bzip2/lzma naming. Signed-off-by: Phillip Lougher Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/decompress/generic.h | 32 +++++++++++++++++++------------- lib/decompress_inflate.c | 8 ++++---- 2 files changed, 23 insertions(+), 17 deletions(-) (limited to 'include') diff --git a/include/linux/decompress/generic.h b/include/linux/decompress/generic.h index 6dfb856327bb..0c7111a55a1a 100644 --- a/include/linux/decompress/generic.h +++ b/include/linux/decompress/generic.h @@ -1,31 +1,37 @@ #ifndef DECOMPRESS_GENERIC_H #define DECOMPRESS_GENERIC_H -/* Minimal chunksize to be read. - *Bzip2 prefers at least 4096 - *Lzma prefers 0x10000 */ -#define COMPR_IOBUF_SIZE 4096 - typedef int (*decompress_fn) (unsigned char *inbuf, int len, int(*fill)(void*, unsigned int), - int(*writebb)(void*, unsigned int), - unsigned char *output, + int(*flush)(void*, unsigned int), + unsigned char *outbuf, int *posp, void(*error)(char *x)); /* inbuf - input buffer *len - len of pre-read data in inbuf - *fill - function to fill inbuf if empty - *writebb - function to write out outbug + *fill - function to fill inbuf when empty + *flush - function to write out outbuf + *outbuf - output buffer *posp - if non-null, input position (number of bytes read) will be * returned here * - *If len != 0, the inbuf is initialized (with as much data), and fill - *should not be called - *If len = 0, the inbuf is allocated, but empty. Its size is IOBUF_SIZE - *fill should be called (repeatedly...) to read data, at most IOBUF_SIZE + *If len != 0, inbuf should contain all the necessary input data, and fill + *should be NULL + *If len = 0, inbuf can be NULL, in which case the decompressor will allocate + *the input buffer. If inbuf != NULL it must be at least XXX_IOBUF_SIZE bytes. + *fill will be called (repeatedly...) to read data, at most XXX_IOBUF_SIZE + *bytes should be read per call. Replace XXX with the appropriate decompressor + *name, i.e. LZMA_IOBUF_SIZE. + * + *If flush = NULL, outbuf must be large enough to buffer all the expected + *output. If flush != NULL, the output buffer will be allocated by the + *decompressor (outbuf = NULL), and the flush function will be called to + *flush the output buffer at the appropriate time (decompressor and stream + *dependent). */ + /* Utility routine to detect the decompression method */ decompress_fn decompress_method(const unsigned char *inbuf, int len, const char **name); diff --git a/lib/decompress_inflate.c b/lib/decompress_inflate.c index e36b296fc9f8..bfe605ac534f 100644 --- a/lib/decompress_inflate.c +++ b/lib/decompress_inflate.c @@ -25,7 +25,7 @@ #include #include -#define INBUF_LEN (16*1024) +#define GZIP_IOBUF_SIZE (16*1024) /* Included from initramfs et al code */ STATIC int INIT gunzip(unsigned char *buf, int len, @@ -55,7 +55,7 @@ STATIC int INIT gunzip(unsigned char *buf, int len, if (buf) zbuf = buf; else { - zbuf = malloc(INBUF_LEN); + zbuf = malloc(GZIP_IOBUF_SIZE); len = 0; } if (!zbuf) { @@ -77,7 +77,7 @@ STATIC int INIT gunzip(unsigned char *buf, int len, } if (len == 0) - len = fill(zbuf, INBUF_LEN); + len = fill(zbuf, GZIP_IOBUF_SIZE); /* verify the gzip header */ if (len < 10 || @@ -113,7 +113,7 @@ STATIC int INIT gunzip(unsigned char *buf, int len, while (rc == Z_OK) { if (strm->avail_in == 0) { /* TODO: handle case where both pos and fill are set */ - len = fill(zbuf, INBUF_LEN); + len = fill(zbuf, GZIP_IOBUF_SIZE); if (len < 0) { rc = -1; error("read error"); -- cgit v1.2.3-59-g8ed1b From 3a6593050fbd8bbcaed3a44d01c31d907315c86c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 21 Jul 2009 17:34:57 +0200 Subject: perf_counter, ftrace: Fix perf_counter integration Adds possible second part to the assign argument of TP_EVENT(). TP_perf_assign( __perf_count(foo); __perf_addr(bar); ) Which, when specified make the swcounter increment with @foo instead of the usual 1, and report @bar for PERF_SAMPLE_ADDR (data address associated with the event) when this triggers a counter overflow. Signed-off-by: Peter Zijlstra Acked-by: Steven Rostedt Cc: Frederic Weisbecker Cc: Jason Baron Cc: Paul Mackerras Signed-off-by: Ingo Molnar --- include/trace/ftrace.h | 110 ++++++++++++++++++++++++++++++++++++++----------- kernel/perf_counter.c | 6 +-- 2 files changed, 88 insertions(+), 28 deletions(-) (limited to 'include') diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 1867553c61e5..fec71f8dbc48 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -144,6 +144,9 @@ #undef TP_fast_assign #define TP_fast_assign(args...) args +#undef TP_perf_assign +#define TP_perf_assign(args...) + #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, func, print) \ static int \ @@ -345,6 +348,88 @@ static inline int ftrace_get_offsets_##call( \ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) +#ifdef CONFIG_EVENT_PROFILE + +/* + * Generate the functions needed for tracepoint perf_counter support. + * + * static void ftrace_profile_(proto) + * { + * extern void perf_tpcounter_event(int, u64, u64); + * u64 __addr = 0, __count = 1; + * + * <-- here we expand the TP_perf_assign() macro + * + * perf_tpcounter_event(event_.id, __addr, __count); + * } + * + * static int ftrace_profile_enable_(struct ftrace_event_call *event_call) + * { + * int ret = 0; + * + * if (!atomic_inc_return(&event_call->profile_count)) + * ret = register_trace_(ftrace_profile_); + * + * return ret; + * } + * + * static void ftrace_profile_disable_(struct ftrace_event_call *event_call) + * { + * if (atomic_add_negative(-1, &event->call->profile_count)) + * unregister_trace_(ftrace_profile_); + * } + * + */ + +#undef TP_fast_assign +#define TP_fast_assign(args...) + +#undef TP_perf_assign +#define TP_perf_assign(args...) args + +#undef __perf_addr +#define __perf_addr(a) __addr = (a) + +#undef __perf_count +#define __perf_count(c) __count = (c) + +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ + \ +static void ftrace_profile_##call(proto) \ +{ \ + extern void perf_tpcounter_event(int, u64, u64); \ + u64 __addr = 0, __count = 1; \ + { assign; } \ + perf_tpcounter_event(event_##call.id, __addr, __count); \ +} \ + \ +static int ftrace_profile_enable_##call(struct ftrace_event_call *event_call) \ +{ \ + int ret = 0; \ + \ + if (!atomic_inc_return(&event_call->profile_count)) \ + ret = register_trace_##call(ftrace_profile_##call); \ + \ + return ret; \ +} \ + \ +static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\ +{ \ + if (atomic_add_negative(-1, &event_call->profile_count)) \ + unregister_trace_##call(ftrace_profile_##call); \ +} + +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + +#undef TP_fast_assign +#define TP_fast_assign(args...) args + +#undef TP_perf_assign +#define TP_perf_assign(args...) + +#endif + /* * Stage 4 of the trace events. * @@ -447,28 +532,6 @@ static inline int ftrace_get_offsets_##call( \ #define TP_FMT(fmt, args...) fmt "\n", ##args #ifdef CONFIG_EVENT_PROFILE -#define _TRACE_PROFILE(call, proto, args) \ -static void ftrace_profile_##call(proto) \ -{ \ - extern void perf_tpcounter_event(int); \ - perf_tpcounter_event(event_##call.id); \ -} \ - \ -static int ftrace_profile_enable_##call(struct ftrace_event_call *event_call) \ -{ \ - int ret = 0; \ - \ - if (!atomic_inc_return(&event_call->profile_count)) \ - ret = register_trace_##call(ftrace_profile_##call); \ - \ - return ret; \ -} \ - \ -static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\ -{ \ - if (atomic_add_negative(-1, &event_call->profile_count)) \ - unregister_trace_##call(ftrace_profile_##call); \ -} #define _TRACE_PROFILE_INIT(call) \ .profile_count = ATOMIC_INIT(-1), \ @@ -476,7 +539,6 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\ .profile_disable = ftrace_profile_disable_##call, #else -#define _TRACE_PROFILE(call, proto, args) #define _TRACE_PROFILE_INIT(call) #endif @@ -502,7 +564,6 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\ #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ -_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \ \ static struct ftrace_event_call event_##call; \ \ @@ -586,6 +647,5 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) -#undef _TRACE_PROFILE #undef _TRACE_PROFILE_INIT diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 673c1aaf7332..52eb4b68d34f 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3703,17 +3703,17 @@ static const struct pmu perf_ops_task_clock = { }; #ifdef CONFIG_EVENT_PROFILE -void perf_tpcounter_event(int event_id) +void perf_tpcounter_event(int event_id, u64 addr, u64 count) { struct perf_sample_data data = { .regs = get_irq_regs(), - .addr = 0, + .addr = addr, }; if (!data.regs) data.regs = task_pt_regs(current); - do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, &data); + do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data); } EXPORT_SYMBOL_GPL(perf_tpcounter_event); -- cgit v1.2.3-59-g8ed1b From f413cdb80ce00ec1a4d0ab949b5d96c81cae7f75 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 7 Aug 2009 01:25:54 +0200 Subject: perf_counter: Fix/complete ftrace event records sampling This patch implements the kernel side support for ftrace event record sampling. A new counter sampling attribute is added: PERF_SAMPLE_TP_RECORD which requests ftrace events record sampling. In this case if a PERF_TYPE_TRACEPOINT counter is active and a tracepoint fires, we emit the tracepoint binary record to the perfcounter event buffer, as a sample. Result, after setting PERF_SAMPLE_TP_RECORD attribute from perf record: perf record -f -F 1 -a -e workqueue:workqueue_execution perf report -D 0x21e18 [0x48]: event: 9 . . ... raw event: size 72 bytes . 0000: 09 00 00 00 01 00 48 00 d0 c7 00 81 ff ff ff ff ......H........ . 0010: 0a 00 00 00 0a 00 00 00 21 00 00 00 00 00 00 00 ........!...... . 0020: 2b 00 01 02 0a 00 00 00 0a 00 00 00 65 76 65 6e +...........eve . 0030: 74 73 2f 31 00 00 00 00 00 00 00 00 0a 00 00 00 ts/1........... . 0040: e0 b1 31 81 ff ff ff ff ....... . 0x21e18 [0x48]: PERF_EVENT_SAMPLE (IP, 1): 10: 0xffffffff8100c7d0 period: 33 The raw ftrace binary record starts at offset 0020. Translation: struct trace_entry { type = 0x2b = 43; flags = 1; preempt_count = 2; pid = 0xa = 10; tgid = 0xa = 10; } thread_comm = "events/1" thread_pid = 0xa = 10; func = 0xffffffff8131b1e0 = flush_to_ldisc() What will come next? - Userspace support ('perf trace'), 'flight data recorder' mode for perf trace, etc. - The unconditional copy from the profiling callback brings some costs however if someone wants no such sampling to occur, and needs to be fixed in the future. For that we need to have an instant access to the perf counter attribute. This is a matter of a flag to add in the struct ftrace_event. - Take care of the events recursivity! Don't ever try to record a lock event for example, it seems some locking is used in the profiling fast path and lead to a tracing recursivity. That will be fixed using raw spinlock or recursivity protection. - [...] - Profit! :-) Signed-off-by: Frederic Weisbecker Cc: Li Zefan Cc: Tom Zanussi Cc: Arnaldo Carvalho de Melo Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Steven Rostedt Cc: Paul Mackerras Cc: Pekka Enberg Cc: Gabriel Munteanu Cc: Lai Jiangshan Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 4 +- include/linux/perf_counter.h | 9 ++- include/trace/ftrace.h | 130 ++++++++++++++++++++++++++++++++----------- kernel/perf_counter.c | 18 +++++- kernel/trace/trace.c | 1 + kernel/trace/trace.h | 4 -- tools/perf/builtin-record.c | 1 + 7 files changed, 126 insertions(+), 41 deletions(-) (limited to 'include') diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index d7cd193c2277..a81170de7f6b 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -89,7 +89,9 @@ enum print_line_t { TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */ }; - +void tracing_generic_entry_update(struct trace_entry *entry, + unsigned long flags, + int pc); struct ring_buffer_event * trace_current_buffer_lock_reserve(int type, unsigned long len, unsigned long flags, int pc); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index e604e6ef72dd..a67dd5c5b6d3 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -121,8 +121,9 @@ enum perf_counter_sample_format { PERF_SAMPLE_CPU = 1U << 7, PERF_SAMPLE_PERIOD = 1U << 8, PERF_SAMPLE_STREAM_ID = 1U << 9, + PERF_SAMPLE_TP_RECORD = 1U << 10, - PERF_SAMPLE_MAX = 1U << 10, /* non-ABI */ + PERF_SAMPLE_MAX = 1U << 11, /* non-ABI */ }; /* @@ -413,6 +414,11 @@ struct perf_callchain_entry { __u64 ip[PERF_MAX_STACK_DEPTH]; }; +struct perf_tracepoint_record { + int size; + char *record; +}; + struct task_struct; /** @@ -681,6 +687,7 @@ struct perf_sample_data { struct pt_regs *regs; u64 addr; u64 period; + void *private; }; extern int perf_counter_overflow(struct perf_counter *counter, int nmi, diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index fec71f8dbc48..7fb16d90e7b1 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -353,15 +353,7 @@ static inline int ftrace_get_offsets_##call( \ /* * Generate the functions needed for tracepoint perf_counter support. * - * static void ftrace_profile_(proto) - * { - * extern void perf_tpcounter_event(int, u64, u64); - * u64 __addr = 0, __count = 1; - * - * <-- here we expand the TP_perf_assign() macro - * - * perf_tpcounter_event(event_.id, __addr, __count); - * } + * NOTE: The insertion profile callback (ftrace_profile_) is defined later * * static int ftrace_profile_enable_(struct ftrace_event_call *event_call) * { @@ -381,28 +373,10 @@ static inline int ftrace_get_offsets_##call( \ * */ -#undef TP_fast_assign -#define TP_fast_assign(args...) - -#undef TP_perf_assign -#define TP_perf_assign(args...) args - -#undef __perf_addr -#define __perf_addr(a) __addr = (a) - -#undef __perf_count -#define __perf_count(c) __count = (c) - #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ \ -static void ftrace_profile_##call(proto) \ -{ \ - extern void perf_tpcounter_event(int, u64, u64); \ - u64 __addr = 0, __count = 1; \ - { assign; } \ - perf_tpcounter_event(event_##call.id, __addr, __count); \ -} \ +static void ftrace_profile_##call(proto); \ \ static int ftrace_profile_enable_##call(struct ftrace_event_call *event_call) \ { \ @@ -422,12 +396,6 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) -#undef TP_fast_assign -#define TP_fast_assign(args...) args - -#undef TP_perf_assign -#define TP_perf_assign(args...) - #endif /* @@ -647,5 +615,99 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) +/* + * Define the insertion callback to profile events + * + * The job is very similar to ftrace_raw_event_ except that we don't + * insert in the ring buffer but in a perf counter. + * + * static void ftrace_profile_(proto) + * { + * struct ftrace_data_offsets_ __maybe_unused __data_offsets; + * struct ftrace_event_call *event_call = &event_; + * extern void perf_tpcounter_event(int, u64, u64, void *, int); + * struct ftrace_raw_##call *entry; + * u64 __addr = 0, __count = 1; + * unsigned long irq_flags; + * int __entry_size; + * int __data_size; + * int pc; + * + * local_save_flags(irq_flags); + * pc = preempt_count(); + * + * __data_size = ftrace_get_offsets_(&__data_offsets, args); + * __entry_size = __data_size + sizeof(*entry); + * + * do { + * char raw_data[__entry_size]; <- allocate our sample in the stack + * struct trace_entry *ent; + * + * entry = (struct ftrace_raw_ *)raw_data; + * ent = &entry->ent; + * tracing_generic_entry_update(ent, irq_flags, pc); + * ent->type = event_call->id; + * + * <- do some jobs with dynamic arrays + * + * <- affect our values + * + * perf_tpcounter_event(event_call->id, __addr, __count, entry, + * __entry_size); <- submit them to perf counter + * } while (0); + * + * } + */ + +#ifdef CONFIG_EVENT_PROFILE + +#undef __perf_addr +#define __perf_addr(a) __addr = (a) + +#undef __perf_count +#define __perf_count(c) __count = (c) + +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ +static void ftrace_profile_##call(proto) \ +{ \ + struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\ + struct ftrace_event_call *event_call = &event_##call; \ + extern void perf_tpcounter_event(int, u64, u64, void *, int); \ + struct ftrace_raw_##call *entry; \ + u64 __addr = 0, __count = 1; \ + unsigned long irq_flags; \ + int __entry_size; \ + int __data_size; \ + int pc; \ + \ + local_save_flags(irq_flags); \ + pc = preempt_count(); \ + \ + __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \ + __entry_size = ALIGN(__data_size + sizeof(*entry), sizeof(u64));\ + \ + do { \ + char raw_data[__entry_size]; \ + struct trace_entry *ent; \ + \ + entry = (struct ftrace_raw_##call *)raw_data; \ + ent = &entry->ent; \ + tracing_generic_entry_update(ent, irq_flags, pc); \ + ent->type = event_call->id; \ + \ + tstruct \ + \ + { assign; } \ + \ + perf_tpcounter_event(event_call->id, __addr, __count, entry,\ + __entry_size); \ + } while (0); \ + \ +} + +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) +#endif /* CONFIG_EVENT_PROFILE */ + #undef _TRACE_PROFILE_INIT diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 52eb4b68d34f..868102172aa4 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2646,6 +2646,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, u64 counter; } group_entry; struct perf_callchain_entry *callchain = NULL; + struct perf_tracepoint_record *tp; int callchain_size = 0; u64 time; struct { @@ -2714,6 +2715,11 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, header.size += sizeof(u64); } + if (sample_type & PERF_SAMPLE_TP_RECORD) { + tp = data->private; + header.size += tp->size; + } + ret = perf_output_begin(&handle, counter, header.size, nmi, 1); if (ret) return; @@ -2777,6 +2783,9 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } } + if (sample_type & PERF_SAMPLE_TP_RECORD) + perf_output_copy(&handle, tp->record, tp->size); + perf_output_end(&handle); } @@ -3703,11 +3712,18 @@ static const struct pmu perf_ops_task_clock = { }; #ifdef CONFIG_EVENT_PROFILE -void perf_tpcounter_event(int event_id, u64 addr, u64 count) +void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record, + int entry_size) { + struct perf_tracepoint_record tp = { + .size = entry_size, + .record = record, + }; + struct perf_sample_data data = { .regs = get_irq_regs(), .addr = addr, + .private = &tp, }; if (!data.regs) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8930e39b9d8c..c22b40f8f576 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -848,6 +848,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); } +EXPORT_SYMBOL_GPL(tracing_generic_entry_update); struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, int type, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3548ae5cc780..8b9f4f6e9559 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -438,10 +438,6 @@ struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts); -void tracing_generic_entry_update(struct trace_entry *entry, - unsigned long flags, - int pc); - void default_wait_pipe(struct trace_iterator *iter); void poll_wait_pipe(struct trace_iterator *iter); diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 6da09928130f..90c98082af10 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -412,6 +412,7 @@ static void create_counter(int counter, int cpu, pid_t pid) if (call_graph) attr->sample_type |= PERF_SAMPLE_CALLCHAIN; + attr->mmap = track; attr->comm = track; attr->inherit = (cpu < 0) && inherit; -- cgit v1.2.3-59-g8ed1b From 3a43ce68ae1758fa6a839386025ef45acb6baa22 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 Aug 2009 04:26:37 +0200 Subject: perf_counter: Fix tracepoint sampling to be part of generic sampling Based on Peter's comments, make tracepoint sampling generic just like all the other sampling bits are. This is a rename with no code changes: - PERF_SAMPLE_TP_RECORD to PERF_SAMPLE_RAW - struct perf_tracepoint_record to perf_raw_record We want the system in place that transport tracepoints raw samples events into the perf ring buffer to be generalized and usable by any type of counter. Reported-by; Peter Zijlstra Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith Cc: Paul Mackerras LKML-Reference: <1249698400-5441-4-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 10 +++++----- kernel/perf_counter.c | 20 ++++++++++---------- 2 files changed, 15 insertions(+), 15 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index a67dd5c5b6d3..2aabe43c1d04 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -121,7 +121,7 @@ enum perf_counter_sample_format { PERF_SAMPLE_CPU = 1U << 7, PERF_SAMPLE_PERIOD = 1U << 8, PERF_SAMPLE_STREAM_ID = 1U << 9, - PERF_SAMPLE_TP_RECORD = 1U << 10, + PERF_SAMPLE_RAW = 1U << 10, PERF_SAMPLE_MAX = 1U << 11, /* non-ABI */ }; @@ -414,9 +414,9 @@ struct perf_callchain_entry { __u64 ip[PERF_MAX_STACK_DEPTH]; }; -struct perf_tracepoint_record { - int size; - char *record; +struct perf_raw_record { + u32 size; + void *data; }; struct task_struct; @@ -687,7 +687,7 @@ struct perf_sample_data { struct pt_regs *regs; u64 addr; u64 period; - void *private; + struct perf_raw_record *raw; }; extern int perf_counter_overflow(struct perf_counter *counter, int nmi, diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 117622cb73a3..002310540417 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2646,7 +2646,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, u64 counter; } group_entry; struct perf_callchain_entry *callchain = NULL; - struct perf_tracepoint_record *tp = NULL; + struct perf_raw_record *raw = NULL; int callchain_size = 0; u64 time; struct { @@ -2715,10 +2715,10 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, header.size += sizeof(u64); } - if (sample_type & PERF_SAMPLE_TP_RECORD) { - tp = data->private; - if (tp) - header.size += tp->size; + if (sample_type & PERF_SAMPLE_RAW) { + raw = data->raw; + if (raw) + header.size += raw->size; } ret = perf_output_begin(&handle, counter, header.size, nmi, 1); @@ -2784,8 +2784,8 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } } - if ((sample_type & PERF_SAMPLE_TP_RECORD) && tp) - perf_output_copy(&handle, tp->record, tp->size); + if ((sample_type & PERF_SAMPLE_RAW) && raw) + perf_output_copy(&handle, raw->data, raw->size); perf_output_end(&handle); } @@ -3740,15 +3740,15 @@ static const struct pmu perf_ops_task_clock = { void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record, int entry_size) { - struct perf_tracepoint_record tp = { + struct perf_raw_record raw = { .size = entry_size, - .record = record, + .data = record, }; struct perf_sample_data data = { .regs = get_irq_regs(), .addr = addr, - .private = &tp, + .raw = &raw, }; if (!data.regs) -- cgit v1.2.3-59-g8ed1b From a044560c3a1f0ad75ce685c1ed7604820b9ed319 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 10 Aug 2009 11:16:52 +0200 Subject: perf_counter: Correct PERF_SAMPLE_RAW output PERF_SAMPLE_* output switches should unconditionally output the correct format, as they are the only way to unambiguously parse the PERF_EVENT_SAMPLE data. Signed-off-by: Peter Zijlstra Acked-by: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith Cc: Paul Mackerras LKML-Reference: <1249896447.17467.74.camel@twins> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ include/trace/ftrace.h | 3 ++- kernel/perf_counter.c | 30 ++++++++++++++++++++++++------ 3 files changed, 28 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 2aabe43c1d04..a9d823a93fe8 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -369,6 +369,8 @@ enum perf_event_type { * * { u64 nr, * u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN + * { u32 size; + * char data[size];}&& PERF_SAMPLE_RAW * }; */ PERF_EVENT_SAMPLE = 9, diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 7fb16d90e7b1..7167b9b97da2 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -685,7 +685,8 @@ static void ftrace_profile_##call(proto) \ pc = preempt_count(); \ \ __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \ - __entry_size = ALIGN(__data_size + sizeof(*entry), sizeof(u64));\ + __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\ + sizeof(u64)); \ \ do { \ char raw_data[__entry_size]; \ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 546e62d62941..5229d1666fa5 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2646,7 +2646,6 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, u64 counter; } group_entry; struct perf_callchain_entry *callchain = NULL; - struct perf_raw_record *raw = NULL; int callchain_size = 0; u64 time; struct { @@ -2716,9 +2715,15 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } if (sample_type & PERF_SAMPLE_RAW) { - raw = data->raw; - if (raw) - header.size += raw->size; + int size = sizeof(u32); + + if (data->raw) + size += data->raw->size; + else + size += sizeof(u32); + + WARN_ON_ONCE(size & (sizeof(u64)-1)); + header.size += size; } ret = perf_output_begin(&handle, counter, header.size, nmi, 1); @@ -2784,8 +2789,21 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } } - if ((sample_type & PERF_SAMPLE_RAW) && raw) - perf_output_copy(&handle, raw->data, raw->size); + if (sample_type & PERF_SAMPLE_RAW) { + if (data->raw) { + perf_output_put(&handle, data->raw->size); + perf_output_copy(&handle, data->raw->data, data->raw->size); + } else { + struct { + u32 size; + u32 data; + } raw = { + .size = sizeof(u32), + .data = 0, + }; + perf_output_put(&handle, raw); + } + } perf_output_end(&handle); } -- cgit v1.2.3-59-g8ed1b From 2fc391112fb6f3424435a3aa2fda887497b5f807 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 10 Aug 2009 12:33:05 +0100 Subject: locking, sched: Give waitqueue spinlocks their own lockdep classes Give waitqueue spinlocks their own lockdep classes when they are initialised from init_waitqueue_head(). This means that struct wait_queue::func functions can operate other waitqueues. This is used by CacheFiles to catch the page from a backing fs being unlocked and to wake up another thread to take a copy of it. Signed-off-by: Peter Zijlstra Signed-off-by: David Howells Tested-by: Takashi Iwai Cc: linux-cachefs@redhat.com Cc: torvalds@osdl.org Cc: akpm@linux-foundation.org LKML-Reference: <20090810113305.17284.81508.stgit@warthog.procyon.org.uk> Signed-off-by: Ingo Molnar --- include/linux/wait.h | 9 ++++++++- kernel/wait.c | 5 +++-- 2 files changed, 11 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/wait.h b/include/linux/wait.h index 6788e1a4d4ca..cf3c2f5dba51 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -77,7 +77,14 @@ struct task_struct; #define __WAIT_BIT_KEY_INITIALIZER(word, bit) \ { .flags = word, .bit_nr = bit, } -extern void init_waitqueue_head(wait_queue_head_t *q); +extern void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *); + +#define init_waitqueue_head(q) \ + do { \ + static struct lock_class_key __key; \ + \ + __init_waitqueue_head((q), &__key); \ + } while (0) #ifdef CONFIG_LOCKDEP # define __WAIT_QUEUE_HEAD_INIT_ONSTACK(name) \ diff --git a/kernel/wait.c b/kernel/wait.c index ea7c3b4275cf..c4bd3d825f35 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -10,13 +10,14 @@ #include #include -void init_waitqueue_head(wait_queue_head_t *q) +void __init_waitqueue_head(wait_queue_head_t *q, struct lock_class_key *key) { spin_lock_init(&q->lock); + lockdep_set_class(&q->lock, key); INIT_LIST_HEAD(&q->task_list); } -EXPORT_SYMBOL(init_waitqueue_head); +EXPORT_SYMBOL(__init_waitqueue_head); void add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) { -- cgit v1.2.3-59-g8ed1b From 304703aba31a87903b8c0db8f5e6890cac2d596d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 10 Aug 2009 16:11:32 +0200 Subject: perf_counter: Subtract the buffer size field from the event record size We compute the perf raw sample size by aligning the raw ftrace event size plus the buffer size field itself. We do that instead of aligning only the perf raw sample size, so that we might economize some in some cases. But this buffer size field is not stored in the perf raw sample, we must then substract its size from the buffer once we computed the alignment unless we may get a useless u32 field in the buffer. Signed-off-by: Frederic Weisbecker Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith Cc: Paul Mackerras LKML-Reference: <20090810141129.GA5124@nowhere> Signed-off-by: Ingo Molnar --- include/trace/ftrace.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 7167b9b97da2..a05524fa245d 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -637,7 +637,12 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ * pc = preempt_count(); * * __data_size = ftrace_get_offsets_(&__data_offsets, args); - * __entry_size = __data_size + sizeof(*entry); + * + * // Below we want to get the aligned size by taking into account + * // the u32 field that will later store the buffer size + * __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32), + * sizeof(u64)); + * __entry_size -= sizeof(u32); * * do { * char raw_data[__entry_size]; <- allocate our sample in the stack @@ -687,6 +692,7 @@ static void ftrace_profile_##call(proto) \ __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \ __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\ sizeof(u64)); \ + __entry_size -= sizeof(u32); \ \ do { \ char raw_data[__entry_size]; \ -- cgit v1.2.3-59-g8ed1b From 1853db0e02ae4088f102b0d8e59e83dc98f93f03 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 10 Aug 2009 16:38:36 +0200 Subject: perf_counter: Zero dead bytes from ftrace raw samples size alignment After aligning the ftrace raw samples, there are dead bytes storing random data from the stack. We don't want to leak these to userspace, then zero these out. Before: 0x2de88 [0x50]: event: 9 . . ... raw event: size 80 bytes . 0000: 09 00 00 00 01 00 50 00 d0 c7 00 81 ff ff ff ff ......P........ . 0010: 68 01 00 00 68 01 00 00 2c 00 00 00 00 00 00 00 h...h...,...... . 0020: 2c 00 00 00 2b 00 01 02 68 01 00 00 68 01 00 00 ,...+...h...h.. . 0030: 6b 6f 6e 64 65 6d 61 6e 64 2f 30 00 00 00 00 00 kondemand/0.... . 0040: 68 01 00 00 40 7f 46 81 ff ff ff ff 00 10 1b 7f h...@.F........ ^ ^ ^ ^ Leak After: 0x2d318 [0x50]: event: 9 . . ... raw event: size 80 bytes . 0000: 09 00 00 00 01 00 50 00 d0 c7 00 81 ff ff ff ff ......P........ . 0010: 68 01 00 00 68 01 00 00 68 14 00 00 00 00 00 00 h...h...h...... . 0020: 2c 00 00 00 2b 00 01 02 68 01 00 00 68 01 00 00 ,...+...h...h.. . 0030: 6b 6f 6e 64 65 6d 61 6e 64 2f 30 00 00 00 00 00 kondemand/0.... . 0040: 68 01 00 00 a0 80 46 81 ff ff ff ff 00 00 00 00 h.....F........ ^ ^ ^ ^ Fixed Reported-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith LKML-Reference: <1249915116-5210-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith --- include/trace/ftrace.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index a05524fa245d..f64fbaae781a 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -648,6 +648,9 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ * char raw_data[__entry_size]; <- allocate our sample in the stack * struct trace_entry *ent; * + * zero dead bytes from alignment to avoid stack leak to userspace: + * + * *(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL; * entry = (struct ftrace_raw_ *)raw_data; * ent = &entry->ent; * tracing_generic_entry_update(ent, irq_flags, pc); @@ -698,6 +701,7 @@ static void ftrace_profile_##call(proto) \ char raw_data[__entry_size]; \ struct trace_entry *ent; \ \ + *(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL; \ entry = (struct ftrace_raw_##call *)raw_data; \ ent = &entry->ent; \ tracing_generic_entry_update(ent, irq_flags, pc); \ -- cgit v1.2.3-59-g8ed1b From 1ae88b2e446261c038f2c0c3150ffae142b227a2 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 12 Aug 2009 09:12:30 -0400 Subject: NFS: Fix an O_DIRECT Oops... We can't call nfs_readdata_release()/nfs_writedata_release() without first initialising and referencing args.context. Doing so inside nfs_direct_read_schedule_segment()/nfs_direct_write_schedule_segment() causes an Oops. We should rather be calling nfs_readdata_free()/nfs_writedata_free() in those cases. Looking at the O_DIRECT code, the "struct nfs_direct_req" is already referencing the nfs_open_context for us. Since the readdata and writedata structures carry a reference to that, we can simplify things by getting rid of the extra nfs_open_context references, so that we can replace all instances of nfs_readdata_release()/nfs_writedata_release(). Reported-by: Catalin Marinas Signed-off-by: Trond Myklebust Tested-by: Catalin Marinas Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/nfs/direct.c | 20 ++++++++++---------- fs/nfs/read.c | 6 ++---- fs/nfs/write.c | 6 ++---- include/linux/nfs_fs.h | 5 ++--- 4 files changed, 16 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 489fc01a3204..e4e089a8f294 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -255,7 +255,7 @@ static void nfs_direct_read_release(void *calldata) if (put_dreq(dreq)) nfs_direct_complete(dreq); - nfs_readdata_release(calldata); + nfs_readdata_free(data); } static const struct rpc_call_ops nfs_read_direct_ops = { @@ -314,14 +314,14 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq, data->npages, 1, 0, data->pagevec, NULL); up_read(¤t->mm->mmap_sem); if (result < 0) { - nfs_readdata_release(data); + nfs_readdata_free(data); break; } if ((unsigned)result < data->npages) { bytes = result * PAGE_SIZE; if (bytes <= pgbase) { nfs_direct_release_pages(data->pagevec, result); - nfs_readdata_release(data); + nfs_readdata_free(data); break; } bytes -= pgbase; @@ -334,7 +334,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq, data->inode = inode; data->cred = msg.rpc_cred; data->args.fh = NFS_FH(inode); - data->args.context = get_nfs_open_context(ctx); + data->args.context = ctx; data->args.offset = pos; data->args.pgbase = pgbase; data->args.pages = data->pagevec; @@ -441,7 +441,7 @@ static void nfs_direct_free_writedata(struct nfs_direct_req *dreq) struct nfs_write_data *data = list_entry(dreq->rewrite_list.next, struct nfs_write_data, pages); list_del(&data->pages); nfs_direct_release_pages(data->pagevec, data->npages); - nfs_writedata_release(data); + nfs_writedata_free(data); } } @@ -534,7 +534,7 @@ static void nfs_direct_commit_release(void *calldata) dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status); nfs_direct_write_complete(dreq, data->inode); - nfs_commitdata_release(calldata); + nfs_commit_free(data); } static const struct rpc_call_ops nfs_commit_direct_ops = { @@ -570,7 +570,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) data->args.fh = NFS_FH(data->inode); data->args.offset = 0; data->args.count = 0; - data->args.context = get_nfs_open_context(dreq->ctx); + data->args.context = dreq->ctx; data->res.count = 0; data->res.fattr = &data->fattr; data->res.verf = &data->verf; @@ -734,14 +734,14 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq, data->npages, 0, 0, data->pagevec, NULL); up_read(¤t->mm->mmap_sem); if (result < 0) { - nfs_writedata_release(data); + nfs_writedata_free(data); break; } if ((unsigned)result < data->npages) { bytes = result * PAGE_SIZE; if (bytes <= pgbase) { nfs_direct_release_pages(data->pagevec, result); - nfs_writedata_release(data); + nfs_writedata_free(data); break; } bytes -= pgbase; @@ -756,7 +756,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq, data->inode = inode; data->cred = msg.rpc_cred; data->args.fh = NFS_FH(inode); - data->args.context = get_nfs_open_context(ctx); + data->args.context = ctx; data->args.offset = pos; data->args.pgbase = pgbase; data->args.pages = data->pagevec; diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 73ea5e8d66ce..12c9e66d3f1d 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -60,17 +60,15 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) return p; } -static void nfs_readdata_free(struct nfs_read_data *p) +void nfs_readdata_free(struct nfs_read_data *p) { if (p && (p->pagevec != &p->page_array[0])) kfree(p->pagevec); mempool_free(p, nfs_rdata_mempool); } -void nfs_readdata_release(void *data) +static void nfs_readdata_release(struct nfs_read_data *rdata) { - struct nfs_read_data *rdata = data; - put_nfs_open_context(rdata->args.context); nfs_readdata_free(rdata); } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 0a0a2ff767c3..a34fae21fe10 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -87,17 +87,15 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) return p; } -static void nfs_writedata_free(struct nfs_write_data *p) +void nfs_writedata_free(struct nfs_write_data *p) { if (p && (p->pagevec != &p->page_array[0])) kfree(p->pagevec); mempool_free(p, nfs_wdata_mempool); } -void nfs_writedata_release(void *data) +static void nfs_writedata_release(struct nfs_write_data *wdata) { - struct nfs_write_data *wdata = data; - put_nfs_open_context(wdata->args.context); nfs_writedata_free(wdata); } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index fdffb413b192..f6b90240dd41 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -473,7 +473,6 @@ extern int nfs_writepages(struct address_space *, struct writeback_control *); extern int nfs_flush_incompatible(struct file *file, struct page *page); extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int); extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *); -extern void nfs_writedata_release(void *); /* * Try to write back everything synchronously (but check the @@ -488,7 +487,6 @@ extern int nfs_wb_page_cancel(struct inode *inode, struct page* page); extern int nfs_commit_inode(struct inode *, int); extern struct nfs_write_data *nfs_commitdata_alloc(void); extern void nfs_commit_free(struct nfs_write_data *wdata); -extern void nfs_commitdata_release(void *wdata); #else static inline int nfs_commit_inode(struct inode *inode, int how) @@ -507,6 +505,7 @@ nfs_have_writebacks(struct inode *inode) * Allocate nfs_write_data structures */ extern struct nfs_write_data *nfs_writedata_alloc(unsigned int npages); +extern void nfs_writedata_free(struct nfs_write_data *); /* * linux/fs/nfs/read.c @@ -515,7 +514,6 @@ extern int nfs_readpage(struct file *, struct page *); extern int nfs_readpages(struct file *, struct address_space *, struct list_head *, unsigned); extern int nfs_readpage_result(struct rpc_task *, struct nfs_read_data *); -extern void nfs_readdata_release(void *data); extern int nfs_readpage_async(struct nfs_open_context *, struct inode *, struct page *); @@ -523,6 +521,7 @@ extern int nfs_readpage_async(struct nfs_open_context *, struct inode *, * Allocate nfs_read_data structures */ extern struct nfs_read_data *nfs_readdata_alloc(unsigned int npages); +extern void nfs_readdata_free(struct nfs_read_data *); /* * linux/fs/nfs3proc.c -- cgit v1.2.3-59-g8ed1b From 28402971d869e26271b25301011f667d3a5640c3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 13 Aug 2009 10:13:22 +0200 Subject: perf_counter: Provide hw_perf_counter_setup_online() APIs Provide weak aliases for hw_perf_counter_setup_online(). This is used by the BTS patches (for v2.6.32), but it interacts with fixes so propagate this upstream. (it has no effect as of yet) Also export perf_counter_output() to architecture code. Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ kernel/perf_counter.c | 10 +++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index a9d823a93fe8..2b36afe6ae0f 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -694,6 +694,8 @@ struct perf_sample_data { extern int perf_counter_overflow(struct perf_counter *counter, int nmi, struct perf_sample_data *data); +extern void perf_counter_output(struct perf_counter *counter, int nmi, + struct perf_sample_data *data); /* * Return 1 for a software counter, 0 for a hardware counter diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index b0b20a07f394..e26d2fcfa320 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -88,6 +88,7 @@ void __weak hw_perf_disable(void) { barrier(); } void __weak hw_perf_enable(void) { barrier(); } void __weak hw_perf_counter_setup(int cpu) { barrier(); } +void __weak hw_perf_counter_setup_online(int cpu) { barrier(); } int __weak hw_perf_group_sched_in(struct perf_counter *group_leader, @@ -2630,7 +2631,7 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p) return task_pid_nr_ns(p, counter->ns); } -static void perf_counter_output(struct perf_counter *counter, int nmi, +void perf_counter_output(struct perf_counter *counter, int nmi, struct perf_sample_data *data) { int ret; @@ -4592,6 +4593,11 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) perf_counter_init_cpu(cpu); break; + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + hw_perf_counter_setup_online(cpu); + break; + case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: perf_counter_exit_cpu(cpu); @@ -4616,6 +4622,8 @@ void __init perf_counter_init(void) { perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); + perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, + (void *)(long)smp_processor_id()); register_cpu_notifier(&perf_cpu_nb); } -- cgit v1.2.3-59-g8ed1b From 3dab77fb1bf89664bb1c9544607159dcab6f7d57 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 13 Aug 2009 11:47:53 +0200 Subject: perf: Rework/fix the whole read vs group stuff Replace PERF_SAMPLE_GROUP with PERF_SAMPLE_READ and introduce PERF_FORMAT_GROUP to deal with group reads in a more generic way. This allows you to get group reads out of read() as well. Signed-off-by: Peter Zijlstra Cc: Corey J Ashford Cc: Paul Mackerras Cc: stephane eranian LKML-Reference: <20090813103655.117411814@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 47 ++++++-- kernel/perf_counter.c | 274 +++++++++++++++++++++++++++++++------------ 2 files changed, 238 insertions(+), 83 deletions(-) (limited to 'include') diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 2b36afe6ae0f..b53f7006cc4e 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -115,7 +115,7 @@ enum perf_counter_sample_format { PERF_SAMPLE_TID = 1U << 1, PERF_SAMPLE_TIME = 1U << 2, PERF_SAMPLE_ADDR = 1U << 3, - PERF_SAMPLE_GROUP = 1U << 4, + PERF_SAMPLE_READ = 1U << 4, PERF_SAMPLE_CALLCHAIN = 1U << 5, PERF_SAMPLE_ID = 1U << 6, PERF_SAMPLE_CPU = 1U << 7, @@ -127,16 +127,32 @@ enum perf_counter_sample_format { }; /* - * Bits that can be set in attr.read_format to request that - * reads on the counter should return the indicated quantities, - * in increasing order of bit value, after the counter value. + * The format of the data returned by read() on a perf counter fd, + * as specified by attr.read_format: + * + * struct read_format { + * { u64 value; + * { u64 time_enabled; } && PERF_FORMAT_ENABLED + * { u64 time_running; } && PERF_FORMAT_RUNNING + * { u64 id; } && PERF_FORMAT_ID + * } && !PERF_FORMAT_GROUP + * + * { u64 nr; + * { u64 time_enabled; } && PERF_FORMAT_ENABLED + * { u64 time_running; } && PERF_FORMAT_RUNNING + * { u64 value; + * { u64 id; } && PERF_FORMAT_ID + * } cntr[nr]; + * } && PERF_FORMAT_GROUP + * }; */ enum perf_counter_read_format { PERF_FORMAT_TOTAL_TIME_ENABLED = 1U << 0, PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1, PERF_FORMAT_ID = 1U << 2, + PERF_FORMAT_GROUP = 1U << 3, - PERF_FORMAT_MAX = 1U << 3, /* non-ABI */ + PERF_FORMAT_MAX = 1U << 4, /* non-ABI */ }; #define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ @@ -343,10 +359,8 @@ enum perf_event_type { * struct { * struct perf_event_header header; * u32 pid, tid; - * u64 value; - * { u64 time_enabled; } && PERF_FORMAT_ENABLED - * { u64 time_running; } && PERF_FORMAT_RUNNING - * { u64 parent_id; } && PERF_FORMAT_ID + * + * struct read_format values; * }; */ PERF_EVENT_READ = 8, @@ -364,11 +378,22 @@ enum perf_event_type { * { u32 cpu, res; } && PERF_SAMPLE_CPU * { u64 period; } && PERF_SAMPLE_PERIOD * - * { u64 nr; - * { u64 id, val; } cnt[nr]; } && PERF_SAMPLE_GROUP + * { struct read_format values; } && PERF_SAMPLE_READ * * { u64 nr, * u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN + * + * # + * # The RAW record below is opaque data wrt the ABI + * # + * # That is, the ABI doesn't make any promises wrt to + * # the stability of its content, it may vary depending + * # on event, hardware, kernel version and phase of + * # the moon. + * # + * # In other words, PERF_SAMPLE_RAW contents are not an ABI. + * # + * * { u32 size; * char data[size];}&& PERF_SAMPLE_RAW * }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 3dd4339589a0..b8c6b97a20a3 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1692,7 +1692,32 @@ static int perf_release(struct inode *inode, struct file *file) return 0; } -static u64 perf_counter_read_tree(struct perf_counter *counter) +static int perf_counter_read_size(struct perf_counter *counter) +{ + int entry = sizeof(u64); /* value */ + int size = 0; + int nr = 1; + + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + size += sizeof(u64); + + if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + size += sizeof(u64); + + if (counter->attr.read_format & PERF_FORMAT_ID) + entry += sizeof(u64); + + if (counter->attr.read_format & PERF_FORMAT_GROUP) { + nr += counter->group_leader->nr_siblings; + size += sizeof(u64); + } + + size += entry * nr; + + return size; +} + +static u64 perf_counter_read_value(struct perf_counter *counter) { struct perf_counter *child; u64 total = 0; @@ -1704,14 +1729,96 @@ static u64 perf_counter_read_tree(struct perf_counter *counter) return total; } +static int perf_counter_read_entry(struct perf_counter *counter, + u64 read_format, char __user *buf) +{ + int n = 0, count = 0; + u64 values[2]; + + values[n++] = perf_counter_read_value(counter); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(counter); + + count = n * sizeof(u64); + + if (copy_to_user(buf, values, count)) + return -EFAULT; + + return count; +} + +static int perf_counter_read_group(struct perf_counter *counter, + u64 read_format, char __user *buf) +{ + struct perf_counter *leader = counter->group_leader, *sub; + int n = 0, size = 0, err = -EFAULT; + u64 values[3]; + + values[n++] = 1 + leader->nr_siblings; + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = leader->total_time_enabled + + atomic64_read(&leader->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = leader->total_time_running + + atomic64_read(&leader->child_total_time_running); + } + + size = n * sizeof(u64); + + if (copy_to_user(buf, values, size)) + return -EFAULT; + + err = perf_counter_read_entry(leader, read_format, buf + size); + if (err < 0) + return err; + + size += err; + + list_for_each_entry(sub, &leader->sibling_list, list_entry) { + err = perf_counter_read_entry(counter, read_format, + buf + size); + if (err < 0) + return err; + + size += err; + } + + return size; +} + +static int perf_counter_read_one(struct perf_counter *counter, + u64 read_format, char __user *buf) +{ + u64 values[4]; + int n = 0; + + values[n++] = perf_counter_read_value(counter); + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = counter->total_time_enabled + + atomic64_read(&counter->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = counter->total_time_running + + atomic64_read(&counter->child_total_time_running); + } + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(counter); + + if (copy_to_user(buf, values, n * sizeof(u64))) + return -EFAULT; + + return n * sizeof(u64); +} + /* * Read the performance counter - simple non blocking version for now */ static ssize_t perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) { - u64 values[4]; - int n; + u64 read_format = counter->attr.read_format; + int ret; /* * Return end-of-file for a read on a counter that is in @@ -1721,28 +1828,18 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) if (counter->state == PERF_COUNTER_STATE_ERROR) return 0; + if (count < perf_counter_read_size(counter)) + return -ENOSPC; + WARN_ON_ONCE(counter->ctx->parent_ctx); mutex_lock(&counter->child_mutex); - values[0] = perf_counter_read_tree(counter); - n = 1; - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) - values[n++] = counter->total_time_enabled + - atomic64_read(&counter->child_total_time_enabled); - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) - values[n++] = counter->total_time_running + - atomic64_read(&counter->child_total_time_running); - if (counter->attr.read_format & PERF_FORMAT_ID) - values[n++] = primary_counter_id(counter); + if (read_format & PERF_FORMAT_GROUP) + ret = perf_counter_read_group(counter, read_format, buf); + else + ret = perf_counter_read_one(counter, read_format, buf); mutex_unlock(&counter->child_mutex); - if (count < n * sizeof(u64)) - return -EINVAL; - count = n * sizeof(u64); - - if (copy_to_user(buf, values, count)) - return -EFAULT; - - return count; + return ret; } static ssize_t @@ -2631,6 +2728,79 @@ static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p) return task_pid_nr_ns(p, counter->ns); } +static void perf_output_read_one(struct perf_output_handle *handle, + struct perf_counter *counter) +{ + u64 read_format = counter->attr.read_format; + u64 values[4]; + int n = 0; + + values[n++] = atomic64_read(&counter->count); + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { + values[n++] = counter->total_time_enabled + + atomic64_read(&counter->child_total_time_enabled); + } + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { + values[n++] = counter->total_time_running + + atomic64_read(&counter->child_total_time_running); + } + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(counter); + + perf_output_copy(handle, values, n * sizeof(u64)); +} + +/* + * XXX PERF_FORMAT_GROUP vs inherited counters seems difficult. + */ +static void perf_output_read_group(struct perf_output_handle *handle, + struct perf_counter *counter) +{ + struct perf_counter *leader = counter->group_leader, *sub; + u64 read_format = counter->attr.read_format; + u64 values[5]; + int n = 0; + + values[n++] = 1 + leader->nr_siblings; + + if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + values[n++] = leader->total_time_enabled; + + if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + values[n++] = leader->total_time_running; + + if (leader != counter) + leader->pmu->read(leader); + + values[n++] = atomic64_read(&leader->count); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(leader); + + perf_output_copy(handle, values, n * sizeof(u64)); + + list_for_each_entry(sub, &leader->sibling_list, list_entry) { + n = 0; + + if (sub != counter) + sub->pmu->read(sub); + + values[n++] = atomic64_read(&sub->count); + if (read_format & PERF_FORMAT_ID) + values[n++] = primary_counter_id(sub); + + perf_output_copy(handle, values, n * sizeof(u64)); + } +} + +static void perf_output_read(struct perf_output_handle *handle, + struct perf_counter *counter) +{ + if (counter->attr.read_format & PERF_FORMAT_GROUP) + perf_output_read_group(handle, counter); + else + perf_output_read_one(handle, counter); +} + void perf_counter_output(struct perf_counter *counter, int nmi, struct perf_sample_data *data) { @@ -2642,10 +2812,6 @@ void perf_counter_output(struct perf_counter *counter, int nmi, struct { u32 pid, tid; } tid_entry; - struct { - u64 id; - u64 counter; - } group_entry; struct perf_callchain_entry *callchain = NULL; int callchain_size = 0; u64 time; @@ -2700,10 +2866,8 @@ void perf_counter_output(struct perf_counter *counter, int nmi, if (sample_type & PERF_SAMPLE_PERIOD) header.size += sizeof(u64); - if (sample_type & PERF_SAMPLE_GROUP) { - header.size += sizeof(u64) + - counter->nr_siblings * sizeof(group_entry); - } + if (sample_type & PERF_SAMPLE_READ) + header.size += perf_counter_read_size(counter); if (sample_type & PERF_SAMPLE_CALLCHAIN) { callchain = perf_callchain(data->regs); @@ -2760,26 +2924,8 @@ void perf_counter_output(struct perf_counter *counter, int nmi, if (sample_type & PERF_SAMPLE_PERIOD) perf_output_put(&handle, data->period); - /* - * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult. - */ - if (sample_type & PERF_SAMPLE_GROUP) { - struct perf_counter *leader, *sub; - u64 nr = counter->nr_siblings; - - perf_output_put(&handle, nr); - - leader = counter->group_leader; - list_for_each_entry(sub, &leader->sibling_list, list_entry) { - if (sub != counter) - sub->pmu->read(sub); - - group_entry.id = primary_counter_id(sub); - group_entry.counter = atomic64_read(&sub->count); - - perf_output_put(&handle, group_entry); - } - } + if (sample_type & PERF_SAMPLE_READ) + perf_output_read(&handle, counter); if (sample_type & PERF_SAMPLE_CALLCHAIN) { if (callchain) @@ -2818,8 +2964,6 @@ struct perf_read_event { u32 pid; u32 tid; - u64 value; - u64 format[3]; }; static void @@ -2831,34 +2975,20 @@ perf_counter_read_event(struct perf_counter *counter, .header = { .type = PERF_EVENT_READ, .misc = 0, - .size = sizeof(event) - sizeof(event.format), + .size = sizeof(event) + perf_counter_read_size(counter), }, .pid = perf_counter_pid(counter, task), .tid = perf_counter_tid(counter, task), - .value = atomic64_read(&counter->count), }; - int ret, i = 0; - - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) { - event.header.size += sizeof(u64); - event.format[i++] = counter->total_time_enabled; - } - - if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) { - event.header.size += sizeof(u64); - event.format[i++] = counter->total_time_running; - } - - if (counter->attr.read_format & PERF_FORMAT_ID) { - event.header.size += sizeof(u64); - event.format[i++] = primary_counter_id(counter); - } + int ret; ret = perf_output_begin(&handle, counter, event.header.size, 0, 0); if (ret) return; - perf_output_copy(&handle, &event, event.header.size); + perf_output_put(&handle, event); + perf_output_read(&handle, counter); + perf_output_end(&handle); } @@ -3921,9 +4051,9 @@ perf_counter_alloc(struct perf_counter_attr *attr, atomic64_set(&hwc->period_left, hwc->sample_period); /* - * we currently do not support PERF_SAMPLE_GROUP on inherited counters + * we currently do not support PERF_FORMAT_GROUP on inherited counters */ - if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP)) + if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) goto done; switch (attr->type) { -- cgit v1.2.3-59-g8ed1b