From db21733488f84a596faaad0d05430b3f51804692 Mon Sep 17 00:00:00 2001 From: Chris Leech Date: Sat, 17 Jun 2006 21:24:58 -0700 Subject: [I/OAT]: Setup the networking subsystem as a DMA client Attempts to allocate per-CPU DMA channels Signed-off-by: Chris Leech Signed-off-by: David S. Miller --- net/core/dev.c | 104 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 4fba549caf29..6bfa78c66c25 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -115,6 +115,7 @@ #include #include #include +#include /* * The list of packet types we will receive (as opposed to discard) @@ -148,6 +149,12 @@ static DEFINE_SPINLOCK(ptype_lock); static struct list_head ptype_base[16]; /* 16 way hashed list */ static struct list_head ptype_all; /* Taps */ +#ifdef CONFIG_NET_DMA +static struct dma_client *net_dma_client; +static unsigned int net_dma_count; +static spinlock_t net_dma_event_lock; +#endif + /* * The @dev_base list is protected by @dev_base_lock and the rtnl * semaphore. @@ -1846,6 +1853,19 @@ static void net_rx_action(struct softirq_action *h) } } out: +#ifdef CONFIG_NET_DMA + /* + * There may not be any more sk_buffs coming right now, so push + * any pending DMA copies to hardware + */ + if (net_dma_client) { + struct dma_chan *chan; + rcu_read_lock(); + list_for_each_entry_rcu(chan, &net_dma_client->channels, client_node) + dma_async_memcpy_issue_pending(chan); + rcu_read_unlock(); + } +#endif local_irq_enable(); return; @@ -3300,6 +3320,88 @@ static int dev_cpu_callback(struct notifier_block *nfb, } #endif /* CONFIG_HOTPLUG_CPU */ +#ifdef CONFIG_NET_DMA +/** + * net_dma_rebalance - + * This is called when the number of channels allocated to the net_dma_client + * changes. The net_dma_client tries to have one DMA channel per CPU. + */ +static void net_dma_rebalance(void) +{ + unsigned int cpu, i, n; + struct dma_chan *chan; + + lock_cpu_hotplug(); + + if (net_dma_count == 0) { + for_each_online_cpu(cpu) + rcu_assign_pointer(per_cpu(softnet_data.net_dma, cpu), NULL); + unlock_cpu_hotplug(); + return; + } + + i = 0; + cpu = first_cpu(cpu_online_map); + + rcu_read_lock(); + list_for_each_entry(chan, &net_dma_client->channels, client_node) { + n = ((num_online_cpus() / net_dma_count) + + (i < (num_online_cpus() % net_dma_count) ? 1 : 0)); + + while(n) { + per_cpu(softnet_data.net_dma, cpu) = chan; + cpu = next_cpu(cpu, cpu_online_map); + n--; + } + i++; + } + rcu_read_unlock(); + + unlock_cpu_hotplug(); +} + +/** + * netdev_dma_event - event callback for the net_dma_client + * @client: should always be net_dma_client + * @chan: + * @event: + */ +static void netdev_dma_event(struct dma_client *client, struct dma_chan *chan, + enum dma_event event) +{ + spin_lock(&net_dma_event_lock); + switch (event) { + case DMA_RESOURCE_ADDED: + net_dma_count++; + net_dma_rebalance(); + break; + case DMA_RESOURCE_REMOVED: + net_dma_count--; + net_dma_rebalance(); + break; + default: + break; + } + spin_unlock(&net_dma_event_lock); +} + +/** + * netdev_dma_regiser - register the networking subsystem as a DMA client + */ +static int __init netdev_dma_register(void) +{ + spin_lock_init(&net_dma_event_lock); + net_dma_client = dma_async_client_register(netdev_dma_event); + if (net_dma_client == NULL) + return -ENOMEM; + + dma_async_client_chan_request(net_dma_client, num_online_cpus()); + return 0; +} + +#else +static int __init netdev_dma_register(void) { return -ENODEV; } +#endif /* CONFIG_NET_DMA */ /* * Initialize the DEV module. At boot time this walks the device list and @@ -3353,6 +3455,8 @@ static int __init net_dev_init(void) atomic_set(&queue->backlog_dev.refcnt, 1); } + netdev_dma_register(); + dev_boot_phase = 0; open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL); -- cgit v1.2.3-59-g8ed1b From de5506e155276d385712c2aa1c2d9a27cd4ed947 Mon Sep 17 00:00:00 2001 From: Chris Leech Date: Tue, 23 May 2006 17:50:37 -0700 Subject: [I/OAT]: Utility functions for offloading sk_buff to iovec copies Provides for pinning user space pages in memory, copying to iovecs, and copying from sk_buffs including fragmented and chained sk_buffs. Signed-off-by: Chris Leech Signed-off-by: David S. Miller --- drivers/dma/Makefile | 3 +- drivers/dma/iovlock.c | 301 ++++++++++++++++++++++++++++++++++++++++++++++ include/linux/dmaengine.h | 22 ++++ include/net/netdma.h | 6 + net/core/Makefile | 1 + net/core/user_dma.c | 127 +++++++++++++++++++ 6 files changed, 459 insertions(+), 1 deletion(-) create mode 100644 drivers/dma/iovlock.c create mode 100644 net/core/user_dma.c (limited to 'net') diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile index c8a5f5677313..bdcfdbdb1aec 100644 --- a/drivers/dma/Makefile +++ b/drivers/dma/Makefile @@ -1,2 +1,3 @@ -obj-y += dmaengine.o +obj-$(CONFIG_DMA_ENGINE) += dmaengine.o +obj-$(CONFIG_NET_DMA) += iovlock.o obj-$(CONFIG_INTEL_IOATDMA) += ioatdma.o diff --git a/drivers/dma/iovlock.c b/drivers/dma/iovlock.c new file mode 100644 index 000000000000..5ed327e453a2 --- /dev/null +++ b/drivers/dma/iovlock.c @@ -0,0 +1,301 @@ +/* + * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. + * Portions based on net/core/datagram.c and copyrighted by their authors. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. + */ + +/* + * This code allows the net stack to make use of a DMA engine for + * skb to iovec copies. + */ + +#include +#include +#include /* for memcpy_toiovec */ +#include +#include + +int num_pages_spanned(struct iovec *iov) +{ + return + ((PAGE_ALIGN((unsigned long)iov->iov_base + iov->iov_len) - + ((unsigned long)iov->iov_base & PAGE_MASK)) >> PAGE_SHIFT); +} + +/* + * Pin down all the iovec pages needed for len bytes. + * Return a struct dma_pinned_list to keep track of pages pinned down. + * + * We are allocating a single chunk of memory, and then carving it up into + * 3 sections, the latter 2 whose size depends on the number of iovecs and the + * total number of pages, respectively. + */ +struct dma_pinned_list *dma_pin_iovec_pages(struct iovec *iov, size_t len) +{ + struct dma_pinned_list *local_list; + struct page **pages; + int i; + int ret; + int nr_iovecs = 0; + int iovec_len_used = 0; + int iovec_pages_used = 0; + long err; + + /* don't pin down non-user-based iovecs */ + if (segment_eq(get_fs(), KERNEL_DS)) + return NULL; + + /* determine how many iovecs/pages there are, up front */ + do { + iovec_len_used += iov[nr_iovecs].iov_len; + iovec_pages_used += num_pages_spanned(&iov[nr_iovecs]); + nr_iovecs++; + } while (iovec_len_used < len); + + /* single kmalloc for pinned list, page_list[], and the page arrays */ + local_list = kmalloc(sizeof(*local_list) + + (nr_iovecs * sizeof (struct dma_page_list)) + + (iovec_pages_used * sizeof (struct page*)), GFP_KERNEL); + if (!local_list) { + err = -ENOMEM; + goto out; + } + + /* list of pages starts right after the page list array */ + pages = (struct page **) &local_list->page_list[nr_iovecs]; + + for (i = 0; i < nr_iovecs; i++) { + struct dma_page_list *page_list = &local_list->page_list[i]; + + len -= iov[i].iov_len; + + if (!access_ok(VERIFY_WRITE, iov[i].iov_base, iov[i].iov_len)) { + err = -EFAULT; + goto unpin; + } + + page_list->nr_pages = num_pages_spanned(&iov[i]); + page_list->base_address = iov[i].iov_base; + + page_list->pages = pages; + pages += page_list->nr_pages; + + /* pin pages down */ + down_read(¤t->mm->mmap_sem); + ret = get_user_pages( + current, + current->mm, + (unsigned long) iov[i].iov_base, + page_list->nr_pages, + 1, /* write */ + 0, /* force */ + page_list->pages, + NULL); + up_read(¤t->mm->mmap_sem); + + if (ret != page_list->nr_pages) { + err = -ENOMEM; + goto unpin; + } + + local_list->nr_iovecs = i + 1; + } + + return local_list; + +unpin: + dma_unpin_iovec_pages(local_list); +out: + return ERR_PTR(err); +} + +void dma_unpin_iovec_pages(struct dma_pinned_list *pinned_list) +{ + int i, j; + + if (!pinned_list) + return; + + for (i = 0; i < pinned_list->nr_iovecs; i++) { + struct dma_page_list *page_list = &pinned_list->page_list[i]; + for (j = 0; j < page_list->nr_pages; j++) { + set_page_dirty_lock(page_list->pages[j]); + page_cache_release(page_list->pages[j]); + } + } + + kfree(pinned_list); +} + +static dma_cookie_t dma_memcpy_to_kernel_iovec(struct dma_chan *chan, struct + iovec *iov, unsigned char *kdata, size_t len) +{ + dma_cookie_t dma_cookie = 0; + + while (len > 0) { + if (iov->iov_len) { + int copy = min_t(unsigned int, iov->iov_len, len); + dma_cookie = dma_async_memcpy_buf_to_buf( + chan, + iov->iov_base, + kdata, + copy); + kdata += copy; + len -= copy; + iov->iov_len -= copy; + iov->iov_base += copy; + } + iov++; + } + + return dma_cookie; +} + +/* + * We have already pinned down the pages we will be using in the iovecs. + * Each entry in iov array has corresponding entry in pinned_list->page_list. + * Using array indexing to keep iov[] and page_list[] in sync. + * Initial elements in iov array's iov->iov_len will be 0 if already copied into + * by another call. + * iov array length remaining guaranteed to be bigger than len. + */ +dma_cookie_t dma_memcpy_to_iovec(struct dma_chan *chan, struct iovec *iov, + struct dma_pinned_list *pinned_list, unsigned char *kdata, size_t len) +{ + int iov_byte_offset; + int copy; + dma_cookie_t dma_cookie = 0; + int iovec_idx; + int page_idx; + + if (!chan) + return memcpy_toiovec(iov, kdata, len); + + /* -> kernel copies (e.g. smbfs) */ + if (!pinned_list) + return dma_memcpy_to_kernel_iovec(chan, iov, kdata, len); + + iovec_idx = 0; + while (iovec_idx < pinned_list->nr_iovecs) { + struct dma_page_list *page_list; + + /* skip already used-up iovecs */ + while (!iov[iovec_idx].iov_len) + iovec_idx++; + + page_list = &pinned_list->page_list[iovec_idx]; + + iov_byte_offset = ((unsigned long)iov[iovec_idx].iov_base & ~PAGE_MASK); + page_idx = (((unsigned long)iov[iovec_idx].iov_base & PAGE_MASK) + - ((unsigned long)page_list->base_address & PAGE_MASK)) >> PAGE_SHIFT; + + /* break up copies to not cross page boundary */ + while (iov[iovec_idx].iov_len) { + copy = min_t(int, PAGE_SIZE - iov_byte_offset, len); + copy = min_t(int, copy, iov[iovec_idx].iov_len); + + dma_cookie = dma_async_memcpy_buf_to_pg(chan, + page_list->pages[page_idx], + iov_byte_offset, + kdata, + copy); + + len -= copy; + iov[iovec_idx].iov_len -= copy; + iov[iovec_idx].iov_base += copy; + + if (!len) + return dma_cookie; + + kdata += copy; + iov_byte_offset = 0; + page_idx++; + } + iovec_idx++; + } + + /* really bad if we ever run out of iovecs */ + BUG(); + return -EFAULT; +} + +dma_cookie_t dma_memcpy_pg_to_iovec(struct dma_chan *chan, struct iovec *iov, + struct dma_pinned_list *pinned_list, struct page *page, + unsigned int offset, size_t len) +{ + int iov_byte_offset; + int copy; + dma_cookie_t dma_cookie = 0; + int iovec_idx; + int page_idx; + int err; + + /* this needs as-yet-unimplemented buf-to-buff, so punt. */ + /* TODO: use dma for this */ + if (!chan || !pinned_list) { + u8 *vaddr = kmap(page); + err = memcpy_toiovec(iov, vaddr + offset, len); + kunmap(page); + return err; + } + + iovec_idx = 0; + while (iovec_idx < pinned_list->nr_iovecs) { + struct dma_page_list *page_list; + + /* skip already used-up iovecs */ + while (!iov[iovec_idx].iov_len) + iovec_idx++; + + page_list = &pinned_list->page_list[iovec_idx]; + + iov_byte_offset = ((unsigned long)iov[iovec_idx].iov_base & ~PAGE_MASK); + page_idx = (((unsigned long)iov[iovec_idx].iov_base & PAGE_MASK) + - ((unsigned long)page_list->base_address & PAGE_MASK)) >> PAGE_SHIFT; + + /* break up copies to not cross page boundary */ + while (iov[iovec_idx].iov_len) { + copy = min_t(int, PAGE_SIZE - iov_byte_offset, len); + copy = min_t(int, copy, iov[iovec_idx].iov_len); + + dma_cookie = dma_async_memcpy_pg_to_pg(chan, + page_list->pages[page_idx], + iov_byte_offset, + page, + offset, + copy); + + len -= copy; + iov[iovec_idx].iov_len -= copy; + iov[iovec_idx].iov_base += copy; + + if (!len) + return dma_cookie; + + offset += copy; + iov_byte_offset = 0; + page_idx++; + } + iovec_idx++; + } + + /* really bad if we ever run out of iovecs */ + BUG(); + return -EFAULT; +} diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index 30781546ac99..78b236ca04f8 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -333,5 +333,27 @@ static inline enum dma_status dma_async_is_complete(dma_cookie_t cookie, int dma_async_device_register(struct dma_device *device); void dma_async_device_unregister(struct dma_device *device); +/* --- Helper iov-locking functions --- */ + +struct dma_page_list { + char *base_address; + int nr_pages; + struct page **pages; +}; + +struct dma_pinned_list { + int nr_iovecs; + struct dma_page_list page_list[0]; +}; + +struct dma_pinned_list *dma_pin_iovec_pages(struct iovec *iov, size_t len); +void dma_unpin_iovec_pages(struct dma_pinned_list* pinned_list); + +dma_cookie_t dma_memcpy_to_iovec(struct dma_chan *chan, struct iovec *iov, + struct dma_pinned_list *pinned_list, unsigned char *kdata, size_t len); +dma_cookie_t dma_memcpy_pg_to_iovec(struct dma_chan *chan, struct iovec *iov, + struct dma_pinned_list *pinned_list, struct page *page, + unsigned int offset, size_t len); + #endif /* CONFIG_DMA_ENGINE */ #endif /* DMAENGINE_H */ diff --git a/include/net/netdma.h b/include/net/netdma.h index cbfe89d7e5d0..19760eb131aa 100644 --- a/include/net/netdma.h +++ b/include/net/netdma.h @@ -23,6 +23,7 @@ #include #ifdef CONFIG_NET_DMA #include +#include static inline struct dma_chan *get_softnet_dma(void) { @@ -34,5 +35,10 @@ static inline struct dma_chan *get_softnet_dma(void) rcu_read_unlock(); return chan; } + +int dma_skb_copy_datagram_iovec(struct dma_chan* chan, + const struct sk_buff *skb, int offset, struct iovec *to, + size_t len, struct dma_pinned_list *pinned_list); + #endif /* CONFIG_NET_DMA */ #endif /* NETDMA_H */ diff --git a/net/core/Makefile b/net/core/Makefile index 79fe12cced27..e9bd2467d5a9 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -16,3 +16,4 @@ obj-$(CONFIG_NET_DIVERT) += dv.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_WIRELESS_EXT) += wireless.o obj-$(CONFIG_NETPOLL) += netpoll.o +obj-$(CONFIG_NET_DMA) += user_dma.o diff --git a/net/core/user_dma.c b/net/core/user_dma.c new file mode 100644 index 000000000000..9eee91bcbf3f --- /dev/null +++ b/net/core/user_dma.c @@ -0,0 +1,127 @@ +/* + * Copyright(c) 2004 - 2006 Intel Corporation. All rights reserved. + * Portions based on net/core/datagram.c and copyrighted by their authors. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., 59 + * Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * The full GNU General Public License is included in this distribution in the + * file called COPYING. + */ + +/* + * This code allows the net stack to make use of a DMA engine for + * skb to iovec copies. + */ + +#include +#include +#include /* for BUG_TRAP */ +#include + +/** + * dma_skb_copy_datagram_iovec - Copy a datagram to an iovec. + * @skb - buffer to copy + * @offset - offset in the buffer to start copying from + * @iovec - io vector to copy to + * @len - amount of data to copy from buffer to iovec + * @pinned_list - locked iovec buffer data + * + * Note: the iovec is modified during the copy. + */ +int dma_skb_copy_datagram_iovec(struct dma_chan *chan, + struct sk_buff *skb, int offset, struct iovec *to, + size_t len, struct dma_pinned_list *pinned_list) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + dma_cookie_t cookie = 0; + + /* Copy header. */ + if (copy > 0) { + if (copy > len) + copy = len; + cookie = dma_memcpy_to_iovec(chan, to, pinned_list, + skb->data + offset, copy); + if (cookie < 0) + goto fault; + len -= copy; + if (len == 0) + goto end; + offset += copy; + } + + /* Copy paged appendix. Hmm... why does this look so complicated? */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + + BUG_TRAP(start <= offset + len); + + end = start + skb_shinfo(skb)->frags[i].size; + copy = end - offset; + if ((copy = end - offset) > 0) { + skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + struct page *page = frag->page; + + if (copy > len) + copy = len; + + cookie = dma_memcpy_pg_to_iovec(chan, to, pinned_list, page, + frag->page_offset + offset - start, copy); + if (cookie < 0) + goto fault; + len -= copy; + if (len == 0) + goto end; + offset += copy; + } + start = end; + } + + if (skb_shinfo(skb)->frag_list) { + struct sk_buff *list = skb_shinfo(skb)->frag_list; + + for (; list; list = list->next) { + int end; + + BUG_TRAP(start <= offset + len); + + end = start + list->len; + copy = end - offset; + if (copy > 0) { + if (copy > len) + copy = len; + cookie = dma_skb_copy_datagram_iovec(chan, list, + offset - start, to, copy, + pinned_list); + if (cookie < 0) + goto fault; + len -= copy; + if (len == 0) + goto end; + offset += copy; + } + start = end; + } + } + +end: + if (!len) { + skb->dma_cookie = cookie; + return cookie; + } + +fault: + return -EFAULT; +} -- cgit v1.2.3-59-g8ed1b From 97fc2f0848c928c63c2ae619deee61a0b1107b69 Mon Sep 17 00:00:00 2001 From: Chris Leech Date: Tue, 23 May 2006 17:55:33 -0700 Subject: [I/OAT]: Structure changes for TCP recv offload to I/OAT Adds an async_wait_queue and some additional fields to tcp_sock, and a dma_cookie_t to sk_buff. Signed-off-by: Chris Leech Signed-off-by: David S. Miller --- include/linux/skbuff.h | 4 ++++ include/linux/tcp.h | 8 ++++++++ include/net/sock.h | 2 ++ include/net/tcp.h | 7 +++++++ net/core/sock.c | 6 ++++++ 5 files changed, 27 insertions(+) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index f8f234708b98..23bad3bf3c9d 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -29,6 +29,7 @@ #include #include #include +#include #define HAVE_ALLOC_SKB /* For the drivers to know */ #define HAVE_ALIGNABLE_SKB /* Ditto 8) */ @@ -285,6 +286,9 @@ struct sk_buff { __u16 tc_verd; /* traffic control verdict */ #endif #endif +#ifdef CONFIG_NET_DMA + dma_cookie_t dma_cookie; +#endif /* These elements must be at the end, see alloc_skb() for details. */ diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 542d39596bd8..c90daa5da6c3 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -18,6 +18,7 @@ #define _LINUX_TCP_H #include +#include #include struct tcphdr { @@ -233,6 +234,13 @@ struct tcp_sock { struct iovec *iov; int memory; int len; +#ifdef CONFIG_NET_DMA + /* members for async copy */ + struct dma_chan *dma_chan; + int wakeup; + struct dma_pinned_list *pinned_list; + dma_cookie_t dma_cookie; +#endif } ucopy; __u32 snd_wl1; /* Sequence for window update */ diff --git a/include/net/sock.h b/include/net/sock.h index c9fad6fb629b..90c65cb091a8 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -132,6 +132,7 @@ struct sock_common { * @sk_receive_queue: incoming packets * @sk_wmem_alloc: transmit queue bytes committed * @sk_write_queue: Packet sending queue + * @sk_async_wait_queue: DMA copied packets * @sk_omem_alloc: "o" is "option" or "other" * @sk_wmem_queued: persistent queue size * @sk_forward_alloc: space allocated forward @@ -205,6 +206,7 @@ struct sock { atomic_t sk_omem_alloc; struct sk_buff_head sk_receive_queue; struct sk_buff_head sk_write_queue; + struct sk_buff_head sk_async_wait_queue; int sk_wmem_queued; int sk_forward_alloc; gfp_t sk_allocation; diff --git a/include/net/tcp.h b/include/net/tcp.h index 3c989db8a7aa..d0c2c2fa7587 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -817,6 +818,12 @@ static inline void tcp_prequeue_init(struct tcp_sock *tp) tp->ucopy.len = 0; tp->ucopy.memory = 0; skb_queue_head_init(&tp->ucopy.prequeue); +#ifdef CONFIG_NET_DMA + tp->ucopy.dma_chan = NULL; + tp->ucopy.wakeup = 0; + tp->ucopy.pinned_list = NULL; + tp->ucopy.dma_cookie = 0; +#endif } /* Packet is added to VJ-style prequeue for processing in process diff --git a/net/core/sock.c b/net/core/sock.c index ed2afdb9ea2d..5d820c376653 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -832,6 +832,9 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority) atomic_set(&newsk->sk_omem_alloc, 0); skb_queue_head_init(&newsk->sk_receive_queue); skb_queue_head_init(&newsk->sk_write_queue); +#ifdef CONFIG_NET_DMA + skb_queue_head_init(&newsk->sk_async_wait_queue); +#endif rwlock_init(&newsk->sk_dst_lock); rwlock_init(&newsk->sk_callback_lock); @@ -1383,6 +1386,9 @@ void sock_init_data(struct socket *sock, struct sock *sk) skb_queue_head_init(&sk->sk_receive_queue); skb_queue_head_init(&sk->sk_write_queue); skb_queue_head_init(&sk->sk_error_queue); +#ifdef CONFIG_NET_DMA + skb_queue_head_init(&sk->sk_async_wait_queue); +#endif sk->sk_send_head = NULL; -- cgit v1.2.3-59-g8ed1b From 0e4b4992b8007c6b62ec143cbbb292f98813ca11 Mon Sep 17 00:00:00 2001 From: Chris Leech Date: Tue, 23 May 2006 18:00:16 -0700 Subject: [I/OAT]: Rename cleanup_rbuf to tcp_cleanup_rbuf and make non-static Needed to be able to call tcp_cleanup_rbuf in tcp_input.c for I/OAT Signed-off-by: Chris Leech Signed-off-by: David S. Miller --- include/net/tcp.h | 2 ++ net/ipv4/tcp.c | 10 +++++----- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index d0c2c2fa7587..578cccf275d3 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -294,6 +294,8 @@ extern int tcp_rcv_established(struct sock *sk, extern void tcp_rcv_space_adjust(struct sock *sk); +extern void tcp_cleanup_rbuf(struct sock *sk, int copied); + extern int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e2b7b8055037..1c0cfd7a8bbb 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -937,7 +937,7 @@ static int tcp_recv_urg(struct sock *sk, long timeo, * calculation of whether or not we must ACK for the sake of * a window update. */ -static void cleanup_rbuf(struct sock *sk, int copied) +void tcp_cleanup_rbuf(struct sock *sk, int copied) { struct tcp_sock *tp = tcp_sk(sk); int time_to_ack = 0; @@ -1086,7 +1086,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, /* Clean up data we have read: This will do ACK frames. */ if (copied) - cleanup_rbuf(sk, copied); + tcp_cleanup_rbuf(sk, copied); return copied; } @@ -1220,7 +1220,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } } - cleanup_rbuf(sk, copied); + tcp_cleanup_rbuf(sk, copied); if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) { /* Install new reader */ @@ -1391,7 +1391,7 @@ skip_copy: */ /* Clean up data we have read: This will do ACK frames. */ - cleanup_rbuf(sk, copied); + tcp_cleanup_rbuf(sk, copied); TCP_CHECK_TIMER(sk); release_sock(sk); @@ -1858,7 +1858,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT) && inet_csk_ack_scheduled(sk)) { icsk->icsk_ack.pending |= ICSK_ACK_PUSHED; - cleanup_rbuf(sk, 1); + tcp_cleanup_rbuf(sk, 1); if (!(val & 1)) icsk->icsk_ack.pingpong = 1; } -- cgit v1.2.3-59-g8ed1b From 624d1164730d58a494cc5aa4afa37d02c41e83a7 Mon Sep 17 00:00:00 2001 From: Chris Leech Date: Tue, 23 May 2006 18:01:28 -0700 Subject: [I/OAT]: Make sk_eat_skb I/OAT aware. Add an extra argument to sk_eat_skb, and make it move early copied packets to the async_wait_queue instead of freeing them. Signed-off-by: Chris Leech Signed-off-by: David S. Miller --- include/net/sock.h | 13 ++++++++++++- net/dccp/proto.c | 4 ++-- net/ipv4/tcp.c | 8 ++++---- net/llc/af_llc.c | 2 +- 4 files changed, 19 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index 90c65cb091a8..75b0e97ed93d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1273,11 +1273,22 @@ sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) * This routine must be called with interrupts disabled or with the socket * locked so that the sk_buff queue operation is ok. */ -static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb) +#ifdef CONFIG_NET_DMA +static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int copied_early) +{ + __skb_unlink(skb, &sk->sk_receive_queue); + if (!copied_early) + __kfree_skb(skb); + else + __skb_queue_tail(&sk->sk_async_wait_queue, skb); +} +#else +static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int copied_early) { __skb_unlink(skb, &sk->sk_receive_queue); __kfree_skb(skb); } +#endif extern void sock_enable_timestamp(struct sock *sk); extern int sock_get_timestamp(struct sock *, struct timeval __user *); diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 2e0ee8355c41..5317fd3e6691 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -719,7 +719,7 @@ int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } dccp_pr_debug("packet_type=%s\n", dccp_packet_name(dh->dccph_type)); - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); verify_sock_status: if (sock_flag(sk, SOCK_DONE)) { len = 0; @@ -773,7 +773,7 @@ verify_sock_status: } found_fin_ok: if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); break; } while (1); out: diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 1c0cfd7a8bbb..4e067d25a63c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1072,11 +1072,11 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, break; } if (skb->h.th->fin) { - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); ++seq; break; } - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); if (!desc->count) break; } @@ -1356,14 +1356,14 @@ skip_copy: if (skb->h.th->fin) goto found_fin_ok; if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); continue; found_fin_ok: /* Process the FIN. */ ++*seq; if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); break; } while (len > 0); diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 5a04db745c8d..7465170a36ca 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -789,7 +789,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock, continue; if (!(flags & MSG_PEEK)) { - sk_eat_skb(sk, skb); + sk_eat_skb(sk, skb, 0); *seq = 0; } } while (len > 0); -- cgit v1.2.3-59-g8ed1b From 9593782585e0cf70babe787a8463d492a68b1744 Mon Sep 17 00:00:00 2001 From: Chris Leech Date: Tue, 23 May 2006 18:02:55 -0700 Subject: [I/OAT]: Add a sysctl for tuning the I/OAT offloaded I/O threshold Any socket recv of less than this ammount will not be offloaded Signed-off-by: Chris Leech Signed-off-by: David S. Miller --- include/linux/sysctl.h | 1 + include/net/tcp.h | 1 + net/core/user_dma.c | 4 ++++ net/ipv4/sysctl_net_ipv4.c | 10 ++++++++++ 4 files changed, 16 insertions(+) (limited to 'net') diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 76eaeff76f82..cd9e7c0825ad 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -403,6 +403,7 @@ enum NET_TCP_MTU_PROBING=113, NET_TCP_BASE_MSS=114, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS=115, + NET_TCP_DMA_COPYBREAK=116, }; enum { diff --git a/include/net/tcp.h b/include/net/tcp.h index 578cccf275d3..f1f472746e6c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -219,6 +219,7 @@ extern int sysctl_tcp_adv_win_scale; extern int sysctl_tcp_tw_reuse; extern int sysctl_tcp_frto; extern int sysctl_tcp_low_latency; +extern int sysctl_tcp_dma_copybreak; extern int sysctl_tcp_nometrics_save; extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; diff --git a/net/core/user_dma.c b/net/core/user_dma.c index 9eee91bcbf3f..b7c98dbcdb81 100644 --- a/net/core/user_dma.c +++ b/net/core/user_dma.c @@ -30,6 +30,10 @@ #include /* for BUG_TRAP */ #include +#define NET_DMA_DEFAULT_COPYBREAK 4096 + +int sysctl_tcp_dma_copybreak = NET_DMA_DEFAULT_COPYBREAK; + /** * dma_skb_copy_datagram_iovec - Copy a datagram to an iovec. * @skb - buffer to copy diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 6b6c3adfcf00..6a6aa537b7aa 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -688,6 +688,16 @@ ctl_table ipv4_table[] = { .mode = 0644, .proc_handler = &proc_dointvec }, +#ifdef CONFIG_NET_DMA + { + .ctl_name = NET_TCP_DMA_COPYBREAK, + .procname = "tcp_dma_copybreak", + .data = &sysctl_tcp_dma_copybreak, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, +#endif { .ctl_name = 0 } }; -- cgit v1.2.3-59-g8ed1b From 1a2449a87bb7606113b1aa1a9d3c3e78ef189a1c Mon Sep 17 00:00:00 2001 From: Chris Leech Date: Tue, 23 May 2006 18:05:53 -0700 Subject: [I/OAT]: TCP recv offload to I/OAT Locks down user pages and sets up for DMA in tcp_recvmsg, then calls dma_async_try_early_copy in tcp_v4_do_rcv Signed-off-by: Chris Leech Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 103 +++++++++++++++++++++++++++++++++++++++++++++------ net/ipv4/tcp_input.c | 74 ++++++++++++++++++++++++++++++++---- net/ipv4/tcp_ipv4.c | 18 ++++++++- net/ipv6/tcp_ipv6.c | 12 +++++- 4 files changed, 185 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 4e067d25a63c..ff6ccda9ff46 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -263,7 +263,7 @@ #include #include #include - +#include #include #include @@ -1110,6 +1110,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, int target; /* Read at least this many bytes */ long timeo; struct task_struct *user_recv = NULL; + int copied_early = 0; lock_sock(sk); @@ -1133,6 +1134,17 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, target = sock_rcvlowat(sk, flags & MSG_WAITALL, len); +#ifdef CONFIG_NET_DMA + tp->ucopy.dma_chan = NULL; + preempt_disable(); + if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && + !sysctl_tcp_low_latency && __get_cpu_var(softnet_data.net_dma)) { + preempt_enable_no_resched(); + tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len); + } else + preempt_enable_no_resched(); +#endif + do { struct sk_buff *skb; u32 offset; @@ -1274,6 +1286,10 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } else sk_wait_data(sk, &timeo); +#ifdef CONFIG_NET_DMA + tp->ucopy.wakeup = 0; +#endif + if (user_recv) { int chunk; @@ -1329,13 +1345,39 @@ do_prequeue: } if (!(flags & MSG_TRUNC)) { - err = skb_copy_datagram_iovec(skb, offset, - msg->msg_iov, used); - if (err) { - /* Exception. Bailout! */ - if (!copied) - copied = -EFAULT; - break; +#ifdef CONFIG_NET_DMA + if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) + tp->ucopy.dma_chan = get_softnet_dma(); + + if (tp->ucopy.dma_chan) { + tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec( + tp->ucopy.dma_chan, skb, offset, + msg->msg_iov, used, + tp->ucopy.pinned_list); + + if (tp->ucopy.dma_cookie < 0) { + + printk(KERN_ALERT "dma_cookie < 0\n"); + + /* Exception. Bailout! */ + if (!copied) + copied = -EFAULT; + break; + } + if ((offset + used) == skb->len) + copied_early = 1; + + } else +#endif + { + err = skb_copy_datagram_iovec(skb, offset, + msg->msg_iov, used); + if (err) { + /* Exception. Bailout! */ + if (!copied) + copied = -EFAULT; + break; + } } } @@ -1355,15 +1397,19 @@ skip_copy: if (skb->h.th->fin) goto found_fin_ok; - if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb, 0); + if (!(flags & MSG_PEEK)) { + sk_eat_skb(sk, skb, copied_early); + copied_early = 0; + } continue; found_fin_ok: /* Process the FIN. */ ++*seq; - if (!(flags & MSG_PEEK)) - sk_eat_skb(sk, skb, 0); + if (!(flags & MSG_PEEK)) { + sk_eat_skb(sk, skb, copied_early); + copied_early = 0; + } break; } while (len > 0); @@ -1386,6 +1432,36 @@ skip_copy: tp->ucopy.len = 0; } +#ifdef CONFIG_NET_DMA + if (tp->ucopy.dma_chan) { + struct sk_buff *skb; + dma_cookie_t done, used; + + dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); + + while (dma_async_memcpy_complete(tp->ucopy.dma_chan, + tp->ucopy.dma_cookie, &done, + &used) == DMA_IN_PROGRESS) { + /* do partial cleanup of sk_async_wait_queue */ + while ((skb = skb_peek(&sk->sk_async_wait_queue)) && + (dma_async_is_complete(skb->dma_cookie, done, + used) == DMA_SUCCESS)) { + __skb_dequeue(&sk->sk_async_wait_queue); + kfree_skb(skb); + } + } + + /* Safe to free early-copied skbs now */ + __skb_queue_purge(&sk->sk_async_wait_queue); + dma_chan_put(tp->ucopy.dma_chan); + tp->ucopy.dma_chan = NULL; + } + if (tp->ucopy.pinned_list) { + dma_unpin_iovec_pages(tp->ucopy.pinned_list); + tp->ucopy.pinned_list = NULL; + } +#endif + /* According to UNIX98, msg_name/msg_namelen are ignored * on connected socket. I was just happy when found this 8) --ANK */ @@ -1658,6 +1734,9 @@ int tcp_disconnect(struct sock *sk, int flags) __skb_queue_purge(&sk->sk_receive_queue); sk_stream_writequeue_purge(sk); __skb_queue_purge(&tp->out_of_order_queue); +#ifdef CONFIG_NET_DMA + __skb_queue_purge(&sk->sk_async_wait_queue); +#endif inet->dport = 0; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b5521a9d3dc1..c6d62f0a9966 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -71,6 +71,7 @@ #include #include #include +#include int sysctl_tcp_timestamps = 1; int sysctl_tcp_window_scaling = 1; @@ -3785,6 +3786,50 @@ static inline int tcp_checksum_complete_user(struct sock *sk, struct sk_buff *sk __tcp_checksum_complete_user(sk, skb); } +#ifdef CONFIG_NET_DMA +static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen) +{ + struct tcp_sock *tp = tcp_sk(sk); + int chunk = skb->len - hlen; + int dma_cookie; + int copied_early = 0; + + if (tp->ucopy.wakeup) + return 0; + + if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) + tp->ucopy.dma_chan = get_softnet_dma(); + + if (tp->ucopy.dma_chan && skb->ip_summed == CHECKSUM_UNNECESSARY) { + + dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan, + skb, hlen, tp->ucopy.iov, chunk, tp->ucopy.pinned_list); + + if (dma_cookie < 0) + goto out; + + tp->ucopy.dma_cookie = dma_cookie; + copied_early = 1; + + tp->ucopy.len -= chunk; + tp->copied_seq += chunk; + tcp_rcv_space_adjust(sk); + + if ((tp->ucopy.len == 0) || + (tcp_flag_word(skb->h.th) & TCP_FLAG_PSH) || + (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) { + tp->ucopy.wakeup = 1; + sk->sk_data_ready(sk, 0); + } + } else if (chunk > 0) { + tp->ucopy.wakeup = 1; + sk->sk_data_ready(sk, 0); + } +out: + return copied_early; +} +#endif /* CONFIG_NET_DMA */ + /* * TCP receive function for the ESTABLISHED state. * @@ -3901,14 +3946,23 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, } } else { int eaten = 0; + int copied_early = 0; - if (tp->ucopy.task == current && - tp->copied_seq == tp->rcv_nxt && - len - tcp_header_len <= tp->ucopy.len && - sock_owned_by_user(sk)) { - __set_current_state(TASK_RUNNING); + if (tp->copied_seq == tp->rcv_nxt && + len - tcp_header_len <= tp->ucopy.len) { +#ifdef CONFIG_NET_DMA + if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) { + copied_early = 1; + eaten = 1; + } +#endif + if (tp->ucopy.task == current && sock_owned_by_user(sk) && !copied_early) { + __set_current_state(TASK_RUNNING); - if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) { + if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) + eaten = 1; + } + if (eaten) { /* Predicted packet is in window by definition. * seq == rcv_nxt and rcv_wup <= rcv_nxt. * Hence, check seq<=rcv_wup reduces to: @@ -3924,8 +3978,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, __skb_pull(skb, tcp_header_len); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; NET_INC_STATS_BH(LINUX_MIB_TCPHPHITSTOUSER); - eaten = 1; } + if (copied_early) + tcp_cleanup_rbuf(sk, skb->len); } if (!eaten) { if (tcp_checksum_complete_user(sk, skb)) @@ -3966,6 +4021,11 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, __tcp_ack_snd_check(sk, 0); no_ack: +#ifdef CONFIG_NET_DMA + if (copied_early) + __skb_queue_tail(&sk->sk_async_wait_queue, skb); + else +#endif if (eaten) __kfree_skb(skb); else diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 672950e54c49..25ecc6e2478b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -71,6 +71,7 @@ #include #include #include +#include #include #include @@ -1091,8 +1092,18 @@ process: bh_lock_sock(sk); ret = 0; if (!sock_owned_by_user(sk)) { - if (!tcp_prequeue(sk, skb)) +#ifdef CONFIG_NET_DMA + struct tcp_sock *tp = tcp_sk(sk); + if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) + tp->ucopy.dma_chan = get_softnet_dma(); + if (tp->ucopy.dma_chan) ret = tcp_v4_do_rcv(sk, skb); + else +#endif + { + if (!tcp_prequeue(sk, skb)) + ret = tcp_v4_do_rcv(sk, skb); + } } else sk_add_backlog(sk, skb); bh_unlock_sock(sk); @@ -1296,6 +1307,11 @@ int tcp_v4_destroy_sock(struct sock *sk) /* Cleans up our, hopefully empty, out_of_order_queue. */ __skb_queue_purge(&tp->out_of_order_queue); +#ifdef CONFIG_NET_DMA + /* Cleans up our sk_async_wait_queue */ + __skb_queue_purge(&sk->sk_async_wait_queue); +#endif + /* Clean prequeue, it must be empty really */ __skb_queue_purge(&tp->ucopy.prequeue); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 301eee726b0f..a50eb306e9e2 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1218,8 +1218,16 @@ process: bh_lock_sock(sk); ret = 0; if (!sock_owned_by_user(sk)) { - if (!tcp_prequeue(sk, skb)) - ret = tcp_v6_do_rcv(sk, skb); +#ifdef CONFIG_NET_DMA + struct tcp_sock *tp = tcp_sk(sk); + if (tp->ucopy.dma_chan) + ret = tcp_v6_do_rcv(sk, skb); + else +#endif + { + if (!tcp_prequeue(sk, skb)) + ret = tcp_v6_do_rcv(sk, skb); + } } else sk_add_backlog(sk, skb); bh_unlock_sock(sk); -- cgit v1.2.3-59-g8ed1b From aecbd4e45c2e469e0452ffb2c0b0d881e2815bb8 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 25 May 2006 15:08:30 -0700 Subject: [LLC]: use more efficient ether address routines Use more cache efficient Ethernet address manipulation functions in etherdevice.h. Signed-off-by: Stephen Hemminger --- include/net/llc_if.h | 13 ++++++------- net/llc/llc_if.c | 2 -- 2 files changed, 6 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/include/net/llc_if.h b/include/net/llc_if.h index 090eaa0d71f9..a05d04ac4513 100644 --- a/include/net/llc_if.h +++ b/include/net/llc_if.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #define LLC_DATAUNIT_PRIM 1 @@ -61,8 +62,6 @@ #define LLC_STATUS_CONFLICT 7 /* disconnect conn */ #define LLC_STATUS_RESET_DONE 8 /* */ -extern u8 llc_mac_null_var[IFHWADDRLEN]; - /** * llc_mac_null - determines if a address is a null mac address * @mac: Mac address to test if null. @@ -70,12 +69,12 @@ extern u8 llc_mac_null_var[IFHWADDRLEN]; * Determines if a given address is a null mac address. Returns 0 if the * address is not a null mac, 1 if the address is a null mac. */ -static __inline__ int llc_mac_null(u8 *mac) +static inline int llc_mac_null(const u8 *mac) { - return !memcmp(mac, llc_mac_null_var, IFHWADDRLEN); + return is_zero_ether_addr(mac); } -static __inline__ int llc_addrany(struct llc_addr *addr) +static inline int llc_addrany(const struct llc_addr *addr) { return llc_mac_null(addr->mac) && !addr->lsap; } @@ -89,9 +88,9 @@ static __inline__ int llc_addrany(struct llc_addr *addr) * is not a complete match up to len, 1 if a complete match up to len is * found. */ -static __inline__ int llc_mac_match(u8 *mac1, u8 *mac2) +static inline int llc_mac_match(const u8 *mac1, const u8 *mac2) { - return !memcmp(mac1, mac2, IFHWADDRLEN); + return !compare_ether_addr(mac1, mac2); } extern int llc_establish_connection(struct sock *sk, u8 *lmac, diff --git a/net/llc/llc_if.c b/net/llc/llc_if.c index ba90f7f0801a..5ae47be7dde0 100644 --- a/net/llc/llc_if.c +++ b/net/llc/llc_if.c @@ -26,8 +26,6 @@ #include #include -u8 llc_mac_null_var[IFHWADDRLEN]; - /** * llc_build_and_send_pkt - Connection data sending for upper layers. * @sk: connection -- cgit v1.2.3-59-g8ed1b From 29efcd2666b3a465b40aa07ef1f4d79847303e2f Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 25 May 2006 15:08:59 -0700 Subject: [LLC]: allow datagram recvmsg LLC receive is broken for SOCK_DGRAM. If an application does recv() on a datagram socket and there is no data present, don't return "not connected". Instead, just do normal datagram semantics. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/llc/af_llc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 7465170a36ca..75c9b1480801 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -674,7 +674,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock, lock_sock(sk); copied = -ENOTCONN; - if (sk->sk_state == TCP_LISTEN) + if (unlikely(sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_LISTEN)) goto out; timeo = sock_rcvtimeo(sk, nonblock); @@ -733,7 +733,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock, if (sk->sk_shutdown & RCV_SHUTDOWN) break; - if (sk->sk_state == TCP_CLOSE) { + if (sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_CLOSE) { if (!sock_flag(sk, SOCK_DONE)) { /* * This occurs when user tries to read -- cgit v1.2.3-59-g8ed1b From 23dbe7912dad6be71bb9e69cb819d05e2442d362 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 25 May 2006 15:09:37 -0700 Subject: [LLC]: use rcu_dereference on receive handler The receive hander pointer might be modified during network changes of protocol. So use rcu_dereference (only matters on alpha). Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/llc/llc_input.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c index d62e0f9b9da3..cb9f7f058f75 100644 --- a/net/llc/llc_input.c +++ b/net/llc/llc_input.c @@ -142,6 +142,8 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev, struct llc_sap *sap; struct llc_pdu_sn *pdu; int dest; + int (*rcv)(struct sk_buff *, struct net_device *, + struct packet_type *, struct net_device *); /* * When the interface is in promisc. mode, drop all the crap that it @@ -169,8 +171,9 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev, * First the upper layer protocols that don't need the full * LLC functionality */ - if (sap->rcv_func) { - sap->rcv_func(skb, dev, pt, orig_dev); + rcv = rcu_dereference(sap->rcv_func); + if (rcv) { + rcv(skb, dev, pt, orig_dev); goto out_put; } dest = llc_pdu_type(skb); -- cgit v1.2.3-59-g8ed1b From 8f182b494f87799d6ae20a1401825c516da46081 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 25 May 2006 15:10:02 -0700 Subject: [LLC]: allow applications to get copy of kernel datagrams It is legal for an application to bind to a SAP that is also being used by the kernel. This happens if the bridge module binds to the STP SAP, and the user wants to have a daemon for STP as well. It is possible to have kernel doing STP on one bridge, but let application do RSTP on another bridge. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/llc/llc_input.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c index cb9f7f058f75..32cf8ac8ea80 100644 --- a/net/llc/llc_input.c +++ b/net/llc/llc_input.c @@ -173,8 +173,10 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev, */ rcv = rcu_dereference(sap->rcv_func); if (rcv) { + struct sk_buff *cskb = skb_clone(skb, GFP_ATOMIC); + if (cskb) + rcv(cskb, dev, pt, orig_dev); rcv(skb, dev, pt, orig_dev); - goto out_put; } dest = llc_pdu_type(skb); if (unlikely(!dest || !llc_type_handlers[dest - 1])) -- cgit v1.2.3-59-g8ed1b From bc0e646796928918e45b6465e02616f2fe65c3c1 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 25 May 2006 15:10:37 -0700 Subject: [LLC]: add multicast support for datagrams Allow mulitcast reception of datagrams (similar to UDP). All sockets bound to the same SAP receive a clone. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/net/llc_if.h | 4 ++++ net/llc/llc_sap.c | 59 +++++++++++++++++++++++++++++++++++++++++++++------- 2 files changed, 55 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/net/llc_if.h b/include/net/llc_if.h index a05d04ac4513..c608812a8e89 100644 --- a/include/net/llc_if.h +++ b/include/net/llc_if.h @@ -79,6 +79,10 @@ static inline int llc_addrany(const struct llc_addr *addr) return llc_mac_null(addr->mac) && !addr->lsap; } +static inline int llc_mac_multicast(const u8 *mac) +{ + return is_multicast_ether_addr(mac); +} /** * llc_mac_match - determines if two mac addresses are the same * @mac1: First mac address to compare. diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c index 4029ceee9b91..20c4eb5c1ac6 100644 --- a/net/llc/llc_sap.c +++ b/net/llc/llc_sap.c @@ -282,7 +282,7 @@ static void llc_sap_rcv(struct llc_sap *sap, struct sk_buff *skb) * mac, and local sap. Returns pointer for socket found, %NULL otherwise. */ static struct sock *llc_lookup_dgram(struct llc_sap *sap, - struct llc_addr *laddr) + const struct llc_addr *laddr) { struct sock *rc; struct hlist_node *node; @@ -304,19 +304,62 @@ found: return rc; } +/** + * llc_sap_mcast - Deliver multicast PDU's to all matching datagram sockets. + * @sap: SAP + * @laddr: address of local LLC (MAC + SAP) + * + * Search socket list of the SAP and finds connections with same sap. + * Deliver clone to each. + */ +static void llc_sap_mcast(struct llc_sap *sap, + const struct llc_addr *laddr, + struct sk_buff *skb) +{ + struct sock *sk; + struct hlist_node *node; + + read_lock_bh(&sap->sk_list.lock); + sk_for_each(sk, node, &sap->sk_list.list) { + struct llc_sock *llc = llc_sk(sk); + struct sk_buff *skb1; + + if (sk->sk_type != SOCK_DGRAM) + continue; + + if (llc->laddr.lsap != laddr->lsap) + continue; + + skb1 = skb_clone(skb, GFP_ATOMIC); + if (!skb1) + break; + + sock_hold(sk); + skb_set_owner_r(skb1, sk); + llc_sap_rcv(sap, skb1); + sock_put(sk); + } + read_unlock_bh(&sap->sk_list.lock); +} + + void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb) { struct llc_addr laddr; - struct sock *sk; llc_pdu_decode_da(skb, laddr.mac); llc_pdu_decode_dsap(skb, &laddr.lsap); - sk = llc_lookup_dgram(sap, &laddr); - if (sk) { - skb_set_owner_r(skb, sk); - llc_sap_rcv(sap, skb); - sock_put(sk); - } else + if (llc_mac_multicast(laddr.mac)) { + llc_sap_mcast(sap, &laddr, skb); kfree_skb(skb); + } else { + struct sock *sk = llc_lookup_dgram(sap, &laddr); + if (sk) { + skb_set_owner_r(skb, sk); + llc_sap_rcv(sap, skb); + sock_put(sk); + } else + kfree_skb(skb); + } } -- cgit v1.2.3-59-g8ed1b From 9ef513bed68534110727381ab652f06756803f5a Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 25 May 2006 15:58:54 -0700 Subject: [BRIDGE]: optimize conditional in forward path Small optimizations of bridge forwarding path. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/bridge/br_forward.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c index 56f3aa47e758..0dca027ceb80 100644 --- a/net/bridge/br_forward.c +++ b/net/bridge/br_forward.c @@ -20,14 +20,11 @@ #include #include "br_private.h" +/* Don't forward packets to originating port or forwarding diasabled */ static inline int should_deliver(const struct net_bridge_port *p, const struct sk_buff *skb) { - if (skb->dev == p->dev || - p->state != BR_STATE_FORWARDING) - return 0; - - return 1; + return (skb->dev != p->dev && p->state == BR_STATE_FORWARDING); } static inline unsigned packet_length(const struct sk_buff *skb) @@ -55,10 +52,9 @@ int br_dev_queue_push_xmit(struct sk_buff *skb) int br_forward_finish(struct sk_buff *skb) { - NF_HOOK(PF_BRIDGE, NF_BR_POST_ROUTING, skb, NULL, skb->dev, - br_dev_queue_push_xmit); + return NF_HOOK(PF_BRIDGE, NF_BR_POST_ROUTING, skb, NULL, skb->dev, + br_dev_queue_push_xmit); - return 0; } static void __br_deliver(const struct net_bridge_port *to, struct sk_buff *skb) -- cgit v1.2.3-59-g8ed1b From c090971326db094ed702c1f8f2dbe04b7e3b8f27 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 25 May 2006 15:59:33 -0700 Subject: [BRIDGE]: fix module startup error handling Return address in use, if some other kernel code has the SAP. Propogate out error codes from netfilter registration and unwind. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/bridge/br.c | 26 +++++++++++++++++--------- net/bridge/br_private.h | 5 +++++ 2 files changed, 22 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/bridge/br.c b/net/bridge/br.c index 12da21afb9ca..558d27204f60 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -30,36 +30,44 @@ static struct llc_sap *br_stp_sap; static int __init br_init(void) { + int err; + br_stp_sap = llc_sap_open(LLC_SAP_BSPAN, br_stp_rcv); if (!br_stp_sap) { printk(KERN_ERR "bridge: can't register sap for STP\n"); - return -EBUSY; + return -EADDRINUSE; } br_fdb_init(); -#ifdef CONFIG_BRIDGE_NETFILTER - if (br_netfilter_init()) - return 1; -#endif + err = br_netfilter_init(); + if (err) + goto err_out1; + + err = register_netdevice_notifier(&br_device_notifier); + if (err) + goto err_out2; + brioctl_set(br_ioctl_deviceless_stub); br_handle_frame_hook = br_handle_frame; br_fdb_get_hook = br_fdb_get; br_fdb_put_hook = br_fdb_put; - register_netdevice_notifier(&br_device_notifier); - return 0; + +err_out2: + br_netfilter_fini(); +err_out1: + llc_sap_put(br_stp_sap); + return err; } static void __exit br_deinit(void) { rcu_assign_pointer(br_stp_sap->rcv_func, NULL); -#ifdef CONFIG_BRIDGE_NETFILTER br_netfilter_fini(); -#endif unregister_netdevice_notifier(&br_device_notifier); brioctl_set(NULL); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 86ecea7ed372..22071d156828 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -192,8 +192,13 @@ extern int br_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd); extern int br_ioctl_deviceless_stub(unsigned int cmd, void __user *arg); /* br_netfilter.c */ +#ifdef CONFIG_BRIDGE_NETFILTER extern int br_netfilter_init(void); extern void br_netfilter_fini(void); +#else +#define br_netfilter_init() (0) +#define br_netfilter_fini() do { } while(0) +#endif /* br_stp.c */ extern void br_log_state(const struct net_bridge_port *p); -- cgit v1.2.3-59-g8ed1b From 11dc1f36a6701b502ecb695f308aae46ede8bac6 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 25 May 2006 16:00:12 -0700 Subject: [BRIDGE]: netlink interface for link management Add basic netlink support to the Ethernet bridge. Including: * dump interfaces in bridges * monitor link status changes * change state of bridge port For some demo programs see: http://developer.osdl.org/shemminger/prototypes/brnl.tar.gz These are to allow building a daemon that does alternative implementations of Spanning Tree Protocol. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/bridge/Makefile | 2 +- net/bridge/br.c | 2 + net/bridge/br_netlink.c | 199 ++++++++++++++++++++++++++++++++++++++++++++++++ net/bridge/br_notify.c | 2 + net/bridge/br_private.h | 7 +- net/bridge/br_stp_if.c | 4 + 6 files changed, 214 insertions(+), 2 deletions(-) create mode 100644 net/bridge/br_netlink.c (limited to 'net') diff --git a/net/bridge/Makefile b/net/bridge/Makefile index 59556e40e143..f444c12cde5a 100644 --- a/net/bridge/Makefile +++ b/net/bridge/Makefile @@ -6,7 +6,7 @@ obj-$(CONFIG_BRIDGE) += bridge.o bridge-y := br.o br_device.o br_fdb.o br_forward.o br_if.o br_input.o \ br_ioctl.o br_notify.o br_stp.o br_stp_bpdu.o \ - br_stp_if.o br_stp_timer.o + br_stp_if.o br_stp_timer.o br_netlink.o bridge-$(CONFIG_SYSFS) += br_sysfs_if.o br_sysfs_br.o diff --git a/net/bridge/br.c b/net/bridge/br.c index 558d27204f60..654401ceb2db 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -48,6 +48,7 @@ static int __init br_init(void) if (err) goto err_out2; + br_netlink_init(); brioctl_set(br_ioctl_deviceless_stub); br_handle_frame_hook = br_handle_frame; @@ -67,6 +68,7 @@ static void __exit br_deinit(void) { rcu_assign_pointer(br_stp_sap->rcv_func, NULL); + br_netlink_fini(); br_netfilter_fini(); unregister_netdevice_notifier(&br_device_notifier); brioctl_set(NULL); diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c new file mode 100644 index 000000000000..881d7d1a732a --- /dev/null +++ b/net/bridge/br_netlink.c @@ -0,0 +1,199 @@ +/* + * Bridge netlink control interface + * + * Authors: + * Stephen Hemminger + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include "br_private.h" + +/* + * Create one netlink message for one interface + * Contains port and master info as well as carrier and bridge state. + */ +static int br_fill_ifinfo(struct sk_buff *skb, const struct net_bridge_port *port, + u32 pid, u32 seq, int event, unsigned int flags) +{ + const struct net_bridge *br = port->br; + const struct net_device *dev = port->dev; + struct ifinfomsg *r; + struct nlmsghdr *nlh; + unsigned char *b = skb->tail; + u32 mtu = dev->mtu; + u8 operstate = netif_running(dev) ? dev->operstate : IF_OPER_DOWN; + u8 portstate = port->state; + + pr_debug("br_fill_info event %d port %s master %s\n", + event, dev->name, br->dev->name); + + nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags); + r = NLMSG_DATA(nlh); + r->ifi_family = AF_BRIDGE; + r->__ifi_pad = 0; + r->ifi_type = dev->type; + r->ifi_index = dev->ifindex; + r->ifi_flags = dev_get_flags(dev); + r->ifi_change = 0; + + RTA_PUT(skb, IFLA_IFNAME, strlen(dev->name)+1, dev->name); + + RTA_PUT(skb, IFLA_MASTER, sizeof(int), &br->dev->ifindex); + + if (dev->addr_len) + RTA_PUT(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr); + + RTA_PUT(skb, IFLA_MTU, sizeof(mtu), &mtu); + if (dev->ifindex != dev->iflink) + RTA_PUT(skb, IFLA_LINK, sizeof(int), &dev->iflink); + + + RTA_PUT(skb, IFLA_OPERSTATE, sizeof(operstate), &operstate); + + if (event == RTM_NEWLINK) + RTA_PUT(skb, IFLA_PROTINFO, sizeof(portstate), &portstate); + + nlh->nlmsg_len = skb->tail - b; + + return skb->len; + +nlmsg_failure: +rtattr_failure: + + skb_trim(skb, b - skb->data); + return -EINVAL; +} + +/* + * Notify listeners of a change in port information + */ +void br_ifinfo_notify(int event, struct net_bridge_port *port) +{ + struct sk_buff *skb; + int err = -ENOMEM; + + pr_debug("bridge notify event=%d\n", event); + skb = alloc_skb(NLMSG_SPACE(sizeof(struct ifinfomsg) + 128), + GFP_ATOMIC); + if (!skb) + goto err_out; + + err = br_fill_ifinfo(skb, port, current->pid, 0, event, 0); + if (err) + goto err_kfree; + + NETLINK_CB(skb).dst_group = RTNLGRP_LINK; + netlink_broadcast(rtnl, skb, 0, RTNLGRP_LINK, GFP_ATOMIC); + return; + +err_kfree: + kfree_skb(skb); +err_out: + netlink_set_err(rtnl, 0, RTNLGRP_LINK, err); +} + +/* + * Dump information about all ports, in response to GETLINK + */ +static int br_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct net_device *dev; + int idx; + int s_idx = cb->args[0]; + int err = 0; + + read_lock(&dev_base_lock); + for (dev = dev_base, idx = 0; dev; dev = dev->next) { + struct net_bridge_port *p = dev->br_port; + + /* not a bridge port */ + if (!p) + continue; + + if (idx < s_idx) + continue; + + err = br_fill_ifinfo(skb, p, NETLINK_CB(cb->skb).pid, + cb->nlh->nlmsg_seq, RTM_NEWLINK, NLM_F_MULTI); + if (err <= 0) + break; + ++idx; + } + read_unlock(&dev_base_lock); + + cb->args[0] = idx; + + return skb->len; +} + +/* + * Change state of port (ie from forwarding to blocking etc) + * Used by spanning tree in user space. + */ +static int br_rtm_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) +{ + struct rtattr **rta = arg; + struct ifinfomsg *ifm = NLMSG_DATA(nlh); + struct net_device *dev; + struct net_bridge_port *p; + u8 new_state; + + if (ifm->ifi_family != AF_BRIDGE) + return -EPFNOSUPPORT; + + /* Must pass valid state as PROTINFO */ + if (rta[IFLA_PROTINFO-1]) { + u8 *pstate = RTA_DATA(rta[IFLA_PROTINFO-1]); + new_state = *pstate; + } else + return -EINVAL; + + if (new_state > BR_STATE_BLOCKING) + return -EINVAL; + + /* Find bridge port */ + dev = __dev_get_by_index(ifm->ifi_index); + if (!dev) + return -ENODEV; + + p = dev->br_port; + if (!p) + return -EINVAL; + + /* if kernel STP is running, don't allow changes */ + if (p->br->stp_enabled) + return -EBUSY; + + if (!netif_running(dev)) + return -ENETDOWN; + + if (!netif_carrier_ok(dev) && new_state != BR_STATE_DISABLED) + return -ENETDOWN; + + p->state = new_state; + br_log_state(p); + return 0; +} + + +static struct rtnetlink_link bridge_rtnetlink_table[RTM_NR_MSGTYPES] = { + [RTM_GETLINK - RTM_BASE] = { .dumpit = br_dump_ifinfo, }, + [RTM_SETLINK - RTM_BASE] = { .doit = br_rtm_setlink, }, +}; + +void __init br_netlink_init(void) +{ + rtnetlink_links[PF_BRIDGE] = bridge_rtnetlink_table; +} + +void __exit br_netlink_fini(void) +{ + rtnetlink_links[PF_BRIDGE] = NULL; +} + diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c index a43a9c1d50d7..20278494e4da 100644 --- a/net/bridge/br_notify.c +++ b/net/bridge/br_notify.c @@ -14,6 +14,7 @@ */ #include +#include #include "br_private.h" @@ -49,6 +50,7 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v case NETDEV_CHANGEADDR: br_fdb_changeaddr(p, dev->dev_addr); + br_ifinfo_notify(RTM_NEWLINK, p); br_stp_recalculate_bridge_id(br); break; diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 22071d156828..c491fb2f280e 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -29,7 +29,7 @@ #define BR_PORT_DEBOUNCE (HZ/10) -#define BR_VERSION "2.1" +#define BR_VERSION "2.2" typedef struct bridge_id bridge_id; typedef struct mac_addr mac_addr; @@ -237,6 +237,11 @@ extern struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br, extern void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent); +/* br_netlink.c */ +extern void br_netlink_init(void); +extern void br_netlink_fini(void); +extern void br_ifinfo_notify(int event, struct net_bridge_port *port); + #ifdef CONFIG_SYSFS /* br_sysfs_if.c */ extern struct sysfs_ops brport_sysfs_ops; diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index 23dea1422c9a..14cd025079af 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "br_private.h" #include "br_private_stp.h" @@ -86,6 +87,7 @@ void br_stp_disable_bridge(struct net_bridge *br) void br_stp_enable_port(struct net_bridge_port *p) { br_init_port(p); + br_ifinfo_notify(RTM_NEWLINK, p); br_port_state_selection(p->br); } @@ -99,6 +101,8 @@ void br_stp_disable_port(struct net_bridge_port *p) printk(KERN_INFO "%s: port %i(%s) entering %s state\n", br->dev->name, p->port_no, p->dev->name, "disabled"); + br_ifinfo_notify(RTM_DELLINK, p); + wasroot = br_is_root_bridge(br); br_become_designated_port(p); p->state = BR_STATE_DISABLED; -- cgit v1.2.3-59-g8ed1b From 15986e1aadbbf40a331cddd0470bb434d156431d Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 25 May 2006 16:11:14 -0700 Subject: [TCP]: tcp_rcv_rtt_measure_ts() call in pure-ACK path is superfluous We only want to take receive RTT mesaurements for data bearing frames, here in the header prediction fast path for a pure-sender, we know that we have a pure-ACK and thus the checks in tcp_rcv_rtt_mesaure_ts() will not pass. Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c6d62f0a9966..6d167889a4b0 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3931,8 +3931,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tp->rcv_nxt == tp->rcv_wup) tcp_store_ts_recent(tp); - tcp_rcv_rtt_measure_ts(sk, skb); - /* We know that such packets are checksummed * on entry. */ -- cgit v1.2.3-59-g8ed1b From 546be2405be119ef55467aace45f337a16e5d424 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 27 May 2006 23:03:58 -0700 Subject: [IPSEC] xfrm: Undo afinfo lock proliferation The number of locks used to manage afinfo structures can easily be reduced down to one each for policy and state respectively. This is based on the observation that the write locks are only held by module insertion/removal which are very rare events so there is no need to further differentiate between the insertion of modules like ipv6 versus esp6. The removal of the read locks in xfrm4_policy.c/xfrm6_policy.c might look suspicious at first. However, after you realise that nobody ever takes the corresponding write lock you'll feel better :) As far as I can gather it's an attempt to guard against the removal of the corresponding modules. Since neither module can be unloaded at all we can leave it to whoever fixes up IPv6 unloading :) Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/net/xfrm.h | 9 +------- net/ipv4/xfrm4_policy.c | 6 ----- net/ipv4/xfrm4_state.c | 1 - net/ipv6/xfrm6_policy.c | 6 ----- net/ipv6/xfrm6_state.c | 1 - net/xfrm/xfrm_policy.c | 58 +++++++++++++++++++++++++++++-------------------- net/xfrm/xfrm_state.c | 9 +++----- 7 files changed, 38 insertions(+), 52 deletions(-) (limited to 'net') diff --git a/include/net/xfrm.h b/include/net/xfrm.h index afa508d92c93..ed7c9747059d 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -204,8 +204,7 @@ struct xfrm_type; struct xfrm_dst; struct xfrm_policy_afinfo { unsigned short family; - rwlock_t lock; - struct xfrm_type_map *type_map; + struct xfrm_type *type_map[256]; struct dst_ops *dst_ops; void (*garbage_collect)(void); int (*dst_lookup)(struct xfrm_dst **dst, struct flowi *fl); @@ -232,7 +231,6 @@ extern int __xfrm_state_delete(struct xfrm_state *x); struct xfrm_state_afinfo { unsigned short family; - rwlock_t lock; struct list_head *state_bydst; struct list_head *state_byspi; int (*init_flags)(struct xfrm_state *x); @@ -264,11 +262,6 @@ struct xfrm_type u32 (*get_max_size)(struct xfrm_state *, int size); }; -struct xfrm_type_map { - rwlock_t lock; - struct xfrm_type *map[256]; -}; - extern int xfrm_register_type(struct xfrm_type *type, unsigned short family); extern int xfrm_unregister_type(struct xfrm_type *type, unsigned short family); extern struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family); diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 8604c747bca5..c0465284dfac 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -17,8 +17,6 @@ static struct dst_ops xfrm4_dst_ops; static struct xfrm_policy_afinfo xfrm4_policy_afinfo; -static struct xfrm_type_map xfrm4_type_map = { .lock = RW_LOCK_UNLOCKED }; - static int xfrm4_dst_lookup(struct xfrm_dst **dst, struct flowi *fl) { return __ip_route_output_key((struct rtable**)dst, fl); @@ -237,9 +235,7 @@ _decode_session4(struct sk_buff *skb, struct flowi *fl) static inline int xfrm4_garbage_collect(void) { - read_lock(&xfrm4_policy_afinfo.lock); xfrm4_policy_afinfo.garbage_collect(); - read_unlock(&xfrm4_policy_afinfo.lock); return (atomic_read(&xfrm4_dst_ops.entries) > xfrm4_dst_ops.gc_thresh*2); } @@ -299,8 +295,6 @@ static struct dst_ops xfrm4_dst_ops = { static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { .family = AF_INET, - .lock = RW_LOCK_UNLOCKED, - .type_map = &xfrm4_type_map, .dst_ops = &xfrm4_dst_ops, .dst_lookup = xfrm4_dst_lookup, .find_bundle = __xfrm4_find_bundle, diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index dbabf81a9b7b..81e1751c966e 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -131,7 +131,6 @@ __xfrm4_find_acq(u8 mode, u32 reqid, u8 proto, static struct xfrm_state_afinfo xfrm4_state_afinfo = { .family = AF_INET, - .lock = RW_LOCK_UNLOCKED, .init_flags = xfrm4_init_flags, .init_tempsel = __xfrm4_init_tempsel, .state_lookup = __xfrm4_state_lookup, diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 88c840f1beb6..ee715f2691e9 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -23,8 +23,6 @@ static struct dst_ops xfrm6_dst_ops; static struct xfrm_policy_afinfo xfrm6_policy_afinfo; -static struct xfrm_type_map xfrm6_type_map = { .lock = RW_LOCK_UNLOCKED }; - static int xfrm6_dst_lookup(struct xfrm_dst **dst, struct flowi *fl) { int err = 0; @@ -249,9 +247,7 @@ _decode_session6(struct sk_buff *skb, struct flowi *fl) static inline int xfrm6_garbage_collect(void) { - read_lock(&xfrm6_policy_afinfo.lock); xfrm6_policy_afinfo.garbage_collect(); - read_unlock(&xfrm6_policy_afinfo.lock); return (atomic_read(&xfrm6_dst_ops.entries) > xfrm6_dst_ops.gc_thresh*2); } @@ -311,8 +307,6 @@ static struct dst_ops xfrm6_dst_ops = { static struct xfrm_policy_afinfo xfrm6_policy_afinfo = { .family = AF_INET6, - .lock = RW_LOCK_UNLOCKED, - .type_map = &xfrm6_type_map, .dst_ops = &xfrm6_dst_ops, .dst_lookup = xfrm6_dst_lookup, .find_bundle = __xfrm6_find_bundle, diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c index a5723024d3b3..b33296b3f6de 100644 --- a/net/ipv6/xfrm6_state.c +++ b/net/ipv6/xfrm6_state.c @@ -135,7 +135,6 @@ __xfrm6_find_acq(u8 mode, u32 reqid, u8 proto, static struct xfrm_state_afinfo xfrm6_state_afinfo = { .family = AF_INET6, - .lock = RW_LOCK_UNLOCKED, .init_tempsel = __xfrm6_init_tempsel, .state_lookup = __xfrm6_state_lookup, .find_acq = __xfrm6_find_acq, diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index b469c8b54613..44b64a593c01 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -46,45 +46,43 @@ static DEFINE_SPINLOCK(xfrm_policy_gc_lock); static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family); static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo); +static struct xfrm_policy_afinfo *xfrm_policy_lock_afinfo(unsigned int family); +static void xfrm_policy_unlock_afinfo(struct xfrm_policy_afinfo *afinfo); int xfrm_register_type(struct xfrm_type *type, unsigned short family) { - struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); - struct xfrm_type_map *typemap; + struct xfrm_policy_afinfo *afinfo = xfrm_policy_lock_afinfo(family); + struct xfrm_type **typemap; int err = 0; if (unlikely(afinfo == NULL)) return -EAFNOSUPPORT; typemap = afinfo->type_map; - write_lock_bh(&typemap->lock); - if (likely(typemap->map[type->proto] == NULL)) - typemap->map[type->proto] = type; + if (likely(typemap[type->proto] == NULL)) + typemap[type->proto] = type; else err = -EEXIST; - write_unlock_bh(&typemap->lock); - xfrm_policy_put_afinfo(afinfo); + xfrm_policy_unlock_afinfo(afinfo); return err; } EXPORT_SYMBOL(xfrm_register_type); int xfrm_unregister_type(struct xfrm_type *type, unsigned short family) { - struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); - struct xfrm_type_map *typemap; + struct xfrm_policy_afinfo *afinfo = xfrm_policy_lock_afinfo(family); + struct xfrm_type **typemap; int err = 0; if (unlikely(afinfo == NULL)) return -EAFNOSUPPORT; typemap = afinfo->type_map; - write_lock_bh(&typemap->lock); - if (unlikely(typemap->map[type->proto] != type)) + if (unlikely(typemap[type->proto] != type)) err = -ENOENT; else - typemap->map[type->proto] = NULL; - write_unlock_bh(&typemap->lock); - xfrm_policy_put_afinfo(afinfo); + typemap[type->proto] = NULL; + xfrm_policy_unlock_afinfo(afinfo); return err; } EXPORT_SYMBOL(xfrm_unregister_type); @@ -92,7 +90,7 @@ EXPORT_SYMBOL(xfrm_unregister_type); struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family) { struct xfrm_policy_afinfo *afinfo; - struct xfrm_type_map *typemap; + struct xfrm_type **typemap; struct xfrm_type *type; int modload_attempted = 0; @@ -102,11 +100,9 @@ retry: return NULL; typemap = afinfo->type_map; - read_lock(&typemap->lock); - type = typemap->map[proto]; + type = typemap[proto]; if (unlikely(type && !try_module_get(type->owner))) type = NULL; - read_unlock(&typemap->lock); if (!type && !modload_attempted) { xfrm_policy_put_afinfo(afinfo); request_module("xfrm-type-%d-%d", @@ -1306,17 +1302,31 @@ static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family) return NULL; read_lock(&xfrm_policy_afinfo_lock); afinfo = xfrm_policy_afinfo[family]; - if (likely(afinfo != NULL)) - read_lock(&afinfo->lock); - read_unlock(&xfrm_policy_afinfo_lock); + if (unlikely(!afinfo)) + read_unlock(&xfrm_policy_afinfo_lock); return afinfo; } static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo) { - if (unlikely(afinfo == NULL)) - return; - read_unlock(&afinfo->lock); + read_unlock(&xfrm_policy_afinfo_lock); +} + +static struct xfrm_policy_afinfo *xfrm_policy_lock_afinfo(unsigned int family) +{ + struct xfrm_policy_afinfo *afinfo; + if (unlikely(family >= NPROTO)) + return NULL; + write_lock_bh(&xfrm_policy_afinfo_lock); + afinfo = xfrm_policy_afinfo[family]; + if (unlikely(!afinfo)) + write_unlock_bh(&xfrm_policy_afinfo_lock); + return afinfo; +} + +static void xfrm_policy_unlock_afinfo(struct xfrm_policy_afinfo *afinfo) +{ + write_unlock_bh(&xfrm_policy_afinfo_lock); } static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 93a2f36ad3db..ee62c239a7e3 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1103,17 +1103,14 @@ static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned short family) return NULL; read_lock(&xfrm_state_afinfo_lock); afinfo = xfrm_state_afinfo[family]; - if (likely(afinfo != NULL)) - read_lock(&afinfo->lock); - read_unlock(&xfrm_state_afinfo_lock); + if (unlikely(!afinfo)) + read_unlock(&xfrm_state_afinfo_lock); return afinfo; } static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo) { - if (unlikely(afinfo == NULL)) - return; - read_unlock(&afinfo->lock); + read_unlock(&xfrm_state_afinfo_lock); } /* Temporarily located here until net/xfrm/xfrm_tunnel.c is created */ -- cgit v1.2.3-59-g8ed1b From b59f45d0b2878ab76f8053b0973654e6621828ee Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 27 May 2006 23:05:54 -0700 Subject: [IPSEC] xfrm: Abstract out encapsulation modes This patch adds the structure xfrm_mode. It is meant to represent the operations carried out by transport/tunnel modes. By doing this we allow additional encapsulation modes to be added without clogging up the xfrm_input/xfrm_output paths. Candidate modes include 4-to-6 tunnel mode, 6-to-4 tunnel mode, and BEET modes. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/xfrm.h | 4 ++ include/net/xfrm.h | 17 ++++++ net/ipv4/Kconfig | 18 ++++++ net/ipv4/Makefile | 2 + net/ipv4/xfrm4_input.c | 28 +-------- net/ipv4/xfrm4_mode_transport.c | 69 ++++++++++++++++++++++ net/ipv4/xfrm4_mode_tunnel.c | 125 ++++++++++++++++++++++++++++++++++++++++ net/ipv4/xfrm4_output.c | 61 +------------------- net/ipv6/Kconfig | 20 +++++++ net/ipv6/Makefile | 2 + net/ipv6/ip6_output.c | 2 + net/ipv6/xfrm6_input.c | 29 +--------- net/ipv6/xfrm6_mode_transport.c | 73 +++++++++++++++++++++++ net/ipv6/xfrm6_mode_tunnel.c | 121 ++++++++++++++++++++++++++++++++++++++ net/ipv6/xfrm6_output.c | 63 +------------------- net/xfrm/xfrm_policy.c | 83 ++++++++++++++++++++++++++ net/xfrm/xfrm_state.c | 6 ++ 17 files changed, 553 insertions(+), 170 deletions(-) create mode 100644 net/ipv4/xfrm4_mode_transport.c create mode 100644 net/ipv4/xfrm4_mode_tunnel.c create mode 100644 net/ipv6/xfrm6_mode_transport.c create mode 100644 net/ipv6/xfrm6_mode_tunnel.c (limited to 'net') diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h index 6b42cc474c01..46a15c7a1a13 100644 --- a/include/linux/xfrm.h +++ b/include/linux/xfrm.h @@ -118,6 +118,10 @@ enum XFRM_SHARE_UNIQUE /* Use once */ }; +#define XFRM_MODE_TRANSPORT 0 +#define XFRM_MODE_TUNNEL 1 +#define XFRM_MODE_MAX 2 + /* Netlink configuration messages. */ enum { XFRM_MSG_BASE = 0x10, diff --git a/include/net/xfrm.h b/include/net/xfrm.h index ed7c9747059d..ed5bb34f817f 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -20,6 +20,8 @@ #include #define XFRM_ALIGN8(len) (((len) + 7) & ~7) +#define MODULE_ALIAS_XFRM_MODE(family, encap) \ + MODULE_ALIAS("xfrm-mode-" __stringify(family) "-" __stringify(encap)) extern struct sock *xfrm_nl; extern u32 sysctl_xfrm_aevent_etime; @@ -164,6 +166,7 @@ struct xfrm_state /* Reference to data common to all the instances of this * transformer. */ struct xfrm_type *type; + struct xfrm_mode *mode; /* Security context */ struct xfrm_sec_ctx *security; @@ -205,6 +208,7 @@ struct xfrm_dst; struct xfrm_policy_afinfo { unsigned short family; struct xfrm_type *type_map[256]; + struct xfrm_mode *mode_map[XFRM_MODE_MAX]; struct dst_ops *dst_ops; void (*garbage_collect)(void); int (*dst_lookup)(struct xfrm_dst **dst, struct flowi *fl); @@ -267,6 +271,19 @@ extern int xfrm_unregister_type(struct xfrm_type *type, unsigned short family); extern struct xfrm_type *xfrm_get_type(u8 proto, unsigned short family); extern void xfrm_put_type(struct xfrm_type *type); +struct xfrm_mode { + int (*input)(struct xfrm_state *x, struct sk_buff *skb); + int (*output)(struct sk_buff *skb); + + struct module *owner; + unsigned int encap; +}; + +extern int xfrm_register_mode(struct xfrm_mode *mode, int family); +extern int xfrm_unregister_mode(struct xfrm_mode *mode, int family); +extern struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family); +extern void xfrm_put_mode(struct xfrm_mode *mode); + struct xfrm_tmpl { /* id in template is interpreted as: diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index e40f75322377..35eb70b1448d 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -414,6 +414,24 @@ config INET_TUNNEL tristate default n +config INET_XFRM_MODE_TRANSPORT + tristate "IP: IPsec transport mode" + default y + select XFRM + ---help--- + Support for IPsec transport mode. + + If unsure, say Y. + +config INET_XFRM_MODE_TUNNEL + tristate "IP: IPsec tunnel mode" + default y + select XFRM + ---help--- + Support for IPsec tunnel mode. + + If unsure, say Y. + config INET_DIAG tristate "INET: socket monitoring interface" default y diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 9ef50a0b9d2c..4cc94482eacc 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -24,6 +24,8 @@ obj-$(CONFIG_INET_ESP) += esp4.o obj-$(CONFIG_INET_IPCOMP) += ipcomp.o obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o obj-$(CONFIG_INET_TUNNEL) += tunnel4.o +obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o +obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o obj-$(CONFIG_IP_PNP) += ipconfig.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_RR) += multipath_rr.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_RANDOM) += multipath_random.o diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c index 3e174c83bfe7..817ed84511a6 100644 --- a/net/ipv4/xfrm4_input.c +++ b/net/ipv4/xfrm4_input.c @@ -13,7 +13,6 @@ #include #include #include -#include #include #include @@ -24,15 +23,6 @@ int xfrm4_rcv(struct sk_buff *skb) EXPORT_SYMBOL(xfrm4_rcv); -static inline void ipip_ecn_decapsulate(struct sk_buff *skb) -{ - struct iphdr *outer_iph = skb->nh.iph; - struct iphdr *inner_iph = skb->h.ipiph; - - if (INET_ECN_is_ce(outer_iph->tos)) - IP_ECN_set_ce(inner_iph); -} - static int xfrm4_parse_spi(struct sk_buff *skb, u8 nexthdr, u32 *spi, u32 *seq) { switch (nexthdr) { @@ -113,24 +103,10 @@ int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type) xfrm_vec[xfrm_nr++] = x; - iph = skb->nh.iph; + if (x->mode->input(x, skb)) + goto drop; if (x->props.mode) { - if (iph->protocol != IPPROTO_IPIP) - goto drop; - if (!pskb_may_pull(skb, sizeof(struct iphdr))) - goto drop; - if (skb_cloned(skb) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) - goto drop; - if (x->props.flags & XFRM_STATE_DECAP_DSCP) - ipv4_copy_dscp(iph, skb->h.ipiph); - if (!(x->props.flags & XFRM_STATE_NOECN)) - ipip_ecn_decapsulate(skb); - skb->mac.raw = memmove(skb->data - skb->mac_len, - skb->mac.raw, skb->mac_len); - skb->nh.raw = skb->data; - memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); decaps = 1; break; } diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c new file mode 100644 index 000000000000..e46d9a4ccc55 --- /dev/null +++ b/net/ipv4/xfrm4_mode_transport.c @@ -0,0 +1,69 @@ +/* + * xfrm4_mode_transport.c - Transport mode encapsulation for IPv4. + * + * Copyright (c) 2004-2006 Herbert Xu + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* Add encapsulation header. + * + * The IP header will be moved forward to make space for the encapsulation + * header. + * + * On exit, skb->h will be set to the start of the payload to be processed + * by x->type->output and skb->nh will be set to the top IP header. + */ +static int xfrm4_transport_output(struct sk_buff *skb) +{ + struct xfrm_state *x; + struct iphdr *iph; + int ihl; + + iph = skb->nh.iph; + skb->h.ipiph = iph; + + ihl = iph->ihl * 4; + skb->h.raw += ihl; + + x = skb->dst->xfrm; + skb->nh.raw = memmove(skb_push(skb, x->props.header_len), iph, ihl); + return 0; +} + +static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb) +{ + return 0; +} + +static struct xfrm_mode xfrm4_transport_mode = { + .input = xfrm4_transport_input, + .output = xfrm4_transport_output, + .owner = THIS_MODULE, + .encap = XFRM_MODE_TRANSPORT, +}; + +static int __init xfrm4_transport_init(void) +{ + return xfrm_register_mode(&xfrm4_transport_mode, AF_INET); +} + +static void __exit xfrm4_transport_exit(void) +{ + int err; + + err = xfrm_unregister_mode(&xfrm4_transport_mode, AF_INET); + BUG_ON(err); +} + +module_init(xfrm4_transport_init); +module_exit(xfrm4_transport_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TRANSPORT); diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c new file mode 100644 index 000000000000..f8d880beb12f --- /dev/null +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -0,0 +1,125 @@ +/* + * xfrm4_mode_tunnel.c - Tunnel mode encapsulation for IPv4. + * + * Copyright (c) 2004-2006 Herbert Xu + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static inline void ipip_ecn_decapsulate(struct sk_buff *skb) +{ + struct iphdr *outer_iph = skb->nh.iph; + struct iphdr *inner_iph = skb->h.ipiph; + + if (INET_ECN_is_ce(outer_iph->tos)) + IP_ECN_set_ce(inner_iph); +} + +/* Add encapsulation header. + * + * The top IP header will be constructed per RFC 2401. The following fields + * in it shall be filled in by x->type->output: + * tot_len + * check + * + * On exit, skb->h will be set to the start of the payload to be processed + * by x->type->output and skb->nh will be set to the top IP header. + */ +static int xfrm4_tunnel_output(struct sk_buff *skb) +{ + struct dst_entry *dst = skb->dst; + struct xfrm_state *x = dst->xfrm; + struct iphdr *iph, *top_iph; + int flags; + + iph = skb->nh.iph; + skb->h.ipiph = iph; + + skb->nh.raw = skb_push(skb, x->props.header_len); + top_iph = skb->nh.iph; + + top_iph->ihl = 5; + top_iph->version = 4; + + /* DS disclosed */ + top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos); + + flags = x->props.flags; + if (flags & XFRM_STATE_NOECN) + IP_ECN_clear(top_iph); + + top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? + 0 : (iph->frag_off & htons(IP_DF)); + if (!top_iph->frag_off) + __ip_select_ident(top_iph, dst->child, 0); + + top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT); + + top_iph->saddr = x->props.saddr.a4; + top_iph->daddr = x->id.daddr.a4; + top_iph->protocol = IPPROTO_IPIP; + + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + return 0; +} + +static int xfrm4_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) +{ + struct iphdr *iph = skb->nh.iph; + int err = -EINVAL; + + if (iph->protocol != IPPROTO_IPIP) + goto out; + if (!pskb_may_pull(skb, sizeof(struct iphdr))) + goto out; + + if (skb_cloned(skb) && + (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + goto out; + + if (x->props.flags & XFRM_STATE_DECAP_DSCP) + ipv4_copy_dscp(iph, skb->h.ipiph); + if (!(x->props.flags & XFRM_STATE_NOECN)) + ipip_ecn_decapsulate(skb); + skb->mac.raw = memmove(skb->data - skb->mac_len, + skb->mac.raw, skb->mac_len); + skb->nh.raw = skb->data; + memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); + err = 0; + +out: + return err; +} + +static struct xfrm_mode xfrm4_tunnel_mode = { + .input = xfrm4_tunnel_input, + .output = xfrm4_tunnel_output, + .owner = THIS_MODULE, + .encap = XFRM_MODE_TUNNEL, +}; + +static int __init xfrm4_tunnel_init(void) +{ + return xfrm_register_mode(&xfrm4_tunnel_mode, AF_INET); +} + +static void __exit xfrm4_tunnel_exit(void) +{ + int err; + + err = xfrm_unregister_mode(&xfrm4_tunnel_mode, AF_INET); + BUG_ON(err); +} + +module_init(xfrm4_tunnel_init); +module_exit(xfrm4_tunnel_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_MODE(AF_INET, XFRM_MODE_TUNNEL); diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 4ef8efaf6a67..ac9d91d4bb05 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -12,67 +12,10 @@ #include #include #include -#include #include #include #include -/* Add encapsulation header. - * - * In transport mode, the IP header will be moved forward to make space - * for the encapsulation header. - * - * In tunnel mode, the top IP header will be constructed per RFC 2401. - * The following fields in it shall be filled in by x->type->output: - * tot_len - * check - * - * On exit, skb->h will be set to the start of the payload to be processed - * by x->type->output and skb->nh will be set to the top IP header. - */ -static void xfrm4_encap(struct sk_buff *skb) -{ - struct dst_entry *dst = skb->dst; - struct xfrm_state *x = dst->xfrm; - struct iphdr *iph, *top_iph; - int flags; - - iph = skb->nh.iph; - skb->h.ipiph = iph; - - skb->nh.raw = skb_push(skb, x->props.header_len); - top_iph = skb->nh.iph; - - if (!x->props.mode) { - skb->h.raw += iph->ihl*4; - memmove(top_iph, iph, iph->ihl*4); - return; - } - - top_iph->ihl = 5; - top_iph->version = 4; - - /* DS disclosed */ - top_iph->tos = INET_ECN_encapsulate(iph->tos, iph->tos); - - flags = x->props.flags; - if (flags & XFRM_STATE_NOECN) - IP_ECN_clear(top_iph); - - top_iph->frag_off = (flags & XFRM_STATE_NOPMTUDISC) ? - 0 : (iph->frag_off & htons(IP_DF)); - if (!top_iph->frag_off) - __ip_select_ident(top_iph, dst->child, 0); - - top_iph->ttl = dst_metric(dst->child, RTAX_HOPLIMIT); - - top_iph->saddr = x->props.saddr.a4; - top_iph->daddr = x->id.daddr.a4; - top_iph->protocol = IPPROTO_IPIP; - - memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options)); -} - static int xfrm4_tunnel_check_size(struct sk_buff *skb) { int mtu, ret = 0; @@ -121,7 +64,9 @@ static int xfrm4_output_one(struct sk_buff *skb) if (err) goto error; - xfrm4_encap(skb); + err = x->mode->output(skb); + if (err) + goto error; err = x->type->output(x, skb); if (err) diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig index f8a107ab5592..e923d4dea418 100644 --- a/net/ipv6/Kconfig +++ b/net/ipv6/Kconfig @@ -106,6 +106,26 @@ config INET6_TUNNEL tristate default n +config INET6_XFRM_MODE_TRANSPORT + tristate "IPv6: IPsec transport mode" + depends on IPV6 + default IPV6 + select XFRM + ---help--- + Support for IPsec transport mode. + + If unsure, say Y. + +config INET6_XFRM_MODE_TUNNEL + tristate "IPv6: IPsec tunnel mode" + depends on IPV6 + default IPV6 + select XFRM + ---help--- + Support for IPsec tunnel mode. + + If unsure, say Y. + config IPV6_TUNNEL tristate "IPv6: IPv6-in-IPv6 tunnel" select INET6_TUNNEL diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index a760b0988fbb..386e0a626948 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -20,6 +20,8 @@ obj-$(CONFIG_INET6_ESP) += esp6.o obj-$(CONFIG_INET6_IPCOMP) += ipcomp6.o obj-$(CONFIG_INET6_XFRM_TUNNEL) += xfrm6_tunnel.o obj-$(CONFIG_INET6_TUNNEL) += tunnel6.o +obj-$(CONFIG_INET6_XFRM_MODE_TRANSPORT) += xfrm6_mode_transport.o +obj-$(CONFIG_INET6_XFRM_MODE_TUNNEL) += xfrm6_mode_tunnel.o obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index e46048974f37..416f6e428a0a 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -488,6 +489,7 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) return offset; } +EXPORT_SYMBOL_GPL(ip6_find_1stfragopt); static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) { diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c index 00cfdee18dca..0405d74ff910 100644 --- a/net/ipv6/xfrm6_input.c +++ b/net/ipv6/xfrm6_input.c @@ -13,21 +13,9 @@ #include #include #include -#include -#include -#include #include #include -static inline void ipip6_ecn_decapsulate(struct sk_buff *skb) -{ - struct ipv6hdr *outer_iph = skb->nh.ipv6h; - struct ipv6hdr *inner_iph = skb->h.ipv6h; - - if (INET_ECN_is_ce(ipv6_get_dsfield(outer_iph))) - IP6_ECN_set_ce(inner_iph); -} - int xfrm6_rcv_spi(struct sk_buff *skb, u32 spi) { int err; @@ -81,21 +69,10 @@ int xfrm6_rcv_spi(struct sk_buff *skb, u32 spi) xfrm_vec[xfrm_nr++] = x; + if (x->mode->input(x, skb)) + goto drop; + if (x->props.mode) { /* XXX */ - if (nexthdr != IPPROTO_IPV6) - goto drop; - if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) - goto drop; - if (skb_cloned(skb) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) - goto drop; - if (x->props.flags & XFRM_STATE_DECAP_DSCP) - ipv6_copy_dscp(skb->nh.ipv6h, skb->h.ipv6h); - if (!(x->props.flags & XFRM_STATE_NOECN)) - ipip6_ecn_decapsulate(skb); - skb->mac.raw = memmove(skb->data - skb->mac_len, - skb->mac.raw, skb->mac_len); - skb->nh.raw = skb->data; decaps = 1; break; } diff --git a/net/ipv6/xfrm6_mode_transport.c b/net/ipv6/xfrm6_mode_transport.c new file mode 100644 index 000000000000..5efbbae08ef0 --- /dev/null +++ b/net/ipv6/xfrm6_mode_transport.c @@ -0,0 +1,73 @@ +/* + * xfrm6_mode_transport.c - Transport mode encapsulation for IPv6. + * + * Copyright (C) 2002 USAGI/WIDE Project + * Copyright (c) 2004-2006 Herbert Xu + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* Add encapsulation header. + * + * The IP header and mutable extension headers will be moved forward to make + * space for the encapsulation header. + * + * On exit, skb->h will be set to the start of the encapsulation header to be + * filled in by x->type->output and skb->nh will be set to the nextheader field + * of the extension header directly preceding the encapsulation header, or in + * its absence, that of the top IP header. The value of skb->data will always + * point to the top IP header. + */ +static int xfrm6_transport_output(struct sk_buff *skb) +{ + struct xfrm_state *x = skb->dst->xfrm; + struct ipv6hdr *iph; + u8 *prevhdr; + int hdr_len; + + skb_push(skb, x->props.header_len); + iph = skb->nh.ipv6h; + + hdr_len = ip6_find_1stfragopt(skb, &prevhdr); + skb->nh.raw = prevhdr - x->props.header_len; + skb->h.raw = skb->data + hdr_len; + memmove(skb->data, iph, hdr_len); + return 0; +} + +static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb) +{ + return 0; +} + +static struct xfrm_mode xfrm6_transport_mode = { + .input = xfrm6_transport_input, + .output = xfrm6_transport_output, + .owner = THIS_MODULE, + .encap = XFRM_MODE_TRANSPORT, +}; + +static int __init xfrm6_transport_init(void) +{ + return xfrm_register_mode(&xfrm6_transport_mode, AF_INET6); +} + +static void __exit xfrm6_transport_exit(void) +{ + int err; + + err = xfrm_unregister_mode(&xfrm6_transport_mode, AF_INET6); + BUG_ON(err); +} + +module_init(xfrm6_transport_init); +module_exit(xfrm6_transport_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_TRANSPORT); diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c new file mode 100644 index 000000000000..8af79be2edca --- /dev/null +++ b/net/ipv6/xfrm6_mode_tunnel.c @@ -0,0 +1,121 @@ +/* + * xfrm6_mode_tunnel.c - Tunnel mode encapsulation for IPv6. + * + * Copyright (C) 2002 USAGI/WIDE Project + * Copyright (c) 2004-2006 Herbert Xu + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static inline void ipip6_ecn_decapsulate(struct sk_buff *skb) +{ + struct ipv6hdr *outer_iph = skb->nh.ipv6h; + struct ipv6hdr *inner_iph = skb->h.ipv6h; + + if (INET_ECN_is_ce(ipv6_get_dsfield(outer_iph))) + IP6_ECN_set_ce(inner_iph); +} + +/* Add encapsulation header. + * + * The top IP header will be constructed per RFC 2401. The following fields + * in it shall be filled in by x->type->output: + * payload_len + * + * On exit, skb->h will be set to the start of the encapsulation header to be + * filled in by x->type->output and skb->nh will be set to the nextheader field + * of the extension header directly preceding the encapsulation header, or in + * its absence, that of the top IP header. The value of skb->data will always + * point to the top IP header. + */ +static int xfrm6_tunnel_output(struct sk_buff *skb) +{ + struct dst_entry *dst = skb->dst; + struct xfrm_state *x = dst->xfrm; + struct ipv6hdr *iph, *top_iph; + int dsfield; + + skb_push(skb, x->props.header_len); + iph = skb->nh.ipv6h; + + skb->nh.raw = skb->data; + top_iph = skb->nh.ipv6h; + skb->nh.raw = &top_iph->nexthdr; + skb->h.ipv6h = top_iph + 1; + + top_iph->version = 6; + top_iph->priority = iph->priority; + top_iph->flow_lbl[0] = iph->flow_lbl[0]; + top_iph->flow_lbl[1] = iph->flow_lbl[1]; + top_iph->flow_lbl[2] = iph->flow_lbl[2]; + dsfield = ipv6_get_dsfield(top_iph); + dsfield = INET_ECN_encapsulate(dsfield, dsfield); + if (x->props.flags & XFRM_STATE_NOECN) + dsfield &= ~INET_ECN_MASK; + ipv6_change_dsfield(top_iph, 0, dsfield); + top_iph->nexthdr = IPPROTO_IPV6; + top_iph->hop_limit = dst_metric(dst->child, RTAX_HOPLIMIT); + ipv6_addr_copy(&top_iph->saddr, (struct in6_addr *)&x->props.saddr); + ipv6_addr_copy(&top_iph->daddr, (struct in6_addr *)&x->id.daddr); + return 0; +} + +static int xfrm6_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) +{ + int err = -EINVAL; + + if (skb->nh.raw[IP6CB(skb)->nhoff] != IPPROTO_IPV6) + goto out; + if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) + goto out; + + if (skb_cloned(skb) && + (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + goto out; + + if (x->props.flags & XFRM_STATE_DECAP_DSCP) + ipv6_copy_dscp(skb->nh.ipv6h, skb->h.ipv6h); + if (!(x->props.flags & XFRM_STATE_NOECN)) + ipip6_ecn_decapsulate(skb); + skb->mac.raw = memmove(skb->data - skb->mac_len, + skb->mac.raw, skb->mac_len); + skb->nh.raw = skb->data; + err = 0; + +out: + return err; +} + +static struct xfrm_mode xfrm6_tunnel_mode = { + .input = xfrm6_tunnel_input, + .output = xfrm6_tunnel_output, + .owner = THIS_MODULE, + .encap = XFRM_MODE_TUNNEL, +}; + +static int __init xfrm6_tunnel_init(void) +{ + return xfrm_register_mode(&xfrm6_tunnel_mode, AF_INET6); +} + +static void __exit xfrm6_tunnel_exit(void) +{ + int err; + + err = xfrm_unregister_mode(&xfrm6_tunnel_mode, AF_INET6); + BUG_ON(err); +} + +module_init(xfrm6_tunnel_init); +module_exit(xfrm6_tunnel_exit); +MODULE_LICENSE("GPL"); +MODULE_ALIAS_XFRM_MODE(AF_INET6, XFRM_MODE_TUNNEL); diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 80242172a5df..16e84254a252 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -14,68 +14,9 @@ #include #include #include -#include -#include #include #include -/* Add encapsulation header. - * - * In transport mode, the IP header and mutable extension headers will be moved - * forward to make space for the encapsulation header. - * - * In tunnel mode, the top IP header will be constructed per RFC 2401. - * The following fields in it shall be filled in by x->type->output: - * payload_len - * - * On exit, skb->h will be set to the start of the encapsulation header to be - * filled in by x->type->output and skb->nh will be set to the nextheader field - * of the extension header directly preceding the encapsulation header, or in - * its absence, that of the top IP header. The value of skb->data will always - * point to the top IP header. - */ -static void xfrm6_encap(struct sk_buff *skb) -{ - struct dst_entry *dst = skb->dst; - struct xfrm_state *x = dst->xfrm; - struct ipv6hdr *iph, *top_iph; - int dsfield; - - skb_push(skb, x->props.header_len); - iph = skb->nh.ipv6h; - - if (!x->props.mode) { - u8 *prevhdr; - int hdr_len; - - hdr_len = ip6_find_1stfragopt(skb, &prevhdr); - skb->nh.raw = prevhdr - x->props.header_len; - skb->h.raw = skb->data + hdr_len; - memmove(skb->data, iph, hdr_len); - return; - } - - skb->nh.raw = skb->data; - top_iph = skb->nh.ipv6h; - skb->nh.raw = &top_iph->nexthdr; - skb->h.ipv6h = top_iph + 1; - - top_iph->version = 6; - top_iph->priority = iph->priority; - top_iph->flow_lbl[0] = iph->flow_lbl[0]; - top_iph->flow_lbl[1] = iph->flow_lbl[1]; - top_iph->flow_lbl[2] = iph->flow_lbl[2]; - dsfield = ipv6_get_dsfield(top_iph); - dsfield = INET_ECN_encapsulate(dsfield, dsfield); - if (x->props.flags & XFRM_STATE_NOECN) - dsfield &= ~INET_ECN_MASK; - ipv6_change_dsfield(top_iph, 0, dsfield); - top_iph->nexthdr = IPPROTO_IPV6; - top_iph->hop_limit = dst_metric(dst->child, RTAX_HOPLIMIT); - ipv6_addr_copy(&top_iph->saddr, (struct in6_addr *)&x->props.saddr); - ipv6_addr_copy(&top_iph->daddr, (struct in6_addr *)&x->id.daddr); -} - static int xfrm6_tunnel_check_size(struct sk_buff *skb) { int mtu, ret = 0; @@ -118,7 +59,9 @@ static int xfrm6_output_one(struct sk_buff *skb) if (err) goto error; - xfrm6_encap(skb); + err = x->mode->output(skb); + if (err) + goto error; err = x->type->output(x, skb); if (err) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 44b64a593c01..b8936926c24b 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -138,6 +138,89 @@ void xfrm_put_type(struct xfrm_type *type) module_put(type->owner); } +int xfrm_register_mode(struct xfrm_mode *mode, int family) +{ + struct xfrm_policy_afinfo *afinfo; + struct xfrm_mode **modemap; + int err; + + if (unlikely(mode->encap >= XFRM_MODE_MAX)) + return -EINVAL; + + afinfo = xfrm_policy_lock_afinfo(family); + if (unlikely(afinfo == NULL)) + return -EAFNOSUPPORT; + + err = -EEXIST; + modemap = afinfo->mode_map; + if (likely(modemap[mode->encap] == NULL)) { + modemap[mode->encap] = mode; + err = 0; + } + + xfrm_policy_unlock_afinfo(afinfo); + return err; +} +EXPORT_SYMBOL(xfrm_register_mode); + +int xfrm_unregister_mode(struct xfrm_mode *mode, int family) +{ + struct xfrm_policy_afinfo *afinfo; + struct xfrm_mode **modemap; + int err; + + if (unlikely(mode->encap >= XFRM_MODE_MAX)) + return -EINVAL; + + afinfo = xfrm_policy_lock_afinfo(family); + if (unlikely(afinfo == NULL)) + return -EAFNOSUPPORT; + + err = -ENOENT; + modemap = afinfo->mode_map; + if (likely(modemap[mode->encap] == mode)) { + modemap[mode->encap] = NULL; + err = 0; + } + + xfrm_policy_unlock_afinfo(afinfo); + return err; +} +EXPORT_SYMBOL(xfrm_unregister_mode); + +struct xfrm_mode *xfrm_get_mode(unsigned int encap, int family) +{ + struct xfrm_policy_afinfo *afinfo; + struct xfrm_mode *mode; + int modload_attempted = 0; + + if (unlikely(encap >= XFRM_MODE_MAX)) + return NULL; + +retry: + afinfo = xfrm_policy_get_afinfo(family); + if (unlikely(afinfo == NULL)) + return NULL; + + mode = afinfo->mode_map[encap]; + if (unlikely(mode && !try_module_get(mode->owner))) + mode = NULL; + if (!mode && !modload_attempted) { + xfrm_policy_put_afinfo(afinfo); + request_module("xfrm-mode-%d-%d", family, encap); + modload_attempted = 1; + goto retry; + } + + xfrm_policy_put_afinfo(afinfo); + return mode; +} + +void xfrm_put_mode(struct xfrm_mode *mode) +{ + module_put(mode->owner); +} + static inline unsigned long make_jiffies(long secs) { if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index ee62c239a7e3..17b29ec3c417 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -77,6 +77,8 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x) kfree(x->ealg); kfree(x->calg); kfree(x->encap); + if (x->mode) + xfrm_put_mode(x->mode); if (x->type) { x->type->destructor(x); xfrm_put_type(x->type); @@ -1193,6 +1195,10 @@ int xfrm_init_state(struct xfrm_state *x) if (err) goto error; + x->mode = xfrm_get_mode(x->props.mode, family); + if (x->mode == NULL) + goto error; + x->km.state = XFRM_STATE_VALID; error: -- cgit v1.2.3-59-g8ed1b From 31a4ab93025719e62e7cf7ce899f71c34ecde5a0 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 27 May 2006 23:06:13 -0700 Subject: [IPSEC] proto: Move transport mode input path into xfrm_mode_transport Now that we have xfrm_mode objects we can move the transport mode specific input decapsulation code into xfrm_mode_transport. This removes duplicate code as well as unnecessary header movement in case of tunnel mode SAs since we will discard the original IP header immediately. This also fixes a minor bug for transport-mode ESP where the IP payload length is set to the correct value minus the header length (with extension headers for IPv6). Of course the other neat thing is that we no longer have to allocate temporary buffers to hold the IP headers for ESP and IPComp. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv4/ah4.c | 15 +++++++-------- net/ipv4/esp4.c | 18 ++++++------------ net/ipv4/ipcomp.c | 23 +++++------------------ net/ipv4/xfrm4_mode_transport.c | 14 ++++++++++++++ net/ipv6/ah6.c | 10 +++------- net/ipv6/esp6.c | 20 ++++---------------- net/ipv6/ipcomp6.c | 27 +++++---------------------- net/ipv6/xfrm6_mode_transport.c | 15 +++++++++++++++ 8 files changed, 59 insertions(+), 83 deletions(-) (limited to 'net') diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index e2e4771fa4c6..c7782230080d 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -119,6 +119,7 @@ error: static int ah_input(struct xfrm_state *x, struct sk_buff *skb) { int ah_hlen; + int ihl; struct iphdr *iph; struct ip_auth_hdr *ah; struct ah_data *ahp; @@ -149,13 +150,14 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) ah = (struct ip_auth_hdr*)skb->data; iph = skb->nh.iph; - memcpy(work_buf, iph, iph->ihl*4); + ihl = skb->data - skb->nh.raw; + memcpy(work_buf, iph, ihl); iph->ttl = 0; iph->tos = 0; iph->frag_off = 0; iph->check = 0; - if (iph->ihl != 5) { + if (ihl > sizeof(*iph)) { u32 dummy; if (ip_clear_mutable_options(iph, &dummy)) goto out; @@ -164,7 +166,7 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) u8 auth_data[MAX_AH_AUTH_LEN]; memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); - skb_push(skb, skb->data - skb->nh.raw); + skb_push(skb, ihl); ahp->icv(ahp, skb, ah->auth_data); if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) { x->stats.integrity_failed++; @@ -172,11 +174,8 @@ static int ah_input(struct xfrm_state *x, struct sk_buff *skb) } } ((struct iphdr*)work_buf)->protocol = ah->nexthdr; - skb->nh.raw = skb_pull(skb, ah_hlen); - memcpy(skb->nh.raw, work_buf, iph->ihl*4); - skb->nh.iph->tot_len = htons(skb->len); - skb_pull(skb, skb->nh.iph->ihl*4); - skb->h.raw = skb->data; + skb->h.raw = memcpy(skb->nh.raw += ah_hlen, work_buf, ihl); + __skb_pull(skb, ah_hlen + ihl); return 0; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 9d1881c07a32..9bbdd4494551 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -143,10 +143,9 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) int alen = esp->auth.icv_trunc_len; int elen = skb->len - sizeof(struct ip_esp_hdr) - esp->conf.ivlen - alen; int nfrags; - int encap_len = 0; + int ihl; u8 nexthdr[2]; struct scatterlist *sg; - u8 workbuf[60]; int padlen; if (!pskb_may_pull(skb, sizeof(struct ip_esp_hdr))) @@ -177,7 +176,6 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) skb->ip_summed = CHECKSUM_NONE; esph = (struct ip_esp_hdr*)skb->data; - iph = skb->nh.iph; /* Get ivec. This can be wrong, check against another impls. */ if (esp->conf.ivlen) @@ -204,12 +202,12 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) /* ... check padding bits here. Silly. :-) */ + iph = skb->nh.iph; + ihl = iph->ihl * 4; + if (x->encap) { struct xfrm_encap_tmpl *encap = x->encap; - struct udphdr *uh; - - uh = (struct udphdr *)(iph + 1); - encap_len = (void*)esph - (void*)uh; + struct udphdr *uh = (void *)(skb->nh.raw + ihl); /* * 1) if the NAT-T peer's IP or port changed then @@ -246,11 +244,7 @@ static int esp_input(struct xfrm_state *x, struct sk_buff *skb) iph->protocol = nexthdr[1]; pskb_trim(skb, skb->len - alen - padlen - 2); - memcpy(workbuf, skb->nh.raw, iph->ihl*4); - skb->h.raw = skb_pull(skb, sizeof(struct ip_esp_hdr) + esp->conf.ivlen); - skb->nh.raw += encap_len + sizeof(struct ip_esp_hdr) + esp->conf.ivlen; - memcpy(skb->nh.raw, workbuf, iph->ihl*4); - skb->nh.iph->tot_len = htons(skb->len); + skb->h.raw = __skb_pull(skb, sizeof(*esph) + esp->conf.ivlen) - ihl; return 0; diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 95278b22b669..8e243589045f 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -45,7 +45,6 @@ static LIST_HEAD(ipcomp_tfms_list); static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb) { int err, plen, dlen; - struct iphdr *iph; struct ipcomp_data *ipcd = x->data; u8 *start, *scratch; struct crypto_tfm *tfm; @@ -74,8 +73,6 @@ static int ipcomp_decompress(struct xfrm_state *x, struct sk_buff *skb) skb_put(skb, dlen - plen); memcpy(skb->data, scratch, dlen); - iph = skb->nh.iph; - iph->tot_len = htons(dlen + iph->ihl * 4); out: put_cpu(); return err; @@ -83,14 +80,9 @@ out: static int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb) { - u8 nexthdr; int err = 0; struct iphdr *iph; - union { - struct iphdr iph; - char buf[60]; - } tmp_iph; - + struct ip_comp_hdr *ipch; if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && skb_linearize(skb, GFP_ATOMIC) != 0) { @@ -102,15 +94,10 @@ static int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb) /* Remove ipcomp header and decompress original payload */ iph = skb->nh.iph; - memcpy(&tmp_iph, iph, iph->ihl * 4); - nexthdr = *(u8 *)skb->data; - skb_pull(skb, sizeof(struct ip_comp_hdr)); - skb->nh.raw += sizeof(struct ip_comp_hdr); - memcpy(skb->nh.raw, &tmp_iph, tmp_iph.iph.ihl * 4); - iph = skb->nh.iph; - iph->tot_len = htons(ntohs(iph->tot_len) - sizeof(struct ip_comp_hdr)); - iph->protocol = nexthdr; - skb->h.raw = skb->data; + ipch = (void *)skb->data; + iph->protocol = ipch->nexthdr; + skb->h.raw = skb->nh.raw + sizeof(*ipch); + __skb_pull(skb, sizeof(*ipch)); err = ipcomp_decompress(x, skb); out: diff --git a/net/ipv4/xfrm4_mode_transport.c b/net/ipv4/xfrm4_mode_transport.c index e46d9a4ccc55..a9e6b3dd19c9 100644 --- a/net/ipv4/xfrm4_mode_transport.c +++ b/net/ipv4/xfrm4_mode_transport.c @@ -38,8 +38,22 @@ static int xfrm4_transport_output(struct sk_buff *skb) return 0; } +/* Remove encapsulation header. + * + * The IP header will be moved over the top of the encapsulation header. + * + * On entry, skb->h shall point to where the IP header should be and skb->nh + * shall be set to where the IP header currently is. skb->data shall point + * to the start of the payload. + */ static int xfrm4_transport_input(struct xfrm_state *x, struct sk_buff *skb) { + int ihl = skb->data - skb->h.raw; + + if (skb->h.raw != skb->nh.raw) + skb->nh.raw = memmove(skb->h.raw, skb->nh.raw, ihl); + skb->nh.iph->tot_len = htons(skb->len + ihl); + skb->h.raw = skb->data; return 0; } diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index 6778173a3dda..d31c0d6c0448 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -292,7 +292,7 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) memcpy(auth_data, ah->auth_data, ahp->icv_trunc_len); memset(ah->auth_data, 0, ahp->icv_trunc_len); - skb_push(skb, skb->data - skb->nh.raw); + skb_push(skb, hdr_len); ahp->icv(ahp, skb, ah->auth_data); if (memcmp(ah->auth_data, auth_data, ahp->icv_trunc_len)) { LIMIT_NETDEBUG(KERN_WARNING "ipsec ah authentication error\n"); @@ -301,12 +301,8 @@ static int ah6_input(struct xfrm_state *x, struct sk_buff *skb) } } - skb->nh.raw = skb_pull(skb, ah_hlen); - memcpy(skb->nh.raw, tmp_hdr, hdr_len); - skb->nh.ipv6h->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); - skb_pull(skb, hdr_len); - skb->h.raw = skb->data; - + skb->h.raw = memcpy(skb->nh.raw += ah_hlen, tmp_hdr, hdr_len); + __skb_pull(skb, ah_hlen + hdr_len); kfree(tmp_hdr); diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 22f046079037..a15a6f320f70 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -142,25 +142,17 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) int hdr_len = skb->h.raw - skb->nh.raw; int nfrags; - unsigned char *tmp_hdr = NULL; int ret = 0; if (!pskb_may_pull(skb, sizeof(struct ipv6_esp_hdr))) { ret = -EINVAL; - goto out_nofree; + goto out; } if (elen <= 0 || (elen & (blksize-1))) { ret = -EINVAL; - goto out_nofree; - } - - tmp_hdr = kmalloc(hdr_len, GFP_ATOMIC); - if (!tmp_hdr) { - ret = -ENOMEM; - goto out_nofree; + goto out; } - memcpy(tmp_hdr, skb->nh.raw, hdr_len); /* If integrity check is required, do this. */ if (esp->auth.icv_full_len) { @@ -222,16 +214,12 @@ static int esp6_input(struct xfrm_state *x, struct sk_buff *skb) /* ... check padding bits here. Silly. :-) */ pskb_trim(skb, skb->len - alen - padlen - 2); - skb->h.raw = skb_pull(skb, sizeof(struct ipv6_esp_hdr) + esp->conf.ivlen); - skb->nh.raw += sizeof(struct ipv6_esp_hdr) + esp->conf.ivlen; - memcpy(skb->nh.raw, tmp_hdr, hdr_len); - skb->nh.ipv6h->payload_len = htons(skb->len - sizeof(struct ipv6hdr)); ret = nexthdr[1]; } + skb->h.raw = __skb_pull(skb, sizeof(*esph) + esp->conf.ivlen) - hdr_len; + out: - kfree(tmp_hdr); -out_nofree: return ret; } diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 48636436028a..cec3be544b69 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -66,10 +66,8 @@ static LIST_HEAD(ipcomp6_tfms_list); static int ipcomp6_input(struct xfrm_state *x, struct sk_buff *skb) { int err = 0; - u8 nexthdr = 0; - int hdr_len = skb->h.raw - skb->nh.raw; - unsigned char *tmp_hdr = NULL; struct ipv6hdr *iph; + struct ipv6_comp_hdr *ipch; int plen, dlen; struct ipcomp_data *ipcd = x->data; u8 *start, *scratch; @@ -86,17 +84,9 @@ static int ipcomp6_input(struct xfrm_state *x, struct sk_buff *skb) /* Remove ipcomp header and decompress original payload */ iph = skb->nh.ipv6h; - tmp_hdr = kmalloc(hdr_len, GFP_ATOMIC); - if (!tmp_hdr) - goto out; - memcpy(tmp_hdr, iph, hdr_len); - nexthdr = *(u8 *)skb->data; - skb_pull(skb, sizeof(struct ipv6_comp_hdr)); - skb->nh.raw += sizeof(struct ipv6_comp_hdr); - memcpy(skb->nh.raw, tmp_hdr, hdr_len); - iph = skb->nh.ipv6h; - iph->payload_len = htons(ntohs(iph->payload_len) - sizeof(struct ipv6_comp_hdr)); - skb->h.raw = skb->data; + ipch = (void *)skb->data; + skb->h.raw = skb->nh.raw + sizeof(*ipch); + __skb_pull(skb, sizeof(*ipch)); /* decompression */ plen = skb->len; @@ -125,18 +115,11 @@ static int ipcomp6_input(struct xfrm_state *x, struct sk_buff *skb) skb_put(skb, dlen - plen); memcpy(skb->data, scratch, dlen); + err = ipch->nexthdr; - iph = skb->nh.ipv6h; - iph->payload_len = htons(skb->len); - out_put_cpu: put_cpu(); out: - kfree(tmp_hdr); - if (err) - goto error_out; - return nexthdr; -error_out: return err; } diff --git a/net/ipv6/xfrm6_mode_transport.c b/net/ipv6/xfrm6_mode_transport.c index 5efbbae08ef0..711d713e36d8 100644 --- a/net/ipv6/xfrm6_mode_transport.c +++ b/net/ipv6/xfrm6_mode_transport.c @@ -42,8 +42,23 @@ static int xfrm6_transport_output(struct sk_buff *skb) return 0; } +/* Remove encapsulation header. + * + * The IP header will be moved over the top of the encapsulation header. + * + * On entry, skb->h shall point to where the IP header should be and skb->nh + * shall be set to where the IP header currently is. skb->data shall point + * to the start of the payload. + */ static int xfrm6_transport_input(struct xfrm_state *x, struct sk_buff *skb) { + int ihl = skb->data - skb->h.raw; + + if (skb->h.raw != skb->nh.raw) + skb->nh.raw = memmove(skb->h.raw, skb->nh.raw, ihl); + skb->nh.ipv6h->payload_len = htons(skb->len + ihl - + sizeof(struct ipv6hdr)); + skb->h.raw = skb->data; return 0; } -- cgit v1.2.3-59-g8ed1b From 3e72b2fe5b31791f976350b023b7a37ef59c02c1 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:19:19 -0700 Subject: [NETFILTER]: x_tables: remove some unnecessary casts Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ipt_hashlimit.c | 2 +- net/netfilter/xt_connmark.c | 2 +- net/netfilter/xt_dccp.c | 3 +-- net/netfilter/xt_mark.c | 2 +- net/netfilter/xt_sctp.c | 4 +--- net/netfilter/xt_string.c | 2 +- 6 files changed, 6 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c index 7c6836c4646e..b88adc7f4b47 100644 --- a/net/ipv4/netfilter/ipt_hashlimit.c +++ b/net/ipv4/netfilter/ipt_hashlimit.c @@ -561,7 +561,7 @@ static void hashlimit_destroy(const struct xt_match *match, void *matchinfo, unsigned int matchsize) { - struct ipt_hashlimit_info *r = (struct ipt_hashlimit_info *) matchinfo; + struct ipt_hashlimit_info *r = matchinfo; htable_put(r->hinfo); } diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index dc26a27cbcaf..56324c8aff0a 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -58,7 +58,7 @@ checkentry(const char *tablename, unsigned int matchsize, unsigned int hook_mask) { - struct xt_connmark_info *cm = (struct xt_connmark_info *)matchinfo; + struct xt_connmark_info *cm = matchinfo; if (cm->mark > 0xffffffff || cm->mask > 0xffffffff) { printk(KERN_WARNING "connmark: only support 32bit mark\n"); diff --git a/net/netfilter/xt_dccp.c b/net/netfilter/xt_dccp.c index dfb10b648e57..2e2f825dad4c 100644 --- a/net/netfilter/xt_dccp.c +++ b/net/netfilter/xt_dccp.c @@ -101,8 +101,7 @@ match(const struct sk_buff *skb, unsigned int protoff, int *hotdrop) { - const struct xt_dccp_info *info = - (const struct xt_dccp_info *)matchinfo; + const struct xt_dccp_info *info = matchinfo; struct dccp_hdr _dh, *dh; if (offset) diff --git a/net/netfilter/xt_mark.c b/net/netfilter/xt_mark.c index 8b385a34886d..876bc5797738 100644 --- a/net/netfilter/xt_mark.c +++ b/net/netfilter/xt_mark.c @@ -42,7 +42,7 @@ checkentry(const char *tablename, unsigned int matchsize, unsigned int hook_mask) { - struct xt_mark_info *minfo = (struct xt_mark_info *) matchinfo; + const struct xt_mark_info *minfo = matchinfo; if (minfo->mark > 0xffffffff || minfo->mask > 0xffffffff) { printk(KERN_WARNING "mark: only supports 32bit mark\n"); diff --git a/net/netfilter/xt_sctp.c b/net/netfilter/xt_sctp.c index 34bd87259a09..b5110e5b54b0 100644 --- a/net/netfilter/xt_sctp.c +++ b/net/netfilter/xt_sctp.c @@ -129,11 +129,9 @@ match(const struct sk_buff *skb, unsigned int protoff, int *hotdrop) { - const struct xt_sctp_info *info; + const struct xt_sctp_info *info = matchinfo; sctp_sctphdr_t _sh, *sh; - info = (const struct xt_sctp_info *)matchinfo; - if (offset) { duprintf("Dropping non-first fragment.. FIXME\n"); return 0; diff --git a/net/netfilter/xt_string.c b/net/netfilter/xt_string.c index 79d9ea6964ba..0ebb6ac2c8c7 100644 --- a/net/netfilter/xt_string.c +++ b/net/netfilter/xt_string.c @@ -30,8 +30,8 @@ static int match(const struct sk_buff *skb, unsigned int protoff, int *hotdrop) { + const struct xt_string_info *conf = matchinfo; struct ts_state state; - struct xt_string_info *conf = (struct xt_string_info *) matchinfo; memset(&state, 0, sizeof(struct ts_state)); -- cgit v1.2.3-59-g8ed1b From 957dc80ac30f3c4d53259fa936df807663ba54fa Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:19:56 -0700 Subject: [NETFILTER]: x_tables: add SCTP/DCCP support where missing Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ipt_CLUSTERIP.c | 20 ++++-------- net/ipv4/netfilter/ipt_hashlimit.c | 64 ++++++++++---------------------------- net/netfilter/xt_multiport.c | 7 +++-- 3 files changed, 26 insertions(+), 65 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index aad9d28c8d71..dbc83c5d7aa6 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -241,25 +241,17 @@ clusterip_hashfn(struct sk_buff *skb, struct clusterip_config *config) struct iphdr *iph = skb->nh.iph; unsigned long hashval; u_int16_t sport, dport; - struct tcphdr *th; - struct udphdr *uh; - struct icmphdr *ih; + u_int16_t *ports; switch (iph->protocol) { case IPPROTO_TCP: - th = (void *)iph+iph->ihl*4; - sport = ntohs(th->source); - dport = ntohs(th->dest); - break; case IPPROTO_UDP: - uh = (void *)iph+iph->ihl*4; - sport = ntohs(uh->source); - dport = ntohs(uh->dest); - break; + case IPPROTO_SCTP: + case IPPROTO_DCCP: case IPPROTO_ICMP: - ih = (void *)iph+iph->ihl*4; - sport = ntohs(ih->un.echo.id); - dport = (ih->type<<8)|ih->code; + ports = (void *)iph+iph->ihl*4; + sport = ports[0]; + dport = ports[1]; break; default: if (net_ratelimit()) { diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c index b88adc7f4b47..85edfb79469a 100644 --- a/net/ipv4/netfilter/ipt_hashlimit.c +++ b/net/ipv4/netfilter/ipt_hashlimit.c @@ -28,9 +28,6 @@ #include #include #include -#include -#include -#include #include #include #include @@ -381,49 +378,6 @@ static inline void rateinfo_recalc(struct dsthash_ent *dh, unsigned long now) dh->rateinfo.credit = dh->rateinfo.credit_cap; } -static inline int get_ports(const struct sk_buff *skb, int offset, - u16 ports[2]) -{ - union { - struct tcphdr th; - struct udphdr uh; - sctp_sctphdr_t sctph; - } hdr_u, *ptr_u; - - /* Must not be a fragment. */ - if (offset) - return 1; - - /* Must be big enough to read ports (both UDP and TCP have - them at the start). */ - ptr_u = skb_header_pointer(skb, skb->nh.iph->ihl*4, 8, &hdr_u); - if (!ptr_u) - return 1; - - switch (skb->nh.iph->protocol) { - case IPPROTO_TCP: - ports[0] = ptr_u->th.source; - ports[1] = ptr_u->th.dest; - break; - case IPPROTO_UDP: - ports[0] = ptr_u->uh.source; - ports[1] = ptr_u->uh.dest; - break; - case IPPROTO_SCTP: - ports[0] = ptr_u->sctph.source; - ports[1] = ptr_u->sctph.dest; - break; - default: - /* all other protocols don't supprot per-port hash - * buckets */ - ports[0] = ports[1] = 0; - break; - } - - return 0; -} - - static int hashlimit_match(const struct sk_buff *skb, const struct net_device *in, @@ -449,8 +403,22 @@ hashlimit_match(const struct sk_buff *skb, dst.src_ip = skb->nh.iph->saddr; if (hinfo->cfg.mode & IPT_HASHLIMIT_HASH_DPT ||hinfo->cfg.mode & IPT_HASHLIMIT_HASH_SPT) { - u_int16_t ports[2]; - if (get_ports(skb, offset, ports)) { + u_int16_t _ports[2], *ports; + + switch (skb->nh.iph->protocol) { + case IPPROTO_TCP: + case IPPROTO_UDP: + case IPPROTO_SCTP: + case IPPROTO_DCCP: + ports = skb_header_pointer(skb, skb->nh.iph->ihl*4, + sizeof(_ports), &_ports); + break; + default: + _ports[0] = _ports[1] = 0; + ports = _ports; + break; + } + if (!ports) { /* We've been asked to examine this packet, and we can't. Hence, no choice but to drop. */ *hotdrop = 1; diff --git a/net/netfilter/xt_multiport.c b/net/netfilter/xt_multiport.c index b56cd2baaac2..1ff0a25396e7 100644 --- a/net/netfilter/xt_multiport.c +++ b/net/netfilter/xt_multiport.c @@ -1,4 +1,4 @@ -/* Kernel module to match one of a list of TCP/UDP ports: ports are in +/* Kernel module to match one of a list of TCP/UDP/SCTP/DCCP ports: ports are in the same place so we can treat them as equal. */ /* (C) 1999-2001 Paul `Rusty' Russell @@ -160,8 +160,9 @@ check(u_int16_t proto, u_int8_t match_flags, u_int8_t count) { - /* Must specify proto == TCP/UDP, no unknown flags or bad count */ - return (proto == IPPROTO_TCP || proto == IPPROTO_UDP) + /* Must specify supported protocol, no unknown flags or bad count */ + return (proto == IPPROTO_TCP || proto == IPPROTO_UDP + || proto == IPPROTO_SCTP || proto == IPPROTO_DCCP) && !(ip_invflags & XT_INV_PROTO) && (match_flags == XT_MULTIPORT_SOURCE || match_flags == XT_MULTIPORT_DESTINATION -- cgit v1.2.3-59-g8ed1b From 62b7743483b402f8fb73545d5d487ca714e82766 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:20:32 -0700 Subject: [NETFILTER]: x_tables: add quota match Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/netfilter/xt_quota.h | 16 +++++++ net/netfilter/Kconfig | 10 ++++ net/netfilter/Makefile | 1 + net/netfilter/xt_quota.c | 96 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 123 insertions(+) create mode 100644 include/linux/netfilter/xt_quota.h create mode 100644 net/netfilter/xt_quota.c (limited to 'net') diff --git a/include/linux/netfilter/xt_quota.h b/include/linux/netfilter/xt_quota.h new file mode 100644 index 000000000000..acd7fd77bbee --- /dev/null +++ b/include/linux/netfilter/xt_quota.h @@ -0,0 +1,16 @@ +#ifndef _XT_QUOTA_H +#define _XT_QUOTA_H + +enum xt_quota_flags { + XT_QUOTA_INVERT = 0x1, +}; +#define XT_QUOTA_MASK 0x1 + +struct xt_quota_info { + u_int32_t flags; + u_int32_t pad; + aligned_u64 quota; + struct xt_quota_info *master; +}; + +#endif /* _XT_QUOTA_H */ diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index e2893effdfaa..5543c7b74535 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -329,6 +329,16 @@ config NETFILTER_XT_MATCH_PKTTYPE To compile it as a module, choose M here. If unsure, say N. +config NETFILTER_XT_MATCH_QUOTA + tristate '"quota" match support' + depends on NETFILTER_XTABLES + help + This option adds a `quota' match, which allows to match on a + byte counter. + + If you want to compile it as a module, say M here and read + . If unsure, say `N'. + config NETFILTER_XT_MATCH_REALM tristate '"realm" match support' depends on NETFILTER_XTABLES diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 95b7e416512d..4b6a6ea07373 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -44,6 +44,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_MARK) += xt_mark.o obj-$(CONFIG_NETFILTER_XT_MATCH_MULTIPORT) += xt_multiport.o obj-$(CONFIG_NETFILTER_XT_MATCH_POLICY) += xt_policy.o obj-$(CONFIG_NETFILTER_XT_MATCH_PKTTYPE) += xt_pkttype.o +obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA) += xt_quota.o obj-$(CONFIG_NETFILTER_XT_MATCH_REALM) += xt_realm.o obj-$(CONFIG_NETFILTER_XT_MATCH_SCTP) += xt_sctp.o obj-$(CONFIG_NETFILTER_XT_MATCH_STATE) += xt_state.o diff --git a/net/netfilter/xt_quota.c b/net/netfilter/xt_quota.c new file mode 100644 index 000000000000..4cdba7469dc4 --- /dev/null +++ b/net/netfilter/xt_quota.c @@ -0,0 +1,96 @@ +/* + * netfilter module to enforce network quotas + * + * Sam Johnston + */ +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Sam Johnston "); + +static DEFINE_SPINLOCK(quota_lock); + +static int +match(const struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + const struct xt_match *match, const void *matchinfo, + int offset, unsigned int protoff, int *hotdrop) +{ + struct xt_quota_info *q = ((struct xt_quota_info *)matchinfo)->master; + int ret = q->flags & XT_QUOTA_INVERT ? 1 : 0; + + spin_lock_bh("a_lock); + if (q->quota >= skb->len) { + q->quota -= skb->len; + ret ^= 1; + } else { + /* we do not allow even small packets from now on */ + q->quota = 0; + } + spin_unlock_bh("a_lock); + + return ret; +} + +static int +checkentry(const char *tablename, const void *entry, + const struct xt_match *match, void *matchinfo, + unsigned int matchsize, unsigned int hook_mask) +{ + struct xt_quota_info *q = (struct xt_quota_info *)matchinfo; + + if (q->flags & ~XT_QUOTA_MASK) + return 0; + /* For SMP, we only want to use one set of counters. */ + q->master = q; + return 1; +} + +static struct xt_match quota_match = { + .name = "quota", + .family = AF_INET, + .match = match, + .matchsize = sizeof(struct xt_quota_info), + .checkentry = checkentry, + .me = THIS_MODULE +}; + +static struct xt_match quota_match6 = { + .name = "quota", + .family = AF_INET6, + .match = match, + .matchsize = sizeof(struct xt_quota_info), + .checkentry = checkentry, + .me = THIS_MODULE +}; + +static int __init xt_quota_init(void) +{ + int ret; + + ret = xt_register_match("a_match); + if (ret) + goto err1; + ret = xt_register_match("a_match6); + if (ret) + goto err2; + return ret; + +err2: + xt_unregister_match("a_match); +err1: + return ret; +} + +static void __exit xt_quota_fini(void) +{ + xt_unregister_match("a_match6); + xt_unregister_match("a_match); +} + +module_init(xt_quota_init); +module_exit(xt_quota_fini); -- cgit v1.2.3-59-g8ed1b From f3389805e53a13bd969ee1c8fc5a4137b7c6c167 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:21:00 -0700 Subject: [NETFILTER]: x_tables: add statistic match Add statistic match which is a combination of the nth and random matches. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/netfilter/xt_statistic.h | 32 ++++++++++ net/netfilter/Kconfig | 6 ++ net/netfilter/Makefile | 1 + net/netfilter/xt_statistic.c | 112 +++++++++++++++++++++++++++++++++ 4 files changed, 151 insertions(+) create mode 100644 include/linux/netfilter/xt_statistic.h create mode 100644 net/netfilter/xt_statistic.c (limited to 'net') diff --git a/include/linux/netfilter/xt_statistic.h b/include/linux/netfilter/xt_statistic.h new file mode 100644 index 000000000000..c344e9916e23 --- /dev/null +++ b/include/linux/netfilter/xt_statistic.h @@ -0,0 +1,32 @@ +#ifndef _XT_STATISTIC_H +#define _XT_STATISTIC_H + +enum xt_statistic_mode { + XT_STATISTIC_MODE_RANDOM, + XT_STATISTIC_MODE_NTH, + __XT_STATISTIC_MODE_MAX +}; +#define XT_STATISTIC_MODE_MAX (__XT_STATISTIC_MODE_MAX - 1) + +enum xt_statistic_flags { + XT_STATISTIC_INVERT = 0x1, +}; +#define XT_STATISTIC_MASK 0x1 + +struct xt_statistic_info { + u_int16_t mode; + u_int16_t flags; + union { + struct { + u_int32_t probability; + } random; + struct { + u_int32_t every; + u_int32_t packet; + u_int32_t count; + } nth; + } u; + struct xt_statistic_info *master __attribute__((aligned(8))); +}; + +#endif /* _XT_STATISTIC_H */ diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 5543c7b74535..85a7e1770252 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -375,6 +375,12 @@ config NETFILTER_XT_MATCH_STATE To compile it as a module, choose M here. If unsure, say N. +config NETFILTER_XT_MATCH_STATISTIC + tristate '"statistic" match support' + depends on NETFILTER_XTABLES + help + statistic module + config NETFILTER_XT_MATCH_STRING tristate '"string" match support' depends on NETFILTER_XTABLES diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 4b6a6ea07373..df1da25f40df 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -48,6 +48,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA) += xt_quota.o obj-$(CONFIG_NETFILTER_XT_MATCH_REALM) += xt_realm.o obj-$(CONFIG_NETFILTER_XT_MATCH_SCTP) += xt_sctp.o obj-$(CONFIG_NETFILTER_XT_MATCH_STATE) += xt_state.o +obj-$(CONFIG_NETFILTER_XT_MATCH_STATISTIC) += xt_statistic.o obj-$(CONFIG_NETFILTER_XT_MATCH_STRING) += xt_string.o obj-$(CONFIG_NETFILTER_XT_MATCH_TCPMSS) += xt_tcpmss.o obj-$(CONFIG_NETFILTER_XT_MATCH_PHYSDEV) += xt_physdev.o diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c new file mode 100644 index 000000000000..de1037f58596 --- /dev/null +++ b/net/netfilter/xt_statistic.c @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2006 Patrick McHardy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * Based on ipt_random and ipt_nth by Fabrice MARIE . + */ + +#include +#include +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Patrick McHardy "); +MODULE_DESCRIPTION("xtables statistical match module"); +MODULE_ALIAS("ipt_statistic"); +MODULE_ALIAS("ip6t_statistic"); + +static DEFINE_SPINLOCK(nth_lock); + +static int +match(const struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + const struct xt_match *match, const void *matchinfo, + int offset, unsigned int protoff, int *hotdrop) +{ + struct xt_statistic_info *info = (struct xt_statistic_info *)matchinfo; + int ret = info->flags & XT_STATISTIC_INVERT ? 1 : 0; + + switch (info->mode) { + case XT_STATISTIC_MODE_RANDOM: + if ((net_random() & 0x7FFFFFFF) < info->u.random.probability) + ret ^= 1; + break; + case XT_STATISTIC_MODE_NTH: + info = info->master; + spin_lock_bh(&nth_lock); + if (info->u.nth.count++ == info->u.nth.every) { + info->u.nth.count = 0; + ret ^= 1; + } + spin_unlock_bh(&nth_lock); + break; + } + + return ret; +} + +static int +checkentry(const char *tablename, const void *entry, + const struct xt_match *match, void *matchinfo, + unsigned int matchsize, unsigned int hook_mask) +{ + struct xt_statistic_info *info = (struct xt_statistic_info *)matchinfo; + + if (info->mode > XT_STATISTIC_MODE_MAX || + info->flags & ~XT_STATISTIC_MASK) + return 0; + info->master = info; + return 1; +} + +static struct xt_match statistic_match = { + .name = "statistic", + .match = match, + .matchsize = sizeof(struct xt_statistic_info), + .checkentry = checkentry, + .family = AF_INET, + .me = THIS_MODULE, +}; + +static struct xt_match statistic_match6 = { + .name = "statistic", + .match = match, + .matchsize = sizeof(struct xt_statistic_info), + .checkentry = checkentry, + .family = AF_INET6, + .me = THIS_MODULE, +}; + +static int __init xt_statistic_init(void) +{ + int ret; + + ret = xt_register_match(&statistic_match); + if (ret) + goto err1; + + ret = xt_register_match(&statistic_match6); + if (ret) + goto err2; + return ret; +err2: + xt_unregister_match(&statistic_match); +err1: + return ret; +} + +static void __exit xt_statistic_fini(void) +{ + xt_unregister_match(&statistic_match6); + xt_unregister_match(&statistic_match); +} + +module_init(xt_statistic_init); +module_exit(xt_statistic_fini); -- cgit v1.2.3-59-g8ed1b From 404bdbfd242cb99ca0e9d3eb5fbb5bcd54123081 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:21:34 -0700 Subject: [NETFILTER]: recent match: replace by rewritten version Replace the unmaintainable ipt_recent match by a rewritten version that should be fully compatible. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ipt_recent.c | 1268 ++++++++++++--------------------------- 1 file changed, 377 insertions(+), 891 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c index b847ee409efb..9686c4d2057d 100644 --- a/net/ipv4/netfilter/ipt_recent.c +++ b/net/ipv4/netfilter/ipt_recent.c @@ -1,1007 +1,493 @@ -/* Kernel module to check if the source address has been seen recently. */ -/* Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org */ -/* Author: Stephen Frost */ -/* Project Page: http://snowman.net/projects/ipt_recent/ */ -/* This software is distributed under the terms of the GPL, Version 2 */ -/* This copyright does not cover user programs that use kernel services - * by normal system calls. */ - -#include -#include +/* + * Copyright (c) 2006 Patrick McHardy + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This is a replacement of the old ipt_recent module, which carried the + * following copyright notice: + * + * Author: Stephen Frost + * Copyright 2002-2003, Stephen Frost, 2.5.x port by laforge@netfilter.org + */ +#include +#include #include -#include -#include -#include +#include +#include #include -#include -#include -#include +#include +#include +#include +#include +#include +#include #include #include -#undef DEBUG -#define HASH_LOG 9 +MODULE_AUTHOR("Patrick McHardy "); +MODULE_DESCRIPTION("IP tables recently seen matching module"); +MODULE_LICENSE("GPL"); -/* Defaults, these can be overridden on the module command-line. */ static unsigned int ip_list_tot = 100; static unsigned int ip_pkt_list_tot = 20; static unsigned int ip_list_hash_size = 0; static unsigned int ip_list_perms = 0644; -#ifdef DEBUG -static int debug = 1; -#endif - -static char version[] = -KERN_INFO RECENT_NAME " " RECENT_VER ": Stephen Frost . http://snowman.net/projects/ipt_recent/\n"; - -MODULE_AUTHOR("Stephen Frost "); -MODULE_DESCRIPTION("IP tables recently seen matching module " RECENT_VER); -MODULE_LICENSE("GPL"); module_param(ip_list_tot, uint, 0400); module_param(ip_pkt_list_tot, uint, 0400); module_param(ip_list_hash_size, uint, 0400); module_param(ip_list_perms, uint, 0400); -#ifdef DEBUG -module_param(debug, bool, 0600); -MODULE_PARM_DESC(debug,"enable debugging output"); -#endif -MODULE_PARM_DESC(ip_list_tot,"number of IPs to remember per list"); -MODULE_PARM_DESC(ip_pkt_list_tot,"number of packets per IP to remember"); -MODULE_PARM_DESC(ip_list_hash_size,"size of hash table used to look up IPs"); -MODULE_PARM_DESC(ip_list_perms,"permissions on /proc/net/ipt_recent/* files"); - -/* Structure of our list of recently seen addresses. */ -struct recent_ip_list { - u_int32_t addr; - u_int8_t ttl; - unsigned long last_seen; - unsigned long *last_pkts; - u_int32_t oldest_pkt; - u_int32_t hash_entry; - u_int32_t time_pos; -}; - -struct time_info_list { - u_int32_t position; - u_int32_t time; +MODULE_PARM_DESC(ip_list_tot, "number of IPs to remember per list"); +MODULE_PARM_DESC(ip_pkt_list_tot, "number of packets per IP to remember (max. 255)"); +MODULE_PARM_DESC(ip_list_hash_size, "size of hash table used to look up IPs"); +MODULE_PARM_DESC(ip_list_perms, "permissions on /proc/net/ipt_recent/* files"); + + +struct recent_entry { + struct list_head list; + struct list_head lru_list; + u_int32_t addr; + u_int8_t ttl; + u_int8_t index; + u_int16_t nstamps; + unsigned long stamps[0]; }; -/* Structure of our linked list of tables of recent lists. */ -struct recent_ip_tables { - char name[IPT_RECENT_NAME_LEN]; - int count; - int time_pos; - struct recent_ip_list *table; - struct recent_ip_tables *next; - spinlock_t list_lock; - int *hash_table; - struct time_info_list *time_info; +struct recent_table { + struct list_head list; + char name[IPT_RECENT_NAME_LEN]; #ifdef CONFIG_PROC_FS - struct proc_dir_entry *status_proc; -#endif /* CONFIG_PROC_FS */ + struct proc_dir_entry *proc; +#endif + unsigned int refcnt; + unsigned int entries; + struct list_head lru_list; + struct list_head iphash[0]; }; -/* Our current list of addresses we have recently seen. - * Only added to on a --set, and only updated on --set || --update - */ -static struct recent_ip_tables *r_tables = NULL; - -/* We protect r_list with this spinlock so two processors are not modifying - * the list at the same time. - */ +static LIST_HEAD(tables); static DEFINE_SPINLOCK(recent_lock); #ifdef CONFIG_PROC_FS -/* Our /proc/net/ipt_recent entry */ -static struct proc_dir_entry *proc_net_ipt_recent = NULL; -#endif - -/* Function declaration for later. */ -static int -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop); - -/* Function to hash a given address into the hash table of table_size size */ -static int hash_func(unsigned int addr, int table_size) -{ - int result = 0; - unsigned int value = addr; - do { result ^= value; } while((value >>= HASH_LOG)); - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": %d = hash_func(%u,%d)\n", - result & (table_size - 1), - addr, - table_size); +static struct proc_dir_entry *proc_dir; +static struct file_operations recent_fops; #endif - return(result & (table_size - 1)); -} +static u_int32_t hash_rnd; +static int hash_rnd_initted; -#ifdef CONFIG_PROC_FS -/* This is the function which produces the output for our /proc output - * interface which lists each IP address, the last seen time and the - * other recent times the address was seen. - */ - -static int ip_recent_get_info(char *buffer, char **start, off_t offset, int length, int *eof, void *data) +static unsigned int recent_entry_hash(u_int32_t addr) { - int len = 0, count, last_len = 0, pkt_count; - off_t pos = 0; - off_t begin = 0; - struct recent_ip_tables *curr_table; - - curr_table = (struct recent_ip_tables*) data; - - spin_lock_bh(&curr_table->list_lock); - for(count = 0; count < ip_list_tot; count++) { - if(!curr_table->table[count].addr) continue; - last_len = len; - len += sprintf(buffer+len,"src=%u.%u.%u.%u ",NIPQUAD(curr_table->table[count].addr)); - len += sprintf(buffer+len,"ttl: %u ",curr_table->table[count].ttl); - len += sprintf(buffer+len,"last_seen: %lu ",curr_table->table[count].last_seen); - len += sprintf(buffer+len,"oldest_pkt: %u ",curr_table->table[count].oldest_pkt); - len += sprintf(buffer+len,"last_pkts: %lu",curr_table->table[count].last_pkts[0]); - for(pkt_count = 1; pkt_count < ip_pkt_list_tot; pkt_count++) { - if(!curr_table->table[count].last_pkts[pkt_count]) break; - len += sprintf(buffer+len,", %lu",curr_table->table[count].last_pkts[pkt_count]); - } - len += sprintf(buffer+len,"\n"); - pos = begin + len; - if(pos < offset) { len = 0; begin = pos; } - if(pos > offset + length) { len = last_len; break; } + if (!hash_rnd_initted) { + get_random_bytes(&hash_rnd, 4); + hash_rnd_initted = 1; } - - *start = buffer + (offset - begin); - len -= (offset - begin); - if(len > length) len = length; - - spin_unlock_bh(&curr_table->list_lock); - return len; + return jhash_1word(addr, hash_rnd) & (ip_list_hash_size - 1); } -/* ip_recent_ctrl provides an interface for users to modify the table - * directly. This allows adding entries, removing entries, and - * flushing the entire table. - * This is done by opening up the appropriate table for writing and - * sending one of: - * xx.xx.xx.xx -- Add entry to table with current time - * +xx.xx.xx.xx -- Add entry to table with current time - * -xx.xx.xx.xx -- Remove entry from table - * clear -- Flush table, remove all entries - */ - -static int ip_recent_ctrl(struct file *file, const char __user *input, unsigned long size, void *data) +static struct recent_entry * +recent_entry_lookup(const struct recent_table *table, u_int32_t addr, u_int8_t ttl) { - static const u_int32_t max[4] = { 0xffffffff, 0xffffff, 0xffff, 0xff }; - u_int32_t val; - int base, used = 0; - char c, *cp; - union iaddr { - uint8_t bytes[4]; - uint32_t word; - } res; - uint8_t *pp = res.bytes; - int digit; - - char buffer[20]; - int len, check_set = 0, count; - u_int32_t addr = 0; - struct sk_buff *skb; - struct ipt_recent_info *info; - struct recent_ip_tables *curr_table; - - curr_table = (struct recent_ip_tables*) data; - - if(size > 20) len = 20; else len = size; - - if(copy_from_user(buffer,input,len)) return -EFAULT; - - if(len < 20) buffer[len] = '\0'; - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl len: %d, input: `%.20s'\n",len,buffer); -#endif + struct recent_entry *e; + unsigned int h; + + h = recent_entry_hash(addr); + list_for_each_entry(e, &table->iphash[h], list) + if (e->addr == addr && (ttl == e->ttl || !ttl || !e->ttl)) + return e; + return NULL; +} - cp = buffer; - while(isspace(*cp)) { cp++; used++; if(used >= len-5) return used; } +static void recent_entry_remove(struct recent_table *t, struct recent_entry *e) +{ + list_del(&e->list); + list_del(&e->lru_list); + kfree(e); + t->entries--; +} - /* Check if we are asked to flush the entire table */ - if(!memcmp(cp,"clear",5)) { - used += 5; - spin_lock_bh(&curr_table->list_lock); - curr_table->time_pos = 0; - for(count = 0; count < ip_list_hash_size; count++) { - curr_table->hash_table[count] = -1; - } - for(count = 0; count < ip_list_tot; count++) { - curr_table->table[count].last_seen = 0; - curr_table->table[count].addr = 0; - curr_table->table[count].ttl = 0; - memset(curr_table->table[count].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long)); - curr_table->table[count].oldest_pkt = 0; - curr_table->table[count].time_pos = 0; - curr_table->time_info[count].position = count; - curr_table->time_info[count].time = 0; - } - spin_unlock_bh(&curr_table->list_lock); - return used; - } +static struct recent_entry * +recent_entry_init(struct recent_table *t, u_int32_t addr, u_int8_t ttl) +{ + struct recent_entry *e; - check_set = IPT_RECENT_SET; - switch(*cp) { - case '+': check_set = IPT_RECENT_SET; cp++; used++; break; - case '-': check_set = IPT_RECENT_REMOVE; cp++; used++; break; - default: if(!isdigit(*cp)) return (used+1); break; + if (t->entries >= ip_list_tot) { + e = list_entry(t->lru_list.next, struct recent_entry, lru_list); + recent_entry_remove(t, e); } + e = kmalloc(sizeof(*e) + sizeof(e->stamps[0]) * ip_pkt_list_tot, + GFP_ATOMIC); + if (e == NULL) + return NULL; + e->addr = addr; + e->ttl = ttl; + e->stamps[0] = jiffies; + e->nstamps = 1; + e->index = 1; + list_add_tail(&e->list, &t->iphash[recent_entry_hash(addr)]); + list_add_tail(&e->lru_list, &t->lru_list); + t->entries++; + return e; +} -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl cp: `%c', check_set: %d\n",*cp,check_set); -#endif - /* Get addr (effectively inet_aton()) */ - /* Shamelessly stolen from libc, a function in the kernel for doing - * this would, of course, be greatly preferred, but our options appear - * to be rather limited, so we will just do it ourselves here. - */ - res.word = 0; - - c = *cp; - for(;;) { - if(!isdigit(c)) return used; - val = 0; base = 10; digit = 0; - if(c == '0') { - c = *++cp; - if(c == 'x' || c == 'X') base = 16, c = *++cp; - else { base = 8; digit = 1; } - } - for(;;) { - if(isascii(c) && isdigit(c)) { - if(base == 8 && (c == '8' || c == '0')) return used; - val = (val * base) + (c - '0'); - c = *++cp; - digit = 1; - } else if(base == 16 && isascii(c) && isxdigit(c)) { - val = (val << 4) | (c + 10 - (islower(c) ? 'a' : 'A')); - c = *++cp; - digit = 1; - } else break; - } - if(c == '.') { - if(pp > res.bytes + 2 || val > 0xff) return used; - *pp++ = val; - c = *++cp; - } else break; - } - used = cp - buffer; - if(c != '\0' && (!isascii(c) || !isspace(c))) return used; - if(c == '\n') used++; - if(!digit) return used; +static void recent_entry_update(struct recent_table *t, struct recent_entry *e) +{ + e->stamps[e->index++] = jiffies; + if (e->index > e->nstamps) + e->nstamps = e->index; + e->index %= ip_pkt_list_tot; + list_move_tail(&e->lru_list, &t->lru_list); +} - if(val > max[pp - res.bytes]) return used; - addr = res.word | htonl(val); +static struct recent_table *recent_table_lookup(const char *name) +{ + struct recent_table *t; - if(!addr && check_set == IPT_RECENT_SET) return used; + list_for_each_entry(t, &tables, list) + if (!strcmp(t->name, name)) + return t; + return NULL; +} -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": ip_recent_ctrl c: %c, addr: %u used: %d\n",c,addr,used); -#endif +static void recent_table_flush(struct recent_table *t) +{ + struct recent_entry *e, *next; + unsigned int i; - /* Set up and just call match */ - info = kmalloc(sizeof(struct ipt_recent_info),GFP_KERNEL); - if(!info) { return -ENOMEM; } - info->seconds = 0; - info->hit_count = 0; - info->check_set = check_set; - info->invert = 0; - info->side = IPT_RECENT_SOURCE; - strncpy(info->name,curr_table->name,IPT_RECENT_NAME_LEN); - info->name[IPT_RECENT_NAME_LEN-1] = '\0'; - - skb = kmalloc(sizeof(struct sk_buff),GFP_KERNEL); - if (!skb) { - used = -ENOMEM; - goto out_free_info; - } - skb->nh.iph = kmalloc(sizeof(struct iphdr),GFP_KERNEL); - if (!skb->nh.iph) { - used = -ENOMEM; - goto out_free_skb; + for (i = 0; i < ip_list_hash_size; i++) { + list_for_each_entry_safe(e, next, &t->iphash[i], list) + recent_entry_remove(t, e); } - - skb->nh.iph->saddr = addr; - skb->nh.iph->daddr = 0; - /* Clear ttl since we have no way of knowing it */ - skb->nh.iph->ttl = 0; - match(skb,NULL,NULL,NULL,info,0,0,NULL); - - kfree(skb->nh.iph); -out_free_skb: - kfree(skb); -out_free_info: - kfree(info); - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": Leaving ip_recent_ctrl addr: %u used: %d\n",addr,used); -#endif - return used; } -#endif /* CONFIG_PROC_FS */ - -/* 'match' is our primary function, called by the kernel whenever a rule is - * hit with our module as an option to it. - * What this function does depends on what was specifically asked of it by - * the user: - * --set -- Add or update last seen time of the source address of the packet - * -- matchinfo->check_set == IPT_RECENT_SET - * --rcheck -- Just check if the source address is in the list - * -- matchinfo->check_set == IPT_RECENT_CHECK - * --update -- If the source address is in the list, update last_seen - * -- matchinfo->check_set == IPT_RECENT_UPDATE - * --remove -- If the source address is in the list, remove it - * -- matchinfo->check_set == IPT_RECENT_REMOVE - * --seconds -- Option to --rcheck/--update, only match if last_seen within seconds - * -- matchinfo->seconds - * --hitcount -- Option to --rcheck/--update, only match if seen hitcount times - * -- matchinfo->hit_count - * --seconds and --hitcount can be combined - */ static int -match(const struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - const struct xt_match *match, - const void *matchinfo, - int offset, - unsigned int protoff, - int *hotdrop) +ipt_recent_match(const struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + const struct xt_match *match, const void *matchinfo, + int offset, unsigned int protoff, int *hotdrop) { - int pkt_count, hits_found, ans; - unsigned long now; const struct ipt_recent_info *info = matchinfo; - u_int32_t addr = 0, time_temp; - u_int8_t ttl = skb->nh.iph->ttl; - int *hash_table; - int orig_hash_result, hash_result, temp, location = 0, time_loc, end_collision_chain = -1; - struct time_info_list *time_info; - struct recent_ip_tables *curr_table; - struct recent_ip_tables *last_table; - struct recent_ip_list *r_list; - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match() called\n"); -#endif - - /* Default is false ^ info->invert */ - ans = info->invert; + struct recent_table *t; + struct recent_entry *e; + u_int32_t addr; + u_int8_t ttl; + int ret = info->invert; -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): name = '%s'\n",info->name); -#endif + if (info->side == IPT_RECENT_DEST) + addr = skb->nh.iph->daddr; + else + addr = skb->nh.iph->saddr; - /* if out != NULL then routing has been done and TTL changed. - * We change it back here internally for match what came in before routing. */ - if(out) ttl++; + ttl = skb->nh.iph->ttl; + /* use TTL as seen before forwarding */ + if (out && !skb->sk) + ttl++; - /* Find the right table */ spin_lock_bh(&recent_lock); - curr_table = r_tables; - while( (last_table = curr_table) && strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (curr_table = curr_table->next) ); - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): table found('%s')\n",info->name); -#endif - - spin_unlock_bh(&recent_lock); - - /* Table with this name not found, match impossible */ - if(!curr_table) { return ans; } - - /* Make sure no one is changing the list while we work with it */ - spin_lock_bh(&curr_table->list_lock); - - r_list = curr_table->table; - if(info->side == IPT_RECENT_DEST) addr = skb->nh.iph->daddr; else addr = skb->nh.iph->saddr; - - if(!addr) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match() address (%u) invalid, leaving.\n",addr); -#endif - spin_unlock_bh(&curr_table->list_lock); - return ans; + t = recent_table_lookup(info->name); + e = recent_entry_lookup(t, addr, + info->check_set & IPT_RECENT_TTL ? ttl : 0); + if (e == NULL) { + if (!(info->check_set & IPT_RECENT_SET)) + goto out; + e = recent_entry_init(t, addr, ttl); + if (e == NULL) + *hotdrop = 1; + ret ^= 1; + goto out; } -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): checking table, addr: %u, ttl: %u, orig_ttl: %u\n",addr,ttl,skb->nh.iph->ttl); -#endif - - /* Get jiffies now in case they changed while we were waiting for a lock */ - now = jiffies; - hash_table = curr_table->hash_table; - time_info = curr_table->time_info; - - orig_hash_result = hash_result = hash_func(addr,ip_list_hash_size); - /* Hash entry at this result used */ - /* Check for TTL match if requested. If TTL is zero then a match would never - * happen, so match regardless of existing TTL in that case. Zero means the - * entry was added via the /proc interface anyway, so we will just use the - * first TTL we get for that IP address. */ - if(info->check_set & IPT_RECENT_TTL) { - while(hash_table[hash_result] != -1 && !(r_list[hash_table[hash_result]].addr == addr && - (!r_list[hash_table[hash_result]].ttl || r_list[hash_table[hash_result]].ttl == ttl))) { - /* Collision in hash table */ - hash_result = (hash_result + 1) % ip_list_hash_size; - } - } else { - while(hash_table[hash_result] != -1 && r_list[hash_table[hash_result]].addr != addr) { - /* Collision in hash table */ - hash_result = (hash_result + 1) % ip_list_hash_size; - } - } - - if(hash_table[hash_result] == -1 && !(info->check_set & IPT_RECENT_SET)) { - /* IP not in list and not asked to SET */ - spin_unlock_bh(&curr_table->list_lock); - return ans; - } - - /* Check if we need to handle the collision, do not need to on REMOVE */ - if(orig_hash_result != hash_result && !(info->check_set & IPT_RECENT_REMOVE)) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision in hash table. (or: %d,hr: %d,oa: %u,ha: %u)\n", - orig_hash_result, - hash_result, - r_list[hash_table[orig_hash_result]].addr, - addr); -#endif - - /* We had a collision. - * orig_hash_result is where we started, hash_result is where we ended up. - * So, swap them because we are likely to see the same guy again sooner */ -#ifdef DEBUG - if(debug) { - printk(KERN_INFO RECENT_NAME ": match(): Collision; hash_table[orig_hash_result] = %d\n",hash_table[orig_hash_result]); - printk(KERN_INFO RECENT_NAME ": match(): Collision; r_list[hash_table[orig_hash_result]].hash_entry = %d\n", - r_list[hash_table[orig_hash_result]].hash_entry); - } -#endif - - r_list[hash_table[orig_hash_result]].hash_entry = hash_result; - - - temp = hash_table[orig_hash_result]; -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision; hash_table[hash_result] = %d\n",hash_table[hash_result]); -#endif - hash_table[orig_hash_result] = hash_table[hash_result]; - hash_table[hash_result] = temp; - temp = hash_result; - hash_result = orig_hash_result; - orig_hash_result = temp; - time_info[r_list[hash_table[orig_hash_result]].time_pos].position = hash_table[orig_hash_result]; - if(hash_table[hash_result] != -1) { - r_list[hash_table[hash_result]].hash_entry = hash_result; - time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result]; - } - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): Collision handled.\n"); -#endif - } - - if(hash_table[hash_result] == -1) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): New table entry. (hr: %d,ha: %u)\n", - hash_result, addr); -#endif - - /* New item found and IPT_RECENT_SET, so we need to add it */ - location = time_info[curr_table->time_pos].position; - hash_table[r_list[location].hash_entry] = -1; - hash_table[hash_result] = location; - memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long)); - r_list[location].time_pos = curr_table->time_pos; - r_list[location].addr = addr; - r_list[location].ttl = ttl; - r_list[location].last_seen = now; - r_list[location].oldest_pkt = 1; - r_list[location].last_pkts[0] = now; - r_list[location].hash_entry = hash_result; - time_info[curr_table->time_pos].time = r_list[location].last_seen; - curr_table->time_pos = (curr_table->time_pos + 1) % ip_list_tot; - - ans = !info->invert; - } else { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): Existing table entry. (hr: %d,ha: %u)\n", - hash_result, - addr); -#endif - - /* Existing item found */ - location = hash_table[hash_result]; - /* We have a match on address, now to make sure it meets all requirements for a - * full match. */ - if(info->check_set & IPT_RECENT_CHECK || info->check_set & IPT_RECENT_UPDATE) { - if(!info->seconds && !info->hit_count) ans = !info->invert; else ans = info->invert; - if(info->seconds && !info->hit_count) { - if(time_before_eq(now,r_list[location].last_seen+info->seconds*HZ)) ans = !info->invert; else ans = info->invert; - } - if(info->seconds && info->hit_count) { - for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) { - if(r_list[location].last_pkts[pkt_count] == 0) break; - if(time_before_eq(now,r_list[location].last_pkts[pkt_count]+info->seconds*HZ)) hits_found++; - } - if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert; - } - if(info->hit_count && !info->seconds) { - for(pkt_count = 0, hits_found = 0; pkt_count < ip_pkt_list_tot; pkt_count++) { - if(r_list[location].last_pkts[pkt_count] == 0) break; - hits_found++; - } - if(hits_found >= info->hit_count) ans = !info->invert; else ans = info->invert; - } - } -#ifdef DEBUG - if(debug) { - if(ans) - printk(KERN_INFO RECENT_NAME ": match(): match addr: %u\n",addr); - else - printk(KERN_INFO RECENT_NAME ": match(): no match addr: %u\n",addr); - } -#endif - - /* If and only if we have been asked to SET, or to UPDATE (on match) do we add the - * current timestamp to the last_seen. */ - if((info->check_set & IPT_RECENT_SET && (ans = !info->invert)) || (info->check_set & IPT_RECENT_UPDATE && ans)) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): SET or UPDATE; updating time info.\n"); -#endif - /* Have to update our time info */ - time_loc = r_list[location].time_pos; - time_info[time_loc].time = now; - time_info[time_loc].position = location; - while((time_info[(time_loc+1) % ip_list_tot].time < time_info[time_loc].time) && ((time_loc+1) % ip_list_tot) != curr_table->time_pos) { - time_temp = time_info[time_loc].time; - time_info[time_loc].time = time_info[(time_loc+1)%ip_list_tot].time; - time_info[(time_loc+1)%ip_list_tot].time = time_temp; - time_temp = time_info[time_loc].position; - time_info[time_loc].position = time_info[(time_loc+1)%ip_list_tot].position; - time_info[(time_loc+1)%ip_list_tot].position = time_temp; - r_list[time_info[time_loc].position].time_pos = time_loc; - r_list[time_info[(time_loc+1)%ip_list_tot].position].time_pos = (time_loc+1)%ip_list_tot; - time_loc = (time_loc+1) % ip_list_tot; - } - r_list[location].time_pos = time_loc; - r_list[location].ttl = ttl; - r_list[location].last_pkts[r_list[location].oldest_pkt] = now; - r_list[location].oldest_pkt = ++r_list[location].oldest_pkt % ip_pkt_list_tot; - r_list[location].last_seen = now; - } - /* If we have been asked to remove the entry from the list, just set it to 0 */ - if(info->check_set & IPT_RECENT_REMOVE) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; clearing entry (or: %d, hr: %d).\n",orig_hash_result,hash_result); -#endif - /* Check if this is part of a collision chain */ - while(hash_table[(orig_hash_result+1) % ip_list_hash_size] != -1) { - orig_hash_result++; - if(hash_func(r_list[hash_table[orig_hash_result]].addr,ip_list_hash_size) == hash_result) { - /* Found collision chain, how deep does this rabbit hole go? */ -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; found collision chain.\n"); -#endif - end_collision_chain = orig_hash_result; - } - } - if(end_collision_chain != -1) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match(): REMOVE; part of collision chain, moving to end.\n"); -#endif - /* Part of a collision chain, swap it with the end of the chain - * before removing. */ - r_list[hash_table[end_collision_chain]].hash_entry = hash_result; - temp = hash_table[end_collision_chain]; - hash_table[end_collision_chain] = hash_table[hash_result]; - hash_table[hash_result] = temp; - time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result]; - hash_result = end_collision_chain; - r_list[hash_table[hash_result]].hash_entry = hash_result; - time_info[r_list[hash_table[hash_result]].time_pos].position = hash_table[hash_result]; - } - location = hash_table[hash_result]; - hash_table[r_list[location].hash_entry] = -1; - time_loc = r_list[location].time_pos; - time_info[time_loc].time = 0; - time_info[time_loc].position = location; - while((time_info[(time_loc+1) % ip_list_tot].time < time_info[time_loc].time) && ((time_loc+1) % ip_list_tot) != curr_table->time_pos) { - time_temp = time_info[time_loc].time; - time_info[time_loc].time = time_info[(time_loc+1)%ip_list_tot].time; - time_info[(time_loc+1)%ip_list_tot].time = time_temp; - time_temp = time_info[time_loc].position; - time_info[time_loc].position = time_info[(time_loc+1)%ip_list_tot].position; - time_info[(time_loc+1)%ip_list_tot].position = time_temp; - r_list[time_info[time_loc].position].time_pos = time_loc; - r_list[time_info[(time_loc+1)%ip_list_tot].position].time_pos = (time_loc+1)%ip_list_tot; - time_loc = (time_loc+1) % ip_list_tot; + if (info->check_set & IPT_RECENT_SET) + ret ^= 1; + else if (info->check_set & IPT_RECENT_REMOVE) { + recent_entry_remove(t, e); + ret ^= 1; + } else if (info->check_set & (IPT_RECENT_CHECK | IPT_RECENT_UPDATE)) { + unsigned long t = jiffies - info->seconds * HZ; + unsigned int i, hits = 0; + + for (i = 0; i < e->nstamps; i++) { + if (info->seconds && time_after(t, e->stamps[i])) + continue; + if (++hits >= info->hit_count) { + ret ^= 1; + break; } - r_list[location].time_pos = time_loc; - r_list[location].last_seen = 0; - r_list[location].addr = 0; - r_list[location].ttl = 0; - memset(r_list[location].last_pkts,0,ip_pkt_list_tot*sizeof(unsigned long)); - r_list[location].oldest_pkt = 0; - ans = !info->invert; } - spin_unlock_bh(&curr_table->list_lock); - return ans; } - spin_unlock_bh(&curr_table->list_lock); -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": match() left.\n"); -#endif - return ans; + if (info->check_set & IPT_RECENT_SET || + (info->check_set & IPT_RECENT_UPDATE && ret)) { + recent_entry_update(t, e); + e->ttl = ttl; + } +out: + spin_unlock_bh(&recent_lock); + return ret; } -/* This function is to verify that the rule given during the userspace iptables - * command is correct. - * If the command is valid then we check if the table name referred to by the - * rule exists, if not it is created. - */ static int -checkentry(const char *tablename, - const void *ip, - const struct xt_match *match, - void *matchinfo, - unsigned int matchsize, - unsigned int hook_mask) +ipt_recent_checkentry(const char *tablename, const void *ip, + const struct xt_match *match, void *matchinfo, + unsigned int matchsize, unsigned int hook_mask) { - int flag = 0, c; - unsigned long *hold; const struct ipt_recent_info *info = matchinfo; - struct recent_ip_tables *curr_table, *find_table, *last_table; - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() entered.\n"); -#endif - - /* seconds and hit_count only valid for CHECK/UPDATE */ - if(info->check_set & IPT_RECENT_SET) { flag++; if(info->seconds || info->hit_count) return 0; } - if(info->check_set & IPT_RECENT_REMOVE) { flag++; if(info->seconds || info->hit_count) return 0; } - if(info->check_set & IPT_RECENT_CHECK) flag++; - if(info->check_set & IPT_RECENT_UPDATE) flag++; - - /* One and only one of these should ever be set */ - if(flag != 1) return 0; + struct recent_table *t; + unsigned i; + int ret = 0; - /* Name must be set to something */ - if(!info->name || !info->name[0]) return 0; + if (hweight8(info->check_set & + (IPT_RECENT_SET | IPT_RECENT_REMOVE | + IPT_RECENT_CHECK | IPT_RECENT_UPDATE)) != 1) + return 0; + if ((info->check_set & (IPT_RECENT_SET | IPT_RECENT_REMOVE)) && + (info->seconds || info->hit_count)) + return 0; + if (info->name[0] == '\0' || + strnlen(info->name, IPT_RECENT_NAME_LEN) == IPT_RECENT_NAME_LEN) + return 0; - /* Things look good, create a list for this if it does not exist */ - /* Lock the linked list while we play with it */ spin_lock_bh(&recent_lock); - - /* Look for an entry with this name already created */ - /* Finds the end of the list and the entry before the end if current name does not exist */ - find_table = r_tables; - while( (last_table = find_table) && strncmp(info->name,find_table->name,IPT_RECENT_NAME_LEN) && (find_table = find_table->next) ); - - /* If a table already exists just increment the count on that table and return */ - if(find_table) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: table found (%s), incrementing count.\n",info->name); -#endif - find_table->count++; - spin_unlock_bh(&recent_lock); - return 1; + t = recent_table_lookup(info->name); + if (t != NULL) { + t->refcnt++; + ret = 1; + goto out; } - spin_unlock_bh(&recent_lock); - - /* Table with this name not found */ - /* Allocate memory for new linked list item */ - -#ifdef DEBUG - if(debug) { - printk(KERN_INFO RECENT_NAME ": checkentry: no table found (%s)\n",info->name); - printk(KERN_INFO RECENT_NAME ": checkentry: Allocationg %d for link-list entry.\n",sizeof(struct recent_ip_tables)); + t = kzalloc(sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size, + GFP_ATOMIC); + if (t == NULL) + goto out; + strcpy(t->name, info->name); + INIT_LIST_HEAD(&t->lru_list); + for (i = 0; i < ip_list_hash_size; i++) + INIT_LIST_HEAD(&t->iphash[i]); +#ifdef CONFIG_PROC_FS + t->proc = create_proc_entry(t->name, ip_list_perms, proc_dir); + if (t->proc == NULL) { + kfree(t); + goto out; } + t->proc->proc_fops = &recent_fops; + t->proc->data = t; #endif + list_add_tail(&t->list, &tables); + ret = 1; +out: + spin_unlock_bh(&recent_lock); + return ret; +} - curr_table = vmalloc(sizeof(struct recent_ip_tables)); - if(curr_table == NULL) return 0; - - spin_lock_init(&curr_table->list_lock); - curr_table->next = NULL; - curr_table->count = 1; - curr_table->time_pos = 0; - strncpy(curr_table->name,info->name,IPT_RECENT_NAME_LEN); - curr_table->name[IPT_RECENT_NAME_LEN-1] = '\0'; - - /* Allocate memory for this table and the list of packets in each entry. */ -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for table (%s).\n", - sizeof(struct recent_ip_list)*ip_list_tot, - info->name); -#endif - - curr_table->table = vmalloc(sizeof(struct recent_ip_list)*ip_list_tot); - if(curr_table->table == NULL) { vfree(curr_table); return 0; } - memset(curr_table->table,0,sizeof(struct recent_ip_list)*ip_list_tot); -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for pkt_list.\n", - sizeof(unsigned long)*ip_pkt_list_tot*ip_list_tot); -#endif - - hold = vmalloc(sizeof(unsigned long)*ip_pkt_list_tot*ip_list_tot); -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: After pkt_list allocation.\n"); -#endif - if(hold == NULL) { - printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for pkt_list.\n"); - vfree(curr_table->table); - vfree(curr_table); - return 0; - } - for(c = 0; c < ip_list_tot; c++) { - curr_table->table[c].last_pkts = hold + c*ip_pkt_list_tot; - } +static void +ipt_recent_destroy(const struct xt_match *match, void *matchinfo, + unsigned int matchsize) +{ + const struct ipt_recent_info *info = matchinfo; + struct recent_table *t; - /* Allocate memory for the hash table */ -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for hash_table.\n", - sizeof(int)*ip_list_hash_size); + spin_lock_bh(&recent_lock); + t = recent_table_lookup(info->name); + if (--t->refcnt == 0) { + list_del(&t->list); + recent_table_flush(t); +#ifdef CONFIG_PROC_FS + remove_proc_entry(t->name, proc_dir); #endif - - curr_table->hash_table = vmalloc(sizeof(int)*ip_list_hash_size); - if(!curr_table->hash_table) { - printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for hash_table.\n"); - vfree(hold); - vfree(curr_table->table); - vfree(curr_table); - return 0; - } - - for(c = 0; c < ip_list_hash_size; c++) { - curr_table->hash_table[c] = -1; + kfree(t); } + spin_unlock_bh(&recent_lock); +} - /* Allocate memory for the time info */ -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: Allocating %d for time_info.\n", - sizeof(struct time_info_list)*ip_list_tot); -#endif +#ifdef CONFIG_PROC_FS +struct recent_iter_state { + struct recent_table *table; + unsigned int bucket; +}; - curr_table->time_info = vmalloc(sizeof(struct time_info_list)*ip_list_tot); - if(!curr_table->time_info) { - printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for time_info.\n"); - vfree(curr_table->hash_table); - vfree(hold); - vfree(curr_table->table); - vfree(curr_table); - return 0; - } - for(c = 0; c < ip_list_tot; c++) { - curr_table->time_info[c].position = c; - curr_table->time_info[c].time = 0; - } +static void *recent_seq_start(struct seq_file *seq, loff_t *pos) +{ + struct recent_iter_state *st = seq->private; + struct recent_table *t = st->table; + struct recent_entry *e; + loff_t p = *pos; - /* Put the new table in place */ spin_lock_bh(&recent_lock); - find_table = r_tables; - while( (last_table = find_table) && strncmp(info->name,find_table->name,IPT_RECENT_NAME_LEN) && (find_table = find_table->next) ); - - /* If a table already exists just increment the count on that table and return */ - if(find_table) { - find_table->count++; - spin_unlock_bh(&recent_lock); -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry: table found (%s), created by other process.\n",info->name); -#endif - vfree(curr_table->time_info); - vfree(curr_table->hash_table); - vfree(hold); - vfree(curr_table->table); - vfree(curr_table); - return 1; - } - if(!last_table) r_tables = curr_table; else last_table->next = curr_table; - spin_unlock_bh(&recent_lock); - -#ifdef CONFIG_PROC_FS - /* Create our proc 'status' entry. */ - curr_table->status_proc = create_proc_entry(curr_table->name, ip_list_perms, proc_net_ipt_recent); - if (!curr_table->status_proc) { - vfree(hold); - printk(KERN_INFO RECENT_NAME ": checkentry: unable to allocate for /proc entry.\n"); - /* Destroy the created table */ - spin_lock_bh(&recent_lock); - last_table = NULL; - curr_table = r_tables; - if(!curr_table) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() create_proc failed, no tables.\n"); -#endif - spin_unlock_bh(&recent_lock); - return 0; - } - while( strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (last_table = curr_table) && (curr_table = curr_table->next) ); - if(!curr_table) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() create_proc failed, table already destroyed.\n"); -#endif - spin_unlock_bh(&recent_lock); - return 0; + for (st->bucket = 0; st->bucket < ip_list_hash_size; st->bucket++) { + list_for_each_entry(e, &t->iphash[st->bucket], list) { + if (p-- == 0) + return e; } - if(last_table) last_table->next = curr_table->next; else r_tables = curr_table->next; - spin_unlock_bh(&recent_lock); - vfree(curr_table->time_info); - vfree(curr_table->hash_table); - vfree(curr_table->table); - vfree(curr_table); - return 0; } - - curr_table->status_proc->owner = THIS_MODULE; - curr_table->status_proc->data = curr_table; - wmb(); - curr_table->status_proc->read_proc = ip_recent_get_info; - curr_table->status_proc->write_proc = ip_recent_ctrl; -#endif /* CONFIG_PROC_FS */ - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": checkentry() left.\n"); -#endif + return NULL; +} - return 1; +static void *recent_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct recent_iter_state *st = seq->private; + struct recent_table *t = st->table; + struct recent_entry *e = v; + struct list_head *head = e->list.next; + + while (head == &t->iphash[st->bucket]) { + if (++st->bucket >= ip_list_hash_size) + return NULL; + head = t->iphash[st->bucket].next; + } + (*pos)++; + return list_entry(head, struct recent_entry, list); } -/* This function is called in the event that a rule matching this module is - * removed. - * When this happens we need to check if there are no other rules matching - * the table given. If that is the case then we remove the table and clean - * up its memory. - */ -static void -destroy(const struct xt_match *match, void *matchinfo, unsigned int matchsize) +static void recent_seq_stop(struct seq_file *s, void *v) { - const struct ipt_recent_info *info = matchinfo; - struct recent_ip_tables *curr_table, *last_table; + spin_unlock_bh(&recent_lock); +} -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": destroy() entered.\n"); -#endif +static int recent_seq_show(struct seq_file *seq, void *v) +{ + struct recent_entry *e = v; + unsigned int i; + + i = (e->index - 1) % ip_pkt_list_tot; + seq_printf(seq, "src=%u.%u.%u.%u ttl: %u last_seen: %lu oldest_pkt: %u", + NIPQUAD(e->addr), e->ttl, e->stamps[i], e->index); + for (i = 0; i < e->nstamps; i++) + seq_printf(seq, "%s %lu", i ? "," : "", e->stamps[i]); + seq_printf(seq, "\n"); + return 0; +} - if(matchsize != IPT_ALIGN(sizeof(struct ipt_recent_info))) return; +static struct seq_operations recent_seq_ops = { + .start = recent_seq_start, + .next = recent_seq_next, + .stop = recent_seq_stop, + .show = recent_seq_show, +}; - /* Lock the linked list while we play with it */ - spin_lock_bh(&recent_lock); +static int recent_seq_open(struct inode *inode, struct file *file) +{ + struct proc_dir_entry *pde = PDE(inode); + struct seq_file *seq; + struct recent_iter_state *st; + int ret; + + st = kzalloc(sizeof(*st), GFP_KERNEL); + if (st == NULL) + return -ENOMEM; + ret = seq_open(file, &recent_seq_ops); + if (ret) + kfree(st); + st->table = pde->data; + seq = file->private_data; + seq->private = st; + return ret; +} - /* Look for an entry with this name already created */ - /* Finds the end of the list and the entry before the end if current name does not exist */ - last_table = NULL; - curr_table = r_tables; - if(!curr_table) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": destroy() No tables found, leaving.\n"); -#endif +static ssize_t recent_proc_write(struct file *file, const char __user *input, + size_t size, loff_t *loff) +{ + struct proc_dir_entry *pde = PDE(file->f_dentry->d_inode); + struct recent_table *t = pde->data; + struct recent_entry *e; + char buf[sizeof("+255.255.255.255")], *c = buf; + u_int32_t addr; + int add; + + if (size > sizeof(buf)) + size = sizeof(buf); + if (copy_from_user(buf, input, size)) + return -EFAULT; + while (isspace(*c)) + c++; + + if (size - (c - buf) < 5) + return c - buf; + if (!strncmp(c, "clear", 5)) { + c += 5; + spin_lock_bh(&recent_lock); + recent_table_flush(t); spin_unlock_bh(&recent_lock); - return; + return c - buf; } - while( strncmp(info->name,curr_table->name,IPT_RECENT_NAME_LEN) && (last_table = curr_table) && (curr_table = curr_table->next) ); - /* If a table does not exist then do nothing and return */ - if(!curr_table) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table not found, leaving.\n"); -#endif - spin_unlock_bh(&recent_lock); - return; + switch (*c) { + case '-': + add = 0; + c++; + break; + case '+': + c++; + default: + add = 1; + break; } + addr = in_aton(c); - curr_table->count--; - - /* If count is still non-zero then there are still rules referenceing it so we do nothing */ - if(curr_table->count) { -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table found, non-zero count, leaving.\n"); -#endif - spin_unlock_bh(&recent_lock); - return; + spin_lock_bh(&recent_lock); + e = recent_entry_lookup(t, addr, 0); + if (e == NULL) { + if (add) + recent_entry_init(t, addr, 0); + } else { + if (add) + recent_entry_update(t, e); + else + recent_entry_remove(t, e); } - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": destroy() table found, zero count, removing.\n"); -#endif - - /* Count must be zero so we remove this table from the list */ - if(last_table) last_table->next = curr_table->next; else r_tables = curr_table->next; - spin_unlock_bh(&recent_lock); + return size; +} - /* lock to make sure any late-runners still using this after we removed it from - * the list finish up then remove everything */ - spin_lock_bh(&curr_table->list_lock); - spin_unlock_bh(&curr_table->list_lock); - -#ifdef CONFIG_PROC_FS - if(curr_table->status_proc) remove_proc_entry(curr_table->name,proc_net_ipt_recent); +static struct file_operations recent_fops = { + .open = recent_seq_open, + .read = seq_read, + .write = recent_proc_write, + .release = seq_release_private, + .owner = THIS_MODULE, +}; #endif /* CONFIG_PROC_FS */ - vfree(curr_table->table[0].last_pkts); - vfree(curr_table->table); - vfree(curr_table->hash_table); - vfree(curr_table->time_info); - vfree(curr_table); - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": destroy() left.\n"); -#endif - return; -} - -/* This is the structure we pass to ipt_register to register our - * module with iptables. - */ static struct ipt_match recent_match = { .name = "recent", - .match = match, + .match = ipt_recent_match, .matchsize = sizeof(struct ipt_recent_info), - .checkentry = checkentry, - .destroy = destroy, - .me = THIS_MODULE + .checkentry = ipt_recent_checkentry, + .destroy = ipt_recent_destroy, + .me = THIS_MODULE, }; -/* Kernel module initialization. */ static int __init ipt_recent_init(void) { - int err, count; + int err; - printk(version); -#ifdef CONFIG_PROC_FS - proc_net_ipt_recent = proc_mkdir("ipt_recent",proc_net); - if(!proc_net_ipt_recent) return -ENOMEM; -#endif - - if(ip_list_hash_size && ip_list_hash_size <= ip_list_tot) { - printk(KERN_WARNING RECENT_NAME ": ip_list_hash_size too small, resetting to default.\n"); - ip_list_hash_size = 0; - } - - if(!ip_list_hash_size) { - ip_list_hash_size = ip_list_tot*3; - count = 2*2; - while(ip_list_hash_size > count) count = count*2; - ip_list_hash_size = count; - } - -#ifdef DEBUG - if(debug) printk(KERN_INFO RECENT_NAME ": ip_list_hash_size: %d\n",ip_list_hash_size); -#endif + if (!ip_list_tot || !ip_pkt_list_tot || ip_pkt_list_tot > 255) + return -EINVAL; + ip_list_hash_size = 1 << fls(ip_list_tot); err = ipt_register_match(&recent_match); +#ifdef CONFIG_PROC_FS if (err) - remove_proc_entry("ipt_recent", proc_net); + return err; + proc_dir = proc_mkdir("ipt_recent", proc_net); + if (proc_dir == NULL) { + ipt_unregister_match(&recent_match); + err = -ENOMEM; + } +#endif return err; } -/* Kernel module destruction. */ -static void __exit ipt_recent_fini(void) +static void __exit ipt_recent_exit(void) { + BUG_ON(!list_empty(&tables)); ipt_unregister_match(&recent_match); - - remove_proc_entry("ipt_recent",proc_net); +#ifdef CONFIG_PROC_FS + remove_proc_entry("ipt_recent", proc_net); +#endif } -/* Register our module with the kernel. */ module_init(ipt_recent_init); -module_exit(ipt_recent_fini); +module_exit(ipt_recent_exit); -- cgit v1.2.3-59-g8ed1b From 6442f1cf897643d4ca597f2f7d3464b765bae960 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:21:53 -0700 Subject: [NETFILTER]: conntrack: don't call helpers for related ICMP messages None of the existing helpers expects to get called for related ICMP packets and some even drop them if they can't parse them. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_standalone.c | 2 +- net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c | 2 +- net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index 929d61f7be91..f0cc7feb0da3 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -417,7 +417,7 @@ static unsigned int ip_conntrack_help(unsigned int hooknum, /* This is where we call the helper: as the packet goes out. */ ct = ip_conntrack_get(*pskb, &ctinfo); - if (ct && ct->helper) { + if (ct && ct->helper && ctinfo != IP_CT_RELATED + IP_CT_IS_REPLY) { unsigned int ret; ret = ct->helper->help(pskb, ct, ctinfo); if (ret != NF_ACCEPT) diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c index 77d974443c7b..8cc8e1b36778 100644 --- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c +++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c @@ -145,7 +145,7 @@ static unsigned int ipv4_conntrack_help(unsigned int hooknum, /* This is where we call the helper: as the packet goes out. */ ct = nf_ct_get(*pskb, &ctinfo); - if (!ct) + if (!ct || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY) return NF_ACCEPT; help = nfct_help(ct); diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index 93bae36f2663..2a71c3b669f1 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -189,7 +189,7 @@ static unsigned int ipv6_confirm(unsigned int hooknum, /* This is where we call the helper: as the packet goes out. */ ct = nf_ct_get(*pskb, &ctinfo); - if (!ct) + if (!ct || ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY) goto out; help = nfct_help(ct); -- cgit v1.2.3-59-g8ed1b From 39a27a35c5c1b5be499a0576a35c45a011788bf8 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:23:54 -0700 Subject: [NETFILTER]: conntrack: add sysctl to disable checksumming Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/netfilter_ipv4/ip_conntrack.h | 1 + include/linux/sysctl.h | 2 ++ include/net/netfilter/nf_conntrack.h | 1 + net/ipv4/netfilter/ip_conntrack_proto_icmp.c | 2 +- net/ipv4/netfilter/ip_conntrack_proto_tcp.c | 2 +- net/ipv4/netfilter/ip_conntrack_proto_udp.c | 2 +- net/ipv4/netfilter/ip_conntrack_standalone.c | 11 +++++++++++ net/ipv4/netfilter/nf_conntrack_proto_icmp.c | 2 +- net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c | 2 +- net/netfilter/nf_conntrack_proto_tcp.c | 5 +++-- net/netfilter/nf_conntrack_proto_udp.c | 3 ++- net/netfilter/nf_conntrack_standalone.c | 11 +++++++++++ 12 files changed, 36 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h index d54d7b278e96..5473c01f69ea 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack.h +++ b/include/linux/netfilter_ipv4/ip_conntrack.h @@ -293,6 +293,7 @@ static inline int is_dying(struct ip_conntrack *ct) } extern unsigned int ip_conntrack_htable_size; +extern int ip_conntrack_checksum; #define CONNTRACK_STAT_INC(count) (__get_cpu_var(ip_conntrack_stat).count++) diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index cd9e7c0825ad..98338ed2c0b6 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -313,6 +313,7 @@ enum NET_NF_CONNTRACK_FRAG6_TIMEOUT=29, NET_NF_CONNTRACK_FRAG6_LOW_THRESH=30, NET_NF_CONNTRACK_FRAG6_HIGH_THRESH=31, + NET_NF_CONNTRACK_CHECKSUM=32, }; /* /proc/sys/net/ipv4 */ @@ -492,6 +493,7 @@ enum NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_RECD=25, NET_IPV4_NF_CONNTRACK_SCTP_TIMEOUT_SHUTDOWN_ACK_SENT=26, NET_IPV4_NF_CONNTRACK_COUNT=27, + NET_IPV4_NF_CONNTRACK_CHECKSUM=28, }; /* /proc/sys/net/ipv6 */ diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 916013ca4a5c..dbe7a114d0c5 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -285,6 +285,7 @@ static inline int nf_ct_is_dying(struct nf_conn *ct) } extern unsigned int nf_conntrack_htable_size; +extern int nf_conntrack_checksum; #define NF_CT_STAT_INC(count) (__get_cpu_var(nf_conntrack_stat).count++) diff --git a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c index d8b14a9010a6..23f1c504586d 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_icmp.c @@ -224,7 +224,7 @@ icmp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, } /* See ip_conntrack_proto_tcp.c */ - if (hooknum == NF_IP_PRE_ROUTING && + if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING && nf_ip_checksum(skb, hooknum, skb->nh.iph->ihl * 4, 0)) { if (LOG_INVALID(IPPROTO_ICMP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, diff --git a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c index 062b252b58ad..c5c2ce5cdeb8 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_tcp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_tcp.c @@ -870,7 +870,7 @@ static int tcp_error(struct sk_buff *skb, * and moreover root might send raw packets. */ /* FIXME: Source route IP option packets --RR */ - if (hooknum == NF_IP_PRE_ROUTING && + if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING && nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_TCP)) { if (LOG_INVALID(IPPROTO_TCP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c index 70899868783b..9b2c16b4d2ff 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c @@ -120,7 +120,7 @@ static int udp_error(struct sk_buff *skb, enum ip_conntrack_info *ctinfo, * because the semantic of CHECKSUM_HW is different there * and moreover root might send raw packets. * FIXME: Source route IP option packets --RR */ - if (hooknum == NF_IP_PRE_ROUTING && + if (ip_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING && nf_ip_checksum(skb, hooknum, iph->ihl * 4, IPPROTO_UDP)) { if (LOG_INVALID(IPPROTO_UDP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index f0cc7feb0da3..6cb9b989d14c 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -564,6 +564,8 @@ extern unsigned int ip_ct_generic_timeout; static int log_invalid_proto_min = 0; static int log_invalid_proto_max = 255; +int ip_conntrack_checksum = 1; + static struct ctl_table_header *ip_ct_sysctl_header; static ctl_table ip_ct_sysctl_table[] = { @@ -591,6 +593,14 @@ static ctl_table ip_ct_sysctl_table[] = { .mode = 0444, .proc_handler = &proc_dointvec, }, + { + .ctl_name = NET_IPV4_NF_CONNTRACK_CHECKSUM, + .procname = "ip_conntrack_checksum", + .data = &ip_conntrack_checksum, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, .procname = "ip_conntrack_tcp_timeout_syn_sent", @@ -946,6 +956,7 @@ EXPORT_SYMBOL_GPL(__ip_conntrack_helper_find_byname); EXPORT_SYMBOL_GPL(ip_conntrack_proto_find_get); EXPORT_SYMBOL_GPL(ip_conntrack_proto_put); EXPORT_SYMBOL_GPL(__ip_conntrack_proto_find); +EXPORT_SYMBOL_GPL(ip_conntrack_checksum); #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \ defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE) EXPORT_SYMBOL_GPL(ip_ct_port_tuple_to_nfattr); diff --git a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c index 4b0d361cc6e6..663a73ee3f2f 100644 --- a/net/ipv4/netfilter/nf_conntrack_proto_icmp.c +++ b/net/ipv4/netfilter/nf_conntrack_proto_icmp.c @@ -235,7 +235,7 @@ icmp_error(struct sk_buff *skb, unsigned int dataoff, } /* See ip_conntrack_proto_tcp.c */ - if (hooknum == NF_IP_PRE_ROUTING && + if (nf_conntrack_checksum && hooknum == NF_IP_PRE_ROUTING && nf_ip_checksum(skb, hooknum, dataoff, 0)) { if (LOG_INVALID(IPPROTO_ICMP)) nf_log_packet(PF_INET, 0, skb, NULL, NULL, NULL, diff --git a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c index 86c6703265d0..ef18a7b7014b 100644 --- a/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c +++ b/net/ipv6/netfilter/nf_conntrack_proto_icmpv6.c @@ -233,7 +233,7 @@ icmpv6_error(struct sk_buff *skb, unsigned int dataoff, return -NF_ACCEPT; } - if (hooknum == NF_IP6_PRE_ROUTING && + if (nf_conntrack_checksum && hooknum == NF_IP6_PRE_ROUTING && nf_ip6_checksum(skb, hooknum, dataoff, IPPROTO_ICMPV6)) { nf_log_packet(PF_INET6, 0, skb, NULL, NULL, NULL, "nf_ct_icmpv6: ICMPv6 checksum failed\n"); diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 69899f27d26a..12fb7c0a1509 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -828,8 +828,9 @@ static int tcp_error(struct sk_buff *skb, * and moreover root might send raw packets. */ /* FIXME: Source route IP option packets --RR */ - if (((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) || - (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING)) && + if (nf_conntrack_checksum && + ((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) || + (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING)) && nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) { if (LOG_INVALID(IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, diff --git a/net/netfilter/nf_conntrack_proto_udp.c b/net/netfilter/nf_conntrack_proto_udp.c index d93edbfde9e3..ae07ebe3ab37 100644 --- a/net/netfilter/nf_conntrack_proto_udp.c +++ b/net/netfilter/nf_conntrack_proto_udp.c @@ -134,7 +134,8 @@ static int udp_error(struct sk_buff *skb, unsigned int dataoff, * because the semantic of CHECKSUM_HW is different there * and moreover root might send raw packets. * FIXME: Source route IP option packets --RR */ - if (((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) || + if (nf_conntrack_checksum && + ((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) || (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING)) && nf_checksum(skb, hooknum, dataoff, IPPROTO_UDP, pf)) { if (LOG_INVALID(IPPROTO_UDP)) diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 408960c6a544..e01d20d8e287 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -455,6 +455,8 @@ extern unsigned int nf_ct_generic_timeout; static int log_invalid_proto_min = 0; static int log_invalid_proto_max = 255; +int nf_conntrack_checksum = 1; + static struct ctl_table_header *nf_ct_sysctl_header; static ctl_table nf_ct_sysctl_table[] = { @@ -482,6 +484,14 @@ static ctl_table nf_ct_sysctl_table[] = { .mode = 0444, .proc_handler = &proc_dointvec, }, + { + .ctl_name = NET_NF_CONNTRACK_CHECKSUM, + .procname = "nf_conntrack_checksum", + .data = &nf_conntrack_checksum, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = NET_NF_CONNTRACK_TCP_TIMEOUT_SYN_SENT, .procname = "nf_conntrack_tcp_timeout_syn_sent", @@ -851,6 +861,7 @@ EXPORT_SYMBOL(nf_ct_proto_put); EXPORT_SYMBOL(nf_ct_l3proto_find_get); EXPORT_SYMBOL(nf_ct_l3proto_put); EXPORT_SYMBOL(nf_ct_l3protos); +EXPORT_SYMBOL_GPL(nf_conntrack_checksum); EXPORT_SYMBOL(nf_conntrack_expect_alloc); EXPORT_SYMBOL(nf_conntrack_expect_put); EXPORT_SYMBOL(nf_conntrack_expect_related); -- cgit v1.2.3-59-g8ed1b From 997ae831ade74bdaed4172b1c02060b9efd6e206 Mon Sep 17 00:00:00 2001 From: Eric Leblond Date: Mon, 29 May 2006 18:24:20 -0700 Subject: [NETFILTER]: conntrack: add fixed timeout flag in connection tracking Add a flag in a connection status to have a non updated timeout. This permits to have connection that automatically die at a given time. Signed-off-by: Eric Leblond Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/netfilter/nf_conntrack_common.h | 4 ++++ net/ipv4/netfilter/ip_conntrack_core.c | 6 ++++++ net/netfilter/nf_conntrack_core.c | 6 ++++++ 3 files changed, 16 insertions(+) (limited to 'net') diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h index 3ff88c878308..d2e4bd7a7a14 100644 --- a/include/linux/netfilter/nf_conntrack_common.h +++ b/include/linux/netfilter/nf_conntrack_common.h @@ -69,6 +69,10 @@ enum ip_conntrack_status { /* Connection is dying (removed from lists), can not be unset. */ IPS_DYING_BIT = 9, IPS_DYING = (1 << IPS_DYING_BIT), + + /* Connection has fixed timeout. */ + IPS_FIXED_TIMEOUT_BIT = 10, + IPS_FIXED_TIMEOUT = (1 << IPS_FIXED_TIMEOUT_BIT), }; /* Connection tracking event bits */ diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index a297da7bbef5..4fe9e69378df 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -1130,6 +1130,12 @@ void __ip_ct_refresh_acct(struct ip_conntrack *ct, write_lock_bh(&ip_conntrack_lock); + /* Only update if this is not a fixed timeout */ + if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) { + write_unlock_bh(&ip_conntrack_lock); + return; + } + /* If not in hash table, timer will not be active yet */ if (!is_confirmed(ct)) { ct->timeout.expires = extra_jiffies; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index f9b83f91371a..bc2bd4c3859e 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1396,6 +1396,12 @@ void __nf_ct_refresh_acct(struct nf_conn *ct, write_lock_bh(&nf_conntrack_lock); + /* Only update if this is not a fixed timeout */ + if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) { + write_unlock_bh(&nf_conntrack_lock); + return; + } + /* If not in hash table, timer will not be active yet */ if (!nf_ct_is_confirmed(ct)) { ct->timeout.expires = extra_jiffies; -- cgit v1.2.3-59-g8ed1b From 3726add76643c715d437aceda320d319153b6113 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:24:39 -0700 Subject: [NETFILTER]: ctnetlink: fix NAT configuration The current configuration only allows to configure one manip and overloads conntrack status flags with netlink semantic. Signed-off-by: Patrick Mchardy Signed-off-by: David S. Miller --- include/linux/netfilter/nfnetlink_conntrack.h | 4 +- net/ipv4/netfilter/ip_conntrack_netlink.c | 53 +++++++++++---------------- net/netfilter/nf_conntrack_netlink.c | 53 +++++++++++---------------- 3 files changed, 47 insertions(+), 63 deletions(-) (limited to 'net') diff --git a/include/linux/netfilter/nfnetlink_conntrack.h b/include/linux/netfilter/nfnetlink_conntrack.h index 668ec946c8e2..b5883ccee295 100644 --- a/include/linux/netfilter/nfnetlink_conntrack.h +++ b/include/linux/netfilter/nfnetlink_conntrack.h @@ -27,13 +27,15 @@ enum ctattr_type { CTA_STATUS, CTA_PROTOINFO, CTA_HELP, - CTA_NAT, + CTA_NAT_SRC, +#define CTA_NAT CTA_NAT_SRC /* backwards compatibility */ CTA_TIMEOUT, CTA_MARK, CTA_COUNTERS_ORIG, CTA_COUNTERS_REPLY, CTA_USE, CTA_ID, + CTA_NAT_DST, __CTA_MAX }; #define CTA_MAX (__CTA_MAX - 1) diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index 01bd7cab9367..af152e3623dc 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -629,7 +629,7 @@ static const size_t cta_min_nat[CTA_NAT_MAX] = { }; static inline int -ctnetlink_parse_nat(struct nfattr *cda[], +ctnetlink_parse_nat(struct nfattr *nat, const struct ip_conntrack *ct, struct ip_nat_range *range) { struct nfattr *tb[CTA_NAT_MAX]; @@ -639,7 +639,7 @@ ctnetlink_parse_nat(struct nfattr *cda[], memset(range, 0, sizeof(*range)); - nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]); + nfattr_parse_nested(tb, CTA_NAT_MAX, nat); if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat)) return -EINVAL; @@ -854,39 +854,30 @@ ctnetlink_change_status(struct ip_conntrack *ct, struct nfattr *cda[]) /* ASSURED bit can only be set */ return -EINVAL; - if (cda[CTA_NAT-1]) { + if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) { #ifndef CONFIG_IP_NF_NAT_NEEDED return -EINVAL; #else - unsigned int hooknum; struct ip_nat_range range; - if (ctnetlink_parse_nat(cda, ct, &range) < 0) - return -EINVAL; - - DEBUGP("NAT: %u.%u.%u.%u-%u.%u.%u.%u:%u-%u\n", - NIPQUAD(range.min_ip), NIPQUAD(range.max_ip), - htons(range.min.all), htons(range.max.all)); - - /* This is tricky but it works. ip_nat_setup_info needs the - * hook number as parameter, so let's do the correct - * conversion and run away */ - if (status & IPS_SRC_NAT_DONE) - hooknum = NF_IP_POST_ROUTING; /* IP_NAT_MANIP_SRC */ - else if (status & IPS_DST_NAT_DONE) - hooknum = NF_IP_PRE_ROUTING; /* IP_NAT_MANIP_DST */ - else - return -EINVAL; /* Missing NAT flags */ - - DEBUGP("NAT status: %lu\n", - status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); - - if (ip_nat_initialized(ct, HOOK2MANIP(hooknum))) - return -EEXIST; - ip_nat_setup_info(ct, &range, hooknum); - - DEBUGP("NAT status after setup_info: %lu\n", - ct->status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); + if (cda[CTA_NAT_DST-1]) { + if (ctnetlink_parse_nat(cda[CTA_NAT_DST-1], ct, + &range) < 0) + return -EINVAL; + if (ip_nat_initialized(ct, + HOOK2MANIP(NF_IP_PRE_ROUTING))) + return -EEXIST; + ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING); + } + if (cda[CTA_NAT_SRC-1]) { + if (ctnetlink_parse_nat(cda[CTA_NAT_SRC-1], ct, + &range) < 0) + return -EINVAL; + if (ip_nat_initialized(ct, + HOOK2MANIP(NF_IP_POST_ROUTING))) + return -EEXIST; + ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING); + } #endif } @@ -1106,7 +1097,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, /* implicit 'else' */ /* we only allow nat config for new conntracks */ - if (cda[CTA_NAT-1]) { + if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) { err = -EINVAL; goto out_unlock; } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index bd10eb944b65..8f27fe9446f2 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -641,7 +641,7 @@ static const size_t cta_min_nat[CTA_NAT_MAX] = { }; static inline int -ctnetlink_parse_nat(struct nfattr *cda[], +ctnetlink_parse_nat(struct nfattr *nat, const struct nf_conn *ct, struct ip_nat_range *range) { struct nfattr *tb[CTA_NAT_MAX]; @@ -651,7 +651,7 @@ ctnetlink_parse_nat(struct nfattr *cda[], memset(range, 0, sizeof(*range)); - nfattr_parse_nested(tb, CTA_NAT_MAX, cda[CTA_NAT-1]); + nfattr_parse_nested(tb, CTA_NAT_MAX, nat); if (nfattr_bad_size(tb, CTA_NAT_MAX, cta_min_nat)) return -EINVAL; @@ -866,39 +866,30 @@ ctnetlink_change_status(struct nf_conn *ct, struct nfattr *cda[]) /* ASSURED bit can only be set */ return -EINVAL; - if (cda[CTA_NAT-1]) { + if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) { #ifndef CONFIG_IP_NF_NAT_NEEDED return -EINVAL; #else - unsigned int hooknum; struct ip_nat_range range; - if (ctnetlink_parse_nat(cda, ct, &range) < 0) - return -EINVAL; - - DEBUGP("NAT: %u.%u.%u.%u-%u.%u.%u.%u:%u-%u\n", - NIPQUAD(range.min_ip), NIPQUAD(range.max_ip), - htons(range.min.all), htons(range.max.all)); - - /* This is tricky but it works. ip_nat_setup_info needs the - * hook number as parameter, so let's do the correct - * conversion and run away */ - if (status & IPS_SRC_NAT_DONE) - hooknum = NF_IP_POST_ROUTING; /* IP_NAT_MANIP_SRC */ - else if (status & IPS_DST_NAT_DONE) - hooknum = NF_IP_PRE_ROUTING; /* IP_NAT_MANIP_DST */ - else - return -EINVAL; /* Missing NAT flags */ - - DEBUGP("NAT status: %lu\n", - status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); - - if (ip_nat_initialized(ct, HOOK2MANIP(hooknum))) - return -EEXIST; - ip_nat_setup_info(ct, &range, hooknum); - - DEBUGP("NAT status after setup_info: %lu\n", - ct->status & (IPS_NAT_MASK | IPS_NAT_DONE_MASK)); + if (cda[CTA_NAT_DST-1]) { + if (ctnetlink_parse_nat(cda[CTA_NAT_DST-1], ct, + &range) < 0) + return -EINVAL; + if (ip_nat_initialized(ct, + HOOK2MANIP(NF_IP_PRE_ROUTING))) + return -EEXIST; + ip_nat_setup_info(ct, &range, hooknum); + } + if (cda[CTA_NAT_SRC-1]) { + if (ctnetlink_parse_nat(cda[CTA_NAT_SRC-1], ct, + &range) < 0) + return -EINVAL; + if (ip_nat_initialized(ct, + HOOK2MANIP(NF_IP_POST_ROUTING))) + return -EEXIST; + ip_nat_setup_info(ct, &range, hooknum); + } #endif } @@ -1122,7 +1113,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, /* implicit 'else' */ /* we only allow nat config for new conntracks */ - if (cda[CTA_NAT-1]) { + if (cda[CTA_NAT_SRC-1] || cda[CTA_NAT_DST-1]) { err = -EINVAL; goto out_unlock; } -- cgit v1.2.3-59-g8ed1b From 89f2e21883b59a6ff1e64d0b4924d06b1c6101ba Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:24:58 -0700 Subject: [NETFILTER]: ctnetlink: change table dumping not to require an unique ID Instead of using the ID to find out where to continue dumping, take a reference to the last entry dumped and try to continue there. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_netlink.c | 32 +++++++++++++++++++++++-------- net/netfilter/nf_conntrack_netlink.c | 32 +++++++++++++++++++++++-------- 2 files changed, 48 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_netlink.c b/net/ipv4/netfilter/ip_conntrack_netlink.c index af152e3623dc..33891bb1fde4 100644 --- a/net/ipv4/netfilter/ip_conntrack_netlink.c +++ b/net/ipv4/netfilter/ip_conntrack_netlink.c @@ -399,38 +399,54 @@ nfattr_failure: static int ctnetlink_done(struct netlink_callback *cb) { DEBUGP("entered %s\n", __FUNCTION__); + if (cb->args[1]) + ip_conntrack_put((struct ip_conntrack *)cb->args[1]); return 0; } static int ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { - struct ip_conntrack *ct = NULL; + struct ip_conntrack *ct, *last; struct ip_conntrack_tuple_hash *h; struct list_head *i; - u_int32_t *id = (u_int32_t *) &cb->args[1]; DEBUGP("entered %s, last bucket=%lu id=%u\n", __FUNCTION__, cb->args[0], *id); read_lock_bh(&ip_conntrack_lock); - for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++, *id = 0) { + for (; cb->args[0] < ip_conntrack_htable_size; cb->args[0]++) { +restart: + last = (struct ip_conntrack *)cb->args[1]; list_for_each_prev(i, &ip_conntrack_hash[cb->args[0]]) { h = (struct ip_conntrack_tuple_hash *) i; if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) continue; ct = tuplehash_to_ctrack(h); - if (ct->id <= *id) - continue; + if (last != NULL) { + if (ct == last) { + ip_conntrack_put(last); + cb->args[1] = 0; + last = NULL; + } else + continue; + } if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, IPCTNL_MSG_CT_NEW, - 1, ct) < 0) + 1, ct) < 0) { + nf_conntrack_get(&ct->ct_general); + cb->args[1] = (unsigned long)ct; goto out; - *id = ct->id; + } + } + if (last != NULL) { + ip_conntrack_put(last); + cb->args[1] = 0; + goto restart; } } -out: +out: read_unlock_bh(&ip_conntrack_lock); DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 8f27fe9446f2..b8c7c567c9df 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -407,6 +407,8 @@ nfattr_failure: static int ctnetlink_done(struct netlink_callback *cb) { + if (cb->args[1]) + nf_ct_put((struct nf_conn *)cb->args[1]); DEBUGP("entered %s\n", __FUNCTION__); return 0; } @@ -416,10 +418,9 @@ static int ctnetlink_done(struct netlink_callback *cb) static int ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) { - struct nf_conn *ct = NULL; + struct nf_conn *ct, *last; struct nf_conntrack_tuple_hash *h; struct list_head *i; - u_int32_t *id = (u_int32_t *) &cb->args[1]; struct nfgenmsg *nfmsg = NLMSG_DATA(cb->nlh); u_int8_t l3proto = nfmsg->nfgen_family; @@ -427,7 +428,9 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) cb->args[0], *id); read_lock_bh(&nf_conntrack_lock); - for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++, *id = 0) { + for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) { +restart: + last = (struct nf_conn *)cb->args[1]; list_for_each_prev(i, &nf_conntrack_hash[cb->args[0]]) { h = (struct nf_conntrack_tuple_hash *) i; if (DIRECTION(h) != IP_CT_DIR_ORIGINAL) @@ -438,17 +441,30 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb) * then dump everything. */ if (l3proto && L3PROTO(ct) != l3proto) continue; - if (ct->id <= *id) - continue; + if (last != NULL) { + if (ct == last) { + nf_ct_put(last); + cb->args[1] = 0; + last = NULL; + } else + continue; + } if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, IPCTNL_MSG_CT_NEW, - 1, ct) < 0) + 1, ct) < 0) { + nf_conntrack_get(&ct->ct_general); + cb->args[1] = (unsigned long)ct; goto out; - *id = ct->id; + } + } + if (last != NULL) { + nf_ct_put(last); + cb->args[1] = 0; + goto restart; } } -out: +out: read_unlock_bh(&nf_conntrack_lock); DEBUGP("leaving, last bucket=%lu id=%u\n", cb->args[0], *id); -- cgit v1.2.3-59-g8ed1b From 695ecea3299dba2239d1cb4fd4d4e4c95a5b9ce7 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:25:14 -0700 Subject: [NETFILTER]: SNMP helper: fix debug module param type debug is the debug level, not a bool. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_nat_snmp_basic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c index c33244263b90..d20d557f915a 100644 --- a/net/ipv4/netfilter/ip_nat_snmp_basic.c +++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c @@ -1348,4 +1348,4 @@ static void __exit ip_nat_snmp_basic_fini(void) module_init(ip_nat_snmp_basic_init); module_exit(ip_nat_snmp_basic_fini); -module_param(debug, bool, 0600); +module_param(debug, int, 0600); -- cgit v1.2.3-59-g8ed1b From 7d8c50181778b6ba10c2bba9a2f22db9493bb245 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:25:38 -0700 Subject: [NETFILTER]: FTP helper: search optimization Instead of skipping search entries for the wrong direction simply index them by direction. Based on patch by Pablo Neira Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_ftp.c | 77 +++++++++++++++++++---------------- net/netfilter/nf_conntrack_ftp.c | 77 +++++++++++++++++++---------------- 2 files changed, 86 insertions(+), 68 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_ftp.c b/net/ipv4/netfilter/ip_conntrack_ftp.c index 3e542bf28a9d..4dcf526c3944 100644 --- a/net/ipv4/netfilter/ip_conntrack_ftp.c +++ b/net/ipv4/netfilter/ip_conntrack_ftp.c @@ -56,37 +56,48 @@ static int try_eprt(const char *, size_t, u_int32_t [], char); static int try_epsv_response(const char *, size_t, u_int32_t [], char); static const struct ftp_search { - enum ip_conntrack_dir dir; const char *pattern; size_t plen; char skip; char term; enum ip_ct_ftp_type ftptype; int (*getnum)(const char *, size_t, u_int32_t[], char); -} search[] = { - { - IP_CT_DIR_ORIGINAL, - "PORT", sizeof("PORT") - 1, ' ', '\r', - IP_CT_FTP_PORT, - try_rfc959, +} search[IP_CT_DIR_MAX][2] = { + [IP_CT_DIR_ORIGINAL] = { + { + .pattern = "PORT", + .plen = sizeof("PORT") - 1, + .skip = ' ', + .term = '\r', + .ftptype = IP_CT_FTP_PORT, + .getnum = try_rfc959, + }, + { + .pattern = "EPRT", + .plen = sizeof("EPRT") - 1, + .skip = ' ', + .term = '\r', + .ftptype = IP_CT_FTP_EPRT, + .getnum = try_eprt, + }, }, - { - IP_CT_DIR_REPLY, - "227 ", sizeof("227 ") - 1, '(', ')', - IP_CT_FTP_PASV, - try_rfc959, - }, - { - IP_CT_DIR_ORIGINAL, - "EPRT", sizeof("EPRT") - 1, ' ', '\r', - IP_CT_FTP_EPRT, - try_eprt, - }, - { - IP_CT_DIR_REPLY, - "229 ", sizeof("229 ") - 1, '(', ')', - IP_CT_FTP_EPSV, - try_epsv_response, + [IP_CT_DIR_REPLY] = { + { + .pattern = "227 ", + .plen = sizeof("227 ") - 1, + .skip = '(', + .term = ')', + .ftptype = IP_CT_FTP_PASV, + .getnum = try_rfc959, + }, + { + .pattern = "229 ", + .plen = sizeof("229 ") - 1, + .skip = '(', + .term = ')', + .ftptype = IP_CT_FTP_EPSV, + .getnum = try_epsv_response, + }, }, }; @@ -346,17 +357,15 @@ static int help(struct sk_buff **pskb, array[2] = (ntohl(ct->tuplehash[dir].tuple.src.ip) >> 8) & 0xFF; array[3] = ntohl(ct->tuplehash[dir].tuple.src.ip) & 0xFF; - for (i = 0; i < ARRAY_SIZE(search); i++) { - if (search[i].dir != dir) continue; - + for (i = 0; i < ARRAY_SIZE(search[dir]); i++) { found = find_pattern(fb_ptr, (*pskb)->len - dataoff, - search[i].pattern, - search[i].plen, - search[i].skip, - search[i].term, + search[dir][i].pattern, + search[dir][i].plen, + search[dir][i].skip, + search[dir][i].term, &matchoff, &matchlen, array, - search[i].getnum); + search[dir][i].getnum); if (found) break; } if (found == -1) { @@ -366,7 +375,7 @@ static int help(struct sk_buff **pskb, this case. */ if (net_ratelimit()) printk("conntrack_ftp: partial %s %u+%u\n", - search[i].pattern, + search[dir][i].pattern, ntohl(th->seq), datalen); ret = NF_DROP; goto out; @@ -426,7 +435,7 @@ static int help(struct sk_buff **pskb, /* Now, NAT might want to mangle the packet, and register the * (possibly changed) expectation itself. */ if (ip_nat_ftp_hook) - ret = ip_nat_ftp_hook(pskb, ctinfo, search[i].ftptype, + ret = ip_nat_ftp_hook(pskb, ctinfo, search[dir][i].ftptype, matchoff, matchlen, exp, &seq); else { /* Can't expect this? Best to drop packet now. */ diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index e38a4b5a3089..11d3be243536 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -67,37 +67,48 @@ static int try_epsv_response(const char *, size_t, struct nf_conntrack_man *, char); static struct ftp_search { - enum ip_conntrack_dir dir; const char *pattern; size_t plen; char skip; char term; enum ip_ct_ftp_type ftptype; int (*getnum)(const char *, size_t, struct nf_conntrack_man *, char); -} search[] = { - { - IP_CT_DIR_ORIGINAL, - "PORT", sizeof("PORT") - 1, ' ', '\r', - IP_CT_FTP_PORT, - try_rfc959, +} search[IP_CT_DIR_MAX][2] = { + [IP_CT_DIR_ORIGINAL] = { + { + .pattern = "PORT", + .plen = sizeof("PORT") - 1, + .skip = ' ', + .term = '\r', + .ftptype = IP_CT_FTP_PORT, + .getnum = try_rfc959, + }, + { + .pattern = "EPRT", + .plen = sizeof("EPRT") - 1, + .skip = ' ', + .term = '\r', + .ftptype = IP_CT_FTP_EPRT, + .getnum = try_eprt, + }, }, - { - IP_CT_DIR_REPLY, - "227 ", sizeof("227 ") - 1, '(', ')', - IP_CT_FTP_PASV, - try_rfc959, - }, - { - IP_CT_DIR_ORIGINAL, - "EPRT", sizeof("EPRT") - 1, ' ', '\r', - IP_CT_FTP_EPRT, - try_eprt, - }, - { - IP_CT_DIR_REPLY, - "229 ", sizeof("229 ") - 1, '(', ')', - IP_CT_FTP_EPSV, - try_epsv_response, + [IP_CT_DIR_REPLY] = { + { + .pattern = "227 ", + .plen = sizeof("227 ") - 1, + .skip = '(', + .term = ')', + .ftptype = IP_CT_FTP_PASV, + .getnum = try_rfc959, + }, + { + .pattern = "229 ", + .plen = sizeof("229 ") - 1, + .skip = '(', + .term = ')', + .ftptype = IP_CT_FTP_EPSV, + .getnum = try_epsv_response, + }, }, }; @@ -492,17 +503,15 @@ static int help(struct sk_buff **pskb, memcpy(cmd.u3.all, &ct->tuplehash[dir].tuple.src.u3.all, sizeof(cmd.u3.all)); - for (i = 0; i < ARRAY_SIZE(search); i++) { - if (search[i].dir != dir) continue; - + for (i = 0; i < ARRAY_SIZE(search[dir]); i++) { found = find_pattern(fb_ptr, datalen, - search[i].pattern, - search[i].plen, - search[i].skip, - search[i].term, + search[dir][i].pattern, + search[dir][i].plen, + search[dir][i].skip, + search[dir][i].term, &matchoff, &matchlen, &cmd, - search[i].getnum); + search[dir][i].getnum); if (found) break; } if (found == -1) { @@ -512,7 +521,7 @@ static int help(struct sk_buff **pskb, this case. */ if (net_ratelimit()) printk("conntrack_ftp: partial %s %u+%u\n", - search[i].pattern, + search[dir][i].pattern, ntohl(th->seq), datalen); ret = NF_DROP; goto out; @@ -597,7 +606,7 @@ static int help(struct sk_buff **pskb, /* Now, NAT might want to mangle the packet, and register the * (possibly changed) expectation itself. */ if (nf_nat_ftp_hook) - ret = nf_nat_ftp_hook(pskb, ctinfo, search[i].ftptype, + ret = nf_nat_ftp_hook(pskb, ctinfo, search[dir][i].ftptype, matchoff, matchlen, exp, &seq); else { /* Can't expect this? Best to drop packet now. */ -- cgit v1.2.3-59-g8ed1b From c95261693467f0aeac7fafa69860ddfb02bc12f8 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:25:58 -0700 Subject: [NETFILTER]: amanda helper: convert to textsearch infrastructure When a port number within a packet is replaced by a differently sized number only the packet is resized, but not the copy of the data. Following port numbers are rewritten based on their offsets within the copy, leading to packet corruption. Convert the amanda helper to the textsearch infrastructure to avoid the copy entirely. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/Kconfig | 2 + net/ipv4/netfilter/ip_conntrack_amanda.c | 143 ++++++++++++++++++++----------- 2 files changed, 96 insertions(+), 49 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index d4072533da21..1540e227890f 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -142,6 +142,8 @@ config IP_NF_TFTP config IP_NF_AMANDA tristate "Amanda backup protocol support" depends on IP_NF_CONNTRACK + select TEXTSEARCH + select TEXTSEARCH_KMP help If you are running the Amanda backup package on this machine or machines that will be MASQUERADED through this diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c index a604b1ccfdaa..0a7bd7f04061 100644 --- a/net/ipv4/netfilter/ip_conntrack_amanda.c +++ b/net/ipv4/netfilter/ip_conntrack_amanda.c @@ -17,33 +17,29 @@ * this value. * */ - -#include #include #include -#include -#include #include +#include +#include +#include +#include #include -#include -#include +#include #include #include static unsigned int master_timeout = 300; +static char *ts_algo = "kmp"; MODULE_AUTHOR("Brian J. Murrell "); MODULE_DESCRIPTION("Amanda connection tracking module"); MODULE_LICENSE("GPL"); module_param(master_timeout, uint, 0600); MODULE_PARM_DESC(master_timeout, "timeout for the master connection"); - -static const char *conns[] = { "DATA ", "MESG ", "INDEX " }; - -/* This is slow, but it's simple. --RR */ -static char *amanda_buffer; -static DEFINE_SPINLOCK(amanda_buffer_lock); +module_param(ts_algo, charp, 0400); +MODULE_PARM_DESC(ts_algo, "textsearch algorithm to use (default kmp)"); unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb, enum ip_conntrack_info ctinfo, @@ -52,12 +48,48 @@ unsigned int (*ip_nat_amanda_hook)(struct sk_buff **pskb, struct ip_conntrack_expect *exp); EXPORT_SYMBOL_GPL(ip_nat_amanda_hook); +enum amanda_strings { + SEARCH_CONNECT, + SEARCH_NEWLINE, + SEARCH_DATA, + SEARCH_MESG, + SEARCH_INDEX, +}; + +static struct { + char *string; + size_t len; + struct ts_config *ts; +} search[] = { + [SEARCH_CONNECT] = { + .string = "CONNECT ", + .len = 8, + }, + [SEARCH_NEWLINE] = { + .string = "\n", + .len = 1, + }, + [SEARCH_DATA] = { + .string = "DATA ", + .len = 5, + }, + [SEARCH_MESG] = { + .string = "MESG ", + .len = 5, + }, + [SEARCH_INDEX] = { + .string = "INDEX ", + .len = 6, + }, +}; + static int help(struct sk_buff **pskb, struct ip_conntrack *ct, enum ip_conntrack_info ctinfo) { + struct ts_state ts; struct ip_conntrack_expect *exp; - char *data, *data_limit, *tmp; - unsigned int dataoff, i; + unsigned int dataoff, start, stop, off, i; + char pbuf[sizeof("65535")], *tmp; u_int16_t port, len; int ret = NF_ACCEPT; @@ -77,29 +109,34 @@ static int help(struct sk_buff **pskb, return NF_ACCEPT; } - spin_lock_bh(&amanda_buffer_lock); - skb_copy_bits(*pskb, dataoff, amanda_buffer, (*pskb)->len - dataoff); - data = amanda_buffer; - data_limit = amanda_buffer + (*pskb)->len - dataoff; - *data_limit = '\0'; - - /* Search for the CONNECT string */ - data = strstr(data, "CONNECT "); - if (!data) + memset(&ts, 0, sizeof(ts)); + start = skb_find_text(*pskb, dataoff, (*pskb)->len, + search[SEARCH_CONNECT].ts, &ts); + if (start == UINT_MAX) goto out; - data += strlen("CONNECT "); + start += dataoff + search[SEARCH_CONNECT].len; - /* Only search first line. */ - if ((tmp = strchr(data, '\n'))) - *tmp = '\0'; + memset(&ts, 0, sizeof(ts)); + stop = skb_find_text(*pskb, start, (*pskb)->len, + search[SEARCH_NEWLINE].ts, &ts); + if (stop == UINT_MAX) + goto out; + stop += start; - for (i = 0; i < ARRAY_SIZE(conns); i++) { - char *match = strstr(data, conns[i]); - if (!match) + for (i = SEARCH_DATA; i <= SEARCH_INDEX; i++) { + memset(&ts, 0, sizeof(ts)); + off = skb_find_text(*pskb, start, stop, search[i].ts, &ts); + if (off == UINT_MAX) continue; - tmp = data = match + strlen(conns[i]); - port = simple_strtoul(data, &data, 10); - len = data - tmp; + off += start + search[i].len; + + len = min_t(unsigned int, sizeof(pbuf) - 1, stop - off); + if (skb_copy_bits(*pskb, off, pbuf, len)) + break; + pbuf[len] = '\0'; + + port = simple_strtoul(pbuf, &tmp, 10); + len = tmp - pbuf; if (port == 0 || len > 5) break; @@ -125,8 +162,7 @@ static int help(struct sk_buff **pskb, exp->mask.dst.u.tcp.port = 0xFFFF; if (ip_nat_amanda_hook) - ret = ip_nat_amanda_hook(pskb, ctinfo, - tmp - amanda_buffer, + ret = ip_nat_amanda_hook(pskb, ctinfo, off - dataoff, len, exp); else if (ip_conntrack_expect_related(exp) != 0) ret = NF_DROP; @@ -134,12 +170,11 @@ static int help(struct sk_buff **pskb, } out: - spin_unlock_bh(&amanda_buffer_lock); return ret; } static struct ip_conntrack_helper amanda_helper = { - .max_expected = ARRAY_SIZE(conns), + .max_expected = 3, .timeout = 180, .me = THIS_MODULE, .help = help, @@ -155,26 +190,36 @@ static struct ip_conntrack_helper amanda_helper = { static void __exit ip_conntrack_amanda_fini(void) { + int i; + ip_conntrack_helper_unregister(&amanda_helper); - kfree(amanda_buffer); + for (i = 0; i < ARRAY_SIZE(search); i++) + textsearch_destroy(search[i].ts); } static int __init ip_conntrack_amanda_init(void) { - int ret; - - amanda_buffer = kmalloc(65536, GFP_KERNEL); - if (!amanda_buffer) - return -ENOMEM; - - ret = ip_conntrack_helper_register(&amanda_helper); - if (ret < 0) { - kfree(amanda_buffer); - return ret; + int ret, i; + + ret = -ENOMEM; + for (i = 0; i < ARRAY_SIZE(search); i++) { + search[i].ts = textsearch_prepare(ts_algo, search[i].string, + search[i].len, + GFP_KERNEL, TS_AUTOLOAD); + if (search[i].ts == NULL) + goto err; } + ret = ip_conntrack_helper_register(&amanda_helper); + if (ret < 0) + goto err; return 0; - +err: + for (; i >= 0; i--) { + if (search[i].ts) + textsearch_destroy(search[i].ts); + } + return ret; } module_init(ip_conntrack_amanda_init); -- cgit v1.2.3-59-g8ed1b From c0d4cfd96dd0cc0dbf49435898808b5553af4822 Mon Sep 17 00:00:00 2001 From: Jing Min Zhao Date: Mon, 29 May 2006 18:26:27 -0700 Subject: [NETFILTER]: H.323 helper: Add support for Call Forwarding Signed-off-by: Jing Min Zhao Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/netfilter_ipv4/ip_conntrack.h | 1 + include/linux/netfilter_ipv4/ip_conntrack_h323.h | 7 ++ .../ip_conntrack_helper_h323_types.h | 3 +- net/ipv4/netfilter/Kconfig | 8 +- net/ipv4/netfilter/ip_conntrack_helper_h323.c | 112 +++++++++++++++++++++ .../netfilter/ip_conntrack_helper_h323_types.c | 6 +- net/ipv4/netfilter/ip_nat_helper_h323.c | 77 ++++++++++++++ 7 files changed, 206 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h index 5473c01f69ea..17d7ef938a09 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack.h +++ b/include/linux/netfilter_ipv4/ip_conntrack.h @@ -154,6 +154,7 @@ struct ip_conntrack_expect unsigned int flags; #ifdef CONFIG_IP_NF_NAT_NEEDED + u_int32_t saved_ip; /* This is the original per-proto part, used to map the * expected connection the way the recipient expects. */ union ip_conntrack_manip_proto saved_proto; diff --git a/include/linux/netfilter_ipv4/ip_conntrack_h323.h b/include/linux/netfilter_ipv4/ip_conntrack_h323.h index eace86bd2adb..3cbff7379002 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack_h323.h +++ b/include/linux/netfilter_ipv4/ip_conntrack_h323.h @@ -71,6 +71,13 @@ extern int (*nat_h245_hook) (struct sk_buff ** pskb, struct ip_conntrack * ct, unsigned char **data, int dataoff, TransportAddress * addr, u_int16_t port, struct ip_conntrack_expect * exp); +extern int (*nat_callforwarding_hook) (struct sk_buff ** pskb, + struct ip_conntrack * ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + TransportAddress * addr, + u_int16_t port, + struct ip_conntrack_expect * exp); extern int (*nat_q931_hook) (struct sk_buff ** pskb, struct ip_conntrack * ct, enum ip_conntrack_info ctinfo, unsigned char **data, TransportAddress * addr, diff --git a/include/linux/netfilter_ipv4/ip_conntrack_helper_h323_types.h b/include/linux/netfilter_ipv4/ip_conntrack_helper_h323_types.h index cc98f7aa5abe..3d4a773799fc 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack_helper_h323_types.h +++ b/include/linux/netfilter_ipv4/ip_conntrack_helper_h323_types.h @@ -1,4 +1,4 @@ -/* Generated by Jing Min Zhao's ASN.1 parser, Mar 15 2006 +/* Generated by Jing Min Zhao's ASN.1 parser, Apr 20 2006 * * Copyright (c) 2006 Jing Min Zhao * @@ -412,6 +412,7 @@ typedef struct Facility_UUIE { /* SEQUENCE */ eFacility_UUIE_destinationInfo = (1 << 14), eFacility_UUIE_h245SecurityMode = (1 << 13), } options; + TransportAddress alternativeAddress; FacilityReason reason; TransportAddress h245Address; Facility_UUIE_fastStart fastStart; diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 1540e227890f..f86aeda3a5a0 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -183,10 +183,10 @@ config IP_NF_H323 With this module you can support H.323 on a connection tracking/NAT firewall. - This module supports RAS, Fast-start, H.245 tunnelling, RTP/RTCP - and T.120 based data and applications including audio, video, FAX, - chat, whiteboard, file transfer, etc. For more information, please - see http://nath323.sourceforge.net/. + This module supports RAS, Fast Start, H.245 Tunnelling, Call + Forwarding, RTP/RTCP and T.120 based audio, video, fax, chat, + whiteboard, file transfer, etc. For more information, please + visit http://nath323.sourceforge.net/. If you want to compile it as a module, say 'M' here and read Documentation/modules.txt. If unsure, say 'N'. diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323.c b/net/ipv4/netfilter/ip_conntrack_helper_h323.c index 518f581d39ec..3052468a6ab1 100644 --- a/net/ipv4/netfilter/ip_conntrack_helper_h323.c +++ b/net/ipv4/netfilter/ip_conntrack_helper_h323.c @@ -22,6 +22,8 @@ #include #include #include +#include +#include #if 0 #define DEBUGP printk @@ -38,6 +40,13 @@ static int gkrouted_only = 1; module_param(gkrouted_only, int, 0600); MODULE_PARM_DESC(gkrouted_only, "only accept calls from gatekeeper"); +static char *internal_net = NULL; +static u_int32_t internal_net_addr = 0; +static u_int32_t internal_net_mask = 0; +module_param(internal_net, charp, 0600); +MODULE_PARM_DESC(internal_net, "specify your internal network using format " + "address/mask. this is used by call forwarding support"); + /* Hooks for NAT */ int (*set_h245_addr_hook) (struct sk_buff ** pskb, unsigned char **data, int dataoff, @@ -77,6 +86,12 @@ int (*nat_h245_hook) (struct sk_buff ** pskb, unsigned char **data, int dataoff, TransportAddress * addr, u_int16_t port, struct ip_conntrack_expect * exp); +int (*nat_callforwarding_hook) (struct sk_buff ** pskb, + struct ip_conntrack * ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + TransportAddress * addr, u_int16_t port, + struct ip_conntrack_expect * exp); int (*nat_q931_hook) (struct sk_buff ** pskb, struct ip_conntrack * ct, enum ip_conntrack_info ctinfo, @@ -683,6 +698,76 @@ static int expect_h245(struct sk_buff **pskb, struct ip_conntrack *ct, return ret; } +/* Forwarding declaration */ +void ip_conntrack_q931_expect(struct ip_conntrack *new, + struct ip_conntrack_expect *this); + +/****************************************************************************/ +static int expect_callforwarding(struct sk_buff **pskb, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + TransportAddress * addr) +{ + int dir = CTINFO2DIR(ctinfo); + int ret = 0; + u_int32_t ip; + u_int16_t port; + struct ip_conntrack_expect *exp = NULL; + + /* Read alternativeAddress */ + if (!get_h225_addr(*data, addr, &ip, &port) || port == 0) + return 0; + + /* If the calling party is on the same side of the forward-to party, + * we don't need to track the second call */ + if (internal_net && + ((ip & internal_net_mask) == internal_net_addr) == + ((ct->tuplehash[!dir].tuple.src.ip & internal_net_mask) == + internal_net_addr)) { + DEBUGP("ip_ct_q931: Call Forwarding not tracked\n"); + return 0; + } + + /* Create expect for the second call leg */ + if ((exp = ip_conntrack_expect_alloc(ct)) == NULL) + return -1; + exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip; + exp->tuple.src.u.tcp.port = 0; + exp->tuple.dst.ip = ip; + exp->tuple.dst.u.tcp.port = htons(port); + exp->tuple.dst.protonum = IPPROTO_TCP; + exp->mask.src.ip = 0xFFFFFFFF; + exp->mask.src.u.tcp.port = 0; + exp->mask.dst.ip = 0xFFFFFFFF; + exp->mask.dst.u.tcp.port = 0xFFFF; + exp->mask.dst.protonum = 0xFF; + exp->flags = 0; + + if (ct->tuplehash[dir].tuple.src.ip != + ct->tuplehash[!dir].tuple.dst.ip && nat_callforwarding_hook) { + /* Need NAT */ + ret = nat_callforwarding_hook(pskb, ct, ctinfo, data, dataoff, + addr, port, exp); + } else { /* Conntrack only */ + exp->expectfn = ip_conntrack_q931_expect; + + if (ip_conntrack_expect_related(exp) == 0) { + DEBUGP("ip_ct_q931: expect Call Forwarding " + "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", + NIPQUAD(exp->tuple.src.ip), + ntohs(exp->tuple.src.u.tcp.port), + NIPQUAD(exp->tuple.dst.ip), + ntohs(exp->tuple.dst.u.tcp.port)); + } else + ret = -1; + } + + ip_conntrack_expect_put(exp); + + return ret; +} + /****************************************************************************/ static int process_setup(struct sk_buff **pskb, struct ip_conntrack *ct, enum ip_conntrack_info ctinfo, @@ -878,6 +963,15 @@ static int process_facility(struct sk_buff **pskb, struct ip_conntrack *ct, DEBUGP("ip_ct_q931: Facility\n"); + if (facility->reason.choice == eFacilityReason_callForwarded) { + if (facility->options & eFacility_UUIE_alternativeAddress) + return expect_callforwarding(pskb, ct, ctinfo, data, + dataoff, + &facility-> + alternativeAddress); + return 0; + } + if (facility->options & eFacility_UUIE_h245Address) { ret = expect_h245(pskb, ct, ctinfo, data, dataoff, &facility->h245Address); @@ -1668,6 +1762,7 @@ static void fini(void) static int __init init(void) { int ret; + char *p; h323_buffer = kmalloc(65536, GFP_KERNEL); if (!h323_buffer) @@ -1678,6 +1773,22 @@ static int __init init(void) return ret; } + if (internal_net) { + if ((p = strchr(internal_net, '/'))) + *p++ = 0; + if (isdigit(internal_net[0])) { + internal_net_addr = in_aton(internal_net); + if (p && isdigit(p[0])) + internal_net_mask = in_aton(p); + else + internal_net_mask = 0xffffffff; + internal_net_addr &= internal_net_mask; + } + DEBUGP("ip_ct_h323: internal_net = %u.%u.%u.%u/%u.%u.%u.%u\n", + NIPQUAD(internal_net_addr), + NIPQUAD(internal_net_mask)); + } + DEBUGP("ip_ct_h323: init success\n"); return 0; } @@ -1696,6 +1807,7 @@ EXPORT_SYMBOL_GPL(set_ras_addr_hook); EXPORT_SYMBOL_GPL(nat_rtp_rtcp_hook); EXPORT_SYMBOL_GPL(nat_t120_hook); EXPORT_SYMBOL_GPL(nat_h245_hook); +EXPORT_SYMBOL_GPL(nat_callforwarding_hook); EXPORT_SYMBOL_GPL(nat_q931_hook); MODULE_AUTHOR("Jing Min Zhao "); diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c b/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c index 022c47b9f6c9..4b359618bedd 100644 --- a/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c +++ b/net/ipv4/netfilter/ip_conntrack_helper_h323_types.c @@ -1,4 +1,4 @@ -/* Generated by Jing Min Zhao's ASN.1 parser, Mar 15 2006 +/* Generated by Jing Min Zhao's ASN.1 parser, Apr 20 2006 * * Copyright (c) 2006 Jing Min Zhao * @@ -1069,8 +1069,8 @@ static field_t _Facility_UUIE_fastStart[] = { /* SEQUENCE OF */ static field_t _Facility_UUIE[] = { /* SEQUENCE */ {FNAME("protocolIdentifier") OID, BYTE, 0, 0, SKIP, 0, NULL}, - {FNAME("alternativeAddress") CHOICE, 3, 7, 7, SKIP | EXT | OPT, 0, - _TransportAddress}, + {FNAME("alternativeAddress") CHOICE, 3, 7, 7, DECODE | EXT | OPT, + offsetof(Facility_UUIE, alternativeAddress), _TransportAddress}, {FNAME("alternativeAliasAddress") SEQOF, SEMI, 0, 0, SKIP | OPT, 0, _Facility_UUIE_alternativeAliasAddress}, {FNAME("conferenceID") OCTSTR, FIXD, 16, 0, SKIP | OPT, 0, NULL}, diff --git a/net/ipv4/netfilter/ip_nat_helper_h323.c b/net/ipv4/netfilter/ip_nat_helper_h323.c index d45663d137a7..419b878fb467 100644 --- a/net/ipv4/netfilter/ip_nat_helper_h323.c +++ b/net/ipv4/netfilter/ip_nat_helper_h323.c @@ -486,6 +486,80 @@ static int nat_q931(struct sk_buff **pskb, struct ip_conntrack *ct, return 0; } +/****************************************************************************/ +static void ip_nat_callforwarding_expect(struct ip_conntrack *new, + struct ip_conntrack_expect *this) +{ + struct ip_nat_range range; + + /* This must be a fresh one. */ + BUG_ON(new->status & IPS_NAT_DONE_MASK); + + /* Change src to where master sends to */ + range.flags = IP_NAT_RANGE_MAP_IPS; + range.min_ip = range.max_ip = new->tuplehash[!this->dir].tuple.src.ip; + + /* hook doesn't matter, but it has to do source manip */ + ip_nat_setup_info(new, &range, NF_IP_POST_ROUTING); + + /* For DST manip, map port here to where it's expected. */ + range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED); + range.min = range.max = this->saved_proto; + range.min_ip = range.max_ip = this->saved_ip; + + /* hook doesn't matter, but it has to do destination manip */ + ip_nat_setup_info(new, &range, NF_IP_PRE_ROUTING); + + ip_conntrack_q931_expect(new, this); +} + +/****************************************************************************/ +static int nat_callforwarding(struct sk_buff **pskb, struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + unsigned char **data, int dataoff, + TransportAddress * addr, u_int16_t port, + struct ip_conntrack_expect *exp) +{ + int dir = CTINFO2DIR(ctinfo); + u_int16_t nated_port; + + /* Set expectations for NAT */ + exp->saved_ip = exp->tuple.dst.ip; + exp->tuple.dst.ip = ct->tuplehash[!dir].tuple.dst.ip; + exp->saved_proto.tcp.port = exp->tuple.dst.u.tcp.port; + exp->expectfn = ip_nat_callforwarding_expect; + exp->dir = !dir; + + /* Try to get same port: if not, try to change it. */ + for (nated_port = port; nated_port != 0; nated_port++) { + exp->tuple.dst.u.tcp.port = htons(nated_port); + if (ip_conntrack_expect_related(exp) == 0) + break; + } + + if (nated_port == 0) { /* No port available */ + if (net_ratelimit()) + printk("ip_nat_q931: out of TCP ports\n"); + return 0; + } + + /* Modify signal */ + if (!set_h225_addr(pskb, data, dataoff, addr, + ct->tuplehash[!dir].tuple.dst.ip, + nated_port) == 0) { + ip_conntrack_unexpect_related(exp); + return -1; + } + + /* Success */ + DEBUGP("ip_nat_q931: expect Call Forwarding " + "%u.%u.%u.%u:%hu->%u.%u.%u.%u:%hu\n", + NIPQUAD(exp->tuple.src.ip), ntohs(exp->tuple.src.u.tcp.port), + NIPQUAD(exp->tuple.dst.ip), ntohs(exp->tuple.dst.u.tcp.port)); + + return 0; +} + /****************************************************************************/ static int __init init(void) { @@ -496,6 +570,7 @@ static int __init init(void) BUG_ON(nat_rtp_rtcp_hook != NULL); BUG_ON(nat_t120_hook != NULL); BUG_ON(nat_h245_hook != NULL); + BUG_ON(nat_callforwarding_hook != NULL); BUG_ON(nat_q931_hook != NULL); set_h245_addr_hook = set_h245_addr; @@ -505,6 +580,7 @@ static int __init init(void) nat_rtp_rtcp_hook = nat_rtp_rtcp; nat_t120_hook = nat_t120; nat_h245_hook = nat_h245; + nat_callforwarding_hook = nat_callforwarding; nat_q931_hook = nat_q931; DEBUGP("ip_nat_h323: init success\n"); @@ -521,6 +597,7 @@ static void __exit fini(void) nat_rtp_rtcp_hook = NULL; nat_t120_hook = NULL; nat_h245_hook = NULL; + nat_callforwarding_hook = NULL; nat_q931_hook = NULL; synchronize_net(); } -- cgit v1.2.3-59-g8ed1b From e44ab66a75e20c02193440a5e27c16c91630109b Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:26:47 -0700 Subject: [NETFILTER]: H.323 helper: replace internal_net_addr parameter by routing-based heuristic Call Forwarding doesn't need to create an expectation if both peers can reach each other without our help. The internal_net_addr parameter lets the user explicitly specify a single network where this is true, but is not very flexible and even fails in the common case that calls will both be forwarded to outside parties and inside parties. Use an optional heuristic based on routing instead, the assumption is that if bpth the outgoing device and the gateway are equal, both peers can reach each other directly. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_helper_h323.c | 57 +++++++++++++-------------- 1 file changed, 27 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_helper_h323.c b/net/ipv4/netfilter/ip_conntrack_helper_h323.c index 3052468a6ab1..0665674218c6 100644 --- a/net/ipv4/netfilter/ip_conntrack_helper_h323.c +++ b/net/ipv4/netfilter/ip_conntrack_helper_h323.c @@ -40,12 +40,11 @@ static int gkrouted_only = 1; module_param(gkrouted_only, int, 0600); MODULE_PARM_DESC(gkrouted_only, "only accept calls from gatekeeper"); -static char *internal_net = NULL; -static u_int32_t internal_net_addr = 0; -static u_int32_t internal_net_mask = 0; -module_param(internal_net, charp, 0600); -MODULE_PARM_DESC(internal_net, "specify your internal network using format " - "address/mask. this is used by call forwarding support"); +static int callforward_filter = 1; +module_param(callforward_filter, bool, 0600); +MODULE_PARM_DESC(callforward_filter, "only create call forwarding expectations " + "if both endpoints are on different sides " + "(determined by routing information)"); /* Hooks for NAT */ int (*set_h245_addr_hook) (struct sk_buff ** pskb, @@ -721,12 +720,28 @@ static int expect_callforwarding(struct sk_buff **pskb, /* If the calling party is on the same side of the forward-to party, * we don't need to track the second call */ - if (internal_net && - ((ip & internal_net_mask) == internal_net_addr) == - ((ct->tuplehash[!dir].tuple.src.ip & internal_net_mask) == - internal_net_addr)) { - DEBUGP("ip_ct_q931: Call Forwarding not tracked\n"); - return 0; + if (callforward_filter) { + struct rtable *rt1, *rt2; + struct flowi fl1 = { + .fl4_dst = ip, + }; + struct flowi fl2 = { + .fl4_dst = ct->tuplehash[!dir].tuple.src.ip, + }; + + if (ip_route_output_key(&rt1, &fl1) == 0) { + if (ip_route_output_key(&rt2, &fl2) == 0) { + if (rt1->rt_gateway == rt2->rt_gateway && + rt1->u.dst.dev == rt2->u.dst.dev) + ret = 1; + dst_release(&rt2->u.dst); + } + dst_release(&rt1->u.dst); + } + if (ret) { + DEBUGP("ip_ct_q931: Call Forwarding not tracked\n"); + return 0; + } } /* Create expect for the second call leg */ @@ -1762,7 +1777,6 @@ static void fini(void) static int __init init(void) { int ret; - char *p; h323_buffer = kmalloc(65536, GFP_KERNEL); if (!h323_buffer) @@ -1772,23 +1786,6 @@ static int __init init(void) fini(); return ret; } - - if (internal_net) { - if ((p = strchr(internal_net, '/'))) - *p++ = 0; - if (isdigit(internal_net[0])) { - internal_net_addr = in_aton(internal_net); - if (p && isdigit(p[0])) - internal_net_mask = in_aton(p); - else - internal_net_mask = 0xffffffff; - internal_net_addr &= internal_net_mask; - } - DEBUGP("ip_ct_h323: internal_net = %u.%u.%u.%u/%u.%u.%u.%u\n", - NIPQUAD(internal_net_addr), - NIPQUAD(internal_net_mask)); - } - DEBUGP("ip_ct_h323: init success\n"); return 0; } -- cgit v1.2.3-59-g8ed1b From ae5b7d8ba2c28d7d9835856fe0ca5f6ec95ea768 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Mon, 29 May 2006 18:27:09 -0700 Subject: [NETFILTER]: Add SIP connection tracking helper Add SIP connection tracking helper. Originally written by Christian Hentschel , some cleanup, minor fixes and bidirectional SIP support added by myself. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- include/linux/netfilter_ipv4/ip_conntrack_sip.h | 44 +++ net/ipv4/netfilter/Kconfig | 18 + net/ipv4/netfilter/Makefile | 2 + net/ipv4/netfilter/ip_conntrack_sip.c | 471 ++++++++++++++++++++++++ net/ipv4/netfilter/ip_nat_sip.c | 249 +++++++++++++ 5 files changed, 784 insertions(+) create mode 100644 include/linux/netfilter_ipv4/ip_conntrack_sip.h create mode 100644 net/ipv4/netfilter/ip_conntrack_sip.c create mode 100644 net/ipv4/netfilter/ip_nat_sip.c (limited to 'net') diff --git a/include/linux/netfilter_ipv4/ip_conntrack_sip.h b/include/linux/netfilter_ipv4/ip_conntrack_sip.h new file mode 100644 index 000000000000..913dad66c0fb --- /dev/null +++ b/include/linux/netfilter_ipv4/ip_conntrack_sip.h @@ -0,0 +1,44 @@ +#ifndef __IP_CONNTRACK_SIP_H__ +#define __IP_CONNTRACK_SIP_H__ +#ifdef __KERNEL__ + +#define SIP_PORT 5060 +#define SIP_TIMEOUT 3600 + +#define POS_VIA 0 +#define POS_CONTACT 1 +#define POS_CONTENT 2 +#define POS_MEDIA 3 +#define POS_OWNER 4 +#define POS_CONNECTION 5 +#define POS_REQ_HEADER 6 +#define POS_SDP_HEADER 7 + +struct sip_header_nfo { + const char *lname; + const char *sname; + const char *ln_str; + size_t lnlen; + size_t snlen; + size_t ln_strlen; + int (*match_len)(const char *, const char *, int *); +}; + +extern unsigned int (*ip_nat_sip_hook)(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + struct ip_conntrack *ct, + const char **dptr); +extern unsigned int (*ip_nat_sdp_hook)(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + struct ip_conntrack_expect *exp, + const char *dptr); + +extern int ct_sip_get_info(const char *dptr, size_t dlen, + unsigned int *matchoff, + unsigned int *matchlen, + struct sip_header_nfo *hnfo); +extern int ct_sip_lnlen(const char *line, const char *limit); +extern const char *ct_sip_search(const char *needle, const char *haystack, + size_t needle_len, size_t haystack_len); +#endif /* __KERNEL__ */ +#endif /* __IP_CONNTRACK_SIP_H__ */ diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index f86aeda3a5a0..ff4b118f14a9 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -191,6 +191,18 @@ config IP_NF_H323 If you want to compile it as a module, say 'M' here and read Documentation/modules.txt. If unsure, say 'N'. +config IP_NF_SIP + tristate "SIP protocol support (EXPERIMENTAL)" + depends on IP_NF_CONNTRACK && EXPERIMENTAL + help + SIP is an application-layer control protocol that can establish, + modify, and terminate multimedia sessions (conferences) such as + Internet telephony calls. With the ip_conntrack_sip and + the ip_nat_sip modules you can support the protocol on a connection + tracking/NATing firewall. + + To compile it as a module, choose M here. If unsure, say Y. + config IP_NF_QUEUE tristate "IP Userspace queueing via NETLINK (OBSOLETE)" help @@ -503,6 +515,12 @@ config IP_NF_NAT_H323 default IP_NF_NAT if IP_NF_H323=y default m if IP_NF_H323=m +config IP_NF_NAT_SIP + tristate + depends on IP_NF_IPTABLES!=n && IP_NF_CONNTRACK!=n && IP_NF_NAT!=n + default IP_NF_NAT if IP_NF_SIP=y + default m if IP_NF_SIP=m + # mangle + specific targets config IP_NF_MANGLE tristate "Packet mangling" diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile index 461cb1eb5de7..3ded4a3af59c 100644 --- a/net/ipv4/netfilter/Makefile +++ b/net/ipv4/netfilter/Makefile @@ -31,6 +31,7 @@ obj-$(CONFIG_IP_NF_AMANDA) += ip_conntrack_amanda.o obj-$(CONFIG_IP_NF_TFTP) += ip_conntrack_tftp.o obj-$(CONFIG_IP_NF_FTP) += ip_conntrack_ftp.o obj-$(CONFIG_IP_NF_IRC) += ip_conntrack_irc.o +obj-$(CONFIG_IP_NF_SIP) += ip_conntrack_sip.o obj-$(CONFIG_IP_NF_NETBIOS_NS) += ip_conntrack_netbios_ns.o # NAT helpers @@ -40,6 +41,7 @@ obj-$(CONFIG_IP_NF_NAT_AMANDA) += ip_nat_amanda.o obj-$(CONFIG_IP_NF_NAT_TFTP) += ip_nat_tftp.o obj-$(CONFIG_IP_NF_NAT_FTP) += ip_nat_ftp.o obj-$(CONFIG_IP_NF_NAT_IRC) += ip_nat_irc.o +obj-$(CONFIG_IP_NF_NAT_SIP) += ip_nat_sip.o # generic IP tables obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o diff --git a/net/ipv4/netfilter/ip_conntrack_sip.c b/net/ipv4/netfilter/ip_conntrack_sip.c new file mode 100644 index 000000000000..fc87ce0da40d --- /dev/null +++ b/net/ipv4/netfilter/ip_conntrack_sip.c @@ -0,0 +1,471 @@ +/* SIP extension for IP connection tracking. + * + * (C) 2005 by Christian Hentschel + * based on RR's ip_conntrack_ftp.c and other modules. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Christian Hentschel "); +MODULE_DESCRIPTION("SIP connection tracking helper"); + +#define MAX_PORTS 8 +static unsigned short ports[MAX_PORTS]; +static int ports_c; +module_param_array(ports, ushort, &ports_c, 0400); +MODULE_PARM_DESC(ports, "port numbers of sip servers"); + +static unsigned int sip_timeout = SIP_TIMEOUT; +module_param(sip_timeout, uint, 0600); +MODULE_PARM_DESC(sip_timeout, "timeout for the master SIP session"); + +unsigned int (*ip_nat_sip_hook)(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + struct ip_conntrack *ct, + const char **dptr); +EXPORT_SYMBOL_GPL(ip_nat_sip_hook); + +unsigned int (*ip_nat_sdp_hook)(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + struct ip_conntrack_expect *exp, + const char *dptr); +EXPORT_SYMBOL_GPL(ip_nat_sdp_hook); + +int ct_sip_get_info(const char *dptr, size_t dlen, + unsigned int *matchoff, + unsigned int *matchlen, + struct sip_header_nfo *hnfo); +EXPORT_SYMBOL_GPL(ct_sip_get_info); + + +static int digits_len(const char *dptr, const char *limit, int *shift); +static int epaddr_len(const char *dptr, const char *limit, int *shift); +static int skp_digits_len(const char *dptr, const char *limit, int *shift); +static int skp_epaddr_len(const char *dptr, const char *limit, int *shift); + +struct sip_header_nfo ct_sip_hdrs[] = { + { /* Via header */ + .lname = "Via:", + .lnlen = sizeof("Via:") - 1, + .sname = "\r\nv:", + .snlen = sizeof("\r\nv:") - 1, /* rfc3261 "\r\n" */ + .ln_str = "UDP ", + .ln_strlen = sizeof("UDP ") - 1, + .match_len = epaddr_len, + }, + { /* Contact header */ + .lname = "Contact:", + .lnlen = sizeof("Contact:") - 1, + .sname = "\r\nm:", + .snlen = sizeof("\r\nm:") - 1, + .ln_str = "sip:", + .ln_strlen = sizeof("sip:") - 1, + .match_len = skp_epaddr_len + }, + { /* Content length header */ + .lname = "Content-Length:", + .lnlen = sizeof("Content-Length:") - 1, + .sname = "\r\nl:", + .snlen = sizeof("\r\nl:") - 1, + .ln_str = ":", + .ln_strlen = sizeof(":") - 1, + .match_len = skp_digits_len + }, + { /* SDP media info */ + .lname = "\nm=", + .lnlen = sizeof("\nm=") - 1, + .sname = "\rm=", + .snlen = sizeof("\rm=") - 1, + .ln_str = "audio ", + .ln_strlen = sizeof("audio ") - 1, + .match_len = digits_len + }, + { /* SDP owner address*/ + .lname = "\no=", + .lnlen = sizeof("\no=") - 1, + .sname = "\ro=", + .snlen = sizeof("\ro=") - 1, + .ln_str = "IN IP4 ", + .ln_strlen = sizeof("IN IP4 ") - 1, + .match_len = epaddr_len + }, + { /* SDP connection info */ + .lname = "\nc=", + .lnlen = sizeof("\nc=") - 1, + .sname = "\rc=", + .snlen = sizeof("\rc=") - 1, + .ln_str = "IN IP4 ", + .ln_strlen = sizeof("IN IP4 ") - 1, + .match_len = epaddr_len + }, + { /* Requests headers */ + .lname = "sip:", + .lnlen = sizeof("sip:") - 1, + .sname = "sip:", + .snlen = sizeof("sip:") - 1, /* yes, i know.. ;) */ + .ln_str = "@", + .ln_strlen = sizeof("@") - 1, + .match_len = epaddr_len + }, + { /* SDP version header */ + .lname = "\nv=", + .lnlen = sizeof("\nv=") - 1, + .sname = "\rv=", + .snlen = sizeof("\rv=") - 1, + .ln_str = "=", + .ln_strlen = sizeof("=") - 1, + .match_len = digits_len + } +}; +EXPORT_SYMBOL_GPL(ct_sip_hdrs); + +/* get line lenght until first CR or LF seen. */ +int ct_sip_lnlen(const char *line, const char *limit) +{ + const char *k = line; + + while ((line <= limit) && (*line == '\r' || *line == '\n')) + line++; + + while (line <= limit) { + if (*line == '\r' || *line == '\n') + break; + line++; + } + return line - k; +} +EXPORT_SYMBOL_GPL(ct_sip_lnlen); + +/* Linear string search, case sensitive. */ +const char *ct_sip_search(const char *needle, const char *haystack, + size_t needle_len, size_t haystack_len) +{ + const char *limit = haystack + (haystack_len - needle_len); + + while (haystack <= limit) { + if (memcmp(haystack, needle, needle_len) == 0) + return haystack; + haystack++; + } + return NULL; +} +EXPORT_SYMBOL_GPL(ct_sip_search); + +static int digits_len(const char *dptr, const char *limit, int *shift) +{ + int len = 0; + while (dptr <= limit && isdigit(*dptr)) { + dptr++; + len++; + } + return len; +} + +/* get digits lenght, skiping blank spaces. */ +static int skp_digits_len(const char *dptr, const char *limit, int *shift) +{ + for (; dptr <= limit && *dptr == ' '; dptr++) + (*shift)++; + + return digits_len(dptr, limit, shift); +} + +/* Simple ipaddr parser.. */ +static int parse_ipaddr(const char *cp, const char **endp, + u_int32_t *ipaddr, const char *limit) +{ + unsigned long int val; + int i, digit = 0; + + for (i = 0, *ipaddr = 0; cp <= limit && i < 4; i++) { + digit = 0; + if (!isdigit(*cp)) + break; + + val = simple_strtoul(cp, (char **)&cp, 10); + if (val > 0xFF) + return -1; + + ((u_int8_t *)ipaddr)[i] = val; + digit = 1; + + if (*cp != '.') + break; + cp++; + } + if (!digit) + return -1; + + if (endp) + *endp = cp; + + return 0; +} + +/* skip ip address. returns it lenght. */ +static int epaddr_len(const char *dptr, const char *limit, int *shift) +{ + const char *aux = dptr; + u_int32_t ip; + + if (parse_ipaddr(dptr, &dptr, &ip, limit) < 0) { + DEBUGP("ip: %s parse failed.!\n", dptr); + return 0; + } + + /* Port number */ + if (*dptr == ':') { + dptr++; + dptr += digits_len(dptr, limit, shift); + } + return dptr - aux; +} + +/* get address length, skiping user info. */ +static int skp_epaddr_len(const char *dptr, const char *limit, int *shift) +{ + int s = *shift; + + for (; dptr <= limit && *dptr != '@'; dptr++) + (*shift)++; + + if (*dptr == '@') { + dptr++; + (*shift)++; + } else + *shift = s; + + return epaddr_len(dptr, limit, shift); +} + +/* Returns 0 if not found, -1 error parsing. */ +int ct_sip_get_info(const char *dptr, size_t dlen, + unsigned int *matchoff, + unsigned int *matchlen, + struct sip_header_nfo *hnfo) +{ + const char *limit, *aux, *k = dptr; + int shift = 0; + + limit = dptr + (dlen - hnfo->lnlen); + + while (dptr <= limit) { + if ((strncmp(dptr, hnfo->lname, hnfo->lnlen) != 0) && + (strncmp(dptr, hnfo->sname, hnfo->snlen) != 0)) { + dptr++; + continue; + } + aux = ct_sip_search(hnfo->ln_str, dptr, hnfo->ln_strlen, + ct_sip_lnlen(dptr, limit)); + if (!aux) { + DEBUGP("'%s' not found in '%s'.\n", hnfo->ln_str, + hnfo->lname); + return -1; + } + aux += hnfo->ln_strlen; + + *matchlen = hnfo->match_len(aux, limit, &shift); + if (!*matchlen) + return -1; + + *matchoff = (aux - k) + shift; + + DEBUGP("%s match succeeded! - len: %u\n", hnfo->lname, + *matchlen); + return 1; + } + DEBUGP("%s header not found.\n", hnfo->lname); + return 0; +} + +static int set_expected_rtp(struct sk_buff **pskb, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo, + u_int32_t ipaddr, u_int16_t port, + const char *dptr) +{ + struct ip_conntrack_expect *exp; + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + int ret; + + exp = ip_conntrack_expect_alloc(ct); + if (exp == NULL) + return NF_DROP; + + exp->tuple.src.ip = ct->tuplehash[!dir].tuple.src.ip; + exp->tuple.src.u.udp.port = 0; + exp->tuple.dst.ip = ipaddr; + exp->tuple.dst.u.udp.port = htons(port); + exp->tuple.dst.protonum = IPPROTO_UDP; + + exp->mask.src.ip = 0xFFFFFFFF; + exp->mask.src.u.udp.port = 0; + exp->mask.dst.ip = 0xFFFFFFFF; + exp->mask.dst.u.udp.port = 0xFFFF; + exp->mask.dst.protonum = 0xFF; + + exp->expectfn = NULL; + exp->flags = 0; + + if (ip_nat_sdp_hook) + ret = ip_nat_sdp_hook(pskb, ctinfo, exp, dptr); + else { + if (ip_conntrack_expect_related(exp) != 0) + ret = NF_DROP; + else + ret = NF_ACCEPT; + } + ip_conntrack_expect_put(exp); + + return ret; +} + +static int sip_help(struct sk_buff **pskb, + struct ip_conntrack *ct, + enum ip_conntrack_info ctinfo) +{ + unsigned int dataoff, datalen; + const char *dptr; + int ret = NF_ACCEPT; + int matchoff, matchlen; + u_int32_t ipaddr; + u_int16_t port; + + /* No Data ? */ + dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); + if (dataoff >= (*pskb)->len) { + DEBUGP("skb->len = %u\n", (*pskb)->len); + return NF_ACCEPT; + } + + ip_ct_refresh(ct, *pskb, sip_timeout * HZ); + + if (!skb_is_nonlinear(*pskb)) + dptr = (*pskb)->data + dataoff; + else { + DEBUGP("Copy of skbuff not supported yet.\n"); + goto out; + } + + if (ip_nat_sip_hook) { + if (!ip_nat_sip_hook(pskb, ctinfo, ct, &dptr)) { + ret = NF_DROP; + goto out; + } + } + + /* After this point NAT, could have mangled skb, so + we need to recalculate payload lenght. */ + datalen = (*pskb)->len - dataoff; + + if (datalen < (sizeof("SIP/2.0 200") - 1)) + goto out; + + /* RTP info only in some SDP pkts */ + if (memcmp(dptr, "INVITE", sizeof("INVITE") - 1) != 0 && + memcmp(dptr, "SIP/2.0 200", sizeof("SIP/2.0 200") - 1) != 0) { + goto out; + } + /* Get ip and port address from SDP packet. */ + if (ct_sip_get_info(dptr, datalen, &matchoff, &matchlen, + &ct_sip_hdrs[POS_CONNECTION]) > 0) { + + /* We'll drop only if there are parse problems. */ + if (parse_ipaddr(dptr + matchoff, NULL, &ipaddr, + dptr + datalen) < 0) { + ret = NF_DROP; + goto out; + } + if (ct_sip_get_info(dptr, datalen, &matchoff, &matchlen, + &ct_sip_hdrs[POS_MEDIA]) > 0) { + + port = simple_strtoul(dptr + matchoff, NULL, 10); + if (port < 1024) { + ret = NF_DROP; + goto out; + } + ret = set_expected_rtp(pskb, ct, ctinfo, + ipaddr, port, dptr); + } + } +out: + return ret; +} + +static struct ip_conntrack_helper sip[MAX_PORTS]; +static char sip_names[MAX_PORTS][10]; + +static void fini(void) +{ + int i; + for (i = 0; i < ports_c; i++) { + DEBUGP("unregistering helper for port %d\n", ports[i]); + ip_conntrack_helper_unregister(&sip[i]); + } +} + +static int __init init(void) +{ + int i, ret; + char *tmpname; + + if (ports_c == 0) + ports[ports_c++] = SIP_PORT; + + for (i = 0; i < ports_c; i++) { + /* Create helper structure */ + memset(&sip[i], 0, sizeof(struct ip_conntrack_helper)); + + sip[i].tuple.dst.protonum = IPPROTO_UDP; + sip[i].tuple.src.u.udp.port = htons(ports[i]); + sip[i].mask.src.u.udp.port = 0xFFFF; + sip[i].mask.dst.protonum = 0xFF; + sip[i].max_expected = 1; + sip[i].timeout = 3 * 60; /* 3 minutes */ + sip[i].me = THIS_MODULE; + sip[i].help = sip_help; + + tmpname = &sip_names[i][0]; + if (ports[i] == SIP_PORT) + sprintf(tmpname, "sip"); + else + sprintf(tmpname, "sip-%d", i); + sip[i].name = tmpname; + + DEBUGP("port #%d: %d\n", i, ports[i]); + + ret = ip_conntrack_helper_register(&sip[i]); + if (ret) { + printk("ERROR registering helper for port %d\n", + ports[i]); + fini(); + return ret; + } + } + return 0; +} + +module_init(init); +module_exit(fini); diff --git a/net/ipv4/netfilter/ip_nat_sip.c b/net/ipv4/netfilter/ip_nat_sip.c new file mode 100644 index 000000000000..6ffba63adca2 --- /dev/null +++ b/net/ipv4/netfilter/ip_nat_sip.c @@ -0,0 +1,249 @@ +/* SIP extension for UDP NAT alteration. + * + * (C) 2005 by Christian Hentschel + * based on RR's ip_nat_ftp.c and other modules. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Christian Hentschel "); +MODULE_DESCRIPTION("SIP NAT helper"); + +#if 0 +#define DEBUGP printk +#else +#define DEBUGP(format, args...) +#endif + +extern struct sip_header_nfo ct_sip_hdrs[]; + +static unsigned int mangle_sip_packet(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + struct ip_conntrack *ct, + const char **dptr, size_t dlen, + char *buffer, int bufflen, + struct sip_header_nfo *hnfo) +{ + unsigned int matchlen, matchoff; + + if (ct_sip_get_info(*dptr, dlen, &matchoff, &matchlen, hnfo) <= 0) + return 0; + + if (!ip_nat_mangle_udp_packet(pskb, ct, ctinfo, + matchoff, matchlen, buffer, bufflen)) + return 0; + + /* We need to reload this. Thanks Patrick. */ + *dptr = (*pskb)->data + (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); + return 1; +} + +static unsigned int ip_nat_sip(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + struct ip_conntrack *ct, + const char **dptr) +{ + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + char buffer[sizeof("nnn.nnn.nnn.nnn:nnnnn")]; + unsigned int bufflen, dataoff; + u_int32_t ip; + u_int16_t port; + + dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); + + ip = ct->tuplehash[!dir].tuple.dst.ip; + port = ct->tuplehash[!dir].tuple.dst.u.udp.port; + bufflen = sprintf(buffer, "%u.%u.%u.%u:%u", NIPQUAD(ip), ntohs(port)); + + /* short packet ? */ + if (((*pskb)->len - dataoff) < (sizeof("SIP/2.0") - 1)) + return 0; + + /* Basic rules: requests and responses. */ + if (memcmp(*dptr, "SIP/2.0", sizeof("SIP/2.0") - 1) == 0) { + const char *aux; + + if ((ctinfo) < IP_CT_IS_REPLY) { + mangle_sip_packet(pskb, ctinfo, ct, dptr, + (*pskb)->len - dataoff, + buffer, bufflen, + &ct_sip_hdrs[POS_CONTACT]); + return 1; + } + + if (!mangle_sip_packet(pskb, ctinfo, ct, dptr, + (*pskb)->len - dataoff, + buffer, bufflen, &ct_sip_hdrs[POS_VIA])) + return 0; + + /* This search should ignore case, but later.. */ + aux = ct_sip_search("CSeq:", *dptr, sizeof("CSeq:") - 1, + (*pskb)->len - dataoff); + if (!aux) + return 0; + + if (!ct_sip_search("REGISTER", aux, sizeof("REGISTER"), + ct_sip_lnlen(aux, *dptr + (*pskb)->len - dataoff))) + return 1; + + return mangle_sip_packet(pskb, ctinfo, ct, dptr, + (*pskb)->len - dataoff, + buffer, bufflen, + &ct_sip_hdrs[POS_CONTACT]); + } + if ((ctinfo) < IP_CT_IS_REPLY) { + if (!mangle_sip_packet(pskb, ctinfo, ct, dptr, + (*pskb)->len - dataoff, + buffer, bufflen, &ct_sip_hdrs[POS_VIA])) + return 0; + + /* Mangle Contact if exists only. - watch udp_nat_mangle()! */ + mangle_sip_packet(pskb, ctinfo, ct, dptr, (*pskb)->len - dataoff, + buffer, bufflen, &ct_sip_hdrs[POS_CONTACT]); + return 1; + } + /* This mangle requests headers. */ + return mangle_sip_packet(pskb, ctinfo, ct, dptr, + ct_sip_lnlen(*dptr, + *dptr + (*pskb)->len - dataoff), + buffer, bufflen, &ct_sip_hdrs[POS_REQ_HEADER]); +} + +static int mangle_content_len(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + struct ip_conntrack *ct, + const char *dptr) +{ + unsigned int dataoff, matchoff, matchlen; + char buffer[sizeof("65536")]; + int bufflen; + + dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); + + /* Get actual SDP lenght */ + if (ct_sip_get_info(dptr, (*pskb)->len - dataoff, &matchoff, + &matchlen, &ct_sip_hdrs[POS_SDP_HEADER]) > 0) { + + /* since ct_sip_get_info() give us a pointer passing 'v=' + we need to add 2 bytes in this count. */ + int c_len = (*pskb)->len - dataoff - matchoff + 2; + + /* Now, update SDP lenght */ + if (ct_sip_get_info(dptr, (*pskb)->len - dataoff, &matchoff, + &matchlen, &ct_sip_hdrs[POS_CONTENT]) > 0) { + + bufflen = sprintf(buffer, "%u", c_len); + + return ip_nat_mangle_udp_packet(pskb, ct, ctinfo, + matchoff, matchlen, + buffer, bufflen); + } + } + return 0; +} + +static unsigned int mangle_sdp(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + struct ip_conntrack *ct, + u_int32_t newip, u_int16_t port, + const char *dptr) +{ + char buffer[sizeof("nnn.nnn.nnn.nnn")]; + unsigned int dataoff, bufflen; + + dataoff = (*pskb)->nh.iph->ihl*4 + sizeof(struct udphdr); + + /* Mangle owner and contact info. */ + bufflen = sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(newip)); + if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff, + buffer, bufflen, &ct_sip_hdrs[POS_OWNER])) + return 0; + + if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff, + buffer, bufflen, &ct_sip_hdrs[POS_CONNECTION])) + return 0; + + /* Mangle media port. */ + bufflen = sprintf(buffer, "%u", port); + if (!mangle_sip_packet(pskb, ctinfo, ct, &dptr, (*pskb)->len - dataoff, + buffer, bufflen, &ct_sip_hdrs[POS_MEDIA])) + return 0; + + return mangle_content_len(pskb, ctinfo, ct, dptr); +} + +/* So, this packet has hit the connection tracking matching code. + Mangle it, and change the expectation to match the new version. */ +static unsigned int ip_nat_sdp(struct sk_buff **pskb, + enum ip_conntrack_info ctinfo, + struct ip_conntrack_expect *exp, + const char *dptr) +{ + struct ip_conntrack *ct = exp->master; + enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); + u_int32_t newip; + u_int16_t port; + + DEBUGP("ip_nat_sdp():\n"); + + /* Connection will come from reply */ + newip = ct->tuplehash[!dir].tuple.dst.ip; + + exp->tuple.dst.ip = newip; + exp->saved_proto.udp.port = exp->tuple.dst.u.udp.port; + exp->dir = !dir; + + /* When you see the packet, we need to NAT it the same as the + this one. */ + exp->expectfn = ip_nat_follow_master; + + /* Try to get same port: if not, try to change it. */ + for (port = ntohs(exp->saved_proto.udp.port); port != 0; port++) { + exp->tuple.dst.u.udp.port = htons(port); + if (ip_conntrack_expect_related(exp) == 0) + break; + } + + if (port == 0) + return NF_DROP; + + if (!mangle_sdp(pskb, ctinfo, ct, newip, port, dptr)) { + ip_conntrack_unexpect_related(exp); + return NF_DROP; + } + return NF_ACCEPT; +} + +static void __exit fini(void) +{ + ip_nat_sip_hook = NULL; + ip_nat_sdp_hook = NULL; + /* Make sure noone calls it, meanwhile. */ + synchronize_net(); +} + +static int __init init(void) +{ + BUG_ON(ip_nat_sip_hook); + BUG_ON(ip_nat_sdp_hook); + ip_nat_sip_hook = ip_nat_sip; + ip_nat_sdp_hook = ip_nat_sdp; + return 0; +} + +module_init(init); +module_exit(fini); -- cgit v1.2.3-59-g8ed1b From c45fb1089e714146206d7e295ff337893424c874 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 29 May 2006 18:27:32 -0700 Subject: [NETFILTER]: PPTP helper: fixup gre_keymap_lookup() return type GRE keys are 16-bit wide. Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ip_conntrack_proto_gre.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ip_conntrack_proto_gre.c b/net/ipv4/netfilter/ip_conntrack_proto_gre.c index 56794797d55b..21ee124c0463 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_gre.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_gre.c @@ -77,10 +77,10 @@ static inline int gre_key_cmpfn(const struct ip_ct_gre_keymap *km, } /* look up the source key for a given tuple */ -static u_int32_t gre_keymap_lookup(struct ip_conntrack_tuple *t) +static __be16 gre_keymap_lookup(struct ip_conntrack_tuple *t) { struct ip_ct_gre_keymap *km; - u_int32_t key = 0; + __be16 key = 0; read_lock_bh(&ip_ct_gre_lock); km = LIST_FIND(&gre_keymap_list, gre_key_cmpfn, @@ -190,7 +190,7 @@ static int gre_pkt_to_tuple(const struct sk_buff *skb, struct ip_conntrack_tuple *tuple) { struct gre_hdr_pptp _pgrehdr, *pgrehdr; - u_int32_t srckey; + __be16 srckey; struct gre_hdr _grehdr, *grehdr; /* first only delinearize old RFC1701 GRE header */ -- cgit v1.2.3-59-g8ed1b From 2f45c340e09242641d4f11498c3be48b35abb926 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 2 Jun 2006 16:29:20 -0700 Subject: [LLC]: Fix double receive of SKB. Oops fix from Stephen: remove duplicate rcv() calls. Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- net/llc/llc_input.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c index 32cf8ac8ea80..94d2368ade92 100644 --- a/net/llc/llc_input.c +++ b/net/llc/llc_input.c @@ -176,7 +176,6 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev, struct sk_buff *cskb = skb_clone(skb, GFP_ATOMIC); if (cskb) rcv(cskb, dev, pt, orig_dev); - rcv(skb, dev, pt, orig_dev); } dest = llc_pdu_type(skb); if (unlikely(!dest || !llc_type_handlers[dest - 1])) -- cgit v1.2.3-59-g8ed1b From 7c106d7e782bd4805f39da30e81018f861b4b8c5 Mon Sep 17 00:00:00 2001 From: Wong Hoi Sing Edison Date: Mon, 5 Jun 2006 17:27:58 -0700 Subject: [TCP]: TCP Low Priority congestion control TCP Low Priority is a distributed algorithm whose goal is to utilize only the excess network bandwidth as compared to the ``fair share`` of bandwidth as targeted by TCP. Available from: http://www.ece.rice.edu/~akuzma/Doc/akuzma/TCP-LP.pdf Original Author: Aleksandar Kuzmanovic See http://www-ece.rice.edu/networks/TCP-LP/ for their implementation. As of 2.6.13, Linux supports pluggable congestion control algorithms. Due to the limitation of the API, we take the following changes from the original TCP-LP implementation: o We use newReno in most core CA handling. Only add some checking within cong_avoid. o Error correcting in remote HZ, therefore remote HZ will be keeped on checking and updating. o Handling calculation of One-Way-Delay (OWD) within rtt_sample, sicne OWD have a similar meaning as RTT. Also correct the buggy formular. o Handle reaction for Early Congestion Indication (ECI) within pkts_acked, as mentioned within pseudo code. o OWD is handled in relative format, where local time stamp will in tcp_time_stamp format. Port from 2.4.19 to 2.6.16 as module by: Wong Hoi Sing Edison Hung Hing Lun Signed-off-by: Wong Hoi Sing Edison Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 10 ++ net/ipv4/Makefile | 1 + net/ipv4/tcp_lp.c | 338 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 349 insertions(+) create mode 100644 net/ipv4/tcp_lp.c (limited to 'net') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 35eb70b1448d..ae85149a0434 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -550,6 +550,16 @@ config TCP_CONG_SCALABLE properties, though is known to have fairness issues. See http://www-lce.eng.cam.ac.uk/~ctk21/scalable/ +config TCP_CONG_LP + tristate "TCP Low Priority" + depends on EXPERIMENTAL + default n + ---help--- + TCP Low Priority (TCP-LP), a distributed algorithm whose goal is + to utiliza only the excess network bandwidth as compared to the + ``fair share`` of bandwidth as targeted by TCP. + See http://www-ece.rice.edu/networks/TCP-LP/ + endmenu config TCP_CONG_BIC diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 4cc94482eacc..58a7a4634f70 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -44,6 +44,7 @@ obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o +obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c new file mode 100644 index 000000000000..1f977b6ee9a1 --- /dev/null +++ b/net/ipv4/tcp_lp.c @@ -0,0 +1,338 @@ +/* + * TCP Low Priority (TCP-LP) + * + * TCP Low Priority is a distributed algorithm whose goal is to utilize only + * the excess network bandwidth as compared to the ``fair share`` of + * bandwidth as targeted by TCP. Available from: + * http://www.ece.rice.edu/~akuzma/Doc/akuzma/TCP-LP.pdf + * + * Original Author: + * Aleksandar Kuzmanovic + * + * See http://www-ece.rice.edu/networks/TCP-LP/ for their implementation. + * As of 2.6.13, Linux supports pluggable congestion control algorithms. + * Due to the limitation of the API, we take the following changes from + * the original TCP-LP implementation: + * o We use newReno in most core CA handling. Only add some checking + * within cong_avoid. + * o Error correcting in remote HZ, therefore remote HZ will be keeped + * on checking and updating. + * o Handling calculation of One-Way-Delay (OWD) within rtt_sample, sicne + * OWD have a similar meaning as RTT. Also correct the buggy formular. + * o Handle reaction for Early Congestion Indication (ECI) within + * pkts_acked, as mentioned within pseudo code. + * o OWD is handled in relative format, where local time stamp will in + * tcp_time_stamp format. + * + * Port from 2.4.19 to 2.6.16 as module by: + * Wong Hoi Sing Edison + * Hung Hing Lun + * + * Version: $Id: tcp_lp.c,v 1.22 2006-05-02 18:18:19 hswong3i Exp $ + */ + +#include +#include +#include + +/* resolution of owd */ +#define LP_RESOL 1000 + +/** + * enum tcp_lp_state + * @LP_VALID_RHZ: is remote HZ valid? + * @LP_VALID_OWD: is OWD valid? + * @LP_WITHIN_THR: are we within threshold? + * @LP_WITHIN_INF: are we within inference? + * + * TCP-LP's state flags. + * We create this set of state flag mainly for debugging. + */ +enum tcp_lp_state { + LP_VALID_RHZ = (1 << 0), + LP_VALID_OWD = (1 << 1), + LP_WITHIN_THR = (1 << 3), + LP_WITHIN_INF = (1 << 4), +}; + +/** + * struct lp + * @flag: TCP-LP state flag + * @sowd: smoothed OWD << 3 + * @owd_min: min OWD + * @owd_max: max OWD + * @owd_max_rsv: resrved max owd + * @remote_hz: estimated remote HZ + * @remote_ref_time: remote reference time + * @local_ref_time: local reference time + * @last_drop: time for last active drop + * @inference: current inference + * + * TCP-LP's private struct. + * We get the idea from original TCP-LP implementation where only left those we + * found are really useful. + */ +struct lp { + u32 flag; + u32 sowd; + u32 owd_min; + u32 owd_max; + u32 owd_max_rsv; + u32 remote_hz; + u32 remote_ref_time; + u32 local_ref_time; + u32 last_drop; + u32 inference; +}; + +/** + * tcp_lp_init + * + * Init all required variables. + * Clone the handling from Vegas module implementation. + */ +static void tcp_lp_init(struct sock *sk) +{ + struct lp *lp = inet_csk_ca(sk); + + lp->flag = 0; + lp->sowd = 0; + lp->owd_min = 0xffffffff; + lp->owd_max = 0; + lp->owd_max_rsv = 0; + lp->remote_hz = 0; + lp->remote_ref_time = 0; + lp->local_ref_time = 0; + lp->last_drop = 0; + lp->inference = 0; +} + +/** + * tcp_lp_cong_avoid + * + * Implementation of cong_avoid. + * Will only call newReno CA when away from inference. + * From TCP-LP's paper, this will be handled in additive increasement. + */ +static void tcp_lp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, + int flag) +{ + struct lp *lp = inet_csk_ca(sk); + + if (!(lp->flag & LP_WITHIN_INF)) + tcp_reno_cong_avoid(sk, ack, rtt, in_flight, flag); +} + +/** + * tcp_lp_remote_hz_estimator + * + * Estimate remote HZ. + * We keep on updating the estimated value, where original TCP-LP + * implementation only guest it for once and use forever. + */ +static u32 tcp_lp_remote_hz_estimator(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct lp *lp = inet_csk_ca(sk); + s64 rhz = lp->remote_hz << 6; /* remote HZ << 6 */ + s64 m = 0; + + /* not yet record reference time + * go away!! record it before come back!! */ + if (lp->remote_ref_time == 0 || lp->local_ref_time == 0) + goto out; + + /* we can't calc remote HZ with no different!! */ + if (tp->rx_opt.rcv_tsval == lp->remote_ref_time + || tp->rx_opt.rcv_tsecr == lp->local_ref_time) + goto out; + + m = HZ * (tp->rx_opt.rcv_tsval - + lp->remote_ref_time) / (tp->rx_opt.rcv_tsecr - + lp->local_ref_time); + if (m < 0) + m = -m; + + if (rhz != 0) { + m -= rhz >> 6; /* m is now error in remote HZ est */ + rhz += m; /* 63/64 old + 1/64 new */ + } else + rhz = m << 6; + + /* record time for successful remote HZ calc */ + lp->flag |= LP_VALID_RHZ; + + out: + /* record reference time stamp */ + lp->remote_ref_time = tp->rx_opt.rcv_tsval; + lp->local_ref_time = tp->rx_opt.rcv_tsecr; + + return rhz >> 6; +} + +/** + * tcp_lp_owd_calculator + * + * Calculate one way delay (in relative format). + * Original implement OWD as minus of remote time difference to local time + * difference directly. As this time difference just simply equal to RTT, when + * the network status is stable, remote RTT will equal to local RTT, and result + * OWD into zero. + * It seems to be a bug and so we fixed it. + */ +static u32 tcp_lp_owd_calculator(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct lp *lp = inet_csk_ca(sk); + s64 owd = 0; + + lp->remote_hz = tcp_lp_remote_hz_estimator(sk); + + if (lp->flag & LP_VALID_RHZ) { + owd = + tp->rx_opt.rcv_tsval * (LP_RESOL / lp->remote_hz) - + tp->rx_opt.rcv_tsecr * (LP_RESOL / HZ); + if (owd < 0) + owd = -owd; + } + + if (owd > 0) + lp->flag |= LP_VALID_OWD; + else + lp->flag &= ~LP_VALID_OWD; + + return owd; +} + +/** + * tcp_lp_rtt_sample + * + * Implementation or rtt_sample. + * Will take the following action, + * 1. calc OWD, + * 2. record the min/max OWD, + * 3. calc smoothed OWD (SOWD). + * Most ideas come from the original TCP-LP implementation. + */ +static void tcp_lp_rtt_sample(struct sock *sk, u32 usrtt) +{ + struct lp *lp = inet_csk_ca(sk); + s64 mowd = tcp_lp_owd_calculator(sk); + + /* sorry that we don't have valid data */ + if (!(lp->flag & LP_VALID_RHZ) || !(lp->flag & LP_VALID_OWD)) + return; + + /* record the next min owd */ + if (mowd < lp->owd_min) + lp->owd_min = mowd; + + /* always forget the max of the max + * we just set owd_max as one below it */ + if (mowd > lp->owd_max) { + if (mowd > lp->owd_max_rsv) { + if (lp->owd_max_rsv == 0) + lp->owd_max = mowd; + else + lp->owd_max = lp->owd_max_rsv; + lp->owd_max_rsv = mowd; + } else + lp->owd_max = mowd; + } + + /* calc for smoothed owd */ + if (lp->sowd != 0) { + mowd -= lp->sowd >> 3; /* m is now error in owd est */ + lp->sowd += mowd; /* owd = 7/8 owd + 1/8 new */ + } else + lp->sowd = mowd << 3; /* take the measured time be owd */ +} + +/** + * tcp_lp_pkts_acked + * + * Implementation of pkts_acked. + * Deal with active drop under Early Congestion Indication. + * Only drop to half and 1 will be handle, because we hope to use back + * newReno in increase case. + * We work it out by following the idea from TCP-LP's paper directly + */ +static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct lp *lp = inet_csk_ca(sk); + + /* calc inference */ + if (tcp_time_stamp > tp->rx_opt.rcv_tsecr) + lp->inference = 3 * (tcp_time_stamp - tp->rx_opt.rcv_tsecr); + + /* test if within inference */ + if (lp->last_drop && (tcp_time_stamp - lp->last_drop < lp->inference)) + lp->flag |= LP_WITHIN_INF; + else + lp->flag &= ~LP_WITHIN_INF; + + /* test if within threshold */ + if (lp->sowd >> 3 < + lp->owd_min + 15 * (lp->owd_max - lp->owd_min) / 100) + lp->flag |= LP_WITHIN_THR; + else + lp->flag &= ~LP_WITHIN_THR; + + pr_debug("TCP-LP: %05o|%5u|%5u|%15u|%15u|%15u\n", lp->flag, + tp->snd_cwnd, lp->remote_hz, lp->owd_min, lp->owd_max, + lp->sowd >> 3); + + if (lp->flag & LP_WITHIN_THR) + return; + + /* FIXME: try to reset owd_min and owd_max here + * so decrease the chance the min/max is no longer suitable + * and will usually within threshold when whithin inference */ + lp->owd_min = lp->sowd >> 3; + lp->owd_max = lp->sowd >> 2; + lp->owd_max_rsv = lp->sowd >> 2; + + /* happened within inference + * drop snd_cwnd into 1 */ + if (lp->flag & LP_WITHIN_INF) + tp->snd_cwnd = 1U; + + /* happened after inference + * cut snd_cwnd into half */ + else + tp->snd_cwnd = max(tp->snd_cwnd >> 1U, 1U); + + /* record this drop time */ + lp->last_drop = tcp_time_stamp; +} + +static struct tcp_congestion_ops tcp_lp = { + .init = tcp_lp_init, + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_lp_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, + .rtt_sample = tcp_lp_rtt_sample, + .pkts_acked = tcp_lp_pkts_acked, + + .owner = THIS_MODULE, + .name = "lp" +}; + +static int __init tcp_lp_register(void) +{ + BUG_ON(sizeof(struct lp) > ICSK_CA_PRIV_SIZE); + return tcp_register_congestion_control(&tcp_lp); +} + +static void __exit tcp_lp_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_lp); +} + +module_init(tcp_lp_register); +module_exit(tcp_lp_unregister); + +MODULE_AUTHOR("Wong Hoi Sing Edison, Hung Hing Lun"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP Low Priority"); -- cgit v1.2.3-59-g8ed1b From 76f1017757aa0c308a0b83ca611c9a89ee9a79a4 Mon Sep 17 00:00:00 2001 From: Bin Zhou Date: Mon, 5 Jun 2006 17:28:30 -0700 Subject: [TCP]: TCP Veno congestion control TCP Veno module is a new congestion control module to improve TCP performance over wireless networks. The key innovation in TCP Veno is the enhancement of TCP Reno/Sack congestion control algorithm by using the estimated state of a connection based on TCP Vegas. This scheme significantly reduces "blind" reduction of TCP window regardless of the cause of packet loss. This work is based on the research paper "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks." C. P. Fu, S. C. Liew, IEEE Journal on Selected Areas in Communication, Feb. 2003. Original paper and many latest research works on veno: http://www.ntu.edu.sg/home/ascpfu/veno/veno.html Signed-off-by: Bin Zhou Cheng Peng Fu Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 12 +++ net/ipv4/Makefile | 1 + net/ipv4/tcp_veno.c | 238 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 251 insertions(+) create mode 100644 net/ipv4/tcp_veno.c (limited to 'net') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index ae85149a0434..8514106761b0 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -560,6 +560,18 @@ config TCP_CONG_LP ``fair share`` of bandwidth as targeted by TCP. See http://www-ece.rice.edu/networks/TCP-LP/ +config TCP_CONG_VENO + tristate "TCP Veno" + depends on EXPERIMENTAL + default n + ---help--- + TCP Veno is a sender-side only enhancement of TCP to obtain better + throughput over wireless networks. TCP Veno makes use of state + distinguishing to circumvent the difficult judgment of the packet loss + type. TCP Veno cuts down less congestion window in response to random + loss packets. + See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf + endmenu config TCP_CONG_BIC diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 58a7a4634f70..00fcd2c1ba78 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -43,6 +43,7 @@ obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o obj-$(CONFIG_TCP_CONG_HTCP) += tcp_htcp.o obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o +obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c new file mode 100644 index 000000000000..1091671751c4 --- /dev/null +++ b/net/ipv4/tcp_veno.c @@ -0,0 +1,238 @@ +/* + * TCP Veno congestion control + * + * This is based on the congestion detection/avoidance scheme described in + * C. P. Fu, S. C. Liew. + * "TCP Veno: TCP Enhancement for Transmission over Wireless Access Networks." + * IEEE Journal on Selected Areas in Communication, + * Feb. 2003. + * See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf + */ + +#include +#include +#include +#include +#include + +#include + +/* Default values of the Veno variables, in fixed-point representation + * with V_PARAM_SHIFT bits to the right of the binary point. + */ +#define V_PARAM_SHIFT 1 +static const int beta = 3 << V_PARAM_SHIFT; + +/* Veno variables */ +struct veno { + u8 doing_veno_now; /* if true, do veno for this rtt */ + u16 cntrtt; /* # of rtts measured within last rtt */ + u32 minrtt; /* min of rtts measured within last rtt (in usec) */ + u32 basertt; /* the min of all Veno rtt measurements seen (in usec) */ + u32 inc; /* decide whether to increase cwnd */ + u32 diff; /* calculate the diff rate */ +}; + +/* There are several situations when we must "re-start" Veno: + * + * o when a connection is established + * o after an RTO + * o after fast recovery + * o when we send a packet and there is no outstanding + * unacknowledged data (restarting an idle connection) + * + */ +static inline void veno_enable(struct sock *sk) +{ + struct veno *veno = inet_csk_ca(sk); + + /* turn on Veno */ + veno->doing_veno_now = 1; + + veno->minrtt = 0x7fffffff; +} + +static inline void veno_disable(struct sock *sk) +{ + struct veno *veno = inet_csk_ca(sk); + + /* turn off Veno */ + veno->doing_veno_now = 0; +} + +static void tcp_veno_init(struct sock *sk) +{ + struct veno *veno = inet_csk_ca(sk); + + veno->basertt = 0x7fffffff; + veno->inc = 1; + veno_enable(sk); +} + +/* Do rtt sampling needed for Veno. */ +static void tcp_veno_rtt_calc(struct sock *sk, u32 usrtt) +{ + struct veno *veno = inet_csk_ca(sk); + u32 vrtt = usrtt + 1; /* Never allow zero rtt or basertt */ + + /* Filter to find propagation delay: */ + if (vrtt < veno->basertt) + veno->basertt = vrtt; + + /* Find the min rtt during the last rtt to find + * the current prop. delay + queuing delay: + */ + veno->minrtt = min(veno->minrtt, vrtt); + veno->cntrtt++; +} + +static void tcp_veno_state(struct sock *sk, u8 ca_state) +{ + if (ca_state == TCP_CA_Open) + veno_enable(sk); + else + veno_disable(sk); +} + +/* + * If the connection is idle and we are restarting, + * then we don't want to do any Veno calculations + * until we get fresh rtt samples. So when we + * restart, we reset our Veno state to a clean + * state. After we get acks for this flight of + * packets, _then_ we can make Veno calculations + * again. + */ +static void tcp_veno_cwnd_event(struct sock *sk, enum tcp_ca_event event) +{ + if (event == CA_EVENT_CWND_RESTART || event == CA_EVENT_TX_START) + tcp_veno_init(sk); +} + +static void tcp_veno_cong_avoid(struct sock *sk, u32 ack, + u32 seq_rtt, u32 in_flight, int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct veno *veno = inet_csk_ca(sk); + + if (!veno->doing_veno_now) + return tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag); + + /* limited by applications */ + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + + /* We do the Veno calculations only if we got enough rtt samples */ + if (veno->cntrtt <= 2) { + /* We don't have enough rtt samples to do the Veno + * calculation, so we'll behave like Reno. + */ + tcp_reno_cong_avoid(sk, ack, seq_rtt, in_flight, flag); + } else { + u32 rtt, target_cwnd; + + /* We have enough rtt samples, so, using the Veno + * algorithm, we determine the state of the network. + */ + + rtt = veno->minrtt; + + target_cwnd = ((tp->snd_cwnd * veno->basertt) + << V_PARAM_SHIFT) / rtt; + + veno->diff = (tp->snd_cwnd << V_PARAM_SHIFT) - target_cwnd; + + if (tp->snd_cwnd <= tp->snd_ssthresh) { + /* Slow start. */ + tcp_slow_start(tp); + } else { + /* Congestion avoidance. */ + if (veno->diff < beta) { + /* In the "non-congestive state", increase cwnd + * every rtt. + */ + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt++; + } else { + /* In the "congestive state", increase cwnd + * every other rtt. + */ + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + if (veno->inc + && tp->snd_cwnd < + tp->snd_cwnd_clamp) { + tp->snd_cwnd++; + veno->inc = 0; + } else + veno->inc = 1; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt++; + } + + } + if (tp->snd_cwnd < 2) + tp->snd_cwnd = 2; + else if (tp->snd_cwnd > tp->snd_cwnd_clamp) + tp->snd_cwnd = tp->snd_cwnd_clamp; + } + /* Wipe the slate clean for the next rtt. */ + /* veno->cntrtt = 0; */ + veno->minrtt = 0x7fffffff; +} + +/* Veno MD phase */ +static u32 tcp_veno_ssthresh(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct veno *veno = inet_csk_ca(sk); + + if (veno->diff < beta) + /* in "non-congestive state", cut cwnd by 1/5 */ + return max(tp->snd_cwnd * 4 / 5, 2U); + else + /* in "congestive state", cut cwnd by 1/2 */ + return max(tp->snd_cwnd >> 1U, 2U); +} + +static u32 tcp_veno_min_cwnd(struct sock * sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + return tp->snd_ssthresh; +} + +static struct tcp_congestion_ops tcp_veno = { + .init = tcp_veno_init, + .ssthresh = tcp_veno_ssthresh, + .cong_avoid = tcp_veno_cong_avoid, + .min_cwnd = tcp_veno_min_cwnd, + .rtt_sample = tcp_veno_rtt_calc, + .set_state = tcp_veno_state, + .cwnd_event = tcp_veno_cwnd_event, + + .owner = THIS_MODULE, + .name = "veno", +}; + +static int __init tcp_veno_register(void) +{ + BUG_ON(sizeof(struct veno) > ICSK_CA_PRIV_SIZE); + tcp_register_congestion_control(&tcp_veno); + return 0; +} + +static void __exit tcp_veno_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_veno); +} + +module_init(tcp_veno_register); +module_exit(tcp_veno_unregister); + +MODULE_AUTHOR("Bin Zhou, Cheng Peng Fu"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP Veno"); -- cgit v1.2.3-59-g8ed1b From f890f921040fef6a35e39d15b729af1fd1a35f29 Mon Sep 17 00:00:00 2001 From: "Angelo P. Castellani" Date: Mon, 5 Jun 2006 17:29:09 -0700 Subject: [TCP]: TCP Compound congestion control TCP Compound is a sender-side only change to TCP that uses a mixed Reno/Vegas approach to calculate the cwnd. For further details look here: ftp://ftp.research.microsoft.com/pub/tr/TR-2005-86.pdf Signed-off-by: Angelo P. Castellani Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 10 ++ net/ipv4/Makefile | 1 + net/ipv4/tcp_compound.c | 407 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 418 insertions(+) create mode 100644 net/ipv4/tcp_compound.c (limited to 'net') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 8514106761b0..da33393be45f 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -572,6 +572,16 @@ config TCP_CONG_VENO loss packets. See http://www.ntu.edu.sg/home5/ZHOU0022/papers/CPFu03a.pdf +config TCP_CONG_COMPOUND + tristate "TCP Compound" + depends on EXPERIMENTAL + default n + ---help--- + TCP Compound is a sender-side only change to TCP that uses + a mixed Reno/Vegas approach to calculate the cwnd. + For further details look here: + ftp://ftp.research.microsoft.com/pub/tr/TR-2005-86.pdf + endmenu config TCP_CONG_BIC diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 00fcd2c1ba78..f30faefc2672 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -46,6 +46,7 @@ obj-$(CONFIG_TCP_CONG_VEGAS) += tcp_vegas.o obj-$(CONFIG_TCP_CONG_VENO) += tcp_veno.o obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o +obj-$(CONFIG_TCP_CONG_COMPOUND) += tcp_compound.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ xfrm4_output.o diff --git a/net/ipv4/tcp_compound.c b/net/ipv4/tcp_compound.c new file mode 100644 index 000000000000..8c1ebfb7659e --- /dev/null +++ b/net/ipv4/tcp_compound.c @@ -0,0 +1,407 @@ +/* + * TCP Vegas congestion control + * + * This is based on the congestion detection/avoidance scheme described in + * Lawrence S. Brakmo and Larry L. Peterson. + * "TCP Vegas: End to end congestion avoidance on a global internet." + * IEEE Journal on Selected Areas in Communication, 13(8):1465--1480, + * October 1995. Available from: + * ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps + * + * See http://www.cs.arizona.edu/xkernel/ for their implementation. + * The main aspects that distinguish this implementation from the + * Arizona Vegas implementation are: + * o We do not change the loss detection or recovery mechanisms of + * Linux in any way. Linux already recovers from losses quite well, + * using fine-grained timers, NewReno, and FACK. + * o To avoid the performance penalty imposed by increasing cwnd + * only every-other RTT during slow start, we increase during + * every RTT during slow start, just like Reno. + * o Largely to allow continuous cwnd growth during slow start, + * we use the rate at which ACKs come back as the "actual" + * rate, rather than the rate at which data is sent. + * o To speed convergence to the right rate, we set the cwnd + * to achieve the right ("actual") rate when we exit slow start. + * o To filter out the noise caused by delayed ACKs, we use the + * minimum RTT sample observed during the last RTT to calculate + * the actual rate. + * o When the sender re-starts from idle, it waits until it has + * received ACKs for an entire flight of new data before making + * a cwnd adjustment decision. The original Vegas implementation + * assumed senders never went idle. + * + * + * TCP Compound based on TCP Vegas + * + * further details can be found here: + * ftp://ftp.research.microsoft.com/pub/tr/TR-2005-86.pdf + */ + +#include +#include +#include +#include +#include + +#include + +/* Default values of the Vegas variables, in fixed-point representation + * with V_PARAM_SHIFT bits to the right of the binary point. + */ +#define V_PARAM_SHIFT 1 + +#define TCP_COMPOUND_ALPHA 3U +#define TCP_COMPOUND_BETA 1U +#define TCP_COMPOUND_KAPPA_POW 3 +#define TCP_COMPOUND_KAPPA_NSQRT 2 +#define TCP_COMPOUND_GAMMA 30 +#define TCP_COMPOUND_ZETA 1 + +/* TCP compound variables */ +struct compound { + u32 beg_snd_nxt; /* right edge during last RTT */ + u32 beg_snd_una; /* left edge during last RTT */ + u32 beg_snd_cwnd; /* saves the size of the cwnd */ + u8 doing_vegas_now; /* if true, do vegas for this RTT */ + u16 cntRTT; /* # of RTTs measured within last RTT */ + u32 minRTT; /* min of RTTs measured within last RTT (in usec) */ + u32 baseRTT; /* the min of all Vegas RTT measurements seen (in usec) */ + + u32 cwnd; + u32 dwnd; +}; + +/* There are several situations when we must "re-start" Vegas: + * + * o when a connection is established + * o after an RTO + * o after fast recovery + * o when we send a packet and there is no outstanding + * unacknowledged data (restarting an idle connection) + * + * In these circumstances we cannot do a Vegas calculation at the + * end of the first RTT, because any calculation we do is using + * stale info -- both the saved cwnd and congestion feedback are + * stale. + * + * Instead we must wait until the completion of an RTT during + * which we actually receive ACKs. + */ +static inline void vegas_enable(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct compound *vegas = inet_csk_ca(sk); + + /* Begin taking Vegas samples next time we send something. */ + vegas->doing_vegas_now = 1; + + /* Set the beginning of the next send window. */ + vegas->beg_snd_nxt = tp->snd_nxt; + + vegas->cntRTT = 0; + vegas->minRTT = 0x7fffffff; +} + +/* Stop taking Vegas samples for now. */ +static inline void vegas_disable(struct sock *sk) +{ + struct compound *vegas = inet_csk_ca(sk); + + vegas->doing_vegas_now = 0; +} + +static void tcp_compound_init(struct sock *sk) +{ + struct compound *vegas = inet_csk_ca(sk); + const struct tcp_sock *tp = tcp_sk(sk); + + vegas->baseRTT = 0x7fffffff; + vegas_enable(sk); + + vegas->dwnd = 0; + vegas->cwnd = tp->snd_cwnd; +} + +/* Do RTT sampling needed for Vegas. + * Basically we: + * o min-filter RTT samples from within an RTT to get the current + * propagation delay + queuing delay (we are min-filtering to try to + * avoid the effects of delayed ACKs) + * o min-filter RTT samples from a much longer window (forever for now) + * to find the propagation delay (baseRTT) + */ +static void tcp_compound_rtt_calc(struct sock *sk, u32 usrtt) +{ + struct compound *vegas = inet_csk_ca(sk); + u32 vrtt = usrtt + 1; /* Never allow zero rtt or baseRTT */ + + /* Filter to find propagation delay: */ + if (vrtt < vegas->baseRTT) + vegas->baseRTT = vrtt; + + /* Find the min RTT during the last RTT to find + * the current prop. delay + queuing delay: + */ + + vegas->minRTT = min(vegas->minRTT, vrtt); + vegas->cntRTT++; +} + +static void tcp_compound_state(struct sock *sk, u8 ca_state) +{ + + if (ca_state == TCP_CA_Open) + vegas_enable(sk); + else + vegas_disable(sk); +} + +/* + * If the connection is idle and we are restarting, + * then we don't want to do any Vegas calculations + * until we get fresh RTT samples. So when we + * restart, we reset our Vegas state to a clean + * slate. After we get acks for this flight of + * packets, _then_ we can make Vegas calculations + * again. + */ +static void tcp_compound_cwnd_event(struct sock *sk, enum tcp_ca_event event) +{ + if (event == CA_EVENT_CWND_RESTART || event == CA_EVENT_TX_START) + tcp_compound_init(sk); +} + +static void tcp_compound_cong_avoid(struct sock *sk, u32 ack, + u32 seq_rtt, u32 in_flight, int flag) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct compound *vegas = inet_csk_ca(sk); + u8 inc = 0; + + if (vegas->cwnd + vegas->dwnd > tp->snd_cwnd) { + if (vegas->cwnd > tp->snd_cwnd || vegas->dwnd > tp->snd_cwnd) { + vegas->cwnd = tp->snd_cwnd; + vegas->dwnd = 0; + } else + vegas->cwnd = tp->snd_cwnd - vegas->dwnd; + + } + + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + + if (vegas->cwnd <= tp->snd_ssthresh) + inc = 1; + else if (tp->snd_cwnd_cnt < tp->snd_cwnd) + tp->snd_cwnd_cnt++; + + if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { + inc = 1; + tp->snd_cwnd_cnt = 0; + } + + if (inc && tp->snd_cwnd < tp->snd_cwnd_clamp) + vegas->cwnd++; + + /* The key players are v_beg_snd_una and v_beg_snd_nxt. + * + * These are so named because they represent the approximate values + * of snd_una and snd_nxt at the beginning of the current RTT. More + * precisely, they represent the amount of data sent during the RTT. + * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt, + * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding + * bytes of data have been ACKed during the course of the RTT, giving + * an "actual" rate of: + * + * (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration) + * + * Unfortunately, v_beg_snd_una is not exactly equal to snd_una, + * because delayed ACKs can cover more than one segment, so they + * don't line up nicely with the boundaries of RTTs. + * + * Another unfortunate fact of life is that delayed ACKs delay the + * advance of the left edge of our send window, so that the number + * of bytes we send in an RTT is often less than our cwnd will allow. + * So we keep track of our cwnd separately, in v_beg_snd_cwnd. + */ + + if (after(ack, vegas->beg_snd_nxt)) { + /* Do the Vegas once-per-RTT cwnd adjustment. */ + u32 old_wnd, old_snd_cwnd; + + /* Here old_wnd is essentially the window of data that was + * sent during the previous RTT, and has all + * been acknowledged in the course of the RTT that ended + * with the ACK we just received. Likewise, old_snd_cwnd + * is the cwnd during the previous RTT. + */ + if (!tp->mss_cache) + return; + + old_wnd = (vegas->beg_snd_nxt - vegas->beg_snd_una) / + tp->mss_cache; + old_snd_cwnd = vegas->beg_snd_cwnd; + + /* Save the extent of the current window so we can use this + * at the end of the next RTT. + */ + vegas->beg_snd_una = vegas->beg_snd_nxt; + vegas->beg_snd_nxt = tp->snd_nxt; + vegas->beg_snd_cwnd = tp->snd_cwnd; + + /* We do the Vegas calculations only if we got enough RTT + * samples that we can be reasonably sure that we got + * at least one RTT sample that wasn't from a delayed ACK. + * If we only had 2 samples total, + * then that means we're getting only 1 ACK per RTT, which + * means they're almost certainly delayed ACKs. + * If we have 3 samples, we should be OK. + */ + + if (vegas->cntRTT > 2) { + u32 rtt, target_cwnd, diff; + u32 brtt, dwnd; + + /* We have enough RTT samples, so, using the Vegas + * algorithm, we determine if we should increase or + * decrease cwnd, and by how much. + */ + + /* Pluck out the RTT we are using for the Vegas + * calculations. This is the min RTT seen during the + * last RTT. Taking the min filters out the effects + * of delayed ACKs, at the cost of noticing congestion + * a bit later. + */ + rtt = vegas->minRTT; + + /* Calculate the cwnd we should have, if we weren't + * going too fast. + * + * This is: + * (actual rate in segments) * baseRTT + * We keep it as a fixed point number with + * V_PARAM_SHIFT bits to the right of the binary point. + */ + if (!rtt) + return; + + brtt = vegas->baseRTT; + target_cwnd = ((old_wnd * brtt) + << V_PARAM_SHIFT) / rtt; + + /* Calculate the difference between the window we had, + * and the window we would like to have. This quantity + * is the "Diff" from the Arizona Vegas papers. + * + * Again, this is a fixed point number with + * V_PARAM_SHIFT bits to the right of the binary + * point. + */ + + diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd; + + dwnd = vegas->dwnd; + + if (diff < (TCP_COMPOUND_GAMMA << V_PARAM_SHIFT)) { + u32 i, j, x, x2; + u64 v; + + v = 1; + + for (i = 0; i < TCP_COMPOUND_KAPPA_POW; i++) + v *= old_wnd; + + for (i = 0; i < TCP_COMPOUND_KAPPA_NSQRT; i++) { + x = 1; + for (j = 0; j < 200; j++) { + x2 = (x + v / x) / 2; + + if (x2 == x || !x2) + break; + + x = x2; + } + v = x; + } + + x = (u32) v >> TCP_COMPOUND_ALPHA; + + if (x > 1) + dwnd = x - 1; + else + dwnd = 0; + + dwnd += vegas->dwnd; + + } else if ((dwnd << V_PARAM_SHIFT) < + (diff * TCP_COMPOUND_BETA)) + dwnd = 0; + else + dwnd = + ((dwnd << V_PARAM_SHIFT) - + (diff * + TCP_COMPOUND_BETA)) >> V_PARAM_SHIFT; + + vegas->dwnd = dwnd; + + } + + /* Wipe the slate clean for the next RTT. */ + vegas->cntRTT = 0; + vegas->minRTT = 0x7fffffff; + } + + tp->snd_cwnd = vegas->cwnd + vegas->dwnd; +} + +/* Extract info for Tcp socket info provided via netlink. */ +static void tcp_compound_get_info(struct sock *sk, u32 ext, struct sk_buff *skb) +{ + const struct compound *ca = inet_csk_ca(sk); + if (ext & (1 << (INET_DIAG_VEGASINFO - 1))) { + struct tcpvegas_info *info; + + info = RTA_DATA(__RTA_PUT(skb, INET_DIAG_VEGASINFO, + sizeof(*info))); + + info->tcpv_enabled = ca->doing_vegas_now; + info->tcpv_rttcnt = ca->cntRTT; + info->tcpv_rtt = ca->baseRTT; + info->tcpv_minrtt = ca->minRTT; + rtattr_failure:; + } +} + +static struct tcp_congestion_ops tcp_compound = { + .init = tcp_compound_init, + .ssthresh = tcp_reno_ssthresh, + .cong_avoid = tcp_compound_cong_avoid, + .min_cwnd = tcp_reno_min_cwnd, + .rtt_sample = tcp_compound_rtt_calc, + .set_state = tcp_compound_state, + .cwnd_event = tcp_compound_cwnd_event, + .get_info = tcp_compound_get_info, + + .owner = THIS_MODULE, + .name = "compound", +}; + +static int __init tcp_compound_register(void) +{ + BUG_ON(sizeof(struct compound) > ICSK_CA_PRIV_SIZE); + tcp_register_congestion_control(&tcp_compound); + return 0; +} + +static void __exit tcp_compound_unregister(void) +{ + tcp_unregister_congestion_control(&tcp_compound); +} + +module_init(tcp_compound_register); +module_exit(tcp_compound_unregister); + +MODULE_AUTHOR("Angelo P. Castellani, Stephen Hemminger"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("TCP Compound"); -- cgit v1.2.3-59-g8ed1b From a4ed25849532728effaa0665c92e08e029e41407 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 5 Jun 2006 17:29:39 -0700 Subject: [TCP]: TCP Compound quad root function The original code did a 64 bit divide directly, which won't work on 32 bit platforms. Rather than doing a 64 bit square root twice, just implement a 4th root function in one pass using Newton's method. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_compound.c | 90 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 66 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_compound.c b/net/ipv4/tcp_compound.c index 8c1ebfb7659e..ec68cb8081c1 100644 --- a/net/ipv4/tcp_compound.c +++ b/net/ipv4/tcp_compound.c @@ -52,8 +52,6 @@ #define TCP_COMPOUND_ALPHA 3U #define TCP_COMPOUND_BETA 1U -#define TCP_COMPOUND_KAPPA_POW 3 -#define TCP_COMPOUND_KAPPA_NSQRT 2 #define TCP_COMPOUND_GAMMA 30 #define TCP_COMPOUND_ZETA 1 @@ -156,6 +154,58 @@ static void tcp_compound_state(struct sock *sk, u8 ca_state) vegas_disable(sk); } + +/* 64bit divisor, dividend and result. dynamic precision */ +static inline u64 div64_64(u64 dividend, u64 divisor) +{ + u32 d = divisor; + + if (divisor > 0xffffffffULL) { + unsigned int shift = fls(divisor >> 32); + + d = divisor >> shift; + dividend >>= shift; + } + + /* avoid 64 bit division if possible */ + if (dividend >> 32) + do_div(dividend, d); + else + dividend = (u32) dividend / d; + + return dividend; +} + +/* calculate the quartic root of "a" using Newton-Raphson */ +static u32 qroot(u64 a) +{ + u32 x, x1; + + /* Initial estimate is based on: + * qrt(x) = exp(log(x) / 4) + */ + x = 1u << (fls64(a) >> 2); + + /* + * Iteration based on: + * 3 + * x = ( 3 * x + a / x ) / 4 + * k+1 k k + */ + do { + u64 x3 = x; + + x1 = x; + x3 *= x; + x3 *= x; + + x = (3 * x + (u32) div64_64(a, x3)) / 4; + } while (abs(x1 - x) > 1); + + return x; +} + + /* * If the connection is idle and we are restarting, * then we don't want to do any Vegas calculations @@ -304,29 +354,21 @@ static void tcp_compound_cong_avoid(struct sock *sk, u32 ack, dwnd = vegas->dwnd; if (diff < (TCP_COMPOUND_GAMMA << V_PARAM_SHIFT)) { - u32 i, j, x, x2; u64 v; - - v = 1; - - for (i = 0; i < TCP_COMPOUND_KAPPA_POW; i++) - v *= old_wnd; - - for (i = 0; i < TCP_COMPOUND_KAPPA_NSQRT; i++) { - x = 1; - for (j = 0; j < 200; j++) { - x2 = (x + v / x) / 2; - - if (x2 == x || !x2) - break; - - x = x2; - } - v = x; - } - - x = (u32) v >> TCP_COMPOUND_ALPHA; - + u32 x; + + /* + * The TCP Compound paper describes the choice + * of "k" determines the agressiveness, + * ie. slope of the response function. + * + * For same value as HSTCP would be 0.8 + * but for computaional reasons, both the + * original authors and this implementation + * use 0.75. + */ + v = old_wnd; + x = qroot(v * v * v) >> TCP_COMPOUND_ALPHA; if (x > 1) dwnd = x - 1; else -- cgit v1.2.3-59-g8ed1b From 72dc5b9225c53310c010b68a70ea97c8c8e24bdf Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 5 Jun 2006 17:30:08 -0700 Subject: [TCP]: Minimum congestion window consolidation. Many of the TCP congestion methods all just use ssthresh as the minimum congestion window on decrease. Rather than duplicating the code, just have that be the default if that handle in the ops structure is not set. Minor behaviour change to TCP compound. It probably wants to use this (ssthresh) as lower bound, rather than ssthresh/2 because the latter causes undershoot on loss. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/net/tcp.h | 4 ++-- net/ipv4/tcp_bic.c | 7 ------- net/ipv4/tcp_compound.c | 1 - net/ipv4/tcp_cong.c | 6 +++--- net/ipv4/tcp_cubic.c | 6 ------ net/ipv4/tcp_htcp.c | 9 --------- net/ipv4/tcp_input.c | 13 +++++++++++-- net/ipv4/tcp_veno.c | 7 ------- net/ipv4/tcp_westwood.c | 18 +++++++----------- 9 files changed, 23 insertions(+), 48 deletions(-) (limited to 'net') diff --git a/include/net/tcp.h b/include/net/tcp.h index f1f472746e6c..de88c5472bfc 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -632,7 +632,7 @@ struct tcp_congestion_ops { /* return slow start threshold (required) */ u32 (*ssthresh)(struct sock *sk); /* lower bound for congestion window (optional) */ - u32 (*min_cwnd)(struct sock *sk); + u32 (*min_cwnd)(const struct sock *sk); /* do new cwnd calculation (required) */ void (*cong_avoid)(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, int good_ack); @@ -667,7 +667,7 @@ extern struct tcp_congestion_ops tcp_init_congestion_ops; extern u32 tcp_reno_ssthresh(struct sock *sk); extern void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, int flag); -extern u32 tcp_reno_min_cwnd(struct sock *sk); +extern u32 tcp_reno_min_cwnd(const struct sock *sk); extern struct tcp_congestion_ops tcp_reno; static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state) diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 035f2092d73a..b2d9021ad22b 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -198,12 +198,6 @@ static u32 bictcp_undo_cwnd(struct sock *sk) return max(tp->snd_cwnd, ca->last_max_cwnd); } -static u32 bictcp_min_cwnd(struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - return tp->snd_ssthresh; -} - static void bictcp_state(struct sock *sk, u8 new_state) { if (new_state == TCP_CA_Loss) @@ -231,7 +225,6 @@ static struct tcp_congestion_ops bictcp = { .cong_avoid = bictcp_cong_avoid, .set_state = bictcp_state, .undo_cwnd = bictcp_undo_cwnd, - .min_cwnd = bictcp_min_cwnd, .pkts_acked = bictcp_acked, .owner = THIS_MODULE, .name = "bic", diff --git a/net/ipv4/tcp_compound.c b/net/ipv4/tcp_compound.c index ec68cb8081c1..bc54f7e9aea9 100644 --- a/net/ipv4/tcp_compound.c +++ b/net/ipv4/tcp_compound.c @@ -419,7 +419,6 @@ static struct tcp_congestion_ops tcp_compound = { .init = tcp_compound_init, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_compound_cong_avoid, - .min_cwnd = tcp_reno_min_cwnd, .rtt_sample = tcp_compound_rtt_calc, .set_state = tcp_compound_state, .cwnd_event = tcp_compound_cwnd_event, diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index 91c2f41c7f58..857eefc52aab 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -38,7 +38,7 @@ int tcp_register_congestion_control(struct tcp_congestion_ops *ca) int ret = 0; /* all algorithms must implement ssthresh and cong_avoid ops */ - if (!ca->ssthresh || !ca->cong_avoid || !ca->min_cwnd) { + if (!ca->ssthresh || !ca->cong_avoid) { printk(KERN_ERR "TCP %s does not implement required ops\n", ca->name); return -EINVAL; @@ -251,8 +251,8 @@ u32 tcp_reno_ssthresh(struct sock *sk) } EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); -/* Lower bound on congestion window. */ -u32 tcp_reno_min_cwnd(struct sock *sk) +/* Lower bound on congestion window with halving. */ +u32 tcp_reno_min_cwnd(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); return tp->snd_ssthresh/2; diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 31a4986dfbf7..78b7a6b9e4de 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -325,11 +325,6 @@ static u32 bictcp_undo_cwnd(struct sock *sk) return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd); } -static u32 bictcp_min_cwnd(struct sock *sk) -{ - return tcp_sk(sk)->snd_ssthresh; -} - static void bictcp_state(struct sock *sk, u8 new_state) { if (new_state == TCP_CA_Loss) @@ -357,7 +352,6 @@ static struct tcp_congestion_ops cubictcp = { .cong_avoid = bictcp_cong_avoid, .set_state = bictcp_state, .undo_cwnd = bictcp_undo_cwnd, - .min_cwnd = bictcp_min_cwnd, .pkts_acked = bictcp_acked, .owner = THIS_MODULE, .name = "cubic", diff --git a/net/ipv4/tcp_htcp.c b/net/ipv4/tcp_htcp.c index 1b2ff53f98ed..3d92c1859267 100644 --- a/net/ipv4/tcp_htcp.c +++ b/net/ipv4/tcp_htcp.c @@ -246,14 +246,6 @@ static void htcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, } } -/* Lower bound on congestion window. */ -static u32 htcp_min_cwnd(struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - return tp->snd_ssthresh; -} - - static void htcp_init(struct sock *sk) { struct htcp *ca = inet_csk_ca(sk); @@ -285,7 +277,6 @@ static void htcp_state(struct sock *sk, u8 new_state) static struct tcp_congestion_ops htcp = { .init = htcp_init, .ssthresh = htcp_recalc_ssthresh, - .min_cwnd = htcp_min_cwnd, .cong_avoid = htcp_cong_avoid, .set_state = htcp_state, .undo_cwnd = htcp_cwnd_undo, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 6d167889a4b0..e08245bdda3a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1689,17 +1689,26 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp) tp->snd_cwnd_stamp = tcp_time_stamp; } +/* Lower bound on congestion window is slow start threshold + * unless congestion avoidance choice decides to overide it. + */ +static inline u32 tcp_cwnd_min(const struct sock *sk) +{ + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; + + return ca_ops->min_cwnd ? ca_ops->min_cwnd(sk) : tcp_sk(sk)->snd_ssthresh; +} + /* Decrease cwnd each second ack. */ static void tcp_cwnd_down(struct sock *sk) { - const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); int decr = tp->snd_cwnd_cnt + 1; tp->snd_cwnd_cnt = decr&1; decr >>= 1; - if (decr && tp->snd_cwnd > icsk->icsk_ca_ops->min_cwnd(sk)) + if (decr && tp->snd_cwnd > tcp_cwnd_min(sk)) tp->snd_cwnd -= decr; tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index 1091671751c4..11b42a7135c1 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -199,17 +199,10 @@ static u32 tcp_veno_ssthresh(struct sock *sk) return max(tp->snd_cwnd >> 1U, 2U); } -static u32 tcp_veno_min_cwnd(struct sock * sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - return tp->snd_ssthresh; -} - static struct tcp_congestion_ops tcp_veno = { .init = tcp_veno_init, .ssthresh = tcp_veno_ssthresh, .cong_avoid = tcp_veno_cong_avoid, - .min_cwnd = tcp_veno_min_cwnd, .rtt_sample = tcp_veno_rtt_calc, .set_state = tcp_veno_state, .cwnd_event = tcp_veno_cwnd_event, diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 0c340c3756c2..29eb258b6d82 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -162,12 +162,6 @@ static inline u32 westwood_acked_count(struct sock *sk) return w->cumul_ack; } -static inline u32 westwood_bw_rttmin(const struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - const struct westwood *w = inet_csk_ca(sk); - return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); -} /* * TCP Westwood @@ -175,9 +169,11 @@ static inline u32 westwood_bw_rttmin(const struct sock *sk) * in packets we use mss_cache). Rttmin is guaranteed to be >= 2 * so avoids ever returning 0. */ -static u32 tcp_westwood_cwnd_min(struct sock *sk) +static u32 tcp_westwood_bw_rttmin(const struct sock *sk) { - return westwood_bw_rttmin(sk); + const struct tcp_sock *tp = tcp_sk(sk); + const struct westwood *w = inet_csk_ca(sk); + return max_t(u32, (w->bw_est * w->rtt_min) / tp->mss_cache, 2); } static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) @@ -191,11 +187,11 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) break; case CA_EVENT_COMPLETE_CWR: - tp->snd_cwnd = tp->snd_ssthresh = westwood_bw_rttmin(sk); + tp->snd_cwnd = tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); break; case CA_EVENT_FRTO: - tp->snd_ssthresh = westwood_bw_rttmin(sk); + tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); break; case CA_EVENT_SLOW_ACK: @@ -235,7 +231,7 @@ static struct tcp_congestion_ops tcp_westwood = { .init = tcp_westwood_init, .ssthresh = tcp_reno_ssthresh, .cong_avoid = tcp_reno_cong_avoid, - .min_cwnd = tcp_westwood_cwnd_min, + .min_cwnd = tcp_westwood_bw_rttmin, .cwnd_event = tcp_westwood_event, .get_info = tcp_westwood_info, .pkts_acked = tcp_westwood_pkts_acked, -- cgit v1.2.3-59-g8ed1b From a42e9d6ce89cfd19aee9f990b7231ce697f0d00f Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 5 Jun 2006 17:30:32 -0700 Subject: [TCP]: TCP Probe congestion window tracing This adds a new module for tracking TCP state variables non-intrusively using kprobes. It has a simple /proc interface that outputs one line for each packet received. A sample usage is to collect congestion window and ssthresh over time graphs. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/Kconfig | 15 +++++ net/ipv4/Makefile | 1 + net/ipv4/tcp_probe.c | 179 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 195 insertions(+) create mode 100644 net/ipv4/tcp_probe.c (limited to 'net') diff --git a/net/Kconfig b/net/Kconfig index 4193cdcd3ae7..ff0db804eed8 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -215,6 +215,21 @@ config NET_PKTGEN To compile this code as a module, choose M here: the module will be called pktgen. +config NET_TCPPROBE + tristate "TCP connection probing" + depends on INET && EXPERIMENTAL && PROC_FS && KPROBES + ---help--- + This module allows for capturing the changes to TCP connection + state in response to incoming patckets. It is used for debugging + TCP congestion avoidance modules. If you don't understand + what was just said, you don't need it: say N. + + Documentation on how to use the packet generator can be found + at http://linux-net.osdl.org/index.php/TcpProbe + + To compile this code as a module, choose M here: the + module will be called tcp_probe. + endmenu endmenu diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index f30faefc2672..38b8039bdd55 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -36,6 +36,7 @@ obj-$(CONFIG_IP_VS) += ipvs/ obj-$(CONFIG_INET_DIAG) += inet_diag.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o +obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c new file mode 100644 index 000000000000..be94d526c7cb --- /dev/null +++ b/net/ipv4/tcp_probe.c @@ -0,0 +1,179 @@ +/* + * tcpprobe - Observe the TCP flow with kprobes. + * + * The idea for this came from Werner Almesberger's umlsim + * Copyright (C) 2004, Stephen Hemminger + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +MODULE_AUTHOR("Stephen Hemminger "); +MODULE_DESCRIPTION("TCP cwnd snooper"); +MODULE_LICENSE("GPL"); + +static int port = 0; +MODULE_PARM_DESC(port, "Port to match (0=all)"); +module_param(port, int, 0); + +static int bufsize = 64*1024; +MODULE_PARM_DESC(bufsize, "Log buffer size (default 64k)"); +module_param(bufsize, int, 0); + +static const char procname[] = "tcpprobe"; + +struct { + struct kfifo *fifo; + spinlock_t lock; + wait_queue_head_t wait; + struct timeval tstart; +} tcpw; + +static void printl(const char *fmt, ...) +{ + va_list args; + int len; + struct timeval now; + char tbuf[256]; + + va_start(args, fmt); + do_gettimeofday(&now); + + now.tv_sec -= tcpw.tstart.tv_sec; + now.tv_usec -= tcpw.tstart.tv_usec; + if (now.tv_usec < 0) { + --now.tv_sec; + now.tv_usec += 1000000; + } + + len = sprintf(tbuf, "%lu.%06lu ", now.tv_sec, now.tv_usec); + len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args); + va_end(args); + + kfifo_put(tcpw.fifo, tbuf, len); + wake_up(&tcpw.wait); +} + +static int jtcp_sendmsg(struct kiocb *iocb, struct sock *sk, + struct msghdr *msg, size_t size) +{ + const struct tcp_sock *tp = tcp_sk(sk); + const struct inet_sock *inet = inet_sk(sk); + + if (port == 0 || ntohs(inet->dport) == port || + ntohs(inet->sport) == port) { + printl("%d.%d.%d.%d:%u %d.%d.%d.%d:%u %d %#x %#x %u %u %u\n", + NIPQUAD(inet->saddr), ntohs(inet->sport), + NIPQUAD(inet->daddr), ntohs(inet->dport), + size, tp->snd_nxt, tp->snd_una, + tp->snd_cwnd, tcp_current_ssthresh(sk), + tp->snd_wnd); + } + + jprobe_return(); + return 0; +} + +static struct jprobe tcp_send_probe = { + .kp = { .addr = (kprobe_opcode_t *) &tcp_sendmsg, }, + .entry = (kprobe_opcode_t *) &jtcp_sendmsg, +}; + + +static int tcpprobe_open(struct inode * inode, struct file * file) +{ + kfifo_reset(tcpw.fifo); + do_gettimeofday(&tcpw.tstart); + return 0; +} + +static ssize_t tcpprobe_read(struct file *file, char __user *buf, + size_t len, loff_t *ppos) +{ + int error = 0, cnt; + unsigned char *tbuf; + + if (!buf || len < 0) + return -EINVAL; + + if (len == 0) + return 0; + + tbuf = vmalloc(len); + if (!tbuf) + return -ENOMEM; + + error = wait_event_interruptible(tcpw.wait, + __kfifo_len(tcpw.fifo) != 0); + if (error) + return error; + + cnt = kfifo_get(tcpw.fifo, tbuf, len); + error = copy_to_user(buf, tbuf, cnt); + + vfree(tbuf); + + return error ? error : cnt; +} + +static struct file_operations tcpprobe_fops = { + .owner = THIS_MODULE, + .open = tcpprobe_open, + .read = tcpprobe_read, +}; + +static __init int tcpprobe_init(void) +{ + int ret = -ENOMEM; + + init_waitqueue_head(&tcpw.wait); + spin_lock_init(&tcpw.lock); + tcpw.fifo = kfifo_alloc(bufsize, GFP_KERNEL, &tcpw.lock); + + if (!proc_net_fops_create(procname, S_IRUSR, &tcpprobe_fops)) + goto err0; + + ret = register_jprobe(&tcp_send_probe); + if (ret) + goto err1; + + pr_info("TCP watch registered (port=%d)\n", port); + return 0; + err1: + proc_net_remove(procname); + err0: + kfifo_free(tcpw.fifo); + return ret; +} +module_init(tcpprobe_init); + +static __exit void tcpprobe_exit(void) +{ + kfifo_free(tcpw.fifo); + proc_net_remove(procname); + unregister_jprobe(&tcp_send_probe); + +} +module_exit(tcpprobe_exit); -- cgit v1.2.3-59-g8ed1b From 738980ffa658c86bd494ebb242ce8e44aff16a9e Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 5 Jun 2006 17:30:56 -0700 Subject: [TCP]: Limited slow start for Highspeed TCP Implementation of RFC3742 limited slow start. Added as part of the TCP highspeed congestion control module. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_highspeed.c | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_highspeed.c b/net/ipv4/tcp_highspeed.c index ba7c63ca5bb1..1120245b2373 100644 --- a/net/ipv4/tcp_highspeed.c +++ b/net/ipv4/tcp_highspeed.c @@ -98,6 +98,10 @@ struct hstcp { u32 ai; }; +static int max_ssthresh = 100; +module_param(max_ssthresh, int, 0644); +MODULE_PARM_DESC(max_ssthresh, "limited slow start threshold (RFC3742)"); + static void hstcp_init(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -119,9 +123,23 @@ static void hstcp_cong_avoid(struct sock *sk, u32 adk, u32 rtt, if (!tcp_is_cwnd_limited(sk, in_flight)) return; - if (tp->snd_cwnd <= tp->snd_ssthresh) - tcp_slow_start(tp); - else { + if (tp->snd_cwnd <= tp->snd_ssthresh) { + /* RFC3742: limited slow start + * the window is increased by 1/K MSS for each arriving ACK, + * for K = int(cwnd/(0.5 max_ssthresh)) + */ + if (max_ssthresh > 0 && tp->snd_cwnd > max_ssthresh) { + u32 k = max(tp->snd_cwnd / (max_ssthresh >> 1), 1U); + if (++tp->snd_cwnd_cnt >= k) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } + } else { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } + } else { /* Update AIMD parameters */ if (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd) { while (tp->snd_cwnd > hstcp_aimd_vals[ca->ai].cwnd && -- cgit v1.2.3-59-g8ed1b From 70df2311ee3fc607e7511873d7dade5bd17d593d Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 5 Jun 2006 17:59:20 -0700 Subject: [TCP]: Fix compile warning in tcp_probe.c The suseconds_t et al. are not necessarily any particular type on every platform, so cast to unsigned long so that we can use one printf format string and avoid warnings across the board Signed-off-by: David S. Miller --- net/ipv4/tcp_probe.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c index be94d526c7cb..d7d517a3a238 100644 --- a/net/ipv4/tcp_probe.c +++ b/net/ipv4/tcp_probe.c @@ -68,7 +68,9 @@ static void printl(const char *fmt, ...) now.tv_usec += 1000000; } - len = sprintf(tbuf, "%lu.%06lu ", now.tv_sec, now.tv_usec); + len = sprintf(tbuf, "%lu.%06lu ", + (unsigned long) now.tv_sec, + (unsigned long) now.tv_usec); len += vscnprintf(tbuf+len, sizeof(tbuf)-len, fmt, args); va_end(args); -- cgit v1.2.3-59-g8ed1b From 338fcf9886df9ad2873772197a73a57818973316 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 5 Jun 2006 21:04:39 -0700 Subject: [IPV4] igmp: Fixup struct ip_mc_list::multiaddr type All users except two expect 32-bit big-endian value. One is of ->multiaddr = ->multiaddr variety. And last one is "%08lX". Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- include/linux/igmp.h | 2 +- net/ipv4/igmp.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/linux/igmp.h b/include/linux/igmp.h index 28f4f3b36950..899c3d4776f3 100644 --- a/include/linux/igmp.h +++ b/include/linux/igmp.h @@ -169,7 +169,7 @@ struct ip_sf_list struct ip_mc_list { struct in_device *interface; - unsigned long multiaddr; + __be32 multiaddr; struct ip_sf_list *sources; struct ip_sf_list *tomb; unsigned int sfmode; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index d512239a1473..ab680c851aa2 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2361,7 +2361,7 @@ static int igmp_mc_seq_show(struct seq_file *seq, void *v) } seq_printf(seq, - "\t\t\t\t%08lX %5d %d:%08lX\t\t%d\n", + "\t\t\t\t%08X %5d %d:%08lX\t\t%d\n", im->multiaddr, im->users, im->tm_running, im->tm_running ? jiffies_to_clock_t(im->timer.expires-jiffies) : 0, -- cgit v1.2.3-59-g8ed1b From 6d7416535097ed0943bdae8e69c14ba43061cab1 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 5 Jun 2006 21:06:41 -0700 Subject: [IPV4]: Right prototype of __raw_v4_lookup() All users pass 32-bit values as addresses and internally they're compared with 32-bit entities. So, change "laddr" and "raddr" types to __be32. Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- include/net/raw.h | 2 +- net/ipv4/raw.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/include/net/raw.h b/include/net/raw.h index e67b28a0248c..d83571fe4c69 100644 --- a/include/net/raw.h +++ b/include/net/raw.h @@ -36,7 +36,7 @@ extern rwlock_t raw_v4_lock; extern struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num, - unsigned long raddr, unsigned long laddr, + __be32 raddr, __be32 laddr, int dif); extern int raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index fc2562415555..bd221ec3f81e 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -103,7 +103,7 @@ static void raw_v4_unhash(struct sock *sk) } struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num, - unsigned long raddr, unsigned long laddr, + __be32 raddr, __be32 laddr, int dif) { struct hlist_node *node; -- cgit v1.2.3-59-g8ed1b From f86502bfc177f69bbabbedb78ebf710579ae0e54 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 5 Jun 2006 21:19:24 -0700 Subject: [IPV4] icmp: Kill local 'ip' arg in icmp_redirect(). It is typed wrong, and it's only assigned and used once. So just pass in iph->daddr directly which fixes both problems. Based upon a patch by Alexey Dobriyan. Signed-off-by: David S. Miller --- net/ipv4/icmp.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 2a0455911ee0..017900172f7d 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -730,7 +730,6 @@ out_err: static void icmp_redirect(struct sk_buff *skb) { struct iphdr *iph; - unsigned long ip; if (skb->len < sizeof(struct iphdr)) goto out_err; @@ -742,7 +741,6 @@ static void icmp_redirect(struct sk_buff *skb) goto out; iph = (struct iphdr *)skb->data; - ip = iph->daddr; switch (skb->h.icmph->code & 7) { case ICMP_REDIR_NET: @@ -752,7 +750,8 @@ static void icmp_redirect(struct sk_buff *skb) */ case ICMP_REDIR_HOST: case ICMP_REDIR_HOSTTOS: - ip_rt_redirect(skb->nh.iph->saddr, ip, skb->h.icmph->un.gateway, + ip_rt_redirect(skb->nh.iph->saddr, iph->daddr, + skb->h.icmph->un.gateway, iph->saddr, skb->dev); break; } -- cgit v1.2.3-59-g8ed1b From c8c05a8eec6f1258f6d5cb71a44ee5dc1e989b63 Mon Sep 17 00:00:00 2001 From: Catherine Zhang Date: Thu, 8 Jun 2006 23:39:49 -0700 Subject: [LSM-IPsec]: SELinux Authorize This patch contains a fix for the previous patch that adds security contexts to IPsec policies and security associations. In the previous patch, no authorization (besides the check for write permissions to SAD and SPD) is required to delete IPsec policies and security assocations with security contexts. Thus a user authorized to change SAD and SPD can bypass the IPsec policy authorization by simply deleteing policies with security contexts. To fix this security hole, an additional authorization check is added for removing security policies and security associations with security contexts. Note that if no security context is supplied on add or present on policy to be deleted, the SELinux module allows the change unconditionally. The hook is called on deletion when no context is present, which we may want to change. At present, I left it up to the module. LSM changes: The patch adds two new LSM hooks: xfrm_policy_delete and xfrm_state_delete. The new hooks are necessary to authorize deletion of IPsec policies that have security contexts. The existing hooks xfrm_policy_free and xfrm_state_free lack the context to do the authorization, so I decided to split authorization of deletion and memory management of security data, as is typical in the LSM interface. Use: The new delete hooks are checked when xfrm_policy or xfrm_state are deleted by either the xfrm_user interface (xfrm_get_policy, xfrm_del_sa) or the pfkey interface (pfkey_spddelete, pfkey_delete). SELinux changes: The new policy_delete and state_delete functions are added. Signed-off-by: Catherine Zhang Signed-off-by: Trent Jaeger Acked-by: James Morris Signed-off-by: David S. Miller --- include/linux/security.h | 40 ++++++++++++++++++++++++++++++++++------ net/key/af_key.c | 17 +++++++++++------ net/xfrm/xfrm_user.c | 19 ++++++++++++------- security/dummy.c | 12 ++++++++++++ security/selinux/hooks.c | 2 ++ security/selinux/include/xfrm.h | 2 ++ security/selinux/xfrm.c | 39 +++++++++++++++++++++++++++++++++++---- 7 files changed, 108 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/include/linux/security.h b/include/linux/security.h index 1bab48f6aeac..14c9bd050607 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -805,31 +805,37 @@ struct swap_info_struct; * used by the XFRM system. * @sec_ctx contains the security context information being provided by * the user-level policy update program (e.g., setkey). - * Allocate a security structure to the xp->selector.security field. + * Allocate a security structure to the xp->security field. * The security field is initialized to NULL when the xfrm_policy is * allocated. * Return 0 if operation was successful (memory to allocate, legal context) * @xfrm_policy_clone_security: * @old contains an existing xfrm_policy in the SPD. * @new contains a new xfrm_policy being cloned from old. - * Allocate a security structure to the new->selector.security field - * that contains the information from the old->selector.security field. + * Allocate a security structure to the new->security field + * that contains the information from the old->security field. * Return 0 if operation was successful (memory to allocate). * @xfrm_policy_free_security: * @xp contains the xfrm_policy - * Deallocate xp->selector.security. + * Deallocate xp->security. + * @xfrm_policy_delete_security: + * @xp contains the xfrm_policy. + * Authorize deletion of xp->security. * @xfrm_state_alloc_security: * @x contains the xfrm_state being added to the Security Association * Database by the XFRM system. * @sec_ctx contains the security context information being provided by * the user-level SA generation program (e.g., setkey or racoon). - * Allocate a security structure to the x->sel.security field. The + * Allocate a security structure to the x->security field. The * security field is initialized to NULL when the xfrm_state is * allocated. * Return 0 if operation was successful (memory to allocate, legal context). * @xfrm_state_free_security: * @x contains the xfrm_state. - * Deallocate x>sel.security. + * Deallocate x->security. + * @xfrm_state_delete_security: + * @x contains the xfrm_state. + * Authorize deletion of x->security. * @xfrm_policy_lookup: * @xp contains the xfrm_policy for which the access control is being * checked. @@ -1298,8 +1304,10 @@ struct security_operations { int (*xfrm_policy_alloc_security) (struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx); int (*xfrm_policy_clone_security) (struct xfrm_policy *old, struct xfrm_policy *new); void (*xfrm_policy_free_security) (struct xfrm_policy *xp); + int (*xfrm_policy_delete_security) (struct xfrm_policy *xp); int (*xfrm_state_alloc_security) (struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx); void (*xfrm_state_free_security) (struct xfrm_state *x); + int (*xfrm_state_delete_security) (struct xfrm_state *x); int (*xfrm_policy_lookup)(struct xfrm_policy *xp, u32 sk_sid, u8 dir); #endif /* CONFIG_SECURITY_NETWORK_XFRM */ @@ -2934,11 +2942,21 @@ static inline void security_xfrm_policy_free(struct xfrm_policy *xp) security_ops->xfrm_policy_free_security(xp); } +static inline int security_xfrm_policy_delete(struct xfrm_policy *xp) +{ + return security_ops->xfrm_policy_delete_security(xp); +} + static inline int security_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx) { return security_ops->xfrm_state_alloc_security(x, sec_ctx); } +static inline int security_xfrm_state_delete(struct xfrm_state *x) +{ + return security_ops->xfrm_state_delete_security(x); +} + static inline void security_xfrm_state_free(struct xfrm_state *x) { security_ops->xfrm_state_free_security(x); @@ -2963,6 +2981,11 @@ static inline void security_xfrm_policy_free(struct xfrm_policy *xp) { } +static inline int security_xfrm_policy_delete(struct xfrm_policy *xp) +{ + return 0; +} + static inline int security_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx) { return 0; @@ -2972,6 +2995,11 @@ static inline void security_xfrm_state_free(struct xfrm_state *x) { } +static inline int security_xfrm_state_delete(struct xfrm_policy *xp) +{ + return 0; +} + static inline int security_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir) { return 0; diff --git a/net/key/af_key.c b/net/key/af_key.c index 859582275cab..d5e2121ea207 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -1454,21 +1454,23 @@ static int pfkey_delete(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h if (x == NULL) return -ESRCH; + if ((err = security_xfrm_state_delete(x))) + goto out; + if (xfrm_state_kern(x)) { - xfrm_state_put(x); - return -EPERM; + err = -EPERM; + goto out; } err = xfrm_state_delete(x); - if (err < 0) { - xfrm_state_put(x); - return err; - } + if (err < 0) + goto out; c.seq = hdr->sadb_msg_seq; c.pid = hdr->sadb_msg_pid; c.event = XFRM_MSG_DELSA; km_state_notify(x, &c); +out: xfrm_state_put(x); return err; @@ -2274,11 +2276,14 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg err = 0; + if ((err = security_xfrm_policy_delete(xp))) + goto out; c.seq = hdr->sadb_msg_seq; c.pid = hdr->sadb_msg_pid; c.event = XFRM_MSG_DELPOLICY; km_policy_notify(xp, pol->sadb_x_policy_dir-1, &c); +out: xfrm_pol_put(xp); return err; } diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 81d1005830f4..a3733d2db3ba 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -427,23 +427,25 @@ static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma) if (x == NULL) return -ESRCH; + if (err = security_xfrm_state_delete(x)) + goto out; + if (xfrm_state_kern(x)) { - xfrm_state_put(x); - return -EPERM; + err = -EPERM; + goto out; } err = xfrm_state_delete(x); - if (err < 0) { - xfrm_state_put(x); - return err; - } + if (err < 0) + goto out; c.seq = nlh->nlmsg_seq; c.pid = nlh->nlmsg_pid; c.event = nlh->nlmsg_type; km_state_notify(x, &c); - xfrm_state_put(x); +out: + xfrm_state_put(x); return err; } @@ -1055,6 +1057,8 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr MSG_DONTWAIT); } } else { + if (err = security_xfrm_policy_delete(xp)) + goto out; c.data.byid = p->index; c.event = nlh->nlmsg_type; c.seq = nlh->nlmsg_seq; @@ -1064,6 +1068,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr xfrm_pol_put(xp); +out: return err; } diff --git a/security/dummy.c b/security/dummy.c index 8ccccccc12ac..64f6da0f422e 100644 --- a/security/dummy.c +++ b/security/dummy.c @@ -810,6 +810,11 @@ static void dummy_xfrm_policy_free_security(struct xfrm_policy *xp) { } +static int dummy_xfrm_policy_delete_security(struct xfrm_policy *xp) +{ + return 0; +} + static int dummy_xfrm_state_alloc_security(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx) { return 0; @@ -819,6 +824,11 @@ static void dummy_xfrm_state_free_security(struct xfrm_state *x) { } +static int dummy_xfrm_state_delete_security(struct xfrm_state *x) +{ + return 0; +} + static int dummy_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir) { return 0; @@ -1024,8 +1034,10 @@ void security_fixup_ops (struct security_operations *ops) set_to_dummy_if_null(ops, xfrm_policy_alloc_security); set_to_dummy_if_null(ops, xfrm_policy_clone_security); set_to_dummy_if_null(ops, xfrm_policy_free_security); + set_to_dummy_if_null(ops, xfrm_policy_delete_security); set_to_dummy_if_null(ops, xfrm_state_alloc_security); set_to_dummy_if_null(ops, xfrm_state_free_security); + set_to_dummy_if_null(ops, xfrm_state_delete_security); set_to_dummy_if_null(ops, xfrm_policy_lookup); #endif /* CONFIG_SECURITY_NETWORK_XFRM */ #ifdef CONFIG_KEYS diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 90b4cdc0c948..cf7b62ca886a 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -4374,8 +4374,10 @@ static struct security_operations selinux_ops = { .xfrm_policy_alloc_security = selinux_xfrm_policy_alloc, .xfrm_policy_clone_security = selinux_xfrm_policy_clone, .xfrm_policy_free_security = selinux_xfrm_policy_free, + .xfrm_policy_delete_security = selinux_xfrm_policy_delete, .xfrm_state_alloc_security = selinux_xfrm_state_alloc, .xfrm_state_free_security = selinux_xfrm_state_free, + .xfrm_state_delete_security = selinux_xfrm_state_delete, .xfrm_policy_lookup = selinux_xfrm_policy_lookup, #endif }; diff --git a/security/selinux/include/xfrm.h b/security/selinux/include/xfrm.h index c10f1fc41502..f0f4e480ff99 100644 --- a/security/selinux/include/xfrm.h +++ b/security/selinux/include/xfrm.h @@ -9,8 +9,10 @@ int selinux_xfrm_policy_alloc(struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx); int selinux_xfrm_policy_clone(struct xfrm_policy *old, struct xfrm_policy *new); void selinux_xfrm_policy_free(struct xfrm_policy *xp); +int selinux_xfrm_policy_delete(struct xfrm_policy *xp); int selinux_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx); void selinux_xfrm_state_free(struct xfrm_state *x); +int selinux_xfrm_state_delete(struct xfrm_state *x); int selinux_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir); /* diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c index abe99d881376..0e24df41099f 100644 --- a/security/selinux/xfrm.c +++ b/security/selinux/xfrm.c @@ -132,10 +132,7 @@ static int selinux_xfrm_sec_ctx_alloc(struct xfrm_sec_ctx **ctxp, struct xfrm_us goto out; /* - * Does the subject have permission to set security or permission to - * do the relabel? - * Must be permitted to relabel from default socket type (process type) - * to specified context + * Does the subject have permission to set security context? */ rc = avc_has_perm(tsec->sid, ctx->ctx_sid, SECCLASS_ASSOCIATION, @@ -200,6 +197,23 @@ void selinux_xfrm_policy_free(struct xfrm_policy *xp) kfree(ctx); } +/* + * LSM hook implementation that authorizes deletion of labeled policies. + */ +int selinux_xfrm_policy_delete(struct xfrm_policy *xp) +{ + struct task_security_struct *tsec = current->security; + struct xfrm_sec_ctx *ctx = xp->security; + int rc = 0; + + if (ctx) + rc = avc_has_perm(tsec->sid, ctx->ctx_sid, + SECCLASS_ASSOCIATION, + ASSOCIATION__SETCONTEXT, NULL); + + return rc; +} + /* * LSM hook implementation that allocs and transfers sec_ctx spec to * xfrm_state. @@ -292,6 +306,23 @@ u32 selinux_socket_getpeer_dgram(struct sk_buff *skb) return SECSID_NULL; } + /* + * LSM hook implementation that authorizes deletion of labeled SAs. + */ +int selinux_xfrm_state_delete(struct xfrm_state *x) +{ + struct task_security_struct *tsec = current->security; + struct xfrm_sec_ctx *ctx = x->security; + int rc = 0; + + if (ctx) + rc = avc_has_perm(tsec->sid, ctx->ctx_sid, + SECCLASS_ASSOCIATION, + ASSOCIATION__SETCONTEXT, NULL); + + return rc; +} + /* * LSM hook that controls access to unlabelled packets. If * a xfrm_state is authorizable (defined by macro) then it was -- cgit v1.2.3-59-g8ed1b From 9dadaa19cb11a8db38072a92a3f95deab7a797fb Mon Sep 17 00:00:00 2001 From: Dave Jones Date: Thu, 8 Jun 2006 23:42:09 -0700 Subject: [NET]: NET_TCPPROBE Kconfig fix Just spotted this typo in a new option. Signed-off-by: Dave Jones Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- net/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/Kconfig b/net/Kconfig index ff0db804eed8..ccadc8e48152 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -220,7 +220,7 @@ config NET_TCPPROBE depends on INET && EXPERIMENTAL && PROC_FS && KPROBES ---help--- This module allows for capturing the changes to TCP connection - state in response to incoming patckets. It is used for debugging + state in response to incoming packets. It is used for debugging TCP congestion avoidance modules. If you don't understand what was just said, you don't need it: say N. -- cgit v1.2.3-59-g8ed1b From 6f68dc37759b1d6ff3b4d4a9d097605a09f8f043 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 8 Jun 2006 23:58:52 -0700 Subject: [NET]: Fix warnings after LSM-IPSEC changes. Assignment used as truth value in xfrm_del_sa() and xfrm_get_policy(). Wrong argument type declared for security_xfrm_state_delete() when SELINUX is disabled. Signed-off-by: David S. Miller --- include/linux/security.h | 2 +- net/xfrm/xfrm_user.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/include/linux/security.h b/include/linux/security.h index 14c9bd050607..4dfb1b84a9b3 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -2995,7 +2995,7 @@ static inline void security_xfrm_state_free(struct xfrm_state *x) { } -static inline int security_xfrm_state_delete(struct xfrm_policy *xp) +static inline int security_xfrm_state_delete(struct xfrm_state *x) { return 0; } diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index a3733d2db3ba..c21dc26141ea 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -427,7 +427,7 @@ static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma) if (x == NULL) return -ESRCH; - if (err = security_xfrm_state_delete(x)) + if ((err = security_xfrm_state_delete(x)) != 0) goto out; if (xfrm_state_kern(x)) { @@ -1057,7 +1057,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr MSG_DONTWAIT); } } else { - if (err = security_xfrm_policy_delete(xp)) + if ((err = security_xfrm_policy_delete(xp)) != 0) goto out; c.data.byid = p->index; c.event = nlh->nlmsg_type; -- cgit v1.2.3-59-g8ed1b From 984bc16cc92ea3c247bf34ad667cfb95331b9d3c Mon Sep 17 00:00:00 2001 From: James Morris Date: Fri, 9 Jun 2006 00:29:17 -0700 Subject: [SECMARK]: Add secmark support to core networking. Add a secmark field to the skbuff structure, to allow security subsystems to place security markings on network packets. This is similar to the nfmark field, except is intended for implementing security policy, rather than than networking policy. This patch was already acked in principle by Dave Miller. Signed-off-by: James Morris Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- include/linux/skbuff.h | 22 ++++++++++++++++++++++ net/Kconfig | 7 +++++++ net/core/skbuff.c | 3 ++- net/ipv4/ip_output.c | 1 + net/ipv4/netfilter/ipt_REJECT.c | 1 + net/ipv6/ip6_output.c | 1 + 6 files changed, 34 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 23bad3bf3c9d..fe2c58e5306f 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -210,6 +210,7 @@ enum { * @nf_bridge: Saved data about a bridged frame - see br_netfilter.c * @tc_index: Traffic control index * @tc_verd: traffic control verdict + * @secmark: security marking */ struct sk_buff { @@ -289,6 +290,9 @@ struct sk_buff { #ifdef CONFIG_NET_DMA dma_cookie_t dma_cookie; #endif +#ifdef CONFIG_NETWORK_SECMARK + __u32 secmark; +#endif /* These elements must be at the end, see alloc_skb() for details. */ @@ -1400,5 +1404,23 @@ static inline void nf_reset(struct sk_buff *skb) static inline void nf_reset(struct sk_buff *skb) {} #endif /* CONFIG_NETFILTER */ +#ifdef CONFIG_NETWORK_SECMARK +static inline void skb_copy_secmark(struct sk_buff *to, const struct sk_buff *from) +{ + to->secmark = from->secmark; +} + +static inline void skb_init_secmark(struct sk_buff *skb) +{ + skb->secmark = 0; +} +#else +static inline void skb_copy_secmark(struct sk_buff *to, const struct sk_buff *from) +{ } + +static inline void skb_init_secmark(struct sk_buff *skb) +{ } +#endif + #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ diff --git a/net/Kconfig b/net/Kconfig index ccadc8e48152..c6cec5aa5486 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -66,6 +66,13 @@ source "net/ipv6/Kconfig" endif # if INET +config NETWORK_SECMARK + bool "Security Marking" + help + This enables security marking of network packets, similar + to nfmark, but designated for security purposes. + If you are unsure how to answer this question, answer N. + menuconfig NETFILTER bool "Network packet filtering (replaces ipchains)" ---help--- diff --git a/net/core/skbuff.c b/net/core/skbuff.c index fb3770f9c094..96cdcbe24ba2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -464,7 +464,7 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) n->tc_verd = CLR_TC_MUNGED(n->tc_verd); C(input_dev); #endif - + skb_copy_secmark(n, skb); #endif C(truesize); atomic_set(&n->users, 1); @@ -526,6 +526,7 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #endif new->tc_index = old->tc_index; #endif + skb_copy_secmark(new, old); atomic_set(&new->users, 1); skb_shinfo(new)->tso_size = skb_shinfo(old)->tso_size; skb_shinfo(new)->tso_segs = skb_shinfo(old)->tso_segs; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index cff9c3a72daf..d4bb3fae4e49 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -410,6 +410,7 @@ static void ip_copy_metadata(struct sk_buff *to, struct sk_buff *from) nf_bridge_get(to->nf_bridge); #endif #endif + skb_copy_secmark(to, from); } /* diff --git a/net/ipv4/netfilter/ipt_REJECT.c b/net/ipv4/netfilter/ipt_REJECT.c index 0bba3c2bb786..431a3ce6f7b7 100644 --- a/net/ipv4/netfilter/ipt_REJECT.c +++ b/net/ipv4/netfilter/ipt_REJECT.c @@ -147,6 +147,7 @@ static void send_reset(struct sk_buff *oldskb, int hook) /* This packet will not be the same as the other: clear nf fields */ nf_reset(nskb); nskb->nfmark = 0; + skb_init_secmark(nskb); tcph = (struct tcphdr *)((u_int32_t*)nskb->nh.iph + nskb->nh.iph->ihl); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 416f6e428a0a..d29620f4910e 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -459,6 +459,7 @@ static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) nf_bridge_get(to->nf_bridge); #endif #endif + skb_copy_secmark(to, from); } int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) -- cgit v1.2.3-59-g8ed1b From 5e6874cdb8de94cd3c15d853a8ef9c6f4c305055 Mon Sep 17 00:00:00 2001 From: James Morris Date: Fri, 9 Jun 2006 00:30:57 -0700 Subject: [SECMARK]: Add xtables SECMARK target Add a SECMARK target to xtables, allowing the admin to apply security marks to packets via both iptables and ip6tables. The target currently handles SELinux security marking, but can be extended for other purposes as needed. Signed-off-by: James Morris Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- include/linux/netfilter/xt_SECMARK.h | 26 ++++++ net/netfilter/Kconfig | 9 ++ net/netfilter/Makefile | 1 + net/netfilter/xt_SECMARK.c | 156 +++++++++++++++++++++++++++++++++++ 4 files changed, 192 insertions(+) create mode 100644 include/linux/netfilter/xt_SECMARK.h create mode 100644 net/netfilter/xt_SECMARK.c (limited to 'net') diff --git a/include/linux/netfilter/xt_SECMARK.h b/include/linux/netfilter/xt_SECMARK.h new file mode 100644 index 000000000000..c53fbffa997d --- /dev/null +++ b/include/linux/netfilter/xt_SECMARK.h @@ -0,0 +1,26 @@ +#ifndef _XT_SECMARK_H_target +#define _XT_SECMARK_H_target + +/* + * This is intended for use by various security subsystems (but not + * at the same time). + * + * 'mode' refers to the specific security subsystem which the + * packets are being marked for. + */ +#define SECMARK_MODE_SEL 0x01 /* SELinux */ +#define SECMARK_SELCTX_MAX 256 + +struct xt_secmark_target_selinux_info { + u_int32_t selsid; + char selctx[SECMARK_SELCTX_MAX]; +}; + +struct xt_secmark_target_info { + u_int8_t mode; + union { + struct xt_secmark_target_selinux_info sel; + } u; +}; + +#endif /*_XT_SECMARK_H_target */ diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 85a7e1770252..10eccdd4d6ea 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -174,6 +174,15 @@ config NETFILTER_XT_TARGET_NOTRACK If you want to compile it as a module, say M here and read . If unsure, say `N'. +config NETFILTER_XT_TARGET_SECMARK + tristate '"SECMARK" target support' + depends on NETFILTER_XTABLES && NETWORK_SECMARK + help + The SECMARK target allows security marking of network + packets, for use with security subsystems. + + To compile it as a module, choose M here. If unsure, say N. + config NETFILTER_XT_MATCH_COMMENT tristate '"comment" match support' depends on NETFILTER_XTABLES diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index df1da25f40df..3645de046890 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -28,6 +28,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_CONNMARK) += xt_CONNMARK.o obj-$(CONFIG_NETFILTER_XT_TARGET_MARK) += xt_MARK.o obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o obj-$(CONFIG_NETFILTER_XT_TARGET_NOTRACK) += xt_NOTRACK.o +obj-$(CONFIG_NETFILTER_XT_TARGET_SECMARK) += xt_SECMARK.o # matches obj-$(CONFIG_NETFILTER_XT_MATCH_COMMENT) += xt_comment.o diff --git a/net/netfilter/xt_SECMARK.c b/net/netfilter/xt_SECMARK.c new file mode 100644 index 000000000000..c2ce9c4011cc --- /dev/null +++ b/net/netfilter/xt_SECMARK.c @@ -0,0 +1,156 @@ +/* + * Module for modifying the secmark field of the skb, for use by + * security subsystems. + * + * Based on the nfmark match by: + * (C) 1999-2001 Marc Boucher + * + * (C) 2006 Red Hat, Inc., James Morris + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#include +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("James Morris "); +MODULE_DESCRIPTION("ip[6]tables SECMARK modification module"); +MODULE_ALIAS("ipt_SECMARK"); +MODULE_ALIAS("ip6t_SECMARK"); + +#define PFX "SECMARK: " + +static u8 mode; + +static unsigned int target(struct sk_buff **pskb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, + const void *targinfo, void *userinfo) +{ + u32 secmark = 0; + const struct xt_secmark_target_info *info = targinfo; + + BUG_ON(info->mode != mode); + + switch (mode) { + case SECMARK_MODE_SEL: + secmark = info->u.sel.selsid; + break; + + default: + BUG(); + } + + if ((*pskb)->secmark != secmark) + (*pskb)->secmark = secmark; + + return XT_CONTINUE; +} + +static int checkentry_selinux(struct xt_secmark_target_info *info) +{ + int err; + struct xt_secmark_target_selinux_info *sel = &info->u.sel; + + err = selinux_string_to_sid(sel->selctx, &sel->selsid); + if (err) { + if (err == -EINVAL) + printk(KERN_INFO PFX "invalid SELinux context \'%s\'\n", + sel->selctx); + return 0; + } + + if (!sel->selsid) { + printk(KERN_INFO PFX "unable to map SELinux context \'%s\'\n", + sel->selctx); + return 0; + } + + err = selinux_relabel_packet_permission(sel->selsid); + if (err) { + printk(KERN_INFO PFX "unable to obtain relabeling permission\n"); + return 0; + } + + return 1; +} + +static int checkentry(const char *tablename, const void *entry, + const struct xt_target *target, void *targinfo, + unsigned int targinfosize, unsigned int hook_mask) +{ + struct xt_secmark_target_info *info = targinfo; + + if (mode && mode != info->mode) { + printk(KERN_INFO PFX "mode already set to %hu cannot mix with " + "rules for mode %hu\n", mode, info->mode); + return 0; + } + + switch (info->mode) { + case SECMARK_MODE_SEL: + if (!checkentry_selinux(info)) + return 0; + break; + + default: + printk(KERN_INFO PFX "invalid mode: %hu\n", info->mode); + return 0; + } + + if (!mode) + mode = info->mode; + return 1; +} + +static struct xt_target ipt_secmark_reg = { + .name = "SECMARK", + .target = target, + .targetsize = sizeof(struct xt_secmark_target_info), + .table = "mangle", + .checkentry = checkentry, + .me = THIS_MODULE, + .family = AF_INET, + .revision = 0, +}; + +static struct xt_target ip6t_secmark_reg = { + .name = "SECMARK", + .target = target, + .targetsize = sizeof(struct xt_secmark_target_info), + .table = "mangle", + .checkentry = checkentry, + .me = THIS_MODULE, + .family = AF_INET6, + .revision = 0, +}; + +static int __init xt_secmark_init(void) +{ + int err; + + err = xt_register_target(&ipt_secmark_reg); + if (err) + return err; + + err = xt_register_target(&ip6t_secmark_reg); + if (err) + xt_unregister_target(&ipt_secmark_reg); + + return err; +} + +static void __exit xt_secmark_fini(void) +{ + xt_unregister_target(&ip6t_secmark_reg); + xt_unregister_target(&ipt_secmark_reg); +} + +module_init(xt_secmark_init); +module_exit(xt_secmark_fini); -- cgit v1.2.3-59-g8ed1b From 7c9728c393dceb724d66d696cfabce82151a78e5 Mon Sep 17 00:00:00 2001 From: James Morris Date: Fri, 9 Jun 2006 00:31:46 -0700 Subject: [SECMARK]: Add secmark support to conntrack Add a secmark field to IP and NF conntracks, so that security markings on packets can be copied to their associated connections, and also copied back to packets as required. This is similar to the network mark field currently used with conntrack, although it is intended for enforcement of security policy rather than network policy. Signed-off-by: James Morris Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- include/linux/netfilter_ipv4/ip_conntrack.h | 4 ++++ include/net/netfilter/nf_conntrack.h | 4 ++++ include/net/netfilter/nf_conntrack_compat.h | 26 ++++++++++++++++++++++++++ net/ipv4/netfilter/Kconfig | 12 ++++++++++++ net/ipv4/netfilter/ip_conntrack_core.c | 3 +++ net/ipv4/netfilter/ip_conntrack_standalone.c | 5 +++++ net/netfilter/Kconfig | 12 ++++++++++++ net/netfilter/nf_conntrack_core.c | 3 +++ net/netfilter/nf_conntrack_standalone.c | 5 +++++ 9 files changed, 74 insertions(+) (limited to 'net') diff --git a/include/linux/netfilter_ipv4/ip_conntrack.h b/include/linux/netfilter_ipv4/ip_conntrack.h index 17d7ef938a09..e0e9951eb8c3 100644 --- a/include/linux/netfilter_ipv4/ip_conntrack.h +++ b/include/linux/netfilter_ipv4/ip_conntrack.h @@ -121,6 +121,10 @@ struct ip_conntrack u_int32_t mark; #endif +#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK + u_int32_t secmark; +#endif + /* Traversed often, so hopefully in different cacheline to top */ /* These are my tuples; original and reply */ struct ip_conntrack_tuple_hash tuplehash[IP_CT_DIR_MAX]; diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index dbe7a114d0c5..411117815807 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -114,6 +114,10 @@ struct nf_conn u_int32_t mark; #endif +#ifdef CONFIG_NF_CONNTRACK_SECMARK + u_int32_t secmark; +#endif + /* Storage reserved for other modules: */ union nf_conntrack_proto proto; diff --git a/include/net/netfilter/nf_conntrack_compat.h b/include/net/netfilter/nf_conntrack_compat.h index 3cac19fb3648..f1b1482d7200 100644 --- a/include/net/netfilter/nf_conntrack_compat.h +++ b/include/net/netfilter/nf_conntrack_compat.h @@ -20,6 +20,19 @@ static inline u_int32_t *nf_ct_get_mark(const struct sk_buff *skb, } #endif /* CONFIG_IP_NF_CONNTRACK_MARK */ +#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK +static inline u_int32_t *nf_ct_get_secmark(const struct sk_buff *skb, + u_int32_t *ctinfo) +{ + struct ip_conntrack *ct = ip_conntrack_get(skb, ctinfo); + + if (ct) + return &ct->secmark; + else + return NULL; +} +#endif /* CONFIG_IP_NF_CONNTRACK_SECMARK */ + #ifdef CONFIG_IP_NF_CT_ACCT static inline struct ip_conntrack_counter * nf_ct_get_counters(const struct sk_buff *skb) @@ -70,6 +83,19 @@ static inline u_int32_t *nf_ct_get_mark(const struct sk_buff *skb, } #endif /* CONFIG_NF_CONNTRACK_MARK */ +#ifdef CONFIG_NF_CONNTRACK_SECMARK +static inline u_int32_t *nf_ct_get_secmark(const struct sk_buff *skb, + u_int32_t *ctinfo) +{ + struct nf_conn *ct = nf_ct_get(skb, ctinfo); + + if (ct) + return &ct->secmark; + else + return NULL; +} +#endif /* CONFIG_NF_CONNTRACK_MARK */ + #ifdef CONFIG_NF_CT_ACCT static inline struct ip_conntrack_counter * nf_ct_get_counters(const struct sk_buff *skb) diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index ff4b118f14a9..e1d7f5fbc526 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -55,6 +55,18 @@ config IP_NF_CONNTRACK_MARK of packets, but this mark value is kept in the conntrack session instead of the individual packets. +config IP_NF_CONNTRACK_SECMARK + bool 'Connection tracking security mark support' + depends on IP_NF_CONNTRACK && NETWORK_SECMARK + help + This option enables security markings to be applied to + connections. Typically they are copied to connections from + packets using the CONNSECMARK target and copied back from + connections to packets with the same target, with the packets + being originally labeled via SECMARK. + + If unsure, say 'N'. + config IP_NF_CONNTRACK_EVENTS bool "Connection tracking events (EXPERIMENTAL)" depends on EXPERIMENTAL && IP_NF_CONNTRACK diff --git a/net/ipv4/netfilter/ip_conntrack_core.c b/net/ipv4/netfilter/ip_conntrack_core.c index 4fe9e69378df..7e4cf9a4d15f 100644 --- a/net/ipv4/netfilter/ip_conntrack_core.c +++ b/net/ipv4/netfilter/ip_conntrack_core.c @@ -723,6 +723,9 @@ init_conntrack(struct ip_conntrack_tuple *tuple, defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE) /* this is ugly, but there is no other place where to put it */ conntrack->nat.masq_index = exp->master->nat.masq_index; +#endif +#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK + conntrack->secmark = exp->master->secmark; #endif nf_conntrack_get(&conntrack->master->ct_general); CONNTRACK_STAT_INC(expect_new); diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index 6cb9b989d14c..88445aac3f28 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -189,6 +189,11 @@ static int ct_seq_show(struct seq_file *s, void *v) return -ENOSPC; #endif +#ifdef CONFIG_IP_NF_CONNTRACK_SECMARK + if (seq_printf(s, "secmark=%u ", conntrack->secmark)) + return -ENOSPC; +#endif + if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use))) return -ENOSPC; diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 10eccdd4d6ea..023f81e5f96b 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -60,6 +60,18 @@ config NF_CONNTRACK_MARK of packets, but this mark value is kept in the conntrack session instead of the individual packets. +config NF_CONNTRACK_SECMARK + bool 'Connection tracking security mark support' + depends on NF_CONNTRACK && NETWORK_SECMARK + help + This option enables security markings to be applied to + connections. Typically they are copied to connections from + packets using the CONNSECMARK target and copied back from + connections to packets with the same target, with the packets + being originally labeled via SECMARK. + + If unsure, say 'N'. + config NF_CONNTRACK_EVENTS bool "Connection tracking events (EXPERIMENTAL)" depends on EXPERIMENTAL && NF_CONNTRACK diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index bc2bd4c3859e..cd299f4b7db1 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -989,6 +989,9 @@ init_conntrack(const struct nf_conntrack_tuple *tuple, conntrack->master = exp->master; #ifdef CONFIG_NF_CONNTRACK_MARK conntrack->mark = exp->master->mark; +#endif +#ifdef CONFIG_NF_CONNTRACK_SECMARK + conntrack->secmark = exp->master->secmark; #endif nf_conntrack_get(&conntrack->master->ct_general); NF_CT_STAT_INC(expect_new); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index e01d20d8e287..e34c574f0351 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -213,6 +213,11 @@ static int ct_seq_show(struct seq_file *s, void *v) return -ENOSPC; #endif +#ifdef CONFIG_NF_CONNTRACK_SECMARK + if (seq_printf(s, "secmark=%u ", conntrack->secmark)) + return -ENOSPC; +#endif + if (seq_printf(s, "use=%u\n", atomic_read(&conntrack->ct_general.use))) return -ENOSPC; -- cgit v1.2.3-59-g8ed1b From 100468e9c05c10fb6872751c1af523b996d6afa9 Mon Sep 17 00:00:00 2001 From: James Morris Date: Fri, 9 Jun 2006 00:32:39 -0700 Subject: [SECMARK]: Add CONNSECMARK xtables target Add a new xtables target, CONNSECMARK, which is used to specify rules for copying security marks from packets to connections, and for copyying security marks back from connections to packets. This is similar to the CONNMARK target, but is more limited in scope in that it only allows copying of security marks to and from packets, as this is all it needs to do. A typical scenario would be to apply a security mark to a 'new' packet with SECMARK, then copy that to its conntrack via CONNMARK, and then restore the security mark from the connection to established and related packets on that connection. Signed-off-by: James Morris Signed-off-by: Andrew Morton Signed-off-by: David S. Miller --- include/linux/netfilter/xt_CONNSECMARK.h | 13 +++ net/netfilter/Kconfig | 11 +++ net/netfilter/Makefile | 1 + net/netfilter/xt_CONNSECMARK.c | 155 +++++++++++++++++++++++++++++++ 4 files changed, 180 insertions(+) create mode 100644 include/linux/netfilter/xt_CONNSECMARK.h create mode 100644 net/netfilter/xt_CONNSECMARK.c (limited to 'net') diff --git a/include/linux/netfilter/xt_CONNSECMARK.h b/include/linux/netfilter/xt_CONNSECMARK.h new file mode 100644 index 000000000000..c6bd75469ba2 --- /dev/null +++ b/include/linux/netfilter/xt_CONNSECMARK.h @@ -0,0 +1,13 @@ +#ifndef _XT_CONNSECMARK_H_target +#define _XT_CONNSECMARK_H_target + +enum { + CONNSECMARK_SAVE = 1, + CONNSECMARK_RESTORE, +}; + +struct xt_connsecmark_target_info { + u_int8_t mode; +}; + +#endif /*_XT_CONNSECMARK_H_target */ diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 023f81e5f96b..b1622b7de1cf 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -195,6 +195,17 @@ config NETFILTER_XT_TARGET_SECMARK To compile it as a module, choose M here. If unsure, say N. +config NETFILTER_XT_TARGET_CONNSECMARK + tristate '"CONNSECMARK" target support' + depends on NETFILTER_XTABLES && (NF_CONNTRACK_SECMARK || IP_NF_CONNTRACK_SECMARK) + help + The CONNSECMARK target copies security markings from packets + to connections, and restores security markings from connections + to packets (if the packets are not already marked). This would + normally be used in conjunction with the SECMARK target. + + To compile it as a module, choose M here. If unsure, say N. + config NETFILTER_XT_MATCH_COMMENT tristate '"comment" match support' depends on NETFILTER_XTABLES diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 3645de046890..6fa4b7580458 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -29,6 +29,7 @@ obj-$(CONFIG_NETFILTER_XT_TARGET_MARK) += xt_MARK.o obj-$(CONFIG_NETFILTER_XT_TARGET_NFQUEUE) += xt_NFQUEUE.o obj-$(CONFIG_NETFILTER_XT_TARGET_NOTRACK) += xt_NOTRACK.o obj-$(CONFIG_NETFILTER_XT_TARGET_SECMARK) += xt_SECMARK.o +obj-$(CONFIG_NETFILTER_XT_TARGET_CONNSECMARK) += xt_CONNSECMARK.o # matches obj-$(CONFIG_NETFILTER_XT_MATCH_COMMENT) += xt_comment.o diff --git a/net/netfilter/xt_CONNSECMARK.c b/net/netfilter/xt_CONNSECMARK.c new file mode 100644 index 000000000000..8c011e020769 --- /dev/null +++ b/net/netfilter/xt_CONNSECMARK.c @@ -0,0 +1,155 @@ +/* + * This module is used to copy security markings from packets + * to connections, and restore security markings from connections + * back to packets. This would normally be performed in conjunction + * with the SECMARK target and state match. + * + * Based somewhat on CONNMARK: + * Copyright (C) 2002,2004 MARA Systems AB + * by Henrik Nordstrom + * + * (C) 2006 Red Hat, Inc., James Morris + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ +#include +#include +#include +#include +#include + +#define PFX "CONNSECMARK: " + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("James Morris "); +MODULE_DESCRIPTION("ip[6]tables CONNSECMARK module"); +MODULE_ALIAS("ipt_CONNSECMARK"); +MODULE_ALIAS("ip6t_CONNSECMARK"); + +/* + * If the packet has a security mark and the connection does not, copy + * the security mark from the packet to the connection. + */ +static void secmark_save(struct sk_buff *skb) +{ + if (skb->secmark) { + u32 *connsecmark; + enum ip_conntrack_info ctinfo; + + connsecmark = nf_ct_get_secmark(skb, &ctinfo); + if (connsecmark && !*connsecmark) + if (*connsecmark != skb->secmark) + *connsecmark = skb->secmark; + } +} + +/* + * If packet has no security mark, and the connection does, restore the + * security mark from the connection to the packet. + */ +static void secmark_restore(struct sk_buff *skb) +{ + if (!skb->secmark) { + u32 *connsecmark; + enum ip_conntrack_info ctinfo; + + connsecmark = nf_ct_get_secmark(skb, &ctinfo); + if (connsecmark && *connsecmark) + if (skb->secmark != *connsecmark) + skb->secmark = *connsecmark; + } +} + +static unsigned int target(struct sk_buff **pskb, const struct net_device *in, + const struct net_device *out, unsigned int hooknum, + const struct xt_target *target, + const void *targinfo, void *userinfo) +{ + struct sk_buff *skb = *pskb; + const struct xt_connsecmark_target_info *info = targinfo; + + switch (info->mode) { + case CONNSECMARK_SAVE: + secmark_save(skb); + break; + + case CONNSECMARK_RESTORE: + secmark_restore(skb); + break; + + default: + BUG(); + } + + return XT_CONTINUE; +} + +static int checkentry(const char *tablename, const void *entry, + const struct xt_target *target, void *targinfo, + unsigned int targinfosize, unsigned int hook_mask) +{ + struct xt_connsecmark_target_info *info = targinfo; + + switch (info->mode) { + case CONNSECMARK_SAVE: + case CONNSECMARK_RESTORE: + break; + + default: + printk(KERN_INFO PFX "invalid mode: %hu\n", info->mode); + return 0; + } + + return 1; +} + +static struct xt_target ipt_connsecmark_reg = { + .name = "CONNSECMARK", + .target = target, + .targetsize = sizeof(struct xt_connsecmark_target_info), + .table = "mangle", + .checkentry = checkentry, + .me = THIS_MODULE, + .family = AF_INET, + .revision = 0, +}; + +static struct xt_target ip6t_connsecmark_reg = { + .name = "CONNSECMARK", + .target = target, + .targetsize = sizeof(struct xt_connsecmark_target_info), + .table = "mangle", + .checkentry = checkentry, + .me = THIS_MODULE, + .family = AF_INET6, + .revision = 0, +}; + +static int __init xt_connsecmark_init(void) +{ + int err; + + need_conntrack(); + + err = xt_register_target(&ipt_connsecmark_reg); + if (err) + return err; + + err = xt_register_target(&ip6t_connsecmark_reg); + if (err) + xt_unregister_target(&ipt_connsecmark_reg); + + return err; +} + +static void __exit xt_connsecmark_fini(void) +{ + xt_unregister_target(&ip6t_connsecmark_reg); + xt_unregister_target(&ipt_connsecmark_reg); +} + +module_init(xt_connsecmark_init); +module_exit(xt_connsecmark_fini); -- cgit v1.2.3-59-g8ed1b From a0e889bb1bdc083dbbdb02cce9698765847b841f Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 9 Jun 2006 12:17:41 -0700 Subject: [NETFILTER]: recent match: fix "sleeping function called from invalid context" create_proc_entry must not be called with locks held. Use a mutex instead to protect data only changed in user context. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ipt_recent.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c index 9686c4d2057d..9b09e481957d 100644 --- a/net/ipv4/netfilter/ipt_recent.c +++ b/net/ipv4/netfilter/ipt_recent.c @@ -69,6 +69,7 @@ struct recent_table { static LIST_HEAD(tables); static DEFINE_SPINLOCK(recent_lock); +static DEFINE_MUTEX(recent_mutex); #ifdef CONFIG_PROC_FS static struct proc_dir_entry *proc_dir; @@ -249,7 +250,7 @@ ipt_recent_checkentry(const char *tablename, const void *ip, strnlen(info->name, IPT_RECENT_NAME_LEN) == IPT_RECENT_NAME_LEN) return 0; - spin_lock_bh(&recent_lock); + mutex_lock(&recent_mutex); t = recent_table_lookup(info->name); if (t != NULL) { t->refcnt++; @@ -258,7 +259,7 @@ ipt_recent_checkentry(const char *tablename, const void *ip, } t = kzalloc(sizeof(*t) + sizeof(t->iphash[0]) * ip_list_hash_size, - GFP_ATOMIC); + GFP_KERNEL); if (t == NULL) goto out; strcpy(t->name, info->name); @@ -274,10 +275,12 @@ ipt_recent_checkentry(const char *tablename, const void *ip, t->proc->proc_fops = &recent_fops; t->proc->data = t; #endif + spin_lock_bh(&recent_lock); list_add_tail(&t->list, &tables); + spin_unlock_bh(&recent_lock); ret = 1; out: - spin_unlock_bh(&recent_lock); + mutex_unlock(&recent_mutex); return ret; } @@ -288,17 +291,19 @@ ipt_recent_destroy(const struct xt_match *match, void *matchinfo, const struct ipt_recent_info *info = matchinfo; struct recent_table *t; - spin_lock_bh(&recent_lock); + mutex_lock(&recent_mutex); t = recent_table_lookup(info->name); if (--t->refcnt == 0) { + spin_lock_bh(&recent_lock); list_del(&t->list); + spin_unlock_bh(&recent_lock); recent_table_flush(t); #ifdef CONFIG_PROC_FS remove_proc_entry(t->name, proc_dir); #endif kfree(t); } - spin_unlock_bh(&recent_lock); + mutex_unlock(&recent_mutex); } #ifdef CONFIG_PROC_FS -- cgit v1.2.3-59-g8ed1b From 2b2283d0302d520f08ded41c2ca17886dfbb865a Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 9 Jun 2006 12:18:17 -0700 Subject: [NETFILTER]: recent match: missing refcnt initialization Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ipt_recent.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/netfilter/ipt_recent.c b/net/ipv4/netfilter/ipt_recent.c index 9b09e481957d..61a2139f9cfd 100644 --- a/net/ipv4/netfilter/ipt_recent.c +++ b/net/ipv4/netfilter/ipt_recent.c @@ -262,6 +262,7 @@ ipt_recent_checkentry(const char *tablename, const void *ip, GFP_KERNEL); if (t == NULL) goto out; + t->refcnt = 1; strcpy(t->name, info->name); INIT_LIST_HEAD(&t->lru_list); for (i = 0; i < ip_list_hash_size; i++) -- cgit v1.2.3-59-g8ed1b From bf0857ea32addb6bc8b46383604b218b8ec09f19 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Fri, 9 Jun 2006 12:18:47 -0700 Subject: [NETFILTER]: hashlimit match: fix random initialization hashlimit does: if (!ht->rnd) get_random_bytes(&ht->rnd, 4); ignoring that 0 is also a valid random number. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv4/netfilter/ipt_hashlimit.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ipt_hashlimit.c b/net/ipv4/netfilter/ipt_hashlimit.c index 85edfb79469a..92980ab8ce48 100644 --- a/net/ipv4/netfilter/ipt_hashlimit.c +++ b/net/ipv4/netfilter/ipt_hashlimit.c @@ -80,6 +80,7 @@ struct ipt_hashlimit_htable { /* used internally */ spinlock_t lock; /* lock for list_head */ u_int32_t rnd; /* random seed for hash */ + int rnd_initialized; struct timer_list timer; /* timer for gc */ atomic_t count; /* number entries in table */ @@ -134,8 +135,10 @@ __dsthash_alloc_init(struct ipt_hashlimit_htable *ht, struct dsthash_dst *dst) /* initialize hash with random val at the time we allocate * the first hashtable entry */ - if (!ht->rnd) + if (!ht->rnd_initialized) { get_random_bytes(&ht->rnd, 4); + ht->rnd_initialized = 1; + } if (ht->cfg.max && atomic_read(&ht->count) >= ht->cfg.max) { @@ -214,7 +217,7 @@ static int htable_create(struct ipt_hashlimit_info *minfo) atomic_set(&hinfo->count, 0); atomic_set(&hinfo->use, 1); - hinfo->rnd = 0; + hinfo->rnd_initialized = 0; spin_lock_init(&hinfo->lock); hinfo->pde = create_proc_entry(minfo->name, 0, hashlimit_procdir); if (!hinfo->pde) { -- cgit v1.2.3-59-g8ed1b From 932ff279a43ab7257942cddff2595acd541cc49b Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 9 Jun 2006 12:20:56 -0700 Subject: [NET]: Add netif_tx_lock Various drivers use xmit_lock internally to synchronise with their transmission routines. They do so without setting xmit_lock_owner. This is fine as long as netpoll is not in use. With netpoll it is possible for deadlocks to occur if xmit_lock_owner isn't set. This is because if a printk occurs while xmit_lock is held and xmit_lock_owner is not set can cause netpoll to attempt to take xmit_lock recursively. While it is possible to resolve this by getting netpoll to use trylock, it is suboptimal because netpoll's sole objective is to maximise the chance of getting the printk out on the wire. So delaying or dropping the message is to be avoided as much as possible. So the only alternative is to always set xmit_lock_owner. The following patch does this by introducing the netif_tx_lock family of functions that take care of setting/unsetting xmit_lock_owner. I renamed xmit_lock to _xmit_lock to indicate that it should not be used directly. I didn't provide irq versions of the netif_tx_lock functions since xmit_lock is meant to be a BH-disabling lock. This is pretty much a straight text substitution except for a small bug fix in winbond. It currently uses netif_stop_queue/spin_unlock_wait to stop transmission. This is unsafe as an IRQ can potentially wake up the queue. So it is safer to use netif_tx_disable. The hamradio bits used spin_lock_irq but it is unnecessary as xmit_lock must never be taken in an IRQ handler. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- Documentation/networking/netdevices.txt | 8 +++--- drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 6 ++-- drivers/media/dvb/dvb-core/dvb_net.c | 4 +-- drivers/net/bnx2.c | 4 +-- drivers/net/bonding/bond_main.c | 2 +- drivers/net/forcedeth.c | 18 ++++++------ drivers/net/hamradio/6pack.c | 8 +++--- drivers/net/hamradio/mkiss.c | 8 +++--- drivers/net/ifb.c | 10 +++---- drivers/net/irda/vlsi_ir.c | 2 +- drivers/net/natsemi.c | 4 +-- drivers/net/tulip/winbond-840.c | 9 +++--- drivers/net/wireless/orinoco.c | 4 ++- include/linux/netdevice.h | 38 ++++++++++++++++++++++++-- net/atm/clip.c | 4 +-- net/core/dev.c | 12 ++++---- net/core/dev_mcast.c | 28 +++++++++---------- net/core/netpoll.c | 9 ++---- net/core/pktgen.c | 4 +-- net/sched/sch_generic.c | 28 ++++++++----------- net/sched/sch_teql.c | 9 ++---- 21 files changed, 121 insertions(+), 98 deletions(-) (limited to 'net') diff --git a/Documentation/networking/netdevices.txt b/Documentation/networking/netdevices.txt index 3c0a5ba614d7..847cedb238f6 100644 --- a/Documentation/networking/netdevices.txt +++ b/Documentation/networking/netdevices.txt @@ -42,9 +42,9 @@ dev->get_stats: Context: nominally process, but don't sleep inside an rwlock dev->hard_start_xmit: - Synchronization: dev->xmit_lock spinlock. + Synchronization: netif_tx_lock spinlock. When the driver sets NETIF_F_LLTX in dev->features this will be - called without holding xmit_lock. In this case the driver + called without holding netif_tx_lock. In this case the driver has to lock by itself when needed. It is recommended to use a try lock for this and return -1 when the spin lock fails. The locking there should also properly protect against @@ -62,12 +62,12 @@ dev->hard_start_xmit: Only valid when NETIF_F_LLTX is set. dev->tx_timeout: - Synchronization: dev->xmit_lock spinlock. + Synchronization: netif_tx_lock spinlock. Context: BHs disabled Notes: netif_queue_stopped() is guaranteed true dev->set_multicast_list: - Synchronization: dev->xmit_lock spinlock. + Synchronization: netif_tx_lock spinlock. Context: BHs disabled dev->poll: diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index 1dae4b238252..1d917edcf9ba 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -821,7 +821,8 @@ void ipoib_mcast_restart_task(void *dev_ptr) ipoib_mcast_stop_thread(dev, 0); - spin_lock_irqsave(&dev->xmit_lock, flags); + local_irq_save(flags); + netif_tx_lock(dev); spin_lock(&priv->lock); /* @@ -896,7 +897,8 @@ void ipoib_mcast_restart_task(void *dev_ptr) } spin_unlock(&priv->lock); - spin_unlock_irqrestore(&dev->xmit_lock, flags); + netif_tx_unlock(dev); + local_irq_restore(flags); /* We have to cancel outside of the spinlock */ list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { diff --git a/drivers/media/dvb/dvb-core/dvb_net.c b/drivers/media/dvb/dvb-core/dvb_net.c index 2f0f35811bf7..9fd87521a163 100644 --- a/drivers/media/dvb/dvb-core/dvb_net.c +++ b/drivers/media/dvb/dvb-core/dvb_net.c @@ -1052,7 +1052,7 @@ static void wq_set_multicast_list (void *data) dvb_net_feed_stop(dev); priv->rx_mode = RX_MODE_UNI; - spin_lock_bh(&dev->xmit_lock); + netif_tx_lock_bh(dev); if (dev->flags & IFF_PROMISC) { dprintk("%s: promiscuous mode\n", dev->name); @@ -1077,7 +1077,7 @@ static void wq_set_multicast_list (void *data) } } - spin_unlock_bh(&dev->xmit_lock); + netif_tx_unlock_bh(dev); dvb_net_feed_start(dev); } diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c index 54161aef3cac..9c5a8842ed0f 100644 --- a/drivers/net/bnx2.c +++ b/drivers/net/bnx2.c @@ -2009,7 +2009,7 @@ bnx2_poll(struct net_device *dev, int *budget) return 1; } -/* Called with rtnl_lock from vlan functions and also dev->xmit_lock +/* Called with rtnl_lock from vlan functions and also netif_tx_lock * from set_multicast. */ static void @@ -4252,7 +4252,7 @@ bnx2_vlan_rx_kill_vid(struct net_device *dev, uint16_t vid) } #endif -/* Called with dev->xmit_lock. +/* Called with netif_tx_lock. * hard_start_xmit is pseudo-lockless - a lock is only required when * the tx queue is full. This way, we get the benefit of lockless * operations most of the time without the complexities to handle diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 55d236726d11..46326cdfb277 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -4191,7 +4191,7 @@ static int bond_init(struct net_device *bond_dev, struct bond_params *params) */ bond_dev->features |= NETIF_F_VLAN_CHALLENGED; - /* don't acquire bond device's xmit_lock when + /* don't acquire bond device's netif_tx_lock when * transmitting */ bond_dev->features |= NETIF_F_LLTX; diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c index feb5b223cd60..5a8651b4b01d 100644 --- a/drivers/net/forcedeth.c +++ b/drivers/net/forcedeth.c @@ -533,9 +533,9 @@ typedef union _ring_type { * critical parts: * - rx is (pseudo-) lockless: it relies on the single-threading provided * by the arch code for interrupts. - * - tx setup is lockless: it relies on dev->xmit_lock. Actual submission + * - tx setup is lockless: it relies on netif_tx_lock. Actual submission * needs dev->priv->lock :-( - * - set_multicast_list: preparation lockless, relies on dev->xmit_lock. + * - set_multicast_list: preparation lockless, relies on netif_tx_lock. */ /* in dev: base, irq */ @@ -1213,7 +1213,7 @@ static void drain_ring(struct net_device *dev) /* * nv_start_xmit: dev->hard_start_xmit function - * Called with dev->xmit_lock held. + * Called with netif_tx_lock held. */ static int nv_start_xmit(struct sk_buff *skb, struct net_device *dev) { @@ -1407,7 +1407,7 @@ static void nv_tx_done(struct net_device *dev) /* * nv_tx_timeout: dev->tx_timeout function - * Called with dev->xmit_lock held. + * Called with netif_tx_lock held. */ static void nv_tx_timeout(struct net_device *dev) { @@ -1737,7 +1737,7 @@ static int nv_change_mtu(struct net_device *dev, int new_mtu) * Changing the MTU is a rare event, it shouldn't matter. */ nv_disable_irq(dev); - spin_lock_bh(&dev->xmit_lock); + netif_tx_lock_bh(dev); spin_lock(&np->lock); /* stop engines */ nv_stop_rx(dev); @@ -1768,7 +1768,7 @@ static int nv_change_mtu(struct net_device *dev, int new_mtu) nv_start_rx(dev); nv_start_tx(dev); spin_unlock(&np->lock); - spin_unlock_bh(&dev->xmit_lock); + netif_tx_unlock_bh(dev); nv_enable_irq(dev); } return 0; @@ -1803,7 +1803,7 @@ static int nv_set_mac_address(struct net_device *dev, void *addr) memcpy(dev->dev_addr, macaddr->sa_data, ETH_ALEN); if (netif_running(dev)) { - spin_lock_bh(&dev->xmit_lock); + netif_tx_lock_bh(dev); spin_lock_irq(&np->lock); /* stop rx engine */ @@ -1815,7 +1815,7 @@ static int nv_set_mac_address(struct net_device *dev, void *addr) /* restart rx engine */ nv_start_rx(dev); spin_unlock_irq(&np->lock); - spin_unlock_bh(&dev->xmit_lock); + netif_tx_unlock_bh(dev); } else { nv_copy_mac_to_hw(dev); } @@ -1824,7 +1824,7 @@ static int nv_set_mac_address(struct net_device *dev, void *addr) /* * nv_set_multicast: dev->set_multicast function - * Called with dev->xmit_lock held. + * Called with netif_tx_lock held. */ static void nv_set_multicast(struct net_device *dev) { diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c index 102c1f0b90da..d12605f0ac7c 100644 --- a/drivers/net/hamradio/6pack.c +++ b/drivers/net/hamradio/6pack.c @@ -308,9 +308,9 @@ static int sp_set_mac_address(struct net_device *dev, void *addr) { struct sockaddr_ax25 *sa = addr; - spin_lock_irq(&dev->xmit_lock); + netif_tx_lock_bh(dev); memcpy(dev->dev_addr, &sa->sax25_call, AX25_ADDR_LEN); - spin_unlock_irq(&dev->xmit_lock); + netif_tx_unlock_bh(dev); return 0; } @@ -767,9 +767,9 @@ static int sixpack_ioctl(struct tty_struct *tty, struct file *file, break; } - spin_lock_irq(&dev->xmit_lock); + netif_tx_lock_bh(dev); memcpy(dev->dev_addr, &addr, AX25_ADDR_LEN); - spin_unlock_irq(&dev->xmit_lock); + netif_tx_unlock_bh(dev); err = 0; break; diff --git a/drivers/net/hamradio/mkiss.c b/drivers/net/hamradio/mkiss.c index d81a8e1eeb8d..3ebbbe56b6e9 100644 --- a/drivers/net/hamradio/mkiss.c +++ b/drivers/net/hamradio/mkiss.c @@ -357,9 +357,9 @@ static int ax_set_mac_address(struct net_device *dev, void *addr) { struct sockaddr_ax25 *sa = addr; - spin_lock_irq(&dev->xmit_lock); + netif_tx_lock_bh(dev); memcpy(dev->dev_addr, &sa->sax25_call, AX25_ADDR_LEN); - spin_unlock_irq(&dev->xmit_lock); + netif_tx_unlock_bh(dev); return 0; } @@ -886,9 +886,9 @@ static int mkiss_ioctl(struct tty_struct *tty, struct file *file, break; } - spin_lock_irq(&dev->xmit_lock); + netif_tx_lock_bh(dev); memcpy(dev->dev_addr, addr, AX25_ADDR_LEN); - spin_unlock_irq(&dev->xmit_lock); + netif_tx_unlock_bh(dev); err = 0; break; diff --git a/drivers/net/ifb.c b/drivers/net/ifb.c index 31fb2d75dc44..2e222ef91e22 100644 --- a/drivers/net/ifb.c +++ b/drivers/net/ifb.c @@ -76,13 +76,13 @@ static void ri_tasklet(unsigned long dev) dp->st_task_enter++; if ((skb = skb_peek(&dp->tq)) == NULL) { dp->st_txq_refl_try++; - if (spin_trylock(&_dev->xmit_lock)) { + if (netif_tx_trylock(_dev)) { dp->st_rxq_enter++; while ((skb = skb_dequeue(&dp->rq)) != NULL) { skb_queue_tail(&dp->tq, skb); dp->st_rx2tx_tran++; } - spin_unlock(&_dev->xmit_lock); + netif_tx_unlock(_dev); } else { /* reschedule */ dp->st_rxq_notenter++; @@ -110,7 +110,7 @@ static void ri_tasklet(unsigned long dev) } } - if (spin_trylock(&_dev->xmit_lock)) { + if (netif_tx_trylock(_dev)) { dp->st_rxq_check++; if ((skb = skb_peek(&dp->rq)) == NULL) { dp->tasklet_pending = 0; @@ -118,10 +118,10 @@ static void ri_tasklet(unsigned long dev) netif_wake_queue(_dev); } else { dp->st_rxq_rsch++; - spin_unlock(&_dev->xmit_lock); + netif_tx_unlock(_dev); goto resched; } - spin_unlock(&_dev->xmit_lock); + netif_tx_unlock(_dev); } else { resched: dp->tasklet_pending = 1; diff --git a/drivers/net/irda/vlsi_ir.c b/drivers/net/irda/vlsi_ir.c index 97a49e0be76b..d70b9e8d6e60 100644 --- a/drivers/net/irda/vlsi_ir.c +++ b/drivers/net/irda/vlsi_ir.c @@ -959,7 +959,7 @@ static int vlsi_hard_start_xmit(struct sk_buff *skb, struct net_device *ndev) || (now.tv_sec==ready.tv_sec && now.tv_usec>=ready.tv_usec)) break; udelay(100); - /* must not sleep here - we are called under xmit_lock! */ + /* must not sleep here - called under netif_tx_lock! */ } } diff --git a/drivers/net/natsemi.c b/drivers/net/natsemi.c index 90627756d6fa..2e4ecedba057 100644 --- a/drivers/net/natsemi.c +++ b/drivers/net/natsemi.c @@ -318,12 +318,12 @@ performance critical codepaths: The rx process only runs in the interrupt handler. Access from outside the interrupt handler is only permitted after disable_irq(). -The rx process usually runs under the dev->xmit_lock. If np->intr_tx_reap +The rx process usually runs under the netif_tx_lock. If np->intr_tx_reap is set, then access is permitted under spin_lock_irq(&np->lock). Thus configuration functions that want to access everything must call disable_irq(dev->irq); - spin_lock_bh(dev->xmit_lock); + netif_tx_lock_bh(dev); spin_lock_irq(&np->lock); IV. Notes diff --git a/drivers/net/tulip/winbond-840.c b/drivers/net/tulip/winbond-840.c index 136a70c4d5e4..56d86c7c0386 100644 --- a/drivers/net/tulip/winbond-840.c +++ b/drivers/net/tulip/winbond-840.c @@ -1605,11 +1605,11 @@ static void __devexit w840_remove1 (struct pci_dev *pdev) * - get_stats: * spin_lock_irq(np->lock), doesn't touch hw if not present * - hard_start_xmit: - * netif_stop_queue + spin_unlock_wait(&dev->xmit_lock); + * synchronize_irq + netif_tx_disable; * - tx_timeout: - * netif_device_detach + spin_unlock_wait(&dev->xmit_lock); + * netif_device_detach + netif_tx_disable; * - set_multicast_list - * netif_device_detach + spin_unlock_wait(&dev->xmit_lock); + * netif_device_detach + netif_tx_disable; * - interrupt handler * doesn't touch hw if not present, synchronize_irq waits for * running instances of the interrupt handler. @@ -1635,11 +1635,10 @@ static int w840_suspend (struct pci_dev *pdev, pm_message_t state) netif_device_detach(dev); update_csr6(dev, 0); iowrite32(0, ioaddr + IntrEnable); - netif_stop_queue(dev); spin_unlock_irq(&np->lock); - spin_unlock_wait(&dev->xmit_lock); synchronize_irq(dev->irq); + netif_tx_disable(dev); np->stats.rx_missed_errors += ioread32(ioaddr + RxMissed) & 0xffff; diff --git a/drivers/net/wireless/orinoco.c b/drivers/net/wireless/orinoco.c index c2d0b09e0418..a5fcfcde63d1 100644 --- a/drivers/net/wireless/orinoco.c +++ b/drivers/net/wireless/orinoco.c @@ -1833,7 +1833,9 @@ static int __orinoco_program_rids(struct net_device *dev) /* Set promiscuity / multicast*/ priv->promiscuous = 0; priv->mc_count = 0; - __orinoco_set_multicast_list(dev); /* FIXME: what about the xmit_lock */ + + /* FIXME: what about netif_tx_lock */ + __orinoco_set_multicast_list(dev); return 0; } diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index b5760c67af9c..067b9ccafd87 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -407,7 +407,7 @@ struct net_device * One part is mostly used on xmit path (device) */ /* hard_start_xmit synchronizer */ - spinlock_t xmit_lock ____cacheline_aligned_in_smp; + spinlock_t _xmit_lock ____cacheline_aligned_in_smp; /* cpu id of processor entered to hard_start_xmit or -1, if nobody entered there. */ @@ -893,11 +893,43 @@ static inline void __netif_rx_complete(struct net_device *dev) clear_bit(__LINK_STATE_RX_SCHED, &dev->state); } +static inline void netif_tx_lock(struct net_device *dev) +{ + spin_lock(&dev->_xmit_lock); + dev->xmit_lock_owner = smp_processor_id(); +} + +static inline void netif_tx_lock_bh(struct net_device *dev) +{ + spin_lock_bh(&dev->_xmit_lock); + dev->xmit_lock_owner = smp_processor_id(); +} + +static inline int netif_tx_trylock(struct net_device *dev) +{ + int err = spin_trylock(&dev->_xmit_lock); + if (!err) + dev->xmit_lock_owner = smp_processor_id(); + return err; +} + +static inline void netif_tx_unlock(struct net_device *dev) +{ + dev->xmit_lock_owner = -1; + spin_unlock(&dev->_xmit_lock); +} + +static inline void netif_tx_unlock_bh(struct net_device *dev) +{ + dev->xmit_lock_owner = -1; + spin_unlock_bh(&dev->_xmit_lock); +} + static inline void netif_tx_disable(struct net_device *dev) { - spin_lock_bh(&dev->xmit_lock); + netif_tx_lock_bh(dev); netif_stop_queue(dev); - spin_unlock_bh(&dev->xmit_lock); + netif_tx_unlock_bh(dev); } /* These functions live elsewhere (drivers/net/net_init.c, but related) */ diff --git a/net/atm/clip.c b/net/atm/clip.c index 72d852982664..f92f9c94d2c7 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -98,7 +98,7 @@ static void unlink_clip_vcc(struct clip_vcc *clip_vcc) printk(KERN_CRIT "!clip_vcc->entry (clip_vcc %p)\n", clip_vcc); return; } - spin_lock_bh(&entry->neigh->dev->xmit_lock); /* block clip_start_xmit() */ + netif_tx_lock_bh(entry->neigh->dev); /* block clip_start_xmit() */ entry->neigh->used = jiffies; for (walk = &entry->vccs; *walk; walk = &(*walk)->next) if (*walk == clip_vcc) { @@ -122,7 +122,7 @@ static void unlink_clip_vcc(struct clip_vcc *clip_vcc) printk(KERN_CRIT "ATMARP: unlink_clip_vcc failed (entry %p, vcc " "0x%p)\n", entry, clip_vcc); out: - spin_unlock_bh(&entry->neigh->dev->xmit_lock); + netif_tx_unlock_bh(entry->neigh->dev); } /* The neighbour entry n->lock is held. */ diff --git a/net/core/dev.c b/net/core/dev.c index 6bfa78c66c25..1b09f1cae46e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1282,15 +1282,13 @@ int __skb_linearize(struct sk_buff *skb, gfp_t gfp_mask) #define HARD_TX_LOCK(dev, cpu) { \ if ((dev->features & NETIF_F_LLTX) == 0) { \ - spin_lock(&dev->xmit_lock); \ - dev->xmit_lock_owner = cpu; \ + netif_tx_lock(dev); \ } \ } #define HARD_TX_UNLOCK(dev) { \ if ((dev->features & NETIF_F_LLTX) == 0) { \ - dev->xmit_lock_owner = -1; \ - spin_unlock(&dev->xmit_lock); \ + netif_tx_unlock(dev); \ } \ } @@ -1389,8 +1387,8 @@ int dev_queue_xmit(struct sk_buff *skb) /* The device has no queue. Common case for software devices: loopback, all the sorts of tunnels... - Really, it is unlikely that xmit_lock protection is necessary here. - (f.e. loopback and IP tunnels are clean ignoring statistics + Really, it is unlikely that netif_tx_lock protection is necessary + here. (f.e. loopback and IP tunnels are clean ignoring statistics counters.) However, it is possible, that they rely on protection made by us here. @@ -2805,7 +2803,7 @@ int register_netdevice(struct net_device *dev) BUG_ON(dev->reg_state != NETREG_UNINITIALIZED); spin_lock_init(&dev->queue_lock); - spin_lock_init(&dev->xmit_lock); + spin_lock_init(&dev->_xmit_lock); dev->xmit_lock_owner = -1; #ifdef CONFIG_NET_CLS_ACT spin_lock_init(&dev->ingress_lock); diff --git a/net/core/dev_mcast.c b/net/core/dev_mcast.c index 05d60850840e..c57d887da2ef 100644 --- a/net/core/dev_mcast.c +++ b/net/core/dev_mcast.c @@ -62,7 +62,7 @@ * Device mc lists are changed by bh at least if IPv6 is enabled, * so that it must be bh protected. * - * We block accesses to device mc filters with dev->xmit_lock. + * We block accesses to device mc filters with netif_tx_lock. */ /* @@ -93,9 +93,9 @@ static void __dev_mc_upload(struct net_device *dev) void dev_mc_upload(struct net_device *dev) { - spin_lock_bh(&dev->xmit_lock); + netif_tx_lock_bh(dev); __dev_mc_upload(dev); - spin_unlock_bh(&dev->xmit_lock); + netif_tx_unlock_bh(dev); } /* @@ -107,7 +107,7 @@ int dev_mc_delete(struct net_device *dev, void *addr, int alen, int glbl) int err = 0; struct dev_mc_list *dmi, **dmip; - spin_lock_bh(&dev->xmit_lock); + netif_tx_lock_bh(dev); for (dmip = &dev->mc_list; (dmi = *dmip) != NULL; dmip = &dmi->next) { /* @@ -139,13 +139,13 @@ int dev_mc_delete(struct net_device *dev, void *addr, int alen, int glbl) */ __dev_mc_upload(dev); - spin_unlock_bh(&dev->xmit_lock); + netif_tx_unlock_bh(dev); return 0; } } err = -ENOENT; done: - spin_unlock_bh(&dev->xmit_lock); + netif_tx_unlock_bh(dev); return err; } @@ -160,7 +160,7 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl) dmi1 = kmalloc(sizeof(*dmi), GFP_ATOMIC); - spin_lock_bh(&dev->xmit_lock); + netif_tx_lock_bh(dev); for (dmi = dev->mc_list; dmi != NULL; dmi = dmi->next) { if (memcmp(dmi->dmi_addr, addr, dmi->dmi_addrlen) == 0 && dmi->dmi_addrlen == alen) { @@ -176,7 +176,7 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl) } if ((dmi = dmi1) == NULL) { - spin_unlock_bh(&dev->xmit_lock); + netif_tx_unlock_bh(dev); return -ENOMEM; } memcpy(dmi->dmi_addr, addr, alen); @@ -189,11 +189,11 @@ int dev_mc_add(struct net_device *dev, void *addr, int alen, int glbl) __dev_mc_upload(dev); - spin_unlock_bh(&dev->xmit_lock); + netif_tx_unlock_bh(dev); return 0; done: - spin_unlock_bh(&dev->xmit_lock); + netif_tx_unlock_bh(dev); kfree(dmi1); return err; } @@ -204,7 +204,7 @@ done: void dev_mc_discard(struct net_device *dev) { - spin_lock_bh(&dev->xmit_lock); + netif_tx_lock_bh(dev); while (dev->mc_list != NULL) { struct dev_mc_list *tmp = dev->mc_list; @@ -215,7 +215,7 @@ void dev_mc_discard(struct net_device *dev) } dev->mc_count = 0; - spin_unlock_bh(&dev->xmit_lock); + netif_tx_unlock_bh(dev); } #ifdef CONFIG_PROC_FS @@ -250,7 +250,7 @@ static int dev_mc_seq_show(struct seq_file *seq, void *v) struct dev_mc_list *m; struct net_device *dev = v; - spin_lock_bh(&dev->xmit_lock); + netif_tx_lock_bh(dev); for (m = dev->mc_list; m; m = m->next) { int i; @@ -262,7 +262,7 @@ static int dev_mc_seq_show(struct seq_file *seq, void *v) seq_putc(seq, '\n'); } - spin_unlock_bh(&dev->xmit_lock); + netif_tx_unlock_bh(dev); return 0; } diff --git a/net/core/netpoll.c b/net/core/netpoll.c index e8e05cebd95a..9cb781830380 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -273,24 +273,21 @@ static void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) do { npinfo->tries--; - spin_lock(&np->dev->xmit_lock); - np->dev->xmit_lock_owner = smp_processor_id(); + netif_tx_lock(np->dev); /* * network drivers do not expect to be called if the queue is * stopped. */ if (netif_queue_stopped(np->dev)) { - np->dev->xmit_lock_owner = -1; - spin_unlock(&np->dev->xmit_lock); + netif_tx_unlock(np->dev); netpoll_poll(np); udelay(50); continue; } status = np->dev->hard_start_xmit(skb, np->dev); - np->dev->xmit_lock_owner = -1; - spin_unlock(&np->dev->xmit_lock); + netif_tx_unlock(np->dev); /* success */ if(!status) { diff --git a/net/core/pktgen.c b/net/core/pktgen.c index c23e9c06ee23..67ed14ddabd2 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2897,7 +2897,7 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev) } } - spin_lock_bh(&odev->xmit_lock); + netif_tx_lock_bh(odev); if (!netif_queue_stopped(odev)) { atomic_inc(&(pkt_dev->skb->users)); @@ -2942,7 +2942,7 @@ static __inline__ void pktgen_xmit(struct pktgen_dev *pkt_dev) pkt_dev->next_tx_ns = 0; } - spin_unlock_bh(&odev->xmit_lock); + netif_tx_unlock_bh(odev); /* If pkt_dev->count is zero, then run forever */ if ((pkt_dev->count != 0) && (pkt_dev->sofar >= pkt_dev->count)) { diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 138ea92ed268..b1e4c5e20ac7 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -72,9 +72,9 @@ void qdisc_unlock_tree(struct net_device *dev) dev->queue_lock serializes queue accesses for this device AND dev->qdisc pointer itself. - dev->xmit_lock serializes accesses to device driver. + netif_tx_lock serializes accesses to device driver. - dev->queue_lock and dev->xmit_lock are mutually exclusive, + dev->queue_lock and netif_tx_lock are mutually exclusive, if one is grabbed, another must be free. */ @@ -108,7 +108,7 @@ int qdisc_restart(struct net_device *dev) * will be requeued. */ if (!nolock) { - if (!spin_trylock(&dev->xmit_lock)) { + if (!netif_tx_trylock(dev)) { collision: /* So, someone grabbed the driver. */ @@ -126,8 +126,6 @@ int qdisc_restart(struct net_device *dev) __get_cpu_var(netdev_rx_stat).cpu_collision++; goto requeue; } - /* Remember that the driver is grabbed by us. */ - dev->xmit_lock_owner = smp_processor_id(); } { @@ -142,8 +140,7 @@ int qdisc_restart(struct net_device *dev) ret = dev->hard_start_xmit(skb, dev); if (ret == NETDEV_TX_OK) { if (!nolock) { - dev->xmit_lock_owner = -1; - spin_unlock(&dev->xmit_lock); + netif_tx_unlock(dev); } spin_lock(&dev->queue_lock); return -1; @@ -157,8 +154,7 @@ int qdisc_restart(struct net_device *dev) /* NETDEV_TX_BUSY - we need to requeue */ /* Release the driver */ if (!nolock) { - dev->xmit_lock_owner = -1; - spin_unlock(&dev->xmit_lock); + netif_tx_unlock(dev); } spin_lock(&dev->queue_lock); q = dev->qdisc; @@ -187,7 +183,7 @@ static void dev_watchdog(unsigned long arg) { struct net_device *dev = (struct net_device *)arg; - spin_lock(&dev->xmit_lock); + netif_tx_lock(dev); if (dev->qdisc != &noop_qdisc) { if (netif_device_present(dev) && netif_running(dev) && @@ -203,7 +199,7 @@ static void dev_watchdog(unsigned long arg) dev_hold(dev); } } - spin_unlock(&dev->xmit_lock); + netif_tx_unlock(dev); dev_put(dev); } @@ -227,17 +223,17 @@ void __netdev_watchdog_up(struct net_device *dev) static void dev_watchdog_up(struct net_device *dev) { - spin_lock_bh(&dev->xmit_lock); + netif_tx_lock_bh(dev); __netdev_watchdog_up(dev); - spin_unlock_bh(&dev->xmit_lock); + netif_tx_unlock_bh(dev); } static void dev_watchdog_down(struct net_device *dev) { - spin_lock_bh(&dev->xmit_lock); + netif_tx_lock_bh(dev); if (del_timer(&dev->watchdog_timer)) dev_put(dev); - spin_unlock_bh(&dev->xmit_lock); + netif_tx_unlock_bh(dev); } void netif_carrier_on(struct net_device *dev) @@ -582,7 +578,7 @@ void dev_deactivate(struct net_device *dev) while (test_bit(__LINK_STATE_SCHED, &dev->state)) yield(); - spin_unlock_wait(&dev->xmit_lock); + spin_unlock_wait(&dev->_xmit_lock); } void dev_init_scheduler(struct net_device *dev) diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 79b8ef34c6e4..4c16ad57a3e4 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -302,20 +302,17 @@ restart: switch (teql_resolve(skb, skb_res, slave)) { case 0: - if (spin_trylock(&slave->xmit_lock)) { - slave->xmit_lock_owner = smp_processor_id(); + if (netif_tx_trylock(slave)) { if (!netif_queue_stopped(slave) && slave->hard_start_xmit(skb, slave) == 0) { - slave->xmit_lock_owner = -1; - spin_unlock(&slave->xmit_lock); + netif_tx_unlock(slave); master->slaves = NEXT_SLAVE(q); netif_wake_queue(dev); master->stats.tx_packets++; master->stats.tx_bytes += len; return 0; } - slave->xmit_lock_owner = -1; - spin_unlock(&slave->xmit_lock); + netif_tx_unlock(slave); } if (netif_queue_stopped(dev)) busy = 1; -- cgit v1.2.3-59-g8ed1b From 364c6badde0dd62a0a38e5ed67f85d87d6665780 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 9 Jun 2006 16:10:40 -0700 Subject: [NET]: Clean up skb_linearize The linearisation operation doesn't need to be super-optimised. So we can replace __skb_linearize with __pskb_pull_tail which does the same thing but is more general. Also, most users of skb_linearize end up testing whether the skb is linear or not so it helps to make skb_linearize do just that. Some callers of skb_linearize also use it to copy cloned data, so it's useful to have a new function skb_linearize_cow to copy the data if it's either non-linear or cloned. Last but not least, I've removed the gfp argument since nobody uses it anymore. If it's ever needed we can easily add it back. Misc bugs fixed by this patch: * via-velocity error handling (also, no SG => no frags) Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- drivers/block/aoe/aoenet.c | 3 +-- drivers/net/mv643xx_eth.c | 2 +- drivers/net/via-velocity.c | 10 +++++--- include/linux/skbuff.h | 24 +++++++++++++++--- net/core/dev.c | 63 ++-------------------------------------------- net/decnet/dn_nsp_in.c | 3 +-- net/decnet/dn_route.c | 3 +-- net/ipv4/ipcomp.c | 11 +++----- net/ipv6/ipcomp6.c | 11 +++----- 9 files changed, 39 insertions(+), 91 deletions(-) (limited to 'net') diff --git a/drivers/block/aoe/aoenet.c b/drivers/block/aoe/aoenet.c index fdff774b8ab9..c1434ed11880 100644 --- a/drivers/block/aoe/aoenet.c +++ b/drivers/block/aoe/aoenet.c @@ -116,8 +116,7 @@ aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt, skb = skb_share_check(skb, GFP_ATOMIC); if (skb == NULL) return 0; - if (skb_is_nonlinear(skb)) - if (skb_linearize(skb, GFP_ATOMIC) < 0) + if (skb_linearize(skb)) goto exit; if (!is_aoe_netif(ifp)) goto exit; diff --git a/drivers/net/mv643xx_eth.c b/drivers/net/mv643xx_eth.c index 411f4d809c47..625ff61c9988 100644 --- a/drivers/net/mv643xx_eth.c +++ b/drivers/net/mv643xx_eth.c @@ -1200,7 +1200,7 @@ static int mv643xx_eth_start_xmit(struct sk_buff *skb, struct net_device *dev) } if (has_tiny_unaligned_frags(skb)) { - if ((skb_linearize(skb, GFP_ATOMIC) != 0)) { + if (__skb_linearize(skb)) { stats->tx_dropped++; printk(KERN_DEBUG "%s: failed to linearize tiny " "unaligned fragment\n", dev->name); diff --git a/drivers/net/via-velocity.c b/drivers/net/via-velocity.c index ed1f837c8fda..2eb6b5f9ba0d 100644 --- a/drivers/net/via-velocity.c +++ b/drivers/net/via-velocity.c @@ -1899,6 +1899,13 @@ static int velocity_xmit(struct sk_buff *skb, struct net_device *dev) int pktlen = skb->len; +#ifdef VELOCITY_ZERO_COPY_SUPPORT + if (skb_shinfo(skb)->nr_frags > 6 && __skb_linearize(skb)) { + kfree_skb(skb); + return 0; + } +#endif + spin_lock_irqsave(&vptr->lock, flags); index = vptr->td_curr[qnum]; @@ -1914,8 +1921,6 @@ static int velocity_xmit(struct sk_buff *skb, struct net_device *dev) */ if (pktlen < ETH_ZLEN) { /* Cannot occur until ZC support */ - if(skb_linearize(skb, GFP_ATOMIC)) - return 0; pktlen = ETH_ZLEN; memcpy(tdinfo->buf, skb->data, skb->len); memset(tdinfo->buf + skb->len, 0, ETH_ZLEN - skb->len); @@ -1933,7 +1938,6 @@ static int velocity_xmit(struct sk_buff *skb, struct net_device *dev) int nfrags = skb_shinfo(skb)->nr_frags; tdinfo->skb = skb; if (nfrags > 6) { - skb_linearize(skb, GFP_ATOMIC); memcpy(tdinfo->buf, skb->data, skb->len); tdinfo->skb_dma[0] = tdinfo->buf_dma; td_ptr->tdesc0.pktsize = diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index fe2c58e5306f..830f58fa03a2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1169,18 +1169,34 @@ static inline int skb_can_coalesce(struct sk_buff *skb, int i, return 0; } +static inline int __skb_linearize(struct sk_buff *skb) +{ + return __pskb_pull_tail(skb, skb->data_len) ? 0 : -ENOMEM; +} + /** * skb_linearize - convert paged skb to linear one * @skb: buffer to linarize - * @gfp: allocation mode * * If there is no free memory -ENOMEM is returned, otherwise zero * is returned and the old skb data released. */ -extern int __skb_linearize(struct sk_buff *skb, gfp_t gfp); -static inline int skb_linearize(struct sk_buff *skb, gfp_t gfp) +static inline int skb_linearize(struct sk_buff *skb) +{ + return skb_is_nonlinear(skb) ? __skb_linearize(skb) : 0; +} + +/** + * skb_linearize_cow - make sure skb is linear and writable + * @skb: buffer to process + * + * If there is no free memory -ENOMEM is returned, otherwise zero + * is returned and the old skb data released. + */ +static inline int skb_linearize_cow(struct sk_buff *skb) { - return __skb_linearize(skb, gfp); + return skb_is_nonlinear(skb) || skb_cloned(skb) ? + __skb_linearize(skb) : 0; } /** diff --git a/net/core/dev.c b/net/core/dev.c index 1b09f1cae46e..91361bc2b682 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1222,64 +1222,6 @@ static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb) #define illegal_highdma(dev, skb) (0) #endif -/* Keep head the same: replace data */ -int __skb_linearize(struct sk_buff *skb, gfp_t gfp_mask) -{ - unsigned int size; - u8 *data; - long offset; - struct skb_shared_info *ninfo; - int headerlen = skb->data - skb->head; - int expand = (skb->tail + skb->data_len) - skb->end; - - if (skb_shared(skb)) - BUG(); - - if (expand <= 0) - expand = 0; - - size = skb->end - skb->head + expand; - size = SKB_DATA_ALIGN(size); - data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask); - if (!data) - return -ENOMEM; - - /* Copy entire thing */ - if (skb_copy_bits(skb, -headerlen, data, headerlen + skb->len)) - BUG(); - - /* Set up shinfo */ - ninfo = (struct skb_shared_info*)(data + size); - atomic_set(&ninfo->dataref, 1); - ninfo->tso_size = skb_shinfo(skb)->tso_size; - ninfo->tso_segs = skb_shinfo(skb)->tso_segs; - ninfo->nr_frags = 0; - ninfo->frag_list = NULL; - - /* Offset between the two in bytes */ - offset = data - skb->head; - - /* Free old data. */ - skb_release_data(skb); - - skb->head = data; - skb->end = data + size; - - /* Set up new pointers */ - skb->h.raw += offset; - skb->nh.raw += offset; - skb->mac.raw += offset; - skb->tail += offset; - skb->data += offset; - - /* We are no longer a clone, even if we were. */ - skb->cloned = 0; - - skb->tail += skb->data_len; - skb->data_len = 0; - return 0; -} - #define HARD_TX_LOCK(dev, cpu) { \ if ((dev->features & NETIF_F_LLTX) == 0) { \ netif_tx_lock(dev); \ @@ -1326,7 +1268,7 @@ int dev_queue_xmit(struct sk_buff *skb) if (skb_shinfo(skb)->frag_list && !(dev->features & NETIF_F_FRAGLIST) && - __skb_linearize(skb, GFP_ATOMIC)) + __skb_linearize(skb)) goto out_kfree_skb; /* Fragmented skb is linearized if device does not support SG, @@ -1335,7 +1277,7 @@ int dev_queue_xmit(struct sk_buff *skb) */ if (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) && - __skb_linearize(skb, GFP_ATOMIC)) + __skb_linearize(skb)) goto out_kfree_skb; /* If packet is not checksummed and device does not support @@ -3473,7 +3415,6 @@ subsys_initcall(net_dev_init); EXPORT_SYMBOL(__dev_get_by_index); EXPORT_SYMBOL(__dev_get_by_name); EXPORT_SYMBOL(__dev_remove_pack); -EXPORT_SYMBOL(__skb_linearize); EXPORT_SYMBOL(dev_valid_name); EXPORT_SYMBOL(dev_add_pack); EXPORT_SYMBOL(dev_alloc_name); diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c index 547523b41c81..a2ba9db1c376 100644 --- a/net/decnet/dn_nsp_in.c +++ b/net/decnet/dn_nsp_in.c @@ -801,8 +801,7 @@ got_it: * We linearize everything except data segments here. */ if (cb->nsp_flags & ~0x60) { - if (unlikely(skb_is_nonlinear(skb)) && - skb_linearize(skb, GFP_ATOMIC) != 0) + if (unlikely(skb_linearize(skb))) goto free_out; } diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index e172cf98d7fc..5abf7057af00 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -629,8 +629,7 @@ int dn_route_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type padlen); if (flags & DN_RT_PKT_CNTL) { - if (unlikely(skb_is_nonlinear(skb)) && - skb_linearize(skb, GFP_ATOMIC) != 0) + if (unlikely(skb_linearize(skb))) goto dump_it; switch(flags & DN_RT_CNTL_MSK) { diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 8e243589045f..3ed8b57a1002 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -80,15 +80,12 @@ out: static int ipcomp_input(struct xfrm_state *x, struct sk_buff *skb) { - int err = 0; + int err = -ENOMEM; struct iphdr *iph; struct ip_comp_hdr *ipch; - if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && - skb_linearize(skb, GFP_ATOMIC) != 0) { - err = -ENOMEM; + if (skb_linearize_cow(skb)) goto out; - } skb->ip_summed = CHECKSUM_NONE; @@ -158,10 +155,8 @@ static int ipcomp_output(struct xfrm_state *x, struct sk_buff *skb) goto out_ok; } - if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && - skb_linearize(skb, GFP_ATOMIC) != 0) { + if (skb_linearize_cow(skb)) goto out_ok; - } err = ipcomp_compress(x, skb); iph = skb->nh.iph; diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index cec3be544b69..f28cd37feed3 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -65,7 +65,7 @@ static LIST_HEAD(ipcomp6_tfms_list); static int ipcomp6_input(struct xfrm_state *x, struct sk_buff *skb) { - int err = 0; + int err = -ENOMEM; struct ipv6hdr *iph; struct ipv6_comp_hdr *ipch; int plen, dlen; @@ -74,11 +74,8 @@ static int ipcomp6_input(struct xfrm_state *x, struct sk_buff *skb) struct crypto_tfm *tfm; int cpu; - if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && - skb_linearize(skb, GFP_ATOMIC) != 0) { - err = -ENOMEM; + if (skb_linearize_cow(skb)) goto out; - } skb->ip_summed = CHECKSUM_NONE; @@ -142,10 +139,8 @@ static int ipcomp6_output(struct xfrm_state *x, struct sk_buff *skb) goto out_ok; } - if ((skb_is_nonlinear(skb) || skb_cloned(skb)) && - skb_linearize(skb, GFP_ATOMIC) != 0) { + if (skb_linearize_cow(skb)) goto out_ok; - } /* compression */ plen = skb->len - hdr_len; -- cgit v1.2.3-59-g8ed1b From b38dfee3d616ffadb58d4215e3ff9d1d7921031e Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 9 Jun 2006 16:13:01 -0700 Subject: [NET]: skb_trim audit I found a few more spots where pskb_trim_rcsum could be used but were not. This patch changes them to use it. Also, sk_filter can get paged skb data. Therefore we must use pskb_trim instead of skb_trim. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/net/sock.h | 5 +---- net/bridge/br_netfilter.c | 14 +++----------- net/ipv6/netfilter/nf_conntrack_reasm.c | 10 +++------- 3 files changed, 7 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/include/net/sock.h b/include/net/sock.h index 75b0e97ed93d..96565ff0de6a 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -873,10 +873,7 @@ static inline int sk_filter(struct sock *sk, struct sk_buff *skb, int needlock) if (filter) { unsigned int pkt_len = sk_run_filter(skb, filter->insns, filter->len); - if (!pkt_len) - err = -EPERM; - else - skb_trim(skb, pkt_len); + err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM; } if (needlock) diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 3da9264449f7..3e41f9d6d51c 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -407,12 +407,8 @@ static unsigned int br_nf_pre_routing_ipv6(unsigned int hook, if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) { if (pkt_len + sizeof(struct ipv6hdr) > skb->len) goto inhdr_error; - if (pkt_len + sizeof(struct ipv6hdr) < skb->len) { - if (__pskb_trim(skb, pkt_len + sizeof(struct ipv6hdr))) - goto inhdr_error; - if (skb->ip_summed == CHECKSUM_HW) - skb->ip_summed = CHECKSUM_NONE; - } + if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr))) + goto inhdr_error; } if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb)) goto inhdr_error; @@ -495,11 +491,7 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff **pskb, if (skb->len < len || len < 4 * iph->ihl) goto inhdr_error; - if (skb->len > len) { - __pskb_trim(skb, len); - if (skb->ip_summed == CHECKSUM_HW) - skb->ip_summed = CHECKSUM_NONE; - } + pskb_trim_rcsum(skb, len); nf_bridge_put(skb->nf_bridge); if (!nf_bridge_alloc(skb)) diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 3e319035f82d..c32a029e43f0 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -456,13 +456,9 @@ static int nf_ct_frag6_queue(struct nf_ct_frag6_queue *fq, struct sk_buff *skb, DEBUGP("queue: message is too short.\n"); goto err; } - if (end-offset < skb->len) { - if (pskb_trim(skb, end - offset)) { - DEBUGP("Can't trim\n"); - goto err; - } - if (skb->ip_summed != CHECKSUM_UNNECESSARY) - skb->ip_summed = CHECKSUM_NONE; + if (pskb_trim_rcsum(skb, end - offset)) { + DEBUGP("Can't trim\n"); + goto err; } /* Find out which fragments are in front and at the back of us -- cgit v1.2.3-59-g8ed1b From 3cc0e873986fe594d0e96d07259b11f755325cb2 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Fri, 9 Jun 2006 16:13:38 -0700 Subject: [NET]: Warn in __skb_trim if skb is paged It's better to warn and fail rather than rarely triggering BUG on paths that incorrectly call skb_trim/__skb_trim on a non-linear skb. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/skbuff.h | 24 ++++++++++++------------ net/core/skbuff.c | 7 ++----- 2 files changed, 14 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 830f58fa03a2..93e4db221585 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -975,15 +975,16 @@ static inline void skb_reserve(struct sk_buff *skb, int len) #define NET_SKB_PAD 16 #endif -extern int ___pskb_trim(struct sk_buff *skb, unsigned int len, int realloc); +extern int ___pskb_trim(struct sk_buff *skb, unsigned int len); static inline void __skb_trim(struct sk_buff *skb, unsigned int len) { - if (!skb->data_len) { - skb->len = len; - skb->tail = skb->data + len; - } else - ___pskb_trim(skb, len, 0); + if (unlikely(skb->data_len)) { + WARN_ON(1); + return; + } + skb->len = len; + skb->tail = skb->data + len; } /** @@ -993,6 +994,7 @@ static inline void __skb_trim(struct sk_buff *skb, unsigned int len) * * Cut the length of a buffer down by removing data from the tail. If * the buffer is already under the length specified it is not modified. + * The skb must be linear. */ static inline void skb_trim(struct sk_buff *skb, unsigned int len) { @@ -1003,12 +1005,10 @@ static inline void skb_trim(struct sk_buff *skb, unsigned int len) static inline int __pskb_trim(struct sk_buff *skb, unsigned int len) { - if (!skb->data_len) { - skb->len = len; - skb->tail = skb->data+len; - return 0; - } - return ___pskb_trim(skb, len, 1); + if (skb->data_len) + return ___pskb_trim(skb, len); + __skb_trim(skb, len); + return 0; } static inline int pskb_trim(struct sk_buff *skb, unsigned int len) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 96cdcbe24ba2..bb7210f4005e 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -801,12 +801,10 @@ struct sk_buff *skb_pad(struct sk_buff *skb, int pad) return nskb; } -/* Trims skb to length len. It can change skb pointers, if "realloc" is 1. - * If realloc==0 and trimming is impossible without change of data, - * it is BUG(). +/* Trims skb to length len. It can change skb pointers. */ -int ___pskb_trim(struct sk_buff *skb, unsigned int len, int realloc) +int ___pskb_trim(struct sk_buff *skb, unsigned int len) { int offset = skb_headlen(skb); int nfrags = skb_shinfo(skb)->nr_frags; @@ -816,7 +814,6 @@ int ___pskb_trim(struct sk_buff *skb, unsigned int len, int realloc) int end = offset + skb_shinfo(skb)->frags[i].size; if (end > len) { if (skb_cloned(skb)) { - BUG_ON(!realloc); if (pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) return -ENOMEM; } -- cgit v1.2.3-59-g8ed1b From f8d596211291a8d98efa47ae0261326218f310cf Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sat, 10 Jun 2006 18:05:35 -0700 Subject: [IPX]: Endian bug in ipxrtr_route_packet() Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- net/ipx/ipx_route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipx/ipx_route.c b/net/ipx/ipx_route.c index a394c6fe19a2..bba3431cd9a5 100644 --- a/net/ipx/ipx_route.c +++ b/net/ipx/ipx_route.c @@ -238,7 +238,7 @@ int ipxrtr_route_packet(struct sock *sk, struct sockaddr_ipx *usipx, } /* Apply checksum. Not allowed on 802.3 links. */ - if (sk->sk_no_check || intrfc->if_dlink_type == IPX_FRAME_8023) + if (sk->sk_no_check || intrfc->if_dlink_type == htons(IPX_FRAME_8023)) ipx->ipx_checksum = 0xFFFF; else ipx->ipx_checksum = ipx_cksum(ipx, len + sizeof(struct ipxhdr)); -- cgit v1.2.3-59-g8ed1b From bdeb04c6d9a957ae2a51c3033414467b82b2a736 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sun, 11 Jun 2006 21:20:38 -0700 Subject: [NET]: net.ipv4.ip_autoconfig sysctl removal The sysctl net.ipv4.ip_autoconfig is a legacy value that is not used. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/net/ip.h | 1 - net/ipv4/sysctl_net_ipv4.c | 8 -------- 2 files changed, 9 deletions(-) (limited to 'net') diff --git a/include/net/ip.h b/include/net/ip.h index 3d2e5ca62a5a..ead233c9540d 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -147,7 +147,6 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar struct ipv4_config { int log_martians; - int autoconfig; int no_pmtu_disc; }; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 6a6aa537b7aa..e7fb665873c6 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -181,14 +181,6 @@ ctl_table ipv4_table[] = { .proc_handler = &ipv4_doint_and_flush, .strategy = &ipv4_doint_and_flush_strategy, }, - { - .ctl_name = NET_IPV4_AUTOCONFIG, - .procname = "ip_autoconfig", - .data = &ipv4_config.autoconfig, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = &proc_dointvec - }, { .ctl_name = NET_IPV4_NO_PMTU_DISC, .procname = "ip_no_pmtu_disc", -- cgit v1.2.3-59-g8ed1b From f61e29018a30c738e1298e1b13be956aa17ee17b Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Sun, 11 Jun 2006 23:01:02 -0700 Subject: [TCP] Westwood: fix first sample Need to update send sequence number tracking after first ack. Rework of patch from Luca De Cicco. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_westwood.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 29eb258b6d82..62a96b71b0dd 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -22,6 +22,7 @@ struct westwood { u32 accounted; u32 rtt; u32 rtt_min; /* minimum observed RTT */ + u8 first_ack; /* flag which infers that this is the first ack */ }; @@ -52,6 +53,7 @@ static void tcp_westwood_init(struct sock *sk) w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT; w->rtt_win_sx = tcp_time_stamp; w->snd_una = tcp_sk(sk)->snd_una; + w->first_ack = 1; } /* @@ -91,6 +93,15 @@ static void westwood_update_window(struct sock *sk) struct westwood *w = inet_csk_ca(sk); s32 delta = tcp_time_stamp - w->rtt_win_sx; + /* Initialise w->snd_una with the first acked sequence number in order + * to fix mismatch between tp->snd_una and w->snd_una for the first + * bandwidth sample + */ + if (w->first_ack) { + w->snd_una = tcp_sk(sk)->snd_una; + w->first_ack = 0; + } + /* * See if a RTT-window has passed. * Be careful since if RTT is less than @@ -180,7 +191,7 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) { struct tcp_sock *tp = tcp_sk(sk); struct westwood *w = inet_csk_ca(sk); - + switch(event) { case CA_EVENT_FAST_ACK: westwood_fast_bw(sk); -- cgit v1.2.3-59-g8ed1b From b7d7a9e3c900f0733bf2aabdd41e6dbc70eae94b Mon Sep 17 00:00:00 2001 From: Luca De Cicco Date: Sun, 11 Jun 2006 23:01:39 -0700 Subject: [TCP] Westwood: comment fixes Cleanup some comments and add more references Signed-off-by: Luca De Cicco Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_westwood.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 62a96b71b0dd..12a2cd976e3a 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -1,7 +1,24 @@ /* - * TCP Westwood+ + * TCP Westwood+: end-to-end bandwidth estimation for TCP * - * Angelo Dell'Aera: TCP Westwood+ support + * Angelo Dell'Aera: author of the first version of TCP Westwood+ in Linux 2.4 + * + * Support at http://c3lab.poliba.it/index.php/Westwood + * Main references in literature: + * + * - Mascolo S, Casetti, M. Gerla et al. + * "TCP Westwood: bandwidth estimation for TCP" Proc. ACM Mobicom 2001 + * + * - A. Grieco, s. Mascolo + * "Performance evaluation of New Reno, Vegas, Westwood+ TCP" ACM Computer + * Comm. Review, 2004 + * + * - A. Dell'Aera, L. Grieco, S. Mascolo. + * "Linux 2.4 Implementation of Westwood+ TCP with Rate-Halving : + * A Performance Evaluation Over the Internet" (ICC 2004), Paris, June 2004 + * + * Westwood+ employs end-to-end bandwidth measurement to set cwnd and + * ssthresh after packet loss. The probing phase is as the original Reno. */ #include @@ -93,7 +110,7 @@ static void westwood_update_window(struct sock *sk) struct westwood *w = inet_csk_ca(sk); s32 delta = tcp_time_stamp - w->rtt_win_sx; - /* Initialise w->snd_una with the first acked sequence number in order + /* Initialize w->snd_una with the first acked sequence number in order * to fix mismatch between tp->snd_una and w->snd_una for the first * bandwidth sample */ @@ -191,7 +208,7 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) { struct tcp_sock *tp = tcp_sk(sk); struct westwood *w = inet_csk_ca(sk); - + switch(event) { case CA_EVENT_FAST_ACK: westwood_fast_bw(sk); -- cgit v1.2.3-59-g8ed1b From b3a92eabe5b67bd207a38ae13dd51f4e08c1f6f7 Mon Sep 17 00:00:00 2001 From: Luca De Cicco Date: Sun, 11 Jun 2006 23:01:59 -0700 Subject: [TCP] Westwood: bandwidth filter startup The bandwidth estimate filter is now initialized with the first sample in order to have better performances in the case of small file transfers. Signed-off-by: Luca De Cicco Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_westwood.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 12a2cd976e3a..edf2ee19a8ba 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -82,10 +82,16 @@ static inline u32 westwood_do_filter(u32 a, u32 b) return (((7 * a) + b) >> 3); } -static inline void westwood_filter(struct westwood *w, u32 delta) +static void westwood_filter(struct westwood *w, u32 delta) { - w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta); - w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est); + /* If the filter is empty fill it with the first sample of bandwidth */ + if (w->bw_ns_est == 0 && w->bw_est == 0) { + w->bw_ns_est = w->bk / delta; + w->bw_est = w->bw_ns_est; + } else { + w->bw_ns_est = westwood_do_filter(w->bw_ns_est, w->bk / delta); + w->bw_est = westwood_do_filter(w->bw_est, w->bw_ns_est); + } } /* -- cgit v1.2.3-59-g8ed1b From bc726a71d2799f0f8b68a17f49d86aa030f64abc Mon Sep 17 00:00:00 2001 From: Luca De Cicco Date: Sun, 11 Jun 2006 23:02:19 -0700 Subject: [TCP] Westwood: reset RTT min after FRTO RTT_min is updated each time a timeout event occurs in order to cope with hard handovers in wireless scenarios such as UMTS. Signed-off-by: Luca De Cicco Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_westwood.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index edf2ee19a8ba..4247da1384bf 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -40,6 +40,7 @@ struct westwood { u32 rtt; u32 rtt_min; /* minimum observed RTT */ u8 first_ack; /* flag which infers that this is the first ack */ + u8 reset_rtt_min; /* Reset RTT min to next RTT sample*/ }; @@ -67,6 +68,7 @@ static void tcp_westwood_init(struct sock *sk) w->bw_est = 0; w->accounted = 0; w->cumul_ack = 0; + w->reset_rtt_min = 1; w->rtt_min = w->rtt = TCP_WESTWOOD_INIT_RTT; w->rtt_win_sx = tcp_time_stamp; w->snd_una = tcp_sk(sk)->snd_una; @@ -142,6 +144,16 @@ static void westwood_update_window(struct sock *sk) } } +static inline void update_rtt_min(struct westwood *w) +{ + if (w->reset_rtt_min) { + w->rtt_min = w->rtt; + w->reset_rtt_min = 0; + } else + w->rtt_min = min(w->rtt, w->rtt_min); +} + + /* * @westwood_fast_bw * It is called when we are in fast path. In particular it is called when @@ -157,7 +169,7 @@ static inline void westwood_fast_bw(struct sock *sk) w->bk += tp->snd_una - w->snd_una; w->snd_una = tp->snd_una; - w->rtt_min = min(w->rtt, w->rtt_min); + update_rtt_min(w); } /* @@ -226,12 +238,14 @@ static void tcp_westwood_event(struct sock *sk, enum tcp_ca_event event) case CA_EVENT_FRTO: tp->snd_ssthresh = tcp_westwood_bw_rttmin(sk); + /* Update RTT_min when next ack arrives */ + w->reset_rtt_min = 1; break; case CA_EVENT_SLOW_ACK: westwood_update_window(sk); w->bk += westwood_acked_count(sk); - w->rtt_min = min(w->rtt, w->rtt_min); + update_rtt_min(w); break; default: -- cgit v1.2.3-59-g8ed1b From 35089bb203f44e33b6bbb6c4de0b0708f9a48921 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 13 Jun 2006 22:33:04 -0700 Subject: [TCP]: Add tcp_slow_start_after_idle sysctl. A lot of people have asked for a way to disable tcp_cwnd_restart(), and it seems reasonable to add a sysctl to do that. Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 7 +++++++ include/linux/sysctl.h | 1 + include/net/tcp.h | 1 + net/ipv4/sysctl_net_ipv4.c | 8 ++++++++ net/ipv4/tcp_output.c | 6 +++++- 5 files changed, 22 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index f12007b80a46..d46338af6002 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -362,6 +362,13 @@ tcp_workaround_signed_windows - BOOLEAN not receive a window scaling option from them. Default: 0 +tcp_slow_start_after_idle - BOOLEAN + If set, provide RFC2861 behavior and time out the congestion + window after an idle period. An idle period is defined at + the current RTO. If unset, the congestion window will not + be timed out after an idle period. + Default: 1 + IP Variables: ip_local_port_range - 2 INTEGERS diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 98338ed2c0b6..cee944dbdcd4 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -405,6 +405,7 @@ enum NET_TCP_BASE_MSS=114, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS=115, NET_TCP_DMA_COPYBREAK=116, + NET_TCP_SLOW_START_AFTER_IDLE=117, }; enum { diff --git a/include/net/tcp.h b/include/net/tcp.h index de88c5472bfc..bfc71f954bbe 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -227,6 +227,7 @@ extern int sysctl_tcp_abc; extern int sysctl_tcp_mtu_probing; extern int sysctl_tcp_base_mss; extern int sysctl_tcp_workaround_signed_windows; +extern int sysctl_tcp_slow_start_after_idle; extern atomic_t tcp_memory_allocated; extern atomic_t tcp_sockets_allocated; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index e7fb665873c6..ce4cd5f35511 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -690,6 +690,14 @@ ctl_table ipv4_table[] = { .proc_handler = &proc_dointvec }, #endif + { + .ctl_name = NET_TCP_SLOW_START_AFTER_IDLE, + .procname = "tcp_slow_start_after_idle", + .data = &sysctl_tcp_slow_start_after_idle, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec + }, { .ctl_name = 0 } }; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f33c9dddaa12..07bb5a2b375e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -59,6 +59,9 @@ int sysctl_tcp_tso_win_divisor = 3; int sysctl_tcp_mtu_probing = 0; int sysctl_tcp_base_mss = 512; +/* By default, RFC2861 behavior. */ +int sysctl_tcp_slow_start_after_idle = 1; + static void update_send_head(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) { @@ -138,7 +141,8 @@ static void tcp_event_data_sent(struct tcp_sock *tp, struct inet_connection_sock *icsk = inet_csk(sk); const u32 now = tcp_time_stamp; - if (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto) + if (sysctl_tcp_slow_start_after_idle && + (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto)) tcp_cwnd_restart(sk, __sk_dst_get(sk)); tp->lsndtime = now; -- cgit v1.2.3-59-g8ed1b From 8648b3053bff39a7ee4c711d74268079c928a657 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 17 Jun 2006 22:06:05 -0700 Subject: [NET]: Add NETIF_F_GEN_CSUM and NETIF_F_ALL_CSUM The current stack treats NETIF_F_HW_CSUM and NETIF_F_NO_CSUM identically so we test for them in quite a few places. For the sake of brevity, I'm adding the macro NETIF_F_GEN_CSUM for these two. We also test the disjunct of NETIF_F_IP_CSUM and the other two in various places, for that purpose I've added NETIF_F_ALL_CSUM. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 7 ++----- include/linux/netdevice.h | 3 +++ net/bridge/br_if.c | 3 +-- net/core/dev.c | 6 ++---- net/core/ethtool.c | 6 ++---- net/ipv4/ip_output.c | 2 +- net/ipv4/tcp.c | 10 ++-------- 7 files changed, 13 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 46326cdfb277..8171cae06688 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -1199,8 +1199,7 @@ int bond_sethwaddr(struct net_device *bond_dev, struct net_device *slave_dev) } #define BOND_INTERSECT_FEATURES \ - (NETIF_F_SG|NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM|\ - NETIF_F_TSO|NETIF_F_UFO) + (NETIF_F_SG | NETIF_F_ALL_CSUM | NETIF_F_TSO | NETIF_F_UFO) /* * Compute the common dev->feature set available to all slaves. Some @@ -1218,9 +1217,7 @@ static int bond_compute_features(struct bonding *bond) features &= (slave->dev->features & BOND_INTERSECT_FEATURES); if ((features & NETIF_F_SG) && - !(features & (NETIF_F_IP_CSUM | - NETIF_F_NO_CSUM | - NETIF_F_HW_CSUM))) + !(features & NETIF_F_ALL_CSUM)) features &= ~NETIF_F_SG; /* diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 067b9ccafd87..e432b743dda2 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -312,6 +312,9 @@ struct net_device #define NETIF_F_LLTX 4096 /* LockLess TX */ #define NETIF_F_UFO 8192 /* Can offload UDP Large Send*/ +#define NETIF_F_GEN_CSUM (NETIF_F_NO_CSUM | NETIF_F_HW_CSUM) +#define NETIF_F_ALL_CSUM (NETIF_F_IP_CSUM | NETIF_F_GEN_CSUM) + struct net_device *next_sched; /* Interface index. Unique device identifier */ diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index f5d47bf4f967..90c95f59dc82 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -376,8 +376,7 @@ void br_features_recompute(struct net_bridge *br) checksum = br->feature_mask & NETIF_F_IP_CSUM; list_for_each_entry(p, &br->port_list, list) { - if (!(p->dev->features - & (NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM))) + if (!(p->dev->features & NETIF_F_ALL_CSUM)) checksum = 0; features &= p->dev->features; } diff --git a/net/core/dev.c b/net/core/dev.c index 91361bc2b682..ab39fe17cb58 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1284,7 +1284,7 @@ int dev_queue_xmit(struct sk_buff *skb) * checksumming for this protocol, complete checksumming here. */ if (skb->ip_summed == CHECKSUM_HW && - (!(dev->features & (NETIF_F_HW_CSUM | NETIF_F_NO_CSUM)) && + (!(dev->features & NETIF_F_GEN_CSUM) && (!(dev->features & NETIF_F_IP_CSUM) || skb->protocol != htons(ETH_P_IP)))) if (skb_checksum_help(skb, 0)) @@ -2789,9 +2789,7 @@ int register_netdevice(struct net_device *dev) /* Fix illegal SG+CSUM combinations. */ if ((dev->features & NETIF_F_SG) && - !(dev->features & (NETIF_F_IP_CSUM | - NETIF_F_NO_CSUM | - NETIF_F_HW_CSUM))) { + !(dev->features & NETIF_F_ALL_CSUM)) { printk("%s: Dropping NETIF_F_SG since no checksum feature.\n", dev->name); dev->features &= ~NETIF_F_SG; diff --git a/net/core/ethtool.c b/net/core/ethtool.c index e6f76106a99b..3f269e4e9438 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -30,7 +30,7 @@ u32 ethtool_op_get_link(struct net_device *dev) u32 ethtool_op_get_tx_csum(struct net_device *dev) { - return (dev->features & (NETIF_F_IP_CSUM | NETIF_F_HW_CSUM)) != 0; + return (dev->features & NETIF_F_ALL_CSUM) != 0; } int ethtool_op_set_tx_csum(struct net_device *dev, u32 data) @@ -551,9 +551,7 @@ static int ethtool_set_sg(struct net_device *dev, char __user *useraddr) return -EFAULT; if (edata.data && - !(dev->features & (NETIF_F_IP_CSUM | - NETIF_F_NO_CSUM | - NETIF_F_HW_CSUM))) + !(dev->features & NETIF_F_ALL_CSUM)) return -EINVAL; return __ethtool_set_sg(dev, edata.data); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index d4bb3fae4e49..8538aac3d148 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -840,7 +840,7 @@ int ip_append_data(struct sock *sk, */ if (transhdrlen && length + fragheaderlen <= mtu && - rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) && + rt->u.dst.dev->features & NETIF_F_ALL_CSUM && !exthdrlen) csummode = CHECKSUM_HW; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ff6ccda9ff46..74998f250071 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -622,14 +622,10 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, ssize_t res; struct sock *sk = sock->sk; -#define TCP_ZC_CSUM_FLAGS (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | NETIF_F_HW_CSUM) - if (!(sk->sk_route_caps & NETIF_F_SG) || - !(sk->sk_route_caps & TCP_ZC_CSUM_FLAGS)) + !(sk->sk_route_caps & NETIF_F_ALL_CSUM)) return sock_no_sendpage(sock, page, offset, size, flags); -#undef TCP_ZC_CSUM_FLAGS - lock_sock(sk); TCP_CHECK_TIMER(sk); res = do_tcp_sendpages(sk, &page, offset, size, flags); @@ -726,9 +722,7 @@ new_segment: /* * Check whether we can use HW checksum. */ - if (sk->sk_route_caps & - (NETIF_F_IP_CSUM | NETIF_F_NO_CSUM | - NETIF_F_HW_CSUM)) + if (sk->sk_route_caps & NETIF_F_ALL_CSUM) skb->ip_summed = CHECKSUM_HW; skb_entail(sk, tp, skb); -- cgit v1.2.3-59-g8ed1b From 2c6cc0d8539f121c3c75aa3641c19b67e8723379 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 17 Jun 2006 22:06:45 -0700 Subject: [BRIDGE]: Add support for NETIF_F_HW_CSUM devices As it is the bridge will only ever declare NETIF_F_IP_CSUM even if all its constituent devices support NETIF_F_HW_CSUM. This patch fixes this by supporting the first one out of NETIF_F_NO_CSUM, NETIF_F_HW_CSUM, and NETIF_F_IP_CSUM that is supported by all constituent devices. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/bridge/br_device.c | 6 +++--- net/bridge/br_if.c | 12 +++++++++--- 2 files changed, 12 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index 0c88a2ac32c1..2afdc7c0736c 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -145,9 +145,9 @@ static int br_set_tx_csum(struct net_device *dev, u32 data) struct net_bridge *br = netdev_priv(dev); if (data) - br->feature_mask |= NETIF_F_IP_CSUM; + br->feature_mask |= NETIF_F_NO_CSUM; else - br->feature_mask &= ~NETIF_F_IP_CSUM; + br->feature_mask &= ~NETIF_F_ALL_CSUM; br_features_recompute(br); return 0; @@ -185,5 +185,5 @@ void br_dev_setup(struct net_device *dev) dev->priv_flags = IFF_EBRIDGE; dev->features = NETIF_F_SG | NETIF_F_FRAGLIST - | NETIF_F_HIGHDMA | NETIF_F_TSO | NETIF_F_IP_CSUM; + | NETIF_F_HIGHDMA | NETIF_F_TSO | NETIF_F_NO_CSUM; } diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 90c95f59dc82..fdec773f5b52 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -372,11 +372,17 @@ void br_features_recompute(struct net_bridge *br) struct net_bridge_port *p; unsigned long features, checksum; - features = br->feature_mask &~ NETIF_F_IP_CSUM; - checksum = br->feature_mask & NETIF_F_IP_CSUM; + checksum = br->feature_mask & NETIF_F_ALL_CSUM ? NETIF_F_NO_CSUM : 0; + features = br->feature_mask & ~NETIF_F_ALL_CSUM; list_for_each_entry(p, &br->port_list, list) { - if (!(p->dev->features & NETIF_F_ALL_CSUM)) + if (checksum & NETIF_F_NO_CSUM && + !(p->dev->features & NETIF_F_NO_CSUM)) + checksum ^= NETIF_F_NO_CSUM | NETIF_F_HW_CSUM; + if (checksum & NETIF_F_HW_CSUM && + !(p->dev->features & NETIF_F_HW_CSUM)) + checksum ^= NETIF_F_HW_CSUM | NETIF_F_IP_CSUM; + if (!(p->dev->features & NETIF_F_IP_CSUM)) checksum = 0; features &= p->dev->features; } -- cgit v1.2.3-59-g8ed1b From b293acfd3133393a81bcd382eb71a210c9cf9526 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sat, 17 Jun 2006 22:16:13 -0700 Subject: [IRDA]: Use put_unaligned() in irlmp_do_discovery(). irda_device_info->hints[] is byte aligned but is being accessed as a u16 Based upon a patch by Luke Yang . Signed-off-by: David S. Miller --- net/irda/irlmp.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/irda/irlmp.c b/net/irda/irlmp.c index c19e9ce05a3a..57ea160f470b 100644 --- a/net/irda/irlmp.c +++ b/net/irda/irlmp.c @@ -44,6 +44,8 @@ #include #include +#include + static __u8 irlmp_find_free_slsap(void); static int irlmp_slsap_inuse(__u8 slsap_sel); @@ -840,6 +842,7 @@ void irlmp_do_expiry(void) void irlmp_do_discovery(int nslots) { struct lap_cb *lap; + __u16 *data_hintsp; /* Make sure the value is sane */ if ((nslots != 1) && (nslots != 6) && (nslots != 8) && (nslots != 16)){ @@ -849,7 +852,8 @@ void irlmp_do_discovery(int nslots) } /* Construct new discovery info to be used by IrLAP, */ - u16ho(irlmp->discovery_cmd.data.hints) = irlmp->hints.word; + data_hintsp = (__u16 *) irlmp->discovery_cmd.data.hints; + put_unaligned(irlmp->hints.word, data_hintsp); /* * Set character set for device name (we use ASCII), and -- cgit v1.2.3-59-g8ed1b From c5396a31b20991c856facbce18a2a56d1a14e8d0 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Sat, 17 Jun 2006 22:48:48 -0700 Subject: [IPV6]: Sum real space for RTAs. This patch fixes RTNLGRP_IPV6_IFINFO netlink notifications. Issue pointed out by Patrick McHardy . Signed-off-by: YOSHIFUJI Hideaki Acked-by: Patrick McHardy Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 445006ee4522..c2c26fa0943d 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2860,6 +2860,11 @@ inet6_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) return inet6_addr_add(ifm->ifa_index, pfx, ifm->ifa_prefixlen); } +/* Maximum length of ifa_cacheinfo attributes */ +#define INET6_IFADDR_RTA_SPACE \ + RTA_SPACE(16) /* IFA_ADDRESS */ + \ + RTA_SPACE(sizeof(struct ifa_cacheinfo)) /* CACHEINFO */ + static int inet6_fill_ifaddr(struct sk_buff *skb, struct inet6_ifaddr *ifa, u32 pid, u32 seq, int event, unsigned int flags) { @@ -3092,7 +3097,7 @@ static int inet6_dump_ifacaddr(struct sk_buff *skb, struct netlink_callback *cb) static void inet6_ifa_notify(int event, struct inet6_ifaddr *ifa) { struct sk_buff *skb; - int size = NLMSG_SPACE(sizeof(struct ifaddrmsg)+128); + int size = NLMSG_SPACE(sizeof(struct ifaddrmsg) + INET6_IFADDR_RTA_SPACE); skb = alloc_skb(size, GFP_ATOMIC); if (!skb) { @@ -3142,6 +3147,17 @@ static void inline ipv6_store_devconf(struct ipv6_devconf *cnf, #endif } +/* Maximum length of ifinfomsg attributes */ +#define INET6_IFINFO_RTA_SPACE \ + RTA_SPACE(IFNAMSIZ) /* IFNAME */ + \ + RTA_SPACE(MAX_ADDR_LEN) /* ADDRESS */ + \ + RTA_SPACE(sizeof(u32)) /* MTU */ + \ + RTA_SPACE(sizeof(int)) /* LINK */ + \ + RTA_SPACE(0) /* PROTINFO */ + \ + RTA_SPACE(sizeof(u32)) /* FLAGS */ + \ + RTA_SPACE(sizeof(struct ifla_cacheinfo)) /* CACHEINFO */ + \ + RTA_SPACE(sizeof(__s32[DEVCONF_MAX])) /* CONF */ + static int inet6_fill_ifinfo(struct sk_buff *skb, struct inet6_dev *idev, u32 pid, u32 seq, int event, unsigned int flags) { @@ -3235,8 +3251,7 @@ static int inet6_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) void inet6_ifinfo_notify(int event, struct inet6_dev *idev) { struct sk_buff *skb; - /* 128 bytes ?? */ - int size = NLMSG_SPACE(sizeof(struct ifinfomsg)+128); + int size = NLMSG_SPACE(sizeof(struct ifinfomsg) + INET6_IFINFO_RTA_SPACE); skb = alloc_skb(size, GFP_ATOMIC); if (!skb) { @@ -3252,6 +3267,11 @@ void inet6_ifinfo_notify(int event, struct inet6_dev *idev) netlink_broadcast(rtnl, skb, 0, RTNLGRP_IPV6_IFINFO, GFP_ATOMIC); } +/* Maximum length of prefix_cacheinfo attributes */ +#define INET6_PREFIX_RTA_SPACE \ + RTA_SPACE(sizeof(((struct prefix_info *)NULL)->prefix)) /* ADDRESS */ + \ + RTA_SPACE(sizeof(struct prefix_cacheinfo)) /* CACHEINFO */ + static int inet6_fill_prefix(struct sk_buff *skb, struct inet6_dev *idev, struct prefix_info *pinfo, u32 pid, u32 seq, int event, unsigned int flags) @@ -3296,7 +3316,7 @@ static void inet6_prefix_notify(int event, struct inet6_dev *idev, struct prefix_info *pinfo) { struct sk_buff *skb; - int size = NLMSG_SPACE(sizeof(struct prefixmsg)+128); + int size = NLMSG_SPACE(sizeof(struct prefixmsg) + INET6_PREFIX_RTA_SPACE); skb = alloc_skb(size, GFP_ATOMIC); if (!skb) { -- cgit v1.2.3-59-g8ed1b From 402d68c43326d2f0e7e2e9a9013cd4c098d9b87c Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Sat, 17 Jun 2006 22:54:51 -0700 Subject: [SCTP]: Limit association max_retrans setting in setsockopt. When using ASSOCINFO socket option, we need to limit the number of maximum association retransmissions to be no greater than the sum of all the path retransmissions. This is specified in Section 7.1.2 of the SCTP socket API draft. However, we only do this if the association has multiple paths. If there is only one path, the protocol stack will use the assoc_max_retrans setting when trying to retransmit packets. Signed-off-by: Vlad Yasevich Signed-off-by: Sridhar Samudrala Signed-off-by: David S. Miller --- net/sctp/socket.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 174d4d35e951..b41dcbb89685 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2530,8 +2530,32 @@ static int sctp_setsockopt_associnfo(struct sock *sk, char __user *optval, int o /* Set the values to the specific association */ if (asoc) { - if (assocparams.sasoc_asocmaxrxt != 0) + if (assocparams.sasoc_asocmaxrxt != 0) { + __u32 path_sum = 0; + int paths = 0; + struct list_head *pos; + struct sctp_transport *peer_addr; + + list_for_each(pos, &asoc->peer.transport_addr_list) { + peer_addr = list_entry(pos, + struct sctp_transport, + transports); + path_sum += peer_addr->pathmaxrxt; + paths++; + } + + /* Only validate asocmaxrxt if we have more then + * one path/transport. We do this because path + * retransmissions are only counted when we have more + * then one path. + */ + if (paths > 1 && + assocparams.sasoc_asocmaxrxt > path_sum) + return -EINVAL; + asoc->max_retrans = assocparams.sasoc_asocmaxrxt; + } + if (assocparams.sasoc_cookie_life != 0) { asoc->cookie_life.tv_sec = assocparams.sasoc_cookie_life / 1000; -- cgit v1.2.3-59-g8ed1b From 5636bef7324f49e36f05ec8a5f6284e11b1bcca4 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Sat, 17 Jun 2006 22:55:35 -0700 Subject: [SCTP]: Reject sctp packets with broadcast addresses. Signed-off-by: Vlad Yasevich Signed-off-by: Sridhar Samudrala Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 3 ++- net/sctp/input.c | 3 ++- net/sctp/ipv6.c | 6 ++++-- net/sctp/protocol.c | 8 +++++++- net/sctp/socket.c | 2 +- 5 files changed, 16 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 7f4fea173fb1..5f69158c1006 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -555,7 +555,8 @@ struct sctp_af { int (*to_addr_param) (const union sctp_addr *, union sctp_addr_param *); int (*addr_valid) (union sctp_addr *, - struct sctp_sock *); + struct sctp_sock *, + const struct sk_buff *); sctp_scope_t (*scope) (union sctp_addr *); void (*inaddr_any) (union sctp_addr *, unsigned short); int (*is_any) (const union sctp_addr *); diff --git a/net/sctp/input.c b/net/sctp/input.c index 1662f9cc869e..70d6606e2812 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -170,7 +170,8 @@ int sctp_rcv(struct sk_buff *skb) * IP broadcast addresses cannot be used in an SCTP transport * address." */ - if (!af->addr_valid(&src, NULL) || !af->addr_valid(&dest, NULL)) + if (!af->addr_valid(&src, NULL, skb) || + !af->addr_valid(&dest, NULL, skb)) goto discard_it; asoc = __sctp_rcv_lookup(skb, &src, &dest, &transport); diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index c20d282fac06..8ef08070c8b6 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -523,7 +523,9 @@ static int sctp_v6_available(union sctp_addr *addr, struct sctp_sock *sp) * Return 0 - If the address is a non-unicast or an illegal address. * Return 1 - If the address is a unicast. */ -static int sctp_v6_addr_valid(union sctp_addr *addr, struct sctp_sock *sp) +static int sctp_v6_addr_valid(union sctp_addr *addr, + struct sctp_sock *sp, + const struct sk_buff *skb) { int ret = ipv6_addr_type(&addr->v6.sin6_addr); @@ -537,7 +539,7 @@ static int sctp_v6_addr_valid(union sctp_addr *addr, struct sctp_sock *sp) if (sp && ipv6_only_sock(sctp_opt2sk(sp))) return 0; sctp_v6_map_v4(addr); - return sctp_get_af_specific(AF_INET)->addr_valid(addr, sp); + return sctp_get_af_specific(AF_INET)->addr_valid(addr, sp, skb); } /* Is this a non-unicast address */ diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 2088aa992b7a..816c033d7886 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -365,12 +365,18 @@ static int sctp_v4_is_any(const union sctp_addr *addr) * Return 0 - If the address is a non-unicast or an illegal address. * Return 1 - If the address is a unicast. */ -static int sctp_v4_addr_valid(union sctp_addr *addr, struct sctp_sock *sp) +static int sctp_v4_addr_valid(union sctp_addr *addr, + struct sctp_sock *sp, + const struct sk_buff *skb) { /* Is this a non-unicast address or a unusable SCTP address? */ if (IS_IPV4_UNUSABLE_ADDRESS(&addr->v4.sin_addr.s_addr)) return 0; + /* Is this a broadcast address? */ + if (skb && ((struct rtable *)skb->dst)->rt_flags & RTCF_BROADCAST) + return 0; + return 1; } diff --git a/net/sctp/socket.c b/net/sctp/socket.c index b41dcbb89685..b811691c35bf 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -172,7 +172,7 @@ static inline int sctp_verify_addr(struct sock *sk, union sctp_addr *addr, return -EINVAL; /* Is this a valid SCTP address? */ - if (!af->addr_valid(addr, sctp_sk(sk))) + if (!af->addr_valid(addr, sctp_sk(sk), NULL)) return -EINVAL; if (!sctp_sk(sk)->pf->send_verify(sctp_sk(sk), (addr))) -- cgit v1.2.3-59-g8ed1b From 4c9f5d5305a23851e67471b147e0d459a7166717 Mon Sep 17 00:00:00 2001 From: Vlad Yasevich Date: Sat, 17 Jun 2006 22:56:08 -0700 Subject: [SCTP] Reset rtt_in_progress for the chunk when processing its sack. Signed-off-by: Vlad Yasevich Signed-off-by: Sridhar Samudrala Signed-off-by: David S. Miller --- include/net/sctp/sctp.h | 2 +- net/sctp/outqueue.c | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index aa6033ca7cd8..b2b40f951ae6 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -255,7 +255,7 @@ extern int sctp_debug_flag; #define SCTP_DEBUG_PRINTK_IPADDR(whatever...) #define SCTP_ENABLE_DEBUG #define SCTP_DISABLE_DEBUG -#define SCTP_ASSERT(expr, str, func) +#define SCTP_ASSERT(expr, str, func) BUG_ON(!(expr)) #endif /* SCTP_DEBUG */ diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index f148f9576dd2..e5faa351aaad 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -1262,6 +1262,7 @@ static void sctp_check_transmitted(struct sctp_outq *q, if (!tchunk->tsn_gap_acked && !tchunk->resent && tchunk->rtt_in_progress) { + tchunk->rtt_in_progress = 0; rtt = jiffies - tchunk->sent_at; sctp_transport_update_rto(transport, rtt); -- cgit v1.2.3-59-g8ed1b From 503b55fd77d11381b1950d1651d3bc782c0cc2cd Mon Sep 17 00:00:00 2001 From: Sridhar Samudrala Date: Sat, 17 Jun 2006 22:57:28 -0700 Subject: [SCTP]: Don't do CRC32C checksum over loopback. Signed-off-by: Sridhar Samudrala Signed-off-by: David S. Miller --- net/sctp/input.c | 3 ++- net/sctp/output.c | 48 +++++++++++++++++++++++++++--------------------- 2 files changed, 29 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/sctp/input.c b/net/sctp/input.c index 70d6606e2812..42b66e74bbb5 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -141,7 +141,8 @@ int sctp_rcv(struct sk_buff *skb) __skb_pull(skb, skb->h.raw - skb->data); if (skb->len < sizeof(struct sctphdr)) goto discard_it; - if (sctp_rcv_checksum(skb) < 0) + if ((skb->ip_summed != CHECKSUM_UNNECESSARY) && + (sctp_rcv_checksum(skb) < 0)) goto discard_it; skb_pull(skb, sizeof(struct sctphdr)); diff --git a/net/sctp/output.c b/net/sctp/output.c index 437cba7260a4..cdc5a3936766 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -295,14 +295,14 @@ int sctp_packet_transmit(struct sctp_packet *packet) struct sctp_transport *tp = packet->transport; struct sctp_association *asoc = tp->asoc; struct sctphdr *sh; - __u32 crc32; + __u32 crc32 = 0; struct sk_buff *nskb; struct sctp_chunk *chunk, *tmp; struct sock *sk; int err = 0; int padding; /* How much padding do we need? */ __u8 has_data = 0; - struct dst_entry *dst; + struct dst_entry *dst = tp->dst; SCTP_DEBUG_PRINTK("%s: packet:%p\n", __FUNCTION__, packet); @@ -327,6 +327,19 @@ int sctp_packet_transmit(struct sctp_packet *packet) */ skb_set_owner_w(nskb, sk); + /* The 'obsolete' field of dst is set to 2 when a dst is freed. */ + if (!dst || (dst->obsolete > 1)) { + dst_release(dst); + sctp_transport_route(tp, NULL, sctp_sk(sk)); + if (asoc && (asoc->param_flags & SPP_PMTUD_ENABLE)) { + sctp_assoc_sync_pmtu(asoc); + } + } + nskb->dst = dst_clone(tp->dst); + if (!nskb->dst) + goto no_route; + dst = nskb->dst; + /* Build the SCTP header. */ sh = (struct sctphdr *)skb_push(nskb, sizeof(struct sctphdr)); sh->source = htons(packet->source_port); @@ -350,7 +363,8 @@ int sctp_packet_transmit(struct sctp_packet *packet) * Note: Adler-32 is no longer applicable, as has been replaced * by CRC32-C as described in . */ - crc32 = sctp_start_cksum((__u8 *)sh, sizeof(struct sctphdr)); + if (!(dst->dev->features & NETIF_F_NO_CSUM)) + crc32 = sctp_start_cksum((__u8 *)sh, sizeof(struct sctphdr)); /** * 6.10 Bundling @@ -402,9 +416,14 @@ int sctp_packet_transmit(struct sctp_packet *packet) if (padding) memset(skb_put(chunk->skb, padding), 0, padding); - crc32 = sctp_update_copy_cksum(skb_put(nskb, chunk->skb->len), - chunk->skb->data, - chunk->skb->len, crc32); + if (dst->dev->features & NETIF_F_NO_CSUM) + memcpy(skb_put(nskb, chunk->skb->len), + chunk->skb->data, chunk->skb->len); + else + crc32 = sctp_update_copy_cksum(skb_put(nskb, + chunk->skb->len), + chunk->skb->data, + chunk->skb->len, crc32); SCTP_DEBUG_PRINTK("%s %p[%s] %s 0x%x, %s %d, %s %d, %s %d\n", "*** Chunk", chunk, @@ -427,7 +446,8 @@ int sctp_packet_transmit(struct sctp_packet *packet) } /* Perform final transformation on checksum. */ - crc32 = sctp_end_cksum(crc32); + if (!(dst->dev->features & NETIF_F_NO_CSUM)) + crc32 = sctp_end_cksum(crc32); /* 3) Put the resultant value into the checksum field in the * common header, and leave the rest of the bits unchanged. @@ -477,20 +497,6 @@ int sctp_packet_transmit(struct sctp_packet *packet) } } - dst = tp->dst; - /* The 'obsolete' field of dst is set to 2 when a dst is freed. */ - if (!dst || (dst->obsolete > 1)) { - dst_release(dst); - sctp_transport_route(tp, NULL, sctp_sk(sk)); - if (asoc->param_flags & SPP_PMTUD_ENABLE) { - sctp_assoc_sync_pmtu(asoc); - } - } - - nskb->dst = dst_clone(tp->dst); - if (!nskb->dst) - goto no_route; - SCTP_DEBUG_PRINTK("***sctp_transmit_packet*** skb len %d\n", nskb->len); -- cgit v1.2.3-59-g8ed1b From d7c2c9e3977e4312d093ac092761798d4d47c9e0 Mon Sep 17 00:00:00 2001 From: Tsutomu Fujii Date: Sat, 17 Jun 2006 22:58:28 -0700 Subject: [SCTP]: Send only 1 window update SACK per message. Right now, every time we increase our rwnd by more then MTU bytes, we trigger a SACK. When processing large messages, this will generate a SACK for almost every other SCTP fragment. However since we are freeing the entire message at the same time, we might as well collapse the SACK generation to 1. Signed-off-by: Tsutomu Fujii Signed-off-by: Vlad Yasevich Signed-off-by: Sridhar Samudrala Signed-off-by: David S. Miller --- net/sctp/ulpevent.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index ba97f974f57c..ee236784a6bb 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -51,6 +51,8 @@ static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event, struct sctp_association *asoc); static void sctp_ulpevent_release_data(struct sctp_ulpevent *event); +static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event); + /* Initialize an ULP event from an given skb. */ SCTP_STATIC void sctp_ulpevent_init(struct sctp_ulpevent *event, int msg_flags) @@ -883,6 +885,7 @@ static void sctp_ulpevent_receive_data(struct sctp_ulpevent *event, static void sctp_ulpevent_release_data(struct sctp_ulpevent *event) { struct sk_buff *skb, *frag; + unsigned int len; /* Current stack structures assume that the rcv buffer is * per socket. For UDP style sockets this is not true as @@ -892,7 +895,30 @@ static void sctp_ulpevent_release_data(struct sctp_ulpevent *event) */ skb = sctp_event2skb(event); - sctp_assoc_rwnd_increase(event->asoc, skb_headlen(skb)); + len = skb->len; + + if (!skb->data_len) + goto done; + + /* Don't forget the fragments. */ + for (frag = skb_shinfo(skb)->frag_list; frag; frag = frag->next) { + /* NOTE: skb_shinfos are recursive. Although IP returns + * skb's with only 1 level of fragments, SCTP reassembly can + * increase the levels. + */ + sctp_ulpevent_release_frag_data(sctp_skb2event(frag)); + } + +done: + sctp_assoc_rwnd_increase(event->asoc, len); + sctp_ulpevent_release_owner(event); +} + +static void sctp_ulpevent_release_frag_data(struct sctp_ulpevent *event) +{ + struct sk_buff *skb, *frag; + + skb = sctp_event2skb(event); if (!skb->data_len) goto done; @@ -903,7 +929,7 @@ static void sctp_ulpevent_release_data(struct sctp_ulpevent *event) * skb's with only 1 level of fragments, SCTP reassembly can * increase the levels. */ - sctp_ulpevent_release_data(sctp_skb2event(frag)); + sctp_ulpevent_release_frag_data(sctp_skb2event(frag)); } done: -- cgit v1.2.3-59-g8ed1b From d5b9f4c083b0e3102f3101545279f623680cb3a0 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Sat, 17 Jun 2006 22:59:03 -0700 Subject: [SCTP]: Fix persistent slowdown in sctp when a gap ack consumes rx buffer. In the event that our entire receive buffer is full with a series of chunks that represent a single gap-ack, and then we accept a chunk (or chunks) that fill in the gap between the ctsn and the first gap, we renege chunks from the end of the buffer, which effectively does nothing but move our gap to the end of our received tsn stream. This does little but move our missing tsns down stream a little, and, if the sender is sending sufficiently large retransmit frames, the result is a perpetual slowdown which can never be recovered from, since the only chunk that can be accepted to allow progress in the tsn stream necessitates that a new gap be created to make room for it. This leads to a constant need for retransmits, and subsequent receiver stalls. The fix I've come up with is to deliver the frame without reneging if we have a full receive buffer and the receiving sockets sk_receive_queue is empty(indicating that the receive buffer is being blocked by a missing tsn). Signed-off-by: Neil Horman Signed-off-by: Sridhar Samudrala Signed-off-by: David S. Miller --- net/sctp/sm_statefuns.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 8bc279219a72..9e58144f4851 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -5293,10 +5293,18 @@ static int sctp_eat_data(const struct sctp_association *asoc, * seems a bit troublesome in that frag_point varies based on * PMTU. In cases, such as loopback, this might be a rather * large spill over. + * NOTE: If we have a full receive buffer here, we only renege if + * our receiver can still make progress without the tsn being + * received. We do this because in the event that the associations + * receive queue is empty we are filling a leading gap, and since + * reneging moves the gap to the end of the tsn stream, we are likely + * to stall again very shortly. Avoiding the renege when we fill a + * leading gap is a good heuristic for avoiding such steady state + * stalls. */ if (!asoc->rwnd || asoc->rwnd_over || (datalen > asoc->rwnd + asoc->frag_point) || - rcvbuf_over) { + (rcvbuf_over && (!skb_queue_len(&sk->sk_receive_queue)))) { /* If this is the next TSN, consider reneging to make * room. Note: Playing nice with a confused sender. A -- cgit v1.2.3-59-g8ed1b From 47552c4e555eefe381f3d45140b59a2ea4b16486 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 17 Jun 2006 23:00:20 -0700 Subject: [ETHTOOL]: Fix UFO typo The function ethtool_get_ufo was referring to ETHTOOL_GTSO instead of ETHTOOL_GUFO. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/ethtool.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 3f269e4e9438..33ce7ed6afc6 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -589,7 +589,7 @@ static int ethtool_set_tso(struct net_device *dev, char __user *useraddr) static int ethtool_get_ufo(struct net_device *dev, char __user *useraddr) { - struct ethtool_value edata = { ETHTOOL_GTSO }; + struct ethtool_value edata = { ETHTOOL_GUFO }; if (!dev->ethtool_ops->get_ufo) return -EOPNOTSUPP; @@ -598,6 +598,7 @@ static int ethtool_get_ufo(struct net_device *dev, char __user *useraddr) return -EFAULT; return 0; } + static int ethtool_set_ufo(struct net_device *dev, char __user *useraddr) { struct ethtool_value edata; -- cgit v1.2.3-59-g8ed1b