From c16e19c11730199c1df686b160c9c972ad28baf8 Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Thu, 10 Feb 2011 10:13:07 +0100 Subject: netfilter: ipset: add dependency on CONFIG_NETFILTER_NETLINK When SYSCTL and PROC_FS and NETFILTER_NETLINK are not enabled: net/built-in.o: In function `try_to_load_type': ip_set_core.c:(.text+0x3ab49): undefined reference to `nfnl_unlock' ip_set_core.c:(.text+0x3ab4e): undefined reference to `nfnl_lock' ... Reported-by: Randy Dunlap Acked-by: Randy Dunlap Signed-off-by: Patrick McHardy --- net/netfilter/ipset/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/net/netfilter/ipset/Kconfig b/net/netfilter/ipset/Kconfig index 3b970d343023..2c5b348eb3a8 100644 --- a/net/netfilter/ipset/Kconfig +++ b/net/netfilter/ipset/Kconfig @@ -1,6 +1,7 @@ menuconfig IP_SET tristate "IP set support" depends on INET && NETFILTER + depends on NETFILTER_NETLINK help This option adds IP set support to the kernel. In order to define and use the sets, you need the userspace utility -- cgit v1.2.3-59-g8ed1b From 44bd4de9c2270b22c3c898310102bc6be9ed2978 Mon Sep 17 00:00:00 2001 From: Stefan Berger Date: Fri, 11 Feb 2011 18:00:07 +0100 Subject: netfilter: xt_connlimit: connlimit-above early loop termination The patch below introduces an early termination of the loop that is counting matches. It terminates once the counter has exceeded the threshold provided by the user. There's no point in continuing the loop afterwards and looking at other entries. It plays together with the following code further below: return (connections > info->limit) ^ info->inverse; where connections is the result of the counted connection, which in turn is the matches variable in the loop. So once -> matches = info->limit + 1 alias -> matches > info->limit alias -> matches > threshold we can terminate the loop. 
Signed-off-by: Stefan Berger Signed-off-by: Patrick McHardy --- net/netfilter/xt_connlimit.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index e029c4807404..82ce7c5fbbc2 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -97,7 +97,8 @@ static int count_them(struct net *net, const struct nf_conntrack_tuple *tuple, const union nf_inet_addr *addr, const union nf_inet_addr *mask, - u_int8_t family) + u_int8_t family, + unsigned int threshold) { const struct nf_conntrack_tuple_hash *found; struct xt_connlimit_conn *conn; @@ -151,9 +152,14 @@ static int count_them(struct net *net, continue; } - if (same_source_net(addr, mask, &conn->tuple.src.u3, family)) + if (same_source_net(addr, mask, &conn->tuple.src.u3, family)) { /* same source network -> be counted! */ ++matches; + if (matches > threshold) { + nf_ct_put(found_ct); + break; + } + } nf_ct_put(found_ct); } @@ -207,7 +213,8 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) spin_lock_bh(&info->data->lock); connections = count_them(net, info->data, tuple_ptr, &addr, - &info->mask, par->family); + &info->mask, par->family, + info->limit); spin_unlock_bh(&info->data->lock); if (connections < 0) -- cgit v1.2.3-59-g8ed1b From d846f71195d57b0bbb143382647c2c6638b04c5a Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Mon, 14 Feb 2011 16:49:23 +0100 Subject: bridge: netfilter: fix information leak Struct tmp is copied from userspace. It is not checked whether the "name" field is NULL terminated. This may lead to buffer overflow and passing contents of kernel stack as a module name to try_then_request_module() and, consequently, to modprobe commandline. It would be seen by all userspace processes. 
Signed-off-by: Vasiliy Kulikov Signed-off-by: Patrick McHardy --- net/bridge/netfilter/ebtables.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 5f1825df9dca..893669caa8de 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1107,6 +1107,8 @@ static int do_replace(struct net *net, const void __user *user, if (tmp.num_counters >= INT_MAX / sizeof(struct ebt_counter)) return -ENOMEM; + tmp.name[sizeof(tmp.name) - 1] = 0; + countersize = COUNTER_OFFSET(tmp.nentries) * nr_cpu_ids; newinfo = vmalloc(sizeof(*newinfo) + countersize); if (!newinfo) -- cgit v1.2.3-59-g8ed1b From 20b7975e5aefc7fd08b7f582f3901b1669725cd0 Mon Sep 17 00:00:00 2001 From: Stefan Berger Date: Mon, 14 Feb 2011 16:54:33 +0100 Subject: Revert "netfilter: xt_connlimit: connlimit-above early loop termination" This reverts commit 44bd4de9c2270b22c3c898310102bc6be9ed2978. I have to revert the early loop termination in connlimit since it generates problems when an iptables statement does not use -m state --state NEW before the connlimit match extension. 
Signed-off-by: Stefan Berger Signed-off-by: Patrick McHardy --- net/netfilter/xt_connlimit.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index 82ce7c5fbbc2..e029c4807404 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -97,8 +97,7 @@ static int count_them(struct net *net, const struct nf_conntrack_tuple *tuple, const union nf_inet_addr *addr, const union nf_inet_addr *mask, - u_int8_t family, - unsigned int threshold) + u_int8_t family) { const struct nf_conntrack_tuple_hash *found; struct xt_connlimit_conn *conn; @@ -152,14 +151,9 @@ static int count_them(struct net *net, continue; } - if (same_source_net(addr, mask, &conn->tuple.src.u3, family)) { + if (same_source_net(addr, mask, &conn->tuple.src.u3, family)) /* same source network -> be counted! */ ++matches; - if (matches > threshold) { - nf_ct_put(found_ct); - break; - } - } nf_ct_put(found_ct); } @@ -213,8 +207,7 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par) spin_lock_bh(&info->data->lock); connections = count_them(net, info->data, tuple_ptr, &addr, - &info->mask, par->family, - info->limit); + &info->mask, par->family); spin_unlock_bh(&info->data->lock); if (connections < 0) -- cgit v1.2.3-59-g8ed1b From a2361c8735e07322023aedc36e4938b35af31eb0 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Mon, 14 Feb 2011 17:28:55 +0100 Subject: netfilter: xt_conntrack: warn about use in raw table nfct happens to run after the raw table only. 
Signed-off-by: Jan Engelhardt Signed-off-by: Patrick McHardy --- net/netfilter/xt_conntrack.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c index 4ef1b63ad73f..2c0086a4751e 100644 --- a/net/netfilter/xt_conntrack.c +++ b/net/netfilter/xt_conntrack.c @@ -272,6 +272,11 @@ static int conntrack_mt_check(const struct xt_mtchk_param *par) { int ret; + if (strcmp(par->table, "raw") == 0) { + pr_info("state is undetermined at the time of raw table\n"); + return -EINVAL; + } + ret = nf_ct_l3proto_try_module_get(par->family); if (ret < 0) pr_info("cannot load conntrack support for proto=%u\n", -- cgit v1.2.3-59-g8ed1b From 8248779b1878f17cce2bb809831f4f2a252bdb77 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 15 Feb 2011 21:59:37 +0100 Subject: netfilter: nfnetlink_log: remove unused parameter Signed-off-by: Florian Westphal Signed-off-by: Patrick McHardy --- net/netfilter/nfnetlink_log.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index 91592da504b9..985e9b76c916 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -376,7 +376,6 @@ __build_packet_message(struct nfulnl_instance *inst, unsigned int hooknum, const struct net_device *indev, const struct net_device *outdev, - const struct nf_loginfo *li, const char *prefix, unsigned int plen) { struct nfulnl_msg_packet_hdr pmsg; @@ -652,7 +651,7 @@ nfulnl_log_packet(u_int8_t pf, inst->qlen++; __build_packet_message(inst, skb, data_len, pf, - hooknum, in, out, li, prefix, plen); + hooknum, in, out, prefix, plen); if (inst->qlen >= qthreshold) __nfulnl_flush(inst); -- cgit v1.2.3-59-g8ed1b From 16a7fd323f93eab88df79fc647575ae9789037c2 Mon Sep 17 00:00:00 2001 From: Tinggong Wang Date: Wed, 9 Feb 2011 02:21:59 +0200 Subject: ipvs: fix timer in get_curr_sync_buff Fix get_curr_sync_buff to keep buffer for 2 seconds as intended, not just for 
the current jiffie. By this way we will sync more connection structures with single packet. Signed-off-by: Tinggong Wang Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_sync.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index d1b7298e5894..fecf24de4af3 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -374,8 +374,8 @@ get_curr_sync_buff(struct netns_ipvs *ipvs, unsigned long time) struct ip_vs_sync_buff *sb; spin_lock_bh(&ipvs->sync_buff_lock); - if (ipvs->sync_buff && (time == 0 || - time_before(jiffies - ipvs->sync_buff->firstuse, time))) { + if (ipvs->sync_buff && + time_after_eq(jiffies - ipvs->sync_buff->firstuse, time)) { sb = ipvs->sync_buff; ipvs->sync_buff = NULL; } else -- cgit v1.2.3-59-g8ed1b From 6cb90db502c5f276c8d6256762cc3acde4d3bd9d Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Wed, 9 Feb 2011 02:26:38 +0200 Subject: ipvs: remove extra lookups for ICMP packets Remove code that should not be called anymore. Now when ip_vs_out handles replies for local clients at LOCAL_IN hook we do not need to call conn_out_get and handle_response_icmp from ip_vs_in_icmp* because such lookups were already performed for the ICMP packet and no connection was found. Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_core.c | 28 +++------------------------- 1 file changed, 3 insertions(+), 25 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 4d06617fab6c..2d1f932add46 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -729,7 +729,7 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, #endif /* Handle relevant response ICMP messages - forward to the right - * destination host. Used for NAT and local client. + * destination host. 
*/ static int handle_response_icmp(int af, struct sk_buff *skb, union nf_inet_addr *snet, @@ -979,7 +979,6 @@ static inline int is_tcp_reset(const struct sk_buff *skb, int nh_len) } /* Handle response packets: rewrite addresses and send away... - * Used for NAT and local client. */ static unsigned int handle_response(int af, struct sk_buff *skb, struct ip_vs_proto_data *pd, @@ -1280,7 +1279,6 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; unsigned int offset, ihl, verdict; - union nf_inet_addr snet; *related = 1; @@ -1339,17 +1337,8 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) ip_vs_fill_iphdr(AF_INET, cih, &ciph); /* The embedded headers contain source and dest in reverse order */ cp = pp->conn_in_get(AF_INET, skb, &ciph, offset, 1); - if (!cp) { - /* The packet could also belong to a local client */ - cp = pp->conn_out_get(AF_INET, skb, &ciph, offset, 1); - if (cp) { - snet.ip = iph->saddr; - return handle_response_icmp(AF_INET, skb, &snet, - cih->protocol, cp, pp, - offset, ihl); - } + if (!cp) return NF_ACCEPT; - } verdict = NF_DROP; @@ -1395,7 +1384,6 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) struct ip_vs_protocol *pp; struct ip_vs_proto_data *pd; unsigned int offset, verdict; - union nf_inet_addr snet; struct rt6_info *rt; *related = 1; @@ -1455,18 +1443,8 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) ip_vs_fill_iphdr(AF_INET6, cih, &ciph); /* The embedded headers contain source and dest in reverse order */ cp = pp->conn_in_get(AF_INET6, skb, &ciph, offset, 1); - if (!cp) { - /* The packet could also belong to a local client */ - cp = pp->conn_out_get(AF_INET6, skb, &ciph, offset, 1); - if (cp) { - ipv6_addr_copy(&snet.in6, &iph->saddr); - return handle_response_icmp(AF_INET6, skb, &snet, - cih->nexthdr, - cp, pp, offset, - sizeof(struct ipv6hdr)); - } + if (!cp) return NF_ACCEPT; - } 
verdict = NF_DROP; -- cgit v1.2.3-59-g8ed1b From 41ac51eeda58a85b8a06d748cce7035cc77deebd Mon Sep 17 00:00:00 2001 From: Patrick Schaaf Date: Fri, 11 Feb 2011 14:01:12 +0100 Subject: ipvs: make "no destination available" message more informative When IP_VS schedulers do not find a destination, they output a terse "WLC: no destination available" message through kernel syslog, which I can only make sense of because syslog puts them in a logfile together with keepalived checker results. This patch makes the output a bit more informative, by telling you which virtual service failed to find a destination. Example output: kernel: [1539214.552233] IPVS: wlc: TCP 192.168.8.30:22 - no destination available kernel: [1539299.674418] IPVS: wlc: FWM 22 0x00000016 - no destination available I have tested the code for IPv4 and FWM services, as you can see from the example; I do not have an IPv6 setup to test the third code path with. To avoid code duplication, I put a new function ip_vs_scheduler_err() into ip_vs_sched.c, and use that from the schedulers instead of calling IP_VS_ERR_RL directly. 
Signed-off-by: Patrick Schaaf Signed-off-by: Simon Horman --- include/net/ip_vs.h | 2 ++ net/netfilter/ipvs/ip_vs_lblc.c | 2 +- net/netfilter/ipvs/ip_vs_lblcr.c | 2 +- net/netfilter/ipvs/ip_vs_lc.c | 2 +- net/netfilter/ipvs/ip_vs_nq.c | 2 +- net/netfilter/ipvs/ip_vs_rr.c | 2 +- net/netfilter/ipvs/ip_vs_sched.c | 25 +++++++++++++++++++++++++ net/netfilter/ipvs/ip_vs_sed.c | 2 +- net/netfilter/ipvs/ip_vs_sh.c | 2 +- net/netfilter/ipvs/ip_vs_wlc.c | 2 +- net/netfilter/ipvs/ip_vs_wrr.c | 14 ++++++++------ 11 files changed, 43 insertions(+), 14 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 5d75feadf4f4..93995494dfd4 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1019,6 +1019,8 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, extern int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, struct ip_vs_proto_data *pd); +extern void ip_vs_scheduler_err(struct ip_vs_service *svc, const char *msg); + /* * IPVS control data and functions (from ip_vs_ctl.c) diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index 00b5ffab3768..4a9c8cd19690 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -510,7 +510,7 @@ ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) /* No cache entry or it is invalid, time to schedule */ dest = __ip_vs_lblc_schedule(svc); if (!dest) { - IP_VS_ERR_RL("LBLC: no destination available\n"); + ip_vs_scheduler_err(svc, "no destination available"); return NULL; } diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index bfa25f1ea9e4..bd329b1e9589 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -692,7 +692,7 @@ ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) /* The cache entry is invalid, time to schedule */ dest = __ip_vs_lblcr_schedule(svc); if (!dest) { - IP_VS_ERR_RL("LBLCR: no destination available\n"); + 
ip_vs_scheduler_err(svc, "no destination available"); read_unlock(&svc->sched_lock); return NULL; } diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c index 4f69db1fac56..60638007c6c7 100644 --- a/net/netfilter/ipvs/ip_vs_lc.c +++ b/net/netfilter/ipvs/ip_vs_lc.c @@ -70,7 +70,7 @@ ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) } if (!least) - IP_VS_ERR_RL("LC: no destination available\n"); + ip_vs_scheduler_err(svc, "no destination available"); else IP_VS_DBG_BUF(6, "LC: server %s:%u activeconns %d " "inactconns %d\n", diff --git a/net/netfilter/ipvs/ip_vs_nq.c b/net/netfilter/ipvs/ip_vs_nq.c index c413e1830823..984d9c137d84 100644 --- a/net/netfilter/ipvs/ip_vs_nq.c +++ b/net/netfilter/ipvs/ip_vs_nq.c @@ -99,7 +99,7 @@ ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) } if (!least) { - IP_VS_ERR_RL("NQ: no destination available\n"); + ip_vs_scheduler_err(svc, "no destination available"); return NULL; } diff --git a/net/netfilter/ipvs/ip_vs_rr.c b/net/netfilter/ipvs/ip_vs_rr.c index e210f37d8ea2..c49b388d1085 100644 --- a/net/netfilter/ipvs/ip_vs_rr.c +++ b/net/netfilter/ipvs/ip_vs_rr.c @@ -72,7 +72,7 @@ ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) q = q->next; } while (q != p); write_unlock(&svc->sched_lock); - IP_VS_ERR_RL("RR: no destination available\n"); + ip_vs_scheduler_err(svc, "no destination available"); return NULL; out: diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c index 076ebe00435d..08dbdd5bc18f 100644 --- a/net/netfilter/ipvs/ip_vs_sched.c +++ b/net/netfilter/ipvs/ip_vs_sched.c @@ -29,6 +29,7 @@ #include +EXPORT_SYMBOL(ip_vs_scheduler_err); /* * IPVS scheduler list */ @@ -146,6 +147,30 @@ void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler) module_put(scheduler->module); } +/* + * Common error output helper for schedulers + */ + +void ip_vs_scheduler_err(struct ip_vs_service *svc, const char *msg) +{ + if 
(svc->fwmark) { + IP_VS_ERR_RL("%s: FWM %u 0x%08X - %s\n", + svc->scheduler->name, svc->fwmark, + svc->fwmark, msg); +#ifdef CONFIG_IP_VS_IPV6 + } else if (svc->af == AF_INET6) { + IP_VS_ERR_RL("%s: %s [%pI6]:%d - %s\n", + svc->scheduler->name, + ip_vs_proto_name(svc->protocol), + &svc->addr.in6, ntohs(svc->port), msg); +#endif + } else { + IP_VS_ERR_RL("%s: %s %pI4:%d - %s\n", + svc->scheduler->name, + ip_vs_proto_name(svc->protocol), + &svc->addr.ip, ntohs(svc->port), msg); + } +} /* * Register a scheduler in the scheduler list diff --git a/net/netfilter/ipvs/ip_vs_sed.c b/net/netfilter/ipvs/ip_vs_sed.c index 1ab75a9dc400..89ead246ed3d 100644 --- a/net/netfilter/ipvs/ip_vs_sed.c +++ b/net/netfilter/ipvs/ip_vs_sed.c @@ -87,7 +87,7 @@ ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) goto nextstage; } } - IP_VS_ERR_RL("SED: no destination available\n"); + ip_vs_scheduler_err(svc, "no destination available"); return NULL; /* diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c index e6cc174fbc06..b5e2556c581a 100644 --- a/net/netfilter/ipvs/ip_vs_sh.c +++ b/net/netfilter/ipvs/ip_vs_sh.c @@ -223,7 +223,7 @@ ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) || !(dest->flags & IP_VS_DEST_F_AVAILABLE) || atomic_read(&dest->weight) <= 0 || is_overloaded(dest)) { - IP_VS_ERR_RL("SH: no destination available\n"); + ip_vs_scheduler_err(svc, "no destination available"); return NULL; } diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c index bbddfdb10db2..fdf0f58962a4 100644 --- a/net/netfilter/ipvs/ip_vs_wlc.c +++ b/net/netfilter/ipvs/ip_vs_wlc.c @@ -75,7 +75,7 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) goto nextstage; } } - IP_VS_ERR_RL("WLC: no destination available\n"); + ip_vs_scheduler_err(svc, "no destination available"); return NULL; /* diff --git a/net/netfilter/ipvs/ip_vs_wrr.c b/net/netfilter/ipvs/ip_vs_wrr.c index 30db633f88f1..1ef41f50723c 
100644 --- a/net/netfilter/ipvs/ip_vs_wrr.c +++ b/net/netfilter/ipvs/ip_vs_wrr.c @@ -147,8 +147,9 @@ ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) if (mark->cl == mark->cl->next) { /* no dest entry */ - IP_VS_ERR_RL("WRR: no destination available: " - "no destinations present\n"); + ip_vs_scheduler_err(svc, + "no destination available: " + "no destinations present"); dest = NULL; goto out; } @@ -162,8 +163,8 @@ ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) */ if (mark->cw == 0) { mark->cl = &svc->destinations; - IP_VS_ERR_RL("WRR: no destination " - "available\n"); + ip_vs_scheduler_err(svc, + "no destination available"); dest = NULL; goto out; } @@ -185,8 +186,9 @@ ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) /* back to the start, and no dest is found. It is only possible when all dests are OVERLOADED */ dest = NULL; - IP_VS_ERR_RL("WRR: no destination available: " - "all destinations are overloaded\n"); + ip_vs_scheduler_err(svc, + "no destination available: " + "all destinations are overloaded"); goto out; } } -- cgit v1.2.3-59-g8ed1b From 731109e78415b4cc6c2f8de6c11b37f0e40741f8 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Sat, 19 Feb 2011 18:05:08 +0800 Subject: ipvs: use hlist instead of list Signed-off-by: Changli Gao Signed-off-by: Simon Horman --- include/net/ip_vs.h | 2 +- net/netfilter/ipvs/ip_vs_conn.c | 52 +++++++++++++++++++++++------------------ 2 files changed, 30 insertions(+), 24 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 93995494dfd4..17b01b2d48f9 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -494,7 +494,7 @@ struct ip_vs_conn_param { * IP_VS structure allocated for each dynamically scheduled connection */ struct ip_vs_conn { - struct list_head c_list; /* hashed list heads */ + struct hlist_node c_list; /* hashed list heads */ #ifdef CONFIG_NET_NS struct net *net; /* Name space */ #endif diff --git 
a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 83233fe24a08..9c2a517b69c8 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -59,7 +59,7 @@ static int ip_vs_conn_tab_mask __read_mostly; /* * Connection hash table: for input and output packets lookups of IPVS */ -static struct list_head *ip_vs_conn_tab __read_mostly; +static struct hlist_head *ip_vs_conn_tab __read_mostly; /* SLAB cache for IPVS connections */ static struct kmem_cache *ip_vs_conn_cachep __read_mostly; @@ -201,7 +201,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) spin_lock(&cp->lock); if (!(cp->flags & IP_VS_CONN_F_HASHED)) { - list_add(&cp->c_list, &ip_vs_conn_tab[hash]); + hlist_add_head(&cp->c_list, &ip_vs_conn_tab[hash]); cp->flags |= IP_VS_CONN_F_HASHED; atomic_inc(&cp->refcnt); ret = 1; @@ -234,7 +234,7 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) spin_lock(&cp->lock); if (cp->flags & IP_VS_CONN_F_HASHED) { - list_del(&cp->c_list); + hlist_del(&cp->c_list); cp->flags &= ~IP_VS_CONN_F_HASHED; atomic_dec(&cp->refcnt); ret = 1; @@ -259,12 +259,13 @@ __ip_vs_conn_in_get(const struct ip_vs_conn_param *p) { unsigned hash; struct ip_vs_conn *cp; + struct hlist_node *n; hash = ip_vs_conn_hashkey_param(p, false); ct_read_lock(hash); - list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { + hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) { if (cp->af == p->af && p->cport == cp->cport && p->vport == cp->vport && ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && @@ -345,12 +346,13 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) { unsigned hash; struct ip_vs_conn *cp; + struct hlist_node *n; hash = ip_vs_conn_hashkey_param(p, false); ct_read_lock(hash); - list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { + hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) { if (!ip_vs_conn_net_eq(cp, p->net)) continue; if (p->pe_data && p->pe->ct_match) { @@ -394,6 +396,7 @@ 
struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) { unsigned hash; struct ip_vs_conn *cp, *ret=NULL; + struct hlist_node *n; /* * Check for "full" addressed entries @@ -402,7 +405,7 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) ct_read_lock(hash); - list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { + hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) { if (cp->af == p->af && p->vport == cp->cport && p->cport == cp->dport && ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) && @@ -818,7 +821,7 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, return NULL; } - INIT_LIST_HEAD(&cp->c_list); + INIT_HLIST_NODE(&cp->c_list); setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); ip_vs_conn_net_set(cp, p->net); cp->af = p->af; @@ -894,8 +897,8 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, */ #ifdef CONFIG_PROC_FS struct ip_vs_iter_state { - struct seq_net_private p; - struct list_head *l; + struct seq_net_private p; + struct hlist_head *l; }; static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) @@ -903,13 +906,14 @@ static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) int idx; struct ip_vs_conn *cp; struct ip_vs_iter_state *iter = seq->private; + struct hlist_node *n; for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { ct_read_lock_bh(idx); - list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { + hlist_for_each_entry(cp, n, &ip_vs_conn_tab[idx], c_list) { if (pos-- == 0) { iter->l = &ip_vs_conn_tab[idx]; - return cp; + return cp; } } ct_read_unlock_bh(idx); @@ -930,7 +934,8 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct ip_vs_conn *cp = v; struct ip_vs_iter_state *iter = seq->private; - struct list_head *e, *l = iter->l; + struct hlist_node *e; + struct hlist_head *l = iter->l; int idx; ++*pos; @@ -938,15 +943,15 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) return ip_vs_conn_array(seq, 0); /* more 
on same hash chain? */ - if ((e = cp->c_list.next) != l) - return list_entry(e, struct ip_vs_conn, c_list); + if ((e = cp->c_list.next)) + return hlist_entry(e, struct ip_vs_conn, c_list); idx = l - ip_vs_conn_tab; ct_read_unlock_bh(idx); while (++idx < ip_vs_conn_tab_size) { ct_read_lock_bh(idx); - list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { + hlist_for_each_entry(cp, e, &ip_vs_conn_tab[idx], c_list) { iter->l = &ip_vs_conn_tab[idx]; return cp; } @@ -959,7 +964,7 @@ static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v) { struct ip_vs_iter_state *iter = seq->private; - struct list_head *l = iter->l; + struct hlist_head *l = iter->l; if (l) ct_read_unlock_bh(l - ip_vs_conn_tab); @@ -1148,13 +1153,14 @@ void ip_vs_random_dropentry(struct net *net) */ for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) { unsigned hash = net_random() & ip_vs_conn_tab_mask; + struct hlist_node *n; /* * Lock is actually needed in this loop. */ ct_write_lock_bh(hash); - list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { + hlist_for_each_entry(cp, n, &ip_vs_conn_tab[hash], c_list) { if (cp->flags & IP_VS_CONN_F_TEMPLATE) /* connection template */ continue; @@ -1202,12 +1208,14 @@ static void ip_vs_conn_flush(struct net *net) flush_again: for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { + struct hlist_node *n; + /* * Lock is actually needed in this loop. 
*/ ct_write_lock_bh(idx); - list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { + hlist_for_each_entry(cp, n, &ip_vs_conn_tab[idx], c_list) { if (!ip_vs_conn_net_eq(cp, net)) continue; IP_VS_DBG(4, "del connection\n"); @@ -1265,8 +1273,7 @@ int __init ip_vs_conn_init(void) /* * Allocate the connection hash table and initialize its list heads */ - ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size * - sizeof(struct list_head)); + ip_vs_conn_tab = vmalloc(ip_vs_conn_tab_size * sizeof(*ip_vs_conn_tab)); if (!ip_vs_conn_tab) return -ENOMEM; @@ -1286,9 +1293,8 @@ int __init ip_vs_conn_init(void) IP_VS_DBG(0, "Each connection entry needs %Zd bytes at least\n", sizeof(struct ip_vs_conn)); - for (idx = 0; idx < ip_vs_conn_tab_size; idx++) { - INIT_LIST_HEAD(&ip_vs_conn_tab[idx]); - } + for (idx = 0; idx < ip_vs_conn_tab_size; idx++) + INIT_HLIST_HEAD(&ip_vs_conn_tab[idx]); for (idx = 0; idx < CT_LOCKARRAY_SIZE; idx++) { rwlock_init(&__ip_vs_conntbl_lock_array[idx].l); -- cgit v1.2.3-59-g8ed1b From 17a8f8e3734920cf2f030f2fa521a0b940ef6f90 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Thu, 24 Feb 2011 08:19:57 +0800 Subject: ipvs: use enum to instead of magic numbers Signed-off-by: Changli Gao Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_xmit.c | 41 +++++++++++++++++++++++++++-------------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 1f2a4e35fb11..a48239aba33b 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -43,6 +43,13 @@ #include +enum { + IP_VS_RT_MODE_LOCAL = 1, /* Allow local dest */ + IP_VS_RT_MODE_NON_LOCAL = 2, /* Allow non-local dest */ + IP_VS_RT_MODE_RDR = 4, /* Allow redirect from remote daddr to + * local + */ +}; /* * Destination cache to speed up outgoing route lookup @@ -77,11 +84,7 @@ __ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos) return dst; } -/* - * Get route to destination or remote server - * rt_mode: flags, 
&1=Allow local dest, &2=Allow non-local dest, - * &4=Allow redirect from remote daddr to local - */ +/* Get route to destination or remote server */ static struct rtable * __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest, __be32 daddr, u32 rtos, int rt_mode) @@ -126,15 +129,16 @@ __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest, } local = rt->rt_flags & RTCF_LOCAL; - if (!((local ? 1 : 2) & rt_mode)) { + if (!((local ? IP_VS_RT_MODE_LOCAL : IP_VS_RT_MODE_NON_LOCAL) & + rt_mode)) { IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI4\n", (rt->rt_flags & RTCF_LOCAL) ? "local":"non-local", &rt->rt_dst); ip_rt_put(rt); return NULL; } - if (local && !(rt_mode & 4) && !((ort = skb_rtable(skb)) && - ort->rt_flags & RTCF_LOCAL)) { + if (local && !(rt_mode & IP_VS_RT_MODE_RDR) && + !((ort = skb_rtable(skb)) && ort->rt_flags & RTCF_LOCAL)) { IP_VS_DBG_RL("Redirect from non-local address %pI4 to local " "requires NAT method, dest: %pI4\n", &ip_hdr(skb)->daddr, &rt->rt_dst); @@ -383,8 +387,8 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); - if (!(rt = __ip_vs_get_out_rt(skb, NULL, iph->daddr, - RT_TOS(iph->tos), 2))) + if (!(rt = __ip_vs_get_out_rt(skb, NULL, iph->daddr, RT_TOS(iph->tos), + IP_VS_RT_MODE_NON_LOCAL))) goto tx_error_icmp; /* MTU checking */ @@ -512,7 +516,10 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, } if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, - RT_TOS(iph->tos), 1|2|4))) + RT_TOS(iph->tos), + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_RDR))) goto tx_error_icmp; local = rt->rt_flags & RTCF_LOCAL; /* @@ -755,7 +762,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, - RT_TOS(tos), 1|2))) + RT_TOS(tos), IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL))) goto tx_error_icmp; if (rt->rt_flags & RTCF_LOCAL) { ip_rt_put(rt); @@ -984,7 +992,9 @@ 
ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, - RT_TOS(iph->tos), 1|2))) + RT_TOS(iph->tos), + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL))) goto tx_error_icmp; if (rt->rt_flags & RTCF_LOCAL) { ip_rt_put(rt); @@ -1128,7 +1138,10 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, */ if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, - RT_TOS(ip_hdr(skb)->tos), 1|2|4))) + RT_TOS(ip_hdr(skb)->tos), + IP_VS_RT_MODE_LOCAL | + IP_VS_RT_MODE_NON_LOCAL | + IP_VS_RT_MODE_RDR))) goto tx_error_icmp; local = rt->rt_flags & RTCF_LOCAL; -- cgit v1.2.3-59-g8ed1b From b552f7e3a9524abcbcdf86f0a99b2be58e55a9c6 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Sat, 19 Feb 2011 17:32:28 +0800 Subject: ipvs: unify the formula to estimate the overhead of processing connections lc and wlc use the same formula, but lblc and lblcr use another one. There is no reason for using two different formulas for the lc variants. The formula used by lc is used by all the lc variants in this patch. Signed-off-by: Changli Gao Acked-by: Wensong Zhang Signed-off-by: Simon Horman --- include/net/ip_vs.h | 14 ++++++++++++++ net/netfilter/ipvs/ip_vs_lblc.c | 13 +++---------- net/netfilter/ipvs/ip_vs_lblcr.c | 25 +++++++------------------ net/netfilter/ipvs/ip_vs_lc.c | 18 +----------------- net/netfilter/ipvs/ip_vs_wlc.c | 20 ++------------------ 5 files changed, 27 insertions(+), 63 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 17b01b2d48f9..e74da41ebd1b 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -1243,6 +1243,20 @@ static inline void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp) /* CONFIG_IP_VS_NFCT */ #endif +static inline unsigned int +ip_vs_dest_conn_overhead(struct ip_vs_dest *dest) +{ + /* + * We think the overhead of processing active connections is 256 + * times higher than that of inactive connections in average. 
(This + * 256 times might not be accurate, we will change it later) We + * use the following formula to estimate the overhead now: + * dest->activeconns*256 + dest->inactconns + */ + return (atomic_read(&dest->activeconns) << 8) + + atomic_read(&dest->inactconns); +} + #endif /* __KERNEL__ */ #endif /* _NET_IP_VS_H */ diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c index 4a9c8cd19690..6bf7a807649c 100644 --- a/net/netfilter/ipvs/ip_vs_lblc.c +++ b/net/netfilter/ipvs/ip_vs_lblc.c @@ -389,12 +389,7 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc) int loh, doh; /* - * We think the overhead of processing active connections is fifty - * times higher than that of inactive connections in average. (This - * fifty times might not be accurate, we will change it later.) We - * use the following formula to estimate the overhead: - * dest->activeconns*50 + dest->inactconns - * and the load: + * We use the following formula to estimate the load: * (dest overhead) / dest->weight * * Remember -- no floats in kernel mode!!! 
@@ -410,8 +405,7 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc) continue; if (atomic_read(&dest->weight) > 0) { least = dest; - loh = atomic_read(&least->activeconns) * 50 - + atomic_read(&least->inactconns); + loh = ip_vs_dest_conn_overhead(least); goto nextstage; } } @@ -425,8 +419,7 @@ __ip_vs_lblc_schedule(struct ip_vs_service *svc) if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; - doh = atomic_read(&dest->activeconns) * 50 - + atomic_read(&dest->inactconns); + doh = ip_vs_dest_conn_overhead(dest); if (loh * atomic_read(&dest->weight) > doh * atomic_read(&least->weight)) { least = dest; diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c index bd329b1e9589..00631765b92a 100644 --- a/net/netfilter/ipvs/ip_vs_lblcr.c +++ b/net/netfilter/ipvs/ip_vs_lblcr.c @@ -178,8 +178,7 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) if ((atomic_read(&least->weight) > 0) && (least->flags & IP_VS_DEST_F_AVAILABLE)) { - loh = atomic_read(&least->activeconns) * 50 - + atomic_read(&least->inactconns); + loh = ip_vs_dest_conn_overhead(least); goto nextstage; } } @@ -192,8 +191,7 @@ static inline struct ip_vs_dest *ip_vs_dest_set_min(struct ip_vs_dest_set *set) if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; - doh = atomic_read(&dest->activeconns) * 50 - + atomic_read(&dest->inactconns); + doh = ip_vs_dest_conn_overhead(dest); if ((loh * atomic_read(&dest->weight) > doh * atomic_read(&least->weight)) && (dest->flags & IP_VS_DEST_F_AVAILABLE)) { @@ -228,8 +226,7 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) list_for_each_entry(e, &set->list, list) { most = e->dest; if (atomic_read(&most->weight) > 0) { - moh = atomic_read(&most->activeconns) * 50 - + atomic_read(&most->inactconns); + moh = ip_vs_dest_conn_overhead(most); goto nextstage; } } @@ -239,8 +236,7 @@ static inline struct ip_vs_dest *ip_vs_dest_set_max(struct ip_vs_dest_set *set) nextstage: list_for_each_entry(e, 
&set->list, list) { dest = e->dest; - doh = atomic_read(&dest->activeconns) * 50 - + atomic_read(&dest->inactconns); + doh = ip_vs_dest_conn_overhead(dest); /* moh/mw < doh/dw ==> moh*dw < doh*mw, where mw,dw>0 */ if ((moh * atomic_read(&dest->weight) < doh * atomic_read(&most->weight)) @@ -563,12 +559,7 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc) int loh, doh; /* - * We think the overhead of processing active connections is fifty - * times higher than that of inactive connections in average. (This - * fifty times might not be accurate, we will change it later.) We - * use the following formula to estimate the overhead: - * dest->activeconns*50 + dest->inactconns - * and the load: + * We use the following formula to estimate the load: * (dest overhead) / dest->weight * * Remember -- no floats in kernel mode!!! @@ -585,8 +576,7 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc) if (atomic_read(&dest->weight) > 0) { least = dest; - loh = atomic_read(&least->activeconns) * 50 - + atomic_read(&least->inactconns); + loh = ip_vs_dest_conn_overhead(least); goto nextstage; } } @@ -600,8 +590,7 @@ __ip_vs_lblcr_schedule(struct ip_vs_service *svc) if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; - doh = atomic_read(&dest->activeconns) * 50 - + atomic_read(&dest->inactconns); + doh = ip_vs_dest_conn_overhead(dest); if (loh * atomic_read(&dest->weight) > doh * atomic_read(&least->weight)) { least = dest; diff --git a/net/netfilter/ipvs/ip_vs_lc.c b/net/netfilter/ipvs/ip_vs_lc.c index 60638007c6c7..f391819c0cca 100644 --- a/net/netfilter/ipvs/ip_vs_lc.c +++ b/net/netfilter/ipvs/ip_vs_lc.c @@ -22,22 +22,6 @@ #include - -static inline unsigned int -ip_vs_lc_dest_overhead(struct ip_vs_dest *dest) -{ - /* - * We think the overhead of processing active connections is 256 - * times higher than that of inactive connections in average. 
(This - * 256 times might not be accurate, we will change it later) We - * use the following formula to estimate the overhead now: - * dest->activeconns*256 + dest->inactconns - */ - return (atomic_read(&dest->activeconns) << 8) + - atomic_read(&dest->inactconns); -} - - /* * Least Connection scheduling */ @@ -62,7 +46,7 @@ ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) if ((dest->flags & IP_VS_DEST_F_OVERLOAD) || atomic_read(&dest->weight) == 0) continue; - doh = ip_vs_lc_dest_overhead(dest); + doh = ip_vs_dest_conn_overhead(dest); if (!least || doh < loh) { least = dest; loh = doh; diff --git a/net/netfilter/ipvs/ip_vs_wlc.c b/net/netfilter/ipvs/ip_vs_wlc.c index fdf0f58962a4..bc1bfc48a17f 100644 --- a/net/netfilter/ipvs/ip_vs_wlc.c +++ b/net/netfilter/ipvs/ip_vs_wlc.c @@ -27,22 +27,6 @@ #include - -static inline unsigned int -ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest) -{ - /* - * We think the overhead of processing active connections is 256 - * times higher than that of inactive connections in average. 
(This - * 256 times might not be accurate, we will change it later) We - * use the following formula to estimate the overhead now: - * dest->activeconns*256 + dest->inactconns - */ - return (atomic_read(&dest->activeconns) << 8) + - atomic_read(&dest->inactconns); -} - - /* * Weighted Least Connection scheduling */ @@ -71,7 +55,7 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) if (!(dest->flags & IP_VS_DEST_F_OVERLOAD) && atomic_read(&dest->weight) > 0) { least = dest; - loh = ip_vs_wlc_dest_overhead(least); + loh = ip_vs_dest_conn_overhead(least); goto nextstage; } } @@ -85,7 +69,7 @@ ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) list_for_each_entry_continue(dest, &svc->destinations, n_list) { if (dest->flags & IP_VS_DEST_F_OVERLOAD) continue; - doh = ip_vs_wlc_dest_overhead(dest); + doh = ip_vs_dest_conn_overhead(dest); if (loh * atomic_read(&dest->weight) > doh * atomic_read(&least->weight)) { least = dest; -- cgit v1.2.3-59-g8ed1b From 8a80c79a776d1b1b54895314ffaf53d0c7604c80 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 28 Feb 2011 17:59:15 +0100 Subject: netfilter: nf_ct_tcp: fix out of sync scenario while in SYN_RECV This patch fixes the out of sync scenarios while in SYN_RECV state. Quoting Jozsef, what it happens if we are out of sync if the following: > > b. conntrack entry is outdated, new SYN received > > - (b1) we ignore it but save the initialization data from it > > - (b2) when the reply SYN/ACK receives and it matches the saved data, > > we pick up the new connection This is what it should happen if we are in SYN_RECV state. Initially, the SYN packet hits b1, thus we save data from it. But the SYN/ACK packet is considered a retransmission given that we're in SYN_RECV state. Therefore, we never hit b2 and we don't get in sync. To fix this, we ignore SYN/ACK if we are in SYN_RECV. If the previous packet was a SYN, then we enter the ignore case that get us in sync. 
This patch helps conntrackd a lot in stress scenarios (assuming a client that generates lots of small TCP connections). During the failover, consider that the new primary has injected one outdated flow in SYN_RECV state (this is likely to happen if the conntrack event rate is high because the backup will be a bit delayed from the primary). With the current code, if the client starts a new fresh connection that matches the tuple, the SYN packet will be ignored without updating the state tracking, and the SYN+ACK in reply will be blocked as it will not pass checks III or IV (since all state tracking in the original direction is not initialized because the SYN packet was ignored and the ignore case that gets us in sync is not applied). I posted a couple of patches before this one. Changli Gao spotted a simpler way to fix this problem. This patch implements his idea. Cc: Changli Gao Cc: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso Signed-off-by: Jozsef Kadlecsik Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_proto_tcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 6f38d0e2ea4a..37bf94394be0 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -227,11 +227,11 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sCL -> sIV */ /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ -/*synack*/ { sIV, sSR, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sSR }, +/*synack*/ { sIV, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR }, /* * sSS -> sSR Standard open. * sS2 -> sSR Simultaneous open - * sSR -> sSR Retransmitted SYN/ACK. + * sSR -> sIG Retransmitted SYN/ACK, ignore it. * sES -> sIG Late retransmitted SYN/ACK? * sFW -> sIG Might be SYN/ACK answering ignored SYN * sCW -> sIG -- cgit v1.2.3-59-g8ed1b