aboutsummaryrefslogtreecommitdiffstats
path: root/net/netfilter/ipvs
diff options
context:
space:
mode:
Diffstat (limited to 'net/netfilter/ipvs')
-rw-r--r--net/netfilter/ipvs/Kconfig37
-rw-r--r--net/netfilter/ipvs/Makefile1
-rw-r--r--net/netfilter/ipvs/ip_vs_app.c24
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_dh.c3
-rw-r--r--net/netfilter/ipvs/ip_vs_ftp.c467
-rw-r--r--net/netfilter/ipvs/ip_vs_lblc.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_lblcr.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_mh.c540
-rw-r--r--net/netfilter/ipvs/ip_vs_nfct.c101
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_sctp.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_tcp.c8
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_udp.c4
-rw-r--r--net/netfilter/ipvs/ip_vs_sh.c3
-rw-r--r--net/netfilter/ipvs/ip_vs_xmit.c5
15 files changed, 971 insertions, 238 deletions
diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig
index b32fb0dbe237..05dc1b77e466 100644
--- a/net/netfilter/ipvs/Kconfig
+++ b/net/netfilter/ipvs/Kconfig
@@ -225,6 +225,25 @@ config IP_VS_SH
If you want to compile it in kernel, say Y. To compile it as a
module, choose M here. If unsure, say N.
+config IP_VS_MH
+ tristate "maglev hashing scheduling"
+ ---help---
+ The maglev consistent hashing scheduling algorithm provides the
+ Google's Maglev hashing algorithm as a IPVS scheduler. It assigns
+ network connections to the servers through looking up a statically
+ assigned special hash table called the lookup table. Maglev hashing
+ is to assign a preference list of all the lookup table positions
+ to each destination.
+
+ Through this operation, The maglev hashing gives an almost equal
+ share of the lookup table to each of the destinations and provides
+ minimal disruption by using the lookup table. When the set of
+ destinations changes, a connection will likely be sent to the same
+ destination as it was before.
+
+ If you want to compile it in kernel, say Y. To compile it as a
+ module, choose M here. If unsure, say N.
+
config IP_VS_SED
tristate "shortest expected delay scheduling"
---help---
@@ -266,6 +285,24 @@ config IP_VS_SH_TAB_BITS
needs to be large enough to effectively fit all the destinations
multiplied by their respective weights.
+comment 'IPVS MH scheduler'
+
+config IP_VS_MH_TAB_INDEX
+ int "IPVS maglev hashing table index of size (the prime numbers)"
+ range 8 17
+ default 12
+ ---help---
+ The maglev hashing scheduler maps source IPs to destinations
+ stored in a hash table. This table is assigned by a preference
+ list of the positions to each destination until all slots in
+ the table are filled. The index determines the prime for size of
+ the table as 251, 509, 1021, 2039, 4093, 8191, 16381, 32749,
+ 65521 or 131071. When using weights to allow destinations to
+ receive more connections, the table is assigned an amount
+ proportional to the weights specified. The table needs to be large
+ enough to effectively fit all the destinations multiplied by their
+ respective weights.
+
comment 'IPVS application helper'
config IP_VS_FTP
diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile
index c552993fa4b9..bfce2677fda2 100644
--- a/net/netfilter/ipvs/Makefile
+++ b/net/netfilter/ipvs/Makefile
@@ -33,6 +33,7 @@ obj-$(CONFIG_IP_VS_LBLC) += ip_vs_lblc.o
obj-$(CONFIG_IP_VS_LBLCR) += ip_vs_lblcr.o
obj-$(CONFIG_IP_VS_DH) += ip_vs_dh.o
obj-$(CONFIG_IP_VS_SH) += ip_vs_sh.o
+obj-$(CONFIG_IP_VS_MH) += ip_vs_mh.o
obj-$(CONFIG_IP_VS_SED) += ip_vs_sed.o
obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index c3db074fc1f7..7588aeaa605f 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -355,7 +355,8 @@ static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq,
}
static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
- struct ip_vs_app *app)
+ struct ip_vs_app *app,
+ struct ip_vs_iphdr *ipvsh)
{
int diff;
const unsigned int tcp_offset = ip_hdrlen(skb);
@@ -386,7 +387,7 @@ static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
if (app->pkt_out == NULL)
return 1;
- if (!app->pkt_out(app, cp, skb, &diff))
+ if (!app->pkt_out(app, cp, skb, &diff, ipvsh))
return 0;
/*
@@ -404,7 +405,8 @@ static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
* called by ipvs packet handler, assumes previously checked cp!=NULL
* returns false if it can't handle packet (oom)
*/
-int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb)
+int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb,
+ struct ip_vs_iphdr *ipvsh)
{
struct ip_vs_app *app;
@@ -417,7 +419,7 @@ int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb)
/* TCP is complicated */
if (cp->protocol == IPPROTO_TCP)
- return app_tcp_pkt_out(cp, skb, app);
+ return app_tcp_pkt_out(cp, skb, app, ipvsh);
/*
* Call private output hook function
@@ -425,12 +427,13 @@ int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb)
if (app->pkt_out == NULL)
return 1;
- return app->pkt_out(app, cp, skb, NULL);
+ return app->pkt_out(app, cp, skb, NULL, ipvsh);
}
static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
- struct ip_vs_app *app)
+ struct ip_vs_app *app,
+ struct ip_vs_iphdr *ipvsh)
{
int diff;
const unsigned int tcp_offset = ip_hdrlen(skb);
@@ -461,7 +464,7 @@ static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
if (app->pkt_in == NULL)
return 1;
- if (!app->pkt_in(app, cp, skb, &diff))
+ if (!app->pkt_in(app, cp, skb, &diff, ipvsh))
return 0;
/*
@@ -479,7 +482,8 @@ static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
* called by ipvs packet handler, assumes previously checked cp!=NULL.
* returns false if can't handle packet (oom).
*/
-int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
+int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb,
+ struct ip_vs_iphdr *ipvsh)
{
struct ip_vs_app *app;
@@ -492,7 +496,7 @@ int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
/* TCP is complicated */
if (cp->protocol == IPPROTO_TCP)
- return app_tcp_pkt_in(cp, skb, app);
+ return app_tcp_pkt_in(cp, skb, app, ipvsh);
/*
* Call private input hook function
@@ -500,7 +504,7 @@ int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
if (app->pkt_in == NULL)
return 1;
- return app->pkt_in(app, cp, skb, NULL);
+ return app->pkt_in(app, cp, skb, NULL, ipvsh);
}
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 141b1509c948..0c03c0e16a96 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -821,6 +821,10 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
if (add && udest->af != svc->af)
ipvs->mixed_address_family_dests++;
+ /* keep the last_weight with latest non-0 weight */
+ if (add || udest->weight != 0)
+ atomic_set(&dest->last_weight, udest->weight);
+
/* set the weight and the flags */
atomic_set(&dest->weight, udest->weight);
conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
diff --git a/net/netfilter/ipvs/ip_vs_dh.c b/net/netfilter/ipvs/ip_vs_dh.c
index 75f798f8e83b..07459e71d907 100644
--- a/net/netfilter/ipvs/ip_vs_dh.c
+++ b/net/netfilter/ipvs/ip_vs_dh.c
@@ -43,6 +43,7 @@
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
+#include <linux/hash.h>
#include <net/ip_vs.h>
@@ -81,7 +82,7 @@ static inline unsigned int ip_vs_dh_hashkey(int af, const union nf_inet_addr *ad
addr_fold = addr->ip6[0]^addr->ip6[1]^
addr->ip6[2]^addr->ip6[3];
#endif
- return (ntohl(addr_fold)*2654435761UL) & IP_VS_DH_TAB_MASK;
+ return hash_32(ntohl(addr_fold), IP_VS_DH_TAB_BITS);
}
diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c
index 58d5d05aec24..4398a72edec5 100644
--- a/net/netfilter/ipvs/ip_vs_ftp.c
+++ b/net/netfilter/ipvs/ip_vs_ftp.c
@@ -29,6 +29,8 @@
#include <linux/moduleparam.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
+#include <linux/ctype.h>
+#include <linux/inet.h>
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/netfilter.h>
@@ -44,9 +46,18 @@
#include <net/ip_vs.h>
-#define SERVER_STRING "227 "
-#define CLIENT_STRING "PORT"
+#define SERVER_STRING_PASV "227 "
+#define CLIENT_STRING_PORT "PORT"
+#define SERVER_STRING_EPSV "229 "
+#define CLIENT_STRING_EPRT "EPRT"
+enum {
+ IP_VS_FTP_ACTIVE = 0,
+ IP_VS_FTP_PORT = 0,
+ IP_VS_FTP_PASV,
+ IP_VS_FTP_EPRT,
+ IP_VS_FTP_EPSV,
+};
/*
* List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper
@@ -58,9 +69,15 @@ module_param_array(ports, ushort, &ports_count, 0444);
MODULE_PARM_DESC(ports, "Ports to monitor for FTP control commands");
-/* Dummy variable */
-static int ip_vs_ftp_pasv;
+static char *ip_vs_ftp_data_ptr(struct sk_buff *skb, struct ip_vs_iphdr *ipvsh)
+{
+ struct tcphdr *th = (struct tcphdr *)((char *)skb->data + ipvsh->len);
+
+ if ((th->doff << 2) < sizeof(struct tcphdr))
+ return NULL;
+ return (char *)th + (th->doff << 2);
+}
static int
ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
@@ -78,20 +95,20 @@ ip_vs_ftp_done_conn(struct ip_vs_app *app, struct ip_vs_conn *cp)
}
-/*
- * Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started
- * with the "pattern", ignoring before "skip" and terminated with
- * the "term" character.
- * <addr,port> is in network order.
+/* Get <addr,port> from the string "xxx.xxx.xxx.xxx,ppp,ppp", started
+ * with the "pattern". <addr,port> is in network order.
+ * Parse extended format depending on ext. In this case addr can be pre-set.
*/
static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
const char *pattern, size_t plen,
- char skip, char term,
- __be32 *addr, __be16 *port,
- char **start, char **end)
+ char skip, bool ext, int mode,
+ union nf_inet_addr *addr, __be16 *port,
+ __u16 af, char **start, char **end)
{
char *s, c;
unsigned char p[6];
+ char edelim;
+ __u16 hport;
int i = 0;
if (data_limit - data < plen) {
@@ -113,6 +130,11 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
if (s == data_limit)
return -1;
if (!found) {
+ /* "(" is optional for non-extended format,
+ * so catch the start of IPv4 address
+ */
+ if (!ext && isdigit(*s))
+ break;
if (*s == skip)
found = 1;
} else if (*s != skip) {
@@ -120,41 +142,102 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
}
}
}
+ /* Old IPv4-only format? */
+ if (!ext) {
+ p[0] = 0;
+ for (data = s; ; data++) {
+ if (data == data_limit)
+ return -1;
+ c = *data;
+ if (isdigit(c)) {
+ p[i] = p[i]*10 + c - '0';
+ } else if (c == ',' && i < 5) {
+ i++;
+ p[i] = 0;
+ } else {
+ /* unexpected character or terminator */
+ break;
+ }
+ }
- for (data = s; ; data++) {
- if (data == data_limit)
+ if (i != 5)
return -1;
- if (*data == term)
- break;
+
+ *start = s;
+ *end = data;
+ addr->ip = get_unaligned((__be32 *) p);
+ *port = get_unaligned((__be16 *) (p + 4));
+ return 1;
}
- *end = data;
+ if (s == data_limit)
+ return -1;
+ *start = s;
+ edelim = *s++;
+ if (edelim < 33 || edelim > 126)
+ return -1;
+ if (s == data_limit)
+ return -1;
+ if (*s == edelim) {
+ /* Address family is usually missing for EPSV response */
+ if (mode != IP_VS_FTP_EPSV)
+ return -1;
+ s++;
+ if (s == data_limit)
+ return -1;
+ /* Then address should be missing too */
+ if (*s != edelim)
+ return -1;
+ /* Caller can pre-set addr, if needed */
+ s++;
+ } else {
+ const char *ep;
- memset(p, 0, sizeof(p));
- for (data = s; ; data++) {
- c = *data;
- if (c == term)
- break;
- if (c >= '0' && c <= '9') {
- p[i] = p[i]*10 + c - '0';
- } else if (c == ',' && i < 5) {
- i++;
- } else {
- /* unexpected character */
+ /* We allow address only from same family */
+ if (af == AF_INET6 && *s != '2')
return -1;
+ if (af == AF_INET && *s != '1')
+ return -1;
+ s++;
+ if (s == data_limit)
+ return -1;
+ if (*s != edelim)
+ return -1;
+ s++;
+ if (s == data_limit)
+ return -1;
+ if (af == AF_INET6) {
+ if (in6_pton(s, data_limit - s, (u8 *)addr, edelim,
+ &ep) <= 0)
+ return -1;
+ } else {
+ if (in4_pton(s, data_limit - s, (u8 *)addr, edelim,
+ &ep) <= 0)
+ return -1;
}
+ s = (char *) ep;
+ if (s == data_limit)
+ return -1;
+ if (*s != edelim)
+ return -1;
+ s++;
}
-
- if (i != 5)
+ for (hport = 0; ; s++)
+ {
+ if (s == data_limit)
+ return -1;
+ if (!isdigit(*s))
+ break;
+ hport = hport * 10 + *s - '0';
+ }
+ if (s == data_limit || !hport || *s != edelim)
return -1;
-
- *start = s;
- *addr = get_unaligned((__be32 *) p);
- *port = get_unaligned((__be16 *) (p + 4));
+ s++;
+ *end = s;
+ *port = htons(hport);
return 1;
}
-/*
- * Look at outgoing ftp packets to catch the response to a PASV command
+/* Look at outgoing ftp packets to catch the response to a PASV/EPSV command
* from the server (inside-to-outside).
* When we see one, we build a connection entry with the client address,
* client port 0 (unknown at the moment), the server address and the
@@ -165,12 +248,13 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
* The outgoing packet should be something like
* "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)".
* xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number.
+ * The extended format for EPSV response provides usually only port:
+ * "229 Entering Extended Passive Mode (|||ppp|)"
*/
static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
- struct sk_buff *skb, int *diff)
+ struct sk_buff *skb, int *diff,
+ struct ip_vs_iphdr *ipvsh)
{
- struct iphdr *iph;
- struct tcphdr *th;
char *data, *data_limit;
char *start, *end;
union nf_inet_addr from;
@@ -184,14 +268,6 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
*diff = 0;
-#ifdef CONFIG_IP_VS_IPV6
- /* This application helper doesn't work with IPv6 yet,
- * so turn this into a no-op for IPv6 packets
- */
- if (cp->af == AF_INET6)
- return 1;
-#endif
-
/* Only useful for established sessions */
if (cp->state != IP_VS_TCP_S_ESTABLISHED)
return 1;
@@ -200,53 +276,77 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
if (!skb_make_writable(skb, skb->len))
return 0;
- if (cp->app_data == &ip_vs_ftp_pasv) {
- iph = ip_hdr(skb);
- th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
- data = (char *)th + (th->doff << 2);
+ if (cp->app_data == (void *) IP_VS_FTP_PASV) {
+ data = ip_vs_ftp_data_ptr(skb, ipvsh);
data_limit = skb_tail_pointer(skb);
+ if (!data || data >= data_limit)
+ return 1;
+
if (ip_vs_ftp_get_addrport(data, data_limit,
- SERVER_STRING,
- sizeof(SERVER_STRING)-1,
- '(', ')',
- &from.ip, &port,
+ SERVER_STRING_PASV,
+ sizeof(SERVER_STRING_PASV)-1,
+ '(', false, IP_VS_FTP_PASV,
+ &from, &port, cp->af,
&start, &end) != 1)
return 1;
- IP_VS_DBG(7, "PASV response (%pI4:%d) -> %pI4:%d detected\n",
+ IP_VS_DBG(7, "PASV response (%pI4:%u) -> %pI4:%u detected\n",
&from.ip, ntohs(port), &cp->caddr.ip, 0);
+ } else if (cp->app_data == (void *) IP_VS_FTP_EPSV) {
+ data = ip_vs_ftp_data_ptr(skb, ipvsh);
+ data_limit = skb_tail_pointer(skb);
- /*
- * Now update or create an connection entry for it
+ if (!data || data >= data_limit)
+ return 1;
+
+ /* Usually, data address is not specified but
+ * we support different address, so pre-set it.
*/
- {
- struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(cp->ipvs, AF_INET,
- iph->protocol, &from, port,
- &cp->caddr, 0, &p);
- n_cp = ip_vs_conn_out_get(&p);
- }
- if (!n_cp) {
- struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(cp->ipvs,
- AF_INET, IPPROTO_TCP, &cp->caddr,
- 0, &cp->vaddr, port, &p);
- /* As above, this is ipv4 only */
- n_cp = ip_vs_conn_new(&p, AF_INET, &from, port,
- IP_VS_CONN_F_NO_CPORT |
- IP_VS_CONN_F_NFCT,
- cp->dest, skb->mark);
- if (!n_cp)
- return 0;
+ from = cp->daddr;
+ if (ip_vs_ftp_get_addrport(data, data_limit,
+ SERVER_STRING_EPSV,
+ sizeof(SERVER_STRING_EPSV)-1,
+ '(', true, IP_VS_FTP_EPSV,
+ &from, &port, cp->af,
+ &start, &end) != 1)
+ return 1;
- /* add its controller */
- ip_vs_control_add(n_cp, cp);
- }
+ IP_VS_DBG_BUF(7, "EPSV response (%s:%u) -> %s:%u detected\n",
+ IP_VS_DBG_ADDR(cp->af, &from), ntohs(port),
+ IP_VS_DBG_ADDR(cp->af, &cp->caddr), 0);
+ } else {
+ return 1;
+ }
- /*
- * Replace the old passive address with the new one
- */
+ /* Now update or create a connection entry for it */
+ {
+ struct ip_vs_conn_param p;
+
+ ip_vs_conn_fill_param(cp->ipvs, cp->af,
+ ipvsh->protocol, &from, port,
+ &cp->caddr, 0, &p);
+ n_cp = ip_vs_conn_out_get(&p);
+ }
+ if (!n_cp) {
+ struct ip_vs_conn_param p;
+
+ ip_vs_conn_fill_param(cp->ipvs,
+ cp->af, ipvsh->protocol, &cp->caddr,
+ 0, &cp->vaddr, port, &p);
+ n_cp = ip_vs_conn_new(&p, cp->af, &from, port,
+ IP_VS_CONN_F_NO_CPORT |
+ IP_VS_CONN_F_NFCT,
+ cp->dest, skb->mark);
+ if (!n_cp)
+ return 0;
+
+ /* add its controller */
+ ip_vs_control_add(n_cp, cp);
+ }
+
+ /* Replace the old passive address with the new one */
+ if (cp->app_data == (void *) IP_VS_FTP_PASV) {
from.ip = n_cp->vaddr.ip;
port = n_cp->vport;
snprintf(buf, sizeof(buf), "%u,%u,%u,%u,%u,%u",
@@ -256,50 +356,54 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
((unsigned char *)&from.ip)[3],
ntohs(port) >> 8,
ntohs(port) & 0xFF);
+ } else if (cp->app_data == (void *) IP_VS_FTP_EPSV) {
+ from = n_cp->vaddr;
+ port = n_cp->vport;
+ /* Only port, client will use VIP for the data connection */
+ snprintf(buf, sizeof(buf), "|||%u|",
+ ntohs(port));
+ } else {
+ *buf = 0;
+ }
+ buf_len = strlen(buf);
- buf_len = strlen(buf);
-
- ct = nf_ct_get(skb, &ctinfo);
- if (ct) {
- bool mangled;
-
- /* If mangling fails this function will return 0
- * which will cause the packet to be dropped.
- * Mangling can only fail under memory pressure,
- * hopefully it will succeed on the retransmitted
- * packet.
- */
- mangled = nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
- iph->ihl * 4,
- start - data,
- end - start,
- buf, buf_len);
- if (mangled) {
- ip_vs_nfct_expect_related(skb, ct, n_cp,
- IPPROTO_TCP, 0, 0);
- if (skb->ip_summed == CHECKSUM_COMPLETE)
- skb->ip_summed = CHECKSUM_UNNECESSARY;
- /* csum is updated */
- ret = 1;
- }
- }
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct) {
+ bool mangled;
- /*
- * Not setting 'diff' is intentional, otherwise the sequence
- * would be adjusted twice.
+ /* If mangling fails this function will return 0
+ * which will cause the packet to be dropped.
+ * Mangling can only fail under memory pressure,
+ * hopefully it will succeed on the retransmitted
+ * packet.
*/
-
- cp->app_data = NULL;
- ip_vs_tcp_conn_listen(n_cp);
- ip_vs_conn_put(n_cp);
- return ret;
+ mangled = nf_nat_mangle_tcp_packet(skb, ct, ctinfo,
+ ipvsh->len,
+ start - data,
+ end - start,
+ buf, buf_len);
+ if (mangled) {
+ ip_vs_nfct_expect_related(skb, ct, n_cp,
+ ipvsh->protocol, 0, 0);
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ /* csum is updated */
+ ret = 1;
+ }
}
- return 1;
+
+ /* Not setting 'diff' is intentional, otherwise the sequence
+ * would be adjusted twice.
+ */
+
+ cp->app_data = (void *) IP_VS_FTP_ACTIVE;
+ ip_vs_tcp_conn_listen(n_cp);
+ ip_vs_conn_put(n_cp);
+ return ret;
}
-/*
- * Look at incoming ftp packets to catch the PASV/PORT command
+/* Look at incoming ftp packets to catch the PASV/PORT/EPRT/EPSV command
* (outside-to-inside).
*
* The incoming packet having the PORT command should be something like
@@ -308,12 +412,19 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
* In this case, we create a connection entry using the client address and
* port, so that the active ftp data connection from the server can reach
* the client.
+ * Extended format:
+ * "EPSV\r\n" when client requests server address from same family
+ * "EPSV 1\r\n" when client requests IPv4 server address
+ * "EPSV 2\r\n" when client requests IPv6 server address
+ * "EPSV ALL\r\n" - not supported
+ * EPRT with specified delimiter (ASCII 33..126), "|" by default:
+ * "EPRT |1|IPv4ADDR|PORT|\r\n" when client provides IPv4 addrport
+ * "EPRT |2|IPv6ADDR|PORT|\r\n" when client provides IPv6 addrport
*/
static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
- struct sk_buff *skb, int *diff)
+ struct sk_buff *skb, int *diff,
+ struct ip_vs_iphdr *ipvsh)
{
- struct iphdr *iph;
- struct tcphdr *th;
char *data, *data_start, *data_limit;
char *start, *end;
union nf_inet_addr to;
@@ -323,14 +434,6 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
/* no diff required for incoming packets */
*diff = 0;
-#ifdef CONFIG_IP_VS_IPV6
- /* This application helper doesn't work with IPv6 yet,
- * so turn this into a no-op for IPv6 packets
- */
- if (cp->af == AF_INET6)
- return 1;
-#endif
-
/* Only useful for established sessions */
if (cp->state != IP_VS_TCP_S_ESTABLISHED)
return 1;
@@ -339,27 +442,48 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
if (!skb_make_writable(skb, skb->len))
return 0;
- /*
- * Detecting whether it is passive
- */
- iph = ip_hdr(skb);
- th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
-
- /* Since there may be OPTIONS in the TCP packet and the HLEN is
- the length of the header in 32-bit multiples, it is accurate
- to calculate data address by th+HLEN*4 */
- data = data_start = (char *)th + (th->doff << 2);
+ data = data_start = ip_vs_ftp_data_ptr(skb, ipvsh);
data_limit = skb_tail_pointer(skb);
+ if (!data || data >= data_limit)
+ return 1;
while (data <= data_limit - 6) {
- if (strncasecmp(data, "PASV\r\n", 6) == 0) {
+ if (cp->af == AF_INET &&
+ strncasecmp(data, "PASV\r\n", 6) == 0) {
/* Passive mode on */
IP_VS_DBG(7, "got PASV at %td of %td\n",
data - data_start,
data_limit - data_start);
- cp->app_data = &ip_vs_ftp_pasv;
+ cp->app_data = (void *) IP_VS_FTP_PASV;
return 1;
}
+
+ /* EPSV or EPSV<space><net-prt> */
+ if (strncasecmp(data, "EPSV", 4) == 0 &&
+ (data[4] == ' ' || data[4] == '\r')) {
+ if (data[4] == ' ') {
+ char proto = data[5];
+
+ if (data > data_limit - 7 || data[6] != '\r')
+ return 1;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (cp->af == AF_INET6 && proto == '2') {
+ } else
+#endif
+ if (cp->af == AF_INET && proto == '1') {
+ } else {
+ return 1;
+ }
+ }
+ /* Extended Passive mode on */
+ IP_VS_DBG(7, "got EPSV at %td of %td\n",
+ data - data_start,
+ data_limit - data_start);
+ cp->app_data = (void *) IP_VS_FTP_EPSV;
+ return 1;
+ }
+
data++;
}
@@ -370,33 +494,52 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
* then create a new connection entry for the coming data
* connection.
*/
- if (ip_vs_ftp_get_addrport(data_start, data_limit,
- CLIENT_STRING, sizeof(CLIENT_STRING)-1,
- ' ', '\r', &to.ip, &port,
- &start, &end) != 1)
+ if (cp->af == AF_INET &&
+ ip_vs_ftp_get_addrport(data_start, data_limit,
+ CLIENT_STRING_PORT,
+ sizeof(CLIENT_STRING_PORT)-1,
+ ' ', false, IP_VS_FTP_PORT,
+ &to, &port, cp->af,
+ &start, &end) == 1) {
+
+ IP_VS_DBG(7, "PORT %pI4:%u detected\n", &to.ip, ntohs(port));
+
+ /* Now update or create a connection entry for it */
+ IP_VS_DBG(7, "protocol %s %pI4:%u %pI4:%u\n",
+ ip_vs_proto_name(ipvsh->protocol),
+ &to.ip, ntohs(port), &cp->vaddr.ip,
+ ntohs(cp->vport)-1);
+ } else if (ip_vs_ftp_get_addrport(data_start, data_limit,
+ CLIENT_STRING_EPRT,
+ sizeof(CLIENT_STRING_EPRT)-1,
+ ' ', true, IP_VS_FTP_EPRT,
+ &to, &port, cp->af,
+ &start, &end) == 1) {
+
+ IP_VS_DBG_BUF(7, "EPRT %s:%u detected\n",
+ IP_VS_DBG_ADDR(cp->af, &to), ntohs(port));
+
+ /* Now update or create a connection entry for it */
+ IP_VS_DBG_BUF(7, "protocol %s %s:%u %s:%u\n",
+ ip_vs_proto_name(ipvsh->protocol),
+ IP_VS_DBG_ADDR(cp->af, &to), ntohs(port),
+ IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
+ ntohs(cp->vport)-1);
+ } else {
return 1;
-
- IP_VS_DBG(7, "PORT %pI4:%d detected\n", &to.ip, ntohs(port));
+ }
/* Passive mode off */
- cp->app_data = NULL;
-
- /*
- * Now update or create a connection entry for it
- */
- IP_VS_DBG(7, "protocol %s %pI4:%d %pI4:%d\n",
- ip_vs_proto_name(iph->protocol),
- &to.ip, ntohs(port), &cp->vaddr.ip, 0);
+ cp->app_data = (void *) IP_VS_FTP_ACTIVE;
{
struct ip_vs_conn_param p;
- ip_vs_conn_fill_param(cp->ipvs, AF_INET,
- iph->protocol, &to, port, &cp->vaddr,
+ ip_vs_conn_fill_param(cp->ipvs, cp->af,
+ ipvsh->protocol, &to, port, &cp->vaddr,
htons(ntohs(cp->vport)-1), &p);
n_cp = ip_vs_conn_in_get(&p);
if (!n_cp) {
- /* This is ipv4 only */
- n_cp = ip_vs_conn_new(&p, AF_INET, &cp->daddr,
+ n_cp = ip_vs_conn_new(&p, cp->af, &cp->daddr,
htons(ntohs(cp->dport)-1),
IP_VS_CONN_F_NFCT, cp->dest,
skb->mark);
@@ -454,7 +597,7 @@ static int __net_init __ip_vs_ftp_init(struct net *net)
ret = register_ip_vs_app_inc(ipvs, app, app->protocol, ports[i]);
if (ret)
goto err_unreg;
- pr_info("%s: loaded support on port[%d] = %d\n",
+ pr_info("%s: loaded support on port[%d] = %u\n",
app->name, i, ports[i]);
}
return 0;
diff --git a/net/netfilter/ipvs/ip_vs_lblc.c b/net/netfilter/ipvs/ip_vs_lblc.c
index 3057e453bf31..b9f375e6dc93 100644
--- a/net/netfilter/ipvs/ip_vs_lblc.c
+++ b/net/netfilter/ipvs/ip_vs_lblc.c
@@ -48,6 +48,7 @@
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/jiffies.h>
+#include <linux/hash.h>
/* for sysctl */
#include <linux/fs.h>
@@ -160,7 +161,7 @@ ip_vs_lblc_hashkey(int af, const union nf_inet_addr *addr)
addr_fold = addr->ip6[0]^addr->ip6[1]^
addr->ip6[2]^addr->ip6[3];
#endif
- return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLC_TAB_MASK;
+ return hash_32(ntohl(addr_fold), IP_VS_LBLC_TAB_BITS);
}
@@ -371,6 +372,7 @@ static int ip_vs_lblc_init_svc(struct ip_vs_service *svc)
tbl->counter = 1;
tbl->dead = false;
tbl->svc = svc;
+ atomic_set(&tbl->entries, 0);
/*
* Hook periodic timer for garbage collection
diff --git a/net/netfilter/ipvs/ip_vs_lblcr.c b/net/netfilter/ipvs/ip_vs_lblcr.c
index 92adc04557ed..542c4949937a 100644
--- a/net/netfilter/ipvs/ip_vs_lblcr.c
+++ b/net/netfilter/ipvs/ip_vs_lblcr.c
@@ -47,6 +47,7 @@
#include <linux/jiffies.h>
#include <linux/list.h>
#include <linux/slab.h>
+#include <linux/hash.h>
/* for sysctl */
#include <linux/fs.h>
@@ -323,7 +324,7 @@ ip_vs_lblcr_hashkey(int af, const union nf_inet_addr *addr)
addr_fold = addr->ip6[0]^addr->ip6[1]^
addr->ip6[2]^addr->ip6[3];
#endif
- return (ntohl(addr_fold)*2654435761UL) & IP_VS_LBLCR_TAB_MASK;
+ return hash_32(ntohl(addr_fold), IP_VS_LBLCR_TAB_BITS);
}
@@ -534,6 +535,7 @@ static int ip_vs_lblcr_init_svc(struct ip_vs_service *svc)
tbl->counter = 1;
tbl->dead = false;
tbl->svc = svc;
+ atomic_set(&tbl->entries, 0);
/*
* Hook periodic timer for garbage collection
diff --git a/net/netfilter/ipvs/ip_vs_mh.c b/net/netfilter/ipvs/ip_vs_mh.c
new file mode 100644
index 000000000000..0f795b186eb3
--- /dev/null
+++ b/net/netfilter/ipvs/ip_vs_mh.c
@@ -0,0 +1,540 @@
+// SPDX-License-Identifier: GPL-2.0
+/* IPVS: Maglev Hashing scheduling module
+ *
+ * Authors: Inju Song <inju.song@navercorp.com>
+ *
+ */
+
+/* The mh algorithm is to assign a preference list of all the lookup
+ * table positions to each destination and populate the table with
+ * the most-preferred position of destinations. Then it is to select
+ * destination with the hash key of source IP address through looking
+ * up a the lookup table.
+ *
+ * The algorithm is detailed in:
+ * [3.4 Consistent Hasing]
+https://www.usenix.org/system/files/conference/nsdi16/nsdi16-paper-eisenbud.pdf
+ *
+ */
+
+#define KMSG_COMPONENT "IPVS"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/ip.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+
+#include <net/ip_vs.h>
+
+#include <linux/siphash.h>
+#include <linux/bitops.h>
+#include <linux/gcd.h>
+
+#define IP_VS_SVC_F_SCHED_MH_FALLBACK IP_VS_SVC_F_SCHED1 /* MH fallback */
+#define IP_VS_SVC_F_SCHED_MH_PORT IP_VS_SVC_F_SCHED2 /* MH use port */
+
+struct ip_vs_mh_lookup {
+ struct ip_vs_dest __rcu *dest; /* real server (cache) */
+};
+
+struct ip_vs_mh_dest_setup {
+ unsigned int offset; /* starting offset */
+ unsigned int skip; /* skip */
+ unsigned int perm; /* next_offset */
+ int turns; /* weight / gcd() and rshift */
+};
+
+/* Available prime numbers for MH table */
+static int primes[] = {251, 509, 1021, 2039, 4093,
+ 8191, 16381, 32749, 65521, 131071};
+
+/* For IPVS MH entry hash table */
+#ifndef CONFIG_IP_VS_MH_TAB_INDEX
+#define CONFIG_IP_VS_MH_TAB_INDEX 12
+#endif
+#define IP_VS_MH_TAB_BITS (CONFIG_IP_VS_MH_TAB_INDEX / 2)
+#define IP_VS_MH_TAB_INDEX (CONFIG_IP_VS_MH_TAB_INDEX - 8)
+#define IP_VS_MH_TAB_SIZE primes[IP_VS_MH_TAB_INDEX]
+
+struct ip_vs_mh_state {
+ struct rcu_head rcu_head;
+ struct ip_vs_mh_lookup *lookup;
+ struct ip_vs_mh_dest_setup *dest_setup;
+ hsiphash_key_t hash1, hash2;
+ int gcd;
+ int rshift;
+};
+
+static inline void generate_hash_secret(hsiphash_key_t *hash1,
+ hsiphash_key_t *hash2)
+{
+ hash1->key[0] = 2654435761UL;
+ hash1->key[1] = 2654435761UL;
+
+ hash2->key[0] = 2654446892UL;
+ hash2->key[1] = 2654446892UL;
+}
+
+/* Helper function to determine if server is unavailable */
+static inline bool is_unavailable(struct ip_vs_dest *dest)
+{
+ return atomic_read(&dest->weight) <= 0 ||
+ dest->flags & IP_VS_DEST_F_OVERLOAD;
+}
+
+/* Returns hash value for IPVS MH entry */
+static inline unsigned int
+ip_vs_mh_hashkey(int af, const union nf_inet_addr *addr,
+ __be16 port, hsiphash_key_t *key, unsigned int offset)
+{
+ unsigned int v;
+ __be32 addr_fold = addr->ip;
+
+#ifdef CONFIG_IP_VS_IPV6
+ if (af == AF_INET6)
+ addr_fold = addr->ip6[0] ^ addr->ip6[1] ^
+ addr->ip6[2] ^ addr->ip6[3];
+#endif
+ v = (offset + ntohs(port) + ntohl(addr_fold));
+ return hsiphash(&v, sizeof(v), key);
+}
+
+/* Reset all the hash buckets of the specified table. */
+static void ip_vs_mh_reset(struct ip_vs_mh_state *s)
+{
+ int i;
+ struct ip_vs_mh_lookup *l;
+ struct ip_vs_dest *dest;
+
+ l = &s->lookup[0];
+ for (i = 0; i < IP_VS_MH_TAB_SIZE; i++) {
+ dest = rcu_dereference_protected(l->dest, 1);
+ if (dest) {
+ ip_vs_dest_put(dest);
+ RCU_INIT_POINTER(l->dest, NULL);
+ }
+ l++;
+ }
+}
+
+static int ip_vs_mh_permutate(struct ip_vs_mh_state *s,
+ struct ip_vs_service *svc)
+{
+ struct list_head *p;
+ struct ip_vs_mh_dest_setup *ds;
+ struct ip_vs_dest *dest;
+ int lw;
+
+ /* If gcd is smaller then 1, number of dests or
+ * all last_weight of dests are zero. So, skip
+ * permutation for the dests.
+ */
+ if (s->gcd < 1)
+ return 0;
+
+ /* Set dest_setup for the dests permutation */
+ p = &svc->destinations;
+ ds = &s->dest_setup[0];
+ while ((p = p->next) != &svc->destinations) {
+ dest = list_entry(p, struct ip_vs_dest, n_list);
+
+ ds->offset = ip_vs_mh_hashkey(svc->af, &dest->addr,
+ dest->port, &s->hash1, 0) %
+ IP_VS_MH_TAB_SIZE;
+ ds->skip = ip_vs_mh_hashkey(svc->af, &dest->addr,
+ dest->port, &s->hash2, 0) %
+ (IP_VS_MH_TAB_SIZE - 1) + 1;
+ ds->perm = ds->offset;
+
+ lw = atomic_read(&dest->last_weight);
+ ds->turns = ((lw / s->gcd) >> s->rshift) ? : (lw != 0);
+ ds++;
+ }
+
+ return 0;
+}
+
+static int ip_vs_mh_populate(struct ip_vs_mh_state *s,
+ struct ip_vs_service *svc)
+{
+ int n, c, dt_count;
+ unsigned long *table;
+ struct list_head *p;
+ struct ip_vs_mh_dest_setup *ds;
+ struct ip_vs_dest *dest, *new_dest;
+
+ /* If gcd is smaller then 1, number of dests or
+ * all last_weight of dests are zero. So, skip
+ * the population for the dests and reset lookup table.
+ */
+ if (s->gcd < 1) {
+ ip_vs_mh_reset(s);
+ return 0;
+ }
+
+ table = kcalloc(BITS_TO_LONGS(IP_VS_MH_TAB_SIZE),
+ sizeof(unsigned long), GFP_KERNEL);
+ if (!table)
+ return -ENOMEM;
+
+ p = &svc->destinations;
+ n = 0;
+ dt_count = 0;
+ while (n < IP_VS_MH_TAB_SIZE) {
+ if (p == &svc->destinations)
+ p = p->next;
+
+ ds = &s->dest_setup[0];
+ while (p != &svc->destinations) {
+ /* Ignore added server with zero weight */
+ if (ds->turns < 1) {
+ p = p->next;
+ ds++;
+ continue;
+ }
+
+ c = ds->perm;
+ while (test_bit(c, table)) {
+ /* Add skip, mod IP_VS_MH_TAB_SIZE */
+ ds->perm += ds->skip;
+ if (ds->perm >= IP_VS_MH_TAB_SIZE)
+ ds->perm -= IP_VS_MH_TAB_SIZE;
+ c = ds->perm;
+ }
+
+ __set_bit(c, table);
+
+ dest = rcu_dereference_protected(s->lookup[c].dest, 1);
+ new_dest = list_entry(p, struct ip_vs_dest, n_list);
+ if (dest != new_dest) {
+ if (dest)
+ ip_vs_dest_put(dest);
+ ip_vs_dest_hold(new_dest);
+ RCU_INIT_POINTER(s->lookup[c].dest, new_dest);
+ }
+
+ if (++n == IP_VS_MH_TAB_SIZE)
+ goto out;
+
+ if (++dt_count >= ds->turns) {
+ dt_count = 0;
+ p = p->next;
+ ds++;
+ }
+ }
+ }
+
+out:
+ kfree(table);
+ return 0;
+}
+
+/* Get ip_vs_dest associated with supplied parameters. */
+static inline struct ip_vs_dest *
+ip_vs_mh_get(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
+ const union nf_inet_addr *addr, __be16 port)
+{
+ unsigned int hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1, 0)
+ % IP_VS_MH_TAB_SIZE;
+ struct ip_vs_dest *dest = rcu_dereference(s->lookup[hash].dest);
+
+ return (!dest || is_unavailable(dest)) ? NULL : dest;
+}
+
+/* As ip_vs_mh_get, but with fallback if selected server is unavailable */
+static inline struct ip_vs_dest *
+ip_vs_mh_get_fallback(struct ip_vs_service *svc, struct ip_vs_mh_state *s,
+ const union nf_inet_addr *addr, __be16 port)
+{
+ unsigned int offset, roffset;
+ unsigned int hash, ihash;
+ struct ip_vs_dest *dest;
+
+ /* First try the dest it's supposed to go to */
+ ihash = ip_vs_mh_hashkey(svc->af, addr, port,
+ &s->hash1, 0) % IP_VS_MH_TAB_SIZE;
+ dest = rcu_dereference(s->lookup[ihash].dest);
+ if (!dest)
+ return NULL;
+ if (!is_unavailable(dest))
+ return dest;
+
+ IP_VS_DBG_BUF(6, "MH: selected unavailable server %s:%u, reselecting",
+ IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port));
+
+ /* If the original dest is unavailable, loop around the table
+ * starting from ihash to find a new dest
+ */
+ for (offset = 0; offset < IP_VS_MH_TAB_SIZE; offset++) {
+ roffset = (offset + ihash) % IP_VS_MH_TAB_SIZE;
+ hash = ip_vs_mh_hashkey(svc->af, addr, port, &s->hash1,
+ roffset) % IP_VS_MH_TAB_SIZE;
+ dest = rcu_dereference(s->lookup[hash].dest);
+ if (!dest)
+ break;
+ if (!is_unavailable(dest))
+ return dest;
+ IP_VS_DBG_BUF(6,
+ "MH: selected unavailable server %s:%u (offset %u), reselecting",
+ IP_VS_DBG_ADDR(dest->af, &dest->addr),
+ ntohs(dest->port), roffset);
+ }
+
+ return NULL;
+}
+
+/* Assign all the hash buckets of the specified table with the service. */
+static int ip_vs_mh_reassign(struct ip_vs_mh_state *s,
+ struct ip_vs_service *svc)
+{
+ int ret;
+
+ if (svc->num_dests > IP_VS_MH_TAB_SIZE)
+ return -EINVAL;
+
+ if (svc->num_dests >= 1) {
+ s->dest_setup = kcalloc(svc->num_dests,
+ sizeof(struct ip_vs_mh_dest_setup),
+ GFP_KERNEL);
+ if (!s->dest_setup)
+ return -ENOMEM;
+ }
+
+ ip_vs_mh_permutate(s, svc);
+
+ ret = ip_vs_mh_populate(s, svc);
+ if (ret < 0)
+ goto out;
+
+ IP_VS_DBG_BUF(6, "MH: reassign lookup table of %s:%u\n",
+ IP_VS_DBG_ADDR(svc->af, &svc->addr),
+ ntohs(svc->port));
+
+out:
+ if (svc->num_dests >= 1) {
+ kfree(s->dest_setup);
+ s->dest_setup = NULL;
+ }
+ return ret;
+}
+
+static int ip_vs_mh_gcd_weight(struct ip_vs_service *svc)
+{
+ struct ip_vs_dest *dest;
+ int weight;
+ int g = 0;
+
+ list_for_each_entry(dest, &svc->destinations, n_list) {
+ weight = atomic_read(&dest->last_weight);
+ if (weight > 0) {
+ if (g > 0)
+ g = gcd(weight, g);
+ else
+ g = weight;
+ }
+ }
+ return g;
+}
+
+/* To avoid assigning huge weight for the MH table,
+ * calculate shift value with gcd.
+ */
+static int ip_vs_mh_shift_weight(struct ip_vs_service *svc, int gcd)
+{
+ struct ip_vs_dest *dest;
+ int new_weight, weight = 0;
+ int mw, shift;
+
+ /* If gcd is smaller then 1, number of dests or
+ * all last_weight of dests are zero. So, return
+ * shift value as zero.
+ */
+ if (gcd < 1)
+ return 0;
+
+ list_for_each_entry(dest, &svc->destinations, n_list) {
+ new_weight = atomic_read(&dest->last_weight);
+ if (new_weight > weight)
+ weight = new_weight;
+ }
+
+ /* Because gcd is greater than zero,
+ * the maximum weight and gcd are always greater than zero
+ */
+ mw = weight / gcd;
+
+ /* shift = occupied bits of weight/gcd - MH highest bits */
+ shift = fls(mw) - IP_VS_MH_TAB_BITS;
+ return (shift >= 0) ? shift : 0;
+}
+
+static void ip_vs_mh_state_free(struct rcu_head *head)
+{
+ struct ip_vs_mh_state *s;
+
+ s = container_of(head, struct ip_vs_mh_state, rcu_head);
+ kfree(s->lookup);
+ kfree(s);
+}
+
+static int ip_vs_mh_init_svc(struct ip_vs_service *svc)
+{
+ int ret;
+ struct ip_vs_mh_state *s;
+
+ /* Allocate the MH table for this service */
+ s = kzalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+
+ s->lookup = kcalloc(IP_VS_MH_TAB_SIZE, sizeof(struct ip_vs_mh_lookup),
+ GFP_KERNEL);
+ if (!s->lookup) {
+ kfree(s);
+ return -ENOMEM;
+ }
+
+ generate_hash_secret(&s->hash1, &s->hash2);
+ s->gcd = ip_vs_mh_gcd_weight(svc);
+ s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);
+
+ IP_VS_DBG(6,
+ "MH lookup table (memory=%zdbytes) allocated for current service\n",
+ sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
+
+ /* Assign the lookup table with current dests */
+ ret = ip_vs_mh_reassign(s, svc);
+ if (ret < 0) {
+ ip_vs_mh_reset(s);
+ ip_vs_mh_state_free(&s->rcu_head);
+ return ret;
+ }
+
+ /* No more failures, attach state */
+ svc->sched_data = s;
+ return 0;
+}
+
+static void ip_vs_mh_done_svc(struct ip_vs_service *svc)
+{
+ struct ip_vs_mh_state *s = svc->sched_data;
+
+ /* Got to clean up lookup entry here */
+ ip_vs_mh_reset(s);
+
+ call_rcu(&s->rcu_head, ip_vs_mh_state_free);
+ IP_VS_DBG(6, "MH lookup table (memory=%zdbytes) released\n",
+ sizeof(struct ip_vs_mh_lookup) * IP_VS_MH_TAB_SIZE);
+}
+
+static int ip_vs_mh_dest_changed(struct ip_vs_service *svc,
+ struct ip_vs_dest *dest)
+{
+ struct ip_vs_mh_state *s = svc->sched_data;
+
+ s->gcd = ip_vs_mh_gcd_weight(svc);
+ s->rshift = ip_vs_mh_shift_weight(svc, s->gcd);
+
+ /* Assign the lookup table with the updated service */
+ return ip_vs_mh_reassign(s, svc);
+}
+
+/* Helper function to get port number */
+static inline __be16
+ip_vs_mh_get_port(const struct sk_buff *skb, struct ip_vs_iphdr *iph)
+{
+ __be16 _ports[2], *ports;
+
+ /* At this point we know that we have a valid packet of some kind.
+ * Because ICMP packets are only guaranteed to have the first 8
+ * bytes, let's just grab the ports. Fortunately they're in the
+ * same position for all three of the protocols we care about.
+ */
+ switch (iph->protocol) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ case IPPROTO_SCTP:
+ ports = skb_header_pointer(skb, iph->len, sizeof(_ports),
+ &_ports);
+ if (unlikely(!ports))
+ return 0;
+
+ if (likely(!ip_vs_iph_inverse(iph)))
+ return ports[0];
+ else
+ return ports[1];
+ default:
+ return 0;
+ }
+}
+
+/* Maglev Hashing scheduling */
+static struct ip_vs_dest *
+ip_vs_mh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb,
+ struct ip_vs_iphdr *iph)
+{
+ struct ip_vs_dest *dest;
+ struct ip_vs_mh_state *s;
+ __be16 port = 0;
+ const union nf_inet_addr *hash_addr;
+
+ hash_addr = ip_vs_iph_inverse(iph) ? &iph->daddr : &iph->saddr;
+
+ IP_VS_DBG(6, "%s : Scheduling...\n", __func__);
+
+ if (svc->flags & IP_VS_SVC_F_SCHED_MH_PORT)
+ port = ip_vs_mh_get_port(skb, iph);
+
+ s = (struct ip_vs_mh_state *)svc->sched_data;
+
+ if (svc->flags & IP_VS_SVC_F_SCHED_MH_FALLBACK)
+ dest = ip_vs_mh_get_fallback(svc, s, hash_addr, port);
+ else
+ dest = ip_vs_mh_get(svc, s, hash_addr, port);
+
+ if (!dest) {
+ ip_vs_scheduler_err(svc, "no destination available");
+ return NULL;
+ }
+
+ IP_VS_DBG_BUF(6, "MH: source IP address %s:%u --> server %s:%u\n",
+ IP_VS_DBG_ADDR(svc->af, hash_addr),
+ ntohs(port),
+ IP_VS_DBG_ADDR(dest->af, &dest->addr),
+ ntohs(dest->port));
+
+ return dest;
+}
+
+/* IPVS MH Scheduler structure */
+static struct ip_vs_scheduler ip_vs_mh_scheduler = {
+ .name = "mh",
+ .refcnt = ATOMIC_INIT(0),
+ .module = THIS_MODULE,
+ .n_list = LIST_HEAD_INIT(ip_vs_mh_scheduler.n_list),
+ .init_service = ip_vs_mh_init_svc,
+ .done_service = ip_vs_mh_done_svc,
+ .add_dest = ip_vs_mh_dest_changed,
+ .del_dest = ip_vs_mh_dest_changed,
+ .upd_dest = ip_vs_mh_dest_changed,
+ .schedule = ip_vs_mh_schedule,
+};
+
+static int __init ip_vs_mh_init(void)
+{
+ return register_ip_vs_scheduler(&ip_vs_mh_scheduler);
+}
+
+static void __exit ip_vs_mh_cleanup(void)
+{
+ unregister_ip_vs_scheduler(&ip_vs_mh_scheduler);
+ rcu_barrier();
+}
+
+module_init(ip_vs_mh_init);
+module_exit(ip_vs_mh_cleanup);
+MODULE_DESCRIPTION("Maglev hashing ipvs scheduler");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Inju Song <inju.song@navercorp.com>");
diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c
index 6cf3fd81a5ec..eb8b9c883889 100644
--- a/net/netfilter/ipvs/ip_vs_nfct.c
+++ b/net/netfilter/ipvs/ip_vs_nfct.c
@@ -67,15 +67,20 @@
#include <net/netfilter/nf_conntrack_zones.h>
-#define FMT_TUPLE "%pI4:%u->%pI4:%u/%u"
-#define ARG_TUPLE(T) &(T)->src.u3.ip, ntohs((T)->src.u.all), \
- &(T)->dst.u3.ip, ntohs((T)->dst.u.all), \
+#define FMT_TUPLE "%s:%u->%s:%u/%u"
+#define ARG_TUPLE(T) IP_VS_DBG_ADDR((T)->src.l3num, &(T)->src.u3), \
+ ntohs((T)->src.u.all), \
+ IP_VS_DBG_ADDR((T)->src.l3num, &(T)->dst.u3), \
+ ntohs((T)->dst.u.all), \
(T)->dst.protonum
-#define FMT_CONN "%pI4:%u->%pI4:%u->%pI4:%u/%u:%u"
-#define ARG_CONN(C) &((C)->caddr.ip), ntohs((C)->cport), \
- &((C)->vaddr.ip), ntohs((C)->vport), \
- &((C)->daddr.ip), ntohs((C)->dport), \
+#define FMT_CONN "%s:%u->%s:%u->%s:%u/%u:%u"
+#define ARG_CONN(C) IP_VS_DBG_ADDR((C)->af, &((C)->caddr)), \
+ ntohs((C)->cport), \
+ IP_VS_DBG_ADDR((C)->af, &((C)->vaddr)), \
+ ntohs((C)->vport), \
+ IP_VS_DBG_ADDR((C)->daf, &((C)->daddr)), \
+ ntohs((C)->dport), \
(C)->protocol, (C)->state
void
@@ -127,13 +132,17 @@ ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
new_tuple.dst.protonum != IPPROTO_ICMPV6)
new_tuple.dst.u.tcp.port = cp->vport;
}
- IP_VS_DBG(7, "%s: Updating conntrack ct=%p, status=0x%lX, "
- "ctinfo=%d, old reply=" FMT_TUPLE
- ", new reply=" FMT_TUPLE ", cp=" FMT_CONN "\n",
- __func__, ct, ct->status, ctinfo,
- ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple),
- ARG_TUPLE(&new_tuple), ARG_CONN(cp));
+ IP_VS_DBG_BUF(7, "%s: Updating conntrack ct=%p, status=0x%lX, "
+ "ctinfo=%d, old reply=" FMT_TUPLE "\n",
+ __func__, ct, ct->status, ctinfo,
+ ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple));
+ IP_VS_DBG_BUF(7, "%s: Updating conntrack ct=%p, status=0x%lX, "
+ "ctinfo=%d, new reply=" FMT_TUPLE "\n",
+ __func__, ct, ct->status, ctinfo,
+ ARG_TUPLE(&new_tuple));
nf_conntrack_alter_reply(ct, &new_tuple);
+ IP_VS_DBG_BUF(7, "%s: Updated conntrack ct=%p for cp=" FMT_CONN "\n",
+ __func__, ct, ARG_CONN(cp));
}
int ip_vs_confirm_conntrack(struct sk_buff *skb)
@@ -152,9 +161,6 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
struct ip_vs_conn_param p;
struct net *net = nf_ct_net(ct);
- if (exp->tuple.src.l3num != PF_INET)
- return;
-
/*
* We assume that no NF locks are held before this callback.
* ip_vs_conn_out_get and ip_vs_conn_in_get should match their
@@ -171,19 +177,15 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
cp = ip_vs_conn_out_get(&p);
if (cp) {
/* Change reply CLIENT->RS to CLIENT->VS */
+ IP_VS_DBG_BUF(7, "%s: for ct=%p, status=0x%lX found inout cp="
+ FMT_CONN "\n",
+ __func__, ct, ct->status, ARG_CONN(cp));
new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
- IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
- FMT_TUPLE ", found inout cp=" FMT_CONN "\n",
- __func__, ct, ct->status,
- ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
- ARG_CONN(cp));
+ IP_VS_DBG_BUF(7, "%s: ct=%p before alter: reply tuple="
+ FMT_TUPLE "\n",
+ __func__, ct, ARG_TUPLE(&new_reply));
new_reply.dst.u3 = cp->vaddr;
new_reply.dst.u.tcp.port = cp->vport;
- IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE
- ", inout cp=" FMT_CONN "\n",
- __func__, ct,
- ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
- ARG_CONN(cp));
goto alter;
}
@@ -191,25 +193,21 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct,
cp = ip_vs_conn_in_get(&p);
if (cp) {
/* Change reply VS->CLIENT to RS->CLIENT */
+ IP_VS_DBG_BUF(7, "%s: for ct=%p, status=0x%lX found outin cp="
+ FMT_CONN "\n",
+ __func__, ct, ct->status, ARG_CONN(cp));
new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
- IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", "
- FMT_TUPLE ", found outin cp=" FMT_CONN "\n",
- __func__, ct, ct->status,
- ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
- ARG_CONN(cp));
+ IP_VS_DBG_BUF(7, "%s: ct=%p before alter: reply tuple="
+ FMT_TUPLE "\n",
+ __func__, ct, ARG_TUPLE(&new_reply));
new_reply.src.u3 = cp->daddr;
new_reply.src.u.tcp.port = cp->dport;
- IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", "
- FMT_TUPLE ", outin cp=" FMT_CONN "\n",
- __func__, ct,
- ARG_TUPLE(orig), ARG_TUPLE(&new_reply),
- ARG_CONN(cp));
goto alter;
}
- IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE
- " - unknown expect\n",
- __func__, ct, ct->status, ARG_TUPLE(orig));
+ IP_VS_DBG_BUF(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE
+ " - unknown expect\n",
+ __func__, ct, ct->status, ARG_TUPLE(orig));
return;
alter:
@@ -247,8 +245,8 @@ void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct,
exp->expectfn = ip_vs_nfct_expect_callback;
- IP_VS_DBG(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n",
- __func__, ct, ARG_TUPLE(&exp->tuple));
+ IP_VS_DBG_BUF(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n",
+ __func__, ct, ARG_TUPLE(&exp->tuple));
nf_ct_expect_related(exp);
nf_ct_expect_put(exp);
}
@@ -274,26 +272,25 @@ void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp)
tuple.dst.u3 = cp->vaddr;
tuple.dst.u.all = cp->vport;
- IP_VS_DBG(7, "%s: dropping conntrack with tuple=" FMT_TUPLE
- " for conn " FMT_CONN "\n",
- __func__, ARG_TUPLE(&tuple), ARG_CONN(cp));
+ IP_VS_DBG_BUF(7, "%s: dropping conntrack for conn " FMT_CONN "\n",
+ __func__, ARG_CONN(cp));
h = nf_conntrack_find_get(cp->ipvs->net, &nf_ct_zone_dflt, &tuple);
if (h) {
ct = nf_ct_tuplehash_to_ctrack(h);
if (nf_ct_kill(ct)) {
- IP_VS_DBG(7, "%s: ct=%p, deleted conntrack for tuple="
- FMT_TUPLE "\n",
- __func__, ct, ARG_TUPLE(&tuple));
+ IP_VS_DBG_BUF(7, "%s: ct=%p deleted for tuple="
+ FMT_TUPLE "\n",
+ __func__, ct, ARG_TUPLE(&tuple));
} else {
- IP_VS_DBG(7, "%s: ct=%p, no conntrack timer for tuple="
- FMT_TUPLE "\n",
- __func__, ct, ARG_TUPLE(&tuple));
+ IP_VS_DBG_BUF(7, "%s: ct=%p, no conntrack for tuple="
+ FMT_TUPLE "\n",
+ __func__, ct, ARG_TUPLE(&tuple));
}
nf_ct_put(ct);
} else {
- IP_VS_DBG(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n",
- __func__, ARG_TUPLE(&tuple));
+ IP_VS_DBG_BUF(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n",
+ __func__, ARG_TUPLE(&tuple));
}
}
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index eff7569824e5..3250c4a1111e 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -109,7 +109,7 @@ sctp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
return 0;
/* Call application helper if needed */
- ret = ip_vs_app_pkt_out(cp, skb);
+ ret = ip_vs_app_pkt_out(cp, skb, iph);
if (ret == 0)
return 0;
/* ret=2: csum update is needed after payload mangling */
@@ -156,7 +156,7 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
return 0;
/* Call application helper if needed */
- ret = ip_vs_app_pkt_in(cp, skb);
+ ret = ip_vs_app_pkt_in(cp, skb, iph);
if (ret == 0)
return 0;
/* ret=2: csum update is needed after payload mangling */
diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c
index bcd9b7bde4ee..80d10ad12a15 100644
--- a/net/netfilter/ipvs/ip_vs_proto_tcp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c
@@ -170,7 +170,7 @@ tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
return 0;
/* Call application helper if needed */
- if (!(ret = ip_vs_app_pkt_out(cp, skb)))
+ if (!(ret = ip_vs_app_pkt_out(cp, skb, iph)))
return 0;
/* ret=2: csum update is needed after payload mangling */
if (ret == 1)
@@ -251,7 +251,7 @@ tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
* Attempt ip_vs_app call.
* It will fix ip_vs_conn and iph ack_seq stuff
*/
- if (!(ret = ip_vs_app_pkt_in(cp, skb)))
+ if (!(ret = ip_vs_app_pkt_in(cp, skb, iph)))
return 0;
/* ret=2: csum update is needed after payload mangling */
if (ret == 1)
@@ -436,7 +436,7 @@ static bool tcp_state_active(int state)
return tcp_state_active_table[state];
}
-static struct tcp_states_t tcp_states [] = {
+static struct tcp_states_t tcp_states[] = {
/* INPUT */
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
@@ -459,7 +459,7 @@ static struct tcp_states_t tcp_states [] = {
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};
-static struct tcp_states_t tcp_states_dos [] = {
+static struct tcp_states_t tcp_states_dos[] = {
/* INPUT */
/* sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA */
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c
index c15ef7c2a1fa..e0ef11c3691e 100644
--- a/net/netfilter/ipvs/ip_vs_proto_udp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_udp.c
@@ -162,7 +162,7 @@ udp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
/*
* Call application helper if needed
*/
- if (!(ret = ip_vs_app_pkt_out(cp, skb)))
+ if (!(ret = ip_vs_app_pkt_out(cp, skb, iph)))
return 0;
/* ret=2: csum update is needed after payload mangling */
if (ret == 1)
@@ -246,7 +246,7 @@ udp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
* Attempt ip_vs_app call.
* It will fix ip_vs_conn
*/
- if (!(ret = ip_vs_app_pkt_in(cp, skb)))
+ if (!(ret = ip_vs_app_pkt_in(cp, skb, iph)))
return 0;
/* ret=2: csum update is needed after payload mangling */
if (ret == 1)
diff --git a/net/netfilter/ipvs/ip_vs_sh.c b/net/netfilter/ipvs/ip_vs_sh.c
index 16aaac6eedc9..1e01c782583a 100644
--- a/net/netfilter/ipvs/ip_vs_sh.c
+++ b/net/netfilter/ipvs/ip_vs_sh.c
@@ -96,7 +96,8 @@ ip_vs_sh_hashkey(int af, const union nf_inet_addr *addr,
addr_fold = addr->ip6[0]^addr->ip6[1]^
addr->ip6[2]^addr->ip6[3];
#endif
- return (offset + (ntohs(port) + ntohl(addr_fold))*2654435761UL) &
+ return (offset + hash_32(ntohs(port) + ntohl(addr_fold),
+ IP_VS_SH_TAB_BITS)) &
IP_VS_SH_TAB_MASK;
}
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 4527921b1c3a..ba0a0fd045c8 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -266,12 +266,13 @@ static inline bool decrement_ttl(struct netns_ipvs *ipvs,
/* check and decrement ttl */
if (ipv6_hdr(skb)->hop_limit <= 1) {
+ struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
+
/* Force OUTPUT device used as source address */
skb->dev = dst->dev;
icmpv6_send(skb, ICMPV6_TIME_EXCEED,
ICMPV6_EXC_HOPLIMIT, 0);
- __IP6_INC_STATS(net, ip6_dst_idev(dst),
- IPSTATS_MIB_INHDRERRORS);
+ __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
return false;
}