aboutsummaryrefslogtreecommitdiffstats
path: root/net/netfilter/ipvs/ip_vs_sync.c
diff options
context:
space:
mode:
Diffstat (limited to 'net/netfilter/ipvs/ip_vs_sync.c')
-rw-r--r--net/netfilter/ipvs/ip_vs_sync.c668
1 files changed, 458 insertions, 210 deletions
diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c
index 8a0d6d6889f0..effa10c9e4e3 100644
--- a/net/netfilter/ipvs/ip_vs_sync.c
+++ b/net/netfilter/ipvs/ip_vs_sync.c
@@ -196,6 +196,7 @@ struct ip_vs_sync_thread_data {
struct net *net;
struct socket *sock;
char *buf;
+ int id;
};
/* Version 0 definition of packet sizes */
@@ -271,13 +272,6 @@ struct ip_vs_sync_buff {
unsigned char *end;
};
-/* multicast addr */
-static struct sockaddr_in mcast_addr = {
- .sin_family = AF_INET,
- .sin_port = cpu_to_be16(IP_VS_SYNC_PORT),
- .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP),
-};
-
/*
* Copy of struct ip_vs_seq
* From unaligned network order to aligned host order
@@ -300,18 +294,22 @@ static void hton_seq(struct ip_vs_seq *ho, struct ip_vs_seq *no)
put_unaligned_be32(ho->previous_delta, &no->previous_delta);
}
-static inline struct ip_vs_sync_buff *sb_dequeue(struct netns_ipvs *ipvs)
+static inline struct ip_vs_sync_buff *
+sb_dequeue(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms)
{
struct ip_vs_sync_buff *sb;
spin_lock_bh(&ipvs->sync_lock);
- if (list_empty(&ipvs->sync_queue)) {
+ if (list_empty(&ms->sync_queue)) {
sb = NULL;
+ __set_current_state(TASK_INTERRUPTIBLE);
} else {
- sb = list_entry(ipvs->sync_queue.next,
- struct ip_vs_sync_buff,
+ sb = list_entry(ms->sync_queue.next, struct ip_vs_sync_buff,
list);
list_del(&sb->list);
+ ms->sync_queue_len--;
+ if (!ms->sync_queue_len)
+ ms->sync_queue_delay = 0;
}
spin_unlock_bh(&ipvs->sync_lock);
@@ -334,7 +332,7 @@ ip_vs_sync_buff_create(struct netns_ipvs *ipvs)
kfree(sb);
return NULL;
}
- sb->mesg->reserved = 0; /* old nr_conns i.e. must be zeo now */
+ sb->mesg->reserved = 0; /* old nr_conns i.e. must be zero now */
sb->mesg->version = SYNC_PROTO_VER;
sb->mesg->syncid = ipvs->master_syncid;
sb->mesg->size = sizeof(struct ip_vs_sync_mesg);
@@ -353,14 +351,22 @@ static inline void ip_vs_sync_buff_release(struct ip_vs_sync_buff *sb)
kfree(sb);
}
-static inline void sb_queue_tail(struct netns_ipvs *ipvs)
+static inline void sb_queue_tail(struct netns_ipvs *ipvs,
+ struct ipvs_master_sync_state *ms)
{
- struct ip_vs_sync_buff *sb = ipvs->sync_buff;
+ struct ip_vs_sync_buff *sb = ms->sync_buff;
spin_lock(&ipvs->sync_lock);
- if (ipvs->sync_state & IP_VS_STATE_MASTER)
- list_add_tail(&sb->list, &ipvs->sync_queue);
- else
+ if (ipvs->sync_state & IP_VS_STATE_MASTER &&
+ ms->sync_queue_len < sysctl_sync_qlen_max(ipvs)) {
+ if (!ms->sync_queue_len)
+ schedule_delayed_work(&ms->master_wakeup_work,
+ max(IPVS_SYNC_SEND_DELAY, 1));
+ ms->sync_queue_len++;
+ list_add_tail(&sb->list, &ms->sync_queue);
+ if ((++ms->sync_queue_delay) == IPVS_SYNC_WAKEUP_RATE)
+ wake_up_process(ms->master_thread);
+ } else
ip_vs_sync_buff_release(sb);
spin_unlock(&ipvs->sync_lock);
}
@@ -370,49 +376,26 @@ static inline void sb_queue_tail(struct netns_ipvs *ipvs)
* than the specified time or the specified time is zero.
*/
static inline struct ip_vs_sync_buff *
-get_curr_sync_buff(struct netns_ipvs *ipvs, unsigned long time)
+get_curr_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms,
+ unsigned long time)
{
struct ip_vs_sync_buff *sb;
spin_lock_bh(&ipvs->sync_buff_lock);
- if (ipvs->sync_buff &&
- time_after_eq(jiffies - ipvs->sync_buff->firstuse, time)) {
- sb = ipvs->sync_buff;
- ipvs->sync_buff = NULL;
+ sb = ms->sync_buff;
+ if (sb && time_after_eq(jiffies - sb->firstuse, time)) {
+ ms->sync_buff = NULL;
+ __set_current_state(TASK_RUNNING);
} else
sb = NULL;
spin_unlock_bh(&ipvs->sync_buff_lock);
return sb;
}
-/*
- * Switch mode from sending version 0 or 1
- * - must handle sync_buf
- */
-void ip_vs_sync_switch_mode(struct net *net, int mode)
+static inline int
+select_master_thread_id(struct netns_ipvs *ipvs, struct ip_vs_conn *cp)
{
- struct netns_ipvs *ipvs = net_ipvs(net);
-
- if (!(ipvs->sync_state & IP_VS_STATE_MASTER))
- return;
- if (mode == sysctl_sync_ver(ipvs) || !ipvs->sync_buff)
- return;
-
- spin_lock_bh(&ipvs->sync_buff_lock);
- /* Buffer empty ? then let buf_create do the job */
- if (ipvs->sync_buff->mesg->size <= sizeof(struct ip_vs_sync_mesg)) {
- kfree(ipvs->sync_buff);
- ipvs->sync_buff = NULL;
- } else {
- spin_lock_bh(&ipvs->sync_lock);
- if (ipvs->sync_state & IP_VS_STATE_MASTER)
- list_add_tail(&ipvs->sync_buff->list,
- &ipvs->sync_queue);
- else
- ip_vs_sync_buff_release(ipvs->sync_buff);
- spin_unlock_bh(&ipvs->sync_lock);
- }
- spin_unlock_bh(&ipvs->sync_buff_lock);
+ return ((long) cp >> (1 + ilog2(sizeof(*cp)))) & ipvs->threads_mask;
}
/*
@@ -442,15 +425,101 @@ ip_vs_sync_buff_create_v0(struct netns_ipvs *ipvs)
return sb;
}
+/* Check if conn should be synced.
+ * pkts: conn packets, use sysctl_sync_threshold to avoid packet check
+ * - (1) sync_refresh_period: reduce sync rate. Additionally, retry
+ * sync_retries times with period of sync_refresh_period/8
+ * - (2) if both sync_refresh_period and sync_period are 0 send sync only
+ * for state changes or only once when pkts matches sync_threshold
+ * - (3) templates: rate can be reduced only with sync_refresh_period or
+ * with (2)
+ */
+static int ip_vs_sync_conn_needed(struct netns_ipvs *ipvs,
+ struct ip_vs_conn *cp, int pkts)
+{
+ unsigned long orig = ACCESS_ONCE(cp->sync_endtime);
+ unsigned long now = jiffies;
+ unsigned long n = (now + cp->timeout) & ~3UL;
+ unsigned int sync_refresh_period;
+ int sync_period;
+ int force;
+
+ /* Check if we sync in current state */
+ if (unlikely(cp->flags & IP_VS_CONN_F_TEMPLATE))
+ force = 0;
+ else if (likely(cp->protocol == IPPROTO_TCP)) {
+ if (!((1 << cp->state) &
+ ((1 << IP_VS_TCP_S_ESTABLISHED) |
+ (1 << IP_VS_TCP_S_FIN_WAIT) |
+ (1 << IP_VS_TCP_S_CLOSE) |
+ (1 << IP_VS_TCP_S_CLOSE_WAIT) |
+ (1 << IP_VS_TCP_S_TIME_WAIT))))
+ return 0;
+ force = cp->state != cp->old_state;
+ if (force && cp->state != IP_VS_TCP_S_ESTABLISHED)
+ goto set;
+ } else if (unlikely(cp->protocol == IPPROTO_SCTP)) {
+ if (!((1 << cp->state) &
+ ((1 << IP_VS_SCTP_S_ESTABLISHED) |
+ (1 << IP_VS_SCTP_S_CLOSED) |
+ (1 << IP_VS_SCTP_S_SHUT_ACK_CLI) |
+ (1 << IP_VS_SCTP_S_SHUT_ACK_SER))))
+ return 0;
+ force = cp->state != cp->old_state;
+ if (force && cp->state != IP_VS_SCTP_S_ESTABLISHED)
+ goto set;
+ } else {
+ /* UDP or another protocol with single state */
+ force = 0;
+ }
+
+ sync_refresh_period = sysctl_sync_refresh_period(ipvs);
+ if (sync_refresh_period > 0) {
+ long diff = n - orig;
+ long min_diff = max(cp->timeout >> 1, 10UL * HZ);
+
+ /* Avoid sync if difference is below sync_refresh_period
+ * and below the half timeout.
+ */
+ if (abs(diff) < min_t(long, sync_refresh_period, min_diff)) {
+ int retries = orig & 3;
+
+ if (retries >= sysctl_sync_retries(ipvs))
+ return 0;
+ if (time_before(now, orig - cp->timeout +
+ (sync_refresh_period >> 3)))
+ return 0;
+ n |= retries + 1;
+ }
+ }
+ sync_period = sysctl_sync_period(ipvs);
+ if (sync_period > 0) {
+ if (!(cp->flags & IP_VS_CONN_F_TEMPLATE) &&
+ pkts % sync_period != sysctl_sync_threshold(ipvs))
+ return 0;
+ } else if (sync_refresh_period <= 0 &&
+ pkts != sysctl_sync_threshold(ipvs))
+ return 0;
+
+set:
+ cp->old_state = cp->state;
+ n = cmpxchg(&cp->sync_endtime, orig, n);
+ return n == orig || force;
+}
+
/*
* Version 0 , could be switched in by sys_ctl.
* Add an ip_vs_conn information into the current sync_buff.
*/
-void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp)
+static void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp,
+ int pkts)
{
struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_sync_mesg_v0 *m;
struct ip_vs_sync_conn_v0 *s;
+ struct ip_vs_sync_buff *buff;
+ struct ipvs_master_sync_state *ms;
+ int id;
int len;
if (unlikely(cp->af != AF_INET))
@@ -459,21 +528,41 @@ void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp)
if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
return;
+ if (!ip_vs_sync_conn_needed(ipvs, cp, pkts))
+ return;
+
spin_lock(&ipvs->sync_buff_lock);
- if (!ipvs->sync_buff) {
- ipvs->sync_buff =
- ip_vs_sync_buff_create_v0(ipvs);
- if (!ipvs->sync_buff) {
+ if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
+ spin_unlock(&ipvs->sync_buff_lock);
+ return;
+ }
+
+ id = select_master_thread_id(ipvs, cp);
+ ms = &ipvs->ms[id];
+ buff = ms->sync_buff;
+ if (buff) {
+ m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
+ /* Send buffer if it is for v1 */
+ if (!m->nr_conns) {
+ sb_queue_tail(ipvs, ms);
+ ms->sync_buff = NULL;
+ buff = NULL;
+ }
+ }
+ if (!buff) {
+ buff = ip_vs_sync_buff_create_v0(ipvs);
+ if (!buff) {
spin_unlock(&ipvs->sync_buff_lock);
pr_err("ip_vs_sync_buff_create failed.\n");
return;
}
+ ms->sync_buff = buff;
}
len = (cp->flags & IP_VS_CONN_F_SEQ_MASK) ? FULL_CONN_SIZE :
SIMPLE_CONN_SIZE;
- m = (struct ip_vs_sync_mesg_v0 *)ipvs->sync_buff->mesg;
- s = (struct ip_vs_sync_conn_v0 *)ipvs->sync_buff->head;
+ m = (struct ip_vs_sync_mesg_v0 *) buff->mesg;
+ s = (struct ip_vs_sync_conn_v0 *) buff->head;
/* copy members */
s->reserved = 0;
@@ -494,18 +583,24 @@ void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp)
m->nr_conns++;
m->size += len;
- ipvs->sync_buff->head += len;
+ buff->head += len;
/* check if there is a space for next one */
- if (ipvs->sync_buff->head + FULL_CONN_SIZE > ipvs->sync_buff->end) {
- sb_queue_tail(ipvs);
- ipvs->sync_buff = NULL;
+ if (buff->head + FULL_CONN_SIZE > buff->end) {
+ sb_queue_tail(ipvs, ms);
+ ms->sync_buff = NULL;
}
spin_unlock(&ipvs->sync_buff_lock);
/* synchronize its controller if it has */
- if (cp->control)
- ip_vs_sync_conn(net, cp->control);
+ cp = cp->control;
+ if (cp) {
+ if (cp->flags & IP_VS_CONN_F_TEMPLATE)
+ pkts = atomic_add_return(1, &cp->in_pkts);
+ else
+ pkts = sysctl_sync_threshold(ipvs);
+ ip_vs_sync_conn(net, cp->control, pkts);
+ }
}
/*
@@ -513,23 +608,29 @@ void ip_vs_sync_conn_v0(struct net *net, struct ip_vs_conn *cp)
* Called by ip_vs_in.
* Sending Version 1 messages
*/
-void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp)
+void ip_vs_sync_conn(struct net *net, struct ip_vs_conn *cp, int pkts)
{
struct netns_ipvs *ipvs = net_ipvs(net);
struct ip_vs_sync_mesg *m;
union ip_vs_sync_conn *s;
+ struct ip_vs_sync_buff *buff;
+ struct ipvs_master_sync_state *ms;
+ int id;
__u8 *p;
unsigned int len, pe_name_len, pad;
/* Handle old version of the protocol */
if (sysctl_sync_ver(ipvs) == 0) {
- ip_vs_sync_conn_v0(net, cp);
+ ip_vs_sync_conn_v0(net, cp, pkts);
return;
}
/* Do not sync ONE PACKET */
if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
goto control;
sloop:
+ if (!ip_vs_sync_conn_needed(ipvs, cp, pkts))
+ goto control;
+
/* Sanity checks */
pe_name_len = 0;
if (cp->pe_data_len) {
@@ -541,6 +642,13 @@ sloop:
}
spin_lock(&ipvs->sync_buff_lock);
+ if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
+ spin_unlock(&ipvs->sync_buff_lock);
+ return;
+ }
+
+ id = select_master_thread_id(ipvs, cp);
+ ms = &ipvs->ms[id];
#ifdef CONFIG_IP_VS_IPV6
if (cp->af == AF_INET6)
@@ -559,27 +667,32 @@ sloop:
/* check if there is a space for this one */
pad = 0;
- if (ipvs->sync_buff) {
- pad = (4 - (size_t)ipvs->sync_buff->head) & 3;
- if (ipvs->sync_buff->head + len + pad > ipvs->sync_buff->end) {
- sb_queue_tail(ipvs);
- ipvs->sync_buff = NULL;
+ buff = ms->sync_buff;
+ if (buff) {
+ m = buff->mesg;
+ pad = (4 - (size_t) buff->head) & 3;
+ /* Send buffer if it is for v0 */
+ if (buff->head + len + pad > buff->end || m->reserved) {
+ sb_queue_tail(ipvs, ms);
+ ms->sync_buff = NULL;
+ buff = NULL;
pad = 0;
}
}
- if (!ipvs->sync_buff) {
- ipvs->sync_buff = ip_vs_sync_buff_create(ipvs);
- if (!ipvs->sync_buff) {
+ if (!buff) {
+ buff = ip_vs_sync_buff_create(ipvs);
+ if (!buff) {
spin_unlock(&ipvs->sync_buff_lock);
pr_err("ip_vs_sync_buff_create failed.\n");
return;
}
+ ms->sync_buff = buff;
+ m = buff->mesg;
}
- m = ipvs->sync_buff->mesg;
- p = ipvs->sync_buff->head;
- ipvs->sync_buff->head += pad + len;
+ p = buff->head;
+ buff->head += pad + len;
m->size += pad + len;
/* Add ev. padding from prev. sync_conn */
while (pad--)
@@ -644,16 +757,10 @@ control:
cp = cp->control;
if (!cp)
return;
- /*
- * Reduce sync rate for templates
- * i.e only increment in_pkts for Templates.
- */
- if (cp->flags & IP_VS_CONN_F_TEMPLATE) {
- int pkts = atomic_add_return(1, &cp->in_pkts);
-
- if (pkts % sysctl_sync_period(ipvs) != 1)
- return;
- }
+ if (cp->flags & IP_VS_CONN_F_TEMPLATE)
+ pkts = atomic_add_return(1, &cp->in_pkts);
+ else
+ pkts = sysctl_sync_threshold(ipvs);
goto sloop;
}
@@ -731,9 +838,32 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
else
cp = ip_vs_ct_in_get(param);
- if (cp && param->pe_data) /* Free pe_data */
+ if (cp) {
+ /* Free pe_data */
kfree(param->pe_data);
- if (!cp) {
+
+ dest = cp->dest;
+ spin_lock(&cp->lock);
+ if ((cp->flags ^ flags) & IP_VS_CONN_F_INACTIVE &&
+ !(flags & IP_VS_CONN_F_TEMPLATE) && dest) {
+ if (flags & IP_VS_CONN_F_INACTIVE) {
+ atomic_dec(&dest->activeconns);
+ atomic_inc(&dest->inactconns);
+ } else {
+ atomic_inc(&dest->activeconns);
+ atomic_dec(&dest->inactconns);
+ }
+ }
+ flags &= IP_VS_CONN_F_BACKUP_UPD_MASK;
+ flags |= cp->flags & ~IP_VS_CONN_F_BACKUP_UPD_MASK;
+ cp->flags = flags;
+ spin_unlock(&cp->lock);
+ if (!dest) {
+ dest = ip_vs_try_bind_dest(cp);
+ if (dest)
+ atomic_dec(&dest->refcnt);
+ }
+ } else {
/*
* Find the appropriate destination for the connection.
* If it is not found the connection will remain unbound
@@ -742,18 +872,6 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
dest = ip_vs_find_dest(net, type, daddr, dport, param->vaddr,
param->vport, protocol, fwmark, flags);
- /* Set the approprite ativity flag */
- if (protocol == IPPROTO_TCP) {
- if (state != IP_VS_TCP_S_ESTABLISHED)
- flags |= IP_VS_CONN_F_INACTIVE;
- else
- flags &= ~IP_VS_CONN_F_INACTIVE;
- } else if (protocol == IPPROTO_SCTP) {
- if (state != IP_VS_SCTP_S_ESTABLISHED)
- flags |= IP_VS_CONN_F_INACTIVE;
- else
- flags &= ~IP_VS_CONN_F_INACTIVE;
- }
cp = ip_vs_conn_new(param, daddr, dport, flags, dest, fwmark);
if (dest)
atomic_dec(&dest->refcnt);
@@ -763,34 +881,6 @@ static void ip_vs_proc_conn(struct net *net, struct ip_vs_conn_param *param,
IP_VS_DBG(2, "BACKUP, add new conn. failed\n");
return;
}
- } else if (!cp->dest) {
- dest = ip_vs_try_bind_dest(cp);
- if (dest)
- atomic_dec(&dest->refcnt);
- } else if ((cp->dest) && (cp->protocol == IPPROTO_TCP) &&
- (cp->state != state)) {
- /* update active/inactive flag for the connection */
- dest = cp->dest;
- if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
- (state != IP_VS_TCP_S_ESTABLISHED)) {
- atomic_dec(&dest->activeconns);
- atomic_inc(&dest->inactconns);
- cp->flags |= IP_VS_CONN_F_INACTIVE;
- } else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
- (state == IP_VS_TCP_S_ESTABLISHED)) {
- atomic_inc(&dest->activeconns);
- atomic_dec(&dest->inactconns);
- cp->flags &= ~IP_VS_CONN_F_INACTIVE;
- }
- } else if ((cp->dest) && (cp->protocol == IPPROTO_SCTP) &&
- (cp->state != state)) {
- dest = cp->dest;
- if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
- (state != IP_VS_SCTP_S_ESTABLISHED)) {
- atomic_dec(&dest->activeconns);
- atomic_inc(&dest->inactconns);
- cp->flags &= ~IP_VS_CONN_F_INACTIVE;
- }
}
if (opt)
@@ -839,7 +929,7 @@ static void ip_vs_process_message_v0(struct net *net, const char *buffer,
p = (char *)buffer + sizeof(struct ip_vs_sync_mesg_v0);
for (i=0; i<m->nr_conns; i++) {
- unsigned flags, state;
+ unsigned int flags, state;
if (p + SIMPLE_CONN_SIZE > buffer+buflen) {
IP_VS_ERR_RL("BACKUP v0, bogus conn\n");
@@ -1109,7 +1199,7 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer,
for (i=0; i<nr_conns; i++) {
union ip_vs_sync_conn *s;
- unsigned size;
+ unsigned int size;
int retc;
p = msg_end;
@@ -1149,6 +1239,28 @@ static void ip_vs_process_message(struct net *net, __u8 *buffer,
/*
+ * Setup sndbuf (mode=1) or rcvbuf (mode=0)
+ */
+static void set_sock_size(struct sock *sk, int mode, int val)
+{
+ /* setsockopt(sock, SOL_SOCKET, SO_SNDBUF, &val, sizeof(val)); */
+ /* setsockopt(sock, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val)); */
+ lock_sock(sk);
+ if (mode) {
+ val = clamp_t(int, val, (SOCK_MIN_SNDBUF + 1) / 2,
+ sysctl_wmem_max);
+ sk->sk_sndbuf = val * 2;
+ sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+ } else {
+ val = clamp_t(int, val, (SOCK_MIN_RCVBUF + 1) / 2,
+ sysctl_rmem_max);
+ sk->sk_rcvbuf = val * 2;
+ sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+ }
+ release_sock(sk);
+}
+
+/*
* Setup loopback of outgoing multicasts on a sending socket
*/
static void set_mcast_loop(struct sock *sk, u_char loop)
@@ -1298,9 +1410,15 @@ static int bind_mcastif_addr(struct socket *sock, char *ifname)
/*
* Set up sending multicast socket over UDP
*/
-static struct socket *make_send_sock(struct net *net)
+static struct socket *make_send_sock(struct net *net, int id)
{
struct netns_ipvs *ipvs = net_ipvs(net);
+ /* multicast addr */
+ struct sockaddr_in mcast_addr = {
+ .sin_family = AF_INET,
+ .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id),
+ .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP),
+ };
struct socket *sock;
int result;
@@ -1324,6 +1442,9 @@ static struct socket *make_send_sock(struct net *net)
set_mcast_loop(sock->sk, 0);
set_mcast_ttl(sock->sk, 1);
+ result = sysctl_sync_sock_size(ipvs);
+ if (result > 0)
+ set_sock_size(sock->sk, 1, result);
result = bind_mcastif_addr(sock, ipvs->master_mcast_ifn);
if (result < 0) {
@@ -1349,9 +1470,15 @@ error:
/*
* Set up receiving multicast socket over UDP
*/
-static struct socket *make_receive_sock(struct net *net)
+static struct socket *make_receive_sock(struct net *net, int id)
{
struct netns_ipvs *ipvs = net_ipvs(net);
+ /* multicast addr */
+ struct sockaddr_in mcast_addr = {
+ .sin_family = AF_INET,
+ .sin_port = cpu_to_be16(IP_VS_SYNC_PORT + id),
+ .sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP),
+ };
struct socket *sock;
int result;
@@ -1368,7 +1495,10 @@ static struct socket *make_receive_sock(struct net *net)
*/
sk_change_net(sock->sk, net);
/* it is equivalent to the REUSEADDR option in user-space */
- sock->sk->sk_reuse = 1;
+ sock->sk->sk_reuse = SK_CAN_REUSE;
+ result = sysctl_sync_sock_size(ipvs);
+ if (result > 0)
+ set_sock_size(sock->sk, 0, result);
result = sock->ops->bind(sock, (struct sockaddr *) &mcast_addr,
sizeof(struct sockaddr));
@@ -1411,18 +1541,22 @@ ip_vs_send_async(struct socket *sock, const char *buffer, const size_t length)
return len;
}
-static void
+static int
ip_vs_send_sync_msg(struct socket *sock, struct ip_vs_sync_mesg *msg)
{
int msize;
+ int ret;
msize = msg->size;
/* Put size in network byte order */
msg->size = htons(msg->size);
- if (ip_vs_send_async(sock, (char *)msg, msize) != msize)
- pr_err("ip_vs_send_async error\n");
+ ret = ip_vs_send_async(sock, (char *)msg, msize);
+ if (ret >= 0 || ret == -EAGAIN)
+ return ret;
+ pr_err("ip_vs_send_async error %d\n", ret);
+ return 0;
}
static int
@@ -1438,48 +1572,90 @@ ip_vs_receive(struct socket *sock, char *buffer, const size_t buflen)
iov.iov_base = buffer;
iov.iov_len = (size_t)buflen;
- len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, 0);
+ len = kernel_recvmsg(sock, &msg, &iov, 1, buflen, MSG_DONTWAIT);
if (len < 0)
- return -1;
+ return len;
LeaveFunction(7);
return len;
}
+/* Wakeup the master thread for sending */
+static void master_wakeup_work_handler(struct work_struct *work)
+{
+ struct ipvs_master_sync_state *ms =
+ container_of(work, struct ipvs_master_sync_state,
+ master_wakeup_work.work);
+ struct netns_ipvs *ipvs = ms->ipvs;
+
+ spin_lock_bh(&ipvs->sync_lock);
+ if (ms->sync_queue_len &&
+ ms->sync_queue_delay < IPVS_SYNC_WAKEUP_RATE) {
+ ms->sync_queue_delay = IPVS_SYNC_WAKEUP_RATE;
+ wake_up_process(ms->master_thread);
+ }
+ spin_unlock_bh(&ipvs->sync_lock);
+}
+
+/* Get next buffer to send */
+static inline struct ip_vs_sync_buff *
+next_sync_buff(struct netns_ipvs *ipvs, struct ipvs_master_sync_state *ms)
+{
+ struct ip_vs_sync_buff *sb;
+
+ sb = sb_dequeue(ipvs, ms);
+ if (sb)
+ return sb;
+ /* Do not delay entries in buffer for more than 2 seconds */
+ return get_curr_sync_buff(ipvs, ms, IPVS_SYNC_FLUSH_TIME);
+}
static int sync_thread_master(void *data)
{
struct ip_vs_sync_thread_data *tinfo = data;
struct netns_ipvs *ipvs = net_ipvs(tinfo->net);
+ struct ipvs_master_sync_state *ms = &ipvs->ms[tinfo->id];
+ struct sock *sk = tinfo->sock->sk;
struct ip_vs_sync_buff *sb;
pr_info("sync thread started: state = MASTER, mcast_ifn = %s, "
- "syncid = %d\n",
- ipvs->master_mcast_ifn, ipvs->master_syncid);
+ "syncid = %d, id = %d\n",
+ ipvs->master_mcast_ifn, ipvs->master_syncid, tinfo->id);
- while (!kthread_should_stop()) {
- while ((sb = sb_dequeue(ipvs))) {
- ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
- ip_vs_sync_buff_release(sb);
+ for (;;) {
+ sb = next_sync_buff(ipvs, ms);
+ if (unlikely(kthread_should_stop()))
+ break;
+ if (!sb) {
+ schedule_timeout(IPVS_SYNC_CHECK_PERIOD);
+ continue;
}
-
- /* check if entries stay in ipvs->sync_buff for 2 seconds */
- sb = get_curr_sync_buff(ipvs, 2 * HZ);
- if (sb) {
- ip_vs_send_sync_msg(tinfo->sock, sb->mesg);
- ip_vs_sync_buff_release(sb);
+ while (ip_vs_send_sync_msg(tinfo->sock, sb->mesg) < 0) {
+ int ret = 0;
+
+ __wait_event_interruptible(*sk_sleep(sk),
+ sock_writeable(sk) ||
+ kthread_should_stop(),
+ ret);
+ if (unlikely(kthread_should_stop()))
+ goto done;
}
-
- schedule_timeout_interruptible(HZ);
+ ip_vs_sync_buff_release(sb);
}
+done:
+ __set_current_state(TASK_RUNNING);
+ if (sb)
+ ip_vs_sync_buff_release(sb);
+
/* clean up the sync_buff queue */
- while ((sb = sb_dequeue(ipvs)))
+ while ((sb = sb_dequeue(ipvs, ms)))
ip_vs_sync_buff_release(sb);
+ __set_current_state(TASK_RUNNING);
/* clean up the current sync_buff */
- sb = get_curr_sync_buff(ipvs, 0);
+ sb = get_curr_sync_buff(ipvs, ms, 0);
if (sb)
ip_vs_sync_buff_release(sb);
@@ -1498,8 +1674,8 @@ static int sync_thread_backup(void *data)
int len;
pr_info("sync thread started: state = BACKUP, mcast_ifn = %s, "
- "syncid = %d\n",
- ipvs->backup_mcast_ifn, ipvs->backup_syncid);
+ "syncid = %d, id = %d\n",
+ ipvs->backup_mcast_ifn, ipvs->backup_syncid, tinfo->id);
while (!kthread_should_stop()) {
wait_event_interruptible(*sk_sleep(tinfo->sock->sk),
@@ -1511,7 +1687,8 @@ static int sync_thread_backup(void *data)
len = ip_vs_receive(tinfo->sock, tinfo->buf,
ipvs->recv_mesg_maxlen);
if (len <= 0) {
- pr_err("receiving message error\n");
+ if (len != -EAGAIN)
+ pr_err("receiving message error\n");
break;
}
@@ -1535,86 +1712,140 @@ static int sync_thread_backup(void *data)
int start_sync_thread(struct net *net, int state, char *mcast_ifn, __u8 syncid)
{
struct ip_vs_sync_thread_data *tinfo;
- struct task_struct **realtask, *task;
+ struct task_struct **array = NULL, *task;
struct socket *sock;
struct netns_ipvs *ipvs = net_ipvs(net);
- char *name, *buf = NULL;
+ char *name;
int (*threadfn)(void *data);
+ int id, count;
int result = -ENOMEM;
IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
IP_VS_DBG(7, "Each ip_vs_sync_conn entry needs %Zd bytes\n",
sizeof(struct ip_vs_sync_conn_v0));
+ if (!ipvs->sync_state) {
+ count = clamp(sysctl_sync_ports(ipvs), 1, IPVS_SYNC_PORTS_MAX);
+ ipvs->threads_mask = count - 1;
+ } else
+ count = ipvs->threads_mask + 1;
if (state == IP_VS_STATE_MASTER) {
- if (ipvs->master_thread)
+ if (ipvs->ms)
return -EEXIST;
strlcpy(ipvs->master_mcast_ifn, mcast_ifn,
sizeof(ipvs->master_mcast_ifn));
ipvs->master_syncid = syncid;
- realtask = &ipvs->master_thread;
- name = "ipvs_master:%d";
+ name = "ipvs-m:%d:%d";
threadfn = sync_thread_master;
- sock = make_send_sock(net);
} else if (state == IP_VS_STATE_BACKUP) {
- if (ipvs->backup_thread)
+ if (ipvs->backup_threads)
return -EEXIST;
strlcpy(ipvs->backup_mcast_ifn, mcast_ifn,
sizeof(ipvs->backup_mcast_ifn));
ipvs->backup_syncid = syncid;
- realtask = &ipvs->backup_thread;
- name = "ipvs_backup:%d";
+ name = "ipvs-b:%d:%d";
threadfn = sync_thread_backup;
- sock = make_receive_sock(net);
} else {
return -EINVAL;
}
- if (IS_ERR(sock)) {
- result = PTR_ERR(sock);
- goto out;
- }
+ if (state == IP_VS_STATE_MASTER) {
+ struct ipvs_master_sync_state *ms;
- set_sync_mesg_maxlen(net, state);
- if (state == IP_VS_STATE_BACKUP) {
- buf = kmalloc(ipvs->recv_mesg_maxlen, GFP_KERNEL);
- if (!buf)
- goto outsocket;
+ ipvs->ms = kzalloc(count * sizeof(ipvs->ms[0]), GFP_KERNEL);
+ if (!ipvs->ms)
+ goto out;
+ ms = ipvs->ms;
+ for (id = 0; id < count; id++, ms++) {
+ INIT_LIST_HEAD(&ms->sync_queue);
+ ms->sync_queue_len = 0;
+ ms->sync_queue_delay = 0;
+ INIT_DELAYED_WORK(&ms->master_wakeup_work,
+ master_wakeup_work_handler);
+ ms->ipvs = ipvs;
+ }
+ } else {
+ array = kzalloc(count * sizeof(struct task_struct *),
+ GFP_KERNEL);
+ if (!array)
+ goto out;
}
+ set_sync_mesg_maxlen(net, state);
- tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
- if (!tinfo)
- goto outbuf;
-
- tinfo->net = net;
- tinfo->sock = sock;
- tinfo->buf = buf;
+ tinfo = NULL;
+ for (id = 0; id < count; id++) {
+ if (state == IP_VS_STATE_MASTER)
+ sock = make_send_sock(net, id);
+ else
+ sock = make_receive_sock(net, id);
+ if (IS_ERR(sock)) {
+ result = PTR_ERR(sock);
+ goto outtinfo;
+ }
+ tinfo = kmalloc(sizeof(*tinfo), GFP_KERNEL);
+ if (!tinfo)
+ goto outsocket;
+ tinfo->net = net;
+ tinfo->sock = sock;
+ if (state == IP_VS_STATE_BACKUP) {
+ tinfo->buf = kmalloc(ipvs->recv_mesg_maxlen,
+ GFP_KERNEL);
+ if (!tinfo->buf)
+ goto outtinfo;
+ }
+ tinfo->id = id;
- task = kthread_run(threadfn, tinfo, name, ipvs->gen);
- if (IS_ERR(task)) {
- result = PTR_ERR(task);
- goto outtinfo;
+ task = kthread_run(threadfn, tinfo, name, ipvs->gen, id);
+ if (IS_ERR(task)) {
+ result = PTR_ERR(task);
+ goto outtinfo;
+ }
+ tinfo = NULL;
+ if (state == IP_VS_STATE_MASTER)
+ ipvs->ms[id].master_thread = task;
+ else
+ array[id] = task;
}
/* mark as active */
- *realtask = task;
+
+ if (state == IP_VS_STATE_BACKUP)
+ ipvs->backup_threads = array;
+ spin_lock_bh(&ipvs->sync_buff_lock);
ipvs->sync_state |= state;
+ spin_unlock_bh(&ipvs->sync_buff_lock);
/* increase the module use count */
ip_vs_use_count_inc();
return 0;
-outtinfo:
- kfree(tinfo);
-outbuf:
- kfree(buf);
outsocket:
sk_release_kernel(sock->sk);
+
+outtinfo:
+ if (tinfo) {
+ sk_release_kernel(tinfo->sock->sk);
+ kfree(tinfo->buf);
+ kfree(tinfo);
+ }
+ count = id;
+ while (count-- > 0) {
+ if (state == IP_VS_STATE_MASTER)
+ kthread_stop(ipvs->ms[count].master_thread);
+ else
+ kthread_stop(array[count]);
+ }
+ kfree(array);
+
out:
+ if (!(ipvs->sync_state & IP_VS_STATE_MASTER)) {
+ kfree(ipvs->ms);
+ ipvs->ms = NULL;
+ }
return result;
}
@@ -1622,38 +1853,60 @@ out:
int stop_sync_thread(struct net *net, int state)
{
struct netns_ipvs *ipvs = net_ipvs(net);
+ struct task_struct **array;
+ int id;
int retc = -EINVAL;
IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current));
if (state == IP_VS_STATE_MASTER) {
- if (!ipvs->master_thread)
+ if (!ipvs->ms)
return -ESRCH;
- pr_info("stopping master sync thread %d ...\n",
- task_pid_nr(ipvs->master_thread));
-
/*
* The lock synchronizes with sb_queue_tail(), so that we don't
* add sync buffers to the queue, when we are already in
* progress of stopping the master sync daemon.
*/
- spin_lock_bh(&ipvs->sync_lock);
+ spin_lock_bh(&ipvs->sync_buff_lock);
+ spin_lock(&ipvs->sync_lock);
ipvs->sync_state &= ~IP_VS_STATE_MASTER;
- spin_unlock_bh(&ipvs->sync_lock);
- retc = kthread_stop(ipvs->master_thread);
- ipvs->master_thread = NULL;
+ spin_unlock(&ipvs->sync_lock);
+ spin_unlock_bh(&ipvs->sync_buff_lock);
+
+ retc = 0;
+ for (id = ipvs->threads_mask; id >= 0; id--) {
+ struct ipvs_master_sync_state *ms = &ipvs->ms[id];
+ int ret;
+
+ pr_info("stopping master sync thread %d ...\n",
+ task_pid_nr(ms->master_thread));
+ cancel_delayed_work_sync(&ms->master_wakeup_work);
+ ret = kthread_stop(ms->master_thread);
+ if (retc >= 0)
+ retc = ret;
+ }
+ kfree(ipvs->ms);
+ ipvs->ms = NULL;
} else if (state == IP_VS_STATE_BACKUP) {
- if (!ipvs->backup_thread)
+ if (!ipvs->backup_threads)
return -ESRCH;
- pr_info("stopping backup sync thread %d ...\n",
- task_pid_nr(ipvs->backup_thread));
-
ipvs->sync_state &= ~IP_VS_STATE_BACKUP;
- retc = kthread_stop(ipvs->backup_thread);
- ipvs->backup_thread = NULL;
+ array = ipvs->backup_threads;
+ retc = 0;
+ for (id = ipvs->threads_mask; id >= 0; id--) {
+ int ret;
+
+ pr_info("stopping backup sync thread %d ...\n",
+ task_pid_nr(array[id]));
+ ret = kthread_stop(array[id]);
+ if (retc >= 0)
+ retc = ret;
+ }
+ kfree(array);
+ ipvs->backup_threads = NULL;
}
/* decrease the module use count */
@@ -1670,13 +1923,8 @@ int __net_init ip_vs_sync_net_init(struct net *net)
struct netns_ipvs *ipvs = net_ipvs(net);
__mutex_init(&ipvs->sync_mutex, "ipvs->sync_mutex", &__ipvs_sync_key);
- INIT_LIST_HEAD(&ipvs->sync_queue);
spin_lock_init(&ipvs->sync_lock);
spin_lock_init(&ipvs->sync_buff_lock);
-
- ipvs->sync_mcast_addr.sin_family = AF_INET;
- ipvs->sync_mcast_addr.sin_port = cpu_to_be16(IP_VS_SYNC_PORT);
- ipvs->sync_mcast_addr.sin_addr.s_addr = cpu_to_be32(IP_VS_SYNC_GROUP);
return 0;
}