Diffstat (limited to 'smtpd/scheduler_ramqueue.c')
-rw-r--r-- | smtpd/scheduler_ramqueue.c | 1204
1 file changed, 1204 insertions(+), 0 deletions(-)
diff --git a/smtpd/scheduler_ramqueue.c b/smtpd/scheduler_ramqueue.c
new file mode 100644
index 00000000..0c04fc0b
--- /dev/null
+++ b/smtpd/scheduler_ramqueue.c
@@ -0,0 +1,1204 @@
+/*	$OpenBSD: scheduler_ramqueue.c,v 1.45 2018/05/31 21:06:12 gilles Exp $	*/
+
+/*
+ * Copyright (c) 2012 Gilles Chehade <gilles@poolp.org>
+ * Copyright (c) 2012 Eric Faurot <eric@openbsd.org>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include "includes.h"
+
+#include <sys/types.h>
+#include <sys/queue.h>
+#include <sys/tree.h>
+#include <sys/socket.h>
+
+#include <ctype.h>
+#include <err.h>
+#include <event.h>
+#include <fcntl.h>
+#include <imsg.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <limits.h>
+#include <time.h>
+
+#include "smtpd.h"
+#include "log.h"
+
+TAILQ_HEAD(evplist, rq_envelope);
+
+struct rq_message {
+	uint32_t		 msgid;
+	struct tree		 envelopes;
+};
+
+struct rq_envelope {
+	TAILQ_ENTRY(rq_envelope) entry;
+	SPLAY_ENTRY(rq_envelope) t_entry;
+
+	uint64_t		 evpid;
+	uint64_t		 holdq;
+	enum delivery_type	 type;
+
+#define	RQ_EVPSTATE_PENDING	 0
+#define	RQ_EVPSTATE_SCHEDULED	 1
+#define	RQ_EVPSTATE_INFLIGHT	 2
+#define	RQ_EVPSTATE_HELD	 3
+	uint8_t			 state;
+
+#define	RQ_ENVELOPE_EXPIRED	 0x01
+#define	RQ_ENVELOPE_REMOVED	 0x02
+#define	RQ_ENVELOPE_SUSPEND	 0x04
+#define	RQ_ENVELOPE_UPDATE	 0x08
+#define	RQ_ENVELOPE_OVERFLOW	 0x10
+	uint8_t			 flags;
+
+	time_t			 ctime;
+	time_t			 sched;
+	time_t			 expire;
+
+	struct rq_message	*message;
+
+	time_t			 t_inflight;
+	time_t			 t_scheduled;
+};
+
+struct rq_holdq {
+	struct evplist		 q;
+	size_t			 count;
+};
+
+struct rq_queue {
+	size_t			 evpcount;
+	struct tree		 messages;
+	SPLAY_HEAD(prioqtree, rq_envelope)	q_priotree;
+
+	struct evplist		 q_pending;
+	struct evplist		 q_inflight;
+
+	struct evplist		 q_mta;
+	struct evplist		 q_mda;
+	struct evplist		 q_bounce;
+	struct evplist		 q_update;
+	struct evplist		 q_expired;
+	struct evplist		 q_removed;
+};
+
+static int rq_envelope_cmp(struct rq_envelope *, struct rq_envelope *);
+
+SPLAY_PROTOTYPE(prioqtree, rq_envelope, t_entry, rq_envelope_cmp);
+static int scheduler_ram_init(const char *);
+static int scheduler_ram_insert(struct scheduler_info *);
+static size_t scheduler_ram_commit(uint32_t);
+static size_t scheduler_ram_rollback(uint32_t);
+static int scheduler_ram_update(struct scheduler_info *);
+static int scheduler_ram_delete(uint64_t);
+static int scheduler_ram_hold(uint64_t, uint64_t);
+static int scheduler_ram_release(int, uint64_t, int);
+static int scheduler_ram_batch(int, int *, size_t *, uint64_t *, int *);
+static size_t scheduler_ram_messages(uint32_t, uint32_t *, size_t);
+static size_t scheduler_ram_envelopes(uint64_t, struct evpstate *, size_t);
+static int scheduler_ram_schedule(uint64_t);
+static int scheduler_ram_remove(uint64_t);
+static int scheduler_ram_suspend(uint64_t);
+static int scheduler_ram_resume(uint64_t);
+static int scheduler_ram_query(uint64_t);
+
+static void sorted_insert(struct rq_queue *, struct rq_envelope *);
+
+static void rq_queue_init(struct rq_queue *);
+static void rq_queue_merge(struct rq_queue *, struct rq_queue *);
+static void rq_queue_dump(struct rq_queue *, const char *);
+static void rq_queue_schedule(struct rq_queue *rq);
+static struct evplist *rq_envelope_list(struct rq_queue *, struct rq_envelope *);
+static void rq_envelope_schedule(struct rq_queue *, struct rq_envelope *);
+static int rq_envelope_remove(struct rq_queue *, struct rq_envelope *);
+static int rq_envelope_suspend(struct rq_queue *, struct rq_envelope *);
+static int rq_envelope_resume(struct rq_queue *, struct rq_envelope *);
+static void rq_envelope_delete(struct rq_queue *, struct rq_envelope *);
+static const char *rq_envelope_to_text(struct rq_envelope *);
+
+struct scheduler_backend scheduler_backend_ramqueue = {
+	scheduler_ram_init,
+
+	scheduler_ram_insert,
+	scheduler_ram_commit,
+	scheduler_ram_rollback,
+
+	scheduler_ram_update,
+	scheduler_ram_delete,
+	scheduler_ram_hold,
+	scheduler_ram_release,
+
+	scheduler_ram_batch,
+
+	scheduler_ram_messages,
+	scheduler_ram_envelopes,
+	scheduler_ram_schedule,
+	scheduler_ram_remove,
+	scheduler_ram_suspend,
+	scheduler_ram_resume,
+	scheduler_ram_query,
+};
+
+static struct rq_queue	ramqueue;
+static struct tree	updates;
+static struct tree	holdqs[3]; /* delivery type */
+
+static time_t		currtime;
+
+#define BACKOFF_TRANSFER	400
+#define BACKOFF_DELIVERY	10
+#define BACKOFF_OVERFLOW	3
+
+static time_t
+scheduler_backoff(time_t t0, time_t base, uint32_t step)
+{
+	return (t0 + base * step * step);
+}
+
+static time_t
+scheduler_next(time_t t0, time_t base, uint32_t step)
+{
+	time_t t;
+
+	/* XXX be more efficient */
+	while ((t = scheduler_backoff(t0, base, step)) <= currtime)
+		step++;
+
+	return (t);
+}
+
+static int
+scheduler_ram_init(const char *arg)
+{
+	rq_queue_init(&ramqueue);
+	tree_init(&updates);
+	tree_init(&holdqs[D_MDA]);
+	tree_init(&holdqs[D_MTA]);
+	tree_init(&holdqs[D_BOUNCE]);
+
+	return (1);
+}
+
+static int
+scheduler_ram_insert(struct scheduler_info *si)
+{
+	struct rq_queue		*update;
+	struct rq_message	*message;
+	struct rq_envelope	*envelope;
+	uint32_t		 msgid;
+
+	currtime = time(NULL);
+
+	msgid = evpid_to_msgid(si->evpid);
+
+	/* find/prepare a ramqueue update */
+	if ((update = tree_get(&updates, msgid)) == NULL) {
+		update = xcalloc(1, sizeof *update);
+		stat_increment("scheduler.ramqueue.update", 1);
+		rq_queue_init(update);
+		tree_xset(&updates, msgid, update);
+	}
+
+	/* find/prepare the msgtree message in ramqueue update */
+	if ((message = tree_get(&update->messages, msgid)) == NULL) {
+		message = xcalloc(1, sizeof *message);
+		message->msgid = msgid;
+		tree_init(&message->envelopes);
+		tree_xset(&update->messages, msgid, message);
+		stat_increment("scheduler.ramqueue.message", 1);
+	}
+
+	/* create envelope in ramqueue message */
+	envelope = xcalloc(1, sizeof *envelope);
+	envelope->evpid = si->evpid;
+	envelope->type = si->type;
+	envelope->message = message;
+	envelope->ctime = si->creation;
+	envelope->expire = si->creation + si->ttl;
+	envelope->sched = scheduler_backoff(si->creation,
+	    (si->type == D_MTA) ? BACKOFF_TRANSFER : BACKOFF_DELIVERY, si->retry);
+	tree_xset(&message->envelopes, envelope->evpid, envelope);
+
+	update->evpcount++;
+	stat_increment("scheduler.ramqueue.envelope", 1);
+
+	envelope->state = RQ_EVPSTATE_PENDING;
+	TAILQ_INSERT_TAIL(&update->q_pending, envelope, entry);
+
+	si->nexttry = envelope->sched;
+
+	return (1);
+}
+
+static size_t
+scheduler_ram_commit(uint32_t msgid)
+{
+	struct rq_queue	*update;
+	size_t		 r;
+
+	currtime = time(NULL);
+
+	update = tree_xpop(&updates, msgid);
+	r = update->evpcount;
+
+	if (tracing & TRACE_SCHEDULER)
+		rq_queue_dump(update, "update to commit");
+
+	rq_queue_merge(&ramqueue, update);
+
+	if (tracing & TRACE_SCHEDULER)
+		rq_queue_dump(&ramqueue, "resulting queue");
+
+	rq_queue_schedule(&ramqueue);
+
+	free(update);
+	stat_decrement("scheduler.ramqueue.update", 1);
+
+	return (r);
+}
+
+static size_t
+scheduler_ram_rollback(uint32_t msgid)
+{
+	struct rq_queue		*update;
+	struct rq_envelope	*evp;
+	size_t			 r;
+
+	currtime = time(NULL);
+
+	if ((update = tree_pop(&updates, msgid)) == NULL)
+		return (0);
+	r = update->evpcount;
+
+	while ((evp = TAILQ_FIRST(&update->q_pending))) {
+		TAILQ_REMOVE(&update->q_pending, evp, entry);
+		rq_envelope_delete(update, evp);
+	}
+
+	free(update);
+	stat_decrement("scheduler.ramqueue.update", 1);
+
+	return (r);
+}
+
+static int
+scheduler_ram_update(struct scheduler_info *si)
+{
+	struct rq_message	*msg;
+	struct rq_envelope	*evp;
+	uint32_t		 msgid;
+
+	currtime = time(NULL);
+
+	msgid = evpid_to_msgid(si->evpid);
+	msg = tree_xget(&ramqueue.messages, msgid);
+	evp = tree_xget(&msg->envelopes, si->evpid);
+
+	/* it *must* be in-flight */
+	if (evp->state != RQ_EVPSTATE_INFLIGHT)
+		errx(1, "evp:%016" PRIx64 " not in-flight", si->evpid);
+
+	TAILQ_REMOVE(&ramqueue.q_inflight, evp, entry);
+
+	/*
+	 * If the envelope was removed while inflight, schedule it for
+	 * removal immediately.
+	 */
+	if (evp->flags & RQ_ENVELOPE_REMOVED) {
+		TAILQ_INSERT_TAIL(&ramqueue.q_removed, evp, entry);
+		evp->state = RQ_EVPSTATE_SCHEDULED;
+		evp->t_scheduled = currtime;
+		return (1);
+	}
+
+	evp->sched = scheduler_next(evp->ctime,
+	    (si->type == D_MTA) ? BACKOFF_TRANSFER : BACKOFF_DELIVERY, si->retry);
+
+	evp->state = RQ_EVPSTATE_PENDING;
+	if (!(evp->flags & RQ_ENVELOPE_SUSPEND))
+		sorted_insert(&ramqueue, evp);
+
+	si->nexttry = evp->sched;
+
+	return (1);
+}
+
+static int
+scheduler_ram_delete(uint64_t evpid)
+{
+	struct rq_message	*msg;
+	struct rq_envelope	*evp;
+	uint32_t		 msgid;
+
+	currtime = time(NULL);
+
+	msgid = evpid_to_msgid(evpid);
+	msg = tree_xget(&ramqueue.messages, msgid);
+	evp = tree_xget(&msg->envelopes, evpid);
+
+	/* it *must* be in-flight */
+	if (evp->state != RQ_EVPSTATE_INFLIGHT)
+		errx(1, "evp:%016" PRIx64 " not in-flight", evpid);
+
+	TAILQ_REMOVE(&ramqueue.q_inflight, evp, entry);
+
+	rq_envelope_delete(&ramqueue, evp);
+
+	return (1);
+}
+
+#define HOLDQ_MAXSIZE	1000
+
+static int
+scheduler_ram_hold(uint64_t evpid, uint64_t holdq)
+{
+	struct rq_holdq		*hq;
+	struct rq_message	*msg;
+	struct rq_envelope	*evp;
+	uint32_t		 msgid;
+
+	currtime = time(NULL);
+
+	msgid = evpid_to_msgid(evpid);
+	msg = tree_xget(&ramqueue.messages, msgid);
+	evp = tree_xget(&msg->envelopes, evpid);
+
+	/* it *must* be in-flight */
+	if (evp->state != RQ_EVPSTATE_INFLIGHT)
+		errx(1, "evp:%016" PRIx64 " not in-flight", evpid);
+
+	TAILQ_REMOVE(&ramqueue.q_inflight, evp, entry);
+
+	/* If the envelope is suspended, just mark it as pending */
+	if (evp->flags & RQ_ENVELOPE_SUSPEND) {
+		evp->state = RQ_EVPSTATE_PENDING;
+		return (0);
+	}
+
+	hq = tree_get(&holdqs[evp->type], holdq);
+	if (hq == NULL) {
+		hq = xcalloc(1, sizeof(*hq));
+		TAILQ_INIT(&hq->q);
+		tree_xset(&holdqs[evp->type], holdq, hq);
+		stat_increment("scheduler.ramqueue.holdq", 1);
+	}
+
+	/* If the holdq is full, just "tempfail" the envelope */
+	if (hq->count >= HOLDQ_MAXSIZE) {
+		evp->state = RQ_EVPSTATE_PENDING;
+		evp->flags |= RQ_ENVELOPE_UPDATE;
+		evp->flags |= RQ_ENVELOPE_OVERFLOW;
+		sorted_insert(&ramqueue, evp);
+		stat_increment("scheduler.ramqueue.hold-overflow", 1);
+		return (0);
+	}
+
+	evp->state = RQ_EVPSTATE_HELD;
+	evp->holdq = holdq;
+	/* This is an optimization: upon release, the envelopes will be
+	 * inserted in the pending queue from the first element to the last.
+	 * Since elements already in the queue were received first, they
+	 * were scheduled first, so they will be reinserted before the
+	 * current element.
+	 */
+	TAILQ_INSERT_HEAD(&hq->q, evp, entry);
+	hq->count += 1;
+	stat_increment("scheduler.ramqueue.hold", 1);
+
+	return (1);
+}
+
+static int
+scheduler_ram_release(int type, uint64_t holdq, int n)
+{
+	struct rq_holdq		*hq;
+	struct rq_envelope	*evp;
+	int			 i, update;
+
+	currtime = time(NULL);
+
+	hq = tree_get(&holdqs[type], holdq);
+	if (hq == NULL)
+		return (0);
+
+	if (n == -1) {
+		n = 0;
+		update = 1;
+	}
+	else
+		update = 0;
+
+	for (i = 0; n == 0 || i < n; i++) {
+		evp = TAILQ_FIRST(&hq->q);
+		if (evp == NULL)
+			break;
+
+		TAILQ_REMOVE(&hq->q, evp, entry);
+		hq->count -= 1;
+		evp->holdq = 0;
+
+		/* When released, all envelopes are put in the pending queue
+		 * and will be rescheduled immediately. As an optimization,
+		 * we could just schedule them directly.
+		 */
+		evp->state = RQ_EVPSTATE_PENDING;
+		if (update)
+			evp->flags |= RQ_ENVELOPE_UPDATE;
+		sorted_insert(&ramqueue, evp);
+	}
+
+	if (TAILQ_EMPTY(&hq->q)) {
+		tree_xpop(&holdqs[type], holdq);
+		free(hq);
+		stat_decrement("scheduler.ramqueue.holdq", 1);
+	}
+	stat_decrement("scheduler.ramqueue.hold", i);
+
+	return (i);
+}
+
+static int
+scheduler_ram_batch(int mask, int *delay, size_t *count, uint64_t *evpids, int *types)
+{
+	struct rq_envelope	*evp;
+	size_t			 i, n;
+	time_t			 t;
+
+	currtime = time(NULL);
+
+	rq_queue_schedule(&ramqueue);
+	if (tracing & TRACE_SCHEDULER)
+		rq_queue_dump(&ramqueue, "scheduler_ram_batch()");
+
+	i = 0;
+	n = 0;
+
+	for (;;) {
+
+		if (mask & SCHED_REMOVE && (evp = TAILQ_FIRST(&ramqueue.q_removed))) {
+			TAILQ_REMOVE(&ramqueue.q_removed, evp, entry);
+			types[i] = SCHED_REMOVE;
+			evpids[i] = evp->evpid;
+			rq_envelope_delete(&ramqueue, evp);
+
+			if (++i == *count)
+				break;
+		}
+
+		if (mask & SCHED_EXPIRE && (evp = TAILQ_FIRST(&ramqueue.q_expired))) {
+			TAILQ_REMOVE(&ramqueue.q_expired, evp, entry);
+			types[i] = SCHED_EXPIRE;
+			evpids[i] = evp->evpid;
+			rq_envelope_delete(&ramqueue, evp);
+
+			if (++i == *count)
+				break;
+		}
+
+		if (mask & SCHED_UPDATE && (evp = TAILQ_FIRST(&ramqueue.q_update))) {
+			TAILQ_REMOVE(&ramqueue.q_update, evp, entry);
+			types[i] = SCHED_UPDATE;
+			evpids[i] = evp->evpid;
+
+			if (evp->flags & RQ_ENVELOPE_OVERFLOW)
+				t = BACKOFF_OVERFLOW;
+			else if (evp->type == D_MTA)
+				t = BACKOFF_TRANSFER;
+			else
+				t = BACKOFF_DELIVERY;
+
+			evp->sched = scheduler_next(evp->ctime, t, 0);
+			evp->flags &= ~(RQ_ENVELOPE_UPDATE|RQ_ENVELOPE_OVERFLOW);
+			evp->state = RQ_EVPSTATE_PENDING;
+			if (!(evp->flags & RQ_ENVELOPE_SUSPEND))
+				sorted_insert(&ramqueue, evp);
+
+			if (++i == *count)
+				break;
+		}
+
+		if (mask & SCHED_BOUNCE && (evp = TAILQ_FIRST(&ramqueue.q_bounce))) {
+			TAILQ_REMOVE(&ramqueue.q_bounce, evp, entry);
+			types[i] = SCHED_BOUNCE;
+			evpids[i] = evp->evpid;
+
+			TAILQ_INSERT_TAIL(&ramqueue.q_inflight, evp, entry);
+			evp->state = RQ_EVPSTATE_INFLIGHT;
+			evp->t_inflight = currtime;
+
+			if (++i == *count)
+				break;
+		}
+
+		if (mask & SCHED_MDA && (evp = TAILQ_FIRST(&ramqueue.q_mda))) {
+			TAILQ_REMOVE(&ramqueue.q_mda, evp, entry);
+			types[i] = SCHED_MDA;
+			evpids[i] = evp->evpid;
+
+			TAILQ_INSERT_TAIL(&ramqueue.q_inflight, evp, entry);
+			evp->state = RQ_EVPSTATE_INFLIGHT;
+			evp->t_inflight = currtime;
+
+			if (++i == *count)
+				break;
+		}
+
+		if (mask & SCHED_MTA && (evp = TAILQ_FIRST(&ramqueue.q_mta))) {
+			TAILQ_REMOVE(&ramqueue.q_mta, evp, entry);
+			types[i] = SCHED_MTA;
+			evpids[i] = evp->evpid;
+
+			TAILQ_INSERT_TAIL(&ramqueue.q_inflight, evp, entry);
+			evp->state = RQ_EVPSTATE_INFLIGHT;
+			evp->t_inflight = currtime;
+
+			if (++i == *count)
+				break;
+		}
+
+		/* nothing seen this round */
+		if (i == n)
+			break;
+
+		n = i;
+	}
+
+	if (i) {
+		*count = i;
+		return (1);
+	}
+
+	if ((evp = TAILQ_FIRST(&ramqueue.q_pending))) {
+		if (evp->sched < evp->expire)
+			t = evp->sched;
+		else
+			t = evp->expire;
+		*delay = (t < currtime) ? 0 : (t - currtime);
+	}
+	else
+		*delay = -1;
+
+	return (0);
+}
+
+static size_t
+scheduler_ram_messages(uint32_t from, uint32_t *dst, size_t size)
+{
+	uint64_t	 id;
+	size_t		 n;
+	void		*i;
+
+	for (n = 0, i = NULL; n < size; n++) {
+		if (tree_iterfrom(&ramqueue.messages, &i, from, &id, NULL) == 0)
+			break;
+		dst[n] = id;
+	}
+
+	return (n);
+}
+
+static size_t
+scheduler_ram_envelopes(uint64_t from, struct evpstate *dst, size_t size)
+{
+	struct rq_message	*msg;
+	struct rq_envelope	*evp;
+	void			*i;
+	size_t			 n;
+
+	if ((msg = tree_get(&ramqueue.messages, evpid_to_msgid(from))) == NULL)
+		return (0);
+
+	for (n = 0, i = NULL; n < size; ) {
+
+		if (tree_iterfrom(&msg->envelopes, &i, from, NULL,
+		    (void**)&evp) == 0)
+			break;
+
+		if (evp->flags & (RQ_ENVELOPE_REMOVED | RQ_ENVELOPE_EXPIRED))
+			continue;
+
+		dst[n].evpid = evp->evpid;
+		dst[n].flags = 0;
+		dst[n].retry = 0;
+		dst[n].time = 0;
+
+		if (evp->state == RQ_EVPSTATE_PENDING) {
+			dst[n].time = evp->sched;
+			dst[n].flags = EF_PENDING;
+		}
+		else if (evp->state == RQ_EVPSTATE_SCHEDULED) {
+			dst[n].time = evp->t_scheduled;
+			dst[n].flags = EF_PENDING;
+		}
+		else if (evp->state == RQ_EVPSTATE_INFLIGHT) {
+			dst[n].time = evp->t_inflight;
+			dst[n].flags = EF_INFLIGHT;
+		}
+		else if (evp->state == RQ_EVPSTATE_HELD) {
+			/* same as scheduled */
+			dst[n].time = evp->t_scheduled;
+			dst[n].flags = EF_PENDING;
+			dst[n].flags |= EF_HOLD;
+		}
+		if (evp->flags & RQ_ENVELOPE_SUSPEND)
+			dst[n].flags |= EF_SUSPEND;
+
+		n++;
+	}
+
+	return (n);
+}
+
+static int
+scheduler_ram_schedule(uint64_t evpid)
+{
+	struct rq_message	*msg;
+	struct rq_envelope	*evp;
+	uint32_t		 msgid;
+	void			*i;
+	int			 r;
+
+	currtime = time(NULL);
+
+	if (evpid > 0xffffffff) {
+		msgid = evpid_to_msgid(evpid);
+		if ((msg = tree_get(&ramqueue.messages, msgid)) == NULL)
+			return (0);
+		if ((evp = tree_get(&msg->envelopes, evpid)) == NULL)
+			return (0);
+		if (evp->state == RQ_EVPSTATE_INFLIGHT)
+			return (0);
+		rq_envelope_schedule(&ramqueue, evp);
+		return (1);
+	}
+	else {
+		msgid = evpid;
+		if ((msg = tree_get(&ramqueue.messages, msgid)) == NULL)
+			return (0);
+		i = NULL;
+		r = 0;
+		while (tree_iter(&msg->envelopes, &i, NULL, (void*)(&evp))) {
+			if (evp->state == RQ_EVPSTATE_INFLIGHT)
+				continue;
+			rq_envelope_schedule(&ramqueue, evp);
+			r++;
+		}
+		return (r);
+	}
+}
+
+static int
+scheduler_ram_remove(uint64_t evpid)
+{
+	struct rq_message	*msg;
+	struct rq_envelope	*evp;
+	uint32_t		 msgid;
+	void			*i;
+	int			 r;
+
+	currtime = time(NULL);
+
+	if (evpid > 0xffffffff) {
+		msgid = evpid_to_msgid(evpid);
+		if ((msg = tree_get(&ramqueue.messages, msgid)) == NULL)
+			return (0);
+		if ((evp = tree_get(&msg->envelopes, evpid)) == NULL)
+			return (0);
+		if (rq_envelope_remove(&ramqueue, evp))
+			return (1);
+		return (0);
+	}
+	else {
+		msgid = evpid;
+		if ((msg = tree_get(&ramqueue.messages, msgid)) == NULL)
+			return (0);
+		i = NULL;
+		r = 0;
+		while (tree_iter(&msg->envelopes, &i, NULL, (void*)(&evp)))
+			if (rq_envelope_remove(&ramqueue, evp))
+				r++;
+		return (r);
+	}
+}
+
+static int
+scheduler_ram_suspend(uint64_t evpid)
+{
+	struct rq_message	*msg;
+	struct rq_envelope	*evp;
+	uint32_t		 msgid;
+	void			*i;
+	int			 r;
+
+	currtime = time(NULL);
+
+	if (evpid > 0xffffffff) {
+		msgid = evpid_to_msgid(evpid);
+		if ((msg = tree_get(&ramqueue.messages, msgid)) == NULL)
+			return (0);
+		if ((evp = tree_get(&msg->envelopes, evpid)) == NULL)
+			return (0);
+		if (rq_envelope_suspend(&ramqueue, evp))
+			return (1);
+		return (0);
+	}
+	else {
+		msgid = evpid;
+		if ((msg = tree_get(&ramqueue.messages, msgid)) == NULL)
+			return (0);
+		i = NULL;
+		r = 0;
+		while (tree_iter(&msg->envelopes, &i, NULL, (void*)(&evp)))
+			if (rq_envelope_suspend(&ramqueue, evp))
+				r++;
+		return (r);
+	}
+}
+
+static int
+scheduler_ram_resume(uint64_t evpid)
+{
+	struct rq_message	*msg;
+	struct rq_envelope	*evp;
+	uint32_t		 msgid;
+	void			*i;
+	int			 r;
+
+	currtime = time(NULL);
+
+	if (evpid > 0xffffffff) {
+		msgid = evpid_to_msgid(evpid);
+		if ((msg = tree_get(&ramqueue.messages, msgid)) == NULL)
+			return (0);
+		if ((evp = tree_get(&msg->envelopes, evpid)) == NULL)
+			return (0);
+		if (rq_envelope_resume(&ramqueue, evp))
+			return (1);
+		return (0);
+	}
+	else {
+		msgid = evpid;
+		if ((msg = tree_get(&ramqueue.messages, msgid)) == NULL)
+			return (0);
+		i = NULL;
+		r = 0;
+		while (tree_iter(&msg->envelopes, &i, NULL, (void*)(&evp)))
+			if (rq_envelope_resume(&ramqueue, evp))
+				r++;
+		return (r);
+	}
+}
+
+static int
+scheduler_ram_query(uint64_t evpid)
+{
+	uint32_t	msgid;
+
+	if (evpid > 0xffffffff)
+		msgid = evpid_to_msgid(evpid);
+	else
+		msgid = evpid;
+
+	if (tree_get(&ramqueue.messages, msgid) == NULL)
+		return (0);
+
+	return (1);
+}
+
+static void
+sorted_insert(struct rq_queue *rq, struct rq_envelope *evp)
+{
+	struct rq_envelope	*evp2;
+
+	SPLAY_INSERT(prioqtree, &rq->q_priotree, evp);
+	evp2 = SPLAY_NEXT(prioqtree, &rq->q_priotree, evp);
+	if (evp2)
+		TAILQ_INSERT_BEFORE(evp2, evp, entry);
+	else
+		TAILQ_INSERT_TAIL(&rq->q_pending, evp, entry);
+}
+
+static void
+rq_queue_init(struct rq_queue *rq)
+{
+	memset(rq, 0, sizeof *rq);
+	tree_init(&rq->messages);
+	TAILQ_INIT(&rq->q_pending);
+	TAILQ_INIT(&rq->q_inflight);
+	TAILQ_INIT(&rq->q_mta);
+	TAILQ_INIT(&rq->q_mda);
+	TAILQ_INIT(&rq->q_bounce);
+	TAILQ_INIT(&rq->q_update);
+	TAILQ_INIT(&rq->q_expired);
+	TAILQ_INIT(&rq->q_removed);
+	SPLAY_INIT(&rq->q_priotree);
+}
+
+static void
+rq_queue_merge(struct rq_queue *rq, struct rq_queue *update)
+{
+	struct rq_message	*message, *tomessage;
+	struct rq_envelope	*envelope;
+	uint64_t		 id;
+	void			*i;
+
+	while (tree_poproot(&update->messages, &id, (void*)&message)) {
+		if ((tomessage = tree_get(&rq->messages, id)) == NULL) {
+			/* message does not exist. re-use structure */
+			tree_xset(&rq->messages, id, message);
+			continue;
+		}
+		/* need to re-link all envelopes before merging them */
+		i = NULL;
+		while ((tree_iter(&message->envelopes, &i, &id,
+		    (void*)&envelope)))
+			envelope->message = tomessage;
+		tree_merge(&tomessage->envelopes, &message->envelopes);
+		free(message);
+		stat_decrement("scheduler.ramqueue.message", 1);
+	}
+
+	/* Sorted insert in the pending queue */
+	while ((envelope = TAILQ_FIRST(&update->q_pending))) {
+		TAILQ_REMOVE(&update->q_pending, envelope, entry);
+		sorted_insert(rq, envelope);
+	}
+
+	rq->evpcount += update->evpcount;
+}
+
+#define SCHEDULEMAX	1024
+
+static void
+rq_queue_schedule(struct rq_queue *rq)
+{
+	struct rq_envelope	*evp;
+	size_t			 n;
+
+	n = 0;
+	while ((evp = TAILQ_FIRST(&rq->q_pending))) {
+		if (evp->sched > currtime && evp->expire > currtime)
+			break;
+
+		if (n == SCHEDULEMAX)
+			break;
+
+		if (evp->state != RQ_EVPSTATE_PENDING)
+			errx(1, "evp:%016" PRIx64 " flags=0x%x", evp->evpid,
+			    evp->flags);
+
+		if (evp->expire <= currtime) {
+			TAILQ_REMOVE(&rq->q_pending, evp, entry);
+			SPLAY_REMOVE(prioqtree, &rq->q_priotree, evp);
+			TAILQ_INSERT_TAIL(&rq->q_expired, evp, entry);
+			evp->state = RQ_EVPSTATE_SCHEDULED;
+			evp->flags |= RQ_ENVELOPE_EXPIRED;
+			evp->t_scheduled = currtime;
+			continue;
+		}
+		rq_envelope_schedule(rq, evp);
+		n += 1;
+	}
+}
+
+static struct evplist *
+rq_envelope_list(struct rq_queue *rq, struct rq_envelope *evp)
+{
+	switch (evp->state) {
+	case RQ_EVPSTATE_PENDING:
+		return &rq->q_pending;
+
+	case RQ_EVPSTATE_SCHEDULED:
+		if (evp->flags & RQ_ENVELOPE_EXPIRED)
+			return &rq->q_expired;
+		if (evp->flags & RQ_ENVELOPE_REMOVED)
+			return &rq->q_removed;
+		if (evp->flags & RQ_ENVELOPE_UPDATE)
+			return &rq->q_update;
+		if (evp->type == D_MTA)
+			return &rq->q_mta;
+		if (evp->type == D_MDA)
+			return &rq->q_mda;
+		if (evp->type == D_BOUNCE)
+			return &rq->q_bounce;
+		errx(1, "%016" PRIx64 " bad evp type %d", evp->evpid, evp->type);
+
+	case RQ_EVPSTATE_INFLIGHT:
+		return &rq->q_inflight;
+
+	case RQ_EVPSTATE_HELD:
+		return (NULL);
+	}
+
+	errx(1, "%016" PRIx64 " bad state %d", evp->evpid, evp->state);
+	return (NULL);
+}
+
+static void
+rq_envelope_schedule(struct rq_queue *rq, struct rq_envelope *evp)
+{
+	struct rq_holdq	*hq;
+	struct evplist	*q = NULL;
+
+	switch (evp->type) {
+	case D_MTA:
+		q = &rq->q_mta;
+		break;
+	case D_MDA:
+		q = &rq->q_mda;
+		break;
+	case D_BOUNCE:
+		q = &rq->q_bounce;
+		break;
+	}
+
+	if (evp->flags & RQ_ENVELOPE_UPDATE)
+		q = &rq->q_update;
+
+	if (evp->state == RQ_EVPSTATE_HELD) {
+		hq = tree_xget(&holdqs[evp->type], evp->holdq);
+		TAILQ_REMOVE(&hq->q, evp, entry);
+		hq->count -= 1;
+		if (TAILQ_EMPTY(&hq->q)) {
+			tree_xpop(&holdqs[evp->type], evp->holdq);
+			free(hq);
+		}
+		evp->holdq = 0;
+		stat_decrement("scheduler.ramqueue.hold", 1);
+	}
+	else if (!(evp->flags & RQ_ENVELOPE_SUSPEND)) {
+		TAILQ_REMOVE(&rq->q_pending, evp, entry);
+		SPLAY_REMOVE(prioqtree, &rq->q_priotree, evp);
+	}
+
+	TAILQ_INSERT_TAIL(q, evp, entry);
+	evp->state = RQ_EVPSTATE_SCHEDULED;
+	evp->t_scheduled = currtime;
+}
+
+static int
+rq_envelope_remove(struct rq_queue *rq, struct rq_envelope *evp)
+{
+	struct rq_holdq	*hq;
+	struct evplist	*evl;
+
+	if (evp->flags & (RQ_ENVELOPE_REMOVED | RQ_ENVELOPE_EXPIRED))
+		return (0);
+	/*
+	 * If the envelope is in-flight, mark it for removal.
+	 */
+	if (evp->state == RQ_EVPSTATE_INFLIGHT) {
+		evp->flags |= RQ_ENVELOPE_REMOVED;
+		return (1);
+	}
+
+	if (evp->state == RQ_EVPSTATE_HELD) {
+		hq = tree_xget(&holdqs[evp->type], evp->holdq);
+		TAILQ_REMOVE(&hq->q, evp, entry);
+		hq->count -= 1;
+		if (TAILQ_EMPTY(&hq->q)) {
+			tree_xpop(&holdqs[evp->type], evp->holdq);
+			free(hq);
+		}
+		evp->holdq = 0;
+		stat_decrement("scheduler.ramqueue.hold", 1);
+	}
+	else if (!(evp->flags & RQ_ENVELOPE_SUSPEND)) {
+		evl = rq_envelope_list(rq, evp);
+		TAILQ_REMOVE(evl, evp, entry);
+		if (evl == &rq->q_pending)
+			SPLAY_REMOVE(prioqtree, &rq->q_priotree, evp);
+	}
+
+	TAILQ_INSERT_TAIL(&rq->q_removed, evp, entry);
+	evp->state = RQ_EVPSTATE_SCHEDULED;
+	evp->flags |= RQ_ENVELOPE_REMOVED;
+	evp->t_scheduled = currtime;
+
+	return (1);
+}
+
+static int
+rq_envelope_suspend(struct rq_queue *rq, struct rq_envelope *evp)
+{
+	struct rq_holdq	*hq;
+	struct evplist	*evl;
+
+	if (evp->flags & RQ_ENVELOPE_SUSPEND)
+		return (0);
+
+	if (evp->state == RQ_EVPSTATE_HELD) {
+		hq = tree_xget(&holdqs[evp->type], evp->holdq);
+		TAILQ_REMOVE(&hq->q, evp, entry);
+		hq->count -= 1;
+		if (TAILQ_EMPTY(&hq->q)) {
+			tree_xpop(&holdqs[evp->type], evp->holdq);
+			free(hq);
+		}
+		evp->holdq = 0;
+		evp->state = RQ_EVPSTATE_PENDING;
+		stat_decrement("scheduler.ramqueue.hold", 1);
+	}
+	else if (evp->state != RQ_EVPSTATE_INFLIGHT) {
+		evl = rq_envelope_list(rq, evp);
+		TAILQ_REMOVE(evl, evp, entry);
+		if (evl == &rq->q_pending)
+			SPLAY_REMOVE(prioqtree, &rq->q_priotree, evp);
+	}
+
+	evp->flags |= RQ_ENVELOPE_SUSPEND;
+
+	return (1);
+}
+
+static int
+rq_envelope_resume(struct rq_queue *rq, struct rq_envelope *evp)
+{
+	struct evplist	*evl;
+
+	if (!(evp->flags & RQ_ENVELOPE_SUSPEND))
+		return (0);
+
+	if (evp->state != RQ_EVPSTATE_INFLIGHT) {
+		evl = rq_envelope_list(rq, evp);
+		if (evl == &rq->q_pending)
+			sorted_insert(rq, evp);
+		else
+			TAILQ_INSERT_TAIL(evl, evp, entry);
+	}
+
+	evp->flags &= ~RQ_ENVELOPE_SUSPEND;
+
+	return (1);
+}
+
+static void
+rq_envelope_delete(struct rq_queue *rq, struct rq_envelope *evp)
+{
+	tree_xpop(&evp->message->envelopes, evp->evpid);
+	if (tree_empty(&evp->message->envelopes)) {
+		tree_xpop(&rq->messages, evp->message->msgid);
+		free(evp->message);
+		stat_decrement("scheduler.ramqueue.message", 1);
+	}
+
+	free(evp);
+	rq->evpcount--;
+	stat_decrement("scheduler.ramqueue.envelope", 1);
+}
+
+static const char *
+rq_envelope_to_text(struct rq_envelope *e)
+{
+	static char	buf[256];
+	char		t[64];
+
+	(void)snprintf(buf, sizeof buf, "evp:%016" PRIx64 " [", e->evpid);
+
+	if (e->type == D_BOUNCE)
+		(void)strlcat(buf, "bounce", sizeof buf);
+	else if (e->type == D_MDA)
+		(void)strlcat(buf, "mda", sizeof buf);
+	else if (e->type == D_MTA)
+		(void)strlcat(buf, "mta", sizeof buf);
+
+	(void)snprintf(t, sizeof t, ",expire=%s",
+	    duration_to_text(e->expire - currtime));
+	(void)strlcat(buf, t, sizeof buf);
+
+
+	switch (e->state) {
+	case RQ_EVPSTATE_PENDING:
+		(void)snprintf(t, sizeof t, ",pending=%s",
+		    duration_to_text(e->sched - currtime));
+		(void)strlcat(buf, t, sizeof buf);
+		break;
+
+	case RQ_EVPSTATE_SCHEDULED:
+		(void)snprintf(t, sizeof t, ",scheduled=%s",
+		    duration_to_text(currtime - e->t_scheduled));
+		(void)strlcat(buf, t, sizeof buf);
+		break;
+
+	case RQ_EVPSTATE_INFLIGHT:
+		(void)snprintf(t, sizeof t, ",inflight=%s",
+		    duration_to_text(currtime - e->t_inflight));
+		(void)strlcat(buf, t, sizeof buf);
+		break;
+
+	case RQ_EVPSTATE_HELD:
+		(void)snprintf(t, sizeof t, ",held=%s",
+		    duration_to_text(currtime - e->t_inflight));
+		(void)strlcat(buf, t, sizeof buf);
+		break;
+	default:
+		errx(1, "%016" PRIx64 " bad state %d", e->evpid, e->state);
+	}
+
+	if (e->flags & RQ_ENVELOPE_REMOVED)
+		(void)strlcat(buf, ",removed", sizeof buf);
+	if (e->flags & RQ_ENVELOPE_EXPIRED)
+		(void)strlcat(buf, ",expired", sizeof buf);
+	if (e->flags & RQ_ENVELOPE_SUSPEND)
+		(void)strlcat(buf, ",suspended", sizeof buf);
+
+	(void)strlcat(buf, "]", sizeof buf);
+
+	return (buf);
+}
+
+static void
+rq_queue_dump(struct rq_queue *rq, const char *name)
+{
+	struct rq_message	*message;
+	struct rq_envelope	*envelope;
+	void			*i, *j;
+	uint64_t		 id;
+
+	log_debug("debug: /--- ramqueue: %s", name);
+
+	i = NULL;
+	while ((tree_iter(&rq->messages, &i, &id, (void*)&message))) {
+		log_debug("debug: | msg:%08" PRIx32, message->msgid);
+		j = NULL;
+		while ((tree_iter(&message->envelopes, &j, &id,
+		    (void*)&envelope)))
+			log_debug("debug: |   %s",
+			    rq_envelope_to_text(envelope));
+	}
+	log_debug("debug: \\---");
+}
+
+static int
+rq_envelope_cmp(struct rq_envelope *e1, struct rq_envelope *e2)
+{
+	time_t	ref1, ref2;
+
+	ref1 = (e1->sched < e1->expire) ? e1->sched : e1->expire;
+	ref2 = (e2->sched < e2->expire) ? e2->sched : e2->expire;
+	if (ref1 != ref2)
+		return (ref1 < ref2) ? -1 : 1;
+
+	if (e1->evpid != e2->evpid)
+		return (e1->evpid < e2->evpid) ? -1 : 1;
+
+	return 0;
+}
+
+SPLAY_GENERATE(prioqtree, rq_envelope, t_entry, rq_envelope_cmp);
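
Editor's note on the retry schedule: scheduler_backoff() above computes a quadratic backoff, t0 + base * step^2, and scheduler_next() bumps step until the result lies in the future. A minimal standalone sketch of that arithmetic, using the constants from the diff (main() and the sample times are illustrative only, not part of smtpd):

#include <stdio.h>
#include <stdint.h>
#include <time.h>

#define BACKOFF_TRANSFER	400	/* base for D_MTA envelopes */
#define BACKOFF_DELIVERY	10	/* base for D_MDA/D_BOUNCE envelopes */

/* Same formula as scheduler_backoff() in the diff. */
static time_t
backoff(time_t t0, time_t base, uint32_t step)
{
	return (t0 + base * step * step);
}

int
main(void)
{
	time_t		t0 = 0;	/* envelope creation time (illustrative) */
	uint32_t	step;

	/* D_MTA retries land at +0, +400, +1600, +3600, +6400 seconds. */
	for (step = 0; step < 5; step++)
		printf("mta retry %u at +%lld s\n", step,
		    (long long)backoff(t0, BACKOFF_TRANSFER, step));

	/* D_MDA retries land at +0, +10, +40, +90, +160 seconds. */
	for (step = 0; step < 5; step++)
		printf("mda retry %u at +%lld s\n", step,
		    (long long)backoff(t0, BACKOFF_DELIVERY, step));

	return (0);
}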
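On the pending-queue ordering: q_pending stays sorted by rq_envelope_cmp(), whose key is whichever of sched or expire comes first, with evpid as a tie-breaker; sorted_insert() uses the splay tree only to find the TAILQ insertion point in logarithmic time. A sketch of the same ordering invariant using qsort() on a trimmed-down envelope struct (the sample values are made up):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>

struct evp {
	uint64_t	evpid;
	time_t		sched;	/* next retry time */
	time_t		expire;	/* expiry time */
};

/* Same ordering as rq_envelope_cmp() in the diff. */
static int
evp_cmp(const void *a, const void *b)
{
	const struct evp	*e1 = a, *e2 = b;
	time_t			 ref1, ref2;

	ref1 = (e1->sched < e1->expire) ? e1->sched : e1->expire;
	ref2 = (e2->sched < e2->expire) ? e2->sched : e2->expire;
	if (ref1 != ref2)
		return (ref1 < ref2) ? -1 : 1;
	if (e1->evpid != e2->evpid)
		return (e1->evpid < e2->evpid) ? -1 : 1;
	return 0;
}

int
main(void)
{
	/* An envelope about to expire can sort ahead of one scheduled
	 * later: the key is min(sched, expire), so expiries are seen
	 * in time by rq_queue_schedule(). */
	struct evp q[] = {
		{ 0x1, 500, 300 },	/* key 300: expires first */
		{ 0x2, 100, 900 },	/* key 100: scheduled first */
		{ 0x3, 100, 900 },	/* key 100: tie broken by evpid */
	};
	size_t i;

	qsort(q, 3, sizeof(q[0]), evp_cmp);
	for (i = 0; i < 3; i++)	/* prints 0x2, 0x3, 0x1 */
		printf("evp:%016llx\n", (unsigned long long)q[i].evpid);
	return (0);
}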
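Finally, on the evpid/msgid dispatch: scheduler_ram_schedule(), _remove(), _suspend(), _resume() and _query() accept either a full 64-bit envelope id or a bare 32-bit message id, and branch on evpid > 0xffffffff. The diff relies on evpid_to_msgid() defined elsewhere in the tree; assuming it extracts the high 32 bits of the envelope id (an assumption, not shown in this file), the dispatch looks like this (the sample id is arbitrary):

#include <stdio.h>
#include <stdint.h>

/* Assumed behaviour of smtpd's evpid_to_msgid(): the message id is
 * the top 32 bits of the envelope id. */
static uint32_t
evpid_to_msgid(uint64_t evpid)
{
	return (evpid >> 32);
}

int
main(void)
{
	uint64_t id = 0x0c04fc0b12345678ULL;	/* arbitrary sample */

	if (id > 0xffffffff)
		/* full envelope id: act on a single envelope */
		printf("evp:%016llx belongs to msg:%08x\n",
		    (unsigned long long)id, evpid_to_msgid(id));
	else
		/* bare message id: act on every envelope of the message */
		printf("msg:%08x\n", (uint32_t)id);

	return (0);
}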