summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- sys/netinet/ip.h 14
-rw-r--r-- sys/netinet/ip_ecn.c 85
-rw-r--r-- sys/netinet/ip_ecn.h 6
-rw-r--r-- sys/netinet/ip_input.c 19
-rw-r--r-- sys/netinet/ip_ipip.c 12
-rw-r--r-- sys/netinet/tcp.h 4
-rw-r--r-- sys/netinet/tcp_input.c 132
-rw-r--r-- sys/netinet/tcp_output.c 63
-rw-r--r-- sys/netinet/tcp_subr.c 3
-rw-r--r-- sys/netinet/tcp_timer.c 9
-rw-r--r-- sys/netinet/tcp_usrreq.c 7
-rw-r--r-- sys/netinet/tcp_var.h 29
-rw-r--r-- sys/netinet6/frag6.c 25
13 files changed, 372 insertions, 36 deletions
diff --git a/sys/netinet/ip.h b/sys/netinet/ip.h
index bad86dc5ef4..ce6e27ae26e 100644
--- a/sys/netinet/ip.h
+++ b/sys/netinet/ip.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: ip.h,v 1.7 2001/06/09 07:03:41 angelos Exp $ */
+/* $OpenBSD: ip.h,v 1.8 2002/05/16 14:10:51 kjc Exp $ */
/* $NetBSD: ip.h,v 1.9 1995/05/15 01:22:44 cgd Exp $ */
/*
@@ -81,7 +81,7 @@ struct ip {
#define IPTOS_RELIABILITY 0x04
/* IPTOS_LOWCOST 0x02 XXX */
#if 1
-/* ECN bits proposed by Sally Floyd */
+/* ECN RFC3168 obsoletes RFC2481, and these will be deprecated soon. */
#define IPTOS_CE 0x01 /* congestion experienced */
#define IPTOS_ECT 0x02 /* ECN-capable transport */
#endif
@@ -99,6 +99,16 @@ struct ip {
#define IPTOS_PREC_ROUTINE 0x00
/*
+ * ECN (Explicit Congestion Notification) codepoints in RFC3168
+ * mapped to the lower 2 bits of the TOS field.
+ */
+#define IPTOS_ECN_NOTECT 0x00 /* not-ECT */
+#define IPTOS_ECN_ECT1 0x01 /* ECN-capable transport (1) */
+#define IPTOS_ECN_ECT0 0x02 /* ECN-capable transport (0) */
+#define IPTOS_ECN_CE 0x03 /* congestion experienced */
+#define IPTOS_ECN_MASK 0x03 /* ECN field mask */
+
+/*
* Definitions for options.
*/
#define IPOPT_COPIED(o) ((o)&0x80)
diff --git a/sys/netinet/ip_ecn.c b/sys/netinet/ip_ecn.c
index 3664e847255..6f3a1f49e6a 100644
--- a/sys/netinet/ip_ecn.c
+++ b/sys/netinet/ip_ecn.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: ip_ecn.c,v 1.3 2001/06/08 03:53:45 angelos Exp $ */
+/* $OpenBSD: ip_ecn.c,v 1.4 2002/05/16 14:10:51 kjc Exp $ */
/* $KAME: ip_ecn.c,v 1.9 2000/10/01 12:44:48 itojun Exp $ */
/*
@@ -55,6 +55,37 @@
#include <netinet/ip_ecn.h>
/*
+ * ECN and TOS (or TCLASS) processing rules at tunnel encapsulation and
+ * decapsulation from RFC3168:
+ *
+ * Outer Hdr at Inner Hdr at
+ * Encapsulator Decapsulator
+ * Header fields: -------------------- ------------
+ * DS Field copied from inner hdr no change
+ * ECN Field constructed by (I) constructed by (E)
+ *
+ * ECN_ALLOWED (full functionality):
+ * (I) if the ECN field in the inner header is set to CE, then set the
+ * ECN field in the outer header to ECT(0).
+ * otherwise, copy the ECN field to the outer header.
+ *
+ * (E) if the ECN field in the outer header is set to CE and the ECN
+ * field of the inner header is not-ECT, drop the packet.
+ * if the ECN field in the inner header is set to ECT(0) or ECT(1)
+ * and the ECN field in the outer header is set to CE, then copy CE to
+ * the inner header. otherwise, make no change to the inner header.
+ *
+ * ECN_FORBIDDEN (limited functionality):
+ * (I) set the ECN field to not-ECT in the outer header.
+ *
+ * (E) if the ECN field in the outer header is set to CE, drop the packet.
+ * otherwise, make no change to the ECN field in the inner header.
+ *
+ * the drop rule is for backward compatibility and protection against
+ * erasure of CE.
+ */
+
+/*
* modify outer ECN (TOS) field on ingress operation (tunnel encapsulation).
* call it after you've done the default initialization/copy for the outer.
*/
@@ -67,12 +98,21 @@ ip_ecn_ingress(mode, outer, inner)
if (!outer || !inner)
panic("NULL pointer passed to ip_ecn_ingress");
+ *outer = *inner;
switch (mode) {
case ECN_ALLOWED: /* ECN allowed */
- *outer &= ~IPTOS_CE;
+ /*
+ * full-functionality: if the inner is CE, set ECT(0)
+ * to the outer. otherwise, copy the ECN field.
+ */
+ if ((*inner & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
+ *outer &= ~IPTOS_ECN_ECT1;
break;
case ECN_FORBIDDEN: /* ECN forbidden */
- *outer &= ~(IPTOS_ECT | IPTOS_CE);
+ /*
+ * limited-functionality: set not-ECT to the outer
+ */
+ *outer &= ~IPTOS_ECN_MASK;
break;
case ECN_NOCARE: /* no consideration to ECN */
break;
@@ -82,8 +122,9 @@ ip_ecn_ingress(mode, outer, inner)
/*
* modify inner ECN (TOS) field on egress operation (tunnel decapsulation).
* call it after you've done the default initialization/copy for the inner.
+ * the caller should drop the packet if the return value is 0.
*/
-void
+int
ip_ecn_egress(mode, outer, inner)
int mode;
u_int8_t *outer;
@@ -94,13 +135,28 @@ ip_ecn_egress(mode, outer, inner)
switch (mode) {
case ECN_ALLOWED:
- if (*outer & IPTOS_CE)
- *inner |= IPTOS_CE;
+ /*
+ * full-functionality: if the outer is CE and the inner is
+ * not-ECT, should drop it. otherwise, copy CE.
+ */
+ if ((*outer & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
+ if ((*inner & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT)
+ return (0);
+ *inner |= IPTOS_ECN_CE;
+ }
break;
case ECN_FORBIDDEN: /* ECN forbidden */
+ /*
+ * limited-functionality: if the outer is CE, should drop it.
+ * otherwise, leave the inner.
+ */
+ if ((*outer & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
+ return (0);
+ break;
case ECN_NOCARE: /* no consideration to ECN */
break;
}
+ return (1);
}
#ifdef INET6
@@ -115,28 +171,31 @@ ip6_ecn_ingress(mode, outer, inner)
if (!outer || !inner)
panic("NULL pointer passed to ip6_ecn_ingress");
- outer8 = (ntohl(*outer) >> 20) & 0xff;
inner8 = (ntohl(*inner) >> 20) & 0xff;
ip_ecn_ingress(mode, &outer8, &inner8);
*outer &= ~htonl(0xff << 20);
*outer |= htonl((u_int32_t)outer8 << 20);
}
-void
+int
ip6_ecn_egress(mode, outer, inner)
int mode;
u_int32_t *outer;
u_int32_t *inner;
{
- u_int8_t outer8, inner8;
+ u_int8_t outer8, inner8, oinner8;
if (!outer || !inner)
panic("NULL pointer passed to ip6_ecn_egress");
outer8 = (ntohl(*outer) >> 20) & 0xff;
- inner8 = (ntohl(*inner) >> 20) & 0xff;
- ip_ecn_egress(mode, &outer8, &inner8);
- *inner &= ~htonl(0xff << 20);
- *inner |= htonl((u_int32_t)inner8 << 20);
+ inner8 = oinner8 = (ntohl(*inner) >> 20) & 0xff;
+ if (ip_ecn_egress(mode, &outer8, &inner8) == 0)
+ return (0);
+ if (inner8 != oinner8) {
+ *inner &= ~htonl(0xff << 20);
+ *inner |= htonl((u_int32_t)inner8 << 20);
+ }
+ return (1);
}
#endif
diff --git a/sys/netinet/ip_ecn.h b/sys/netinet/ip_ecn.h
index af627b0f1cd..7c8a0e615c2 100644
--- a/sys/netinet/ip_ecn.h
+++ b/sys/netinet/ip_ecn.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: ip_ecn.h,v 1.4 2002/03/14 01:27:11 millert Exp $ */
+/* $OpenBSD: ip_ecn.h,v 1.5 2002/05/16 14:10:51 kjc Exp $ */
/* $KAME: ip_ecn.h,v 1.5 2000/03/27 04:58:38 sumikawa Exp $ */
/*
@@ -45,10 +45,10 @@
#if defined(_KERNEL)
extern void ip_ecn_ingress(int, u_int8_t *, u_int8_t *);
-extern void ip_ecn_egress(int, u_int8_t *, u_int8_t *);
+extern int ip_ecn_egress(int, u_int8_t *, u_int8_t *);
#ifdef INET6
extern void ip6_ecn_ingress(int, u_int32_t *, u_int32_t *);
-extern void ip6_ecn_egress(int, u_int32_t *, u_int32_t *);
+extern int ip6_ecn_egress(int, u_int32_t *, u_int32_t *);
#endif /* INET6 */
#endif /* _KERNEL */
#endif /* _NETINET_IP_ECN_H_ */
diff --git a/sys/netinet/ip_input.c b/sys/netinet/ip_input.c
index dc1f610a32b..86c473e346c 100644
--- a/sys/netinet/ip_input.c
+++ b/sys/netinet/ip_input.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: ip_input.c,v 1.101 2002/04/24 01:05:12 angelos Exp $ */
+/* $OpenBSD: ip_input.c,v 1.102 2002/05/16 14:10:51 kjc Exp $ */
/* $NetBSD: ip_input.c,v 1.30 1996/03/16 23:53:58 christos Exp $ */
/*
@@ -738,6 +738,7 @@ ip_reass(ipqe, fp)
struct mbuf *t;
int hlen = ipqe->ipqe_ip->ip_hl << 2;
int i, next;
+ u_int8_t ecn, ecn0;
/*
* Presence of header sizes in mbufs
@@ -766,6 +767,22 @@ ip_reass(ipqe, fp)
}
/*
+ * Handle ECN by comparing this segment with the first one;
+ * if CE is set, do not lose CE.
+ * drop if CE and not-ECT are mixed for the same packet.
+ */
+ ecn = ipqe->ipqe_ip->ip_tos & IPTOS_ECN_MASK;
+ ecn0 = fp->ipq_fragq.lh_first->ipqe_ip->ip_tos & IPTOS_ECN_MASK;
+ if (ecn == IPTOS_ECN_CE) {
+ if (ecn0 == IPTOS_ECN_NOTECT)
+ goto dropfrag;
+ if (ecn0 != IPTOS_ECN_CE)
+ fp->ipq_fragq.lh_first->ipqe_ip->ip_tos |= IPTOS_ECN_CE;
+ }
+ if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT)
+ goto dropfrag;
+
+ /*
* Find a segment which begins after this one does.
*/
for (p = NULL, q = fp->ipq_fragq.lh_first; q != NULL;
diff --git a/sys/netinet/ip_ipip.c b/sys/netinet/ip_ipip.c
index 8d680706688..36f20ddb75a 100644
--- a/sys/netinet/ip_ipip.c
+++ b/sys/netinet/ip_ipip.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: ip_ipip.c,v 1.22 2001/12/06 20:14:53 angelos Exp $ */
+/* $OpenBSD: ip_ipip.c,v 1.23 2002/05/16 14:10:51 kjc Exp $ */
/*
* The authors of this code are John Ioannidis (ji@tla.org),
* Angelos D. Keromytis (kermit@csd.uch.gr) and
@@ -254,7 +254,10 @@ ipip_input(struct mbuf *m, int iphlen, struct ifnet *gifp)
case 4:
ipo = mtod(m, struct ip *);
nxt = ipo->ip_p;
- ip_ecn_egress(ECN_ALLOWED, &otos, &ipo->ip_tos);
+ if (!ip_ecn_egress(ECN_ALLOWED, &otos, &ipo->ip_tos)) {
+ m_freem(m);
+ return;
+ }
break;
#endif /* INET */
@@ -263,7 +266,10 @@ ipip_input(struct mbuf *m, int iphlen, struct ifnet *gifp)
ip6 = (struct ip6_hdr *) ipo;
nxt = ip6->ip6_nxt;
itos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
- ip_ecn_egress(ECN_ALLOWED, &otos, &itos);
+ if (!ip_ecn_egress(ECN_ALLOWED, &otos, &itos)) {
+ m_freem(m);
+ return;
+ }
ip6->ip6_flow &= ~htonl(0xff << 20);
ip6->ip6_flow |= htonl((u_int32_t) itos << 20);
break;
diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h
index 56f26181ce2..5eacb39a45a 100644
--- a/sys/netinet/tcp.h
+++ b/sys/netinet/tcp.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: tcp.h,v 1.9 2001/06/09 07:03:43 angelos Exp $ */
+/* $OpenBSD: tcp.h,v 1.10 2002/05/16 14:10:51 kjc Exp $ */
/* $NetBSD: tcp.h,v 1.8 1995/04/17 05:32:58 cgd Exp $ */
/*
@@ -65,6 +65,8 @@ struct tcphdr {
#define TH_PUSH 0x08
#define TH_ACK 0x10
#define TH_URG 0x20
+#define TH_ECE 0x40
+#define TH_CWR 0x80
u_int16_t th_win; /* window */
u_int16_t th_sum; /* checksum */
u_int16_t th_urp; /* urgent pointer */
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
index 847eacf5947..fd775bf73ee 100644
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: tcp_input.c,v 1.110 2002/03/19 14:58:54 itojun Exp $ */
+/* $OpenBSD: tcp_input.c,v 1.111 2002/05/16 14:10:51 kjc Exp $ */
/* $NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $ */
/*
@@ -142,6 +142,18 @@ do { \
#define ND6_HINT(tp)
#endif
+#ifdef TCP_ECN
+/*
+ * ECN (Explicit Congestion Notification) support based on RFC3168
+ * implementation note:
+ * snd_last is used to track a recovery phase.
+ * when cwnd is reduced, snd_last is set to snd_max.
+ * while snd_last > snd_una, the sender is in a recovery phase and
+ * its cwnd should not be reduced again.
+ * snd_last follows snd_una when not in a recovery phase.
+ */
+#endif
+
/*
* Macro to compute ACK transmission behavior. Delay the ACK unless
* we have already delayed an ACK (must send an ACK every two segments).
@@ -419,6 +431,9 @@ tcp_input(struct mbuf *m, ...)
int error, s;
#endif /* IPSEC */
int af;
+#ifdef TCP_ECN
+ u_char iptos;
+#endif
va_start(ap, m);
iphlen = va_arg(ap, int);
@@ -515,6 +530,10 @@ tcp_input(struct mbuf *m, ...)
#endif
ti = mtod(m, struct tcpiphdr *);
+#ifdef TCP_ECN
+ /* save ip_tos before clearing it for checksum */
+ iptos = ip->ip_tos;
+#endif
/*
* Checksum extended TCP header and data.
*/
@@ -542,6 +561,9 @@ tcp_input(struct mbuf *m, ...)
case AF_INET6:
ipv6 = mtod(m, struct ip6_hdr *);
tlen = m->m_pkthdr.len - iphlen;
+#ifdef TCP_ECN
+ iptos = (ntohl(ipv6->ip6_flow) >> 20) & 0xff;
+#endif
/* Be proactive about malicious use of IPv4 mapped address */
if (IN6_IS_ADDR_V4MAPPED(&ipv6->ip6_src) ||
@@ -896,6 +918,13 @@ findpcb:
tp->rcv_lastend = th->th_seq + tlen;
}
#endif /* TCP_SACK */
+#ifdef TCP_ECN
+ /* if congestion experienced, set ECE bit in subsequent packets. */
+ if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
+ tp->t_flags |= TF_RCVD_CE;
+ tcpstat.tcps_ecn_rcvce++;
+ }
+#endif
/*
* Header prediction: check for the two common cases
* of a uni-directional data xfer. If the packet has
@@ -911,7 +940,11 @@ findpcb:
* the socket buffer and note that we need a delayed ack.
*/
if (tp->t_state == TCPS_ESTABLISHED &&
+#ifdef TCP_ECN
+ (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) == TH_ACK &&
+#else
(tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
+#endif
(!ts_present || TSTMP_GEQ(ts_val, tp->ts_recent)) &&
th->th_seq == tp->rcv_nxt &&
tiwin && tiwin == tp->snd_wnd &&
@@ -948,12 +981,15 @@ findpcb:
ND6_HINT(tp);
sbdrop(&so->so_snd, acked);
tp->snd_una = th->th_ack;
-#if defined(TCP_SACK)
+#if defined(TCP_SACK) || defined(TCP_ECN)
/*
* We want snd_last to track snd_una so
* as to avoid sequence wraparound problems
* for very large transfers.
*/
+#ifdef TCP_ECN
+ if (SEQ_GT(tp->snd_una, tp->snd_last))
+#endif
tp->snd_last = tp->snd_una;
#endif /* TCP_SACK */
#if defined(TCP_SACK) && defined(TCP_FACK)
@@ -1188,7 +1224,7 @@ findpcb:
}
tp->irs = th->th_seq;
tcp_sendseqinit(tp);
-#if defined (TCP_SACK)
+#if defined (TCP_SACK) || defined(TCP_ECN)
tp->snd_last = tp->snd_una;
#endif /* TCP_SACK */
#if defined(TCP_SACK) && defined(TCP_FACK)
@@ -1196,6 +1232,16 @@ findpcb:
tp->retran_data = 0;
tp->snd_awnd = 0;
#endif /* TCP_FACK */
+#ifdef TCP_ECN
+ /*
+ * if both ECE and CWR flag bits are set, peer is ECN capable.
+ */
+ if (tcp_do_ecn &&
+ (tiflags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) {
+ tp->t_flags |= TF_ECN_PERMIT;
+ tcpstat.tcps_ecn_accepts++;
+ }
+#endif
tcp_rcvseqinit(tp);
tp->t_flags |= TF_ACKNOW;
tp->t_state = TCPS_SYN_RECEIVED;
@@ -1241,6 +1287,11 @@ findpcb:
SEQ_GT(th->th_ack, tp->snd_max)))
goto dropwithreset;
if (tiflags & TH_RST) {
+#ifdef TCP_ECN
+ /* if ECN is enabled, fall back to non-ecn at rexmit */
+ if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
+ goto drop;
+#endif
if (tiflags & TH_ACK)
tp = tcp_drop(tp, ECONNREFUSED);
goto drop;
@@ -1266,6 +1317,24 @@ findpcb:
if ((tp->t_flags & TF_SACK_PERMIT) == 0)
tp->sack_disable = 1;
#endif
+#ifdef TCP_ECN
+ /*
+ * if ECE is set but CWR is not set for SYN-ACK, or
+ * both ECE and CWR are set for simultaneous open,
+ * peer is ECN capable.
+ */
+ if (tcp_do_ecn) {
+ if ((tiflags & (TH_ACK|TH_ECE|TH_CWR))
+ == (TH_ACK|TH_ECE) ||
+ (tiflags & (TH_ACK|TH_ECE|TH_CWR))
+ == (TH_ECE|TH_CWR)) {
+ tp->t_flags |= TF_ECN_PERMIT;
+ tiflags &= ~(TH_ECE|TH_CWR);
+ tcpstat.tcps_ecn_accepts++;
+ }
+ }
+#endif
+
if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
tcpstat.tcps_connects++;
soisconnected(so);
@@ -1471,6 +1540,11 @@ trimthenstep6:
switch (tp->t_state) {
case TCPS_SYN_RECEIVED:
+#ifdef TCP_ECN
+ /* if ECN is enabled, fall back to non-ecn at rexmit */
+ if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
+ goto drop;
+#endif
so->so_error = ECONNREFUSED;
goto close;
@@ -1551,6 +1625,39 @@ trimthenstep6:
case TCPS_CLOSING:
case TCPS_LAST_ACK:
case TCPS_TIME_WAIT:
+#ifdef TCP_ECN
+ /*
+ * if we receive ECE and are not already in a recovery phase,
+ * reduce cwnd by half but don't slow-start.
+ * advance snd_last to snd_max so that cwnd is not reduced again
+ * until all outstanding packets are acked.
+ */
+ if (tcp_do_ecn && (tiflags & TH_ECE)) {
+ if ((tp->t_flags & TF_ECN_PERMIT) &&
+ SEQ_GEQ(tp->snd_una, tp->snd_last)) {
+ u_int win;
+
+ win = min(tp->snd_wnd, tp->snd_cwnd) / tp->t_maxseg;
+ if (win > 1) {
+ tp->snd_ssthresh = win / 2 * tp->t_maxseg;
+ tp->snd_cwnd = tp->snd_ssthresh;
+ tp->snd_last = tp->snd_max;
+ tp->t_flags |= TF_SEND_CWR;
+ tcpstat.tcps_cwr_ecn++;
+ }
+ }
+ tcpstat.tcps_ecn_rcvece++;
+ }
+ /*
+ * if we receive CWR, we know that the peer has reduced
+ * its congestion window. stop sending ecn-echo.
+ */
+ if ((tiflags & TH_CWR)) {
+ tp->t_flags &= ~TF_RCVD_CE;
+ tcpstat.tcps_ecn_rcvcwr++;
+ }
+#endif /* TCP_ECN */
+
if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
/*
* Duplicate/old ACK processing.
@@ -1621,7 +1728,7 @@ trimthenstep6:
ulmin(tp->snd_wnd, tp->snd_cwnd) /
2 / tp->t_maxseg;
-#if defined(TCP_SACK)
+#if defined(TCP_SACK) || defined(TCP_ECN)
if (SEQ_LT(th->th_ack, tp->snd_last)){
/*
* False fast retx after
@@ -1641,6 +1748,12 @@ trimthenstep6:
if (!tp->sack_disable) {
TCP_TIMER_DISARM(tp, TCPT_REXMT);
tp->t_rtttime = 0;
+#ifdef TCP_ECN
+ tp->t_flags |= TF_SEND_CWR;
+#endif
+#if 1 /* TCP_ECN */
+ tcpstat.tcps_cwr_frecovery++;
+#endif
tcpstat.tcps_sndrexmitfast++;
#if defined(TCP_SACK) && defined(TCP_FACK)
tp->t_dupacks = tcprexmtthresh;
@@ -1666,6 +1779,12 @@ trimthenstep6:
tp->t_rtttime = 0;
tp->snd_nxt = th->th_ack;
tp->snd_cwnd = tp->t_maxseg;
+#ifdef TCP_ECN
+ tp->t_flags |= TF_SEND_CWR;
+#endif
+#if 1 /* TCP_ECN */
+ tcpstat.tcps_cwr_frecovery++;
+#endif
tcpstat.tcps_sndrexmitfast++;
(void) tcp_output(tp);
@@ -1818,6 +1937,11 @@ trimthenstep6:
if (sb_notify(&so->so_snd))
sowwakeup(so);
tp->snd_una = th->th_ack;
+#ifdef TCP_ECN
+ /* sync snd_last with snd_una */
+ if (SEQ_GT(tp->snd_una, tp->snd_last))
+ tp->snd_last = tp->snd_una;
+#endif
if (SEQ_LT(tp->snd_nxt, tp->snd_una))
tp->snd_nxt = tp->snd_una;
#if defined (TCP_SACK) && defined (TCP_FACK)
diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c
index 625bca510c8..2453d4c132c 100644
--- a/sys/netinet/tcp_output.c
+++ b/sys/netinet/tcp_output.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: tcp_output.c,v 1.49 2002/03/08 03:49:58 provos Exp $ */
+/* $OpenBSD: tcp_output.c,v 1.50 2002/05/16 14:10:51 kjc Exp $ */
/* $NetBSD: tcp_output.c,v 1.16 1997/06/03 16:17:09 kml Exp $ */
/*
@@ -237,6 +237,9 @@ tcp_output(tp)
#ifdef TCP_SIGNATURE
unsigned int sigoff;
#endif /* TCP_SIGNATURE */
+#ifdef TCP_ECN
+ int needect;
+#endif
#if defined(TCP_SACK) && defined(TCP_SIGNATURE) && defined(DIAGNOSTIC)
if (!tp->sack_disable && (tp->t_flags & TF_SIGNATURE))
@@ -815,6 +818,39 @@ send:
bcopy((caddr_t)opt, (caddr_t)(th + 1), optlen);
th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
}
+#ifdef TCP_ECN
+ if (tcp_do_ecn) {
+ /*
+ * if we have received congestion experienced segs,
+ * set ECE bit.
+ */
+ if (tp->t_flags & TF_RCVD_CE) {
+ flags |= TH_ECE;
+ tcpstat.tcps_ecn_sndece++;
+ }
+ if (!(tp->t_flags & TF_DISABLE_ECN)) {
+ /*
+ * if this is a SYN seg, set ECE and CWR.
+ * set only ECE for SYN-ACK if peer supports ECN.
+ */
+ if ((flags & (TH_SYN|TH_ACK)) == TH_SYN)
+ flags |= (TH_ECE|TH_CWR);
+ else if ((tp->t_flags & TF_ECN_PERMIT) &&
+ (flags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK))
+ flags |= TH_ECE;
+ }
+ /*
+ * if we have reduced the congestion window, notify
+ * the peer by setting CWR bit.
+ */
+ if ((tp->t_flags & TF_ECN_PERMIT) &&
+ (tp->t_flags & TF_SEND_CWR)) {
+ flags |= TH_CWR;
+ tp->t_flags &= ~TF_SEND_CWR;
+ tcpstat.tcps_ecn_sndcwr++;
+ }
+ }
+#endif
th->th_flags = flags;
/*
@@ -1038,6 +1074,23 @@ send:
*/
m->m_pkthdr.len = hdrlen + len;
+#ifdef TCP_ECN
+ /*
+ * if peer is ECN capable, set the ECT bit in the IP header.
+ * but don't set ECT for a pure ack, a retransmit or a window probe.
+ */
+ needect = 0;
+ if (tcp_do_ecn && (tp->t_flags & TF_ECN_PERMIT)) {
+ if (len == 0 || SEQ_LT(tp->snd_nxt, tp->snd_max) ||
+ (tp->t_force && len == 1)) {
+ /* don't set ECT */
+ } else {
+ needect = 1;
+ tcpstat.tcps_ecn_sndect++;
+ }
+ }
+#endif
+
switch (tp->pf) {
case 0: /*default to PF_INET*/
#ifdef INET
@@ -1049,6 +1102,10 @@ send:
ip->ip_len = m->m_pkthdr.len;
ip->ip_ttl = tp->t_inpcb->inp_ip.ip_ttl;
ip->ip_tos = tp->t_inpcb->inp_ip.ip_tos;
+#ifdef TCP_ECN
+ if (needect)
+ ip->ip_tos |= IPTOS_ECN_ECT0;
+#endif
}
error = ip_output(m, tp->t_inpcb->inp_options,
&tp->t_inpcb->inp_route,
@@ -1067,6 +1124,10 @@ send:
sizeof(struct ip6_hdr);
ipv6->ip6_nxt = IPPROTO_TCP;
ipv6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL);
+#ifdef TCP_ECN
+ if (needect)
+ ipv6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
+#endif
}
error = ip6_output(m, tp->t_inpcb->inp_outputopts6,
&tp->t_inpcb->inp_route6,
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
index ffc8b254cd4..733cccc1ff5 100644
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: tcp_subr.c,v 1.61 2002/03/14 01:27:11 millert Exp $ */
+/* $OpenBSD: tcp_subr.c,v 1.62 2002/05/16 14:10:51 kjc Exp $ */
/* $NetBSD: tcp_subr.c,v 1.22 1996/02/13 23:44:00 christos Exp $ */
/*
@@ -134,6 +134,7 @@ int tcp_do_rfc1323 = TCP_DO_RFC1323;
#endif
int tcp_do_sack = TCP_DO_SACK; /* RFC 2018 selective ACKs */
int tcp_ack_on_push = 0; /* set to enable immediate ACK-on-PUSH */
+int tcp_do_ecn = 0; /* RFC3168 ECN enabled/disabled? */
#ifndef TCBHASHSIZE
#define TCBHASHSIZE 128
diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c
index 5b86cac3218..36a3716f769 100644
--- a/sys/netinet/tcp_timer.c
+++ b/sys/netinet/tcp_timer.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: tcp_timer.c,v 1.28 2002/03/08 03:49:58 provos Exp $ */
+/* $OpenBSD: tcp_timer.c,v 1.29 2002/05/16 14:10:51 kjc Exp $ */
/* $NetBSD: tcp_timer.c,v 1.14 1996/02/13 23:44:09 christos Exp $ */
/*
@@ -330,6 +330,13 @@ tcp_timer_rexmt(void *arg)
tp->snd_cwnd = tp->t_maxseg;
tp->snd_ssthresh = win * tp->t_maxseg;
tp->t_dupacks = 0;
+#ifdef TCP_ECN
+ tp->snd_last = tp->snd_max;
+ tp->t_flags |= TF_SEND_CWR;
+#endif
+#if 1 /* TCP_ECN */
+ tcpstat.tcps_cwr_timeout++;
+#endif
}
(void) tcp_output(tp);
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
index a2130df23a1..b7488a6d1df 100644
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: tcp_usrreq.c,v 1.59 2002/03/14 01:27:11 millert Exp $ */
+/* $OpenBSD: tcp_usrreq.c,v 1.60 2002/05/16 14:10:51 kjc Exp $ */
/* $NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $ */
/*
@@ -924,6 +924,11 @@ tcp_sysctl(name, namelen, oldp, oldlenp, newp, newlen)
case TCPCTL_ACK_ON_PUSH:
return (sysctl_int(oldp, oldlenp, newp, newlen,
&tcp_ack_on_push));
+#ifdef TCP_ECN
+ case TCPCTL_ECN:
+ return (sysctl_int(oldp, oldlenp, newp, newlen,
+ &tcp_do_ecn));
+#endif
default:
return (ENOPROTOOPT);
}
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
index 60aa5162f44..2be685a288d 100644
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -1,4 +1,4 @@
-/* $OpenBSD: tcp_var.h,v 1.42 2002/03/14 01:27:11 millert Exp $ */
+/* $OpenBSD: tcp_var.h,v 1.43 2002/05/16 14:10:51 kjc Exp $ */
/* $NetBSD: tcp_var.h,v 1.17 1996/02/13 23:44:24 christos Exp $ */
/*
@@ -68,7 +68,7 @@ struct tcpcb {
short t_dupacks; /* consecutive dup acks recd */
u_short t_maxseg; /* maximum segment size */
char t_force; /* 1 if forcing out a byte */
- u_short t_flags;
+ u_int t_flags;
#define TF_ACKNOW 0x0001 /* ack peer immediately */
#define TF_DELACK 0x0002 /* ack, but try to delay it */
#define TF_NODELAY 0x0004 /* don't delay packets to coalesce */
@@ -80,6 +80,12 @@ struct tcpcb {
#define TF_RCVD_TSTMP 0x0100 /* a timestamp was received in SYN */
#define TF_SACK_PERMIT 0x0200 /* other side said I could SACK */
#define TF_SIGNATURE 0x0400 /* require TCP MD5 signature */
+#ifdef TCP_ECN
+#define TF_ECN_PERMIT 0x00008000 /* other side said I could ECN */
+#define TF_RCVD_CE 0x00010000 /* send ECE in subsequent segs */
+#define TF_SEND_CWR 0x00020000 /* send CWR in next seg */
+#define TF_DISABLE_ECN 0x00040000 /* disable ECN for this connection */
+#endif
struct mbuf *t_template; /* skeletal packet for transmit */
struct inpcb *t_inpcb; /* back pointer to internet pcb */
@@ -107,7 +113,7 @@ struct tcpcb {
int retran_data; /* amount of outstanding retx. data */
#endif /* TCP_FACK */
#endif /* TCP_SACK */
-#if defined(TCP_SACK)
+#if defined(TCP_SACK) || defined(TCP_ECN)
tcp_seq snd_last; /* for use in fast recovery */
#endif
/* receive sequence variables */
@@ -305,6 +311,18 @@ struct tcpstat {
u_int64_t tcps_rcvgoodsig; /* rcvd good TCP signatures */
u_int32_t tcps_inhwcsum; /* input hardware-checksummed packets */
u_int32_t tcps_outhwcsum; /* output hardware-checksummed packets */
+
+ /* ECN stats */
+ u_int32_t tcps_ecn_accepts; /* ecn connections accepted */
+ u_int32_t tcps_ecn_rcvece; /* # of rcvd ece */
+ u_int32_t tcps_ecn_rcvcwr; /* # of rcvd cwr */
+ u_int32_t tcps_ecn_rcvce; /* # of rcvd ce in ip header */
+ u_int32_t tcps_ecn_sndect; /* # of ect sent */
+ u_int32_t tcps_ecn_sndece; /* # of ece sent */
+ u_int32_t tcps_ecn_sndcwr; /* # of cwr sent */
+ u_int32_t tcps_cwr_ecn; /* # of cwnd reduced by ecn */
+ u_int32_t tcps_cwr_frecovery; /* # of cwnd reduced by fastrecovery */
+ u_int32_t tcps_cwr_timeout; /* # of cwnd reduced by timeout */
};
/*
@@ -324,7 +342,8 @@ struct tcpstat {
#define TCPCTL_MSSDFLT 11 /* Default maximum segment size */
#define TCPCTL_RSTPPSLIMIT 12 /* RST pps limit */
#define TCPCTL_ACK_ON_PUSH 13 /* ACK immediately on PUSH */
-#define TCPCTL_MAXID 14
+#define TCPCTL_ECN 14 /* RFC3168 ECN */
+#define TCPCTL_MAXID 15
#define TCPCTL_NAMES { \
{ 0, 0 }, \
@@ -341,6 +360,7 @@ struct tcpstat {
{ "mssdflt", CTLTYPE_INT }, \
{ "rstppslimit", CTLTYPE_INT }, \
{ "ackonpush", CTLTYPE_INT }, \
+ { "ecn", CTLTYPE_INT }, \
}
struct tcp_ident_mapping {
@@ -359,6 +379,7 @@ extern int tcp_ack_on_push; /* ACK immediately on PUSH */
extern int tcp_do_sack; /* SACK enabled/disabled */
extern struct pool sackhl_pool;
#endif
+extern int tcp_do_ecn; /* RFC3168 ECN enabled/disabled? */
int tcp_attach(struct socket *);
void tcp_canceltimers(struct tcpcb *);
diff --git a/sys/netinet6/frag6.c b/sys/netinet6/frag6.c
index e49893f844c..ab453aea1ad 100644
--- a/sys/netinet6/frag6.c
+++ b/sys/netinet6/frag6.c
@@ -1,4 +1,4 @@
-/* $OpenBSD: frag6.c,v 1.13 2002/03/15 10:50:59 itojun Exp $ */
+/* $OpenBSD: frag6.c,v 1.14 2002/05/16 14:10:51 kjc Exp $ */
/* $KAME: frag6.c,v 1.31 2001/05/17 13:45:34 jinmei Exp $ */
/*
@@ -50,6 +50,8 @@
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
#include <netinet/icmp6.h>
+#include <netinet/in_systm.h> /* for ECN definitions */
+#include <netinet/ip.h> /* for ECN definitions */
#include <dev/rndvar.h>
@@ -184,6 +186,7 @@ frag6_input(mp, offp, proto)
static struct route_in6 ro;
struct sockaddr_in6 *dst;
#endif
+ u_int8_t ecn, ecn0;
ip6 = mtod(m, struct ip6_hdr *);
#ifndef PULLDOWN_TEST
@@ -388,6 +391,26 @@ frag6_input(mp, offp, proto)
}
/*
+ * Handle ECN by comparing this segment with the first one;
+ * if CE is set, do not lose CE.
+ * drop if CE and not-ECT are mixed for the same packet.
+ */
+ ecn = (ntohl(ip6->ip6_flow) >> 20) & IPTOS_ECN_MASK;
+ ecn0 = (ntohl(q6->ip6q_down->ip6af_head) >> 20) & IPTOS_ECN_MASK;
+ if (ecn == IPTOS_ECN_CE) {
+ if (ecn0 == IPTOS_ECN_NOTECT) {
+ free(ip6af, M_FTABLE);
+ goto dropfrag;
+ }
+ if (ecn0 != IPTOS_ECN_CE)
+ q6->ip6q_down->ip6af_head |= htonl(IPTOS_ECN_CE << 20);
+ }
+ if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT) {
+ free(ip6af, M_FTABLE);
+ goto dropfrag;
+ }
+
+ /*
* Find a segment which begins after this one does.
*/
for (af6 = q6->ip6q_down; af6 != (struct ip6asfrag *)q6;