diff options
-rw-r--r-- | sys/netinet/ip.h | 14 | ||||
-rw-r--r-- | sys/netinet/ip_ecn.c | 85 | ||||
-rw-r--r-- | sys/netinet/ip_ecn.h | 6 | ||||
-rw-r--r-- | sys/netinet/ip_input.c | 19 | ||||
-rw-r--r-- | sys/netinet/ip_ipip.c | 12 | ||||
-rw-r--r-- | sys/netinet/tcp.h | 4 | ||||
-rw-r--r-- | sys/netinet/tcp_input.c | 132 | ||||
-rw-r--r-- | sys/netinet/tcp_output.c | 63 | ||||
-rw-r--r-- | sys/netinet/tcp_subr.c | 3 | ||||
-rw-r--r-- | sys/netinet/tcp_timer.c | 9 | ||||
-rw-r--r-- | sys/netinet/tcp_usrreq.c | 7 | ||||
-rw-r--r-- | sys/netinet/tcp_var.h | 29 | ||||
-rw-r--r-- | sys/netinet6/frag6.c | 25 |
13 files changed, 372 insertions, 36 deletions
diff --git a/sys/netinet/ip.h b/sys/netinet/ip.h index bad86dc5ef4..ce6e27ae26e 100644 --- a/sys/netinet/ip.h +++ b/sys/netinet/ip.h @@ -1,4 +1,4 @@ -/* $OpenBSD: ip.h,v 1.7 2001/06/09 07:03:41 angelos Exp $ */ +/* $OpenBSD: ip.h,v 1.8 2002/05/16 14:10:51 kjc Exp $ */ /* $NetBSD: ip.h,v 1.9 1995/05/15 01:22:44 cgd Exp $ */ /* @@ -81,7 +81,7 @@ struct ip { #define IPTOS_RELIABILITY 0x04 /* IPTOS_LOWCOST 0x02 XXX */ #if 1 -/* ECN bits proposed by Sally Floyd */ +/* ECN RFC3168 obsoletes RFC2481, and these will be deprecated soon. */ #define IPTOS_CE 0x01 /* congestion experienced */ #define IPTOS_ECT 0x02 /* ECN-capable transport */ #endif @@ -99,6 +99,16 @@ struct ip { #define IPTOS_PREC_ROUTINE 0x00 /* + * ECN (Explicit Congestion Notification) codepoints in RFC3168 + * mapped to the lower 2 bits of the TOS field. + */ +#define IPTOS_ECN_NOTECT 0x00 /* not-ECT */ +#define IPTOS_ECN_ECT1 0x01 /* ECN-capable transport (1) */ +#define IPTOS_ECN_ECT0 0x02 /* ECN-capable transport (0) */ +#define IPTOS_ECN_CE 0x03 /* congestion experienced */ +#define IPTOS_ECN_MASK 0x03 /* ECN field mask */ + +/* * Definitions for options. 
*/ #define IPOPT_COPIED(o) ((o)&0x80) diff --git a/sys/netinet/ip_ecn.c b/sys/netinet/ip_ecn.c index 3664e847255..6f3a1f49e6a 100644 --- a/sys/netinet/ip_ecn.c +++ b/sys/netinet/ip_ecn.c @@ -1,4 +1,4 @@ -/* $OpenBSD: ip_ecn.c,v 1.3 2001/06/08 03:53:45 angelos Exp $ */ +/* $OpenBSD: ip_ecn.c,v 1.4 2002/05/16 14:10:51 kjc Exp $ */ /* $KAME: ip_ecn.c,v 1.9 2000/10/01 12:44:48 itojun Exp $ */ /* @@ -55,6 +55,37 @@ #include <netinet/ip_ecn.h> /* + * ECN and TOS (or TCLASS) processing rules at tunnel encapsulation and + * decapsulation from RFC3168: + * + * Outer Hdr at Inner Hdr at + * Encapsulator Decapsulator + * Header fields: -------------------- ------------ + * DS Field copied from inner hdr no change + * ECN Field constructed by (I) constructed by (E) + * + * ECN_ALLOWED (full functionality): + * (I) if the ECN field in the inner header is set to CE, then set the + * ECN field in the outer header to ECT(0). + * otherwise, copy the ECN field to the outer header. + * + * (E) if the ECN field in the outer header is set to CE and the ECN + * field of the inner header is not-ECT, drop the packet. + * if the ECN field in the inner header is set to ECT(0) or ECT(1) + * and the ECN field in the outer header is set to CE, then copy CE to + * the inner header. otherwise, make no change to the inner header. + * + * ECN_FORBIDDEN (limited functionality): + * (I) set the ECN field to not-ECT in the outer header. + * + * (E) if the ECN field in the outer header is set to CE, drop the packet. + * otherwise, make no change to the ECN field in the inner header. + * + * the drop rule is for backward compatibility and protection against + * erasure of CE. + */ + +/* * modify outer ECN (TOS) field on ingress operation (tunnel encapsulation). * call it after you've done the default initialization/copy for the outer. 
*/ @@ -67,12 +98,21 @@ ip_ecn_ingress(mode, outer, inner) if (!outer || !inner) panic("NULL pointer passed to ip_ecn_ingress"); + *outer = *inner; switch (mode) { case ECN_ALLOWED: /* ECN allowed */ - *outer &= ~IPTOS_CE; + /* + * full-functionality: if the inner is CE, set ECT(0) + * to the outer. otherwise, copy the ECN field. + */ + if ((*inner & IPTOS_ECN_MASK) == IPTOS_ECN_CE) + *outer &= ~IPTOS_ECN_ECT1; break; case ECN_FORBIDDEN: /* ECN forbidden */ - *outer &= ~(IPTOS_ECT | IPTOS_CE); + /* + * limited-functionality: set not-ECT to the outer + */ + *outer &= ~IPTOS_ECN_MASK; break; case ECN_NOCARE: /* no consideration to ECN */ break; @@ -82,8 +122,9 @@ ip_ecn_ingress(mode, outer, inner) /* * modify inner ECN (TOS) field on egress operation (tunnel decapsulation). * call it after you've done the default initialization/copy for the inner. + * the caller should drop the packet if the return value is 0. */ -void +int ip_ecn_egress(mode, outer, inner) int mode; u_int8_t *outer; @@ -94,13 +135,28 @@ ip_ecn_egress(mode, outer, inner) switch (mode) { case ECN_ALLOWED: - if (*outer & IPTOS_CE) - *inner |= IPTOS_CE; + /* + * full-functionality: if the outer is CE and the inner is + * not-ECT, should drop it. otherwise, copy CE. + */ + if ((*outer & IPTOS_ECN_MASK) == IPTOS_ECN_CE) { + if ((*inner & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT) + return (0); + *inner |= IPTOS_ECN_CE; + } break; case ECN_FORBIDDEN: /* ECN forbidden */ + /* + * limited-functionality: if the outer is CE, should drop it. + * otherwise, leave the inner. 
+ */ + if ((*outer & IPTOS_ECN_MASK) == IPTOS_ECN_CE) + return (0); + break; case ECN_NOCARE: /* no consideration to ECN */ break; } + return (1); } #ifdef INET6 @@ -115,28 +171,31 @@ ip6_ecn_ingress(mode, outer, inner) if (!outer || !inner) panic("NULL pointer passed to ip6_ecn_ingress"); - outer8 = (ntohl(*outer) >> 20) & 0xff; inner8 = (ntohl(*inner) >> 20) & 0xff; ip_ecn_ingress(mode, &outer8, &inner8); *outer &= ~htonl(0xff << 20); *outer |= htonl((u_int32_t)outer8 << 20); } -void +int ip6_ecn_egress(mode, outer, inner) int mode; u_int32_t *outer; u_int32_t *inner; { - u_int8_t outer8, inner8; + u_int8_t outer8, inner8, oinner8; if (!outer || !inner) panic("NULL pointer passed to ip6_ecn_egress"); outer8 = (ntohl(*outer) >> 20) & 0xff; - inner8 = (ntohl(*inner) >> 20) & 0xff; - ip_ecn_egress(mode, &outer8, &inner8); - *inner &= ~htonl(0xff << 20); - *inner |= htonl((u_int32_t)inner8 << 20); + inner8 = oinner8 = (ntohl(*inner) >> 20) & 0xff; + if (ip_ecn_egress(mode, &outer8, &inner8) == 0) + return (0); + if (inner8 != oinner8) { + *inner &= ~htonl(0xff << 20); + *inner |= htonl((u_int32_t)inner8 << 20); + } + return (1); } #endif diff --git a/sys/netinet/ip_ecn.h b/sys/netinet/ip_ecn.h index af627b0f1cd..7c8a0e615c2 100644 --- a/sys/netinet/ip_ecn.h +++ b/sys/netinet/ip_ecn.h @@ -1,4 +1,4 @@ -/* $OpenBSD: ip_ecn.h,v 1.4 2002/03/14 01:27:11 millert Exp $ */ +/* $OpenBSD: ip_ecn.h,v 1.5 2002/05/16 14:10:51 kjc Exp $ */ /* $KAME: ip_ecn.h,v 1.5 2000/03/27 04:58:38 sumikawa Exp $ */ /* @@ -45,10 +45,10 @@ #if defined(_KERNEL) extern void ip_ecn_ingress(int, u_int8_t *, u_int8_t *); -extern void ip_ecn_egress(int, u_int8_t *, u_int8_t *); +extern int ip_ecn_egress(int, u_int8_t *, u_int8_t *); #ifdef INET6 extern void ip6_ecn_ingress(int, u_int32_t *, u_int32_t *); -extern void ip6_ecn_egress(int, u_int32_t *, u_int32_t *); +extern int ip6_ecn_egress(int, u_int32_t *, u_int32_t *); #endif /* INET6 */ #endif /* _KERNEL */ #endif /* _NETINET_IP_ECN_H_ */ diff --git 
a/sys/netinet/ip_input.c b/sys/netinet/ip_input.c index dc1f610a32b..86c473e346c 100644 --- a/sys/netinet/ip_input.c +++ b/sys/netinet/ip_input.c @@ -1,4 +1,4 @@ -/* $OpenBSD: ip_input.c,v 1.101 2002/04/24 01:05:12 angelos Exp $ */ +/* $OpenBSD: ip_input.c,v 1.102 2002/05/16 14:10:51 kjc Exp $ */ /* $NetBSD: ip_input.c,v 1.30 1996/03/16 23:53:58 christos Exp $ */ /* @@ -738,6 +738,7 @@ ip_reass(ipqe, fp) struct mbuf *t; int hlen = ipqe->ipqe_ip->ip_hl << 2; int i, next; + u_int8_t ecn, ecn0; /* * Presence of header sizes in mbufs @@ -766,6 +767,22 @@ ip_reass(ipqe, fp) } /* + * Handle ECN by comparing this segment with the first one; + * if CE is set, do not lose CE. + * drop if CE and not-ECT are mixed for the same packet. + */ + ecn = ipqe->ipqe_ip->ip_tos & IPTOS_ECN_MASK; + ecn0 = fp->ipq_fragq.lh_first->ipqe_ip->ip_tos & IPTOS_ECN_MASK; + if (ecn == IPTOS_ECN_CE) { + if (ecn0 == IPTOS_ECN_NOTECT) + goto dropfrag; + if (ecn0 != IPTOS_ECN_CE) + fp->ipq_fragq.lh_first->ipqe_ip->ip_tos |= IPTOS_ECN_CE; + } + if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT) + goto dropfrag; + + /* * Find a segment which begins after this one does. */ for (p = NULL, q = fp->ipq_fragq.lh_first; q != NULL; diff --git a/sys/netinet/ip_ipip.c b/sys/netinet/ip_ipip.c index 8d680706688..36f20ddb75a 100644 --- a/sys/netinet/ip_ipip.c +++ b/sys/netinet/ip_ipip.c @@ -1,4 +1,4 @@ -/* $OpenBSD: ip_ipip.c,v 1.22 2001/12/06 20:14:53 angelos Exp $ */ +/* $OpenBSD: ip_ipip.c,v 1.23 2002/05/16 14:10:51 kjc Exp $ */ /* * The authors of this code are John Ioannidis (ji@tla.org), * Angelos D. 
Keromytis (kermit@csd.uch.gr) and @@ -254,7 +254,10 @@ ipip_input(struct mbuf *m, int iphlen, struct ifnet *gifp) case 4: ipo = mtod(m, struct ip *); nxt = ipo->ip_p; - ip_ecn_egress(ECN_ALLOWED, &otos, &ipo->ip_tos); + if (!ip_ecn_egress(ECN_ALLOWED, &otos, &ipo->ip_tos)) { + m_freem(m); + return; + } break; #endif /* INET */ @@ -263,7 +266,10 @@ ipip_input(struct mbuf *m, int iphlen, struct ifnet *gifp) ip6 = (struct ip6_hdr *) ipo; nxt = ip6->ip6_nxt; itos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; - ip_ecn_egress(ECN_ALLOWED, &otos, &itos); + if (!ip_ecn_egress(ECN_ALLOWED, &otos, &itos)) { + m_freem(m); + return; + } ip6->ip6_flow &= ~htonl(0xff << 20); ip6->ip6_flow |= htonl((u_int32_t) itos << 20); break; diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h index 56f26181ce2..5eacb39a45a 100644 --- a/sys/netinet/tcp.h +++ b/sys/netinet/tcp.h @@ -1,4 +1,4 @@ -/* $OpenBSD: tcp.h,v 1.9 2001/06/09 07:03:43 angelos Exp $ */ +/* $OpenBSD: tcp.h,v 1.10 2002/05/16 14:10:51 kjc Exp $ */ /* $NetBSD: tcp.h,v 1.8 1995/04/17 05:32:58 cgd Exp $ */ /* @@ -65,6 +65,8 @@ struct tcphdr { #define TH_PUSH 0x08 #define TH_ACK 0x10 #define TH_URG 0x20 +#define TH_ECE 0x40 +#define TH_CWR 0x80 u_int16_t th_win; /* window */ u_int16_t th_sum; /* checksum */ u_int16_t th_urp; /* urgent pointer */ diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c index 847eacf5947..fd775bf73ee 100644 --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -1,4 +1,4 @@ -/* $OpenBSD: tcp_input.c,v 1.110 2002/03/19 14:58:54 itojun Exp $ */ +/* $OpenBSD: tcp_input.c,v 1.111 2002/05/16 14:10:51 kjc Exp $ */ /* $NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $ */ /* @@ -142,6 +142,18 @@ do { \ #define ND6_HINT(tp) #endif +#ifdef TCP_ECN +/* + * ECN (Explicit Congestion Notification) support based on RFC3168 + * implementation note: + * snd_last is used to track a recovery phase. + * when cwnd is reduced, snd_last is set to snd_max. 
+ * while snd_last > snd_una, the sender is in a recovery phase and + * its cwnd should not be reduced again. + * snd_last follows snd_una when not in a recovery phase. + */ +#endif + /* * Macro to compute ACK transmission behavior. Delay the ACK unless * we have already delayed an ACK (must send an ACK every two segments). @@ -419,6 +431,9 @@ tcp_input(struct mbuf *m, ...) int error, s; #endif /* IPSEC */ int af; +#ifdef TCP_ECN + u_char iptos; +#endif va_start(ap, m); iphlen = va_arg(ap, int); @@ -515,6 +530,10 @@ tcp_input(struct mbuf *m, ...) #endif ti = mtod(m, struct tcpiphdr *); +#ifdef TCP_ECN + /* save ip_tos before clearing it for checksum */ + iptos = ip->ip_tos; +#endif /* * Checksum extended TCP header and data. */ @@ -542,6 +561,9 @@ tcp_input(struct mbuf *m, ...) case AF_INET6: ipv6 = mtod(m, struct ip6_hdr *); tlen = m->m_pkthdr.len - iphlen; +#ifdef TCP_ECN + iptos = (ntohl(ipv6->ip6_flow) >> 20) & 0xff; +#endif /* Be proactive about malicious use of IPv4 mapped address */ if (IN6_IS_ADDR_V4MAPPED(&ipv6->ip6_src) || @@ -896,6 +918,13 @@ findpcb: tp->rcv_lastend = th->th_seq + tlen; } #endif /* TCP_SACK */ +#ifdef TCP_ECN + /* if congestion experienced, set ECE bit in subsequent packets. */ + if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) { + tp->t_flags |= TF_RCVD_CE; + tcpstat.tcps_ecn_rcvce++; + } +#endif /* * Header prediction: check for the two common cases * of a uni-directional data xfer. If the packet has @@ -911,7 +940,11 @@ findpcb: * the socket buffer and note that we need a delayed ack. 
*/ if (tp->t_state == TCPS_ESTABLISHED && +#ifdef TCP_ECN + (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) == TH_ACK && +#else (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && +#endif (!ts_present || TSTMP_GEQ(ts_val, tp->ts_recent)) && th->th_seq == tp->rcv_nxt && tiwin && tiwin == tp->snd_wnd && @@ -948,12 +981,15 @@ findpcb: ND6_HINT(tp); sbdrop(&so->so_snd, acked); tp->snd_una = th->th_ack; -#if defined(TCP_SACK) +#if defined(TCP_SACK) || defined(TCP_ECN) /* * We want snd_last to track snd_una so * as to avoid sequence wraparound problems * for very large transfers. */ +#ifdef TCP_ECN + if (SEQ_GT(tp->snd_una, tp->snd_last)) +#endif tp->snd_last = tp->snd_una; #endif /* TCP_SACK */ #if defined(TCP_SACK) && defined(TCP_FACK) @@ -1188,7 +1224,7 @@ findpcb: } tp->irs = th->th_seq; tcp_sendseqinit(tp); -#if defined (TCP_SACK) +#if defined (TCP_SACK) || defined(TCP_ECN) tp->snd_last = tp->snd_una; #endif /* TCP_SACK */ #if defined(TCP_SACK) && defined(TCP_FACK) @@ -1196,6 +1232,16 @@ findpcb: tp->retran_data = 0; tp->snd_awnd = 0; #endif /* TCP_FACK */ +#ifdef TCP_ECN + /* + * if both ECE and CWR flag bits are set, peer is ECN capable. + */ + if (tcp_do_ecn && + (tiflags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) { + tp->t_flags |= TF_ECN_PERMIT; + tcpstat.tcps_ecn_accepts++; + } +#endif tcp_rcvseqinit(tp); tp->t_flags |= TF_ACKNOW; tp->t_state = TCPS_SYN_RECEIVED; @@ -1241,6 +1287,11 @@ findpcb: SEQ_GT(th->th_ack, tp->snd_max))) goto dropwithreset; if (tiflags & TH_RST) { +#ifdef TCP_ECN + /* if ECN is enabled, fall back to non-ecn at rexmit */ + if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN)) + goto drop; +#endif if (tiflags & TH_ACK) tp = tcp_drop(tp, ECONNREFUSED); goto drop; @@ -1266,6 +1317,24 @@ findpcb: if ((tp->t_flags & TF_SACK_PERMIT) == 0) tp->sack_disable = 1; #endif +#ifdef TCP_ECN + /* + * if ECE is set but CWR is not set for SYN-ACK, or + * both ECE and CWR are set for simultaneous open, + * peer is ECN capable. 
+ */ + if (tcp_do_ecn) { + if ((tiflags & (TH_ACK|TH_ECE|TH_CWR)) + == (TH_ACK|TH_ECE) || + (tiflags & (TH_ACK|TH_ECE|TH_CWR)) + == (TH_ECE|TH_CWR)) { + tp->t_flags |= TF_ECN_PERMIT; + tiflags &= ~(TH_ECE|TH_CWR); + tcpstat.tcps_ecn_accepts++; + } + } +#endif + if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) { tcpstat.tcps_connects++; soisconnected(so); @@ -1471,6 +1540,11 @@ trimthenstep6: switch (tp->t_state) { case TCPS_SYN_RECEIVED: +#ifdef TCP_ECN + /* if ECN is enabled, fall back to non-ecn at rexmit */ + if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN)) + goto drop; +#endif so->so_error = ECONNREFUSED; goto close; @@ -1551,6 +1625,39 @@ trimthenstep6: case TCPS_CLOSING: case TCPS_LAST_ACK: case TCPS_TIME_WAIT: +#ifdef TCP_ECN + /* + * if we receive ECE and are not already in recovery phase, + * reduce cwnd by half but don't slow-start. + * advance snd_last to snd_max not to reduce cwnd again + * until all outstanding packets are acked. + */ + if (tcp_do_ecn && (tiflags & TH_ECE)) { + if ((tp->t_flags & TF_ECN_PERMIT) && + SEQ_GEQ(tp->snd_una, tp->snd_last)) { + u_int win; + + win = min(tp->snd_wnd, tp->snd_cwnd) / tp->t_maxseg; + if (win > 1) { + tp->snd_ssthresh = win / 2 * tp->t_maxseg; + tp->snd_cwnd = tp->snd_ssthresh; + tp->snd_last = tp->snd_max; + tp->t_flags |= TF_SEND_CWR; + tcpstat.tcps_cwr_ecn++; + } + } + tcpstat.tcps_ecn_rcvece++; + } + /* + * if we receive CWR, we know that the peer has reduced + * its congestion window. stop sending ecn-echo. + */ + if ((tiflags & TH_CWR)) { + tp->t_flags &= ~TF_RCVD_CE; + tcpstat.tcps_ecn_rcvcwr++; + } +#endif /* TCP_ECN */ + if (SEQ_LEQ(th->th_ack, tp->snd_una)) { /* * Duplicate/old ACK processing. 
@@ -1621,7 +1728,7 @@ trimthenstep6: ulmin(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; -#if defined(TCP_SACK) +#if defined(TCP_SACK) || defined(TCP_ECN) if (SEQ_LT(th->th_ack, tp->snd_last)){ /* * False fast retx after @@ -1641,6 +1748,12 @@ trimthenstep6: if (!tp->sack_disable) { TCP_TIMER_DISARM(tp, TCPT_REXMT); tp->t_rtttime = 0; +#ifdef TCP_ECN + tp->t_flags |= TF_SEND_CWR; +#endif +#if 1 /* TCP_ECN */ + tcpstat.tcps_cwr_frecovery++; +#endif tcpstat.tcps_sndrexmitfast++; #if defined(TCP_SACK) && defined(TCP_FACK) tp->t_dupacks = tcprexmtthresh; @@ -1666,6 +1779,12 @@ trimthenstep6: tp->t_rtttime = 0; tp->snd_nxt = th->th_ack; tp->snd_cwnd = tp->t_maxseg; +#ifdef TCP_ECN + tp->t_flags |= TF_SEND_CWR; +#endif +#if 1 /* TCP_ECN */ + tcpstat.tcps_cwr_frecovery++; +#endif tcpstat.tcps_sndrexmitfast++; (void) tcp_output(tp); @@ -1818,6 +1937,11 @@ trimthenstep6: if (sb_notify(&so->so_snd)) sowwakeup(so); tp->snd_una = th->th_ack; +#ifdef TCP_ECN + /* sync snd_last with snd_una */ + if (SEQ_GT(tp->snd_una, tp->snd_last)) + tp->snd_last = tp->snd_una; +#endif if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; #if defined (TCP_SACK) && defined (TCP_FACK) diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c index 625bca510c8..2453d4c132c 100644 --- a/sys/netinet/tcp_output.c +++ b/sys/netinet/tcp_output.c @@ -1,4 +1,4 @@ -/* $OpenBSD: tcp_output.c,v 1.49 2002/03/08 03:49:58 provos Exp $ */ +/* $OpenBSD: tcp_output.c,v 1.50 2002/05/16 14:10:51 kjc Exp $ */ /* $NetBSD: tcp_output.c,v 1.16 1997/06/03 16:17:09 kml Exp $ */ /* @@ -237,6 +237,9 @@ tcp_output(tp) #ifdef TCP_SIGNATURE unsigned int sigoff; #endif /* TCP_SIGNATURE */ +#ifdef TCP_ECN + int needect; +#endif #if defined(TCP_SACK) && defined(TCP_SIGNATURE) && defined(DIAGNOSTIC) if (!tp->sack_disable && (tp->t_flags & TF_SIGNATURE)) @@ -815,6 +818,39 @@ send: bcopy((caddr_t)opt, (caddr_t)(th + 1), optlen); th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; } +#ifdef TCP_ECN + if 
(tcp_do_ecn) { + /* + * if we have received congestion experienced segs, + * set ECE bit. + */ + if (tp->t_flags & TF_RCVD_CE) { + flags |= TH_ECE; + tcpstat.tcps_ecn_sndece++; + } + if (!(tp->t_flags & TF_DISABLE_ECN)) { + /* + * if this is a SYN seg, set ECE and CWR. + * set only ECE for SYN-ACK if peer supports ECN. + */ + if ((flags & (TH_SYN|TH_ACK)) == TH_SYN) + flags |= (TH_ECE|TH_CWR); + else if ((tp->t_flags & TF_ECN_PERMIT) && + (flags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK)) + flags |= TH_ECE; + } + /* + * if we have reduced the congestion window, notify + * the peer by setting CWR bit. + */ + if ((tp->t_flags & TF_ECN_PERMIT) && + (tp->t_flags & TF_SEND_CWR)) { + flags |= TH_CWR; + tp->t_flags &= ~TF_SEND_CWR; + tcpstat.tcps_ecn_sndcwr++; + } + } +#endif th->th_flags = flags; /* @@ -1038,6 +1074,23 @@ send: */ m->m_pkthdr.len = hdrlen + len; +#ifdef TCP_ECN + /* + * if peer is ECN capable, set the ECT bit in the IP header. + * but don't set ECT for a pure ack, a retransmit or a window probe. 
+ */ + needect = 0; + if (tcp_do_ecn && (tp->t_flags & TF_ECN_PERMIT)) { + if (len == 0 || SEQ_LT(tp->snd_nxt, tp->snd_max) || + (tp->t_force && len == 1)) { + /* don't set ECT */ + } else { + needect = 1; + tcpstat.tcps_ecn_sndect++; + } + } +#endif + switch (tp->pf) { case 0: /*default to PF_INET*/ #ifdef INET @@ -1049,6 +1102,10 @@ send: ip->ip_len = m->m_pkthdr.len; ip->ip_ttl = tp->t_inpcb->inp_ip.ip_ttl; ip->ip_tos = tp->t_inpcb->inp_ip.ip_tos; +#ifdef TCP_ECN + if (needect) + ip->ip_tos |= IPTOS_ECN_ECT0; +#endif } error = ip_output(m, tp->t_inpcb->inp_options, &tp->t_inpcb->inp_route, @@ -1067,6 +1124,10 @@ send: sizeof(struct ip6_hdr); ipv6->ip6_nxt = IPPROTO_TCP; ipv6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL); +#ifdef TCP_ECN + if (needect) + ipv6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); +#endif } error = ip6_output(m, tp->t_inpcb->inp_outputopts6, &tp->t_inpcb->inp_route6, diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c index ffc8b254cd4..733cccc1ff5 100644 --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -1,4 +1,4 @@ -/* $OpenBSD: tcp_subr.c,v 1.61 2002/03/14 01:27:11 millert Exp $ */ +/* $OpenBSD: tcp_subr.c,v 1.62 2002/05/16 14:10:51 kjc Exp $ */ /* $NetBSD: tcp_subr.c,v 1.22 1996/02/13 23:44:00 christos Exp $ */ /* @@ -134,6 +134,7 @@ int tcp_do_rfc1323 = TCP_DO_RFC1323; #endif int tcp_do_sack = TCP_DO_SACK; /* RFC 2018 selective ACKs */ int tcp_ack_on_push = 0; /* set to enable immediate ACK-on-PUSH */ +int tcp_do_ecn = 0; /* RFC3168 ECN enabled/disabled? 
*/ #ifndef TCBHASHSIZE #define TCBHASHSIZE 128 diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c index 5b86cac3218..36a3716f769 100644 --- a/sys/netinet/tcp_timer.c +++ b/sys/netinet/tcp_timer.c @@ -1,4 +1,4 @@ -/* $OpenBSD: tcp_timer.c,v 1.28 2002/03/08 03:49:58 provos Exp $ */ +/* $OpenBSD: tcp_timer.c,v 1.29 2002/05/16 14:10:51 kjc Exp $ */ /* $NetBSD: tcp_timer.c,v 1.14 1996/02/13 23:44:09 christos Exp $ */ /* @@ -330,6 +330,13 @@ tcp_timer_rexmt(void *arg) tp->snd_cwnd = tp->t_maxseg; tp->snd_ssthresh = win * tp->t_maxseg; tp->t_dupacks = 0; +#ifdef TCP_ECN + tp->snd_last = tp->snd_max; + tp->t_flags |= TF_SEND_CWR; +#endif +#if 1 /* TCP_ECN */ + tcpstat.tcps_cwr_timeout++; +#endif } (void) tcp_output(tp); diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c index a2130df23a1..b7488a6d1df 100644 --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -1,4 +1,4 @@ -/* $OpenBSD: tcp_usrreq.c,v 1.59 2002/03/14 01:27:11 millert Exp $ */ +/* $OpenBSD: tcp_usrreq.c,v 1.60 2002/05/16 14:10:51 kjc Exp $ */ /* $NetBSD: tcp_usrreq.c,v 1.20 1996/02/13 23:44:16 christos Exp $ */ /* @@ -924,6 +924,11 @@ tcp_sysctl(name, namelen, oldp, oldlenp, newp, newlen) case TCPCTL_ACK_ON_PUSH: return (sysctl_int(oldp, oldlenp, newp, newlen, &tcp_ack_on_push)); +#ifdef TCP_ECN + case TCPCTL_ECN: + return (sysctl_int(oldp, oldlenp, newp, newlen, + &tcp_do_ecn)); +#endif default: return (ENOPROTOOPT); } diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h index 60aa5162f44..2be685a288d 100644 --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -1,4 +1,4 @@ -/* $OpenBSD: tcp_var.h,v 1.42 2002/03/14 01:27:11 millert Exp $ */ +/* $OpenBSD: tcp_var.h,v 1.43 2002/05/16 14:10:51 kjc Exp $ */ /* $NetBSD: tcp_var.h,v 1.17 1996/02/13 23:44:24 christos Exp $ */ /* @@ -68,7 +68,7 @@ struct tcpcb { short t_dupacks; /* consecutive dup acks recd */ u_short t_maxseg; /* maximum segment size */ char t_force; /* 1 if forcing out a byte */ - u_short t_flags; 
+ u_int t_flags; #define TF_ACKNOW 0x0001 /* ack peer immediately */ #define TF_DELACK 0x0002 /* ack, but try to delay it */ #define TF_NODELAY 0x0004 /* don't delay packets to coalesce */ @@ -80,6 +80,12 @@ struct tcpcb { #define TF_RCVD_TSTMP 0x0100 /* a timestamp was received in SYN */ #define TF_SACK_PERMIT 0x0200 /* other side said I could SACK */ #define TF_SIGNATURE 0x0400 /* require TCP MD5 signature */ +#ifdef TCP_ECN +#define TF_ECN_PERMIT 0x00008000 /* other side said I could ECN */ +#define TF_RCVD_CE 0x00010000 /* send ECE in subsequent segs */ +#define TF_SEND_CWR 0x00020000 /* send CWR in next seg */ +#define TF_DISABLE_ECN 0x00040000 /* disable ECN for this connection */ +#endif struct mbuf *t_template; /* skeletal packet for transmit */ struct inpcb *t_inpcb; /* back pointer to internet pcb */ @@ -107,7 +113,7 @@ struct tcpcb { int retran_data; /* amount of outstanding retx. data */ #endif /* TCP_FACK */ #endif /* TCP_SACK */ -#if defined(TCP_SACK) +#if defined(TCP_SACK) || defined(TCP_ECN) tcp_seq snd_last; /* for use in fast recovery */ #endif /* receive sequence variables */ @@ -305,6 +311,18 @@ struct tcpstat { u_int64_t tcps_rcvgoodsig; /* rcvd good TCP signatures */ u_int32_t tcps_inhwcsum; /* input hardware-checksummed packets */ u_int32_t tcps_outhwcsum; /* output hardware-checksummed packets */ + + /* ECN stats */ + u_int32_t tcps_ecn_accepts; /* ecn connections accepted */ + u_int32_t tcps_ecn_rcvece; /* # of rcvd ece */ + u_int32_t tcps_ecn_rcvcwr; /* # of rcvd cwr */ + u_int32_t tcps_ecn_rcvce; /* # of rcvd ce in ip header */ + u_int32_t tcps_ecn_sndect; /* # of ect sent */ + u_int32_t tcps_ecn_sndece; /* # of ece sent */ + u_int32_t tcps_ecn_sndcwr; /* # of cwr sent */ + u_int32_t tcps_cwr_ecn; /* # of cwnd reduced by ecn */ + u_int32_t tcps_cwr_frecovery; /* # of cwnd reduced by fastrecovery */ + u_int32_t tcps_cwr_timeout; /* # of cwnd reduced by timeout */ }; /* @@ -324,7 +342,8 @@ struct tcpstat { #define TCPCTL_MSSDFLT 11 /*
Default maximum segment size */ #define TCPCTL_RSTPPSLIMIT 12 /* RST pps limit */ #define TCPCTL_ACK_ON_PUSH 13 /* ACK immediately on PUSH */ -#define TCPCTL_MAXID 14 +#define TCPCTL_ECN 14 /* RFC3168 ECN */ +#define TCPCTL_MAXID 15 #define TCPCTL_NAMES { \ { 0, 0 }, \ @@ -341,6 +360,7 @@ struct tcpstat { { "mssdflt", CTLTYPE_INT }, \ { "rstppslimit", CTLTYPE_INT }, \ { "ackonpush", CTLTYPE_INT }, \ + { "ecn", CTLTYPE_INT }, \ } struct tcp_ident_mapping { @@ -359,6 +379,7 @@ extern int tcp_ack_on_push; /* ACK immediately on PUSH */ extern int tcp_do_sack; /* SACK enabled/disabled */ extern struct pool sackhl_pool; #endif +extern int tcp_do_ecn; /* RFC3168 ECN enabled/disabled? */ int tcp_attach(struct socket *); void tcp_canceltimers(struct tcpcb *); diff --git a/sys/netinet6/frag6.c b/sys/netinet6/frag6.c index e49893f844c..ab453aea1ad 100644 --- a/sys/netinet6/frag6.c +++ b/sys/netinet6/frag6.c @@ -1,4 +1,4 @@ -/* $OpenBSD: frag6.c,v 1.13 2002/03/15 10:50:59 itojun Exp $ */ +/* $OpenBSD: frag6.c,v 1.14 2002/05/16 14:10:51 kjc Exp $ */ /* $KAME: frag6.c,v 1.31 2001/05/17 13:45:34 jinmei Exp $ */ /* @@ -50,6 +50,8 @@ #include <netinet/ip6.h> #include <netinet6/ip6_var.h> #include <netinet/icmp6.h> +#include <netinet/in_systm.h> /* for ECN definitions */ +#include <netinet/ip.h> /* for ECN definitions */ #include <dev/rndvar.h> @@ -184,6 +186,7 @@ frag6_input(mp, offp, proto) static struct route_in6 ro; struct sockaddr_in6 *dst; #endif + u_int8_t ecn, ecn0; ip6 = mtod(m, struct ip6_hdr *); #ifndef PULLDOWN_TEST @@ -388,6 +391,26 @@ frag6_input(mp, offp, proto) } /* + * Handle ECN by comparing this segment with the first one; + * if CE is set, do not lose CE. + * drop if CE and not-ECT are mixed for the same packet. 
+ */ + ecn = (ntohl(ip6->ip6_flow) >> 20) & IPTOS_ECN_MASK; + ecn0 = (ntohl(q6->ip6q_down->ip6af_head) >> 20) & IPTOS_ECN_MASK; + if (ecn == IPTOS_ECN_CE) { + if (ecn0 == IPTOS_ECN_NOTECT) { + free(ip6af, M_FTABLE); + goto dropfrag; + } + if (ecn0 != IPTOS_ECN_CE) + q6->ip6q_down->ip6af_head |= htonl(IPTOS_ECN_CE << 20); + } + if (ecn == IPTOS_ECN_NOTECT && ecn0 != IPTOS_ECN_NOTECT) { + free(ip6af, M_FTABLE); + goto dropfrag; + } + + /* * Find a segment which begins after this one does. */ for (af6 = q6->ip6q_down; af6 != (struct ip6asfrag *)q6; |