Index: sys/conf/files =========================================================================== --- sys/conf/files 2005/04/28 15:14:13 #106 +++ sys/conf/files 2005/04/28 15:14:13 @@ -1087,6 +1087,7 @@ netinet/tcp_syncache.c optional inet netinet/tcp_timer.c optional inet netinet/tcp_usrreq.c optional inet +netinet/tcp_sack.c optional inet netinet/udp_usrreq.c optional inet netinet6/ah_core.c optional ipsec netinet6/ah_input.c optional ipsec Index: sys/conf/options =========================================================================== --- sys/conf/options 2005/04/28 15:14:13 #45 +++ sys/conf/options 2005/04/28 15:14:13 @@ -306,6 +306,7 @@ TCPDEBUG TCP_SIGNATURE opt_inet.h TCP_DROP_SYNFIN opt_tcp_input.h +TCP_SACK_DEBUG opt_tcp_sack.h XBONEHACK MBUF_STRESS_TEST opt_mbuf_stress_test.h Index: sys/netinet/tcp.h =========================================================================== --- sys/netinet/tcp.h 2005/04/28 15:14:13 #4 +++ sys/netinet/tcp.h 2005/04/28 15:14:13 @@ -85,12 +85,15 @@ #define TCPOPT_SACK_PERMITTED 4 /* Experimental */ #define TCPOLEN_SACK_PERMITTED 2 #define TCPOPT_SACK 5 /* Experimental */ +#define TCPOLEN_SACK 8 /* 2*sizeof(tcp_seq) */ #define TCPOPT_TIMESTAMP 8 #define TCPOLEN_TIMESTAMP 10 #define TCPOLEN_TSTAMP_APPA (TCPOLEN_TIMESTAMP+2) /* appendix A */ #define TCPOPT_TSTAMP_HDR \ (TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_TIMESTAMP<<8|TCPOLEN_TIMESTAMP) +#define MAX_TCPOPTLEN 40 /* Absolute maximum TCP options len */ + #define TCPOPT_CC 11 /* CC options: RFC-1644 */ #define TCPOPT_CCNEW 12 #define TCPOPT_CCECHO 13 @@ -101,6 +104,14 @@ #define TCPOPT_SIGNATURE 19 /* Keyed MD5: RFC 2385 */ #define TCPOLEN_SIGNATURE 18 +/* Option definitions */ +#define TCPOPT_SACK_PERMIT_HDR \ +(TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_SACK_PERMITTED<<8|TCPOLEN_SACK_PERMITTED) +#define TCPOPT_SACK_HDR (TCPOPT_NOP<<24|TCPOPT_NOP<<16|TCPOPT_SACK<<8) +/* Miscellaneous constants */ +#define MAX_SACK_BLKS 6 /* Max # SACK blocks stored at sender side */ +#define 
TCP_MAX_SACK 3 /* MAX # SACKs sent in any segment */ + /* * Default maximum segment size for TCP. * With an IP MSS of 576, this is 536, Index: sys/netinet/tcp_input.c =========================================================================== --- sys/netinet/tcp_input.c 2005/04/28 15:14:13 #37 +++ sys/netinet/tcp_input.c 2005/04/28 15:14:13 @@ -40,6 +40,7 @@ #include "opt_ipsec.h" #include "opt_tcpdebug.h" #include "opt_tcp_input.h" +#include "opt_tcp_sack.h" #include #include @@ -153,7 +154,8 @@ #define tcb6 tcb /* for KAME src sync over BSD*'s */ struct inpcbinfo tcbinfo; -static void tcp_dooptions(struct tcpopt *, u_char *, int, int); +static void tcp_dooptions(struct tcpcb *, struct tcpopt *, u_char *, + int, int, struct tcphdr *); static void tcp_pulloutofband(struct socket *, struct tcphdr *, struct mbuf *, int); static int tcp_reass(struct tcpcb *, struct tcphdr *, int *, @@ -218,6 +220,7 @@ tcp_reass_overflows++; tcpstat.tcps_rcvmemdrop++; m_freem(m); + *tlenp = 0; return (0); } @@ -227,6 +230,7 @@ if (te == NULL) { tcpstat.tcps_rcvmemdrop++; m_freem(m); + *tlenp = 0; return (0); } tcp_reass_qsize++; @@ -860,7 +864,7 @@ * for syncache, or perform t/tcp connection. */ if (so->so_qlen <= so->so_qlimit) { - tcp_dooptions(&to, optp, optlen, 1); + tcp_dooptions((struct tcpcb *)NULL, &to, optp, optlen, 1, th); if (!syncache_add(&inc, &to, th, &so, m)) goto drop; if (so == NULL) @@ -921,7 +925,7 @@ * Process options. * XXX this is tradtitional behavior, may need to be cleaned up. 
*/ - tcp_dooptions(&to, optp, optlen, thflags & TH_SYN); + tcp_dooptions(tp, &to, optp, optlen, thflags & TH_SYN, th); if (thflags & TH_SYN) { if (to.to_flags & TOF_SCALE) { tp->t_flags |= TF_RCVD_SCALE; @@ -936,6 +940,17 @@ tp->t_flags |= TF_RCVD_CC; if (to.to_flags & TOF_MSS) tcp_mss(tp, to.to_mss); + if (tp->sack_enable) { + if (!(to.to_flags & TOF_SACK)) + tp->sack_enable = 0; + else + tp->t_flags |= TF_SACK_PERMIT; + } + } + + if (tp->sack_enable) { + /* Delete stale (cumulatively acked) SACK holes */ + tcp_del_sackholes(tp, th); } /* @@ -987,9 +1002,9 @@ if (SEQ_GT(th->th_ack, tp->snd_una) && SEQ_LEQ(th->th_ack, tp->snd_max) && tp->snd_cwnd >= tp->snd_wnd && - ((!tcp_do_newreno && + ((!tcp_do_newreno && !tp->sack_enable && tp->t_dupacks < tcprexmtthresh) || - (tcp_do_newreno && + ((tcp_do_newreno || tp->sack_enable) && !SEQ_LT(tp->snd_una, tp->snd_recover)))) { /* * this is a pure ack for outstanding data. @@ -1065,6 +1080,9 @@ * with nothing on the reassembly queue and * we have enough buffer space to take it. */ + /* Clean receiver SACK report if present */ + if (tp->sack_enable && tp->rcv_numsacks) + tcp_clean_sackreport(tp); ++tcpstat.tcps_preddat; tp->rcv_nxt += tlen; tcpstat.tcps_rcvpack++; @@ -1168,6 +1186,7 @@ tp->irs = th->th_seq; tcp_rcvseqinit(tp); + if (thflags & TH_ACK) { /* * Our SYN was acked. If segment contains CC.ECHO @@ -1587,11 +1606,25 @@ /* * If last ACK falls within this segment's sequence numbers, * record its timestamp. - * NOTE that the test is modified according to the latest - * proposal of the tcplw@cray.com list (Braden 1993/04/26). + * NOTE: + * 1) That the test incorporates suggestions from the latest + * proposal of the tcplw@cray.com list (Braden 1993/04/26). + * 2) That updating only on newer timestamps interferes with + * our earlier PAWS tests, so this check should be solely + * predicated on the sequence space of this segment. 
+ * 3) That we modify the segment boundary check to be + * Last.ACK.Sent <= SEG.SEQ + SEG.Len + * instead of RFC1323's + * Last.ACK.Sent < SEG.SEQ + SEG.Len, + * This modified check allows us to overcome RFC1323's + * limitations as described in Stevens TCP/IP Illustrated + * Vol. 2 p.869. In such cases, we can still calculate the + * RTT correctly when RCV.NXT == Last.ACK.Sent. */ if ((to.to_flags & TOF_TS) != 0 && - SEQ_LEQ(th->th_seq, tp->last_ack_sent)) { + SEQ_LEQ(th->th_seq, tp->last_ack_sent) && + SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen + + ((thflags & (TH_SYN|TH_FIN)) != 0))) { tp->ts_recent_age = ticks; tp->ts_recent = to.to_tsval; } @@ -1719,19 +1752,47 @@ th->th_ack != tp->snd_una) tp->t_dupacks = 0; else if (++tp->t_dupacks > tcprexmtthresh || - (tcp_do_newreno && - SEQ_LT(tp->snd_una, - tp->snd_recover))) { - tp->snd_cwnd += tp->t_maxseg; + ((tcp_do_newreno || tp->sack_enable) && + SEQ_LT(tp->snd_una, tp->snd_recover))) { + if (tp->sack_enable && + SEQ_LT(tp->snd_una, tp->snd_recover)) { + int data_in_pipe; + int sacked, lost_not_rexmitted; + + /* + * Compute the amount of data in flight first. + * We can inject new data into the pipe iff + * we have less than 1/2 the original window's + * worth of data in flight. 
+ */ + sacked = tcp_sacked_bytes(tp, &lost_not_rexmitted); + data_in_pipe = (tp->snd_nxt - tp->snd_una) - + (sacked + lost_not_rexmitted); + if (data_in_pipe < tp->snd_ssthresh) { + tp->snd_cwnd += tp->t_maxseg; + if (tp->snd_cwnd > tp->snd_ssthresh) + tp->snd_cwnd = tp->snd_ssthresh; + } + } else + tp->snd_cwnd += tp->t_maxseg; (void) tcp_output(tp); goto drop; } else if (tp->t_dupacks == tcprexmtthresh) { tcp_seq onxt = tp->snd_nxt; u_int win; - if (tcp_do_newreno && - SEQ_LEQ(th->th_ack, tp->snd_high)) { - tp->t_dupacks = 0; - break; + + if (tp->sack_enable) { + if (SEQ_LT(th->th_ack, + tp->snd_recover)) { + tp->t_dupacks = 0; + break; + } + } else if (tcp_do_newreno) { + if (SEQ_LEQ(th->th_ack, + tp->snd_high)) { + tp->t_dupacks = 0; + break; + } } win = min(tp->snd_wnd, tp->snd_cwnd) / 2 / tp->t_maxseg; @@ -1741,6 +1802,13 @@ tp->snd_recover = tp->snd_max; callout_stop(tp->tt_rexmt); tp->t_rtttime = 0; + if (tp->sack_enable) { + tcpstat.tcps_sack_recovery_episode++; + tp->sack_newdata = tp->snd_nxt; + tp->snd_cwnd = tp->t_maxseg; + (void) tcp_output(tp); + goto drop; + } tp->snd_nxt = th->th_ack; tp->snd_cwnd = tp->t_maxseg; (void) tcp_output(tp); @@ -1761,12 +1829,16 @@ * If the congestion window was inflated to account * for the other side's cached packets, retract it. */ - if (tcp_do_newreno) { + if (tp->sack_enable || tcp_do_newreno) { if (SEQ_LT(tp->snd_una, tp->snd_recover)) { if (SEQ_LT(th->th_ack, tp->snd_recover)) { - tcp_newreno_partial_ack(tp, th); + if (tp->sack_enable) + tcp_sack_partialack(tp, th); + else + tcp_newreno_partial_ack(tp, th); } else { - /* + /* + * Out of fast recovery. * Window inflation should have left us * with approximately snd_ssthresh * outstanding data. @@ -1774,16 +1846,15 @@ * send a burst, better to do it via * the slow start mechanism. 
*/ - if (SEQ_GT(th->th_ack + - tp->snd_ssthresh, + if (SEQ_GT(th->th_ack + tp->snd_ssthresh, tp->snd_max)) tp->snd_cwnd = tp->snd_max - - th->th_ack + - tp->t_maxseg; - else + th->th_ack + + tp->t_maxseg; + else tp->snd_cwnd = tp->snd_ssthresh; } - } + } } else { if (tp->t_dupacks >= tcprexmtthresh && tp->snd_cwnd > tp->snd_ssthresh) @@ -1884,7 +1955,8 @@ * Otherwise open linearly: maxseg per window * (maxseg^2 / cwnd per packet). */ - if (!tcp_do_newreno || SEQ_GEQ(tp->snd_una, tp->snd_recover)) { + if ((!tcp_do_newreno && !tp->sack_enable) || + SEQ_GEQ(tp->snd_una, tp->snd_recover)) { register u_int cw = tp->snd_cwnd; register u_int incr = tp->t_maxseg; if (cw > tp->snd_ssthresh) @@ -1909,6 +1981,10 @@ SEQ_LEQ(th->th_ack, tp->snd_high)) tp->snd_high = th->th_ack - 1; tp->snd_una = th->th_ack; + if (tp->sack_enable) { + if (SEQ_GT(tp->snd_una, tp->snd_recover)) + tp->snd_recover = tp->snd_una; + } if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; @@ -2115,7 +2191,8 @@ thflags = tcp_reass(tp, th, &tlen, m); tp->t_flags |= TF_ACKNOW; } - + if (tlen > 0 && tp->sack_enable) + tcp_update_sack_list(tp, th->th_seq, th->th_seq + tlen); /* * Note the amount of data that peer has sent into * our window, in order to estimate the sender's @@ -2310,10 +2387,12 @@ * Parse TCP options and place in tcpopt. 
*/ static void -tcp_dooptions(to, cp, cnt, is_syn) +tcp_dooptions(tp, to, cp, cnt, is_syn, th) + struct tcpcb *tp; struct tcpopt *to; u_char *cp; int cnt; + struct tcphdr *th; { int opt, optlen; @@ -2402,6 +2481,21 @@ to->to_flags |= (TOF_SIGNATURE | TOF_SIGLEN); break; #endif + + case TCPOPT_SACK_PERMITTED: + if (!tcp_do_sack || + optlen != TCPOLEN_SACK_PERMITTED) + continue; + if (is_syn) { + /* MUST only be set on SYN */ + to->to_flags |= TOF_SACK; + } + break; + + case TCPOPT_SACK: + if (!tp || tcp_sack_option(tp, th, cp, optlen)) + continue; + break; default: continue; } Index: sys/netinet/tcp_output.c =========================================================================== --- sys/netinet/tcp_output.c 2005/04/28 15:14:13 #15 +++ sys/netinet/tcp_output.c 2005/04/28 15:14:13 @@ -38,6 +38,7 @@ #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" +#include "opt_tcp_sack.h" #include #include @@ -87,6 +88,8 @@ extern struct mbuf *m_copypack(); #endif +int tcp_output_sack_miss_seg = 0; + int path_mtu_discovery = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, path_mtu_discovery, CTLFLAG_RW, &path_mtu_discovery, 1, "Enable Path MTU Discovery"); @@ -126,9 +129,9 @@ u_char opt[TCP_MAXOLEN]; unsigned ipoptlen, optlen, hdrlen; int idle, sendalot; -#if 0 - int maxburst = TCP_MAXBURST; -#endif + int i, sack_rxmit; + int sack_bytes_rxmt; + struct sackhole *p; struct rmxp_tao *taop; #ifdef INET6 int isipv6; @@ -176,13 +179,76 @@ } } again: + /* + * If we've recently taken a timeout, snd_max will be greater than + * snd_nxt. There may be SACK information that allows us to avoid + * resending already delivered data. Adjust snd_nxt accordingly. + */ + if (tp->sack_enable && SEQ_LT(tp->snd_nxt, tp->snd_max)) + tcp_sack_adjust(tp); + sendalot = 0; off = tp->snd_nxt - tp->snd_una; win = min(tp->snd_wnd, tp->snd_cwnd); win = min(win, tp->snd_bwnd); flags = tcp_outflags[tp->t_state]; + /* + * Send any SACK-generated retransmissions. 
If we're explicitly trying + * to send out new data (when sendalot is 1), bypass this function. + * If we retransmit in fast recovery mode, decrement snd_cwnd, since + * we're replacing a (future) new transmission with a retransmission + * now, and we previously incremented snd_cwnd in tcp_input(). + */ + /* + * Still in sack recovery , reset rxmit flag to zero. + */ + sack_rxmit = 0; + sack_bytes_rxmt = 0; + len = 0; + p = NULL; + if (tp->sack_enable && SEQ_LT(tp->snd_una, tp->snd_recover) && + (p = tcp_sack_output(tp, &sack_bytes_rxmt))) { + long cwin; + + cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt; + if (cwin < 0) + cwin = 0; + /* Do not retransmit SACK segments beyond snd_recover */ + if (SEQ_GT(p->end, tp->snd_recover)) { + /* + * (At least) part of sack hole extends beyond + * snd_recover. Check to see if we can rexmit data + * for this hole. + */ + if (SEQ_GEQ(p->rxmit, tp->snd_recover)) { + /* + * Can't rexmit any more data for this hole. + * That data will be rexmitted in the next + * sack recovery episode, when snd_recover + * moves past p->rxmit. + */ + p = NULL; + goto after_sack_rexmit; + } else + /* Can rexmit part of the current hole */ + len = ((long)ulmin(cwin, + tp->snd_recover - p->rxmit)); + } else + len = ((long)ulmin(cwin, p->end - p->rxmit)); + sack_rxmit = 1; + off = p->rxmit - tp->snd_una; + KASSERT(off >= 0,("%s: sack block to the left of una : %d", + __func__, off)); + if (len > 0) { + sendalot = 1; + tcpstat.tcps_sack_rexmits++; + tcpstat.tcps_sack_rexmit_bytes += min(len, tp->t_maxseg); + } + } +after_sack_rexmit: + /* * Get standard flags, and add SYN or FIN if requested by 'hidden' * state flags. */ @@ -235,8 +301,39 @@ * In the normal retransmit-FIN-only case, however, snd_nxt will * be set to snd_una, the offset will be 0, and the length may * wind up 0. + * + * If sack_rxmit is true we are retransmitting from the scoreboard + * in which case len is already set. 
*/ - len = (long)ulmin(so->so_snd.sb_cc, win) - off; + if (sack_rxmit == 0) { + if (sack_bytes_rxmt == 0) + len = ((long)ulmin(so->so_snd.sb_cc, win) - off); + else { + long cwin; + + /* + * We are inside of a SACK recovery episode and are + * sending new data, having retransmitted all the + * data possible in the scoreboard. + */ + len = ((long)ulmin(so->so_snd.sb_cc, tp->snd_wnd) - off); + /* + * Don't remove this (len > 0) check ! + * We explicitly check for len > 0 here (although it + * isn't really necessary), to work around a gcc + * optimization issue - to force gcc to compute + * len above. Without this check, the computation + * of len is bungled by the optimizer. + */ + if (len > 0) { + cwin = tp->snd_cwnd - (tp->snd_nxt - tp->sack_newdata) - + sack_bytes_rxmt; + if (cwin < 0) + cwin = 0; + len = lmin(len, cwin); + } + } + } taop = tcp_gettaocache(&tp->t_inpcb->inp_inc); @@ -296,7 +393,9 @@ len = tp->t_maxseg; sendalot = 1; } - if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + so->so_snd.sb_cc)) + + if (SEQ_LT(((sack_rxmit) ? p->rxmit : tp->snd_nxt) + len, + tp->snd_una + so->so_snd.sb_cc)) flags &= ~TH_FIN; win = sbspace(&so->so_rcv); @@ -335,6 +434,8 @@ goto send; if (SEQ_LT(tp->snd_nxt, tp->snd_max)) /* retransmit case */ goto send; + if (sack_rxmit) + goto send; } /* @@ -379,6 +480,19 @@ goto send; /* + * In SACK, it is possible for tcp_output to fail to send a segment + * after the retransmission timer has been turned off. Make sure + * that the retransmission timer is set. + */ + if (tp->sack_enable && SEQ_GT(tp->snd_max, tp->snd_una) && + !callout_active(tp->tt_rexmt) && + !callout_active(tp->tt_persist)) { + callout_reset(tp->tt_rexmt, tp->t_rxtcur, + tcp_timer_rexmt, tp); + return (0); + } + + /* * TCP window updates are not reliable, rather a polling protocol * using ``persist'' packets is used to insure receipt of window * updates. 
The three ``states'' for the output side are: @@ -557,14 +671,88 @@ for (i = 0; i < TCP_SIGLEN; i++) *bp++ = 0; optlen += TCPOLEN_SIGNATURE; + } +#endif /* TCP_SIGNATURE */ + + if (tp->sack_enable && ((tp->t_flags & TF_NOOPT) == 0)) { + /* + * Tack on the SACK permitted option *last*. + * And do padding of options after tacking this on. + * This is because of MSS, TS, WinScale and Signatures are + * all present, we have just 2 bytes left for the SACK + * permitted option, which is just enough. + */ + /* + * If this is the first SYN of connection (not a SYN + * ACK), include SACK permitted option. If this is a + * SYN ACK, include SACK permitted option if peer has + * already done so. This is only for active connect, + * since the syncache takes care of the passive connect. + */ + if ((flags & TH_SYN) && + (!(flags & TH_ACK) || (tp->t_flags & TF_SACK_PERMIT))) { + u_char *bp; + bp = (u_char *)opt + optlen; + *bp++ = TCPOPT_SACK_PERMITTED; + *bp++ = TCPOLEN_SACK_PERMITTED; + optlen += TCPOLEN_SACK_PERMITTED; + } /* - * Terminate options list and maintain 32-bit alignment. + * Send SACKs if necessary. This should be the last + * option processed. Only as many SACKs are sent as + * are permitted by the maximum options size. + * + * In general, SACK blocks consume 8*n+2 bytes. + * So a full size SACK blocks option is 34 bytes + * (to generate 4 SACK blocks). At a minimum, + * we need 10 bytes (to generate 1 SACK block). + * If TCP Timestamps (12 bytes) and TCP Signatures + * (18 bytes) are both present, we'll just have + * 10 bytes for SACK options 40 - (12 + 18). 
*/ - *bp++ = TCPOPT_NOP; - *bp++ = TCPOPT_EOL; - optlen += 2; + if (TCPS_HAVEESTABLISHED(tp->t_state) && + (tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks > 0 && + MAX_TCPOPTLEN - optlen - 2 >= TCPOLEN_SACK) { + int nsack, sackoptlen, padlen; + u_char *bp = (u_char *)opt + optlen; + u_int32_t *lp; + + nsack = (MAX_TCPOPTLEN - optlen - 2) / TCPOLEN_SACK; + nsack = min(nsack, tp->rcv_numsacks); + sackoptlen = (2 + nsack * TCPOLEN_SACK); + /* + * First we need to pad options so that the + * SACK blocks can start at a 4-byte boundary + * (sack option and length are at a 2 byte offset). + */ + padlen = (MAX_TCPOPTLEN - optlen - sackoptlen) % 4; + optlen += padlen; + while (padlen-- > 0) + *bp++ = TCPOPT_NOP; + tcpstat.tcps_sack_send_blocks++; + *bp++ = TCPOPT_SACK; + *bp++ = sackoptlen; + lp = (u_int32_t *)bp; + for (i = 0; i < nsack; i++) { + struct sackblk sack = tp->sackblks[i]; + *lp++ = htonl(sack.start); + *lp++ = htonl(sack.end); + } + optlen += sackoptlen; + } } -#endif /* TCP_SIGNATURE */ + + /* Pad TCP options to a 4 byte boundary */ + if (optlen < MAX_TCPOPTLEN && (optlen % sizeof(u_int32_t))) { + int pad = sizeof(u_int32_t) - (optlen % sizeof(u_int32_t)); + u_char *bp = (u_char *)opt + optlen; + + optlen += pad; + while (pad) { + *bp++ = TCPOPT_EOL; + pad--; + } + } hdrlen += optlen; @@ -736,11 +924,16 @@ * case, since we know we aren't doing a retransmission. * (retransmit and persist are mutually exclusive...) 
*/ - if (len || (flags & (TH_SYN|TH_FIN)) - || callout_active(tp->tt_persist)) - th->th_seq = htonl(tp->snd_nxt); - else - th->th_seq = htonl(tp->snd_max); + if (sack_rxmit == 0) { + if (len || (flags & (TH_SYN|TH_FIN)) + || callout_active(tp->tt_persist)) + th->th_seq = htonl(tp->snd_nxt); + else + th->th_seq = htonl(tp->snd_max); + } else { + th->th_seq = htonl(p->rxmit); + p->rxmit += len; + } th->th_ack = htonl(tp->rcv_nxt); if (optlen) { bcopy(opt, th + 1, optlen); @@ -838,6 +1031,11 @@ tp->t_flags |= TF_SENTFIN; } } + if (sack_rxmit) { + if (tp->snd_nxt == p->rxmit) + tcp_output_sack_miss_seg++; + goto timer; + } tp->snd_nxt += len; if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { tp->snd_max = tp->snd_nxt; @@ -860,8 +1058,10 @@ * Initialize shift counter which is used for backoff * of retransmit time. */ +timer: if (!callout_active(tp->tt_rexmt) && - tp->snd_nxt != tp->snd_una) { + ((sack_rxmit && tp->snd_nxt != tp->snd_max) || + (tp->snd_nxt != tp->snd_una))) { if (callout_active(tp->tt_persist)) { callout_stop(tp->tt_persist); tp->t_rxtshift = 0; @@ -964,8 +1164,12 @@ * No need to check for TH_FIN here because * the TF_SENTFIN flag handles that case. */ - if ((flags & TH_SYN) == 0) - tp->snd_nxt -= len; + if ((flags & TH_SYN) == 0) { + if (sack_rxmit) + p->rxmit -= len; + else + tp->snd_nxt -= len; + } } out: @@ -1008,16 +1212,6 @@ tp->t_flags &= ~TF_ACKNOW; if (tcp_delack_enabled) callout_stop(tp->tt_delack); -#if 0 - /* - * This completely breaks TCP if newreno is turned on. What happens - * is that if delayed-acks are turned on on the receiver, this code - * on the transmitter effectively destroys the TCP window, forcing - * it to four packets (1.5Kx4 = 6K window). 
- */ - if (sendalot && (!tcp_do_newreno || --maxburst)) - goto again; -#endif if (sendalot) goto again; return (0); Index: sys/netinet/tcp_sack.c =========================================================================== *** /dev/null Thu Apr 28 15:14:15 2005 --- sys/netinet/tcp_sack.c Thu Apr 28 15:14:15 2005 *************** *** 0 **** --- 1,551 ---- + /* + * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+  * SUCH DAMAGE.
+  *
+  * @(#)tcp_sack.c 8.12 (Berkeley) 5/24/95
+  * $FreeBSD: src/sys/netinet/tcp_sack.c,v 1.107.2.40 2004/03/02 17:19:18 nectar Exp $
+  */
+
+ #include "opt_ipfw.h" /* for ipfw_fwd */
+ #include "opt_inet.h"
+ #include "opt_inet6.h"
+ #include "opt_ipsec.h"
+ #include "opt_tcpdebug.h"
+ #include "opt_tcp_input.h"
+ #include "opt_tcp_sack.h"
+
+ #include
+ #include
+ #include
+ #include
+ #include
+ #include
+ #include /* for proc0 declaration */
+ #include
+ #include
+ #include
+ #include
+
+ #include /* before tcp_seq.h, for tcp_random18() */
+
+ #include
+ #include
+
+ #include
+ #include
+ #include
+ #include /* for ICMP_BANDLIM */
+ #include
+ #include /* for ICMP_BANDLIM */
+ #include
+ #include
+ #include
+ #include
+ #include
+ #include
+ #include
+ #include
+ #include
+ #include
+ #include
+ #include
+ #include
+ #include
+ #ifdef TCPDEBUG
+ #include
+ #endif /* TCPDEBUG */
+
+ #ifdef FAST_IPSEC
+ #include
+ #include
+ #endif
+
+ #ifdef IPSEC
+ #include
+ #include
+ #include
+ #endif /*IPSEC*/
+ #include
+ #include
+
+
+ extern struct vm_zone *sack_hole_zone;
+
+ /*
+  * This function is called upon receipt of new valid data (while not in header
+  * prediction mode), and it updates the ordered list of sacks.
+  *
+  * tp->sackblks[0 .. tp->rcv_numsacks - 1] is rewritten in place:
+  *   1. entries that are zeroed, or that end at or below tp->rcv_nxt, are
+  *      dropped and the survivors are compacted to the front;
+  *   2. if no entry survives, the new range becomes the only block, and only
+  *      when it starts above tp->rcv_nxt;
+  *   3. if the new range ends at or below tp->rcv_nxt the compacted list is
+  *      left as it is;
+  *   4. otherwise every surviving block that overlaps or touches the new
+  *      range (as grown by earlier merges in the same pass) is folded into
+  *      it, and the result is stored at index 0 with the untouched blocks
+  *      following in their previous relative order.  When nothing was merged
+  *      and the list already holds MAX_SACK_BLKS entries, the last entry is
+  *      overwritten by the shift.
+  */
+ void
+ tcp_update_sack_list(tp, rcv_laststart, rcv_lastend)
+ 	struct tcpcb *tp;	/* connection whose receiver SACK report is updated */
+ 	tcp_seq rcv_laststart, rcv_lastend;	/* sequence range of the data just received */
+ {
+ 	/*
+ 	 * First reported block MUST be the most recent one.  Subsequent
+ 	 * blocks SHOULD be in the order in which they arrived at the
+ 	 * receiver.  These two conditions make the implementation fully
+ 	 * compliant with RFC 2018.
+ 	 */
+ 	int i, j = 0, count = 0, lastpos = -1;
+ 	struct sackblk sack, firstsack, temp[MAX_SACK_BLKS];
+
+ 	/* First clean up current list of sacks */
+ 	for (i = 0; i < tp->rcv_numsacks; i++) {
+ 		sack = tp->sackblks[i];
+ 		if (sack.start == 0 && sack.end == 0) {
+ 			count++; /* count = number of blocks to be discarded */
+ 			continue;
+ 		}
+ 		if (SEQ_LEQ(sack.end, tp->rcv_nxt)) {
+ 			/* block no longer lies above rcv_nxt: discard it */
+ 			tp->sackblks[i].start = tp->sackblks[i].end = 0;
+ 			count++;
+ 		} else {
+ 			/* keep it; temp[0..j-1] collects the survivors in order */
+ 			temp[j].start = tp->sackblks[i].start;
+ 			temp[j++].end = tp->sackblks[i].end;
+ 		}
+ 	}
+ 	tp->rcv_numsacks -= count;
+ 	if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */
+ 		tcp_clean_sackreport(tp);
+ 		if (SEQ_LT(tp->rcv_nxt, rcv_laststart)) {
+ 			/* ==> need first sack block */
+ 			tp->sackblks[0].start = rcv_laststart;
+ 			tp->sackblks[0].end = rcv_lastend;
+ 			tp->rcv_numsacks = 1;
+ 		}
+ 		return;
+ 	}
+ 	/* Otherwise, sack blocks are already present. */
+ 	for (i = 0; i < tp->rcv_numsacks; i++)
+ 		tp->sackblks[i] = temp[i]; /* first copy back sack list */
+ 	if (SEQ_GEQ(tp->rcv_nxt, rcv_lastend))
+ 		return; /* sack list remains unchanged */
+ 	/*
+ 	 * From here, segment just received should be (part of) the 1st sack.
+ 	 * Go through list, possibly coalescing sack block entries.
+ 	 */
+ 	firstsack.start = rcv_laststart;
+ 	firstsack.end = rcv_lastend;
+ 	for (i = 0; i < tp->rcv_numsacks; i++) {
+ 		sack = tp->sackblks[i];
+ 		if (SEQ_LT(sack.end, firstsack.start) ||
+ 		    SEQ_GT(sack.start, firstsack.end))
+ 			continue; /* no overlap */
+ 		if (sack.start == firstsack.start && sack.end == firstsack.end){
+ 			/*
+ 			 * identical block; delete it here since we will
+ 			 * move it to the front of the list.
+ 			 */
+ 			tp->sackblks[i].start = tp->sackblks[i].end = 0;
+ 			lastpos = i; /* last posn with a zero entry */
+ 			continue;
+ 		}
+ 		if (SEQ_LEQ(sack.start, firstsack.start))
+ 			firstsack.start = sack.start; /* merge blocks */
+ 		if (SEQ_GEQ(sack.end, firstsack.end))
+ 			firstsack.end = sack.end; /* merge blocks */
+ 		tp->sackblks[i].start = tp->sackblks[i].end = 0;
+ 		lastpos = i; /* last posn with a zero entry */
+ 	}
+ 	if (lastpos != -1) { /* at least one merge */
+ 		/* j starts at 1 so that slot 0 stays free for firstsack */
+ 		for (i = 0, j = 1; i < tp->rcv_numsacks; i++) {
+ 			sack = tp->sackblks[i];
+ 			if (sack.start == 0 && sack.end == 0)
+ 				continue;
+ 			temp[j++] = sack;
+ 		}
+ 		tp->rcv_numsacks = j; /* including first blk (added later) */
+ 		for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */
+ 			tp->sackblks[i] = temp[i];
+ 	} else { /* no merges -- shift sacks by 1 */
+ 		if (tp->rcv_numsacks < MAX_SACK_BLKS)
+ 			tp->rcv_numsacks++;
+ 		for (i = tp->rcv_numsacks-1; i > 0; i--)
+ 			tp->sackblks[i] = tp->sackblks[i-1];
+ 	}
+ 	tp->sackblks[0] = firstsack;
+ 	return;
+ }
+
+ /*
+  * Delete all receiver-side SACK information.
+  *
+  * Sets tp->rcv_numsacks to 0 and zeroes the start and end of all
+  * MAX_SACK_BLKS entries of tp->sackblks[], not only the ones in use.
+  */
+ void
+ tcp_clean_sackreport(tp)
+ 	struct tcpcb *tp;	/* connection whose SACK report is cleared */
+ {
+ 	int i;
+
+ 	tp->rcv_numsacks = 0;
+ 	for (i = 0; i < MAX_SACK_BLKS; i++)
+ 		tp->sackblks[i].start = tp->sackblks[i].end=0;
+ }
+
+ /*
+  * Process the TCP SACK option.  Returns 1 if tcp_dooptions() should continue,
+  * and 0 otherwise, if the option was fine.  tp->snd_holes is an ordered list
+  * of holes (oldest to newest, in terms of the sequence space).
+  */
+ /*
+  * Arguments:
+  *   tp      connection whose sender-side scoreboard (tp->snd_holes,
+  *           tp->snd_numholes, tp->rcv_lastsack) is updated.
+  *   th      header of the segment that carried the option; th_flags and
+  *           th_ack are read.
+  *   cp      start of the option; the first two bytes are skipped
+  *           (presumably the option kind and length bytes) and the SACK
+  *           blocks are read from cp + 2.
+  *   optlen  total option length; optlen - 2 must be a non-zero multiple
+  *           of TCPOLEN_SACK.
+  *
+  * The option is rejected (return 1) when SACK is disabled on tp, when the
+  * length is malformed, when the segment carries no ACK, or when th_ack lies
+  * outside [snd_una, snd_max].  Individual blocks that are empty or
+  * reversed, that end at or below snd_una, that start below a th_ack which
+  * advances snd_una, or that end beyond snd_max are skipped.
+  *
+  * A block is also ignored when a hole that it would require cannot be
+  * allocated (ENOBUFS); the scoreboard is then left as it was for that
+  * block and processing moves on to the next one.
+  */
+ int
+ tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen)
+ {
+ 	int tmp_olen;
+ 	u_char *tmp_cp;
+ 	struct sackhole *cur, *p, *temp;
+
+ 	if (!tp->sack_enable)
+ 		return (1);
+
+ 	/* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */
+ 	if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
+ 		return (1);
+ 	if ((th->th_flags & TH_ACK) == 0)
+ 		return (1);
+ 	/* If ack is outside [snd_una, snd_max], ignore the SACK options */
+ 	if (SEQ_LT(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))
+ 		return (1);
+ 	tmp_cp = cp + 2;
+ 	tmp_olen = optlen - 2;
+ 	tcpstat.tcps_sack_rcv_blocks++;
+ 	if (tp->snd_numholes < 0)
+ 		tp->snd_numholes = 0;
+ 	if (tp->t_maxseg == 0)
+ 		panic("tcp_sack_option"); /* Should never happen */
+ 	while (tmp_olen > 0) {
+ 		struct sackblk sack;
+
+ 		/* bcopy because the option bytes need not be aligned */
+ 		bcopy(tmp_cp, (char *) &(sack.start), sizeof(tcp_seq));
+ 		NTOHL(sack.start);
+ 		bcopy(tmp_cp + sizeof(tcp_seq),
+ 		    (char *) &(sack.end), sizeof(tcp_seq));
+ 		NTOHL(sack.end);
+ 		/*
+ 		 * Advance before validating so that every "continue" below
+ 		 * moves on to the next block.
+ 		 */
+ 		tmp_olen -= TCPOLEN_SACK;
+ 		tmp_cp += TCPOLEN_SACK;
+ 		if (SEQ_LEQ(sack.end, sack.start))
+ 			continue; /* bad SACK fields */
+ 		if (SEQ_LEQ(sack.end, tp->snd_una))
+ 			continue; /* old block */
+ 		if (SEQ_GT(th->th_ack, tp->snd_una)) {
+ 			if (SEQ_LT(sack.start, th->th_ack))
+ 				continue;
+ 		}
+ 		if (SEQ_GT(sack.end, tp->snd_max))
+ 			continue;
+ 		if (tp->snd_holes == NULL) { /* first hole */
+ 			tp->snd_holes = (struct sackhole *)
+ 			    zalloc(sack_hole_zone);
+ 			if (tp->snd_holes == NULL) {
+ 				/* ENOBUFS, so ignore SACKed block for now*/
+ 				continue;
+ 			}
+ 			cur = tp->snd_holes;
+ 			cur->start = th->th_ack;
+ 			cur->end = sack.start;
+ 			cur->rxmit = cur->start;
+ 			cur->next = NULL;
+ 			tp->snd_numholes = 1;
+ 			tp->rcv_lastsack = sack.end;
+ 			continue; /* with next sack block */
+ 		}
+ 		/* Go thru list of holes: p = previous, cur = current */
+ 		p = cur = tp->snd_holes;
+ 		while (cur) {
+ 			if (SEQ_LEQ(sack.end, cur->start))
+ 				/* SACKs data before the current hole */
+ 				break; /* no use going through more holes */
+ 			if (SEQ_GEQ(sack.start, cur->end)) {
+ 				/* SACKs data beyond the current hole */
+ 				p = cur;
+ 				cur = cur->next;
+ 				continue;
+ 			}
+ 			if (SEQ_LEQ(sack.start, cur->start)) {
+ 				/* Data acks at least the beginning of hole */
+ 				if (SEQ_GEQ(sack.end, cur->end)) {
+ 					/* Acks entire hole, so delete hole */
+ 					if (p != cur) {
+ 						p->next = cur->next;
+ 						zfree(sack_hole_zone, cur);
+ 						cur = p->next;
+ 					} else {
+ 						/* deleting the head of the list */
+ 						cur = cur->next;
+ 						zfree(sack_hole_zone, p);
+ 						p = cur;
+ 						tp->snd_holes = p;
+ 					}
+ 					tp->snd_numholes--;
+ 					continue;
+ 				}
+ 				/* otherwise, move start of hole forward */
+ 				cur->start = sack.end;
+ 				cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
+ 				p = cur;
+ 				cur = cur->next;
+ 				continue;
+ 			}
+ 			/* move end of hole backward */
+ 			if (SEQ_GEQ(sack.end, cur->end)) {
+ 				cur->end = sack.start;
+ 				cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
+ 				p = cur;
+ 				cur = cur->next;
+ 				continue;
+ 			}
+ 			if (SEQ_LT(cur->start, sack.start) &&
+ 			    SEQ_GT(cur->end, sack.end)) {
+ 				/*
+ 				 * ACKs some data in middle of a hole; need to
+ 				 * split current hole
+ 				 */
+ 				temp = (struct sackhole *)
+ 				    zalloc(sack_hole_zone);
+ 				/*
+ 				 * On ENOBUFS leave the hole unsplit and stop
+ 				 * scanning.  A "continue" here would re-test
+ 				 * the same hole without advancing cur and spin
+ 				 * until the allocation succeeds.  The block
+ 				 * lies strictly inside this hole, so no later
+ 				 * hole overlaps it and, with every hole ending
+ 				 * at or below rcv_lastsack, neither check
+ 				 * after the loop fires.
+ 				 */
+ 				if (temp == NULL)
+ 					break; /* ENOBUFS */
+ 				temp->next = cur->next;
+ 				temp->start = sack.end;
+ 				temp->end = cur->end;
+ 				temp->rxmit = SEQ_MAX(cur->rxmit, temp->start);
+ 				cur->end = sack.start;
+ 				cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
+ 				cur->next = temp;
+ 				p = temp;
+ 				cur = p->next;
+ 				tp->snd_numholes++;
+ 			}
+ 		}
+ 		/* At this point, p points to the last hole on the list */
+ 		if (SEQ_LT(tp->rcv_lastsack, sack.start)) {
+ 			/*
+ 			 * Need to append new hole at end.
+ 			 * Last hole is p (and it's not NULL).
+ 			 */
+ 			temp = (struct sackhole *)
+ 			    zalloc(sack_hole_zone);
+ 			if (temp == NULL)
+ 				continue; /* ENOBUFS */
+ 			temp->start = tp->rcv_lastsack;
+ 			temp->end = sack.start;
+ 			temp->rxmit = temp->start;
+ 			temp->next = 0;
+ 			p->next = temp;
+ 			tp->rcv_lastsack = sack.end;
+ 			tp->snd_numholes++;
+ 		}
+ 		if (SEQ_LT(tp->rcv_lastsack, sack.end))
+ 			tp->rcv_lastsack = sack.end;
+ 	}
+ 	return (0);
+ }
+
+ /*
+  * Delete stale (i.e, cumulatively ack'd) holes.
Hole is deleted only if + * it is completely acked; otherwise, tcp_sack_option(), called from + * tcp_dooptions(), will fix up the hole. + */ + void + tcp_del_sackholes(tp, th) + struct tcpcb *tp; + struct tcphdr *th; + { + if (tp->sack_enable && tp->t_state != TCPS_LISTEN) { + /* max because this could be an older ack just arrived */ + tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ? + th->th_ack : tp->snd_una; + struct sackhole *cur = tp->snd_holes; + struct sackhole *prev; + while (cur) + if (SEQ_LEQ(cur->end, lastack)) { + prev = cur; + cur = cur->next; + zfree(sack_hole_zone, prev); + tp->snd_numholes--; + } else if (SEQ_LT(cur->start, lastack)) { + cur->start = lastack; + if (SEQ_LT(cur->rxmit, cur->start)) + cur->rxmit = cur->start; + break; + } else + break; + tp->snd_holes = cur; + } + } + + void + tcp_free_sackholes(struct tcpcb *tp) + { + struct sackhole *p, *q; + + q = tp->snd_holes; + while (q != NULL) { + p = q; + q = q->next; + zfree(sack_hole_zone, p); + } + tp->snd_holes = 0; + } + + #ifdef TCP_SACK_DEBUG + void + tcp_print_holes(struct tcpcb *tp) + { + struct sackhole *p = tp->snd_holes; + if (p == 0) + return; + printf("Hole report: start--end dups rxmit\n"); + while (p) { + printf("%x--%x r %x\n", p->start, p->end, p->rxmit); + p = p->next; + } + printf("\n"); + } + #endif /* TCP_SACK_DEBUG */ + + /* + * Returns pointer to a sackhole if there are any pending retransmissions; + * NULL otherwise. + */ + struct sackhole * + tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt) + { + struct sackhole *p = NULL; + + *sack_bytes_rexmt = 0; + for (p = tp->snd_holes; p ; p = p->next) { + if (SEQ_LT(p->rxmit, p->end)) { + if (SEQ_LT(p->rxmit, tp->snd_una)) {/* old SACK hole */ + continue; + } + #ifdef TCP_SACK_DEBUG + if (p) + tcp_print_holes(tp); + #endif + /* XXX - Not necessary to do SEQ_MIN here. Because + * p->rxmit can never be to right of snd_recover. 
*/
+ 			*sack_bytes_rexmt += (p->rxmit - p->start);
+ 			break;
+ 		}
+ 		*sack_bytes_rexmt += (p->rxmit - p->start);
+ 	}
+ 	return (p);
+ }
+
+ /*
+  * Skip snd_nxt over data the peer has already SACKed.  Intended for
+  * use after a timeout, once the SACK list may have been rebuilt.
+  *
+  * snd_nxt is left alone when there are no holes, when it is already at
+  * or beyond rcv_lastsack, or when it lies below the end of the hole
+  * being examined.  Otherwise it is advanced to the start of the next
+  * hole, or to rcv_lastsack if it is at or past the end of the last hole.
+  */
+ void
+ tcp_sack_adjust(struct tcpcb *tp)
+ {
+ 	struct sackhole *hole, *succ;
+
+ 	hole = tp->snd_holes;
+ 	/* No holes, or already beyond every SACKed block. */
+ 	if (hole == NULL || SEQ_GEQ(tp->snd_nxt, tp->rcv_lastsack))
+ 		return;
+
+ 	/* Walk adjacent pairs (hole, succ); on exit hole is the last one. */
+ 	for (succ = hole->next; succ != NULL; hole = succ, succ = succ->next) {
+ 		if (SEQ_LT(tp->snd_nxt, hole->end))
+ 			return;
+ 		if (SEQ_LT(tp->snd_nxt, succ->start)) {
+ 			/* Between two holes: jump to the next hole. */
+ 			tp->snd_nxt = succ->start;
+ 			return;
+ 		}
+ 	}
+
+ 	/* At or past the end of the last hole: jump to rcv_lastsack. */
+ 	if (SEQ_GEQ(tp->snd_nxt, hole->end))
+ 		tp->snd_nxt = tp->rcv_lastsack;
+ }
+
+ /*
+  * Partial ack handling within a sack recovery episode.
+  * Keeping this very simple for now. When a partial ack
+  * is received, force snd_cwnd to a value that will allow
+  * the sender to transmit no more than 2 segments.
+  * If necessary, a better scheme can be adopted at a
+  * later point, but for now, the goal is to prevent the
+  * sender from bursting a large amount of data in the midst
+  * of sack recovery.
+  */
+ void
+ tcp_sack_partialack(struct tcpcb *tp, struct tcphdr *th)
+ {
+ 	int burst_segs;
+ 	int rexmt_bytes = 0;
+
+ 	callout_stop(tp->tt_rexmt);
+ 	tp->t_rtttime = 0;
+ 	/* Allow 2 segments if this ack covered more than 2 segments, else 1. */
+ 	burst_segs = (((th->th_ack - tp->snd_una) / tp->t_maxseg) > 2) ? 2 : 1;
+ 	/* Only the byte count is wanted here; the returned hole is ignored. */
+ 	(void)tcp_sack_output(tp, &rexmt_bytes);
+ 	tp->snd_cwnd = rexmt_bytes + (tp->snd_nxt - tp->sack_newdata) +
+ 		burst_segs * tp->t_maxseg;
+ 	tp->t_flags |= TF_ACKNOW;
+ 	(void) tcp_output(tp);
+ }
+
+ /*
+  * Return the number of SACKed bytes in the scoreboard: the span from
+  * the start of the first hole to rcv_lastsack, minus the bytes still
+  * covered by holes.  If lost_not_rexmitted is non-NULL it receives the
+  * sum of (end - rxmit) over all holes.  Both results are 0 when the
+  * scoreboard is empty or snd_una has reached rcv_lastsack.
+  */
+ int
+ tcp_sacked_bytes(struct tcpcb *tp, int *lost_not_rexmitted)
+ {
+ 	struct sackhole *hole;
+ 	int sacked = 0;
+ 	u_long lost = 0;
+
+ 	hole = tp->snd_holes;
+ 	/* Count only when the scoreboard is neither empty nor stale. */
+ 	if (hole != NULL && !SEQ_GEQ(tp->snd_una, tp->rcv_lastsack)) {
+ 		sacked = tp->rcv_lastsack - hole->start;
+ 		for (; hole != NULL; hole = hole->next) {
+ 			lost += (hole->end - hole->rxmit);
+ 			sacked -= (hole->end - hole->start);
+ 		}
+ 	}
+ 	if (lost_not_rexmitted)
+ 		*lost_not_rexmitted = lost;
+ 	return (sacked);
+ }
Index: sys/netinet/tcp_seq.h =========================================================================== --- sys/netinet/tcp_seq.h 2005/04/28 15:14:13 #8 +++ sys/netinet/tcp_seq.h 2005/04/28 15:14:13 @@ -46,6 +46,9 @@ #define SEQ_GT(a,b) ((int)((a)-(b)) > 0) #define SEQ_GEQ(a,b) ((int)((a)-(b)) >= 0) +#define SEQ_MIN(a, b) ((SEQ_LT(a, b)) ? (a) : (b)) +#define SEQ_MAX(a, b) ((SEQ_GT(a, b)) ? 
(a) : (b)) + /* for modulo comparisons of timestamps */ #define TSTMP_LT(a,b) ((int)((a)-(b)) < 0) #define TSTMP_GEQ(a,b) ((int)((a)-(b)) >= 0) Index: sys/netinet/tcp_subr.c =========================================================================== --- sys/netinet/tcp_subr.c 2005/04/28 15:14:13 #23 +++ sys/netinet/tcp_subr.c 2005/04/28 15:14:13 @@ -39,6 +39,7 @@ #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" +#include "opt_tcp_sack.h" #include #include @@ -180,6 +181,17 @@ SYSCTL_INT(_net_inet_tcp, OID_AUTO, inflight_stab, CTLFLAG_RW, &tcp_inflight_stab, 0, "Slop in maximal packets / 10 (20 = 2 packets)"); +SYSCTL_NODE(_net_inet_tcp, OID_AUTO, sack, CTLFLAG_RW, 0, "TCP SACK"); +int tcp_do_sack = 1; +SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, enable, CTLFLAG_RW, + &tcp_do_sack, 0, "Enable/Disable TCP SACK support"); + +int tcp_sackhole_limit = 10 * 1024; /* Arbitrarily set */ +SYSCTL_INT(_net_inet_tcp_sack, OID_AUTO, maxholes, CTLFLAG_RW, + &tcp_sackhole_limit, 0, "Limit on the total SACK scoreboard elements"); + +struct vm_zone *sack_hole_zone; + static void tcp_cleartaocache __P((void)); static void tcp_notify __P((struct inpcb *, int)); @@ -264,6 +276,9 @@ panic("tcp_init"); #undef TCP_MINPROTOHDR + sack_hole_zone = zinit("sackhole", sizeof(struct sackhole), + tcp_sackhole_limit, ZONE_INTERRUPT, 0); + syncache_init(); } @@ -570,6 +585,9 @@ tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); if (tcp_do_rfc1644) tp->t_flags |= TF_REQ_CC; + + tp->sack_enable = tcp_do_sack; + tp->t_inpcb = inp; /* XXX */ /* * Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no @@ -759,6 +777,7 @@ FREE(q, M_TSEGQ); tcp_reass_qsize--; } + tcp_free_sackholes(tp); inp->inp_ppcb = NULL; soisdisconnected(so); #ifdef INET6 @@ -797,6 +816,7 @@ FREE(te, M_TSEGQ); tcp_reass_qsize--; } + tcp_clean_sackreport(tcpb); } } Index: sys/netinet/tcp_syncache.c =========================================================================== --- sys/netinet/tcp_syncache.c 2005/04/28 
15:14:13 #16 +++ sys/netinet/tcp_syncache.c 2005/04/28 15:14:13 @@ -38,6 +38,7 @@ #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_random_ip_id.h" +#include "opt_tcp_sack.h" #include #include @@ -715,6 +716,11 @@ tp->t_flags |= TF_SIGNATURE; #endif + if (sc->sc_flags & SCF_SACK) { + tp->sack_enable = 1; + tp->t_flags |= TF_SACK_PERMIT; + } + tcp_mss(tp, sc->sc_peer_mss); /* @@ -1005,6 +1011,9 @@ sc->sc_flags = SCF_SIGNATURE; #endif + if (to->to_flags & TOF_SACK) + sc->sc_flags |= SCF_SACK; + /* * XXX * We have the option here of not doing TAO (even if the segment @@ -1110,9 +1119,12 @@ ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0) + ((sc->sc_flags & SCF_CC) ? TCPOLEN_CC_APPA * 2 : 0); #ifdef TCP_SIGNATURE - optlen += ((sc->sc_flags & SCF_SIGNATURE) ? - (TCPOLEN_SIGNATURE + 2) : 0); + if (sc->sc_flags & SCF_SIGNATURE) + optlen += TCPOLEN_SIGNATURE; #endif + if (sc->sc_flags & SCF_SACK) + optlen += TCPOLEN_SACK_PERMITTED; + optlen = roundup2(optlen, 4); } tlen = hlen + sizeof(struct tcphdr) + optlen; @@ -1243,11 +1255,22 @@ *bp++ = 0; tcp_signature_compute(m, sizeof(struct ip), 0, optlen, optp + 2, IPSEC_DIR_OUTBOUND); - *bp++ = TCPOPT_NOP; - *bp++ = TCPOPT_EOL; - optp += TCPOLEN_SIGNATURE + 2; + optp += TCPOLEN_SIGNATURE; } #endif /* TCP_SIGNATURE */ + + if (sc->sc_flags & SCF_SACK) { + *optp++ = TCPOPT_SACK_PERMITTED; + *optp++ = TCPOLEN_SACK_PERMITTED; + } + + { + /* Pad TCP options to a 4 byte boundary */ + int padlen = optlen - (optp - (u_int8_t *)(th + 1)); + while (padlen-- > 0) + *optp++ = TCPOPT_EOL; + } + no_options: #ifdef INET6 Index: sys/netinet/tcp_timer.c =========================================================================== --- sys/netinet/tcp_timer.c 2005/04/28 15:14:13 #11 +++ sys/netinet/tcp_timer.c 2005/04/28 15:14:13 @@ -37,6 +37,7 @@ #include "opt_compat.h" #include "opt_inet6.h" #include "opt_tcpdebug.h" +#include "opt_tcp_sack.h" #include #include @@ -200,6 +201,9 @@ ostate = tp->t_state; #endif s = splnet(); + + 
tcp_free_sackholes(tp); + if (callout_pending(tp->tt_2msl) || !callout_active(tp->tt_2msl)) { splx(s); return; @@ -368,6 +372,9 @@ return; } callout_deactivate(tp->tt_rexmt); + + tcp_free_sackholes(tp); + /* * Retransmission timer went off. Message has not * been acked within retransmit interval. Back off @@ -431,6 +438,10 @@ } tp->snd_nxt = tp->snd_una; tp->snd_high = tp->snd_max; + + if (tp->sack_enable) + tp->snd_recover = tp->snd_una; + /* * Force a segment to be sent. */ Index: sys/netinet/tcp_usrreq.c =========================================================================== --- sys/netinet/tcp_usrreq.c 2005/04/28 15:14:13 #23 +++ sys/netinet/tcp_usrreq.c 2005/04/28 15:14:13 @@ -777,7 +777,7 @@ tp->iss = tcp_new_isn(tp); tp->t_bw_rtseq = tp->iss; tcp_sendseqinit(tp); - + tp->snd_recover = tp->snd_una; /* * Generate a CC value for this connection and * check whether CC or CCnew should be used. Index: sys/netinet/tcp_var.h =========================================================================== --- sys/netinet/tcp_var.h 2005/04/28 15:14:13 #14 +++ sys/netinet/tcp_var.h 2005/04/28 15:14:13 @@ -59,6 +59,18 @@ MALLOC_DECLARE(M_TSEGQ); #endif +struct sackblk { + tcp_seq start; /* start seq no. of sack block */ + tcp_seq end; /* end seq no. */ +}; + +struct sackhole { + tcp_seq start; /* start seq no. of hole */ + tcp_seq end; /* end seq no. */ + tcp_seq rxmit; /* next seq. 
no in hole to be retransmitted */ + struct sackhole *next; /* next in list */ +}; + struct tcptemp { u_char tt_ipgen[40]; /* the size must be of max ip header, now IPv6 */ struct tcphdr tt_t; @@ -104,7 +116,7 @@ #define TF_LQ_OVERFLOW 0x20000 /* listen queue overflow */ #define TF_LASTIDLE 0x40000 /* connection was previously idle */ #define TF_RXWIN0SENT 0x80000 /* sent a receiver win 0 in response */ -#define TF_SIGNATURE 0x400000 /* require MD5 digests (RFC2385) */ +#define TF_SIGNATURE 0x400000 /* require MD5 digests (RFC2385) */ int t_force; /* 1 if forcing out a byte */ tcp_seq snd_una; /* send unacknowledged */ @@ -179,6 +191,16 @@ u_long snd_ssthresh_prev; /* ssthresh prior to retransmit */ tcp_seq snd_high_prev; /* snd_high prior to retransmit */ u_long t_badrxtwin; /* window for retransmit recovery */ + + int sack_enable; /* enable SACK for this connection */ +/* SACK related state */ + int snd_numholes; /* number of holes seen by sender */ + struct sackhole *snd_holes; /* linked list of holes (sorted) */ + tcp_seq rcv_lastsack; /* last seq number(+1) sack'd by rcv'r*/ + int rcv_numsacks; /* # distinct sack blks present */ + struct sackblk sackblks[MAX_SACK_BLKS]; /* seq nos. 
of sack blocks */ + tcp_seq sack_newdata; /* New data xmitted in this recovery + episode starts at this seq number */ }; #ifdef TCP_SIGNATURE @@ -211,7 +233,8 @@ #define TOF_MSS 0x0010 #define TOF_SCALE 0x0020 #define TOF_SIGNATURE 0x0040 /* signature option present */ -#define TOF_SIGLEN 0x0080 /* sigature length valid (RFC2385) */ +#define TOF_SIGLEN 0x0080 /* signature length valid (RFC2385) */ +#define TOF_SACK 0x0100 /* Peer sent SACK option */ u_int32_t to_tsval; u_int32_t to_tsecr; tcp_cc to_cc; /* holds CC or CCnew */ @@ -248,6 +271,7 @@ #define SCF_UNREACH 0x10 /* icmp unreachable received */ #define SCF_KEEPROUTE 0x20 /* keep cloned route */ #define SCF_SIGNATURE 0x40 /* send MD5 digests */ +#define SCF_SACK 0x80 /* send SACK option */ TAILQ_ENTRY(syncache) sc_hash; TAILQ_ENTRY(syncache) sc_timerq; }; @@ -397,6 +421,13 @@ u_long tcps_sc_zonefail; /* zalloc() failed */ u_long tcps_sc_sendcookie; /* SYN cookie sent */ u_long tcps_sc_recvcookie; /* SYN cookie received */ + + /* SACK related stats */ + u_long tcps_sack_recovery_episode; /* SACK recovery episodes */ + u_long tcps_sack_rexmits; /* SACK rexmit segments */ + u_long tcps_sack_rexmit_bytes; /* SACK rexmit bytes */ + u_long tcps_sack_rcv_blocks; /* SACK blocks (options) received */ + u_long tcps_sack_send_blocks; /* SACK blocks (options) sent */ }; /* @@ -431,6 +462,7 @@ #define TCPCTL_DELACKTIME 12 /* time before sending delayed ACK */ #define TCPCTL_V6MSSDFLT 13 /* MSS default for IPv6 */ #define TCPCTL_MAXID 14 +#define TCPCTL_SACK 15 /* selective acknowledgement, rfc 2018 */ #define TCPCTL_NAMES { \ { 0, 0 }, \ @@ -465,6 +497,8 @@ extern int ss_fltsz; extern int ss_fltsz_local; +extern int tcp_do_sack; /* SACK enabled/disabled */ + void tcp_canceltimers __P((struct tcpcb *)); struct tcpcb * tcp_close __P((struct tcpcb *)); @@ -517,6 +551,25 @@ extern u_long tcp_recvspace; tcp_seq tcp_new_isn __P((struct tcpcb *)); +int tcp_sack_option(struct tcpcb *,struct tcphdr *,u_char *,int); +void 
tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, tcp_seq rcv_lastend); +void tcp_del_sackholes(struct tcpcb *, struct tcphdr *); +void tcp_clean_sackreport(struct tcpcb *tp); +void tcp_sack_adjust(struct tcpcb *tp); +struct sackhole *tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt); +void tcp_free_sackholes(struct tcpcb *tp); +#ifdef DEBUG +void tcp_print_holes(struct tcpcb *tp); +#endif +int tcp_newreno(struct tcpcb *, struct tcphdr *); +u_long tcp_seq_subtract(u_long, u_long ); +#ifdef TCP_SACK_DEBUG +void tcp_print_holes(struct tcpcb *tp); +#endif /* TCP_SACK_DEBUG */ + +int tcp_sacked_bytes(struct tcpcb *tp, int *lost_not_rexmitted); +void tcp_sack_partialack(struct tcpcb *, struct tcphdr *); + #endif /* _KERNEL */ #endif /* _NETINET_TCP_VAR_H_ */ Index: usr.bin/netstat/inet.c =========================================================================== --- usr.bin/netstat/inet.c 2005/04/28 15:14:13 #12 +++ usr.bin/netstat/inet.c 2005/04/28 15:14:13 @@ -450,6 +450,13 @@ p(tcps_sc_zonefail, "\t\t%lu zone failures\n"); p(tcps_sc_sendcookie, "\t%lu cookies sent\n"); p(tcps_sc_recvcookie, "\t%lu cookies received\n"); + + p(tcps_sack_recovery_episode, "\t%lu SACK recovery episodes\n"); + p(tcps_sack_rexmits, "\t%lu segments re-transmitted during SACK recovery episodes\n"); + p(tcps_sack_rexmit_bytes, "\t%lu bytes re-transmitted during SACK recovery episodes\n"); + p(tcps_sack_rcv_blocks, "\t%lu SACK options (SACK blocks) received\n"); + p(tcps_sack_send_blocks, "\t%lu SACK options (SACK blocks) sent\n"); + #undef p #undef p1a #undef p2