sys/netinet/tcp_output.c - annotate

Return to tcp_output.c CVS log
Up to [local] / sys / netinet
Annotation of sys/netinet/tcp_output.c, Revision 1.1.1.1

1.1       nbrk        1: /*     $OpenBSD: tcp_output.c,v 1.80 2007/06/01 00:52:38 henning Exp $ */
                      2: /*     $NetBSD: tcp_output.c,v 1.16 1997/06/03 16:17:09 kml Exp $      */
                      3:
                      4: /*
                      5:  * Copyright (c) 1982, 1986, 1988, 1990, 1993
                      6:  *     The Regents of the University of California.  All rights reserved.
                      7:  *
                      8:  * Redistribution and use in source and binary forms, with or without
                      9:  * modification, are permitted provided that the following conditions
                     10:  * are met:
                     11:  * 1. Redistributions of source code must retain the above copyright
                     12:  *    notice, this list of conditions and the following disclaimer.
                     13:  * 2. Redistributions in binary form must reproduce the above copyright
                     14:  *    notice, this list of conditions and the following disclaimer in the
                     15:  *    documentation and/or other materials provided with the distribution.
                     16:  * 3. Neither the name of the University nor the names of its contributors
                     17:  *    may be used to endorse or promote products derived from this software
                     18:  *    without specific prior written permission.
                     19:  *
                     20:  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
                     21:  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
                     22:  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
                     23:  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
                     24:  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
                     25:  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
                     26:  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
                     27:  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
                     28:  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
                     29:  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
                     30:  * SUCH DAMAGE.
                     31:  *
                     32:  *     @(#)COPYRIGHT   1.1 (NRL) 17 January 1995
                     33:  *
                     34:  * NRL grants permission for redistribution and use in source and binary
                     35:  * forms, with or without modification, of the software and documentation
                     36:  * created at NRL provided that the following conditions are met:
                     37:  *
                     38:  * 1. Redistributions of source code must retain the above copyright
                     39:  *    notice, this list of conditions and the following disclaimer.
                     40:  * 2. Redistributions in binary form must reproduce the above copyright
                     41:  *    notice, this list of conditions and the following disclaimer in the
                     42:  *    documentation and/or other materials provided with the distribution.
                     43:  * 3. All advertising materials mentioning features or use of this software
                     44:  *    must display the following acknowledgements:
                     45:  *     This product includes software developed by the University of
                     46:  *     California, Berkeley and its contributors.
                     47:  *     This product includes software developed at the Information
                     48:  *     Technology Division, US Naval Research Laboratory.
                     49:  * 4. Neither the name of the NRL nor the names of its contributors
                     50:  *    may be used to endorse or promote products derived from this software
                     51:  *    without specific prior written permission.
                     52:  *
                     53:  * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
                     54:  * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
                     55:  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
                     56:  * PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL NRL OR
                     57:  * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
                     58:  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
                     59:  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
                     60:  * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
                     61:  * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
                     62:  * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
                     63:  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
                     64:  *
                     65:  * The views and conclusions contained in the software and documentation
                     66:  * are those of the authors and should not be interpreted as representing
                     67:  * official policies, either expressed or implied, of the US Naval
                     68:  * Research Laboratory (NRL).
                     69:  */
                     70:
                     71: #include <sys/param.h>
                     72: #include <sys/systm.h>
                     73: #include <sys/mbuf.h>
                     74: #include <sys/protosw.h>
                     75: #include <sys/socket.h>
                     76: #include <sys/socketvar.h>
                     77: #include <sys/kernel.h>
                     78:
                     79: #include <net/route.h>
                     80: #include <net/if.h>
                     81:
                     82: #include <netinet/in.h>
                     83: #include <netinet/in_systm.h>
                     84: #include <netinet/ip.h>
                     85: #include <netinet/in_pcb.h>
                     86: #include <netinet/ip_var.h>
                     87: #include <netinet/tcp.h>
                     88: #define        TCPOUTFLAGS
                     89: #include <netinet/tcp_fsm.h>
                     90: #include <netinet/tcp_seq.h>
                     91: #include <netinet/tcp_timer.h>
                     92: #include <netinet/tcp_var.h>
                     93: #include <netinet/tcpip.h>
                     94: #include <netinet/tcp_debug.h>
                     95:
                     96: #ifdef INET6
                     97: #include <netinet6/tcpipv6.h>
                     98: #include <netinet6/in6_var.h>
                     99: #endif /* INET6 */
                    100:
                    101: #ifdef notyet
                    102: extern struct mbuf *m_copypack();
                    103: #endif
                    104:
                    105: #ifdef TCP_SACK
                    106: extern int tcprexmtthresh;
                    107: #endif
                    108:
                    109: #ifdef TCP_SACK
                    110: #ifdef TCP_SACK_DEBUG
                    111: void tcp_print_holes(struct tcpcb *tp);
                    112: /* Debug aid (TCP_SACK_DEBUG only): printf() every hole on tp->snd_holes. */
                    113: void
                    114: tcp_print_holes(struct tcpcb *tp)
                    115: {
                    116:        struct sackhole *p = tp->snd_holes;
                    117:        if (p == 0)     /* empty hole list: print nothing at all */
                    118:                return;
                    119:        printf("Hole report: start--end dups rxmit\n");
                    120:        while (p) {     /* one output line per hole, following p->next */
                    121:                printf("%x--%x d %d r %x\n", p->start, p->end, p->dups,
                    122:                    p->rxmit);  /* dups in decimal, the rest in hex */
                    123:                p = p->next;
                    124:        }
                    125:        printf("\n");   /* blank line terminates the report */
                    126: }
                    127: #endif /* TCP_SACK_DEBUG */
                    128:
                    129: /*
                    130:  * Returns the first sackhole on tp->snd_holes that still has data pending
                    131:  * retransmission, skipping old holes; NULL if none or if SACK is disabled.
                    132:  */
                    133: struct sackhole *
                    134: tcp_sack_output(struct tcpcb *tp)
                    135: {
                    136:        struct sackhole *p;
                    137:
                    138:        if (!tp->sack_enable)
                    139:                return (NULL);
                    140:        p = tp->snd_holes;
                    141:        while (p) {
                    142: #ifndef TCP_FACK /* plain SACK: enough dups and rxmit short of the hole's end */
                    143:                if (p->dups >= tcprexmtthresh && SEQ_LT(p->rxmit, p->end)) {
                    144: #else
                    145:                /* In FACK, if p->dups is less than tcprexmtthresh, but
                    146:                 * snd_fack advances more than tcprexmtthresh * tp->t_maxseg,
                    147:                 * tcp_input() will try fast retransmit. This forces output.
                    148:                 */
                    149:                if ((p->dups >= tcprexmtthresh ||
                    150:                     tp->t_dupacks == tcprexmtthresh) &&
                    151:                    SEQ_LT(p->rxmit, p->end)) {
                    152: #endif /* TCP_FACK */
                    153:                        if (SEQ_LT(p->rxmit, tp->snd_una)) {/* old SACK hole */
                    154:                                p = p->next;    /* skip it, re-test loop */
                    155:                                continue;
                    156:                        }
                    157: #ifdef TCP_SACK_DEBUG
                    158:                        if (p)  /* always true here: still inside while (p) */
                    159:                                tcp_print_holes(tp);
                    160: #endif
                    161:                        return (p);
                    162:                }
                    163:                p = p->next;    /* this hole does not qualify; try the next */
                    164:        }
                    165:        return (NULL);  /* no hole qualifies */
                    166: }
                    167:
                    168: /*
                    169:  * After a timeout, the SACK list may be rebuilt.  This SACK information
                    170:  * should be used to avoid retransmitting SACKed data.  This function
                    171:  * traverses the SACK list to see if snd_nxt should be moved forward.
                    172:  */
                    173:
                    174: void
                    175: tcp_sack_adjust(struct tcpcb *tp)
                    176: {
                    177:        struct sackhole *cur = tp->snd_holes;
                    178:        if (cur == NULL)
                    179:                return; /* No holes */
                    180:        if (SEQ_GEQ(tp->snd_nxt, tp->rcv_lastsack))
                    181:                return; /* We're already beyond any SACKed blocks */
                    182:        /*
                    183:         * Two cases for which we want to advance snd_nxt:
                    184:         * i) snd_nxt lies between end of one hole and beginning of another
                    185:         * ii) snd_nxt lies between end of last hole and rcv_lastsack
                    186:         */
                    187:        while (cur->next) {     /* case i: examine each adjacent pair of holes */
                    188:                if (SEQ_LT(tp->snd_nxt, cur->end))
                    189:                        return; /* snd_nxt has not passed this hole's end */
                    190:                if (SEQ_GEQ(tp->snd_nxt, cur->next->start))
                    191:                        cur = cur->next;        /* at/after next hole's start */
                    192:                else {
                    193:                        tp->snd_nxt = cur->next->start; /* jump to next hole */
                    194:                        return;
                    195:                }
                    196:        }
                    197:        if (SEQ_LT(tp->snd_nxt, cur->end))      /* cur is now the last hole */
                    198:                return;
                    199:        tp->snd_nxt = tp->rcv_lastsack; /* case ii: past the last hole */
                    200:        return;
                    201: }
                    202: #endif /* TCP_SACK */
                    203:
                    204: /*
                    205:  * Tcp output routine: figure out what should be sent and send it.
                    206:  */
                    207: int
                    208: tcp_output(tp)
                    209:        struct tcpcb *tp;
                    210: {
                    211:        struct socket *so = tp->t_inpcb->inp_socket;
                    212:        long len, win, txmaxseg;
                    213:        int off, flags, error;
                    214:        struct mbuf *m;
                    215:        struct tcphdr *th;
                    216:        u_char opt[MAX_TCPOPTLEN];
                    217:        unsigned int optlen, hdrlen, packetlen;
                    218:        int idle, sendalot = 0;
                    219: #ifdef TCP_SACK
                    220:        int i, sack_rxmit = 0;
                    221:        struct sackhole *p;
                    222: #endif
                    223: #if defined(TCP_SACK)
                    224:        int maxburst = TCP_MAXBURST;
                    225: #endif
                    226: #ifdef TCP_SIGNATURE
                    227:        unsigned int sigoff;
                    228: #endif /* TCP_SIGNATURE */
                    229: #ifdef TCP_ECN
                    230:        int needect;
                    231: #endif
                    232:
                    233: #if defined(TCP_SACK) && defined(TCP_SIGNATURE) && defined(DIAGNOSTIC)
                    234:        if (tp->sack_enable && (tp->t_flags & TF_SIGNATURE))
                    235:                return (EINVAL);
                    236: #endif /* defined(TCP_SACK) && defined(TCP_SIGNATURE) && defined(DIAGNOSTIC) */
                    237:
                    238:        /*
                    239:         * Determine length of data that should be transmitted,
                    240:         * and flags that will be used.
                    241:         * If there is some data or critical controls (SYN, RST)
                    242:         * to send, then transmit; otherwise, investigate further.
                    243:         */
                    244:        idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
                    245:        if (idle && (tcp_now - tp->t_rcvtime) >= tp->t_rxtcur)
                    246:                /*
                    247:                 * We have been idle for "a while" and no acks are
                    248:                 * expected to clock out any data we send --
                    249:                 * slow start to get ack "clock" running again.
                    250:                 */
                    251:                tp->snd_cwnd = 2 * tp->t_maxseg;
                    252:
                    253:        /* remember 'idle' for next invocation of tcp_output */
                    254:        if (idle && soissending(so)) {
                    255:                tp->t_flags |= TF_LASTIDLE;
                    256:                idle = 0;
                    257:        } else
                    258:                tp->t_flags &= ~TF_LASTIDLE;
                    259:
                    260: again:
                    261: #ifdef TCP_SACK
                    262:        /*
                    263:         * If we've recently taken a timeout, snd_max will be greater than
                    264:         * snd_nxt.  There may be SACK information that allows us to avoid
                    265:         * resending already delivered data.  Adjust snd_nxt accordingly.
                    266:         */
                    267:        if (tp->sack_enable && SEQ_LT(tp->snd_nxt, tp->snd_max))
                    268:                tcp_sack_adjust(tp);
                    269: #endif
                    270:        off = tp->snd_nxt - tp->snd_una;
                    271: #if defined(TCP_SACK) && defined(TCP_FACK)
                    272:        /* Normally, sendable data is limited by off < tp->snd_cwnd.
                    273:         * But in FACK, sendable data is limited by snd_awnd < snd_cwnd,
                    274:         * regardless of offset.
                    275:         */
                    276:        if (tp->sack_enable && (tp->t_dupacks > tcprexmtthresh))
                    277:                win = tp->snd_wnd;
                    278:        else
                    279: #endif
                    280:        win = ulmin(tp->snd_wnd, tp->snd_cwnd);
                    281:
                    282:        flags = tcp_outflags[tp->t_state];
                    283:
                    284: #ifdef TCP_SACK
                    285:        /*
                    286:         * Send any SACK-generated retransmissions.  If we're explicitly trying
                    287:         * to send out new data (when sendalot is 1), bypass this function.
                    288:         * If we retransmit in fast recovery mode, decrement snd_cwnd, since
                    289:         * we're replacing a (future) new transmission with a retransmission
                    290:         * now, and we previously incremented snd_cwnd in tcp_input().
                    291:         */
                    292:        if (tp->sack_enable && !sendalot) {
                    293:                if (tp->t_dupacks >= tcprexmtthresh &&
                    294:                    (p = tcp_sack_output(tp))) {
                    295:                        off = p->rxmit - tp->snd_una;
                    296:                        sack_rxmit = 1;
                    297: #if 0
                    298:                        /* Coalesce holes into a single retransmission */
                    299: #endif
                    300:                        len = min(tp->t_maxseg, p->end - p->rxmit);
                    301: #ifndef TCP_FACK
                    302:                        /* in FACK, hold snd_cwnd constant during recovery */
                    303:                        if (SEQ_LT(tp->snd_una, tp->snd_last))
                    304:                                tp->snd_cwnd -= tp->t_maxseg;
                    305: #endif
                    306:                }
                    307:        }
                    308: #endif /* TCP_SACK */
                    309:
                    310:        sendalot = 0;
                    311:        /*
                    312:         * If in persist timeout with window of 0, send 1 byte.
                    313:         * Otherwise, if window is small but nonzero
                    314:         * and timer expired, we will send what we can
                    315:         * and go to transmit state.
                    316:         */
                    317:        if (tp->t_force) {
                    318:                if (win == 0) {
                    319:                        /*
                    320:                         * If we still have some data to send, then
                    321:                         * clear the FIN bit.  Usually this would
                    322:                         * happen below when it realizes that we
                    323:                         * aren't sending all the data.  However,
                    324:                         * if we have exactly 1 byte of unsent data,
                    325:                         * then it won't clear the FIN bit below,
                    326:                         * and if we are in persist state, we wind
                    327:                         * up sending the packet without recording
                    328:                         * that we sent the FIN bit.
                    329:                         *
                    330:                         * We can't just blindly clear the FIN bit,
                    331:                         * because if we don't have any more data
                    332:                         * to send then the probe will be the FIN
                    333:                         * itself.
                    334:                         */
                    335:                        if (off < so->so_snd.sb_cc)
                    336:                                flags &= ~TH_FIN;
                    337:                        win = 1;
                    338:                } else {
                    339:                        TCP_TIMER_DISARM(tp, TCPT_PERSIST);
                    340:                        tp->t_rxtshift = 0;
                    341:                }
                    342:        }
                    343:
                    344: #ifdef TCP_SACK
                    345:        if (!sack_rxmit) {
                    346: #endif
                    347:        len = ulmin(so->so_snd.sb_cc, win) - off;
                    348:
                    349: #if defined(TCP_SACK) && defined(TCP_FACK)
                    350:        /*
                    351:         * If we're in fast recovery (SEQ_GT(tp->snd_last, tp->snd_una)), and
                    352:         * amount of outstanding data (snd_awnd) is >= snd_cwnd, then
                    353:         * do not send data (like zero window conditions)
                    354:         */
                    355:        if (tp->sack_enable && len && SEQ_GT(tp->snd_last, tp->snd_una) &&
                    356:            (tp->snd_awnd >= tp->snd_cwnd))
                    357:                len = 0;
                    358: #endif /* TCP_FACK */
                    359: #ifdef TCP_SACK
                    360:        }
                    361: #endif
                    362:
                    363:        if (len < 0) {
                    364:                /*
                    365:                 * If FIN has been sent but not acked,
                    366:                 * but we haven't been called to retransmit,
                    367:                 * len will be -1.  Otherwise, window shrank
                    368:                 * after we sent into it.  If window shrank to 0,
                    369:                 * cancel pending retransmit, pull snd_nxt back
                    370:                 * to (closed) window, and set the persist timer
                    371:                 * if it isn't already going.  If the window didn't
                    372:                 * close completely, just wait for an ACK.
                    373:                 */
                    374:                len = 0;
                    375:                if (win == 0) {
                    376:                        TCP_TIMER_DISARM(tp, TCPT_REXMT);
                    377:                        tp->t_rxtshift = 0;
                    378:                        tp->snd_nxt = tp->snd_una;
                    379:                        if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
                    380:                                tcp_setpersist(tp);
                    381:                }
                    382:        }
                    383:
                    384:         /*
                    385:          * Never send more than half a buffer full.  This insures that we can
                    386:          * always keep 2 packets on the wire, no matter what SO_SNDBUF is, and
                    387:          * therefore acks will never be delayed unless we run out of data to
                    388:          * transmit.
                    389:          */
                    390:        txmaxseg = ulmin(so->so_snd.sb_hiwat / 2, tp->t_maxseg);
                    391:
                    392:        if (len > txmaxseg) {
                    393:                len = txmaxseg;
                    394:                sendalot = 1;
                    395:        }
                    396:        if (off + len < so->so_snd.sb_cc)
                    397:                flags &= ~TH_FIN;
                    398:
                    399:        win = sbspace(&so->so_rcv);
                    400:
                    401:        /*
                    402:         * Sender silly window avoidance.  If connection is idle
                    403:         * and can send all data, a maximum segment,
                    404:         * at least a maximum default-size segment do it,
                    405:         * or are forced, do it; otherwise don't bother.
                    406:         * If peer's buffer is tiny, then send
                    407:         * when window is at least half open.
                    408:         * If retransmitting (possibly after persist timer forced us
                    409:         * to send into a small window), then must resend.
                    410:         */
                    411:        if (len) {
                    412:                if (len == txmaxseg)
                    413:                        goto send;
                    414:                if ((idle || tp->t_flags & TF_NODELAY) &&
                    415:                    len + off >= so->so_snd.sb_cc && !soissending(so))
                    416:                        goto send;
                    417:                if (tp->t_force)
                    418:                        goto send;
                    419:                if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0)
                    420:                        goto send;
                    421:                if (SEQ_LT(tp->snd_nxt, tp->snd_max))
                    422:                        goto send;
                    423: #ifdef TCP_SACK
                    424:                if (sack_rxmit)
                    425:                        goto send;
                    426: #endif
                    427:        }
                    428:
                    429:        /*
                    430:         * Compare available window to amount of window
                    431:         * known to peer (as advertised window less
                    432:         * next expected input).  If the difference is at least two
                    433:         * max size segments, or at least 50% of the maximum possible
                    434:         * window, then want to send a window update to peer.
                    435:         */
                    436:        if (win > 0) {
                    437:                /*
                    438:                 * "adv" is the amount we can increase the window,
                    439:                 * taking into account that we are limited by
                    440:                 * TCP_MAXWIN << tp->rcv_scale.
                    441:                 */
                    442:                long adv = lmin(win, (long)TCP_MAXWIN << tp->rcv_scale) -
                    443:                        (tp->rcv_adv - tp->rcv_nxt);
                    444:
                    445:                if (adv >= (long) (2 * tp->t_maxseg))
                    446:                        goto send;
                    447:                if (2 * adv >= (long) so->so_rcv.sb_hiwat)
                    448:                        goto send;
                    449:        }
                    450:
                    451:        /*
                    452:         * Send if we owe peer an ACK.
                    453:         */
                    454:        if (tp->t_flags & TF_ACKNOW)
                    455:                goto send;
                    456:        if (flags & (TH_SYN|TH_RST))
                    457:                goto send;
                    458:        if (SEQ_GT(tp->snd_up, tp->snd_una))
                    459:                goto send;
                    460:        /*
                    461:         * If our state indicates that FIN should be sent
                    462:         * and we have not yet done so, or we're retransmitting the FIN,
                    463:         * then we need to send.
                    464:         */
                    465:        if (flags & TH_FIN &&
                    466:            ((tp->t_flags & TF_SENTFIN) == 0 || tp->snd_nxt == tp->snd_una))
                    467:                goto send;
                    468: #ifdef TCP_SACK
                    469:        /*
                    470:         * In SACK, it is possible for tcp_output to fail to send a segment
                    471:         * after the retransmission timer has been turned off.  Make sure
                    472:         * that the retransmission timer is set.
                    473:         */
                    474:        if (SEQ_GT(tp->snd_max, tp->snd_una) &&
                    475:            TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 &&
                    476:            TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) {
                    477:                TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
                    478:                return (0);
                    479:        }
                    480: #endif /* TCP_SACK */
                    481:
                    482:        /*
                    483:         * TCP window updates are not reliable, rather a polling protocol
                    484:         * using ``persist'' packets is used to insure receipt of window
                    485:         * updates.  The three ``states'' for the output side are:
                    486:         *      idle                    not doing retransmits or persists
                    487:         *      persisting              to move a small or zero window
                    488:         *      (re)transmitting        and thereby not persisting
                    489:         *
                    490:         * tp->t_timer[TCPT_PERSIST]
                    491:         *      is set when we are in persist state.
                    492:         * tp->t_force
                    493:         *      is set when we are called to send a persist packet.
                    494:         * tp->t_timer[TCPT_REXMT]
                    495:         *      is set when we are retransmitting
                    496:         * The output side is idle when both timers are zero.
                    497:         *
                    498:         * If send window is too small, there is data to transmit, and no
                    499:         * retransmit or persist is pending, then go to persist state.
                    500:         * If nothing happens soon, send when timer expires:
                    501:         * if window is nonzero, transmit what we can,
                    502:         * otherwise force out a byte.
                    503:         */
                    504:        if (so->so_snd.sb_cc && TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 &&
                    505:            TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) {
                    506:                tp->t_rxtshift = 0;
                    507:                tcp_setpersist(tp);
                    508:        }
                    509:
                    510:        /*
                    511:         * No reason to send a segment, just return.
                    512:         */
                    513:        return (0);
                    514:
                    515: send:
                    516:        /*
                    517:         * Before ESTABLISHED, force sending of initial options
                    518:         * unless TCP set not to do any options.
                    519:         * NOTE: we assume that the IP/TCP header plus TCP options
                    520:         * always fit in a single mbuf, leaving room for a maximum
                    521:         * link header, i.e.
                    522:         *      max_linkhdr + sizeof(network header) + sizeof(struct tcphdr) +
                    523:         *              optlen <= MHLEN
                    524:         */
                    525:        optlen = 0;
                    526:
                    527:        switch (tp->pf) {
                    528:        case 0: /*default to PF_INET*/
                    529: #ifdef INET
                    530:        case PF_INET:
                    531:                hdrlen = sizeof(struct ip) + sizeof(struct tcphdr);
                    532:                break;
                    533: #endif /* INET */
                    534: #ifdef INET6
                    535:        case PF_INET6:
                    536:                hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
                    537:                break;
                    538: #endif /* INET6 */
                    539:        default:
                    540:                return (EPFNOSUPPORT);
                    541:        }
                    542:
                    543:        if (flags & TH_SYN) {
                    544:                tp->snd_nxt = tp->iss;
                    545:                if ((tp->t_flags & TF_NOOPT) == 0) {
                    546:                        u_int16_t mss;
                    547:
                    548:                        opt[0] = TCPOPT_MAXSEG;
                    549:                        opt[1] = 4;
                    550:                        mss = htons((u_int16_t) tcp_mss(tp, 0));
                    551:                        bcopy((caddr_t)&mss, (caddr_t)(opt + 2), sizeof(mss));
                    552:                        optlen = 4;
                    553:
                    554:                        if (flags & TH_ACK)
                    555:                                tcp_mss_update(tp);
                    556: #ifdef TCP_SACK
                    557:                        /*
                    558:                         * If this is the first SYN of connection (not a SYN
                    559:                         * ACK), include SACK_PERMIT_HDR option.  If this is a
                    560:                         * SYN ACK, include SACK_PERMIT_HDR option if peer has
                    561:                         * already done so.
                    562:                         */
                    563:                        if (tp->sack_enable && ((flags & TH_ACK) == 0 ||
                    564:                            (tp->t_flags & TF_SACK_PERMIT))) {
                    565:                                *((u_int32_t *) (opt + optlen)) =
                    566:                                    htonl(TCPOPT_SACK_PERMIT_HDR);
                    567:                                optlen += 4;
                    568:                        }
                    569: #endif
                    570:
                    571:                        if ((tp->t_flags & TF_REQ_SCALE) &&
                    572:                            ((flags & TH_ACK) == 0 ||
                    573:                            (tp->t_flags & TF_RCVD_SCALE))) {
                    574:                                *((u_int32_t *) (opt + optlen)) = htonl(
                    575:                                        TCPOPT_NOP << 24 |
                    576:                                        TCPOPT_WINDOW << 16 |
                    577:                                        TCPOLEN_WINDOW << 8 |
                    578:                                        tp->request_r_scale);
                    579:                                optlen += 4;
                    580:                        }
                    581:                }
                    582:        }
                    583:
                    584:        /*
                    585:         * Send a timestamp and echo-reply if this is a SYN and our side
                    586:         * wants to use timestamps (TF_REQ_TSTMP is set) or both our side
                    587:         * and our peer have sent timestamps in our SYN's.
                    588:         */
                    589:        if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
                    590:             (flags & TH_RST) == 0 &&
                    591:            ((flags & (TH_SYN|TH_ACK)) == TH_SYN ||
                    592:             (tp->t_flags & TF_RCVD_TSTMP))) {
                    593:                u_int32_t *lp = (u_int32_t *)(opt + optlen);
                    594:
                    595:                /* Form timestamp option as shown in appendix A of RFC 1323. */
                    596:                *lp++ = htonl(TCPOPT_TSTAMP_HDR);
                    597:                *lp++ = htonl(tcp_now + tp->ts_modulate);
                    598:                *lp   = htonl(tp->ts_recent);
                    599:                optlen += TCPOLEN_TSTAMP_APPA;
                    600:        }
                    601:
                    602: #ifdef TCP_SIGNATURE
                    603:        if (tp->t_flags & TF_SIGNATURE) {
                    604:                u_int8_t *bp = (u_int8_t *)(opt + optlen);
                    605:
                    606:                /* Send signature option */
                    607:                *(bp++) = TCPOPT_SIGNATURE;
                    608:                *(bp++) = TCPOLEN_SIGNATURE;
                    609:                sigoff = optlen + 2;
                    610:
                    611:                {
                    612:                        unsigned int i;
                    613:
                    614:                        for (i = 0; i < 16; i++)
                    615:                                *(bp++) = 0;
                    616:                }
                    617:
                    618:
                    619:                /* Pad options list to the next 32 bit boundary and
                    620:                 * terminate it.
                    621:                 */
                    622:                *bp++ = TCPOPT_NOP;
                    623:                *bp++ = TCPOPT_EOL;
                    624:
                    625:                optlen += TCPOLEN_SIGLEN;
                    626:        }
                    627: #endif /* TCP_SIGNATURE */
                    628:
                    629: #ifdef TCP_SACK
                    630:        /*
                    631:         * Send SACKs if necessary.  This should be the last option processed.
                    632:         * Only as many SACKs are sent as are permitted by the maximum options
                    633:         * size.  No more than three SACKs are sent.
                    634:         */
                    635:        if (tp->sack_enable && tp->t_state == TCPS_ESTABLISHED &&
                    636:            (tp->t_flags & (TF_SACK_PERMIT|TF_NOOPT)) == TF_SACK_PERMIT &&
                    637:            tp->rcv_numsacks) {
                    638:                u_int32_t *lp = (u_int32_t *)(opt + optlen);
                    639:                u_int32_t *olp = lp++;
                    640:                int count = 0;  /* actual number of SACKs inserted */
                    641:                int maxsack = (MAX_TCPOPTLEN - (optlen + 4))/TCPOLEN_SACK;
                    642:
                    643:                tcpstat.tcps_sack_snd_opts++;
                    644:                maxsack = min(maxsack, TCP_MAX_SACK);
                    645:                for (i = 0; (i < tp->rcv_numsacks && count < maxsack); i++) {
                    646:                        struct sackblk sack = tp->sackblks[i];
                    647:                        if (sack.start == 0 && sack.end == 0)
                    648:                                continue;
                    649:                        *lp++ = htonl(sack.start);
                    650:                        *lp++ = htonl(sack.end);
                    651:                        count++;
                    652:                }
                    653:                *olp = htonl(TCPOPT_SACK_HDR|(TCPOLEN_SACK*count+2));
                    654:                optlen += TCPOLEN_SACK*count + 4; /* including leading NOPs */
                    655:        }
                    656: #endif /* TCP_SACK */
                    657:
                    658: #ifdef DIAGNOSTIC
                    659:        if (optlen > MAX_TCPOPTLEN)
                    660:                panic("tcp_output: options too long");
                    661: #endif /* DIAGNOSTIC */
                    662:
                    663:        hdrlen += optlen;
                    664:
                    665:        /*
                    666:         * Adjust data length if insertion of options will
                    667:         * bump the packet length beyond the t_maxopd length.
                    668:         */
                    669:        if (len > tp->t_maxopd - optlen) {
                    670:                len = tp->t_maxopd - optlen;
                    671:                sendalot = 1;
                    672:                flags &= ~TH_FIN;
                    673:         }
                    674:
                    675: #ifdef DIAGNOSTIC
                    676:        if (max_linkhdr + hdrlen > MCLBYTES)
                    677:                panic("tcphdr too big");
                    678: #endif
                    679:
                    680:        /*
                    681:         * Grab a header mbuf, attaching a copy of data to
                    682:         * be transmitted, and initialize the header from
                    683:         * the template for sends on this connection.
                    684:         */
                    685:        if (len) {
                    686:                if (tp->t_force && len == 1)
                    687:                        tcpstat.tcps_sndprobe++;
                    688:                else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
                    689:                        tcpstat.tcps_sndrexmitpack++;
                    690:                        tcpstat.tcps_sndrexmitbyte += len;
                    691:                } else {
                    692:                        tcpstat.tcps_sndpack++;
                    693:                        tcpstat.tcps_sndbyte += len;
                    694:                }
                    695: #ifdef notyet
                    696:                if ((m = m_copypack(so->so_snd.sb_mb, off,
                    697:                    (int)len, max_linkhdr + hdrlen)) == 0) {
                    698:                        error = ENOBUFS;
                    699:                        goto out;
                    700:                }
                    701:                /*
                    702:                 * m_copypack left space for our hdr; use it.
                    703:                 */
                    704:                m->m_len += hdrlen;
                    705:                m->m_data -= hdrlen;
                    706: #else
                    707:                MGETHDR(m, M_DONTWAIT, MT_HEADER);
                    708:                if (m != NULL && max_linkhdr + hdrlen > MHLEN) {
                    709:                        MCLGET(m, M_DONTWAIT);
                    710:                        if ((m->m_flags & M_EXT) == 0) {
                    711:                                m_freem(m);
                    712:                                m = NULL;
                    713:                        }
                    714:                }
                    715:                if (m == NULL) {
                    716:                        error = ENOBUFS;
                    717:                        goto out;
                    718:                }
                    719:                m->m_data += max_linkhdr;
                    720:                m->m_len = hdrlen;
                    721:                if (len <= M_TRAILINGSPACE(m)) {
                    722:                        m_copydata(so->so_snd.sb_mb, off, (int) len,
                    723:                            mtod(m, caddr_t) + hdrlen);
                    724:                        m->m_len += len;
                    725:                } else {
                    726:                        m->m_next = m_copy(so->so_snd.sb_mb, off, (int) len);
                    727:                        if (m->m_next == 0) {
                    728:                                (void) m_free(m);
                    729:                                error = ENOBUFS;
                    730:                                goto out;
                    731:                        }
                    732:                }
                    733: #endif
                    734:                /*
                    735:                 * If we're sending everything we've got, set PUSH.
                    736:                 * (This will keep happy those implementations which only
                    737:                 * give data to the user when a buffer fills or
                    738:                 * a PUSH comes in.)
                    739:                 */
                    740:                if (off + len == so->so_snd.sb_cc && !soissending(so))
                    741:                        flags |= TH_PUSH;
                    742:        } else {
                    743:                if (tp->t_flags & TF_ACKNOW)
                    744:                        tcpstat.tcps_sndacks++;
                    745:                else if (flags & (TH_SYN|TH_FIN|TH_RST))
                    746:                        tcpstat.tcps_sndctrl++;
                    747:                else if (SEQ_GT(tp->snd_up, tp->snd_una))
                    748:                        tcpstat.tcps_sndurg++;
                    749:                else
                    750:                        tcpstat.tcps_sndwinup++;
                    751:
                    752:                MGETHDR(m, M_DONTWAIT, MT_HEADER);
                    753:                if (m != NULL && max_linkhdr + hdrlen > MHLEN) {
                    754:                        MCLGET(m, M_DONTWAIT);
                    755:                        if ((m->m_flags & M_EXT) == 0) {
                    756:                                m_freem(m);
                    757:                                m = NULL;
                    758:                        }
                    759:                }
                    760:                if (m == NULL) {
                    761:                        error = ENOBUFS;
                    762:                        goto out;
                    763:                }
                    764:                m->m_data += max_linkhdr;
                    765:                m->m_len = hdrlen;
                    766:        }
                    767:        m->m_pkthdr.rcvif = (struct ifnet *)0;
                    768:        m->m_pkthdr.len = hdrlen + len;
                    769:
                    770:        if (!tp->t_template)
                    771:                panic("tcp_output");
                    772: #ifdef DIAGNOSTIC
                    773:        if (tp->t_template->m_len != hdrlen - optlen)
                    774:                panic("tcp_output: template len != hdrlen - optlen");
                    775: #endif /* DIAGNOSTIC */
                    776:        bcopy(mtod(tp->t_template, caddr_t), mtod(m, caddr_t),
                    777:                tp->t_template->m_len);
                    778:        th = (struct tcphdr *)(mtod(m, caddr_t) + tp->t_template->m_len -
                    779:                sizeof(struct tcphdr));
                    780:
                    781:        /*
                    782:         * Fill in fields, remembering maximum advertised
                    783:         * window for use in delaying messages about window sizes.
                    784:         * If resending a FIN, be sure not to use a new sequence number.
                    785:         */
                    786:        if ((flags & TH_FIN) && (tp->t_flags & TF_SENTFIN) &&
                    787:            (tp->snd_nxt == tp->snd_max))
                    788:                tp->snd_nxt--;
                    789:        /*
                    790:         * If we are doing retransmissions, then snd_nxt will
                    791:         * not reflect the first unsent octet.  For ACK only
                    792:         * packets, we do not want the sequence number of the
                    793:         * retransmitted packet, we want the sequence number
                    794:         * of the next unsent octet.  So, if there is no data
                    795:         * (and no SYN or FIN), use snd_max instead of snd_nxt
                    796:         * when filling in th_seq.  But if we are in persist
                    797:         * state, snd_max might reflect one byte beyond the
                    798:         * right edge of the window, so use snd_nxt in that
                    799:         * case, since we know we aren't doing a retransmission.
                    800:         * (retransmit and persist are mutually exclusive...)
                    801:         */
                    802:        if (len || (flags & (TH_SYN|TH_FIN)) || TCP_TIMER_ISARMED(tp, TCPT_PERSIST))
                    803:                th->th_seq = htonl(tp->snd_nxt);
                    804:        else
                    805:                th->th_seq = htonl(tp->snd_max);
                    806:
                    807: #ifdef TCP_SACK
                    808:        if (sack_rxmit) {
                    809:                /*
                    810:                 * If sendalot was turned on (due to option stuffing), turn it
                    811:                 * off. Properly set th_seq field.  Advance the ret'x pointer
                    812:                 * by len.
                    813:                 */
                    814:                if (sendalot)
                    815:                        sendalot = 0;
                    816:                th->th_seq = htonl(p->rxmit);
                    817:                p->rxmit += len;
                    818: #if defined(TCP_SACK) && defined(TCP_FACK)
                    819:                tp->retran_data += len;
                    820: #endif /* TCP_FACK */
                    821:                tcpstat.tcps_sack_rexmits++;
                    822:                tcpstat.tcps_sack_rexmit_bytes += len;
                    823:        }
                    824: #endif /* TCP_SACK */
                    825:
                    826:        th->th_ack = htonl(tp->rcv_nxt);
                    827:        if (optlen) {
                    828:                bcopy((caddr_t)opt, (caddr_t)(th + 1), optlen);
                    829:                th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
                    830:        }
                    831: #ifdef TCP_ECN
                    832:        if (tcp_do_ecn) {
                    833:                /*
                    834:                 * if we have received congestion experienced segs,
                    835:                 * set ECE bit.
                    836:                 */
                    837:                if (tp->t_flags & TF_RCVD_CE) {
                    838:                        flags |= TH_ECE;
                    839:                        tcpstat.tcps_ecn_sndece++;
                    840:                }
                    841:                if (!(tp->t_flags & TF_DISABLE_ECN)) {
                    842:                        /*
                    843:                         * if this is a SYN seg, set ECE and CWR.
                    844:                         * set only ECE for SYN-ACK if peer supports ECN.
                    845:                         */
                    846:                        if ((flags & (TH_SYN|TH_ACK)) == TH_SYN)
                    847:                                flags |= (TH_ECE|TH_CWR);
                    848:                        else if ((tp->t_flags & TF_ECN_PERMIT) &&
                    849:                                 (flags & (TH_SYN|TH_ACK)) == (TH_SYN|TH_ACK))
                    850:                                flags |= TH_ECE;
                    851:                }
                    852:                /*
                    853:                 * if we have reduced the congestion window, notify
                    854:                 * the peer by setting CWR bit.
                    855:                 */
                    856:                if ((tp->t_flags & TF_ECN_PERMIT) &&
                    857:                    (tp->t_flags & TF_SEND_CWR)) {
                    858:                        flags |= TH_CWR;
                    859:                        tp->t_flags &= ~TF_SEND_CWR;
                    860:                        tcpstat.tcps_ecn_sndcwr++;
                    861:                }
                    862:        }
                    863: #endif
                    864:        th->th_flags = flags;
                    865:
                    866:        /*
                    867:         * Calculate receive window.  Don't shrink window,
                    868:         * but avoid silly window syndrome.
                    869:         */
                    870:        if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)tp->t_maxseg)
                    871:                win = 0;
                    872:        if (win > (long)TCP_MAXWIN << tp->rcv_scale)
                    873:                win = (long)TCP_MAXWIN << tp->rcv_scale;
                    874:        if (win < (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt))
                    875:                win = (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt);
                    876:        if (flags & TH_RST)
                    877:                win = 0;
                    878:        th->th_win = htons((u_int16_t) (win>>tp->rcv_scale));
                    879:        if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
                    880:                u_int32_t urp = tp->snd_up - tp->snd_nxt;
                    881:                if (urp > IP_MAXPACKET)
                    882:                        urp = IP_MAXPACKET;
                    883:                th->th_urp = htons((u_int16_t)urp);
                    884:                th->th_flags |= TH_URG;
                    885:        } else
                    886:                /*
                    887:                 * If no urgent pointer to send, then we pull
                    888:                 * the urgent pointer to the left edge of the send window
                    889:                 * so that it doesn't drift into the send window on sequence
                    890:                 * number wraparound.
                    891:                 */
                    892:                tp->snd_up = tp->snd_una;               /* drag it along */
                    893:
                    894: #ifdef TCP_SIGNATURE
                    895:        if (tp->t_flags & TF_SIGNATURE) {
                    896:                int iphlen;
                    897:                union sockaddr_union src, dst;
                    898:                struct tdb *tdb;
                    899:
                    900:                bzero(&src, sizeof(union sockaddr_union));
                    901:                bzero(&dst, sizeof(union sockaddr_union));
                    902:
                    903:                switch (tp->pf) {
                    904:                case 0: /*default to PF_INET*/
                    905: #ifdef INET
                    906:                case AF_INET:
                    907:                        iphlen = sizeof(struct ip);
                    908:                        src.sa.sa_len = sizeof(struct sockaddr_in);
                    909:                        src.sa.sa_family = AF_INET;
                    910:                        src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
                    911:                        dst.sa.sa_len = sizeof(struct sockaddr_in);
                    912:                        dst.sa.sa_family = AF_INET;
                    913:                        dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
                    914:                        break;
                    915: #endif /* INET */
                    916: #ifdef INET6
                    917:                case AF_INET6:
                    918:                        iphlen = sizeof(struct ip6_hdr);
                    919:                        src.sa.sa_len = sizeof(struct sockaddr_in6);
                    920:                        src.sa.sa_family = AF_INET6;
                    921:                        src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
                    922:                        dst.sa.sa_len = sizeof(struct sockaddr_in6);
                    923:                        dst.sa.sa_family = AF_INET6;
                    924:                        dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
                    925:                        break;
                    926: #endif /* INET6 */
                    927:                }
                    928:
                    929:                /* XXX gettdbbysrcdst() should really be called at spltdb(). */
                    930:                /* XXX this is splsoftnet(), currently they are the same. */
                    931:                tdb = gettdbbysrcdst(0, &src, &dst, IPPROTO_TCP);
                    932:                if (tdb == NULL)
                    933:                        return (EPERM);
                    934:
                    935:                if (tcp_signature(tdb, tp->pf, m, th, iphlen, 0,
                    936:                    mtod(m, caddr_t) + hdrlen - optlen + sigoff) < 0)
                    937:                        return (EINVAL);
                    938:        }
                    939: #endif /* TCP_SIGNATURE */
                    940:
                    941:        /*
                    942:         * Put TCP length in extended header, and then
                    943:         * checksum extended header and data.
                    944:         */
                    945:        switch (tp->pf) {
                    946:        case 0: /*default to PF_INET*/
                    947: #ifdef INET
                    948:        case AF_INET:
                    949:                /* Defer checksumming until later (ip_output() or hardware) */
                    950:                m->m_pkthdr.csum_flags |= M_TCPV4_CSUM_OUT;
                    951:                if (len + optlen)
                    952:                        th->th_sum = in_cksum_addword(th->th_sum,
                    953:                            htons((u_int16_t)(len + optlen)));
                    954:                break;
                    955: #endif /* INET */
                    956: #ifdef INET6
                    957:        case AF_INET6:
                    958:                th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
                    959:                        hdrlen - sizeof(struct ip6_hdr) + len);
                    960:                break;
                    961: #endif /* INET6 */
                    962:        }
                    963:
                    964:        /*
                    965:         * In transmit state, time the transmission and arrange for
                    966:         * the retransmit.  In persist state, just set snd_max.
                    967:         */
                    968:        if (tp->t_force == 0 || TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) {
                    969:                tcp_seq startseq = tp->snd_nxt;
                    970:
                    971:                /*
                    972:                 * Advance snd_nxt over sequence space of this segment.
                    973:                 */
                    974:                if (flags & (TH_SYN|TH_FIN)) {
                    975:                        if (flags & TH_SYN)
                    976:                                tp->snd_nxt++;
                    977:                        if (flags & TH_FIN) {
                    978:                                tp->snd_nxt++;
                    979:                                tp->t_flags |= TF_SENTFIN;
                    980:                        }
                    981:                }
                    982: #ifdef TCP_SACK
                    983:                if (tp->sack_enable) {
                    984:                        if (sack_rxmit && (p->rxmit != tp->snd_nxt)) {
                    985:                                goto timer;
                    986:                        }
                    987:                }
                    988: #endif
                    989:                tp->snd_nxt += len;
                    990:                if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
                    991:                        tp->snd_max = tp->snd_nxt;
                    992:                        /*
                    993:                         * Time this transmission if not a retransmission and
                    994:                         * not currently timing anything.
                    995:                         */
                    996:                        if (tp->t_rtttime == 0) {
                    997:                                tp->t_rtttime = tcp_now;
                    998:                                tp->t_rtseq = startseq;
                    999:                                tcpstat.tcps_segstimed++;
                   1000:                        }
                   1001:                }
                   1002:
                   1003:                /*
                   1004:                 * Set retransmit timer if not currently set,
                   1005:                 * and not doing an ack or a keep-alive probe.
                   1006:                 * Initial value for retransmit timer is smoothed
                   1007:                 * round-trip time + 2 * round-trip time variance.
                   1008:                 * Initialize shift counter which is used for backoff
                   1009:                 * of retransmit time.
                   1010:                 */
                   1011: #ifdef TCP_SACK
                   1012:  timer:
                   1013:                if (tp->sack_enable && sack_rxmit &&
                   1014:                    TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 &&
                   1015:                    tp->snd_nxt != tp->snd_max) {
                   1016:                        TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
                   1017:                        if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) {
                   1018:                                TCP_TIMER_DISARM(tp, TCPT_PERSIST);
                   1019:                                tp->t_rxtshift = 0;
                   1020:                        }
                   1021:                }
                   1022: #endif
                   1023:
                   1024:                if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 &&
                   1025:                    tp->snd_nxt != tp->snd_una) {
                   1026:                        TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
                   1027:                        if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) {
                   1028:                                TCP_TIMER_DISARM(tp, TCPT_PERSIST);
                   1029:                                tp->t_rxtshift = 0;
                   1030:                        }
                   1031:                }
                   1032:        } else
                   1033:                if (SEQ_GT(tp->snd_nxt + len, tp->snd_max))
                   1034:                        tp->snd_max = tp->snd_nxt + len;
                   1035:
                   1036:        /*
                   1037:         * Trace.
                   1038:         */
                   1039:        if (so->so_options & SO_DEBUG)
                   1040:                tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, caddr_t), 0,
                   1041:                        len);
                   1042:
                   1043:        /*
                   1044:         * Fill in IP length and desired time to live and
                   1045:         * send to IP level.  There should be a better way
                   1046:         * to handle ttl and tos; we could keep them in
                   1047:         * the template, but need a way to checksum without them.
                   1048:         */
                   1049:
                   1050: #ifdef TCP_ECN
                   1051:        /*
                   1052:         * if peer is ECN capable, set the ECT bit in the IP header.
                   1053:         * but don't set ECT for a pure ack, a retransmit or a window probe.
                   1054:         */
                   1055:        needect = 0;
                   1056:        if (tcp_do_ecn && (tp->t_flags & TF_ECN_PERMIT)) {
                   1057:                if (len == 0 || SEQ_LT(tp->snd_nxt, tp->snd_max) ||
                   1058:                    (tp->t_force && len == 1)) {
                   1059:                        /* don't set ECT */
                   1060:                } else {
                   1061:                        needect = 1;
                   1062:                        tcpstat.tcps_ecn_sndect++;
                   1063:                }
                   1064:        }
                   1065: #endif
                   1066:
                   1067:        switch (tp->pf) {
                   1068:        case 0: /*default to PF_INET*/
                   1069: #ifdef INET
                   1070:        case AF_INET:
                   1071:                {
                   1072:                        struct ip *ip;
                   1073:
                   1074:                        ip = mtod(m, struct ip *);
                   1075:                        ip->ip_len = htons(m->m_pkthdr.len);
                   1076:                        packetlen = m->m_pkthdr.len;
                   1077:                        ip->ip_ttl = tp->t_inpcb->inp_ip.ip_ttl;
                   1078:                        ip->ip_tos = tp->t_inpcb->inp_ip.ip_tos;
                   1079: #ifdef TCP_ECN
                   1080:                        if (needect)
                   1081:                                ip->ip_tos |= IPTOS_ECN_ECT0;
                   1082: #endif
                   1083:                }
                   1084:                error = ip_output(m, tp->t_inpcb->inp_options,
                   1085:                        &tp->t_inpcb->inp_route,
                   1086:                        (ip_mtudisc ? IP_MTUDISC : 0) |
                   1087:                                  (so->so_options & SO_DONTROUTE),
                   1088:                        (void *)NULL, tp->t_inpcb);
                   1089:                break;
                   1090: #endif /* INET */
                   1091: #ifdef INET6
                   1092:        case AF_INET6:
                   1093:                {
                   1094:                        struct ip6_hdr *ip6;
                   1095:
                   1096:                        ip6 = mtod(m, struct ip6_hdr *);
                   1097:                        ip6->ip6_plen = m->m_pkthdr.len -
                   1098:                                sizeof(struct ip6_hdr);
                   1099:                        packetlen = m->m_pkthdr.len;
                   1100:                        ip6->ip6_nxt = IPPROTO_TCP;
                   1101:                        ip6->ip6_hlim = in6_selecthlim(tp->t_inpcb, NULL);
                   1102: #ifdef TCP_ECN
                   1103:                        if (needect)
                   1104:                                ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20);
                   1105: #endif
                   1106:                }
                   1107:                error = ip6_output(m, tp->t_inpcb->inp_outputopts6,
                   1108:                          &tp->t_inpcb->inp_route6,
                   1109:                          (so->so_options & SO_DONTROUTE), NULL, NULL,
                   1110:                          tp->t_inpcb);
                   1111:                break;
                   1112: #endif /* INET6 */
                   1113:        }
                   1114:
                   1115: #if defined(TCP_SACK) && defined(TCP_FACK)
                   1116:        /* Update snd_awnd to reflect the new data that was sent.  */
                   1117:        tp->snd_awnd = tcp_seq_subtract(tp->snd_max, tp->snd_fack) +
                   1118:                tp->retran_data;
                   1119: #endif /* defined(TCP_SACK) && defined(TCP_FACK) */
                   1120:
                   1121:        if (error) {
                   1122: out:
                   1123:                if (error == ENOBUFS) {
                   1124:                        /*
                   1125:                         * If the interface queue is full, or IP cannot
                   1126:                         * get an mbuf, trigger TCP slow start.
                   1127:                         */
                   1128:                        tp->snd_cwnd = tp->t_maxseg;
                   1129:                        return (0);
                   1130:                }
                   1131:                if (error == EMSGSIZE) {
                   1132:                        /*
                   1133:                         * ip_output() will have already fixed the route
                   1134:                         * for us.  tcp_mtudisc() will, as its last action,
                   1135:                         * initiate retransmission, so it is important to
                   1136:                         * not do so here.
                   1137:                         */
                   1138:                        tcp_mtudisc(tp->t_inpcb, -1);
                   1139:                        return (0);
                   1140:                }
                   1141:                if ((error == EHOSTUNREACH || error == ENETDOWN) &&
                   1142:                    TCPS_HAVERCVDSYN(tp->t_state)) {
                   1143:                        tp->t_softerror = error;
                   1144:                        return (0);
                   1145:                }
                   1146:
                   1147:                /* Restart the delayed ACK timer, if necessary. */
                   1148:                if (tp->t_flags & TF_DELACK)
                   1149:                        TCP_RESTART_DELACK(tp);
                   1150:
                   1151:                return (error);
                   1152:        }
                   1153:
                   1154:        if (packetlen > tp->t_pmtud_mtu_sent)
                   1155:                tp->t_pmtud_mtu_sent = packetlen;
                   1156:
                   1157:        tcpstat.tcps_sndtotal++;
                   1158:        if (tp->t_flags & TF_DELACK)
                   1159:                tcpstat.tcps_delack++;
                   1160:
                   1161:        /*
                   1162:         * Data sent (as far as we can tell).
                   1163:         * If this advertises a larger window than any other segment,
                   1164:         * then remember the size of the advertised window.
                   1165:         * Any pending ACK has now been sent.
                   1166:         */
                   1167:        if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv))
                   1168:                tp->rcv_adv = tp->rcv_nxt + win;
                   1169:        tp->last_ack_sent = tp->rcv_nxt;
                   1170:        tp->t_flags &= ~TF_ACKNOW;
                   1171:        TCP_CLEAR_DELACK(tp);
                   1172: #if defined(TCP_SACK)
                   1173:        if (sendalot && --maxburst)
                   1174: #else
                   1175:        if (sendalot)
                   1176: #endif
                   1177:                goto again;
                   1178:        return (0);
                   1179: }
                   1180:
                         /*
                          * Arm the persist timer (TCPT_PERSIST) on tp and step the backoff shift.
                          *
                          * The interval starts from a value derived from tp->t_srtt and
                          * tp->t_rttvar, is raised to at least tp->t_rttmin, and is then
                          * multiplied by tcp_backoff[tp->t_rxtshift] before being passed
                          * through TCPT_RANGESET together with TCPTV_PERSMIN and TCPTV_PERSMAX.
                          * Panics if the retransmit timer (TCPT_REXMT) is armed on entry.
                          *
                          * NOTE(review): TCPT_RANGESET and tcp_backoff are defined outside this
                          * function; presumably the macro clamps nticks to the given bounds --
                          * confirm against their definitions.
                          */
                   1181: void
                   1182: tcp_setpersist(struct tcpcb *tp)
                   1183: {
                                /*
                                 * Base interval from the smoothed RTT and its variance.
                                 * NOTE(review): the shifts presumably remove the fixed-point
                                 * scaling of t_srtt/t_rttvar -- confirm against tcp_var.h.
                                 */
                   1184:        int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> (1 + TCP_RTT_BASE_SHIFT);
                   1185:        int nticks;
                   1186:
                                /*
                                 * The caller must not have the retransmit timer armed;
                                 * this is treated as a fatal error.
                                 */
                   1187:        if (TCP_TIMER_ISARMED(tp, TCPT_REXMT))
                   1188:                panic("tcp_output REXMT");
                   1189:        /*
                   1190:         * Start/restart persistence timer.
                   1191:         */
                                /* Never use a base interval below the connection's t_rttmin. */
                   1192:        if (t < tp->t_rttmin)
                   1193:                t = tp->t_rttmin;
                                /* Scale by the backoff factor selected by the current shift. */
                   1194:        TCPT_RANGESET(nticks, t * tcp_backoff[tp->t_rxtshift],
                   1195:            TCPTV_PERSMIN, TCPTV_PERSMAX);
                   1196:        TCP_TIMER_ARM(tp, TCPT_PERSIST, nticks);
                                /*
                                 * Advance the backoff index for the next call, but never
                                 * past TCP_MAXRXTSHIFT.
                                 */
                   1197:        if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
                   1198:                tp->t_rxtshift++;
                   1199: }
CVSweb