Annotation of sys/netinet/tcp_input.c, Revision 1.1
1.1 ! nbrk 1: /* $OpenBSD: tcp_input.c,v 1.207 2007/06/15 18:23:06 markus Exp $ */
! 2: /* $NetBSD: tcp_input.c,v 1.23 1996/02/13 23:43:44 christos Exp $ */
! 3:
! 4: /*
! 5: * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994
! 6: * The Regents of the University of California. All rights reserved.
! 7: *
! 8: * Redistribution and use in source and binary forms, with or without
! 9: * modification, are permitted provided that the following conditions
! 10: * are met:
! 11: * 1. Redistributions of source code must retain the above copyright
! 12: * notice, this list of conditions and the following disclaimer.
! 13: * 2. Redistributions in binary form must reproduce the above copyright
! 14: * notice, this list of conditions and the following disclaimer in the
! 15: * documentation and/or other materials provided with the distribution.
! 16: * 3. Neither the name of the University nor the names of its contributors
! 17: * may be used to endorse or promote products derived from this software
! 18: * without specific prior written permission.
! 19: *
! 20: * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
! 21: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
! 22: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
! 23: * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
! 24: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
! 25: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
! 26: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
! 27: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
! 28: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
! 29: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
! 30: * SUCH DAMAGE.
! 31: *
! 32: * @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
! 33: *
! 34: * NRL grants permission for redistribution and use in source and binary
! 35: * forms, with or without modification, of the software and documentation
! 36: * created at NRL provided that the following conditions are met:
! 37: *
! 38: * 1. Redistributions of source code must retain the above copyright
! 39: * notice, this list of conditions and the following disclaimer.
! 40: * 2. Redistributions in binary form must reproduce the above copyright
! 41: * notice, this list of conditions and the following disclaimer in the
! 42: * documentation and/or other materials provided with the distribution.
! 43: * 3. All advertising materials mentioning features or use of this software
! 44: * must display the following acknowledgements:
! 45: * This product includes software developed by the University of
! 46: * California, Berkeley and its contributors.
! 47: * This product includes software developed at the Information
! 48: * Technology Division, US Naval Research Laboratory.
! 49: * 4. Neither the name of the NRL nor the names of its contributors
! 50: * may be used to endorse or promote products derived from this software
! 51: * without specific prior written permission.
! 52: *
! 53: * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
! 54: * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
! 55: * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
! 56: * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
! 57: * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
! 58: * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
! 59: * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
! 60: * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
! 61: * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
! 62: * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
! 63: * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
! 64: *
! 65: * The views and conclusions contained in the software and documentation
! 66: * are those of the authors and should not be interpreted as representing
! 67: * official policies, either expressed or implied, of the US Naval
! 68: * Research Laboratory (NRL).
! 69: */
! 70:
! 71: #include <sys/param.h>
! 72: #include <sys/systm.h>
! 73: #include <sys/mbuf.h>
! 74: #include <sys/protosw.h>
! 75: #include <sys/socket.h>
! 76: #include <sys/socketvar.h>
! 77: #include <sys/kernel.h>
! 78:
! 79: #include <dev/rndvar.h>
! 80:
! 81: #include <net/if.h>
! 82: #include <net/route.h>
! 83:
! 84: #include <netinet/in.h>
! 85: #include <netinet/in_systm.h>
! 86: #include <netinet/ip.h>
! 87: #include <netinet/in_pcb.h>
! 88: #include <netinet/ip_var.h>
! 89: #include <netinet/tcp.h>
! 90: #include <netinet/tcp_fsm.h>
! 91: #include <netinet/tcp_seq.h>
! 92: #include <netinet/tcp_timer.h>
! 93: #include <netinet/tcp_var.h>
! 94: #include <netinet/tcpip.h>
! 95: #include <netinet/tcp_debug.h>
! 96:
! 97: struct tcpiphdr tcp_saveti; /* IPv4 header snapshot: tcp_input bcopy()s ip and th into it when SO_DEBUG is set */
! 98:
! 99: int tcp_mss_adv(struct ifnet *, int);
! 100:
! 101: #ifdef INET6
! 102: #include <netinet6/in6_var.h>
! 103: #include <netinet6/nd6.h>
! 104:
! 105: struct tcpipv6hdr tcp_saveti6; /* IPv6 counterpart of tcp_saveti, filled the same way */
! 106:
! 107: /* for the packet header length in the mbuf; M_V6_LEN/M_V4_LEN subtract the fixed IPv6/IPv4 header size. NOTE(review): the subtraction involves sizeof (unsigned), so a too-short length would wrap instead of going negative -- confirm callers. */
! 108: #define M_PH_LEN(m) (((struct mbuf *)(m))->m_pkthdr.len)
! 109: #define M_V6_LEN(m) (M_PH_LEN(m) - sizeof(struct ip6_hdr))
! 110: #define M_V4_LEN(m) (M_PH_LEN(m) - sizeof(struct ip))
! 111: #endif /* INET6 */
! 112:
! 113: int tcprexmtthresh = 3; /* NOTE(review): presumably the duplicate-ACK threshold; not referenced in this part of the file */
! 114: int tcptv_keep_init = TCPTV_KEEP_INIT;
! 115:
! 116: extern u_long sb_max;
! 117:
! 118: int tcp_rst_ppslim = 100; /* 100pps */
! 119: int tcp_rst_ppslim_count = 0;
! 120: struct timeval tcp_rst_ppslim_last;
! 121:
! 122: int tcp_ackdrop_ppslim = 100; /* 100pps */
! 123: int tcp_ackdrop_ppslim_count = 0;
! 124: struct timeval tcp_ackdrop_ppslim_last;
! 125:
! 126: #define TCP_PAWS_IDLE (24 * 24 * 60 * 60 * PR_SLOWHZ) /* 24*24*60*60 = 2073600, the number of seconds in 24 days, scaled by PR_SLOWHZ; NOTE(review): presumably slow-timer ticks -- confirm */
! 127:
! 128: /* for modulo comparisons of timestamps: the difference is cast to int and its sign is tested */
! 129: #define TSTMP_LT(a,b) ((int)((a)-(b)) < 0)
! 130: #define TSTMP_GEQ(a,b) ((int)((a)-(b)) >= 0)
! 131:
! 132: /* for TCP SACK comparisons; each macro evaluates its arguments more than once, so pass side-effect-free expressions */
! 133: #define SEQ_MIN(a,b) (SEQ_LT(a,b) ? (a) : (b))
! 134: #define SEQ_MAX(a,b) (SEQ_GT(a,b) ? (a) : (b))
! 135:
! 136: /*
! 137: * Neighbor Discovery, Neighbor Unreachability Detection Upper layer hint.
* ND6_HINT(tp) calls nd6_nud_hint() on the cached IPv6 route of tp's inpcb,
* but only when tp and tp->t_inpcb are non-NULL, the inpcb has INP_IPV6 set
* and inp_route6.ro_rt is non-NULL. Without INET6 it expands to nothing.
* NOTE(review): tp is not parenthesized and is evaluated several times;
* pass a plain identifier.
! 138: */
! 139: #ifdef INET6
! 140: #define ND6_HINT(tp) \
! 141: do { \
! 142: if (tp && tp->t_inpcb && (tp->t_inpcb->inp_flags & INP_IPV6) && \
! 143: tp->t_inpcb->inp_route6.ro_rt) { \
! 144: nd6_nud_hint(tp->t_inpcb->inp_route6.ro_rt, NULL, 0); \
! 145: } \
! 146: } while (0)
! 147: #else
! 148: #define ND6_HINT(tp)
! 149: #endif
! 150:
! 151: #ifdef TCP_ECN
! 152: /*
! 153: * ECN (Explicit Congestion Notification) support based on RFC3168
! 154: * implementation note:
! 155: * snd_last is used to track a recovery phase.
! 156: * when cwnd is reduced, snd_last is set to snd_max.
! 157: * while snd_last > snd_una, the sender is in a recovery phase and
! 158: * its cwnd should not be reduced again.
! 159: * snd_last follows snd_una when not in a recovery phase.
! 160: */
! 161: #endif
! 162:
! 163: /*
! 164: * Macro to compute ACK transmission behavior. Delay the ACK unless
! 165: * we have already delayed an ACK (must send an ACK every two segments).
! 166: * We also ACK immediately if we received a PUSH and the ACK-on-PUSH
! 167: * option is enabled.
* Concretely: if TF_DELACK is already set in tp->t_flags, or tcp_ack_on_push
* is non-zero and tiflags has TH_PUSH, set TF_ACKNOW; otherwise invoke
* TCP_SET_DELACK(tp).
* NOTE(review): the condition uses (tp) but the two body statements use a
* bare tp, so the argument must be a plain identifier.
! 168: */
! 169: #define TCP_SETUP_ACK(tp, tiflags) \
! 170: do { \
! 171: if ((tp)->t_flags & TF_DELACK || \
! 172: (tcp_ack_on_push && (tiflags) & TH_PUSH)) \
! 173: tp->t_flags |= TF_ACKNOW; \
! 174: else \
! 175: TCP_SET_DELACK(tp); \
! 176: } while (0)
! 177:
! 178: /*
! 179: * Insert the segment described by th (data in mbuf chain m, length
! 180: * *tlen) into the reassembly queue tp->t_segq of control block tp,
! 181: * trimming any overlap with segments already queued, then hand all
! 182: * in-sequence data to the socket and advance rcv_nxt. Returns the
! 183: * TH_FIN bit of the last segment delivered, or 0 if nothing was
! 184: * delivered or the segment was dropped. th == 0 only attempts delivery.
! 185: * NOTE(review): older text here described a "macro form" fast path and a
! 186: * DELACK/immediate-ACK policy; neither is implemented in this function.
! 187: */
! 188:
! 189: int
! 190: tcp_reass(tp, th, m, tlen)
! 191: struct tcpcb *tp; /* connection whose t_segq and rcv_nxt are updated */
! 192: struct tcphdr *th; /* header of the new segment, or 0 to only deliver queued data */
! 193: struct mbuf *m; /* segment data; either stored in the queue entry or freed here */
! 194: int *tlen; /* in/out: data length, reduced when leading duplicate bytes are trimmed */
! 195: {
! 196: struct tcpqent *p, *q, *nq, *tiqe;
! 197: struct socket *so = tp->t_inpcb->inp_socket;
! 198: int flags;
! 199:
! 200: /*
! 201: * Called with th == 0 once the connection is established, to
! 202: * push data queued before ESTABLISHED up to the user socket.
! 203: */
! 204: if (th == 0)
! 205: goto present;
! 206:
! 207: /*
! 208: * Allocate a new queue entry before we throw away any data. If that
! 209: * fails, reuse the last queued entry or flush the queue and drop. XXX
! 210: */
! 211: tiqe = pool_get(&tcpqe_pool, PR_NOWAIT);
! 212: if (tiqe == NULL) {
! 213: tiqe = TAILQ_LAST(&tp->t_segq, tcpqehead);
! 214: if (tiqe != NULL && th->th_seq == tp->rcv_nxt) {
! 215: /* Reuse last entry since new segment fills a hole */
! 216: m_freem(tiqe->tcpqe_m);
! 217: TAILQ_REMOVE(&tp->t_segq, tiqe, tcpqe_q);
! 218: }
! 219: if (tiqe == NULL || th->th_seq != tp->rcv_nxt) { /* queue empty, or segment is not the next one expected */
! 220: /* Flush segment queue for this connection */
! 221: tcp_freeq(tp);
! 222: tcpstat.tcps_rcvmemdrop++;
! 223: m_freem(m);
! 224: return (0);
! 225: }
! 226: }
! 227:
! 228: /*
! 229: * Find the first queued segment q that begins after this one; p trails as its predecessor.
! 230: */
! 231: for (p = NULL, q = TAILQ_FIRST(&tp->t_segq); q != NULL;
! 232: p = q, q = TAILQ_NEXT(q, tcpqe_q))
! 233: if (SEQ_GT(q->tcpqe_tcp->th_seq, th->th_seq))
! 234: break;
! 235:
! 236: /*
! 237: * If there is a preceding segment, it may provide some of
! 238: * our data already. If so, drop the data from the incoming
! 239: * segment. If it provides all of our data, drop us.
! 240: */
! 241: if (p != NULL) {
! 242: struct tcphdr *phdr = p->tcpqe_tcp;
! 243: int i;
! 244:
! 245: /* conversion to int (in i) handles seq wraparound */
! 246: i = phdr->th_seq + phdr->th_reseqlen - th->th_seq;
! 247: if (i > 0) {
! 248: if (i >= *tlen) { /* p already covers the whole new segment */
! 249: tcpstat.tcps_rcvduppack++;
! 250: tcpstat.tcps_rcvdupbyte += *tlen;
! 251: m_freem(m);
! 252: pool_put(&tcpqe_pool, tiqe);
! 253: return (0);
! 254: }
! 255: m_adj(m, i); /* trim the i leading bytes that p already holds */
! 256: *tlen -= i;
! 257: th->th_seq += i;
! 258: }
! 259: }
! 260: tcpstat.tcps_rcvoopack++;
! 261: tcpstat.tcps_rcvoobyte += *tlen;
! 262:
! 263: /*
! 264: * While we overlap succeeding segments trim them or,
! 265: * if they are completely covered, dequeue them.
! 266: */
! 267: for (; q != NULL; q = nq) {
! 268: struct tcphdr *qhdr = q->tcpqe_tcp;
! 269: int i = (th->th_seq + *tlen) - qhdr->th_seq;
! 270:
! 271: if (i <= 0) /* new segment ends at or before q starts */
! 272: break;
! 273: if (i < qhdr->th_reseqlen) { /* partial overlap: trim the front of q and stop */
! 274: qhdr->th_seq += i;
! 275: qhdr->th_reseqlen -= i;
! 276: m_adj(q->tcpqe_m, i);
! 277: break;
! 278: }
! 279: nq = TAILQ_NEXT(q, tcpqe_q); /* q is fully covered: remember successor, then free q */
! 280: m_freem(q->tcpqe_m);
! 281: TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
! 282: pool_put(&tcpqe_pool, q);
! 283: }
! 284:
! 285: /* Insert the new segment queue entry into place. */
! 286: tiqe->tcpqe_m = m;
! 287: th->th_reseqlen = *tlen;
! 288: tiqe->tcpqe_tcp = th;
! 289: if (p == NULL) {
! 290: TAILQ_INSERT_HEAD(&tp->t_segq, tiqe, tcpqe_q);
! 291: } else {
! 292: TAILQ_INSERT_AFTER(&tp->t_segq, p, tiqe, tcpqe_q);
! 293: }
! 294:
! 295: present:
! 296: /*
! 297: * Present data to user, advancing rcv_nxt through
! 298: * completed sequence space.
! 299: */
! 300: if (TCPS_HAVEESTABLISHED(tp->t_state) == 0)
! 301: return (0);
! 302: q = TAILQ_FIRST(&tp->t_segq);
! 303: if (q == NULL || q->tcpqe_tcp->th_seq != tp->rcv_nxt) /* nothing queued, or head is not the next expected sequence */
! 304: return (0);
! 305: if (tp->t_state == TCPS_SYN_RECEIVED && q->tcpqe_tcp->th_reseqlen) /* hold non-empty segments while in SYN_RECEIVED */
! 306: return (0);
! 307: do {
! 308: tp->rcv_nxt += q->tcpqe_tcp->th_reseqlen;
! 309: flags = q->tcpqe_tcp->th_flags & TH_FIN; /* overwritten each pass: reflects the last segment delivered */
! 310:
! 311: nq = TAILQ_NEXT(q, tcpqe_q);
! 312: TAILQ_REMOVE(&tp->t_segq, q, tcpqe_q);
! 313: ND6_HINT(tp);
! 314: if (so->so_state & SS_CANTRCVMORE) /* flag set: free the data instead of appending it to so_rcv */
! 315: m_freem(q->tcpqe_m);
! 316: else
! 317: sbappendstream(&so->so_rcv, q->tcpqe_m);
! 318: pool_put(&tcpqe_pool, q);
! 319: q = nq;
! 320: } while (q != NULL && q->tcpqe_tcp->th_seq == tp->rcv_nxt);
! 321: sorwakeup(so);
! 322: return (flags);
! 323: }
! 324:
! 325: #ifdef INET6
/*
 * IPv6 entry point for TCP input. Filters the packet and then hands it
 * to tcp_input().
 *   mp    - pointer to the mbuf chain pointer; *mp is the packet.
 *   offp  - pointer to the offset forwarded as tcp_input()'s first
 *           variadic argument (tcp_input reads it as iphlen).
 *   proto - forwarded to tcp_input() after *offp.
 * Always returns IPPROTO_DONE. Packets received on an IFT_FAITH
 * interface (only when NFAITH > 0) are freed. Packets flagged
 * M_ANYCAST6 are answered with an ICMPv6 destination-unreachable
 * (address) error when the first mbuf holds a full ip6_hdr, and are
 * freed otherwise.
 */
! 326: int
! 327: tcp6_input(mp, offp, proto)
! 328: struct mbuf **mp;
! 329: int *offp, proto;
! 330: {
! 331: struct mbuf *m = *mp;
! 332:
! 333: #if defined(NFAITH) && 0 < NFAITH
! 334: if (m->m_pkthdr.rcvif) {
! 335: if (m->m_pkthdr.rcvif->if_type == IFT_FAITH) {
! 336: /* XXX send icmp6 host/port unreach? */
! 337: m_freem(m);
! 338: return IPPROTO_DONE;
! 339: }
! 340: }
! 341: #endif
! 342:
! 343: /*
! 344: * draft-itojun-ipv6-tcp-to-anycast
! 345: * better place to put this in?
! 346: */
! 347: if (m->m_flags & M_ANYCAST6) {
! 348: if (m->m_len >= sizeof(struct ip6_hdr)) { /* first mbuf must hold the whole IPv6 header before it is dereferenced */
! 349: struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
! 350: icmp6_error(m, ICMP6_DST_UNREACH,
! 351: ICMP6_DST_UNREACH_ADDR,
! 352: (caddr_t)&ip6->ip6_dst - (caddr_t)ip6); /* last argument: byte offset of ip6_dst within the header; m is not freed here -- presumably icmp6_error() takes ownership, verify */
! 353: } else
! 354: m_freem(m);
! 355: return IPPROTO_DONE;
! 356: }
! 357:
! 358: tcp_input(m, *offp, proto);
! 359: return IPPROTO_DONE;
! 360: }
! 361: #endif
! 362:
! 363: /*
! 364: * TCP input routine, follows pages 65-76 of the
! 365: * protocol specification dated September, 1981 very closely.
! 366: */
! 367: void
! 368: tcp_input(struct mbuf *m, ...)
! 369: {
! 370: struct ip *ip;
! 371: struct inpcb *inp;
! 372: u_int8_t *optp = NULL;
! 373: int optlen = 0;
! 374: int tlen, off;
! 375: struct tcpcb *tp = 0;
! 376: int tiflags;
! 377: struct socket *so = NULL;
! 378: int todrop, acked, ourfinisacked, needoutput = 0;
! 379: int hdroptlen = 0;
! 380: short ostate = 0;
! 381: tcp_seq iss, *reuse = NULL;
! 382: u_long tiwin;
! 383: struct tcp_opt_info opti;
! 384: int iphlen;
! 385: va_list ap;
! 386: struct tcphdr *th;
! 387: #ifdef INET6
! 388: struct ip6_hdr *ip6 = NULL;
! 389: #endif /* INET6 */
! 390: #ifdef IPSEC
! 391: struct m_tag *mtag;
! 392: struct tdb_ident *tdbi;
! 393: struct tdb *tdb;
! 394: int error, s;
! 395: #endif /* IPSEC */
! 396: int af;
! 397: #ifdef TCP_ECN
! 398: u_char iptos;
! 399: #endif
! 400:
! 401: va_start(ap, m);
! 402: iphlen = va_arg(ap, int);
! 403: va_end(ap);
! 404:
! 405: tcpstat.tcps_rcvtotal++;
! 406:
! 407: opti.ts_present = 0;
! 408: opti.maxseg = 0;
! 409:
! 410: /*
! 411: * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
! 412: * See below for AF specific multicast.
! 413: */
! 414: if (m->m_flags & (M_BCAST|M_MCAST))
! 415: goto drop;
! 416:
! 417: /*
! 418: * Before we do ANYTHING, we have to figure out if it's TCP/IPv6 or
! 419: * TCP/IPv4.
! 420: */
! 421: switch (mtod(m, struct ip *)->ip_v) {
! 422: #ifdef INET6
! 423: case 6:
! 424: af = AF_INET6;
! 425: break;
! 426: #endif
! 427: case 4:
! 428: af = AF_INET;
! 429: break;
! 430: default:
! 431: m_freem(m);
! 432: return; /*EAFNOSUPPORT*/
! 433: }
! 434:
! 435: /*
! 436: * Get IP and TCP header together in first mbuf.
! 437: * Note: IP leaves IP header in first mbuf.
! 438: */
! 439: switch (af) {
! 440: case AF_INET:
! 441: #ifdef DIAGNOSTIC
! 442: if (iphlen < sizeof(struct ip)) {
! 443: m_freem(m);
! 444: return;
! 445: }
! 446: #endif /* DIAGNOSTIC */
! 447: break;
! 448: #ifdef INET6
! 449: case AF_INET6:
! 450: #ifdef DIAGNOSTIC
! 451: if (iphlen < sizeof(struct ip6_hdr)) {
! 452: m_freem(m);
! 453: return;
! 454: }
! 455: #endif /* DIAGNOSTIC */
! 456: break;
! 457: #endif
! 458: default:
! 459: m_freem(m);
! 460: return;
! 461: }
! 462:
! 463: IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, sizeof(*th));
! 464: if (!th) {
! 465: tcpstat.tcps_rcvshort++;
! 466: return;
! 467: }
! 468:
! 469: tlen = m->m_pkthdr.len - iphlen;
! 470: ip = NULL;
! 471: #ifdef INET6
! 472: ip6 = NULL;
! 473: #endif
! 474: switch (af) {
! 475: case AF_INET:
! 476: ip = mtod(m, struct ip *);
! 477: if (IN_MULTICAST(ip->ip_dst.s_addr) ||
! 478: in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif))
! 479: goto drop;
! 480: #ifdef TCP_ECN
! 481: /* save ip_tos before clearing it for checksum */
! 482: iptos = ip->ip_tos;
! 483: #endif
! 484: /*
! 485: * Checksum extended TCP header and data.
! 486: */
! 487: if ((m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_OK) == 0) {
! 488: if (m->m_pkthdr.csum_flags & M_TCP_CSUM_IN_BAD) {
! 489: tcpstat.tcps_inhwcsum++;
! 490: tcpstat.tcps_rcvbadsum++;
! 491: goto drop;
! 492: }
! 493: if (in4_cksum(m, IPPROTO_TCP, iphlen, tlen) != 0) {
! 494: tcpstat.tcps_rcvbadsum++;
! 495: goto drop;
! 496: }
! 497: } else {
! 498: m->m_pkthdr.csum_flags &= ~M_TCP_CSUM_IN_OK;
! 499: tcpstat.tcps_inhwcsum++;
! 500: }
! 501: break;
! 502: #ifdef INET6
! 503: case AF_INET6:
! 504: ip6 = mtod(m, struct ip6_hdr *);
! 505: #ifdef TCP_ECN
! 506: iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
! 507: #endif
! 508:
! 509: /* Be proactive about malicious use of IPv4 mapped address */
! 510: if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
! 511: IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
! 512: /* XXX stat */
! 513: goto drop;
! 514: }
! 515:
! 516: /*
! 517: * Be proactive about unspecified IPv6 address in source.
! 518: * As we use all-zero to indicate unbounded/unconnected pcb,
! 519: * unspecified IPv6 address can be used to confuse us.
! 520: *
! 521: * Note that packets with unspecified IPv6 destination is
! 522: * already dropped in ip6_input.
! 523: */
! 524: if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
! 525: /* XXX stat */
! 526: goto drop;
! 527: }
! 528:
! 529: /* Discard packets to multicast */
! 530: if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) {
! 531: /* XXX stat */
! 532: goto drop;
! 533: }
! 534:
! 535: /*
! 536: * Checksum extended TCP header and data.
! 537: */
! 538: if (in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr), tlen)) {
! 539: tcpstat.tcps_rcvbadsum++;
! 540: goto drop;
! 541: }
! 542: break;
! 543: #endif
! 544: }
! 545:
! 546: /*
! 547: * Check that TCP offset makes sense,
! 548: * pull out TCP options and adjust length. XXX
! 549: */
! 550: off = th->th_off << 2;
! 551: if (off < sizeof(struct tcphdr) || off > tlen) {
! 552: tcpstat.tcps_rcvbadoff++;
! 553: goto drop;
! 554: }
! 555: tlen -= off;
! 556: if (off > sizeof(struct tcphdr)) {
! 557: IP6_EXTHDR_GET(th, struct tcphdr *, m, iphlen, off);
! 558: if (!th) {
! 559: tcpstat.tcps_rcvshort++;
! 560: return;
! 561: }
! 562: optlen = off - sizeof(struct tcphdr);
! 563: optp = (u_int8_t *)(th + 1);
! 564: /*
! 565: * Do quick retrieval of timestamp options ("options
! 566: * prediction?"). If timestamp is the only option and it's
! 567: * formatted as recommended in RFC 1323 appendix A, we
! 568: * quickly get the values now and not bother calling
! 569: * tcp_dooptions(), etc.
! 570: */
! 571: if ((optlen == TCPOLEN_TSTAMP_APPA ||
! 572: (optlen > TCPOLEN_TSTAMP_APPA &&
! 573: optp[TCPOLEN_TSTAMP_APPA] == TCPOPT_EOL)) &&
! 574: *(u_int32_t *)optp == htonl(TCPOPT_TSTAMP_HDR) &&
! 575: (th->th_flags & TH_SYN) == 0) {
! 576: opti.ts_present = 1;
! 577: opti.ts_val = ntohl(*(u_int32_t *)(optp + 4));
! 578: opti.ts_ecr = ntohl(*(u_int32_t *)(optp + 8));
! 579: optp = NULL; /* we've parsed the options */
! 580: }
! 581: }
! 582: tiflags = th->th_flags;
! 583:
! 584: /*
! 585: * Convert TCP protocol specific fields to host format.
! 586: */
! 587: NTOHL(th->th_seq);
! 588: NTOHL(th->th_ack);
! 589: NTOHS(th->th_win);
! 590: NTOHS(th->th_urp);
! 591:
! 592: /*
! 593: * Locate pcb for segment.
! 594: */
! 595: findpcb:
! 596: switch (af) {
! 597: #ifdef INET6
! 598: case AF_INET6:
! 599: inp = in6_pcbhashlookup(&tcbtable, &ip6->ip6_src, th->th_sport,
! 600: &ip6->ip6_dst, th->th_dport);
! 601: break;
! 602: #endif
! 603: case AF_INET:
! 604: inp = in_pcbhashlookup(&tcbtable, ip->ip_src, th->th_sport,
! 605: ip->ip_dst, th->th_dport);
! 606: break;
! 607: }
! 608: if (inp == 0) {
! 609: int inpl_flags = 0;
! 610: if (m->m_pkthdr.pf.flags & PF_TAG_TRANSLATE_LOCALHOST)
! 611: inpl_flags = INPLOOKUP_WILDCARD;
! 612: ++tcpstat.tcps_pcbhashmiss;
! 613: switch (af) {
! 614: #ifdef INET6
! 615: case AF_INET6:
! 616: inp = in6_pcblookup_listen(&tcbtable,
! 617: &ip6->ip6_dst, th->th_dport, inpl_flags);
! 618: break;
! 619: #endif /* INET6 */
! 620: case AF_INET:
! 621: inp = in_pcblookup_listen(&tcbtable,
! 622: ip->ip_dst, th->th_dport, inpl_flags);
! 623: break;
! 624: }
! 625: /*
! 626: * If the state is CLOSED (i.e., TCB does not exist) then
! 627: * all data in the incoming segment is discarded.
! 628: * If the TCB exists but is in CLOSED state, it is embryonic,
! 629: * but should either do a listen or a connect soon.
! 630: */
! 631: if (inp == 0) {
! 632: ++tcpstat.tcps_noport;
! 633: goto dropwithreset_ratelim;
! 634: }
! 635: }
! 636:
! 637: /* Check the minimum TTL for socket. */
! 638: if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl)
! 639: goto drop;
! 640:
! 641: tp = intotcpcb(inp);
! 642: if (tp == 0)
! 643: goto dropwithreset_ratelim;
! 644: if (tp->t_state == TCPS_CLOSED)
! 645: goto drop;
! 646:
! 647: /* Unscale the window into a 32-bit value. */
! 648: if ((tiflags & TH_SYN) == 0)
! 649: tiwin = th->th_win << tp->snd_scale;
! 650: else
! 651: tiwin = th->th_win;
! 652:
! 653: so = inp->inp_socket;
! 654: if (so->so_options & (SO_DEBUG|SO_ACCEPTCONN)) {
! 655: union syn_cache_sa src;
! 656: union syn_cache_sa dst;
! 657:
! 658: bzero(&src, sizeof(src));
! 659: bzero(&dst, sizeof(dst));
! 660: switch (af) {
! 661: #ifdef INET
! 662: case AF_INET:
! 663: src.sin.sin_len = sizeof(struct sockaddr_in);
! 664: src.sin.sin_family = AF_INET;
! 665: src.sin.sin_addr = ip->ip_src;
! 666: src.sin.sin_port = th->th_sport;
! 667:
! 668: dst.sin.sin_len = sizeof(struct sockaddr_in);
! 669: dst.sin.sin_family = AF_INET;
! 670: dst.sin.sin_addr = ip->ip_dst;
! 671: dst.sin.sin_port = th->th_dport;
! 672: break;
! 673: #endif
! 674: #ifdef INET6
! 675: case AF_INET6:
! 676: src.sin6.sin6_len = sizeof(struct sockaddr_in6);
! 677: src.sin6.sin6_family = AF_INET6;
! 678: src.sin6.sin6_addr = ip6->ip6_src;
! 679: src.sin6.sin6_port = th->th_sport;
! 680:
! 681: dst.sin6.sin6_len = sizeof(struct sockaddr_in6);
! 682: dst.sin6.sin6_family = AF_INET6;
! 683: dst.sin6.sin6_addr = ip6->ip6_dst;
! 684: dst.sin6.sin6_port = th->th_dport;
! 685: break;
! 686: #endif /* INET6 */
! 687: default:
! 688: goto badsyn; /*sanity*/
! 689: }
! 690:
! 691: if (so->so_options & SO_DEBUG) {
! 692: ostate = tp->t_state;
! 693: switch (af) {
! 694: #ifdef INET6
! 695: case AF_INET6:
! 696: bcopy(ip6, &tcp_saveti6.ti6_i, sizeof(*ip6));
! 697: bcopy(th, &tcp_saveti6.ti6_t, sizeof(*th));
! 698: break;
! 699: #endif
! 700: case AF_INET:
! 701: bcopy(ip, &tcp_saveti.ti_i, sizeof(*ip));
! 702: bcopy(th, &tcp_saveti.ti_t, sizeof(*th));
! 703: break;
! 704: }
! 705: }
! 706: if (so->so_options & SO_ACCEPTCONN) {
! 707: if ((tiflags & (TH_RST|TH_ACK|TH_SYN)) != TH_SYN) {
! 708: if (tiflags & TH_RST) {
! 709: syn_cache_reset(&src.sa, &dst.sa, th);
! 710: } else if ((tiflags & (TH_ACK|TH_SYN)) ==
! 711: (TH_ACK|TH_SYN)) {
! 712: /*
! 713: * Received a SYN,ACK. This should
! 714: * never happen while we are in
! 715: * LISTEN. Send an RST.
! 716: */
! 717: goto badsyn;
! 718: } else if (tiflags & TH_ACK) {
! 719: so = syn_cache_get(&src.sa, &dst.sa,
! 720: th, iphlen, tlen, so, m);
! 721: if (so == NULL) {
! 722: /*
! 723: * We don't have a SYN for
! 724: * this ACK; send an RST.
! 725: */
! 726: goto badsyn;
! 727: } else if (so ==
! 728: (struct socket *)(-1)) {
! 729: /*
! 730: * We were unable to create
! 731: * the connection. If the
! 732: * 3-way handshake was
! 733: * completed, and RST has
! 734: * been sent to the peer.
! 735: * Since the mbuf might be
! 736: * in use for the reply,
! 737: * do not free it.
! 738: */
! 739: m = NULL;
! 740: } else {
! 741: /*
! 742: * We have created a
! 743: * full-blown connection.
! 744: */
! 745: tp = NULL;
! 746: inp = (struct inpcb *)so->so_pcb;
! 747: tp = intotcpcb(inp);
! 748: if (tp == NULL)
! 749: goto badsyn; /*XXX*/
! 750:
! 751: /*
! 752: * Compute proper scaling
! 753: * value from buffer space
! 754: */
! 755: tcp_rscale(tp, so->so_rcv.sb_hiwat);
! 756: goto after_listen;
! 757: }
! 758: } else {
! 759: /*
! 760: * None of RST, SYN or ACK was set.
! 761: * This is an invalid packet for a
! 762: * TCB in LISTEN state. Send a RST.
! 763: */
! 764: goto badsyn;
! 765: }
! 766: } else {
! 767: /*
! 768: * Received a SYN.
! 769: */
! 770: #ifdef INET6
! 771: /*
! 772: * If deprecated address is forbidden, we do
! 773: * not accept SYN to deprecated interface
! 774: * address to prevent any new inbound
! 775: * connection from getting established.
! 776: * When we do not accept SYN, we send a TCP
! 777: * RST, with deprecated source address (instead
! 778: * of dropping it). We compromise it as it is
! 779: * much better for peer to send a RST, and
! 780: * RST will be the final packet for the
! 781: * exchange.
! 782: *
! 783: * If we do not forbid deprecated addresses, we
! 784: * accept the SYN packet. RFC2462 does not
! 785: * suggest dropping SYN in this case.
! 786: * If we decipher RFC2462 5.5.4, it says like
! 787: * this:
! 788: * 1. use of deprecated addr with existing
! 789: * communication is okay - "SHOULD continue
! 790: * to be used"
! 791: * 2. use of it with new communication:
! 792: * (2a) "SHOULD NOT be used if alternate
! 793: * address with sufficient scope is
! 794: * available"
! 795: * (2b) nothing mentioned otherwise.
! 796: * Here we fall into (2b) case as we have no
! 797: * choice in our source address selection - we
! 798: * must obey the peer.
! 799: *
! 800: * The wording in RFC2462 is confusing, and
! 801: * there are multiple description text for
! 802: * deprecated address handling - worse, they
! 803: * are not exactly the same. I believe 5.5.4
! 804: * is the best one, so we follow 5.5.4.
! 805: */
! 806: if (ip6 && !ip6_use_deprecated) {
! 807: struct in6_ifaddr *ia6;
! 808:
! 809: if ((ia6 = in6ifa_ifpwithaddr(m->m_pkthdr.rcvif,
! 810: &ip6->ip6_dst)) &&
! 811: (ia6->ia6_flags & IN6_IFF_DEPRECATED)) {
! 812: tp = NULL;
! 813: goto dropwithreset;
! 814: }
! 815: }
! 816: #endif
! 817:
! 818: /*
! 819: * LISTEN socket received a SYN
! 820: * from itself? This can't possibly
! 821: * be valid; drop the packet.
! 822: */
! 823: if (th->th_dport == th->th_sport) {
! 824: switch (af) {
! 825: #ifdef INET6
! 826: case AF_INET6:
! 827: if (IN6_ARE_ADDR_EQUAL(&ip6->ip6_src,
! 828: &ip6->ip6_dst)) {
! 829: tcpstat.tcps_badsyn++;
! 830: goto drop;
! 831: }
! 832: break;
! 833: #endif /* INET6 */
! 834: case AF_INET:
! 835: if (ip->ip_dst.s_addr == ip->ip_src.s_addr) {
! 836: tcpstat.tcps_badsyn++;
! 837: goto drop;
! 838: }
! 839: break;
! 840: }
! 841: }
! 842:
! 843: /*
! 844: * SYN looks ok; create compressed TCP
! 845: * state for it.
! 846: */
! 847: if (so->so_qlen <= so->so_qlimit &&
! 848: syn_cache_add(&src.sa, &dst.sa, th, iphlen,
! 849: so, m, optp, optlen, &opti, reuse))
! 850: m = NULL;
! 851: }
! 852: goto drop;
! 853: }
! 854: }
! 855:
! 856: after_listen:
! 857: #ifdef DIAGNOSTIC
! 858: /*
! 859: * Should not happen now that all embryonic connections
! 860: * are handled with compressed state.
! 861: */
! 862: if (tp->t_state == TCPS_LISTEN)
! 863: panic("tcp_input: TCPS_LISTEN");
! 864: #endif
! 865:
! 866: #ifdef IPSEC
! 867: /* Find most recent IPsec tag */
! 868: mtag = m_tag_find(m, PACKET_TAG_IPSEC_IN_DONE, NULL);
! 869: s = splnet();
! 870: if (mtag != NULL) {
! 871: tdbi = (struct tdb_ident *)(mtag + 1);
! 872: tdb = gettdb(tdbi->spi, &tdbi->dst, tdbi->proto);
! 873: } else
! 874: tdb = NULL;
! 875: ipsp_spd_lookup(m, af, iphlen, &error, IPSP_DIRECTION_IN,
! 876: tdb, inp);
! 877: if (error) {
! 878: splx(s);
! 879: goto drop;
! 880: }
! 881:
! 882: /* Latch SA */
! 883: if (inp->inp_tdb_in != tdb) {
! 884: if (tdb) {
! 885: tdb_add_inp(tdb, inp, 1);
! 886: if (inp->inp_ipo == NULL) {
! 887: inp->inp_ipo = ipsec_add_policy(inp, af,
! 888: IPSP_DIRECTION_OUT);
! 889: if (inp->inp_ipo == NULL) {
! 890: splx(s);
! 891: goto drop;
! 892: }
! 893: }
! 894: if (inp->inp_ipo->ipo_dstid == NULL &&
! 895: tdb->tdb_srcid != NULL) {
! 896: inp->inp_ipo->ipo_dstid = tdb->tdb_srcid;
! 897: tdb->tdb_srcid->ref_count++;
! 898: }
! 899: if (inp->inp_ipsec_remotecred == NULL &&
! 900: tdb->tdb_remote_cred != NULL) {
! 901: inp->inp_ipsec_remotecred =
! 902: tdb->tdb_remote_cred;
! 903: tdb->tdb_remote_cred->ref_count++;
! 904: }
! 905: if (inp->inp_ipsec_remoteauth == NULL &&
! 906: tdb->tdb_remote_auth != NULL) {
! 907: inp->inp_ipsec_remoteauth =
! 908: tdb->tdb_remote_auth;
! 909: tdb->tdb_remote_auth->ref_count++;
! 910: }
! 911: } else { /* Just reset */
! 912: TAILQ_REMOVE(&inp->inp_tdb_in->tdb_inp_in, inp,
! 913: inp_tdb_in_next);
! 914: inp->inp_tdb_in = NULL;
! 915: }
! 916: }
! 917: splx(s);
! 918: #endif /* IPSEC */
! 919:
! 920: /*
! 921: * Segment received on connection.
! 922: * Reset idle time and keep-alive timer.
! 923: */
! 924: tp->t_rcvtime = tcp_now;
! 925: if (TCPS_HAVEESTABLISHED(tp->t_state))
! 926: TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
! 927:
! 928: #ifdef TCP_SACK
! 929: if (tp->sack_enable)
! 930: tcp_del_sackholes(tp, th); /* Delete stale SACK holes */
! 931: #endif /* TCP_SACK */
! 932:
! 933: /*
! 934: * Process options.
! 935: */
! 936: #ifdef TCP_SIGNATURE
! 937: if (optp || (tp->t_flags & TF_SIGNATURE))
! 938: #else
! 939: if (optp)
! 940: #endif
! 941: if (tcp_dooptions(tp, optp, optlen, th, m, iphlen, &opti))
! 942: goto drop;
! 943:
! 944: if (opti.ts_present && opti.ts_ecr) {
! 945: int rtt_test;
! 946:
! 947: /* subtract out the tcp timestamp modulator */
! 948: opti.ts_ecr -= tp->ts_modulate;
! 949:
! 950: /* make sure ts_ecr is sensible */
! 951: rtt_test = tcp_now - opti.ts_ecr;
! 952: if (rtt_test < 0 || rtt_test > TCP_RTT_MAX)
! 953: opti.ts_ecr = 0;
! 954: }
! 955:
! 956: #ifdef TCP_ECN
! 957: /* if congestion experienced, set ECE bit in subsequent packets. */
! 958: if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) {
! 959: tp->t_flags |= TF_RCVD_CE;
! 960: tcpstat.tcps_ecn_rcvce++;
! 961: }
! 962: #endif
! 963: /*
! 964: * Header prediction: check for the two common cases
! 965: * of a uni-directional data xfer. If the packet has
! 966: * no control flags, is in-sequence, the window didn't
! 967: * change and we're not retransmitting, it's a
! 968: * candidate. If the length is zero and the ack moved
! 969: * forward, we're the sender side of the xfer. Just
! 970: * free the data acked & wake any higher level process
! 971: * that was blocked waiting for space. If the length
! 972: * is non-zero and the ack didn't move, we're the
! 973: * receiver side. If we're getting packets in-order
! 974: * (the reassembly queue is empty), add the data to
! 975: * the socket buffer and note that we need a delayed ack.
! 976: */
! 977: if (tp->t_state == TCPS_ESTABLISHED &&
! 978: #ifdef TCP_ECN
! 979: (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ECE|TH_CWR|TH_ACK)) == TH_ACK &&
! 980: #else
! 981: (tiflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK &&
! 982: #endif
! 983: (!opti.ts_present || TSTMP_GEQ(opti.ts_val, tp->ts_recent)) &&
! 984: th->th_seq == tp->rcv_nxt &&
! 985: tiwin && tiwin == tp->snd_wnd &&
! 986: tp->snd_nxt == tp->snd_max) {
! 987:
! 988: /*
! 989: * If last ACK falls within this segment's sequence numbers,
! 990: * record the timestamp.
! 991: * Fix from Braden, see Stevens p. 870
! 992: */
! 993: if (opti.ts_present && SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
! 994: tp->ts_recent_age = tcp_now;
! 995: tp->ts_recent = opti.ts_val;
! 996: }
! 997:
! 998: if (tlen == 0) {
! 999: if (SEQ_GT(th->th_ack, tp->snd_una) &&
! 1000: SEQ_LEQ(th->th_ack, tp->snd_max) &&
! 1001: tp->snd_cwnd >= tp->snd_wnd &&
! 1002: tp->t_dupacks == 0) {
! 1003: /*
! 1004: * this is a pure ack for outstanding data.
! 1005: */
! 1006: ++tcpstat.tcps_predack;
! 1007: if (opti.ts_present && opti.ts_ecr)
! 1008: tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
! 1009: else if (tp->t_rtttime &&
! 1010: SEQ_GT(th->th_ack, tp->t_rtseq))
! 1011: tcp_xmit_timer(tp,
! 1012: tcp_now - tp->t_rtttime);
! 1013: acked = th->th_ack - tp->snd_una;
! 1014: tcpstat.tcps_rcvackpack++;
! 1015: tcpstat.tcps_rcvackbyte += acked;
! 1016: ND6_HINT(tp);
! 1017: sbdrop(&so->so_snd, acked);
! 1018:
! 1019: /*
! 1020: * If we had a pending ICMP message that
! 1021: * refers to data that have just been
! 1022: * acknowledged, disregard the recorded ICMP
! 1023: * message.
! 1024: */
! 1025: if ((tp->t_flags & TF_PMTUD_PEND) &&
! 1026: SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
! 1027: tp->t_flags &= ~TF_PMTUD_PEND;
! 1028:
! 1029: /*
! 1030: * Keep track of the largest chunk of data
! 1031: * acknowledged since last PMTU update
! 1032: */
! 1033: if (tp->t_pmtud_mss_acked < acked)
! 1034: tp->t_pmtud_mss_acked = acked;
! 1035:
! 1036: tp->snd_una = th->th_ack;
! 1037: #if defined(TCP_SACK) || defined(TCP_ECN)
! 1038: /*
! 1039: * We want snd_last to track snd_una so
! 1040: * as to avoid sequence wraparound problems
! 1041: * for very large transfers.
! 1042: */
! 1043: #ifdef TCP_ECN
! 1044: if (SEQ_GT(tp->snd_una, tp->snd_last))
! 1045: #endif
! 1046: tp->snd_last = tp->snd_una;
! 1047: #endif /* TCP_SACK */
! 1048: #if defined(TCP_SACK) && defined(TCP_FACK)
! 1049: tp->snd_fack = tp->snd_una;
! 1050: tp->retran_data = 0;
! 1051: #endif /* TCP_FACK */
! 1052: m_freem(m);
! 1053:
! 1054: /*
! 1055: * If all outstanding data are acked, stop
! 1056: * retransmit timer, otherwise restart timer
! 1057: * using current (possibly backed-off) value.
! 1058: * If process is waiting for space,
! 1059: * wakeup/selwakeup/signal. If data
! 1060: * are ready to send, let tcp_output
! 1061: * decide between more output or persist.
! 1062: */
! 1063: if (tp->snd_una == tp->snd_max)
! 1064: TCP_TIMER_DISARM(tp, TCPT_REXMT);
! 1065: else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
! 1066: TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
! 1067:
! 1068: if (sb_notify(&so->so_snd))
! 1069: sowwakeup(so);
! 1070: if (so->so_snd.sb_cc)
! 1071: (void) tcp_output(tp);
! 1072: return;
! 1073: }
! 1074: } else if (th->th_ack == tp->snd_una &&
! 1075: TAILQ_EMPTY(&tp->t_segq) &&
! 1076: tlen <= sbspace(&so->so_rcv)) {
! 1077: /*
! 1078: * This is a pure, in-sequence data packet
! 1079: * with nothing on the reassembly queue and
! 1080: * we have enough buffer space to take it.
! 1081: */
! 1082: #ifdef TCP_SACK
! 1083: /* Clean receiver SACK report if present */
! 1084: if (tp->sack_enable && tp->rcv_numsacks)
! 1085: tcp_clean_sackreport(tp);
! 1086: #endif /* TCP_SACK */
! 1087: ++tcpstat.tcps_preddat;
! 1088: tp->rcv_nxt += tlen;
! 1089: tcpstat.tcps_rcvpack++;
! 1090: tcpstat.tcps_rcvbyte += tlen;
! 1091: ND6_HINT(tp);
! 1092: /*
! 1093: * Drop TCP, IP headers and TCP options then add data
! 1094: * to socket buffer.
! 1095: */
! 1096: if (so->so_state & SS_CANTRCVMORE)
! 1097: m_freem(m);
! 1098: else {
! 1099: m_adj(m, iphlen + off);
! 1100: sbappendstream(&so->so_rcv, m);
! 1101: }
! 1102: sorwakeup(so);
! 1103: TCP_SETUP_ACK(tp, tiflags);
! 1104: if (tp->t_flags & TF_ACKNOW)
! 1105: (void) tcp_output(tp);
! 1106: return;
! 1107: }
! 1108: }
! 1109:
! 1110: /*
! 1111: * Compute mbuf offset to TCP data segment.
! 1112: */
! 1113: hdroptlen = iphlen + off;
! 1114:
! 1115: /*
! 1116: * Calculate amount of space in receive window,
! 1117: * and then do TCP input processing.
! 1118: * Receive window is amount of space in rcv queue,
! 1119: * but not less than advertised window.
! 1120: */
! 1121: { int win;
! 1122:
! 1123: win = sbspace(&so->so_rcv);
! 1124: if (win < 0)
! 1125: win = 0;
! 1126: tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
! 1127: }
! 1128:
! 1129: switch (tp->t_state) {
! 1130:
! 1131: /*
! 1132: * If the state is SYN_RECEIVED:
! 1133: * if seg contains SYN/ACK, send an RST.
! 1134: * if seg contains an ACK, but not for our SYN/ACK, send an RST
! 1135: */
! 1136:
! 1137: case TCPS_SYN_RECEIVED:
! 1138: if (tiflags & TH_ACK) {
! 1139: if (tiflags & TH_SYN) {
! 1140: tcpstat.tcps_badsyn++;
! 1141: goto dropwithreset;
! 1142: }
! 1143: if (SEQ_LEQ(th->th_ack, tp->snd_una) ||
! 1144: SEQ_GT(th->th_ack, tp->snd_max))
! 1145: goto dropwithreset;
! 1146: }
! 1147: break;
! 1148:
! 1149: /*
! 1150: * If the state is SYN_SENT:
! 1151: * if seg contains an ACK, but not for our SYN, drop the input.
! 1152: * if seg contains a RST, then drop the connection.
! 1153: * if seg does not contain SYN, then drop it.
! 1154: * Otherwise this is an acceptable SYN segment
! 1155: * initialize tp->rcv_nxt and tp->irs
! 1156: * if seg contains ack then advance tp->snd_una
! 1157: * if SYN has been acked change to ESTABLISHED else SYN_RCVD state
! 1158: * arrange for segment to be acked (eventually)
! 1159: * continue processing rest of data/controls, beginning with URG
! 1160: */
! 1161: case TCPS_SYN_SENT:
! 1162: if ((tiflags & TH_ACK) &&
! 1163: (SEQ_LEQ(th->th_ack, tp->iss) ||
! 1164: SEQ_GT(th->th_ack, tp->snd_max)))
! 1165: goto dropwithreset;
! 1166: if (tiflags & TH_RST) {
! 1167: #ifdef TCP_ECN
! 1168: /* if ECN is enabled, fall back to non-ecn at rexmit */
! 1169: if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
! 1170: goto drop;
! 1171: #endif
! 1172: if (tiflags & TH_ACK)
! 1173: tp = tcp_drop(tp, ECONNREFUSED);
! 1174: goto drop;
! 1175: }
! 1176: if ((tiflags & TH_SYN) == 0)
! 1177: goto drop;
! 1178: if (tiflags & TH_ACK) {
! 1179: tp->snd_una = th->th_ack;
! 1180: if (SEQ_LT(tp->snd_nxt, tp->snd_una))
! 1181: tp->snd_nxt = tp->snd_una;
! 1182: }
! 1183: TCP_TIMER_DISARM(tp, TCPT_REXMT);
! 1184: tp->irs = th->th_seq;
! 1185: tcp_mss(tp, opti.maxseg);
! 1186: /* Reset initial window to 1 segment for retransmit */
! 1187: if (tp->t_rxtshift > 0)
! 1188: tp->snd_cwnd = tp->t_maxseg;
! 1189: tcp_rcvseqinit(tp);
! 1190: tp->t_flags |= TF_ACKNOW;
! 1191: #ifdef TCP_SACK
! 1192: /*
! 1193: * If we've sent a SACK_PERMITTED option, and the peer
! 1194: * also replied with one, then TF_SACK_PERMIT should have
! 1195: * been set in tcp_dooptions(). If it was not, disable SACKs.
! 1196: */
! 1197: if (tp->sack_enable)
! 1198: tp->sack_enable = tp->t_flags & TF_SACK_PERMIT;
! 1199: #endif
! 1200: #ifdef TCP_ECN
! 1201: /*
! 1202: * if ECE is set but CWR is not set for SYN-ACK, or
! 1203: * both ECE and CWR are set for simultaneous open,
! 1204: * peer is ECN capable.
! 1205: */
! 1206: if (tcp_do_ecn) {
! 1207: if ((tiflags & (TH_ACK|TH_ECE|TH_CWR))
! 1208: == (TH_ACK|TH_ECE) ||
! 1209: (tiflags & (TH_ACK|TH_ECE|TH_CWR))
! 1210: == (TH_ECE|TH_CWR)) {
! 1211: tp->t_flags |= TF_ECN_PERMIT;
! 1212: tiflags &= ~(TH_ECE|TH_CWR);
! 1213: tcpstat.tcps_ecn_accepts++;
! 1214: }
! 1215: }
! 1216: #endif
! 1217:
! 1218: if (tiflags & TH_ACK && SEQ_GT(tp->snd_una, tp->iss)) {
! 1219: tcpstat.tcps_connects++;
! 1220: soisconnected(so);
! 1221: tp->t_state = TCPS_ESTABLISHED;
! 1222: TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
! 1223: /* Do window scaling on this connection? */
! 1224: if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
! 1225: (TF_RCVD_SCALE|TF_REQ_SCALE)) {
! 1226: tp->snd_scale = tp->requested_s_scale;
! 1227: tp->rcv_scale = tp->request_r_scale;
! 1228: }
! 1229: tcp_reass_lock(tp);
! 1230: (void) tcp_reass(tp, (struct tcphdr *)0,
! 1231: (struct mbuf *)0, &tlen);
! 1232: tcp_reass_unlock(tp);
! 1233: /*
! 1234: * if we didn't have to retransmit the SYN,
! 1235: * use its rtt as our initial srtt & rtt var.
! 1236: */
! 1237: if (tp->t_rtttime)
! 1238: tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
! 1239: /*
! 1240: * Since new data was acked (the SYN), open the
! 1241: * congestion window by one MSS. We do this
! 1242: * here, because we won't go through the normal
! 1243: * ACK processing below. And since this is the
! 1244: * start of the connection, we know we are in
! 1245: * the exponential phase of slow-start.
! 1246: */
! 1247: tp->snd_cwnd += tp->t_maxseg;
! 1248: } else
! 1249: tp->t_state = TCPS_SYN_RECEIVED;
! 1250:
! 1251: #if 0
! 1252: trimthenstep6:
! 1253: #endif
! 1254: /*
! 1255: * Advance th->th_seq to correspond to first data byte.
! 1256: * If data, trim to stay within window,
! 1257: * dropping FIN if necessary.
! 1258: */
! 1259: th->th_seq++;
! 1260: if (tlen > tp->rcv_wnd) {
! 1261: todrop = tlen - tp->rcv_wnd;
! 1262: m_adj(m, -todrop);
! 1263: tlen = tp->rcv_wnd;
! 1264: tiflags &= ~TH_FIN;
! 1265: tcpstat.tcps_rcvpackafterwin++;
! 1266: tcpstat.tcps_rcvbyteafterwin += todrop;
! 1267: }
! 1268: tp->snd_wl1 = th->th_seq - 1;
! 1269: tp->rcv_up = th->th_seq;
! 1270: goto step6;
! 1271: /*
! 1272: * If a new connection request is received while in TIME_WAIT,
! 1273: * drop the old connection and start over if the
! 1274: * timestamp or the sequence numbers are above the previous
! 1275: * ones.
! 1276: */
! 1277: case TCPS_TIME_WAIT:
! 1278: if (((tiflags & (TH_SYN|TH_ACK)) == TH_SYN) &&
! 1279: ((opti.ts_present &&
! 1280: TSTMP_LT(tp->ts_recent, opti.ts_val)) ||
! 1281: SEQ_GT(th->th_seq, tp->rcv_nxt))) {
! 1282: /*
! 1283: * Advance the iss by at least 32768, but
! 1284: * clear the msb in order to make sure
! 1285: * that SEQ_LT(snd_nxt, iss).
! 1286: */
! 1287: iss = tp->snd_nxt +
! 1288: ((arc4random() & 0x7fffffff) | 0x8000);
! 1289: reuse = &iss;
! 1290: tp = tcp_close(tp);
! 1291: goto findpcb;
! 1292: }
! 1293: }
! 1294:
! 1295: /*
! 1296: * States other than LISTEN or SYN_SENT.
! 1297: * First check timestamp, if present.
! 1298: * Then check that at least some bytes of segment are within
! 1299: * receive window. If segment begins before rcv_nxt,
! 1300: * drop leading data (and SYN); if nothing left, just ack.
! 1301: *
! 1302: * RFC 1323 PAWS: If we have a timestamp reply on this segment
! 1303: * and it's less than tp->ts_recent, drop it.
! 1304: */
! 1305: if (opti.ts_present && (tiflags & TH_RST) == 0 && tp->ts_recent &&
! 1306: TSTMP_LT(opti.ts_val, tp->ts_recent)) {
! 1307:
! 1308: /* Check to see if ts_recent is over 24 days old. */
! 1309: if ((int)(tcp_now - tp->ts_recent_age) > TCP_PAWS_IDLE) {
! 1310: /*
! 1311: * Invalidate ts_recent. If this segment updates
! 1312: * ts_recent, the age will be reset later and ts_recent
! 1313: * will get a valid value. If it does not, setting
! 1314: * ts_recent to zero will at least satisfy the
! 1315: * requirement that zero be placed in the timestamp
! 1316: * echo reply when ts_recent isn't valid. The
! 1317: * age isn't reset until we get a valid ts_recent
! 1318: * because we don't want out-of-order segments to be
! 1319: * dropped when ts_recent is old.
! 1320: */
! 1321: tp->ts_recent = 0;
! 1322: } else {
! 1323: tcpstat.tcps_rcvduppack++;
! 1324: tcpstat.tcps_rcvdupbyte += tlen;
! 1325: tcpstat.tcps_pawsdrop++;
! 1326: goto dropafterack;
! 1327: }
! 1328: }
! 1329:
! 1330: todrop = tp->rcv_nxt - th->th_seq;
! 1331: if (todrop > 0) {
! 1332: if (tiflags & TH_SYN) {
! 1333: tiflags &= ~TH_SYN;
! 1334: th->th_seq++;
! 1335: if (th->th_urp > 1)
! 1336: th->th_urp--;
! 1337: else
! 1338: tiflags &= ~TH_URG;
! 1339: todrop--;
! 1340: }
! 1341: if (todrop > tlen ||
! 1342: (todrop == tlen && (tiflags & TH_FIN) == 0)) {
! 1343: /*
! 1344: * Any valid FIN must be to the left of the
! 1345: * window. At this point, FIN must be a
! 1346: * duplicate or out-of-sequence, so drop it.
! 1347: */
! 1348: tiflags &= ~TH_FIN;
! 1349: /*
! 1350: * Send ACK to resynchronize, and drop any data,
! 1351: * but keep on processing for RST or ACK.
! 1352: */
! 1353: tp->t_flags |= TF_ACKNOW;
! 1354: tcpstat.tcps_rcvdupbyte += todrop = tlen;
! 1355: tcpstat.tcps_rcvduppack++;
! 1356: } else {
! 1357: tcpstat.tcps_rcvpartduppack++;
! 1358: tcpstat.tcps_rcvpartdupbyte += todrop;
! 1359: }
! 1360: hdroptlen += todrop; /* drop from head afterwards */
! 1361: th->th_seq += todrop;
! 1362: tlen -= todrop;
! 1363: if (th->th_urp > todrop)
! 1364: th->th_urp -= todrop;
! 1365: else {
! 1366: tiflags &= ~TH_URG;
! 1367: th->th_urp = 0;
! 1368: }
! 1369: }
! 1370:
! 1371: /*
! 1372: * If new data are received on a connection after the
! 1373: * user processes are gone, then RST the other end.
! 1374: */
! 1375: if ((so->so_state & SS_NOFDREF) &&
! 1376: tp->t_state > TCPS_CLOSE_WAIT && tlen) {
! 1377: tp = tcp_close(tp);
! 1378: tcpstat.tcps_rcvafterclose++;
! 1379: goto dropwithreset;
! 1380: }
! 1381:
! 1382: /*
! 1383: * If segment ends after window, drop trailing data
! 1384: * (and PUSH and FIN); if nothing left, just ACK.
! 1385: */
! 1386: todrop = (th->th_seq + tlen) - (tp->rcv_nxt+tp->rcv_wnd);
! 1387: if (todrop > 0) {
! 1388: tcpstat.tcps_rcvpackafterwin++;
! 1389: if (todrop >= tlen) {
! 1390: tcpstat.tcps_rcvbyteafterwin += tlen;
! 1391: /*
! 1392: * If window is closed can only take segments at
! 1393: * window edge, and have to drop data and PUSH from
! 1394: * incoming segments. Continue processing, but
! 1395: * remember to ack. Otherwise, drop segment
! 1396: * and ack.
! 1397: */
! 1398: if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
! 1399: tp->t_flags |= TF_ACKNOW;
! 1400: tcpstat.tcps_rcvwinprobe++;
! 1401: } else
! 1402: goto dropafterack;
! 1403: } else
! 1404: tcpstat.tcps_rcvbyteafterwin += todrop;
! 1405: m_adj(m, -todrop);
! 1406: tlen -= todrop;
! 1407: tiflags &= ~(TH_PUSH|TH_FIN);
! 1408: }
! 1409:
! 1410: /*
! 1411: * If last ACK falls within this segment's sequence numbers,
! 1412: * record its timestamp if it's more recent.
! 1413: * Cf fix from Braden, see Stevens p. 870
! 1414: */
! 1415: if (opti.ts_present && TSTMP_GEQ(opti.ts_val, tp->ts_recent) &&
! 1416: SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
! 1417: if (SEQ_LEQ(tp->last_ack_sent, th->th_seq + tlen +
! 1418: ((tiflags & (TH_SYN|TH_FIN)) != 0)))
! 1419: tp->ts_recent = opti.ts_val;
! 1420: else
! 1421: tp->ts_recent = 0;
! 1422: tp->ts_recent_age = tcp_now;
! 1423: }
! 1424:
! 1425: /*
! 1426: * If the RST bit is set examine the state:
! 1427: * SYN_RECEIVED STATE:
! 1428: * If passive open, return to LISTEN state.
! 1429: * If active open, inform user that connection was refused.
! 1430: * ESTABLISHED, FIN_WAIT_1, FIN_WAIT2, CLOSE_WAIT STATES:
! 1431: * Inform user that connection was reset, and close tcb.
! 1432: * CLOSING, LAST_ACK, TIME_WAIT STATES
! 1433: * Close the tcb.
! 1434: */
! 1435: if (tiflags & TH_RST) {
! 1436: if (th->th_seq != tp->last_ack_sent &&
! 1437: th->th_seq != tp->rcv_nxt &&
! 1438: th->th_seq != (tp->rcv_nxt + 1))
! 1439: goto drop;
! 1440:
! 1441: switch (tp->t_state) {
! 1442: case TCPS_SYN_RECEIVED:
! 1443: #ifdef TCP_ECN
! 1444: /* if ECN is enabled, fall back to non-ecn at rexmit */
! 1445: if (tcp_do_ecn && !(tp->t_flags & TF_DISABLE_ECN))
! 1446: goto drop;
! 1447: #endif
! 1448: so->so_error = ECONNREFUSED;
! 1449: goto close;
! 1450:
! 1451: case TCPS_ESTABLISHED:
! 1452: case TCPS_FIN_WAIT_1:
! 1453: case TCPS_FIN_WAIT_2:
! 1454: case TCPS_CLOSE_WAIT:
! 1455: so->so_error = ECONNRESET;
! 1456: close:
! 1457: tp->t_state = TCPS_CLOSED;
! 1458: tcpstat.tcps_drops++;
! 1459: tp = tcp_close(tp);
! 1460: goto drop;
! 1461: case TCPS_CLOSING:
! 1462: case TCPS_LAST_ACK:
! 1463: case TCPS_TIME_WAIT:
! 1464: tp = tcp_close(tp);
! 1465: goto drop;
! 1466: }
! 1467: }
! 1468:
! 1469: /*
! 1470: * If a SYN is in the window, then this is an
! 1471: * error and we ACK and drop the packet.
! 1472: */
! 1473: if (tiflags & TH_SYN)
! 1474: goto dropafterack_ratelim;
! 1475:
! 1476: /*
! 1477: * If the ACK bit is off we drop the segment and return.
! 1478: */
! 1479: if ((tiflags & TH_ACK) == 0) {
! 1480: if (tp->t_flags & TF_ACKNOW)
! 1481: goto dropafterack;
! 1482: else
! 1483: goto drop;
! 1484: }
! 1485:
! 1486: /*
! 1487: * Ack processing.
! 1488: */
! 1489: switch (tp->t_state) {
! 1490:
! 1491: /*
! 1492: * In SYN_RECEIVED state, the ack ACKs our SYN, so enter
! 1493: * ESTABLISHED state and continue processing.
! 1494: * The ACK was checked above.
! 1495: */
! 1496: case TCPS_SYN_RECEIVED:
! 1497: tcpstat.tcps_connects++;
! 1498: soisconnected(so);
! 1499: tp->t_state = TCPS_ESTABLISHED;
! 1500: TCP_TIMER_ARM(tp, TCPT_KEEP, tcp_keepidle);
! 1501: /* Do window scaling? */
! 1502: if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
! 1503: (TF_RCVD_SCALE|TF_REQ_SCALE)) {
! 1504: tp->snd_scale = tp->requested_s_scale;
! 1505: tp->rcv_scale = tp->request_r_scale;
! 1506: }
! 1507: tcp_reass_lock(tp);
! 1508: (void) tcp_reass(tp, (struct tcphdr *)0, (struct mbuf *)0,
! 1509: &tlen);
! 1510: tcp_reass_unlock(tp);
! 1511: tp->snd_wl1 = th->th_seq - 1;
! 1512: /* fall into ... */
! 1513:
! 1514: /*
! 1515: * In ESTABLISHED state: drop duplicate ACKs; ACK out of range
! 1516: * ACKs. If the ack is in the range
! 1517: * tp->snd_una < th->th_ack <= tp->snd_max
! 1518: * then advance tp->snd_una to th->th_ack and drop
! 1519: * data from the retransmission queue. If this ACK reflects
! 1520: * more up to date window information we update our window information.
! 1521: */
! 1522: case TCPS_ESTABLISHED:
! 1523: case TCPS_FIN_WAIT_1:
! 1524: case TCPS_FIN_WAIT_2:
! 1525: case TCPS_CLOSE_WAIT:
! 1526: case TCPS_CLOSING:
! 1527: case TCPS_LAST_ACK:
! 1528: case TCPS_TIME_WAIT:
! 1529: #ifdef TCP_ECN
! 1530: /*
! 1531: * if we receive ECE and are not already in recovery phase,
! 1532: * reduce cwnd by half but don't slow-start.
! 1533: * advance snd_last to snd_max not to reduce cwnd again
! 1534: * until all outstanding packets are acked.
! 1535: */
! 1536: if (tcp_do_ecn && (tiflags & TH_ECE)) {
! 1537: if ((tp->t_flags & TF_ECN_PERMIT) &&
! 1538: SEQ_GEQ(tp->snd_una, tp->snd_last)) {
! 1539: u_int win;
! 1540:
! 1541: win = min(tp->snd_wnd, tp->snd_cwnd) / tp->t_maxseg;
! 1542: if (win > 1) {
! 1543: tp->snd_ssthresh = win / 2 * tp->t_maxseg;
! 1544: tp->snd_cwnd = tp->snd_ssthresh;
! 1545: tp->snd_last = tp->snd_max;
! 1546: tp->t_flags |= TF_SEND_CWR;
! 1547: tcpstat.tcps_cwr_ecn++;
! 1548: }
! 1549: }
! 1550: tcpstat.tcps_ecn_rcvece++;
! 1551: }
! 1552: /*
! 1553: * if we receive CWR, we know that the peer has reduced
! 1554: * its congestion window. stop sending ecn-echo.
! 1555: */
! 1556: if ((tiflags & TH_CWR)) {
! 1557: tp->t_flags &= ~TF_RCVD_CE;
! 1558: tcpstat.tcps_ecn_rcvcwr++;
! 1559: }
! 1560: #endif /* TCP_ECN */
! 1561:
! 1562: if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
! 1563: /*
! 1564: * Duplicate/old ACK processing.
! 1565: * Increments t_dupacks:
! 1566: * Pure duplicate (same seq/ack/window, no data)
! 1567: * Doesn't affect t_dupacks:
! 1568: * Data packets.
! 1569: * Normal window updates (window opens)
! 1570: * Resets t_dupacks:
! 1571: * New data ACKed.
! 1572: * Window shrinks
! 1573: * Old ACK
! 1574: */
! 1575: if (tlen) {
! 1576: /* Drop very old ACKs unless th_seq matches */
! 1577: if (th->th_seq != tp->rcv_nxt &&
! 1578: SEQ_LT(th->th_ack,
! 1579: tp->snd_una - tp->max_sndwnd)) {
! 1580: tcpstat.tcps_rcvacktooold++;
! 1581: goto drop;
! 1582: }
! 1583: break;
! 1584: }
! 1585: /*
! 1586: * If we get an old ACK, there is probably packet
! 1587: * reordering going on. Be conservative and reset
! 1588: * t_dupacks so that we are less aggressive in
! 1589: * doing a fast retransmit.
! 1590: */
! 1591: if (th->th_ack != tp->snd_una) {
! 1592: tp->t_dupacks = 0;
! 1593: break;
! 1594: }
! 1595: if (tiwin == tp->snd_wnd) {
! 1596: tcpstat.tcps_rcvdupack++;
! 1597: /*
! 1598: * If we have outstanding data (other than
! 1599: * a window probe), this is a completely
! 1600: * duplicate ack (ie, window info didn't
! 1601: * change), the ack is the biggest we've
! 1602: * seen and we've seen exactly our rexmt
! 1603: * threshold of them, assume a packet
! 1604: * has been dropped and retransmit it.
! 1605: * Kludge snd_nxt & the congestion
! 1606: * window so we send only this one
! 1607: * packet.
! 1608: *
! 1609: * We know we're losing at the current
! 1610: * window size so do congestion avoidance
! 1611: * (set ssthresh to half the current window
! 1612: * and pull our congestion window back to
! 1613: * the new ssthresh).
! 1614: *
! 1615: * Dup acks mean that packets have left the
! 1616: * network (they're now cached at the receiver)
! 1617: * so bump cwnd by the amount in the receiver
! 1618: * to keep a constant cwnd packets in the
! 1619: * network.
! 1620: */
! 1621: if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0)
! 1622: tp->t_dupacks = 0;
! 1623: #if defined(TCP_SACK) && defined(TCP_FACK)
! 1624: /*
! 1625: * In FACK, can enter fast rec. if the receiver
! 1626: * reports a reass. queue longer than 3 segs.
! 1627: */
! 1628: else if (++tp->t_dupacks == tcprexmtthresh ||
! 1629: ((SEQ_GT(tp->snd_fack, tcprexmtthresh *
! 1630: tp->t_maxseg + tp->snd_una)) &&
! 1631: SEQ_GT(tp->snd_una, tp->snd_last))) {
! 1632: #else
! 1633: else if (++tp->t_dupacks == tcprexmtthresh) {
! 1634: #endif /* TCP_FACK */
! 1635: tcp_seq onxt = tp->snd_nxt;
! 1636: u_long win =
! 1637: ulmin(tp->snd_wnd, tp->snd_cwnd) /
! 1638: 2 / tp->t_maxseg;
! 1639:
! 1640: #if defined(TCP_SACK) || defined(TCP_ECN)
! 1641: if (SEQ_LT(th->th_ack, tp->snd_last)){
! 1642: /*
! 1643: * False fast retx after
! 1644: * timeout. Do not cut window.
! 1645: */
! 1646: tp->t_dupacks = 0;
! 1647: goto drop;
! 1648: }
! 1649: #endif
! 1650: if (win < 2)
! 1651: win = 2;
! 1652: tp->snd_ssthresh = win * tp->t_maxseg;
! 1653: #if defined(TCP_SACK)
! 1654: tp->snd_last = tp->snd_max;
! 1655: #endif
! 1656: #ifdef TCP_SACK
! 1657: if (tp->sack_enable) {
! 1658: TCP_TIMER_DISARM(tp, TCPT_REXMT);
! 1659: tp->t_rtttime = 0;
! 1660: #ifdef TCP_ECN
! 1661: tp->t_flags |= TF_SEND_CWR;
! 1662: #endif
! 1663: #if 1 /* TCP_ECN */
! 1664: tcpstat.tcps_cwr_frecovery++;
! 1665: #endif
! 1666: tcpstat.tcps_sack_recovery_episode++;
! 1667: #if defined(TCP_SACK) && defined(TCP_FACK)
! 1668: tp->t_dupacks = tcprexmtthresh;
! 1669: (void) tcp_output(tp);
! 1670: /*
! 1671: * During FR, snd_cwnd is held
! 1672: * constant for FACK.
! 1673: */
! 1674: tp->snd_cwnd = tp->snd_ssthresh;
! 1675: #else
! 1676: /*
! 1677: * tcp_output() will send
! 1678: * oldest SACK-eligible rtx.
! 1679: */
! 1680: (void) tcp_output(tp);
! 1681: tp->snd_cwnd = tp->snd_ssthresh+
! 1682: tp->t_maxseg * tp->t_dupacks;
! 1683: #endif /* TCP_FACK */
! 1684: goto drop;
! 1685: }
! 1686: #endif /* TCP_SACK */
! 1687: TCP_TIMER_DISARM(tp, TCPT_REXMT);
! 1688: tp->t_rtttime = 0;
! 1689: tp->snd_nxt = th->th_ack;
! 1690: tp->snd_cwnd = tp->t_maxseg;
! 1691: #ifdef TCP_ECN
! 1692: tp->t_flags |= TF_SEND_CWR;
! 1693: #endif
! 1694: #if 1 /* TCP_ECN */
! 1695: tcpstat.tcps_cwr_frecovery++;
! 1696: #endif
! 1697: tcpstat.tcps_sndrexmitfast++;
! 1698: (void) tcp_output(tp);
! 1699:
! 1700: tp->snd_cwnd = tp->snd_ssthresh +
! 1701: tp->t_maxseg * tp->t_dupacks;
! 1702: if (SEQ_GT(onxt, tp->snd_nxt))
! 1703: tp->snd_nxt = onxt;
! 1704: goto drop;
! 1705: } else if (tp->t_dupacks > tcprexmtthresh) {
! 1706: #if defined(TCP_SACK) && defined(TCP_FACK)
! 1707: /*
! 1708: * while (awnd < cwnd)
! 1709: * sendsomething();
! 1710: */
! 1711: if (tp->sack_enable) {
! 1712: if (tp->snd_awnd < tp->snd_cwnd)
! 1713: tcp_output(tp);
! 1714: goto drop;
! 1715: }
! 1716: #endif /* TCP_FACK */
! 1717: tp->snd_cwnd += tp->t_maxseg;
! 1718: (void) tcp_output(tp);
! 1719: goto drop;
! 1720: }
! 1721: } else if (tiwin < tp->snd_wnd) {
! 1722: /*
! 1723: * The window was retracted! Previous dup
! 1724: * ACKs may have been due to packets arriving
! 1725: * after the shrunken window, not a missing
! 1726: * packet, so play it safe and reset t_dupacks
! 1727: */
! 1728: tp->t_dupacks = 0;
! 1729: }
! 1730: break;
! 1731: }
! 1732: /*
! 1733: * If the congestion window was inflated to account
! 1734: * for the other side's cached packets, retract it.
! 1735: */
! 1736: #if defined(TCP_SACK)
! 1737: if (tp->sack_enable) {
! 1738: if (tp->t_dupacks >= tcprexmtthresh) {
! 1739: /* Check for a partial ACK */
! 1740: if (tcp_sack_partialack(tp, th)) {
! 1741: #if defined(TCP_SACK) && defined(TCP_FACK)
! 1742: /* Force call to tcp_output */
! 1743: if (tp->snd_awnd < tp->snd_cwnd)
! 1744: needoutput = 1;
! 1745: #else
! 1746: tp->snd_cwnd += tp->t_maxseg;
! 1747: needoutput = 1;
! 1748: #endif /* TCP_FACK */
! 1749: } else {
! 1750: /* Out of fast recovery */
! 1751: tp->snd_cwnd = tp->snd_ssthresh;
! 1752: if (tcp_seq_subtract(tp->snd_max,
! 1753: th->th_ack) < tp->snd_ssthresh)
! 1754: tp->snd_cwnd =
! 1755: tcp_seq_subtract(tp->snd_max,
! 1756: th->th_ack);
! 1757: tp->t_dupacks = 0;
! 1758: #if defined(TCP_SACK) && defined(TCP_FACK)
! 1759: if (SEQ_GT(th->th_ack, tp->snd_fack))
! 1760: tp->snd_fack = th->th_ack;
! 1761: #endif /* TCP_FACK */
! 1762: }
! 1763: }
! 1764: } else {
! 1765: if (tp->t_dupacks >= tcprexmtthresh &&
! 1766: !tcp_newreno(tp, th)) {
! 1767: /* Out of fast recovery */
! 1768: tp->snd_cwnd = tp->snd_ssthresh;
! 1769: if (tcp_seq_subtract(tp->snd_max, th->th_ack) <
! 1770: tp->snd_ssthresh)
! 1771: tp->snd_cwnd =
! 1772: tcp_seq_subtract(tp->snd_max,
! 1773: th->th_ack);
! 1774: tp->t_dupacks = 0;
! 1775: }
! 1776: }
! 1777: if (tp->t_dupacks < tcprexmtthresh)
! 1778: tp->t_dupacks = 0;
! 1779: #else /* else no TCP_SACK */
! 1780: if (tp->t_dupacks >= tcprexmtthresh &&
! 1781: tp->snd_cwnd > tp->snd_ssthresh)
! 1782: tp->snd_cwnd = tp->snd_ssthresh;
! 1783: tp->t_dupacks = 0;
! 1784: #endif
! 1785: if (SEQ_GT(th->th_ack, tp->snd_max)) {
! 1786: tcpstat.tcps_rcvacktoomuch++;
! 1787: goto dropafterack_ratelim;
! 1788: }
! 1789: acked = th->th_ack - tp->snd_una;
! 1790: tcpstat.tcps_rcvackpack++;
! 1791: tcpstat.tcps_rcvackbyte += acked;
! 1792:
! 1793: /*
! 1794: * If we have a timestamp reply, update smoothed
! 1795: * round trip time. If no timestamp is present but
! 1796: * transmit timer is running and timed sequence
! 1797: * number was acked, update smoothed round trip time.
! 1798: * Since we now have an rtt measurement, cancel the
! 1799: * timer backoff (cf., Phil Karn's retransmit alg.).
! 1800: * Recompute the initial retransmit timer.
! 1801: */
! 1802: if (opti.ts_present && opti.ts_ecr)
! 1803: tcp_xmit_timer(tp, tcp_now - opti.ts_ecr);
! 1804: else if (tp->t_rtttime && SEQ_GT(th->th_ack, tp->t_rtseq))
! 1805: tcp_xmit_timer(tp, tcp_now - tp->t_rtttime);
! 1806:
! 1807: /*
! 1808: * If all outstanding data is acked, stop retransmit
! 1809: * timer and remember to restart (more output or persist).
! 1810: * If there is more data to be acked, restart retransmit
! 1811: * timer, using current (possibly backed-off) value.
! 1812: */
! 1813: if (th->th_ack == tp->snd_max) {
! 1814: TCP_TIMER_DISARM(tp, TCPT_REXMT);
! 1815: needoutput = 1;
! 1816: } else if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
! 1817: TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
! 1818: /*
! 1819: * When new data is acked, open the congestion window.
! 1820: * If the window gives us less than ssthresh packets
! 1821: * in flight, open exponentially (maxseg per packet).
! 1822: * Otherwise open linearly: maxseg per window
! 1823: * (maxseg^2 / cwnd per packet).
! 1824: */
! 1825: {
! 1826: u_int cw = tp->snd_cwnd;
! 1827: u_int incr = tp->t_maxseg;
! 1828:
! 1829: if (cw > tp->snd_ssthresh)
! 1830: incr = incr * incr / cw;
! 1831: #if defined (TCP_SACK)
! 1832: if (tp->t_dupacks < tcprexmtthresh)
! 1833: #endif
! 1834: tp->snd_cwnd = ulmin(cw + incr, TCP_MAXWIN<<tp->snd_scale);
! 1835: }
! 1836: ND6_HINT(tp);
! 1837: if (acked > so->so_snd.sb_cc) {
! 1838: tp->snd_wnd -= so->so_snd.sb_cc;
! 1839: sbdrop(&so->so_snd, (int)so->so_snd.sb_cc);
! 1840: ourfinisacked = 1;
! 1841: } else {
! 1842: sbdrop(&so->so_snd, acked);
! 1843: tp->snd_wnd -= acked;
! 1844: ourfinisacked = 0;
! 1845: }
! 1846: if (sb_notify(&so->so_snd))
! 1847: sowwakeup(so);
! 1848:
! 1849: /*
! 1850: * If we had a pending ICMP message that referred to data
! 1851: * that have just been acknowledged, disregard the recorded
! 1852: * ICMP message.
! 1853: */
! 1854: if ((tp->t_flags & TF_PMTUD_PEND) &&
! 1855: SEQ_GT(th->th_ack, tp->t_pmtud_th_seq))
! 1856: tp->t_flags &= ~TF_PMTUD_PEND;
! 1857:
! 1858: /*
! 1859: * Keep track of the largest chunk of data acknowledged
! 1860: * since last PMTU update
! 1861: */
! 1862: if (tp->t_pmtud_mss_acked < acked)
! 1863: tp->t_pmtud_mss_acked = acked;
! 1864:
! 1865: tp->snd_una = th->th_ack;
! 1866: #ifdef TCP_ECN
! 1867: /* sync snd_last with snd_una */
! 1868: if (SEQ_GT(tp->snd_una, tp->snd_last))
! 1869: tp->snd_last = tp->snd_una;
! 1870: #endif
! 1871: if (SEQ_LT(tp->snd_nxt, tp->snd_una))
! 1872: tp->snd_nxt = tp->snd_una;
! 1873: #if defined (TCP_SACK) && defined (TCP_FACK)
! 1874: if (SEQ_GT(tp->snd_una, tp->snd_fack)) {
! 1875: tp->snd_fack = tp->snd_una;
! 1876: /* Update snd_awnd for partial ACK
! 1877: * without any SACK blocks.
! 1878: */
! 1879: tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt,
! 1880: tp->snd_fack) + tp->retran_data;
! 1881: }
! 1882: #endif
! 1883:
! 1884: switch (tp->t_state) {
! 1885:
! 1886: /*
! 1887: * In FIN_WAIT_1 STATE in addition to the processing
! 1888: * for the ESTABLISHED state if our FIN is now acknowledged
! 1889: * then enter FIN_WAIT_2.
! 1890: */
! 1891: case TCPS_FIN_WAIT_1:
! 1892: if (ourfinisacked) {
! 1893: /*
! 1894: * If we can't receive any more
! 1895: * data, then closing user can proceed.
! 1896: * Starting the timer is contrary to the
! 1897: * specification, but if we don't get a FIN
! 1898: * we'll hang forever.
! 1899: */
! 1900: if (so->so_state & SS_CANTRCVMORE) {
! 1901: soisdisconnected(so);
! 1902: TCP_TIMER_ARM(tp, TCPT_2MSL, tcp_maxidle);
! 1903: }
! 1904: tp->t_state = TCPS_FIN_WAIT_2;
! 1905: }
! 1906: break;
! 1907:
! 1908: /*
! 1909: * In CLOSING STATE in addition to the processing for
! 1910: * the ESTABLISHED state if the ACK acknowledges our FIN
! 1911: * then enter the TIME-WAIT state, otherwise ignore
! 1912: * the segment.
! 1913: */
! 1914: case TCPS_CLOSING:
! 1915: if (ourfinisacked) {
! 1916: tp->t_state = TCPS_TIME_WAIT;
! 1917: tcp_canceltimers(tp);
! 1918: TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
! 1919: soisdisconnected(so);
! 1920: }
! 1921: break;
! 1922:
! 1923: /*
! 1924: * In LAST_ACK, we may still be waiting for data to drain
! 1925: * and/or to be acked, as well as for the ack of our FIN.
! 1926: * If our FIN is now acknowledged, delete the TCB,
! 1927: * enter the closed state and return.
! 1928: */
! 1929: case TCPS_LAST_ACK:
! 1930: if (ourfinisacked) {
! 1931: tp = tcp_close(tp);
! 1932: goto drop;
! 1933: }
! 1934: break;
! 1935:
! 1936: /*
! 1937: * In TIME_WAIT state the only thing that should arrive
! 1938: * is a retransmission of the remote FIN. Acknowledge
! 1939: * it and restart the finack timer.
! 1940: */
! 1941: case TCPS_TIME_WAIT:
! 1942: TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
! 1943: goto dropafterack;
! 1944: }
! 1945: }
! 1946:
! 1947: step6:
! 1948: /*
! 1949: * Update window information.
! 1950: * Don't look at window if no ACK: TAC's send garbage on first SYN.
! 1951: */
! 1952: if ((tiflags & TH_ACK) &&
! 1953: (SEQ_LT(tp->snd_wl1, th->th_seq) || (tp->snd_wl1 == th->th_seq &&
! 1954: (SEQ_LT(tp->snd_wl2, th->th_ack) ||
! 1955: (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
! 1956: /* keep track of pure window updates */
! 1957: if (tlen == 0 &&
! 1958: tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd)
! 1959: tcpstat.tcps_rcvwinupd++;
! 1960: tp->snd_wnd = tiwin;
! 1961: tp->snd_wl1 = th->th_seq;
! 1962: tp->snd_wl2 = th->th_ack;
! 1963: if (tp->snd_wnd > tp->max_sndwnd)
! 1964: tp->max_sndwnd = tp->snd_wnd;
! 1965: needoutput = 1;
! 1966: }
! 1967:
! 1968: /*
! 1969: * Process segments with URG.
! 1970: */
! 1971: if ((tiflags & TH_URG) && th->th_urp &&
! 1972: TCPS_HAVERCVDFIN(tp->t_state) == 0) {
! 1973: /*
! 1974: * This is a kludge, but if we receive and accept
! 1975: * random urgent pointers, we'll crash in
! 1976: * soreceive. It's hard to imagine someone
! 1977: * actually wanting to send this much urgent data.
! 1978: */
! 1979: if (th->th_urp + so->so_rcv.sb_cc > sb_max) {
! 1980: th->th_urp = 0; /* XXX */
! 1981: tiflags &= ~TH_URG; /* XXX */
! 1982: goto dodata; /* XXX */
! 1983: }
! 1984: /*
! 1985: * If this segment advances the known urgent pointer,
! 1986: * then mark the data stream. This should not happen
! 1987: * in CLOSE_WAIT, CLOSING, LAST_ACK or TIME_WAIT STATES since
! 1988: * a FIN has been received from the remote side.
! 1989: * In these states we ignore the URG.
! 1990: *
! 1991: * According to RFC961 (Assigned Protocols),
! 1992: * the urgent pointer points to the last octet
! 1993: * of urgent data. We continue, however,
! 1994: * to consider it to indicate the first octet
! 1995: * of data past the urgent section as the original
! 1996: * spec states (in one of two places).
! 1997: */
! 1998: if (SEQ_GT(th->th_seq+th->th_urp, tp->rcv_up)) {
! 1999: tp->rcv_up = th->th_seq + th->th_urp;
! 2000: so->so_oobmark = so->so_rcv.sb_cc +
! 2001: (tp->rcv_up - tp->rcv_nxt) - 1;
! 2002: if (so->so_oobmark == 0)
! 2003: so->so_state |= SS_RCVATMARK;
! 2004: sohasoutofband(so);
! 2005: tp->t_oobflags &= ~(TCPOOB_HAVEDATA | TCPOOB_HADDATA);
! 2006: }
! 2007: /*
! 2008: * Remove out of band data so it doesn't get presented to user.
! 2009: * This can happen independent of advancing the URG pointer,
! 2010: * but if two URG's are pending at once, some out-of-band
! 2011: * data may creep in... ick.
! 2012: */
! 2013: if (th->th_urp <= (u_int16_t) tlen
! 2014: #ifdef SO_OOBINLINE
! 2015: && (so->so_options & SO_OOBINLINE) == 0
! 2016: #endif
! 2017: )
! 2018: tcp_pulloutofband(so, th->th_urp, m, hdroptlen);
! 2019: } else
! 2020: /*
! 2021: * If no out of band data is expected,
! 2022: * pull receive urgent pointer along
! 2023: * with the receive window.
! 2024: */
! 2025: if (SEQ_GT(tp->rcv_nxt, tp->rcv_up))
! 2026: tp->rcv_up = tp->rcv_nxt;
! 2027: dodata: /* XXX */
! 2028:
! 2029: /*
! 2030: * Process the segment text, merging it into the TCP sequencing queue,
! 2031: * and arranging for acknowledgment of receipt if necessary.
! 2032: * This process logically involves adjusting tp->rcv_wnd as data
! 2033: * is presented to the user (this happens in tcp_usrreq.c,
! 2034: * case PRU_RCVD). If a FIN has already been received on this
! 2035: * connection then we just ignore the text.
! 2036: */
! 2037: if ((tlen || (tiflags & TH_FIN)) &&
! 2038: TCPS_HAVERCVDFIN(tp->t_state) == 0) {
! 2039: #ifdef TCP_SACK
! 2040: tcp_seq laststart = th->th_seq;
! 2041: tcp_seq lastend = th->th_seq + tlen;
! 2042: #endif
! 2043: tcp_reass_lock(tp);
! 2044: if (th->th_seq == tp->rcv_nxt && TAILQ_EMPTY(&tp->t_segq) &&
! 2045: tp->t_state == TCPS_ESTABLISHED) {
! 2046: tcp_reass_unlock(tp);
! 2047: TCP_SETUP_ACK(tp, tiflags);
! 2048: tp->rcv_nxt += tlen;
! 2049: tiflags = th->th_flags & TH_FIN;
! 2050: tcpstat.tcps_rcvpack++;
! 2051: tcpstat.tcps_rcvbyte += tlen;
! 2052: ND6_HINT(tp);
! 2053: if (so->so_state & SS_CANTRCVMORE)
! 2054: m_freem(m);
! 2055: else {
! 2056: m_adj(m, hdroptlen);
! 2057: sbappendstream(&so->so_rcv, m);
! 2058: }
! 2059: sorwakeup(so);
! 2060: } else {
! 2061: m_adj(m, hdroptlen);
! 2062: tiflags = tcp_reass(tp, th, m, &tlen);
! 2063: tcp_reass_unlock(tp);
! 2064: tp->t_flags |= TF_ACKNOW;
! 2065: }
! 2066: #ifdef TCP_SACK
! 2067: if (tp->sack_enable)
! 2068: tcp_update_sack_list(tp, laststart, lastend);
! 2069: #endif
! 2070:
! 2071: /*
! 2072: * variable len never referenced again in modern BSD,
! 2073: * so why bother computing it ??
! 2074: */
! 2075: #if 0
! 2076: /*
! 2077: * Note the amount of data that peer has sent into
! 2078: * our window, in order to estimate the sender's
! 2079: * buffer size.
! 2080: */
! 2081: len = so->so_rcv.sb_hiwat - (tp->rcv_adv - tp->rcv_nxt);
! 2082: #endif /* 0 */
! 2083: } else {
! 2084: m_freem(m);
! 2085: tiflags &= ~TH_FIN;
! 2086: }
! 2087:
! 2088: /*
! 2089: * If FIN is received ACK the FIN and let the user know
! 2090: * that the connection is closing. Ignore a FIN received before
! 2091: * the connection is fully established.
! 2092: */
! 2093: if ((tiflags & TH_FIN) && TCPS_HAVEESTABLISHED(tp->t_state)) {
! 2094: if (TCPS_HAVERCVDFIN(tp->t_state) == 0) {
! 2095: socantrcvmore(so);
! 2096: tp->t_flags |= TF_ACKNOW;
! 2097: tp->rcv_nxt++;
! 2098: }
! 2099: switch (tp->t_state) {
! 2100:
! 2101: /*
! 2102: * In ESTABLISHED STATE enter the CLOSE_WAIT state.
! 2103: */
! 2104: case TCPS_ESTABLISHED:
! 2105: tp->t_state = TCPS_CLOSE_WAIT;
! 2106: break;
! 2107:
! 2108: /*
! 2109: * If still in FIN_WAIT_1 STATE FIN has not been acked so
! 2110: * enter the CLOSING state.
! 2111: */
! 2112: case TCPS_FIN_WAIT_1:
! 2113: tp->t_state = TCPS_CLOSING;
! 2114: break;
! 2115:
! 2116: /*
! 2117: * In FIN_WAIT_2 state enter the TIME_WAIT state,
! 2118: * starting the time-wait timer, turning off the other
! 2119: * standard timers.
! 2120: */
! 2121: case TCPS_FIN_WAIT_2:
! 2122: tp->t_state = TCPS_TIME_WAIT;
! 2123: tcp_canceltimers(tp);
! 2124: TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
! 2125: soisdisconnected(so);
! 2126: break;
! 2127:
! 2128: /*
! 2129: * In TIME_WAIT state restart the 2 MSL time_wait timer.
! 2130: */
! 2131: case TCPS_TIME_WAIT:
! 2132: TCP_TIMER_ARM(tp, TCPT_2MSL, 2 * TCPTV_MSL);
! 2133: break;
! 2134: }
! 2135: }
! 2136: if (so->so_options & SO_DEBUG) {
! 2137: switch (tp->pf) {
! 2138: #ifdef INET6
! 2139: case PF_INET6:
! 2140: tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti6,
! 2141: 0, tlen);
! 2142: break;
! 2143: #endif /* INET6 */
! 2144: case PF_INET:
! 2145: tcp_trace(TA_INPUT, ostate, tp, (caddr_t) &tcp_saveti,
! 2146: 0, tlen);
! 2147: break;
! 2148: }
! 2149: }
! 2150:
! 2151: /*
! 2152: * Return any desired output.
! 2153: */
! 2154: if (needoutput || (tp->t_flags & TF_ACKNOW)) {
! 2155: (void) tcp_output(tp);
! 2156: }
! 2157: return;
! 2158:
! 2159: badsyn:
! 2160: /*
! 2161: * Received a bad SYN. Increment counters and dropwithreset.
! 2162: */
! 2163: tcpstat.tcps_badsyn++;
! 2164: tp = NULL;
! 2165: goto dropwithreset;
! 2166:
! 2167: dropafterack_ratelim:
! 2168: if (ppsratecheck(&tcp_ackdrop_ppslim_last, &tcp_ackdrop_ppslim_count,
! 2169: tcp_ackdrop_ppslim) == 0) {
! 2170: /* XXX stat */
! 2171: goto drop;
! 2172: }
! 2173: /* ...fall into dropafterack... */
! 2174:
! 2175: dropafterack:
! 2176: /*
! 2177: * Generate an ACK dropping incoming segment if it occupies
! 2178: * sequence space, where the ACK reflects our state.
! 2179: */
! 2180: if (tiflags & TH_RST)
! 2181: goto drop;
! 2182: m_freem(m);
! 2183: tp->t_flags |= TF_ACKNOW;
! 2184: (void) tcp_output(tp);
! 2185: return;
! 2186:
! 2187: dropwithreset_ratelim:
! 2188: /*
! 2189: * We may want to rate-limit RSTs in certain situations,
! 2190: * particularly if we are sending an RST in response to
! 2191: * an attempt to connect to or otherwise communicate with
! 2192: * a port for which we have no socket.
! 2193: */
! 2194: if (ppsratecheck(&tcp_rst_ppslim_last, &tcp_rst_ppslim_count,
! 2195: tcp_rst_ppslim) == 0) {
! 2196: /* XXX stat */
! 2197: goto drop;
! 2198: }
! 2199: /* ...fall into dropwithreset... */
! 2200:
! 2201: dropwithreset:
! 2202: /*
! 2203: * Generate a RST, dropping incoming segment.
! 2204: * Make ACK acceptable to originator of segment.
! 2205: * Don't bother to respond to RST.
! 2206: */
! 2207: if (tiflags & TH_RST)
! 2208: goto drop;
! 2209: if (tiflags & TH_ACK) {
! 2210: tcp_respond(tp, mtod(m, caddr_t), m, (tcp_seq)0, th->th_ack,
! 2211: TH_RST);
! 2212: } else {
! 2213: if (tiflags & TH_SYN)
! 2214: tlen++;
! 2215: tcp_respond(tp, mtod(m, caddr_t), m, th->th_seq + tlen,
! 2216: (tcp_seq)0, TH_RST|TH_ACK);
! 2217: }
! 2218: return;
! 2219:
! 2220: drop:
! 2221: /*
! 2222: * Drop space held by incoming segment and return.
! 2223: */
! 2224: if (tp && (tp->t_inpcb->inp_socket->so_options & SO_DEBUG)) {
! 2225: switch (tp->pf) {
! 2226: #ifdef INET6
! 2227: case PF_INET6:
! 2228: tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti6,
! 2229: 0, tlen);
! 2230: break;
! 2231: #endif /* INET6 */
! 2232: case PF_INET:
! 2233: tcp_trace(TA_DROP, ostate, tp, (caddr_t) &tcp_saveti,
! 2234: 0, tlen);
! 2235: break;
! 2236: }
! 2237: }
! 2238:
! 2239: m_freem(m);
! 2240: return;
! 2241: }
! 2242:
! 2243: int
! 2244: tcp_dooptions(tp, cp, cnt, th, m, iphlen, oi)
! 2245: struct tcpcb *tp;
! 2246: u_char *cp;
! 2247: int cnt;
! 2248: struct tcphdr *th;
! 2249: struct mbuf *m;
! 2250: int iphlen;
! 2251: struct tcp_opt_info *oi;
! 2252: {
! 2253: u_int16_t mss = 0;
! 2254: int opt, optlen;
! 2255: #ifdef TCP_SIGNATURE
! 2256: caddr_t sigp = NULL;
! 2257: struct tdb *tdb = NULL;
! 2258: #endif /* TCP_SIGNATURE */
! 2259:
! 2260: for (; cp && cnt > 0; cnt -= optlen, cp += optlen) {
! 2261: opt = cp[0];
! 2262: if (opt == TCPOPT_EOL)
! 2263: break;
! 2264: if (opt == TCPOPT_NOP)
! 2265: optlen = 1;
! 2266: else {
! 2267: if (cnt < 2)
! 2268: break;
! 2269: optlen = cp[1];
! 2270: if (optlen < 2 || optlen > cnt)
! 2271: break;
! 2272: }
! 2273: switch (opt) {
! 2274:
! 2275: default:
! 2276: continue;
! 2277:
! 2278: case TCPOPT_MAXSEG:
! 2279: if (optlen != TCPOLEN_MAXSEG)
! 2280: continue;
! 2281: if (!(th->th_flags & TH_SYN))
! 2282: continue;
! 2283: if (TCPS_HAVERCVDSYN(tp->t_state))
! 2284: continue;
! 2285: bcopy((char *) cp + 2, (char *) &mss, sizeof(mss));
! 2286: NTOHS(mss);
! 2287: oi->maxseg = mss;
! 2288: break;
! 2289:
! 2290: case TCPOPT_WINDOW:
! 2291: if (optlen != TCPOLEN_WINDOW)
! 2292: continue;
! 2293: if (!(th->th_flags & TH_SYN))
! 2294: continue;
! 2295: if (TCPS_HAVERCVDSYN(tp->t_state))
! 2296: continue;
! 2297: tp->t_flags |= TF_RCVD_SCALE;
! 2298: tp->requested_s_scale = min(cp[2], TCP_MAX_WINSHIFT);
! 2299: break;
! 2300:
! 2301: case TCPOPT_TIMESTAMP:
! 2302: if (optlen != TCPOLEN_TIMESTAMP)
! 2303: continue;
! 2304: oi->ts_present = 1;
! 2305: bcopy(cp + 2, &oi->ts_val, sizeof(oi->ts_val));
! 2306: NTOHL(oi->ts_val);
! 2307: bcopy(cp + 6, &oi->ts_ecr, sizeof(oi->ts_ecr));
! 2308: NTOHL(oi->ts_ecr);
! 2309:
! 2310: if (!(th->th_flags & TH_SYN))
! 2311: continue;
! 2312: if (TCPS_HAVERCVDSYN(tp->t_state))
! 2313: continue;
! 2314: /*
! 2315: * A timestamp received in a SYN makes
! 2316: * it ok to send timestamp requests and replies.
! 2317: */
! 2318: tp->t_flags |= TF_RCVD_TSTMP;
! 2319: tp->ts_recent = oi->ts_val;
! 2320: tp->ts_recent_age = tcp_now;
! 2321: break;
! 2322:
! 2323: #ifdef TCP_SACK
! 2324: case TCPOPT_SACK_PERMITTED:
! 2325: if (!tp->sack_enable || optlen!=TCPOLEN_SACK_PERMITTED)
! 2326: continue;
! 2327: if (!(th->th_flags & TH_SYN))
! 2328: continue;
! 2329: if (TCPS_HAVERCVDSYN(tp->t_state))
! 2330: continue;
! 2331: /* MUST only be set on SYN */
! 2332: tp->t_flags |= TF_SACK_PERMIT;
! 2333: break;
! 2334: case TCPOPT_SACK:
! 2335: tcp_sack_option(tp, th, cp, optlen);
! 2336: break;
! 2337: #endif
! 2338: #ifdef TCP_SIGNATURE
! 2339: case TCPOPT_SIGNATURE:
! 2340: if (optlen != TCPOLEN_SIGNATURE)
! 2341: continue;
! 2342:
! 2343: if (sigp && bcmp(sigp, cp + 2, 16))
! 2344: return (-1);
! 2345:
! 2346: sigp = cp + 2;
! 2347: break;
! 2348: #endif /* TCP_SIGNATURE */
! 2349: }
! 2350: }
! 2351:
! 2352: #ifdef TCP_SIGNATURE
! 2353: if (tp->t_flags & TF_SIGNATURE) {
! 2354: union sockaddr_union src, dst;
! 2355:
! 2356: memset(&src, 0, sizeof(union sockaddr_union));
! 2357: memset(&dst, 0, sizeof(union sockaddr_union));
! 2358:
! 2359: switch (tp->pf) {
! 2360: case 0:
! 2361: #ifdef INET
! 2362: case AF_INET:
! 2363: src.sa.sa_len = sizeof(struct sockaddr_in);
! 2364: src.sa.sa_family = AF_INET;
! 2365: src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
! 2366: dst.sa.sa_len = sizeof(struct sockaddr_in);
! 2367: dst.sa.sa_family = AF_INET;
! 2368: dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
! 2369: break;
! 2370: #endif
! 2371: #ifdef INET6
! 2372: case AF_INET6:
! 2373: src.sa.sa_len = sizeof(struct sockaddr_in6);
! 2374: src.sa.sa_family = AF_INET6;
! 2375: src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
! 2376: dst.sa.sa_len = sizeof(struct sockaddr_in6);
! 2377: dst.sa.sa_family = AF_INET6;
! 2378: dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
! 2379: break;
! 2380: #endif /* INET6 */
! 2381: }
! 2382:
! 2383: tdb = gettdbbysrcdst(0, &src, &dst, IPPROTO_TCP);
! 2384:
! 2385: /*
! 2386: * We don't have an SA for this peer, so we turn off
! 2387: * TF_SIGNATURE on the listen socket
! 2388: */
! 2389: if (tdb == NULL && tp->t_state == TCPS_LISTEN)
! 2390: tp->t_flags &= ~TF_SIGNATURE;
! 2391:
! 2392: }
! 2393:
! 2394: if ((sigp ? TF_SIGNATURE : 0) ^ (tp->t_flags & TF_SIGNATURE)) {
! 2395: tcpstat.tcps_rcvbadsig++;
! 2396: return (-1);
! 2397: }
! 2398:
! 2399: if (sigp) {
! 2400: char sig[16];
! 2401:
! 2402: if (tdb == NULL) {
! 2403: tcpstat.tcps_rcvbadsig++;
! 2404: return (-1);
! 2405: }
! 2406:
! 2407: if (tcp_signature(tdb, tp->pf, m, th, iphlen, 1, sig) < 0)
! 2408: return (-1);
! 2409:
! 2410: if (bcmp(sig, sigp, 16)) {
! 2411: tcpstat.tcps_rcvbadsig++;
! 2412: return (-1);
! 2413: }
! 2414:
! 2415: tcpstat.tcps_rcvgoodsig++;
! 2416: }
! 2417: #endif /* TCP_SIGNATURE */
! 2418:
! 2419: return (0);
! 2420: }
! 2421:
! 2422: #if defined(TCP_SACK)
/*
 * Return the signed distance a - b between two TCP sequence numbers,
 * delivered in a u_long (a "negative" distance comes back as the
 * corresponding two's-complement bit pattern).
 *
 * Sequence numbers in this file are 32-bit quantities (they are byte
 * swapped with NTOHL), so the difference has to be taken modulo 2^32.
 * Subtracting at full u_long width gives a wrong, huge value on
 * platforms where u_long is wider than 32 bits whenever the sequence
 * space has wrapped between b and a (e.g. a = 5, b = 0xfffffffb must
 * yield 10).  The difference is therefore reduced to 32 bits first and
 * then sign-extended to the width of u_long.
 */
u_long
tcp_seq_subtract(u_long a, u_long b)
{
	u_long diff = (a - b) & 0xffffffffUL;

	/* Sign-extend bit 31; a no-op when u_long is 32 bits wide. */
	if (diff & 0x80000000UL)
		diff |= ~0xffffffffUL;
	return (diff);
}
! 2429: #endif
! 2430:
! 2431:
! 2432: #ifdef TCP_SACK
! 2433: /*
! 2434: * This function is called upon receipt of new valid data (while not in header
! 2435: * prediction mode), and it updates the ordered list of sacks.
! 2436: */
! 2437: void
! 2438: tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart,
! 2439: tcp_seq rcv_lastend)
! 2440: {
! 2441: /*
! 2442: * First reported block MUST be the most recent one. Subsequent
! 2443: * blocks SHOULD be in the order in which they arrived at the
! 2444: * receiver. These two conditions make the implementation fully
! 2445: * compliant with RFC 2018.
! 2446: */
! 2447: int i, j = 0, count = 0, lastpos = -1;
! 2448: struct sackblk sack, firstsack, temp[MAX_SACK_BLKS];
! 2449:
! 2450: /* First clean up current list of sacks */
! 2451: for (i = 0; i < tp->rcv_numsacks; i++) {
! 2452: sack = tp->sackblks[i];
! 2453: if (sack.start == 0 && sack.end == 0) {
! 2454: count++; /* count = number of blocks to be discarded */
! 2455: continue;
! 2456: }
! 2457: if (SEQ_LEQ(sack.end, tp->rcv_nxt)) {
! 2458: tp->sackblks[i].start = tp->sackblks[i].end = 0;
! 2459: count++;
! 2460: } else {
! 2461: temp[j].start = tp->sackblks[i].start;
! 2462: temp[j++].end = tp->sackblks[i].end;
! 2463: }
! 2464: }
! 2465: tp->rcv_numsacks -= count;
! 2466: if (tp->rcv_numsacks == 0) { /* no sack blocks currently (fast path) */
! 2467: tcp_clean_sackreport(tp);
! 2468: if (SEQ_LT(tp->rcv_nxt, rcv_laststart)) {
! 2469: /* ==> need first sack block */
! 2470: tp->sackblks[0].start = rcv_laststart;
! 2471: tp->sackblks[0].end = rcv_lastend;
! 2472: tp->rcv_numsacks = 1;
! 2473: }
! 2474: return;
! 2475: }
! 2476: /* Otherwise, sack blocks are already present. */
! 2477: for (i = 0; i < tp->rcv_numsacks; i++)
! 2478: tp->sackblks[i] = temp[i]; /* first copy back sack list */
! 2479: if (SEQ_GEQ(tp->rcv_nxt, rcv_lastend))
! 2480: return; /* sack list remains unchanged */
! 2481: /*
! 2482: * From here, segment just received should be (part of) the 1st sack.
! 2483: * Go through list, possibly coalescing sack block entries.
! 2484: */
! 2485: firstsack.start = rcv_laststart;
! 2486: firstsack.end = rcv_lastend;
! 2487: for (i = 0; i < tp->rcv_numsacks; i++) {
! 2488: sack = tp->sackblks[i];
! 2489: if (SEQ_LT(sack.end, firstsack.start) ||
! 2490: SEQ_GT(sack.start, firstsack.end))
! 2491: continue; /* no overlap */
! 2492: if (sack.start == firstsack.start && sack.end == firstsack.end){
! 2493: /*
! 2494: * identical block; delete it here since we will
! 2495: * move it to the front of the list.
! 2496: */
! 2497: tp->sackblks[i].start = tp->sackblks[i].end = 0;
! 2498: lastpos = i; /* last posn with a zero entry */
! 2499: continue;
! 2500: }
! 2501: if (SEQ_LEQ(sack.start, firstsack.start))
! 2502: firstsack.start = sack.start; /* merge blocks */
! 2503: if (SEQ_GEQ(sack.end, firstsack.end))
! 2504: firstsack.end = sack.end; /* merge blocks */
! 2505: tp->sackblks[i].start = tp->sackblks[i].end = 0;
! 2506: lastpos = i; /* last posn with a zero entry */
! 2507: }
! 2508: if (lastpos != -1) { /* at least one merge */
! 2509: for (i = 0, j = 1; i < tp->rcv_numsacks; i++) {
! 2510: sack = tp->sackblks[i];
! 2511: if (sack.start == 0 && sack.end == 0)
! 2512: continue;
! 2513: temp[j++] = sack;
! 2514: }
! 2515: tp->rcv_numsacks = j; /* including first blk (added later) */
! 2516: for (i = 1; i < tp->rcv_numsacks; i++) /* now copy back */
! 2517: tp->sackblks[i] = temp[i];
! 2518: } else { /* no merges -- shift sacks by 1 */
! 2519: if (tp->rcv_numsacks < MAX_SACK_BLKS)
! 2520: tp->rcv_numsacks++;
! 2521: for (i = tp->rcv_numsacks-1; i > 0; i--)
! 2522: tp->sackblks[i] = tp->sackblks[i-1];
! 2523: }
! 2524: tp->sackblks[0] = firstsack;
! 2525: return;
! 2526: }
! 2527:
! 2528: /*
! 2529: * Process the TCP SACK option. tp->snd_holes is an ordered list
! 2530: * of holes (oldest to newest, in terms of the sequence space).
! 2531: */
! 2532: void
! 2533: tcp_sack_option(struct tcpcb *tp, struct tcphdr *th, u_char *cp, int optlen)
! 2534: {
! 2535: int tmp_olen;
! 2536: u_char *tmp_cp;
! 2537: struct sackhole *cur, *p, *temp;
! 2538:
! 2539: if (!tp->sack_enable)
! 2540: return;
! 2541: /* SACK without ACK doesn't make sense. */
! 2542: if ((th->th_flags & TH_ACK) == 0)
! 2543: return;
! 2544: /* Make sure the ACK on this segment is in [snd_una, snd_max]. */
! 2545: if (SEQ_LT(th->th_ack, tp->snd_una) ||
! 2546: SEQ_GT(th->th_ack, tp->snd_max))
! 2547: return;
! 2548: /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */
! 2549: if (optlen <= 2 || (optlen - 2) % TCPOLEN_SACK != 0)
! 2550: return;
! 2551: /* Note: TCPOLEN_SACK must be 2*sizeof(tcp_seq) */
! 2552: tmp_cp = cp + 2;
! 2553: tmp_olen = optlen - 2;
! 2554: tcpstat.tcps_sack_rcv_opts++;
! 2555: if (tp->snd_numholes < 0)
! 2556: tp->snd_numholes = 0;
! 2557: if (tp->t_maxseg == 0)
! 2558: panic("tcp_sack_option"); /* Should never happen */
! 2559: while (tmp_olen > 0) {
! 2560: struct sackblk sack;
! 2561:
! 2562: bcopy(tmp_cp, (char *) &(sack.start), sizeof(tcp_seq));
! 2563: NTOHL(sack.start);
! 2564: bcopy(tmp_cp + sizeof(tcp_seq),
! 2565: (char *) &(sack.end), sizeof(tcp_seq));
! 2566: NTOHL(sack.end);
! 2567: tmp_olen -= TCPOLEN_SACK;
! 2568: tmp_cp += TCPOLEN_SACK;
! 2569: if (SEQ_LEQ(sack.end, sack.start))
! 2570: continue; /* bad SACK fields */
! 2571: if (SEQ_LEQ(sack.end, tp->snd_una))
! 2572: continue; /* old block */
! 2573: #if defined(TCP_SACK) && defined(TCP_FACK)
! 2574: /* Updates snd_fack. */
! 2575: if (SEQ_GT(sack.end, tp->snd_fack))
! 2576: tp->snd_fack = sack.end;
! 2577: #endif /* TCP_FACK */
! 2578: if (SEQ_GT(th->th_ack, tp->snd_una)) {
! 2579: if (SEQ_LT(sack.start, th->th_ack))
! 2580: continue;
! 2581: }
! 2582: if (SEQ_GT(sack.end, tp->snd_max))
! 2583: continue;
! 2584: if (tp->snd_holes == NULL) { /* first hole */
! 2585: tp->snd_holes = (struct sackhole *)
! 2586: pool_get(&sackhl_pool, PR_NOWAIT);
! 2587: if (tp->snd_holes == NULL) {
! 2588: /* ENOBUFS, so ignore SACKed block for now*/
! 2589: goto done;
! 2590: }
! 2591: cur = tp->snd_holes;
! 2592: cur->start = th->th_ack;
! 2593: cur->end = sack.start;
! 2594: cur->rxmit = cur->start;
! 2595: cur->next = NULL;
! 2596: tp->snd_numholes = 1;
! 2597: tp->rcv_lastsack = sack.end;
! 2598: /*
! 2599: * dups is at least one. If more data has been
! 2600: * SACKed, it can be greater than one.
! 2601: */
! 2602: cur->dups = min(tcprexmtthresh,
! 2603: ((sack.end - cur->end)/tp->t_maxseg));
! 2604: if (cur->dups < 1)
! 2605: cur->dups = 1;
! 2606: continue; /* with next sack block */
! 2607: }
! 2608: /* Go thru list of holes: p = previous, cur = current */
! 2609: p = cur = tp->snd_holes;
! 2610: while (cur) {
! 2611: if (SEQ_LEQ(sack.end, cur->start))
! 2612: /* SACKs data before the current hole */
! 2613: break; /* no use going through more holes */
! 2614: if (SEQ_GEQ(sack.start, cur->end)) {
! 2615: /* SACKs data beyond the current hole */
! 2616: cur->dups++;
! 2617: if (((sack.end - cur->end)/tp->t_maxseg) >=
! 2618: tcprexmtthresh)
! 2619: cur->dups = tcprexmtthresh;
! 2620: p = cur;
! 2621: cur = cur->next;
! 2622: continue;
! 2623: }
! 2624: if (SEQ_LEQ(sack.start, cur->start)) {
! 2625: /* Data acks at least the beginning of hole */
! 2626: #if defined(TCP_SACK) && defined(TCP_FACK)
! 2627: if (SEQ_GT(sack.end, cur->rxmit))
! 2628: tp->retran_data -=
! 2629: tcp_seq_subtract(cur->rxmit,
! 2630: cur->start);
! 2631: else
! 2632: tp->retran_data -=
! 2633: tcp_seq_subtract(sack.end,
! 2634: cur->start);
! 2635: #endif /* TCP_FACK */
! 2636: if (SEQ_GEQ(sack.end, cur->end)) {
! 2637: /* Acks entire hole, so delete hole */
! 2638: if (p != cur) {
! 2639: p->next = cur->next;
! 2640: pool_put(&sackhl_pool, cur);
! 2641: cur = p->next;
! 2642: } else {
! 2643: cur = cur->next;
! 2644: pool_put(&sackhl_pool, p);
! 2645: p = cur;
! 2646: tp->snd_holes = p;
! 2647: }
! 2648: tp->snd_numholes--;
! 2649: continue;
! 2650: }
! 2651: /* otherwise, move start of hole forward */
! 2652: cur->start = sack.end;
! 2653: cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
! 2654: p = cur;
! 2655: cur = cur->next;
! 2656: continue;
! 2657: }
! 2658: /* move end of hole backward */
! 2659: if (SEQ_GEQ(sack.end, cur->end)) {
! 2660: #if defined(TCP_SACK) && defined(TCP_FACK)
! 2661: if (SEQ_GT(cur->rxmit, sack.start))
! 2662: tp->retran_data -=
! 2663: tcp_seq_subtract(cur->rxmit,
! 2664: sack.start);
! 2665: #endif /* TCP_FACK */
! 2666: cur->end = sack.start;
! 2667: cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
! 2668: cur->dups++;
! 2669: if (((sack.end - cur->end)/tp->t_maxseg) >=
! 2670: tcprexmtthresh)
! 2671: cur->dups = tcprexmtthresh;
! 2672: p = cur;
! 2673: cur = cur->next;
! 2674: continue;
! 2675: }
! 2676: if (SEQ_LT(cur->start, sack.start) &&
! 2677: SEQ_GT(cur->end, sack.end)) {
! 2678: /*
! 2679: * ACKs some data in middle of a hole; need to
! 2680: * split current hole
! 2681: */
! 2682: temp = (struct sackhole *)
! 2683: pool_get(&sackhl_pool, PR_NOWAIT);
! 2684: if (temp == NULL)
! 2685: goto done; /* ENOBUFS */
! 2686: #if defined(TCP_SACK) && defined(TCP_FACK)
! 2687: if (SEQ_GT(cur->rxmit, sack.end))
! 2688: tp->retran_data -=
! 2689: tcp_seq_subtract(sack.end,
! 2690: sack.start);
! 2691: else if (SEQ_GT(cur->rxmit, sack.start))
! 2692: tp->retran_data -=
! 2693: tcp_seq_subtract(cur->rxmit,
! 2694: sack.start);
! 2695: #endif /* TCP_FACK */
! 2696: temp->next = cur->next;
! 2697: temp->start = sack.end;
! 2698: temp->end = cur->end;
! 2699: temp->dups = cur->dups;
! 2700: temp->rxmit = SEQ_MAX(cur->rxmit, temp->start);
! 2701: cur->end = sack.start;
! 2702: cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
! 2703: cur->dups++;
! 2704: if (((sack.end - cur->end)/tp->t_maxseg) >=
! 2705: tcprexmtthresh)
! 2706: cur->dups = tcprexmtthresh;
! 2707: cur->next = temp;
! 2708: p = temp;
! 2709: cur = p->next;
! 2710: tp->snd_numholes++;
! 2711: }
! 2712: }
! 2713: /* At this point, p points to the last hole on the list */
! 2714: if (SEQ_LT(tp->rcv_lastsack, sack.start)) {
! 2715: /*
! 2716: * Need to append new hole at end.
! 2717: * Last hole is p (and it's not NULL).
! 2718: */
! 2719: temp = (struct sackhole *)
! 2720: pool_get(&sackhl_pool, PR_NOWAIT);
! 2721: if (temp == NULL)
! 2722: goto done; /* ENOBUFS */
! 2723: temp->start = tp->rcv_lastsack;
! 2724: temp->end = sack.start;
! 2725: temp->dups = min(tcprexmtthresh,
! 2726: ((sack.end - sack.start)/tp->t_maxseg));
! 2727: if (temp->dups < 1)
! 2728: temp->dups = 1;
! 2729: temp->rxmit = temp->start;
! 2730: temp->next = 0;
! 2731: p->next = temp;
! 2732: tp->rcv_lastsack = sack.end;
! 2733: tp->snd_numholes++;
! 2734: }
! 2735: }
! 2736: done:
! 2737: #if defined(TCP_SACK) && defined(TCP_FACK)
! 2738: /*
! 2739: * Update retran_data and snd_awnd. Go through the list of
! 2740: * holes. Increment retran_data by (hole->rxmit - hole->start).
! 2741: */
! 2742: tp->retran_data = 0;
! 2743: cur = tp->snd_holes;
! 2744: while (cur) {
! 2745: tp->retran_data += cur->rxmit - cur->start;
! 2746: cur = cur->next;
! 2747: }
! 2748: tp->snd_awnd = tcp_seq_subtract(tp->snd_nxt, tp->snd_fack) +
! 2749: tp->retran_data;
! 2750: #endif /* TCP_FACK */
! 2751:
! 2752: return;
! 2753: }
! 2754:
! 2755: /*
! 2756: * Delete stale (i.e, cumulatively ack'd) holes. Hole is deleted only if
! 2757: * it is completely acked; otherwise, tcp_sack_option(), called from
! 2758: * tcp_dooptions(), will fix up the hole.
! 2759: */
! 2760: void
! 2761: tcp_del_sackholes(tp, th)
! 2762: struct tcpcb *tp;
! 2763: struct tcphdr *th;
! 2764: {
! 2765: if (tp->sack_enable && tp->t_state != TCPS_LISTEN) {
! 2766: /* max because this could be an older ack just arrived */
! 2767: tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ?
! 2768: th->th_ack : tp->snd_una;
! 2769: struct sackhole *cur = tp->snd_holes;
! 2770: struct sackhole *prev;
! 2771: while (cur)
! 2772: if (SEQ_LEQ(cur->end, lastack)) {
! 2773: prev = cur;
! 2774: cur = cur->next;
! 2775: pool_put(&sackhl_pool, prev);
! 2776: tp->snd_numholes--;
! 2777: } else if (SEQ_LT(cur->start, lastack)) {
! 2778: cur->start = lastack;
! 2779: if (SEQ_LT(cur->rxmit, cur->start))
! 2780: cur->rxmit = cur->start;
! 2781: break;
! 2782: } else
! 2783: break;
! 2784: tp->snd_holes = cur;
! 2785: }
! 2786: }
! 2787:
! 2788: /*
! 2789: * Delete all receiver-side SACK information.
! 2790: */
! 2791: void
! 2792: tcp_clean_sackreport(tp)
! 2793: struct tcpcb *tp;
! 2794: {
! 2795: int i;
! 2796:
! 2797: tp->rcv_numsacks = 0;
! 2798: for (i = 0; i < MAX_SACK_BLKS; i++)
! 2799: tp->sackblks[i].start = tp->sackblks[i].end=0;
! 2800:
! 2801: }
! 2802:
! 2803: /*
! 2804: * Checks for partial ack. If partial ack arrives, turn off retransmission
! 2805: * timer, deflate the window, do not clear tp->t_dupacks, and return 1.
! 2806: * If the ack advances at least to tp->snd_last, return 0.
! 2807: */
! 2808: int
! 2809: tcp_sack_partialack(tp, th)
! 2810: struct tcpcb *tp;
! 2811: struct tcphdr *th;
! 2812: {
! 2813: if (SEQ_LT(th->th_ack, tp->snd_last)) {
! 2814: /* Turn off retx. timer (will start again next segment) */
! 2815: TCP_TIMER_DISARM(tp, TCPT_REXMT);
! 2816: tp->t_rtttime = 0;
! 2817: #ifndef TCP_FACK
! 2818: /*
! 2819: * Partial window deflation. This statement relies on the
! 2820: * fact that tp->snd_una has not been updated yet. In FACK
! 2821: * hold snd_cwnd constant during fast recovery.
! 2822: */
! 2823: if (tp->snd_cwnd > (th->th_ack - tp->snd_una)) {
! 2824: tp->snd_cwnd -= th->th_ack - tp->snd_una;
! 2825: tp->snd_cwnd += tp->t_maxseg;
! 2826: } else
! 2827: tp->snd_cwnd = tp->t_maxseg;
! 2828: #endif
! 2829: return (1);
! 2830: }
! 2831: return (0);
! 2832: }
! 2833: #endif /* TCP_SACK */
! 2834:
! 2835: /*
! 2836: * Pull out of band byte out of a segment so
! 2837: * it doesn't appear in the user's data queue.
! 2838: * It is still reflected in the segment length for
! 2839: * sequencing purposes.
! 2840: */
! 2841: void
! 2842: tcp_pulloutofband(so, urgent, m, off)
! 2843: struct socket *so;
! 2844: u_int urgent;
! 2845: struct mbuf *m;
! 2846: int off;
! 2847: {
! 2848: int cnt = off + urgent - 1;
! 2849:
! 2850: while (cnt >= 0) {
! 2851: if (m->m_len > cnt) {
! 2852: char *cp = mtod(m, caddr_t) + cnt;
! 2853: struct tcpcb *tp = sototcpcb(so);
! 2854:
! 2855: tp->t_iobc = *cp;
! 2856: tp->t_oobflags |= TCPOOB_HAVEDATA;
! 2857: bcopy(cp+1, cp, (unsigned)(m->m_len - cnt - 1));
! 2858: m->m_len--;
! 2859: return;
! 2860: }
! 2861: cnt -= m->m_len;
! 2862: m = m->m_next;
! 2863: if (m == 0)
! 2864: break;
! 2865: }
! 2866: panic("tcp_pulloutofband");
! 2867: }
! 2868:
! 2869: /*
! 2870: * Collect new round-trip time estimate
! 2871: * and update averages and current timeout.
! 2872: */
! 2873: void
! 2874: tcp_xmit_timer(tp, rtt)
! 2875: struct tcpcb *tp;
! 2876: short rtt;
! 2877: {
! 2878: short delta;
! 2879: short rttmin;
! 2880:
! 2881: if (rtt < 0)
! 2882: rtt = 0;
! 2883: else if (rtt > TCP_RTT_MAX)
! 2884: rtt = TCP_RTT_MAX;
! 2885:
! 2886: tcpstat.tcps_rttupdated++;
! 2887: if (tp->t_srtt != 0) {
! 2888: /*
! 2889: * delta is fixed point with 2 (TCP_RTT_BASE_SHIFT) bits
! 2890: * after the binary point (scaled by 4), whereas
! 2891: * srtt is stored as fixed point with 5 bits after the
! 2892: * binary point (i.e., scaled by 32). The following magic
! 2893: * is equivalent to the smoothing algorithm in rfc793 with
! 2894: * an alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed
! 2895: * point).
! 2896: */
! 2897: delta = (rtt << TCP_RTT_BASE_SHIFT) -
! 2898: (tp->t_srtt >> TCP_RTT_SHIFT);
! 2899: if ((tp->t_srtt += delta) <= 0)
! 2900: tp->t_srtt = 1 << TCP_RTT_BASE_SHIFT;
! 2901: /*
! 2902: * We accumulate a smoothed rtt variance (actually, a
! 2903: * smoothed mean difference), then set the retransmit
! 2904: * timer to smoothed rtt + 4 times the smoothed variance.
! 2905: * rttvar is stored as fixed point with 4 bits after the
! 2906: * binary point (scaled by 16). The following is
! 2907: * equivalent to rfc793 smoothing with an alpha of .75
! 2908: * (rttvar = rttvar*3/4 + |delta| / 4). This replaces
! 2909: * rfc793's wired-in beta.
! 2910: */
! 2911: if (delta < 0)
! 2912: delta = -delta;
! 2913: delta -= (tp->t_rttvar >> TCP_RTTVAR_SHIFT);
! 2914: if ((tp->t_rttvar += delta) <= 0)
! 2915: tp->t_rttvar = 1 << TCP_RTT_BASE_SHIFT;
! 2916: } else {
! 2917: /*
! 2918: * No rtt measurement yet - use the unsmoothed rtt.
! 2919: * Set the variance to half the rtt (so our first
! 2920: * retransmit happens at 3*rtt).
! 2921: */
! 2922: tp->t_srtt = (rtt + 1) << (TCP_RTT_SHIFT + TCP_RTT_BASE_SHIFT);
! 2923: tp->t_rttvar = (rtt + 1) <<
! 2924: (TCP_RTTVAR_SHIFT + TCP_RTT_BASE_SHIFT - 1);
! 2925: }
! 2926: tp->t_rtttime = 0;
! 2927: tp->t_rxtshift = 0;
! 2928:
! 2929: /*
! 2930: * the retransmit should happen at rtt + 4 * rttvar.
! 2931: * Because of the way we do the smoothing, srtt and rttvar
! 2932: * will each average +1/2 tick of bias. When we compute
! 2933: * the retransmit timer, we want 1/2 tick of rounding and
! 2934: * 1 extra tick because of +-1/2 tick uncertainty in the
! 2935: * firing of the timer. The bias will give us exactly the
! 2936: * 1.5 tick we need. But, because the bias is
! 2937: * statistical, we have to test that we don't drop below
! 2938: * the minimum feasible timer (which is 2 ticks).
! 2939: */
! 2940: rttmin = min(max(rtt + 2, tp->t_rttmin), TCPTV_REXMTMAX);
! 2941: TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), rttmin, TCPTV_REXMTMAX);
! 2942:
! 2943: /*
! 2944: * We received an ack for a packet that wasn't retransmitted;
! 2945: * it is probably safe to discard any error indications we've
! 2946: * received recently. This isn't quite right, but close enough
! 2947: * for now (a route might have failed after we sent a segment,
! 2948: * and the return path might not be symmetrical).
! 2949: */
! 2950: tp->t_softerror = 0;
! 2951: }
! 2952:
! 2953: /*
! 2954: * Determine a reasonable value for maxseg size.
! 2955: * If the route is known, check route for mtu.
! 2956: * If none, use an mss that can be handled on the outgoing
! 2957: * interface without forcing IP to fragment; if bigger than
! 2958: * an mbuf cluster (MCLBYTES), round down to nearest multiple of MCLBYTES
! 2959: * to utilize large mbufs. If no route is found, route has no mtu,
! 2960: * or the destination isn't local, use a default, hopefully conservative
! 2961: * size (usually 512 or the default IP max size, but no more than the mtu
! 2962: * of the interface), as we can't discover anything about intervening
! 2963: * gateways or networks. We also initialize the congestion/slow start
! 2964: * window to be a single segment if the destination isn't local.
! 2965: * While looking at the routing entry, we also initialize other path-dependent
! 2966: * parameters from pre-set or cached values in the routing entry.
! 2967: *
! 2968: * Also take into account the space needed for options that we
! 2969: * send regularly. Make maxseg shorter by that amount to assure
! 2970: * that we can send maxseg amount of data even when the options
! 2971: * are present. Store the upper limit of the length of options plus
! 2972: * data in maxopd.
! 2973: *
! 2974: * NOTE: offer == -1 indicates that the maxseg size changed due to
! 2975: * Path MTU discovery.
! 2976: */
! 2977: int
! 2978: tcp_mss(tp, offer)
! 2979: struct tcpcb *tp;
! 2980: int offer;
! 2981: {
! 2982: struct rtentry *rt;
! 2983: struct ifnet *ifp;
! 2984: int mss, mssopt;
! 2985: int iphlen;
! 2986: struct inpcb *inp;
! 2987:
! 2988: inp = tp->t_inpcb;
! 2989:
! 2990: mssopt = mss = tcp_mssdflt;
! 2991:
! 2992: rt = in_pcbrtentry(inp);
! 2993:
! 2994: if (rt == NULL)
! 2995: goto out;
! 2996:
! 2997: ifp = rt->rt_ifp;
! 2998:
! 2999: switch (tp->pf) {
! 3000: #ifdef INET6
! 3001: case AF_INET6:
! 3002: iphlen = sizeof(struct ip6_hdr);
! 3003: break;
! 3004: #endif
! 3005: case AF_INET:
! 3006: iphlen = sizeof(struct ip);
! 3007: break;
! 3008: default:
! 3009: /* the family does not support path MTU discovery */
! 3010: goto out;
! 3011: }
! 3012:
! 3013: #ifdef RTV_MTU
! 3014: /*
! 3015: * if there's an mtu associated with the route and we support
! 3016: * path MTU discovery for the underlying protocol family, use it.
! 3017: */
! 3018: if (rt->rt_rmx.rmx_mtu) {
! 3019: /*
! 3020: * One may wish to lower MSS to take into account options,
! 3021: * especially security-related options.
! 3022: */
! 3023: if (tp->pf == AF_INET6 && rt->rt_rmx.rmx_mtu < IPV6_MMTU) {
! 3024: /*
! 3025: * RFC2460 section 5, last paragraph: if path MTU is
! 3026: * smaller than 1280, use 1280 as packet size and
! 3027: * attach fragment header.
! 3028: */
! 3029: mss = IPV6_MMTU - iphlen - sizeof(struct ip6_frag) -
! 3030: sizeof(struct tcphdr);
! 3031: } else
! 3032: mss = rt->rt_rmx.rmx_mtu - iphlen - sizeof(struct tcphdr);
! 3033: } else
! 3034: #endif /* RTV_MTU */
! 3035: if (!ifp)
! 3036: /*
! 3037: * ifp may be null and rmx_mtu may be zero in certain
! 3038: * v6 cases (e.g., if ND wasn't able to resolve the
! 3039: * destination host.
! 3040: */
! 3041: goto out;
! 3042: else if (ifp->if_flags & IFF_LOOPBACK)
! 3043: mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
! 3044: else if (tp->pf == AF_INET) {
! 3045: if (ip_mtudisc)
! 3046: mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
! 3047: else if (inp && in_localaddr(inp->inp_faddr))
! 3048: mss = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
! 3049: }
! 3050: #ifdef INET6
! 3051: else if (tp->pf == AF_INET6) {
! 3052: /*
! 3053: * for IPv6, path MTU discovery is always turned on,
! 3054: * or the node must use packet size <= 1280.
! 3055: */
! 3056: mss = IN6_LINKMTU(ifp) - iphlen - sizeof(struct tcphdr);
! 3057: }
! 3058: #endif /* INET6 */
! 3059:
! 3060: /* Calculate the value that we offer in TCPOPT_MAXSEG */
! 3061: if (offer != -1) {
! 3062: #ifndef INET6
! 3063: mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
! 3064: #else
! 3065: if (tp->pf == AF_INET6)
! 3066: mssopt = IN6_LINKMTU(ifp) - iphlen -
! 3067: sizeof(struct tcphdr);
! 3068: else
! 3069: mssopt = ifp->if_mtu - iphlen - sizeof(struct tcphdr);
! 3070: #endif
! 3071:
! 3072: mssopt = max(tcp_mssdflt, mssopt);
! 3073: }
! 3074:
! 3075: out:
! 3076: /*
! 3077: * The current mss, t_maxseg, is initialized to the default value.
! 3078: * If we compute a smaller value, reduce the current mss.
! 3079: * If we compute a larger value, return it for use in sending
! 3080: * a max seg size option, but don't store it for use
! 3081: * unless we received an offer at least that large from peer.
! 3082: *
! 3083: * However, do not accept offers lower than the minimum of
! 3084: * the interface MTU and 216.
! 3085: */
! 3086: if (offer > 0)
! 3087: tp->t_peermss = offer;
! 3088: if (tp->t_peermss)
! 3089: mss = min(mss, max(tp->t_peermss, 216));
! 3090:
! 3091: /* sanity - at least max opt. space */
! 3092: mss = max(mss, 64);
! 3093:
! 3094: /*
! 3095: * maxopd stores the maximum length of data AND options
! 3096: * in a segment; maxseg is the amount of data in a normal
! 3097: * segment. We need to store this value (maxopd) apart
! 3098: * from maxseg, because now every segment carries options
! 3099: * and thus we normally have somewhat less data in segments.
! 3100: */
! 3101: tp->t_maxopd = mss;
! 3102:
! 3103: if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
! 3104: (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
! 3105: mss -= TCPOLEN_TSTAMP_APPA;
! 3106: #ifdef TCP_SIGNATURE
! 3107: if (tp->t_flags & TF_SIGNATURE)
! 3108: mss -= TCPOLEN_SIGLEN;
! 3109: #endif
! 3110:
! 3111: if (offer == -1) {
! 3112: /* mss changed due to Path MTU discovery */
! 3113: tp->t_flags &= ~TF_PMTUD_PEND;
! 3114: tp->t_pmtud_mtu_sent = 0;
! 3115: tp->t_pmtud_mss_acked = 0;
! 3116: if (mss < tp->t_maxseg) {
! 3117: /*
! 3118: * Follow suggestion in RFC 2414 to reduce the
! 3119: * congestion window by the ratio of the old
! 3120: * segment size to the new segment size.
! 3121: */
! 3122: tp->snd_cwnd = ulmax((tp->snd_cwnd / tp->t_maxseg) *
! 3123: mss, mss);
! 3124: }
! 3125: } else if (tcp_do_rfc3390) {
! 3126: /* increase initial window */
! 3127: tp->snd_cwnd = ulmin(4 * mss, ulmax(2 * mss, 4380));
! 3128: } else
! 3129: tp->snd_cwnd = mss;
! 3130:
! 3131: tp->t_maxseg = mss;
! 3132:
! 3133: return (offer != -1 ? mssopt : mss);
! 3134: }
! 3135:
! 3136: u_int
! 3137: tcp_hdrsz(struct tcpcb *tp)
! 3138: {
! 3139: u_int hlen;
! 3140:
! 3141: switch (tp->pf) {
! 3142: #ifdef INET6
! 3143: case AF_INET6:
! 3144: hlen = sizeof(struct ip6_hdr);
! 3145: break;
! 3146: #endif
! 3147: case AF_INET:
! 3148: hlen = sizeof(struct ip);
! 3149: break;
! 3150: default:
! 3151: hlen = 0;
! 3152: break;
! 3153: }
! 3154: hlen += sizeof(struct tcphdr);
! 3155:
! 3156: if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
! 3157: (tp->t_flags & TF_RCVD_TSTMP) == TF_RCVD_TSTMP)
! 3158: hlen += TCPOLEN_TSTAMP_APPA;
! 3159: #ifdef TCP_SIGNATURE
! 3160: if (tp->t_flags & TF_SIGNATURE)
! 3161: hlen += TCPOLEN_SIGLEN;
! 3162: #endif
! 3163: return (hlen);
! 3164: }
! 3165:
! 3166: /*
! 3167: * Set connection variables based on the effective MSS.
! 3168: * We are passed the TCPCB for the actual connection. If we
! 3169: * are the server, we are called by the compressed state engine
! 3170: * when the 3-way handshake is complete. If we are the client,
! 3171: * we are called when we receive the SYN,ACK from the server.
! 3172: *
! 3173: * NOTE: The t_maxseg value must be initialized in the TCPCB
! 3174: * before this routine is called!
! 3175: */
! 3176: void
! 3177: tcp_mss_update(tp)
! 3178: struct tcpcb *tp;
! 3179: {
! 3180: int mss;
! 3181: u_long bufsize;
! 3182: struct rtentry *rt;
! 3183: struct socket *so;
! 3184:
! 3185: so = tp->t_inpcb->inp_socket;
! 3186: mss = tp->t_maxseg;
! 3187:
! 3188: rt = in_pcbrtentry(tp->t_inpcb);
! 3189:
! 3190: if (rt == NULL)
! 3191: return;
! 3192:
! 3193: bufsize = so->so_snd.sb_hiwat;
! 3194: if (bufsize < mss) {
! 3195: mss = bufsize;
! 3196: /* Update t_maxseg and t_maxopd */
! 3197: tcp_mss(tp, mss);
! 3198: } else {
! 3199: bufsize = roundup(bufsize, mss);
! 3200: if (bufsize > sb_max)
! 3201: bufsize = sb_max;
! 3202: (void)sbreserve(&so->so_snd, bufsize);
! 3203: }
! 3204:
! 3205: bufsize = so->so_rcv.sb_hiwat;
! 3206: if (bufsize > mss) {
! 3207: bufsize = roundup(bufsize, mss);
! 3208: if (bufsize > sb_max)
! 3209: bufsize = sb_max;
! 3210: (void)sbreserve(&so->so_rcv, bufsize);
! 3211: }
! 3212:
! 3213: }
! 3214:
#if defined (TCP_SACK)
/*
 * Checks for partial ack.  If a partial ack arrives (the ack is below
 * tp->snd_last), force the retransmission of the next unacknowledged
 * segment, do not clear tp->t_dupacks, and return 1.  Setting snd_nxt
 * to th_ack forces the retransmission timer to be started again.
 * If the ack advances at least to tp->snd_last, return 0.
 */
int
tcp_newreno(struct tcpcb *tp, struct tcphdr *th)
{
	tcp_seq saved_nxt;
	u_long saved_cwnd;

	if (!SEQ_LT(th->th_ack, tp->snd_last))
		return 0;

	/*
	 * snd_una has not been updated and the socket send buffer
	 * not yet drained of the acked data, so we have to leave
	 * snd_una as it was to get the correct data offset in
	 * tcp_output().
	 */
	saved_nxt = tp->snd_nxt;
	saved_cwnd = tp->snd_cwnd;
	TCP_TIMER_DISARM(tp, TCPT_REXMT);
	tp->t_rtttime = 0;
	tp->snd_nxt = th->th_ack;

	/*
	 * Temporarily open the window to exactly one segment beyond the
	 * acknowledged offset (tp->snd_una is not yet updated when this
	 * function is called), send, then put the window back.
	 */
	tp->snd_cwnd = tp->t_maxseg + (th->th_ack - tp->snd_una);
	(void) tcp_output(tp);
	tp->snd_cwnd = saved_cwnd;
	if (SEQ_GT(saved_nxt, tp->snd_nxt))
		tp->snd_nxt = saved_nxt;

	/*
	 * Partial window deflation.  Relies on the fact that tp->snd_una
	 * is not updated yet.
	 */
	if (tp->snd_cwnd > th->th_ack - tp->snd_una)
		tp->snd_cwnd -= th->th_ack - tp->snd_una;
	else
		tp->snd_cwnd = 0;
	tp->snd_cwnd += tp->t_maxseg;

	return 1;
}
#endif /* TCP_SACK */
! 3263:
! 3264: int
! 3265: tcp_mss_adv(struct ifnet *ifp, int af)
! 3266: {
! 3267: int mss = 0;
! 3268: int iphlen;
! 3269:
! 3270: switch (af) {
! 3271: case AF_INET:
! 3272: if (ifp != NULL)
! 3273: mss = ifp->if_mtu;
! 3274: iphlen = sizeof(struct ip);
! 3275: break;
! 3276: #ifdef INET6
! 3277: case AF_INET6:
! 3278: if (ifp != NULL)
! 3279: mss = IN6_LINKMTU(ifp);
! 3280: iphlen = sizeof(struct ip6_hdr);
! 3281: break;
! 3282: #endif
! 3283: }
! 3284: mss = mss - iphlen - sizeof(struct tcphdr);
! 3285: return (max(mss, tcp_mssdflt));
! 3286: }
! 3287:
! 3288: /*
! 3289: * TCP compressed state engine. Currently used to hold compressed
! 3290: * state for SYN_RECEIVED.
! 3291: */
! 3292:
! 3293: u_long syn_cache_count;
! 3294: u_int32_t syn_hash1, syn_hash2;
! 3295:
! 3296: #define SYN_HASH(sa, sp, dp) \
! 3297: ((((sa)->s_addr^syn_hash1)*(((((u_int32_t)(dp))<<16) + \
! 3298: ((u_int32_t)(sp)))^syn_hash2)))
! 3299: #ifndef INET6
! 3300: #define SYN_HASHALL(hash, src, dst) \
! 3301: do { \
! 3302: hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \
! 3303: ((struct sockaddr_in *)(src))->sin_port, \
! 3304: ((struct sockaddr_in *)(dst))->sin_port); \
! 3305: } while (/*CONSTCOND*/ 0)
! 3306: #else
! 3307: #define SYN_HASH6(sa, sp, dp) \
! 3308: ((((sa)->s6_addr32[0] ^ (sa)->s6_addr32[3] ^ syn_hash1) * \
! 3309: (((((u_int32_t)(dp))<<16) + ((u_int32_t)(sp)))^syn_hash2)) \
! 3310: & 0x7fffffff)
! 3311:
! 3312: #define SYN_HASHALL(hash, src, dst) \
! 3313: do { \
! 3314: switch ((src)->sa_family) { \
! 3315: case AF_INET: \
! 3316: hash = SYN_HASH(&((struct sockaddr_in *)(src))->sin_addr, \
! 3317: ((struct sockaddr_in *)(src))->sin_port, \
! 3318: ((struct sockaddr_in *)(dst))->sin_port); \
! 3319: break; \
! 3320: case AF_INET6: \
! 3321: hash = SYN_HASH6(&((struct sockaddr_in6 *)(src))->sin6_addr, \
! 3322: ((struct sockaddr_in6 *)(src))->sin6_port, \
! 3323: ((struct sockaddr_in6 *)(dst))->sin6_port); \
! 3324: break; \
! 3325: default: \
! 3326: hash = 0; \
! 3327: } \
! 3328: } while (/*CONSTCOND*/0)
! 3329: #endif /* INET6 */
! 3330:
! 3331: #define SYN_CACHE_RM(sc) \
! 3332: do { \
! 3333: (sc)->sc_flags |= SCF_DEAD; \
! 3334: TAILQ_REMOVE(&tcp_syn_cache[(sc)->sc_bucketidx].sch_bucket, \
! 3335: (sc), sc_bucketq); \
! 3336: (sc)->sc_tp = NULL; \
! 3337: LIST_REMOVE((sc), sc_tpq); \
! 3338: tcp_syn_cache[(sc)->sc_bucketidx].sch_length--; \
! 3339: timeout_del(&(sc)->sc_timer); \
! 3340: syn_cache_count--; \
! 3341: } while (/*CONSTCOND*/0)
! 3342:
! 3343: #define SYN_CACHE_PUT(sc) \
! 3344: do { \
! 3345: if ((sc)->sc_ipopts) \
! 3346: (void) m_free((sc)->sc_ipopts); \
! 3347: if ((sc)->sc_route4.ro_rt != NULL) \
! 3348: RTFREE((sc)->sc_route4.ro_rt); \
! 3349: timeout_set(&(sc)->sc_timer, syn_cache_reaper, (sc)); \
! 3350: timeout_add(&(sc)->sc_timer, 0); \
! 3351: } while (/*CONSTCOND*/0)
! 3352:
! 3353: struct pool syn_cache_pool;
! 3354:
/*
 * We don't estimate RTT with SYNs, so each packet starts with the default
 * RTT and each timer step has a fixed timeout value.
 *
 * Computes sc_rxtcur from the current back-off step (sc_rxtshift),
 * clamped to [TCPTV_MIN, TCPTV_REXMTMAX], binds the timer to
 * syn_cache_timer() if it has not been initialized yet, and starts it.
 */
#define	SYN_CACHE_TIMER_ARM(sc)						\
do {									\
	TCPT_RANGESET((sc)->sc_rxtcur,					\
	    TCPTV_SRTTDFLT * tcp_backoff[(sc)->sc_rxtshift], TCPTV_MIN,	\
	    TCPTV_REXMTMAX);						\
	if (!timeout_initialized(&(sc)->sc_timer))			\
		timeout_set(&(sc)->sc_timer, syn_cache_timer, (sc));	\
	timeout_add(&(sc)->sc_timer, (sc)->sc_rxtcur * (hz / PR_SLOWHZ)); \
} while (/*CONSTCOND*/0)
! 3368:
/*
 * Timestamp value for a syn cache entry: the current tcp_now offset by
 * the entry's sc_modulate.  The expansion is fully parenthesized so it
 * can be used safely inside a larger expression.
 */
#define	SYN_CACHE_TIMESTAMP(sc)	(tcp_now + (sc)->sc_modulate)
! 3370:
! 3371: void
! 3372: syn_cache_init()
! 3373: {
! 3374: int i;
! 3375:
! 3376: /* Initialize the hash buckets. */
! 3377: for (i = 0; i < tcp_syn_cache_size; i++)
! 3378: TAILQ_INIT(&tcp_syn_cache[i].sch_bucket);
! 3379:
! 3380: /* Initialize the syn cache pool. */
! 3381: pool_init(&syn_cache_pool, sizeof(struct syn_cache), 0, 0, 0,
! 3382: "synpl", NULL);
! 3383: }
! 3384:
! 3385: void
! 3386: syn_cache_insert(sc, tp)
! 3387: struct syn_cache *sc;
! 3388: struct tcpcb *tp;
! 3389: {
! 3390: struct syn_cache_head *scp;
! 3391: struct syn_cache *sc2;
! 3392: int s;
! 3393:
! 3394: /*
! 3395: * If there are no entries in the hash table, reinitialize
! 3396: * the hash secrets.
! 3397: */
! 3398: if (syn_cache_count == 0) {
! 3399: syn_hash1 = arc4random();
! 3400: syn_hash2 = arc4random();
! 3401: }
! 3402:
! 3403: SYN_HASHALL(sc->sc_hash, &sc->sc_src.sa, &sc->sc_dst.sa);
! 3404: sc->sc_bucketidx = sc->sc_hash % tcp_syn_cache_size;
! 3405: scp = &tcp_syn_cache[sc->sc_bucketidx];
! 3406:
! 3407: /*
! 3408: * Make sure that we don't overflow the per-bucket
! 3409: * limit or the total cache size limit.
! 3410: */
! 3411: s = splsoftnet();
! 3412: if (scp->sch_length >= tcp_syn_bucket_limit) {
! 3413: tcpstat.tcps_sc_bucketoverflow++;
! 3414: /*
! 3415: * The bucket is full. Toss the oldest element in the
! 3416: * bucket. This will be the first entry in the bucket.
! 3417: */
! 3418: sc2 = TAILQ_FIRST(&scp->sch_bucket);
! 3419: #ifdef DIAGNOSTIC
! 3420: /*
! 3421: * This should never happen; we should always find an
! 3422: * entry in our bucket.
! 3423: */
! 3424: if (sc2 == NULL)
! 3425: panic("syn_cache_insert: bucketoverflow: impossible");
! 3426: #endif
! 3427: SYN_CACHE_RM(sc2);
! 3428: SYN_CACHE_PUT(sc2);
! 3429: } else if (syn_cache_count >= tcp_syn_cache_limit) {
! 3430: struct syn_cache_head *scp2, *sce;
! 3431:
! 3432: tcpstat.tcps_sc_overflowed++;
! 3433: /*
! 3434: * The cache is full. Toss the oldest entry in the
! 3435: * first non-empty bucket we can find.
! 3436: *
! 3437: * XXX We would really like to toss the oldest
! 3438: * entry in the cache, but we hope that this
! 3439: * condition doesn't happen very often.
! 3440: */
! 3441: scp2 = scp;
! 3442: if (TAILQ_EMPTY(&scp2->sch_bucket)) {
! 3443: sce = &tcp_syn_cache[tcp_syn_cache_size];
! 3444: for (++scp2; scp2 != scp; scp2++) {
! 3445: if (scp2 >= sce)
! 3446: scp2 = &tcp_syn_cache[0];
! 3447: if (! TAILQ_EMPTY(&scp2->sch_bucket))
! 3448: break;
! 3449: }
! 3450: #ifdef DIAGNOSTIC
! 3451: /*
! 3452: * This should never happen; we should always find a
! 3453: * non-empty bucket.
! 3454: */
! 3455: if (scp2 == scp)
! 3456: panic("syn_cache_insert: cacheoverflow: "
! 3457: "impossible");
! 3458: #endif
! 3459: }
! 3460: sc2 = TAILQ_FIRST(&scp2->sch_bucket);
! 3461: SYN_CACHE_RM(sc2);
! 3462: SYN_CACHE_PUT(sc2);
! 3463: }
! 3464:
! 3465: /*
! 3466: * Initialize the entry's timer.
! 3467: */
! 3468: sc->sc_rxttot = 0;
! 3469: sc->sc_rxtshift = 0;
! 3470: SYN_CACHE_TIMER_ARM(sc);
! 3471:
! 3472: /* Link it from tcpcb entry */
! 3473: LIST_INSERT_HEAD(&tp->t_sc, sc, sc_tpq);
! 3474:
! 3475: /* Put it into the bucket. */
! 3476: TAILQ_INSERT_TAIL(&scp->sch_bucket, sc, sc_bucketq);
! 3477: scp->sch_length++;
! 3478: syn_cache_count++;
! 3479:
! 3480: tcpstat.tcps_sc_added++;
! 3481: splx(s);
! 3482: }
! 3483:
! 3484: /*
! 3485: * Walk the timer queues, looking for SYN,ACKs that need to be retransmitted.
! 3486: * If we have retransmitted an entry the maximum number of times, expire
! 3487: * that entry.
! 3488: */
! 3489: void
! 3490: syn_cache_timer(void *arg)
! 3491: {
! 3492: struct syn_cache *sc = arg;
! 3493: int s;
! 3494:
! 3495: s = splsoftnet();
! 3496: if (sc->sc_flags & SCF_DEAD) {
! 3497: splx(s);
! 3498: return;
! 3499: }
! 3500:
! 3501: if (__predict_false(sc->sc_rxtshift == TCP_MAXRXTSHIFT)) {
! 3502: /* Drop it -- too many retransmissions. */
! 3503: goto dropit;
! 3504: }
! 3505:
! 3506: /*
! 3507: * Compute the total amount of time this entry has
! 3508: * been on a queue. If this entry has been on longer
! 3509: * than the keep alive timer would allow, expire it.
! 3510: */
! 3511: sc->sc_rxttot += sc->sc_rxtcur;
! 3512: if (sc->sc_rxttot >= tcptv_keep_init)
! 3513: goto dropit;
! 3514:
! 3515: tcpstat.tcps_sc_retransmitted++;
! 3516: (void) syn_cache_respond(sc, NULL);
! 3517:
! 3518: /* Advance the timer back-off. */
! 3519: sc->sc_rxtshift++;
! 3520: SYN_CACHE_TIMER_ARM(sc);
! 3521:
! 3522: splx(s);
! 3523: return;
! 3524:
! 3525: dropit:
! 3526: tcpstat.tcps_sc_timed_out++;
! 3527: SYN_CACHE_RM(sc);
! 3528: SYN_CACHE_PUT(sc);
! 3529: splx(s);
! 3530: }
! 3531:
! 3532: void
! 3533: syn_cache_reaper(void *arg)
! 3534: {
! 3535: struct syn_cache *sc = arg;
! 3536: int s;
! 3537:
! 3538: s = splsoftnet();
! 3539: pool_put(&syn_cache_pool, (sc));
! 3540: splx(s);
! 3541: return;
! 3542: }
! 3543:
! 3544: /*
! 3545: * Remove syn cache created by the specified tcb entry,
! 3546: * because this does not make sense to keep them
! 3547: * (if there's no tcb entry, syn cache entry will never be used)
! 3548: */
! 3549: void
! 3550: syn_cache_cleanup(tp)
! 3551: struct tcpcb *tp;
! 3552: {
! 3553: struct syn_cache *sc, *nsc;
! 3554: int s;
! 3555:
! 3556: s = splsoftnet();
! 3557:
! 3558: for (sc = LIST_FIRST(&tp->t_sc); sc != NULL; sc = nsc) {
! 3559: nsc = LIST_NEXT(sc, sc_tpq);
! 3560:
! 3561: #ifdef DIAGNOSTIC
! 3562: if (sc->sc_tp != tp)
! 3563: panic("invalid sc_tp in syn_cache_cleanup");
! 3564: #endif
! 3565: SYN_CACHE_RM(sc);
! 3566: SYN_CACHE_PUT(sc);
! 3567: }
! 3568: /* just for safety */
! 3569: LIST_INIT(&tp->t_sc);
! 3570:
! 3571: splx(s);
! 3572: }
! 3573:
! 3574: /*
! 3575: * Find an entry in the syn cache.
! 3576: */
! 3577: struct syn_cache *
! 3578: syn_cache_lookup(src, dst, headp)
! 3579: struct sockaddr *src;
! 3580: struct sockaddr *dst;
! 3581: struct syn_cache_head **headp;
! 3582: {
! 3583: struct syn_cache *sc;
! 3584: struct syn_cache_head *scp;
! 3585: u_int32_t hash;
! 3586: int s;
! 3587:
! 3588: SYN_HASHALL(hash, src, dst);
! 3589:
! 3590: scp = &tcp_syn_cache[hash % tcp_syn_cache_size];
! 3591: *headp = scp;
! 3592: s = splsoftnet();
! 3593: for (sc = TAILQ_FIRST(&scp->sch_bucket); sc != NULL;
! 3594: sc = TAILQ_NEXT(sc, sc_bucketq)) {
! 3595: if (sc->sc_hash != hash)
! 3596: continue;
! 3597: if (!bcmp(&sc->sc_src, src, src->sa_len) &&
! 3598: !bcmp(&sc->sc_dst, dst, dst->sa_len)) {
! 3599: splx(s);
! 3600: return (sc);
! 3601: }
! 3602: }
! 3603: splx(s);
! 3604: return (NULL);
! 3605: }
! 3606:
! 3607: /*
! 3608: * This function gets called when we receive an ACK for a
! 3609: * socket in the LISTEN state. We look up the connection
! 3610: * in the syn cache, and if its there, we pull it out of
! 3611: * the cache and turn it into a full-blown connection in
! 3612: * the SYN-RECEIVED state.
! 3613: *
! 3614: * The return values may not be immediately obvious, and their effects
! 3615: * can be subtle, so here they are:
! 3616: *
! 3617: * NULL SYN was not found in cache; caller should drop the
! 3618: * packet and send an RST.
! 3619: *
! 3620: * -1 We were unable to create the new connection, and are
! 3621: * aborting it. An ACK,RST is being sent to the peer
! 3622: * (unless we got screwey sequence numbners; see below),
! 3623: * because the 3-way handshake has been completed. Caller
! 3624: * should not free the mbuf, since we may be using it. If
! 3625: * we are not, we will free it.
! 3626: *
! 3627: * Otherwise, the return value is a pointer to the new socket
! 3628: * associated with the connection.
! 3629: */
! 3630: struct socket *
! 3631: syn_cache_get(src, dst, th, hlen, tlen, so, m)
! 3632: struct sockaddr *src;
! 3633: struct sockaddr *dst;
! 3634: struct tcphdr *th;
! 3635: unsigned int hlen, tlen;
! 3636: struct socket *so;
! 3637: struct mbuf *m;
! 3638: {
! 3639: struct syn_cache *sc;
! 3640: struct syn_cache_head *scp;
! 3641: struct inpcb *inp = NULL;
! 3642: struct tcpcb *tp = 0;
! 3643: struct mbuf *am;
! 3644: int s;
! 3645: struct socket *oso;
! 3646:
! 3647: s = splsoftnet();
! 3648: if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
! 3649: splx(s);
! 3650: return (NULL);
! 3651: }
! 3652:
! 3653: /*
! 3654: * Verify the sequence and ack numbers. Try getting the correct
! 3655: * response again.
! 3656: */
! 3657: if ((th->th_ack != sc->sc_iss + 1) ||
! 3658: SEQ_LEQ(th->th_seq, sc->sc_irs) ||
! 3659: SEQ_GT(th->th_seq, sc->sc_irs + 1 + sc->sc_win)) {
! 3660: (void) syn_cache_respond(sc, m);
! 3661: splx(s);
! 3662: return ((struct socket *)(-1));
! 3663: }
! 3664:
! 3665: /* Remove this cache entry */
! 3666: SYN_CACHE_RM(sc);
! 3667: splx(s);
! 3668:
! 3669: /*
! 3670: * Ok, create the full blown connection, and set things up
! 3671: * as they would have been set up if we had created the
! 3672: * connection when the SYN arrived. If we can't create
! 3673: * the connection, abort it.
! 3674: */
! 3675: oso = so;
! 3676: so = sonewconn(so, SS_ISCONNECTED);
! 3677: if (so == NULL)
! 3678: goto resetandabort;
! 3679:
! 3680: inp = sotoinpcb(oso);
! 3681: #ifdef IPSEC
! 3682: /*
! 3683: * We need to copy the required security levels
! 3684: * from the old pcb. Ditto for any other
! 3685: * IPsec-related information.
! 3686: */
! 3687: {
! 3688: struct inpcb *newinp = (struct inpcb *)so->so_pcb;
! 3689: bcopy(inp->inp_seclevel, newinp->inp_seclevel,
! 3690: sizeof(inp->inp_seclevel));
! 3691: newinp->inp_secrequire = inp->inp_secrequire;
! 3692: if (inp->inp_ipo != NULL) {
! 3693: newinp->inp_ipo = inp->inp_ipo;
! 3694: inp->inp_ipo->ipo_ref_count++;
! 3695: }
! 3696: if (inp->inp_ipsec_remotecred != NULL) {
! 3697: newinp->inp_ipsec_remotecred = inp->inp_ipsec_remotecred;
! 3698: inp->inp_ipsec_remotecred->ref_count++;
! 3699: }
! 3700: if (inp->inp_ipsec_remoteauth != NULL) {
! 3701: newinp->inp_ipsec_remoteauth
! 3702: = inp->inp_ipsec_remoteauth;
! 3703: inp->inp_ipsec_remoteauth->ref_count++;
! 3704: }
! 3705: }
! 3706: #endif /* IPSEC */
! 3707: #ifdef INET6
! 3708: /*
! 3709: * inp still has the OLD in_pcb stuff, set the
! 3710: * v6-related flags on the new guy, too.
! 3711: */
! 3712: {
! 3713: int flags = inp->inp_flags;
! 3714: struct inpcb *oldinpcb = inp;
! 3715:
! 3716: inp = (struct inpcb *)so->so_pcb;
! 3717: inp->inp_flags |= (flags & INP_IPV6);
! 3718: if ((inp->inp_flags & INP_IPV6) != 0) {
! 3719: inp->inp_ipv6.ip6_hlim =
! 3720: oldinpcb->inp_ipv6.ip6_hlim;
! 3721: }
! 3722: }
! 3723: #else /* INET6 */
! 3724: inp = (struct inpcb *)so->so_pcb;
! 3725: #endif /* INET6 */
! 3726:
! 3727: inp->inp_lport = th->th_dport;
! 3728: switch (src->sa_family) {
! 3729: #ifdef INET6
! 3730: case AF_INET6:
! 3731: inp->inp_laddr6 = ((struct sockaddr_in6 *)dst)->sin6_addr;
! 3732: break;
! 3733: #endif /* INET6 */
! 3734: case AF_INET:
! 3735:
! 3736: inp->inp_laddr = ((struct sockaddr_in *)dst)->sin_addr;
! 3737: inp->inp_options = ip_srcroute();
! 3738: if (inp->inp_options == NULL) {
! 3739: inp->inp_options = sc->sc_ipopts;
! 3740: sc->sc_ipopts = NULL;
! 3741: }
! 3742: break;
! 3743: }
! 3744: in_pcbrehash(inp);
! 3745:
! 3746: /*
! 3747: * Give the new socket our cached route reference.
! 3748: */
! 3749: if (src->sa_family == AF_INET)
! 3750: inp->inp_route = sc->sc_route4; /* struct assignment */
! 3751: #ifdef INET6
! 3752: else
! 3753: inp->inp_route6 = sc->sc_route6;
! 3754: #endif
! 3755: sc->sc_route4.ro_rt = NULL;
! 3756:
! 3757: am = m_get(M_DONTWAIT, MT_SONAME); /* XXX */
! 3758: if (am == NULL)
! 3759: goto resetandabort;
! 3760: am->m_len = src->sa_len;
! 3761: bcopy(src, mtod(am, caddr_t), src->sa_len);
! 3762:
! 3763: switch (src->sa_family) {
! 3764: case AF_INET:
! 3765: /* drop IPv4 packet to AF_INET6 socket */
! 3766: if (inp->inp_flags & INP_IPV6) {
! 3767: (void) m_free(am);
! 3768: goto resetandabort;
! 3769: }
! 3770: if (in_pcbconnect(inp, am)) {
! 3771: (void) m_free(am);
! 3772: goto resetandabort;
! 3773: }
! 3774: break;
! 3775: #ifdef INET6
! 3776: case AF_INET6:
! 3777: if (in6_pcbconnect(inp, am)) {
! 3778: (void) m_free(am);
! 3779: goto resetandabort;
! 3780: }
! 3781: break;
! 3782: #endif
! 3783: }
! 3784: (void) m_free(am);
! 3785:
! 3786: tp = intotcpcb(inp);
! 3787: tp->t_flags = sototcpcb(oso)->t_flags & TF_NODELAY;
! 3788: if (sc->sc_request_r_scale != 15) {
! 3789: tp->requested_s_scale = sc->sc_requested_s_scale;
! 3790: tp->request_r_scale = sc->sc_request_r_scale;
! 3791: tp->snd_scale = sc->sc_requested_s_scale;
! 3792: tp->rcv_scale = sc->sc_request_r_scale;
! 3793: tp->t_flags |= TF_REQ_SCALE|TF_RCVD_SCALE;
! 3794: }
! 3795: if (sc->sc_flags & SCF_TIMESTAMP)
! 3796: tp->t_flags |= TF_REQ_TSTMP|TF_RCVD_TSTMP;
! 3797:
! 3798: tp->t_template = tcp_template(tp);
! 3799: if (tp->t_template == 0) {
! 3800: tp = tcp_drop(tp, ENOBUFS); /* destroys socket */
! 3801: so = NULL;
! 3802: m_freem(m);
! 3803: goto abort;
! 3804: }
! 3805: #ifdef TCP_SACK
! 3806: tp->sack_enable = sc->sc_flags & SCF_SACK_PERMIT;
! 3807: #endif
! 3808:
! 3809: tp->ts_modulate = sc->sc_modulate;
! 3810: tp->iss = sc->sc_iss;
! 3811: tp->irs = sc->sc_irs;
! 3812: tcp_sendseqinit(tp);
! 3813: #if defined (TCP_SACK) || defined(TCP_ECN)
! 3814: tp->snd_last = tp->snd_una;
! 3815: #endif /* TCP_SACK */
! 3816: #if defined(TCP_SACK) && defined(TCP_FACK)
! 3817: tp->snd_fack = tp->snd_una;
! 3818: tp->retran_data = 0;
! 3819: tp->snd_awnd = 0;
! 3820: #endif /* TCP_FACK */
! 3821: #ifdef TCP_ECN
! 3822: if (sc->sc_flags & SCF_ECN_PERMIT) {
! 3823: tp->t_flags |= TF_ECN_PERMIT;
! 3824: tcpstat.tcps_ecn_accepts++;
! 3825: }
! 3826: #endif
! 3827: #ifdef TCP_SACK
! 3828: if (sc->sc_flags & SCF_SACK_PERMIT)
! 3829: tp->t_flags |= TF_SACK_PERMIT;
! 3830: #endif
! 3831: #ifdef TCP_SIGNATURE
! 3832: if (sc->sc_flags & SCF_SIGNATURE)
! 3833: tp->t_flags |= TF_SIGNATURE;
! 3834: #endif
! 3835: tcp_rcvseqinit(tp);
! 3836: tp->t_state = TCPS_SYN_RECEIVED;
! 3837: tp->t_rcvtime = tcp_now;
! 3838: TCP_TIMER_ARM(tp, TCPT_KEEP, tcptv_keep_init);
! 3839: tcpstat.tcps_accepts++;
! 3840:
! 3841: tcp_mss(tp, sc->sc_peermaxseg); /* sets t_maxseg */
! 3842: if (sc->sc_peermaxseg)
! 3843: tcp_mss_update(tp);
! 3844: /* Reset initial window to 1 segment for retransmit */
! 3845: if (sc->sc_rxtshift > 0)
! 3846: tp->snd_cwnd = tp->t_maxseg;
! 3847: tp->snd_wl1 = sc->sc_irs;
! 3848: tp->rcv_up = sc->sc_irs + 1;
! 3849:
! 3850: /*
! 3851: * This is what whould have happened in tcp_output() when
! 3852: * the SYN,ACK was sent.
! 3853: */
! 3854: tp->snd_up = tp->snd_una;
! 3855: tp->snd_max = tp->snd_nxt = tp->iss+1;
! 3856: TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
! 3857: if (sc->sc_win > 0 && SEQ_GT(tp->rcv_nxt + sc->sc_win, tp->rcv_adv))
! 3858: tp->rcv_adv = tp->rcv_nxt + sc->sc_win;
! 3859: tp->last_ack_sent = tp->rcv_nxt;
! 3860:
! 3861: tcpstat.tcps_sc_completed++;
! 3862: SYN_CACHE_PUT(sc);
! 3863: return (so);
! 3864:
! 3865: resetandabort:
! 3866: tcp_respond(NULL, mtod(m, caddr_t), m, (tcp_seq)0, th->th_ack, TH_RST);
! 3867: abort:
! 3868: if (so != NULL)
! 3869: (void) soabort(so);
! 3870: SYN_CACHE_PUT(sc);
! 3871: tcpstat.tcps_sc_aborted++;
! 3872: return ((struct socket *)(-1));
! 3873: }
! 3874:
! 3875: /*
! 3876: * This function is called when we get a RST for a
! 3877: * non-existent connection, so that we can see if the
! 3878: * connection is in the syn cache. If it is, zap it.
! 3879: */
! 3880:
! 3881: void
! 3882: syn_cache_reset(src, dst, th)
! 3883: struct sockaddr *src;
! 3884: struct sockaddr *dst;
! 3885: struct tcphdr *th;
! 3886: {
! 3887: struct syn_cache *sc;
! 3888: struct syn_cache_head *scp;
! 3889: int s = splsoftnet();
! 3890:
! 3891: if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
! 3892: splx(s);
! 3893: return;
! 3894: }
! 3895: if (SEQ_LT(th->th_seq, sc->sc_irs) ||
! 3896: SEQ_GT(th->th_seq, sc->sc_irs+1)) {
! 3897: splx(s);
! 3898: return;
! 3899: }
! 3900: SYN_CACHE_RM(sc);
! 3901: splx(s);
! 3902: tcpstat.tcps_sc_reset++;
! 3903: SYN_CACHE_PUT(sc);
! 3904: }
! 3905:
! 3906: void
! 3907: syn_cache_unreach(src, dst, th)
! 3908: struct sockaddr *src;
! 3909: struct sockaddr *dst;
! 3910: struct tcphdr *th;
! 3911: {
! 3912: struct syn_cache *sc;
! 3913: struct syn_cache_head *scp;
! 3914: int s;
! 3915:
! 3916: s = splsoftnet();
! 3917: if ((sc = syn_cache_lookup(src, dst, &scp)) == NULL) {
! 3918: splx(s);
! 3919: return;
! 3920: }
! 3921: /* If the sequence number != sc_iss, then it's a bogus ICMP msg */
! 3922: if (ntohl (th->th_seq) != sc->sc_iss) {
! 3923: splx(s);
! 3924: return;
! 3925: }
! 3926:
! 3927: /*
! 3928: * If we've retransmitted 3 times and this is our second error,
! 3929: * we remove the entry. Otherwise, we allow it to continue on.
! 3930: * This prevents us from incorrectly nuking an entry during a
! 3931: * spurious network outage.
! 3932: *
! 3933: * See tcp_notify().
! 3934: */
! 3935: if ((sc->sc_flags & SCF_UNREACH) == 0 || sc->sc_rxtshift < 3) {
! 3936: sc->sc_flags |= SCF_UNREACH;
! 3937: splx(s);
! 3938: return;
! 3939: }
! 3940:
! 3941: SYN_CACHE_RM(sc);
! 3942: splx(s);
! 3943: tcpstat.tcps_sc_unreach++;
! 3944: SYN_CACHE_PUT(sc);
! 3945: }
! 3946:
! 3947: /*
! 3948: * Given a LISTEN socket and an inbound SYN request, add
! 3949: * this to the syn cache, and send back a segment:
! 3950: * <SEQ=ISS><ACK=RCV_NXT><CTL=SYN,ACK>
! 3951: * to the source.
! 3952: *
! 3953: * IMPORTANT NOTE: We do _NOT_ ACK data that might accompany the SYN.
! 3954: * Doing so would require that we hold onto the data and deliver it
! 3955: * to the application. However, if we are the target of a SYN-flood
! 3956: * DoS attack, an attacker could send data which would eventually
! 3957: * consume all available buffer space if it were ACKed. By not ACKing
! 3958: * the data, we avoid this DoS scenario.
! 3959: */
! 3960:
! 3961: int
! 3962: syn_cache_add(src, dst, th, iphlen, so, m, optp, optlen, oi, issp)
! 3963: struct sockaddr *src;
! 3964: struct sockaddr *dst;
! 3965: struct tcphdr *th;
! 3966: unsigned int iphlen;
! 3967: struct socket *so;
! 3968: struct mbuf *m;
! 3969: u_char *optp;
! 3970: int optlen;
! 3971: struct tcp_opt_info *oi;
! 3972: tcp_seq *issp;
! 3973: {
! 3974: struct tcpcb tb, *tp;
! 3975: long win;
! 3976: struct syn_cache *sc;
! 3977: struct syn_cache_head *scp;
! 3978: struct mbuf *ipopts;
! 3979:
! 3980: tp = sototcpcb(so);
! 3981:
! 3982: /*
! 3983: * RFC1122 4.2.3.10, p. 104: discard bcast/mcast SYN
! 3984: *
! 3985: * Note this check is performed in tcp_input() very early on.
! 3986: */
! 3987:
! 3988: /*
! 3989: * Initialize some local state.
! 3990: */
! 3991: win = sbspace(&so->so_rcv);
! 3992: if (win > TCP_MAXWIN)
! 3993: win = TCP_MAXWIN;
! 3994:
! 3995: #ifdef TCP_SIGNATURE
! 3996: if (optp || (tp->t_flags & TF_SIGNATURE)) {
! 3997: #else
! 3998: if (optp) {
! 3999: #endif
! 4000: tb.pf = tp->pf;
! 4001: #ifdef TCP_SACK
! 4002: tb.sack_enable = tp->sack_enable;
! 4003: #endif
! 4004: tb.t_flags = tcp_do_rfc1323 ? (TF_REQ_SCALE|TF_REQ_TSTMP) : 0;
! 4005: #ifdef TCP_SIGNATURE
! 4006: if (tp->t_flags & TF_SIGNATURE)
! 4007: tb.t_flags |= TF_SIGNATURE;
! 4008: #endif
! 4009: tb.t_state = TCPS_LISTEN;
! 4010: if (tcp_dooptions(&tb, optp, optlen, th, m, iphlen, oi))
! 4011: return (0);
! 4012: } else
! 4013: tb.t_flags = 0;
! 4014:
! 4015: switch (src->sa_family) {
! 4016: #ifdef INET
! 4017: case AF_INET:
! 4018: /*
! 4019: * Remember the IP options, if any.
! 4020: */
! 4021: ipopts = ip_srcroute();
! 4022: break;
! 4023: #endif
! 4024: default:
! 4025: ipopts = NULL;
! 4026: }
! 4027:
! 4028: /*
! 4029: * See if we already have an entry for this connection.
! 4030: * If we do, resend the SYN,ACK. We do not count this
! 4031: * as a retransmission (XXX though maybe we should).
! 4032: */
! 4033: if ((sc = syn_cache_lookup(src, dst, &scp)) != NULL) {
! 4034: tcpstat.tcps_sc_dupesyn++;
! 4035: if (ipopts) {
! 4036: /*
! 4037: * If we were remembering a previous source route,
! 4038: * forget it and use the new one we've been given.
! 4039: */
! 4040: if (sc->sc_ipopts)
! 4041: (void) m_free(sc->sc_ipopts);
! 4042: sc->sc_ipopts = ipopts;
! 4043: }
! 4044: sc->sc_timestamp = tb.ts_recent;
! 4045: if (syn_cache_respond(sc, m) == 0) {
! 4046: tcpstat.tcps_sndacks++;
! 4047: tcpstat.tcps_sndtotal++;
! 4048: }
! 4049: return (1);
! 4050: }
! 4051:
! 4052: sc = pool_get(&syn_cache_pool, PR_NOWAIT);
! 4053: if (sc == NULL) {
! 4054: if (ipopts)
! 4055: (void) m_free(ipopts);
! 4056: return (0);
! 4057: }
! 4058:
! 4059: /*
! 4060: * Fill in the cache, and put the necessary IP and TCP
! 4061: * options into the reply.
! 4062: */
! 4063: bzero(sc, sizeof(struct syn_cache));
! 4064: bzero(&sc->sc_timer, sizeof(sc->sc_timer));
! 4065: bcopy(src, &sc->sc_src, src->sa_len);
! 4066: bcopy(dst, &sc->sc_dst, dst->sa_len);
! 4067: sc->sc_flags = 0;
! 4068: sc->sc_ipopts = ipopts;
! 4069: sc->sc_irs = th->th_seq;
! 4070:
! 4071: #ifdef TCP_COMPAT_42
! 4072: tcp_iss += TCP_ISSINCR/2;
! 4073: sc->sc_iss = tcp_iss;
! 4074: #else
! 4075: sc->sc_iss = issp ? *issp : arc4random();
! 4076: #endif
! 4077: sc->sc_peermaxseg = oi->maxseg;
! 4078: sc->sc_ourmaxseg = tcp_mss_adv(m->m_flags & M_PKTHDR ?
! 4079: m->m_pkthdr.rcvif : NULL, sc->sc_src.sa.sa_family);
! 4080: sc->sc_win = win;
! 4081: sc->sc_timestamp = tb.ts_recent;
! 4082: if ((tb.t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP)) ==
! 4083: (TF_REQ_TSTMP|TF_RCVD_TSTMP)) {
! 4084: sc->sc_flags |= SCF_TIMESTAMP;
! 4085: sc->sc_modulate = arc4random();
! 4086: }
! 4087: if ((tb.t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
! 4088: (TF_RCVD_SCALE|TF_REQ_SCALE)) {
! 4089: sc->sc_requested_s_scale = tb.requested_s_scale;
! 4090: sc->sc_request_r_scale = 0;
! 4091: while (sc->sc_request_r_scale < TCP_MAX_WINSHIFT &&
! 4092: TCP_MAXWIN << sc->sc_request_r_scale <
! 4093: so->so_rcv.sb_hiwat)
! 4094: sc->sc_request_r_scale++;
! 4095: } else {
! 4096: sc->sc_requested_s_scale = 15;
! 4097: sc->sc_request_r_scale = 15;
! 4098: }
! 4099: #ifdef TCP_ECN
! 4100: /*
! 4101: * if both ECE and CWR flag bits are set, peer is ECN capable.
! 4102: */
! 4103: if (tcp_do_ecn &&
! 4104: (th->th_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR))
! 4105: sc->sc_flags |= SCF_ECN_PERMIT;
! 4106: #endif
! 4107: #ifdef TCP_SACK
! 4108: /*
! 4109: * Set SCF_SACK_PERMIT if peer did send a SACK_PERMITTED option
! 4110: * (i.e., if tcp_dooptions() did set TF_SACK_PERMIT).
! 4111: */
! 4112: if (tb.sack_enable && (tb.t_flags & TF_SACK_PERMIT))
! 4113: sc->sc_flags |= SCF_SACK_PERMIT;
! 4114: #endif
! 4115: #ifdef TCP_SIGNATURE
! 4116: if (tb.t_flags & TF_SIGNATURE)
! 4117: sc->sc_flags |= SCF_SIGNATURE;
! 4118: #endif
! 4119: sc->sc_tp = tp;
! 4120: if (syn_cache_respond(sc, m) == 0) {
! 4121: syn_cache_insert(sc, tp);
! 4122: tcpstat.tcps_sndacks++;
! 4123: tcpstat.tcps_sndtotal++;
! 4124: } else {
! 4125: SYN_CACHE_PUT(sc);
! 4126: tcpstat.tcps_sc_dropped++;
! 4127: }
! 4128: return (1);
! 4129: }
! 4130:
! 4131: int
! 4132: syn_cache_respond(sc, m)
! 4133: struct syn_cache *sc;
! 4134: struct mbuf *m;
! 4135: {
! 4136: struct route *ro;
! 4137: u_int8_t *optp;
! 4138: int optlen, error;
! 4139: u_int16_t tlen;
! 4140: struct ip *ip = NULL;
! 4141: #ifdef INET6
! 4142: struct ip6_hdr *ip6 = NULL;
! 4143: #endif
! 4144: struct tcphdr *th;
! 4145: u_int hlen;
! 4146: struct inpcb *inp;
! 4147:
! 4148: switch (sc->sc_src.sa.sa_family) {
! 4149: case AF_INET:
! 4150: hlen = sizeof(struct ip);
! 4151: ro = &sc->sc_route4;
! 4152: break;
! 4153: #ifdef INET6
! 4154: case AF_INET6:
! 4155: hlen = sizeof(struct ip6_hdr);
! 4156: ro = (struct route *)&sc->sc_route6;
! 4157: break;
! 4158: #endif
! 4159: default:
! 4160: if (m)
! 4161: m_freem(m);
! 4162: return (EAFNOSUPPORT);
! 4163: }
! 4164:
! 4165: /* Compute the size of the TCP options. */
! 4166: optlen = 4 + (sc->sc_request_r_scale != 15 ? 4 : 0) +
! 4167: #ifdef TCP_SACK
! 4168: ((sc->sc_flags & SCF_SACK_PERMIT) ? 4 : 0) +
! 4169: #endif
! 4170: #ifdef TCP_SIGNATURE
! 4171: ((sc->sc_flags & SCF_SIGNATURE) ? TCPOLEN_SIGLEN : 0) +
! 4172: #endif
! 4173: ((sc->sc_flags & SCF_TIMESTAMP) ? TCPOLEN_TSTAMP_APPA : 0);
! 4174:
! 4175: tlen = hlen + sizeof(struct tcphdr) + optlen;
! 4176:
! 4177: /*
! 4178: * Create the IP+TCP header from scratch.
! 4179: */
! 4180: if (m)
! 4181: m_freem(m);
! 4182: #ifdef DIAGNOSTIC
! 4183: if (max_linkhdr + tlen > MCLBYTES)
! 4184: return (ENOBUFS);
! 4185: #endif
! 4186: MGETHDR(m, M_DONTWAIT, MT_DATA);
! 4187: if (m && max_linkhdr + tlen > MHLEN) {
! 4188: MCLGET(m, M_DONTWAIT);
! 4189: if ((m->m_flags & M_EXT) == 0) {
! 4190: m_freem(m);
! 4191: m = NULL;
! 4192: }
! 4193: }
! 4194: if (m == NULL)
! 4195: return (ENOBUFS);
! 4196:
! 4197: /* Fixup the mbuf. */
! 4198: m->m_data += max_linkhdr;
! 4199: m->m_len = m->m_pkthdr.len = tlen;
! 4200: m->m_pkthdr.rcvif = NULL;
! 4201: memset(mtod(m, u_char *), 0, tlen);
! 4202:
! 4203: switch (sc->sc_src.sa.sa_family) {
! 4204: case AF_INET:
! 4205: ip = mtod(m, struct ip *);
! 4206: ip->ip_dst = sc->sc_src.sin.sin_addr;
! 4207: ip->ip_src = sc->sc_dst.sin.sin_addr;
! 4208: ip->ip_p = IPPROTO_TCP;
! 4209: th = (struct tcphdr *)(ip + 1);
! 4210: th->th_dport = sc->sc_src.sin.sin_port;
! 4211: th->th_sport = sc->sc_dst.sin.sin_port;
! 4212: break;
! 4213: #ifdef INET6
! 4214: case AF_INET6:
! 4215: ip6 = mtod(m, struct ip6_hdr *);
! 4216: ip6->ip6_dst = sc->sc_src.sin6.sin6_addr;
! 4217: ip6->ip6_src = sc->sc_dst.sin6.sin6_addr;
! 4218: ip6->ip6_nxt = IPPROTO_TCP;
! 4219: /* ip6_plen will be updated in ip6_output() */
! 4220: th = (struct tcphdr *)(ip6 + 1);
! 4221: th->th_dport = sc->sc_src.sin6.sin6_port;
! 4222: th->th_sport = sc->sc_dst.sin6.sin6_port;
! 4223: break;
! 4224: #endif
! 4225: default:
! 4226: th = NULL;
! 4227: }
! 4228:
! 4229: th->th_seq = htonl(sc->sc_iss);
! 4230: th->th_ack = htonl(sc->sc_irs + 1);
! 4231: th->th_off = (sizeof(struct tcphdr) + optlen) >> 2;
! 4232: th->th_flags = TH_SYN|TH_ACK;
! 4233: #ifdef TCP_ECN
! 4234: /* Set ECE for SYN-ACK if peer supports ECN. */
! 4235: if (tcp_do_ecn && (sc->sc_flags & SCF_ECN_PERMIT))
! 4236: th->th_flags |= TH_ECE;
! 4237: #endif
! 4238: th->th_win = htons(sc->sc_win);
! 4239: /* th_sum already 0 */
! 4240: /* th_urp already 0 */
! 4241:
! 4242: /* Tack on the TCP options. */
! 4243: optp = (u_int8_t *)(th + 1);
! 4244: *optp++ = TCPOPT_MAXSEG;
! 4245: *optp++ = 4;
! 4246: *optp++ = (sc->sc_ourmaxseg >> 8) & 0xff;
! 4247: *optp++ = sc->sc_ourmaxseg & 0xff;
! 4248:
! 4249: #ifdef TCP_SACK
! 4250: /* Include SACK_PERMIT_HDR option if peer has already done so. */
! 4251: if (sc->sc_flags & SCF_SACK_PERMIT) {
! 4252: *((u_int32_t *)optp) = htonl(TCPOPT_SACK_PERMIT_HDR);
! 4253: optp += 4;
! 4254: }
! 4255: #endif
! 4256:
! 4257: if (sc->sc_request_r_scale != 15) {
! 4258: *((u_int32_t *)optp) = htonl(TCPOPT_NOP << 24 |
! 4259: TCPOPT_WINDOW << 16 | TCPOLEN_WINDOW << 8 |
! 4260: sc->sc_request_r_scale);
! 4261: optp += 4;
! 4262: }
! 4263:
! 4264: if (sc->sc_flags & SCF_TIMESTAMP) {
! 4265: u_int32_t *lp = (u_int32_t *)(optp);
! 4266: /* Form timestamp option as shown in appendix A of RFC 1323. */
! 4267: *lp++ = htonl(TCPOPT_TSTAMP_HDR);
! 4268: *lp++ = htonl(SYN_CACHE_TIMESTAMP(sc));
! 4269: *lp = htonl(sc->sc_timestamp);
! 4270: optp += TCPOLEN_TSTAMP_APPA;
! 4271: }
! 4272:
! 4273: #ifdef TCP_SIGNATURE
! 4274: if (sc->sc_flags & SCF_SIGNATURE) {
! 4275: union sockaddr_union src, dst;
! 4276: struct tdb *tdb;
! 4277:
! 4278: bzero(&src, sizeof(union sockaddr_union));
! 4279: bzero(&dst, sizeof(union sockaddr_union));
! 4280: src.sa.sa_len = sc->sc_src.sa.sa_len;
! 4281: src.sa.sa_family = sc->sc_src.sa.sa_family;
! 4282: dst.sa.sa_len = sc->sc_dst.sa.sa_len;
! 4283: dst.sa.sa_family = sc->sc_dst.sa.sa_family;
! 4284:
! 4285: switch (sc->sc_src.sa.sa_family) {
! 4286: case 0: /*default to PF_INET*/
! 4287: #ifdef INET
! 4288: case AF_INET:
! 4289: src.sin.sin_addr = mtod(m, struct ip *)->ip_src;
! 4290: dst.sin.sin_addr = mtod(m, struct ip *)->ip_dst;
! 4291: break;
! 4292: #endif /* INET */
! 4293: #ifdef INET6
! 4294: case AF_INET6:
! 4295: src.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_src;
! 4296: dst.sin6.sin6_addr = mtod(m, struct ip6_hdr *)->ip6_dst;
! 4297: break;
! 4298: #endif /* INET6 */
! 4299: }
! 4300:
! 4301: tdb = gettdbbysrcdst(0, &src, &dst, IPPROTO_TCP);
! 4302: if (tdb == NULL) {
! 4303: if (m)
! 4304: m_freem(m);
! 4305: return (EPERM);
! 4306: }
! 4307:
! 4308: /* Send signature option */
! 4309: *(optp++) = TCPOPT_SIGNATURE;
! 4310: *(optp++) = TCPOLEN_SIGNATURE;
! 4311:
! 4312: if (tcp_signature(tdb, sc->sc_src.sa.sa_family, m, th,
! 4313: hlen, 0, optp) < 0) {
! 4314: if (m)
! 4315: m_freem(m);
! 4316: return (EINVAL);
! 4317: }
! 4318: optp += 16;
! 4319:
! 4320: /* Pad options list to the next 32 bit boundary and
! 4321: * terminate it.
! 4322: */
! 4323: *optp++ = TCPOPT_NOP;
! 4324: *optp++ = TCPOPT_EOL;
! 4325: }
! 4326: #endif /* TCP_SIGNATURE */
! 4327:
! 4328: /* Compute the packet's checksum. */
! 4329: switch (sc->sc_src.sa.sa_family) {
! 4330: case AF_INET:
! 4331: ip->ip_len = htons(tlen - hlen);
! 4332: th->th_sum = 0;
! 4333: th->th_sum = in_cksum(m, tlen);
! 4334: break;
! 4335: #ifdef INET6
! 4336: case AF_INET6:
! 4337: ip6->ip6_plen = htons(tlen - hlen);
! 4338: th->th_sum = 0;
! 4339: th->th_sum = in6_cksum(m, IPPROTO_TCP, hlen, tlen - hlen);
! 4340: break;
! 4341: #endif
! 4342: }
! 4343:
! 4344: /* use IPsec policy and ttl from listening socket, on SYN ACK */
! 4345: inp = sc->sc_tp ? sc->sc_tp->t_inpcb : NULL;
! 4346:
! 4347: /*
! 4348: * Fill in some straggling IP bits. Note the stack expects
! 4349: * ip_len to be in host order, for convenience.
! 4350: */
! 4351: switch (sc->sc_src.sa.sa_family) {
! 4352: #ifdef INET
! 4353: case AF_INET:
! 4354: ip->ip_len = htons(tlen);
! 4355: ip->ip_ttl = inp ? inp->inp_ip.ip_ttl : ip_defttl;
! 4356: /* XXX tos? */
! 4357: break;
! 4358: #endif
! 4359: #ifdef INET6
! 4360: case AF_INET6:
! 4361: ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
! 4362: ip6->ip6_vfc |= IPV6_VERSION;
! 4363: ip6->ip6_plen = htons(tlen - hlen);
! 4364: /* ip6_hlim will be initialized afterwards */
! 4365: /* leave flowlabel = 0, it is legal and require no state mgmt */
! 4366: break;
! 4367: #endif
! 4368: }
! 4369:
! 4370: switch (sc->sc_src.sa.sa_family) {
! 4371: #ifdef INET
! 4372: case AF_INET:
! 4373: error = ip_output(m, sc->sc_ipopts, ro,
! 4374: (ip_mtudisc ? IP_MTUDISC : 0),
! 4375: (struct ip_moptions *)NULL, inp);
! 4376: break;
! 4377: #endif
! 4378: #ifdef INET6
! 4379: case AF_INET6:
! 4380: ip6->ip6_hlim = in6_selecthlim(NULL,
! 4381: ro->ro_rt ? ro->ro_rt->rt_ifp : NULL);
! 4382:
! 4383: error = ip6_output(m, NULL /*XXX*/, (struct route_in6 *)ro, 0,
! 4384: (struct ip6_moptions *)0, NULL, NULL);
! 4385: break;
! 4386: #endif
! 4387: default:
! 4388: error = EAFNOSUPPORT;
! 4389: break;
! 4390: }
! 4391: return (error);
! 4392: }
CVSweb