[BACK]Return to altq_rio.c CVS log [TXT][DIR] Up to [local] / sys / altq

Annotation of sys/altq/altq_rio.c, Revision 1.1.1.1

1.1       nbrk        1: /*     $OpenBSD: altq_rio.c,v 1.10 2007/06/17 19:58:58 jasper Exp $    */
                      2: /*     $KAME: altq_rio.c,v 1.8 2000/12/14 08:12:46 thorpej Exp $       */
                      3:
                      4: /*
                      5:  * Copyright (C) 1998-2000
                      6:  *     Sony Computer Science Laboratories Inc.  All rights reserved.
                      7:  *
                      8:  * Redistribution and use in source and binary forms, with or without
                      9:  * modification, are permitted provided that the following conditions
                     10:  * are met:
                     11:  * 1. Redistributions of source code must retain the above copyright
                     12:  *    notice, this list of conditions and the following disclaimer.
                     13:  * 2. Redistributions in binary form must reproduce the above copyright
                     14:  *    notice, this list of conditions and the following disclaimer in the
                     15:  *    documentation and/or other materials provided with the distribution.
                     16:  *
                     17:  * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
                     18:  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
                     19:  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
                     20:  * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
                     21:  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
                     22:  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
                     23:  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
                     24:  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
                     25:  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
                     26:  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
                     27:  * SUCH DAMAGE.
                     28:  */
                     29: /*
                     30:  * Copyright (c) 1990-1994 Regents of the University of California.
                     31:  * All rights reserved.
                     32:  *
                     33:  * Redistribution and use in source and binary forms, with or without
                     34:  * modification, are permitted provided that the following conditions
                     35:  * are met:
                     36:  * 1. Redistributions of source code must retain the above copyright
                     37:  *    notice, this list of conditions and the following disclaimer.
                     38:  * 2. Redistributions in binary form must reproduce the above copyright
                     39:  *    notice, this list of conditions and the following disclaimer in the
                     40:  *    documentation and/or other materials provided with the distribution.
                     41:  * 3. All advertising materials mentioning features or use of this software
                     42:  *    must display the following acknowledgement:
                     43:  *     This product includes software developed by the Computer Systems
                     44:  *     Engineering Group at Lawrence Berkeley Laboratory.
                     45:  * 4. Neither the name of the University nor of the Laboratory may be used
                     46:  *    to endorse or promote products derived from this software without
                     47:  *    specific prior written permission.
                     48:  *
                     49:  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
                     50:  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
                     51:  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
                     52:  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
                     53:  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
                     54:  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
                     55:  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
                     56:  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
                     57:  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
                     58:  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
                     59:  * SUCH DAMAGE.
                     60:  */
                     61:
                     62: #include <sys/param.h>
                     63: #include <sys/malloc.h>
                     64: #include <sys/mbuf.h>
                     65: #include <sys/socket.h>
                     66: #include <sys/systm.h>
                     67: #include <sys/errno.h>
                     68:
                     69: #include <net/if.h>
                     70: #include <net/if_types.h>
                     71:
                     72: #include <netinet/in.h>
                     73: #include <netinet/in_systm.h>
                     74: #include <netinet/ip.h>
                     75: #ifdef INET6
                     76: #include <netinet/ip6.h>
                     77: #endif
                     78:
                     79: #include <net/pfvar.h>
                     80: #include <altq/altq.h>
                     81: #include <altq/altq_cdnr.h>
                     82: #include <altq/altq_red.h>
                     83: #include <altq/altq_rio.h>
                     84:
                     85: /*
                     86:  * RIO: RED with IN/OUT bit
                     87:  *   described in
                     88:  *     "Explicit Allocation of Best Effort Packet Delivery Service"
                     89:  *     David D. Clark and Wenjia Fang, MIT Lab for Computer Science
                     90:  *     http://diffserv.lcs.mit.edu/Papers/exp-alloc-ddc-wf.{ps,pdf}
                     91:  *
                     92:  * this implementation is extended to support more than 2 drop precedence
                     93:  * values as described in RFC2597 (Assured Forwarding PHB Group).
                     94:  *
                     95:  */
                     96: /*
                     97:  * AF DS (differentiated service) codepoints.
                     98:  * (classes can be mapped to CBQ or H-FSC classes.)
                     99:  *
                    100:  *      0   1   2   3   4   5   6   7
                    101:  *    +---+---+---+---+---+---+---+---+
                    102:  *    |   CLASS   |DropPre| 0 |  CU   |
                    103:  *    +---+---+---+---+---+---+---+---+
                    104:  *
                    105:  *    class 1: 001
                    106:  *    class 2: 010
                    107:  *    class 3: 011
                    108:  *    class 4: 100
                    109:  *
                    110:  *    low drop prec:    01
                    111:  *    medium drop prec: 10
                    112:  *    high drop prec:   01
                    113:  */
                    114:
                    115: /* normal red parameters */
                    116: #define        W_WEIGHT        512     /* inverse of weight of EWMA (511/512) */
                    117:                                /* q_weight = 0.00195 */
                    118:
                    119: /* red parameters for a slow link */
                    120: #define        W_WEIGHT_1      128     /* inverse of weight of EWMA (127/128) */
                    121:                                /* q_weight = 0.0078125 */
                    122:
                    123: /* red parameters for a very slow link (e.g., dialup) */
                    124: #define        W_WEIGHT_2      64      /* inverse of weight of EWMA (63/64) */
                    125:                                /* q_weight = 0.015625 */
                    126:
                    127: /* fixed-point uses 12-bit decimal places */
                    128: #define        FP_SHIFT        12      /* fixed-point shift */
                    129:
                    130: /* red parameters for drop probability */
                    131: #define        INV_P_MAX       10      /* inverse of max drop probability */
                    132: #define        TH_MIN           5      /* min threshold */
                    133: #define        TH_MAX          15      /* max threshold */
                    134:
                    135: #define        RIO_LIMIT       60      /* default max queue length */
                    136: #define        RIO_STATS               /* collect statistics */
                    137:
                    138: #define        TV_DELTA(a, b, delta) {                                 \
                    139:        int     xxs;                                    \
                    140:                                                                \
                    141:        delta = (a)->tv_usec - (b)->tv_usec;                    \
                    142:        if ((xxs = (a)->tv_sec - (b)->tv_sec) != 0) {           \
                    143:                if (xxs < 0) {                                  \
                    144:                        delta = 60000000;                       \
                    145:                } else if (xxs > 4)  {                          \
                    146:                        if (xxs > 60)                           \
                    147:                                delta = 60000000;               \
                    148:                        else                                    \
                    149:                                delta += xxs * 1000000;         \
                    150:                } else while (xxs > 0) {                        \
                    151:                        delta += 1000000;                       \
                    152:                        xxs--;                                  \
                    153:                }                                               \
                    154:        }                                                       \
                    155: }
                    156:
                    157: /* default rio parameter values */
                    158: static struct redparams default_rio_params[RIO_NDROPPREC] = {
                    159:   /* th_min,            th_max,     inv_pmax */
                    160:   { TH_MAX * 2 + TH_MIN, TH_MAX * 3, INV_P_MAX }, /* low drop precedence */
                    161:   { TH_MAX + TH_MIN,    TH_MAX * 2, INV_P_MAX }, /* medium drop precedence */
                    162:   { TH_MIN,             TH_MAX,     INV_P_MAX }  /* high drop precedence */
                    163: };
                    164:
                    165: /* internal function prototypes */
                    166: static int dscp2index(u_int8_t);
                    167:
                    168: rio_t *
                    169: rio_alloc(int weight, struct redparams *params, int flags, int pkttime)
                    170: {
                    171:        rio_t   *rp;
                    172:        int      w, i;
                    173:        int      npkts_per_sec;
                    174:
                    175:        MALLOC(rp, rio_t *, sizeof(rio_t), M_DEVBUF, M_WAITOK);
                    176:        if (rp == NULL)
                    177:                return (NULL);
                    178:        bzero(rp, sizeof(rio_t));
                    179:
                    180:        rp->rio_flags = flags;
                    181:        if (pkttime == 0)
                    182:                /* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
                    183:                rp->rio_pkttime = 800;
                    184:        else
                    185:                rp->rio_pkttime = pkttime;
                    186:
                    187:        if (weight != 0)
                    188:                rp->rio_weight = weight;
                    189:        else {
                    190:                /* use default */
                    191:                rp->rio_weight = W_WEIGHT;
                    192:
                    193:                /* when the link is very slow, adjust red parameters */
                    194:                npkts_per_sec = 1000000 / rp->rio_pkttime;
                    195:                if (npkts_per_sec < 50) {
                    196:                        /* up to about 400Kbps */
                    197:                        rp->rio_weight = W_WEIGHT_2;
                    198:                } else if (npkts_per_sec < 300) {
                    199:                        /* up to about 2.4Mbps */
                    200:                        rp->rio_weight = W_WEIGHT_1;
                    201:                }
                    202:        }
                    203:
                    204:        /* calculate wshift.  weight must be power of 2 */
                    205:        w = rp->rio_weight;
                    206:        for (i = 0; w > 1; i++)
                    207:                w = w >> 1;
                    208:        rp->rio_wshift = i;
                    209:        w = 1 << rp->rio_wshift;
                    210:        if (w != rp->rio_weight) {
                    211:                printf("invalid weight value %d for red! use %d\n",
                    212:                       rp->rio_weight, w);
                    213:                rp->rio_weight = w;
                    214:        }
                    215:
                    216:        /* allocate weight table */
                    217:        rp->rio_wtab = wtab_alloc(rp->rio_weight);
                    218:
                    219:        for (i = 0; i < RIO_NDROPPREC; i++) {
                    220:                struct dropprec_state *prec = &rp->rio_precstate[i];
                    221:
                    222:                prec->avg = 0;
                    223:                prec->idle = 1;
                    224:
                    225:                if (params == NULL || params[i].inv_pmax == 0)
                    226:                        prec->inv_pmax = default_rio_params[i].inv_pmax;
                    227:                else
                    228:                        prec->inv_pmax = params[i].inv_pmax;
                    229:                if (params == NULL || params[i].th_min == 0)
                    230:                        prec->th_min = default_rio_params[i].th_min;
                    231:                else
                    232:                        prec->th_min = params[i].th_min;
                    233:                if (params == NULL || params[i].th_max == 0)
                    234:                        prec->th_max = default_rio_params[i].th_max;
                    235:                else
                    236:                        prec->th_max = params[i].th_max;
                    237:
                    238:                /*
                    239:                 * th_min_s and th_max_s are scaled versions of th_min
                    240:                 * and th_max to be compared with avg.
                    241:                 */
                    242:                prec->th_min_s = prec->th_min << (rp->rio_wshift + FP_SHIFT);
                    243:                prec->th_max_s = prec->th_max << (rp->rio_wshift + FP_SHIFT);
                    244:
                    245:                /*
                    246:                 * precompute probability denominator
                    247:                 *  probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
                    248:                 */
                    249:                prec->probd = (2 * (prec->th_max - prec->th_min)
                    250:                               * prec->inv_pmax) << FP_SHIFT;
                    251:
                    252:                microtime(&prec->last);
                    253:        }
                    254:
                    255:        return (rp);
                    256: }
                    257:
                    258: void
                    259: rio_destroy(rio_t *rp)
                    260: {
                    261:        wtab_destroy(rp->rio_wtab);
                    262:        FREE(rp, M_DEVBUF);
                    263: }
                    264:
                    265: void
                    266: rio_getstats(rio_t *rp, struct redstats *sp)
                    267: {
                    268:        int     i;
                    269:
                    270:        for (i = 0; i < RIO_NDROPPREC; i++) {
                    271:                bcopy(&rp->q_stats[i], sp, sizeof(struct redstats));
                    272:                sp->q_avg = rp->rio_precstate[i].avg >> rp->rio_wshift;
                    273:                sp++;
                    274:        }
                    275: }
                    276:
                    277: #if (RIO_NDROPPREC == 3)
                    278: /*
                    279:  * internally, a drop precedence value is converted to an index
                    280:  * starting from 0.
                    281:  */
                    282: static int
                    283: dscp2index(u_int8_t dscp)
                    284: {
                    285:        int     dpindex = dscp & AF_DROPPRECMASK;
                    286:
                    287:        if (dpindex == 0)
                    288:                return (0);
                    289:        return ((dpindex >> 3) - 1);
                    290: }
                    291: #endif
                    292:
                    293: #if 1
                    294: /*
                    295:  * kludge: when a packet is dequeued, we need to know its drop precedence
                    296:  * in order to keep the queue length of each drop precedence.
                    297:  * use m_pkthdr.rcvif to pass this info.
                    298:  */
                    299: #define        RIOM_SET_PRECINDEX(m, idx)      \
                    300:        do { (m)->m_pkthdr.rcvif = (struct ifnet *)((long)(idx)); } while (0)
                    301: #define        RIOM_GET_PRECINDEX(m)   \
                    302:        ({ long idx; idx = (long)((m)->m_pkthdr.rcvif); \
                    303:        (m)->m_pkthdr.rcvif = NULL; idx; })
                    304: #endif
                    305:
                    306: int
                    307: rio_addq(rio_t *rp, class_queue_t *q, struct mbuf *m,
                    308:     struct altq_pktattr *pktattr)
                    309: {
                    310:        int                      avg, droptype;
                    311:        u_int8_t                 dsfield, odsfield;
                    312:        int                      dpindex, i, n, t;
                    313:        struct timeval           now;
                    314:        struct dropprec_state   *prec;
                    315:
                    316:        dsfield = odsfield = read_dsfield(m, pktattr);
                    317:        dpindex = dscp2index(dsfield);
                    318:
                    319:        /*
                    320:         * update avg of the precedence states whose drop precedence
                    321:         * is larger than or equal to the drop precedence of the packet
                    322:         */
                    323:        now.tv_sec = 0;
                    324:        for (i = dpindex; i < RIO_NDROPPREC; i++) {
                    325:                prec = &rp->rio_precstate[i];
                    326:                avg = prec->avg;
                    327:                if (prec->idle) {
                    328:                        prec->idle = 0;
                    329:                        if (now.tv_sec == 0)
                    330:                                microtime(&now);
                    331:                        t = (now.tv_sec - prec->last.tv_sec);
                    332:                        if (t > 60)
                    333:                                avg = 0;
                    334:                        else {
                    335:                                t = t * 1000000 +
                    336:                                        (now.tv_usec - prec->last.tv_usec);
                    337:                                n = t / rp->rio_pkttime;
                    338:                                /* calculate (avg = (1 - Wq)^n * avg) */
                    339:                                if (n > 0)
                    340:                                        avg = (avg >> FP_SHIFT) *
                    341:                                                pow_w(rp->rio_wtab, n);
                    342:                        }
                    343:                }
                    344:
                    345:                /* run estimator. (avg is scaled by WEIGHT in fixed-point) */
                    346:                avg += (prec->qlen << FP_SHIFT) - (avg >> rp->rio_wshift);
                    347:                prec->avg = avg;                /* save the new value */
                    348:                /*
                    349:                 * count keeps a tally of arriving traffic that has not
                    350:                 * been dropped.
                    351:                 */
                    352:                prec->count++;
                    353:        }
                    354:
                    355:        prec = &rp->rio_precstate[dpindex];
                    356:        avg = prec->avg;
                    357:
                    358:        /* see if we drop early */
                    359:        droptype = DTYPE_NODROP;
                    360:        if (avg >= prec->th_min_s && prec->qlen > 1) {
                    361:                if (avg >= prec->th_max_s) {
                    362:                        /* avg >= th_max: forced drop */
                    363:                        droptype = DTYPE_FORCED;
                    364:                } else if (prec->old == 0) {
                    365:                        /* first exceeds th_min */
                    366:                        prec->count = 1;
                    367:                        prec->old = 1;
                    368:                } else if (drop_early((avg - prec->th_min_s) >> rp->rio_wshift,
                    369:                                      prec->probd, prec->count)) {
                    370:                        /* unforced drop by red */
                    371:                        droptype = DTYPE_EARLY;
                    372:                }
                    373:        } else {
                    374:                /* avg < th_min */
                    375:                prec->old = 0;
                    376:        }
                    377:
                    378:        /*
                    379:         * if the queue length hits the hard limit, it's a forced drop.
                    380:         */
                    381:        if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))
                    382:                droptype = DTYPE_FORCED;
                    383:
                    384:        if (droptype != DTYPE_NODROP) {
                    385:                /* always drop incoming packet (as opposed to randomdrop) */
                    386:                for (i = dpindex; i < RIO_NDROPPREC; i++)
                    387:                        rp->rio_precstate[i].count = 0;
                    388: #ifdef RIO_STATS
                    389:                if (droptype == DTYPE_EARLY)
                    390:                        rp->q_stats[dpindex].drop_unforced++;
                    391:                else
                    392:                        rp->q_stats[dpindex].drop_forced++;
                    393:                PKTCNTR_ADD(&rp->q_stats[dpindex].drop_cnt, m_pktlen(m));
                    394: #endif
                    395:                m_freem(m);
                    396:                return (-1);
                    397:        }
                    398:
                    399:        for (i = dpindex; i < RIO_NDROPPREC; i++)
                    400:                rp->rio_precstate[i].qlen++;
                    401:
                    402:        /* save drop precedence index in mbuf hdr */
                    403:        RIOM_SET_PRECINDEX(m, dpindex);
                    404:
                    405:        if (rp->rio_flags & RIOF_CLEARDSCP)
                    406:                dsfield &= ~DSCP_MASK;
                    407:
                    408:        if (dsfield != odsfield)
                    409:                write_dsfield(m, pktattr, dsfield);
                    410:
                    411:        _addq(q, m);
                    412:
                    413: #ifdef RIO_STATS
                    414:        PKTCNTR_ADD(&rp->q_stats[dpindex].xmit_cnt, m_pktlen(m));
                    415: #endif
                    416:        return (0);
                    417: }
                    418:
                    419: struct mbuf *
                    420: rio_getq(rio_t *rp, class_queue_t *q)
                    421: {
                    422:        struct mbuf     *m;
                    423:        int              dpindex, i;
                    424:
                    425:        if ((m = _getq(q)) == NULL)
                    426:                return NULL;
                    427:
                    428:        dpindex = RIOM_GET_PRECINDEX(m);
                    429:        for (i = dpindex; i < RIO_NDROPPREC; i++) {
                    430:                if (--rp->rio_precstate[i].qlen == 0) {
                    431:                        if (rp->rio_precstate[i].idle == 0) {
                    432:                                rp->rio_precstate[i].idle = 1;
                    433:                                microtime(&rp->rio_precstate[i].last);
                    434:                        }
                    435:                }
                    436:        }
                    437:        return (m);
                    438: }

CVSweb