Annotation of sys/altq/altq_red.c, Revision 1.1.1.1
1.1 nbrk 1: /* $OpenBSD: altq_red.c,v 1.13 2007/05/28 17:16:38 henning Exp $ */
2: /* $KAME: altq_red.c,v 1.10 2002/04/03 05:38:51 kjc Exp $ */
3:
4: /*
5: * Copyright (C) 1997-2002
6: * Sony Computer Science Laboratories Inc. All rights reserved.
7: *
8: * Redistribution and use in source and binary forms, with or without
9: * modification, are permitted provided that the following conditions
10: * are met:
11: * 1. Redistributions of source code must retain the above copyright
12: * notice, this list of conditions and the following disclaimer.
13: * 2. Redistributions in binary form must reproduce the above copyright
14: * notice, this list of conditions and the following disclaimer in the
15: * documentation and/or other materials provided with the distribution.
16: *
17: * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
18: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20: * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
21: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27: * SUCH DAMAGE.
28: *
29: */
30: /*
31: * Copyright (c) 1990-1994 Regents of the University of California.
32: * All rights reserved.
33: *
34: * Redistribution and use in source and binary forms, with or without
35: * modification, are permitted provided that the following conditions
36: * are met:
37: * 1. Redistributions of source code must retain the above copyright
38: * notice, this list of conditions and the following disclaimer.
39: * 2. Redistributions in binary form must reproduce the above copyright
40: * notice, this list of conditions and the following disclaimer in the
41: * documentation and/or other materials provided with the distribution.
42: * 3. All advertising materials mentioning features or use of this software
43: * must display the following acknowledgement:
44: * This product includes software developed by the Computer Systems
45: * Engineering Group at Lawrence Berkeley Laboratory.
46: * 4. Neither the name of the University nor of the Laboratory may be used
47: * to endorse or promote products derived from this software without
48: * specific prior written permission.
49: *
50: * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
51: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
52: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
53: * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
54: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
55: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
56: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
57: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
58: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
59: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
60: * SUCH DAMAGE.
61: */
62:
63: #include <sys/param.h>
64: #include <sys/malloc.h>
65: #include <sys/mbuf.h>
66: #include <sys/socket.h>
67: #include <sys/systm.h>
68: #include <sys/errno.h>
69:
70: #include <net/if.h>
71: #include <net/if_types.h>
72:
73: #include <netinet/in.h>
74: #include <netinet/in_systm.h>
75: #include <netinet/ip.h>
76: #ifdef INET6
77: #include <netinet/ip6.h>
78: #endif
79:
80: #include <net/pfvar.h>
81: #include <altq/altq.h>
82: #include <altq/altq_red.h>
83:
84: /*
85: * ALTQ/RED (Random Early Detection) implementation using 32-bit
86: * fixed-point calculation.
87: *
88: * written by kjc using the ns code as a reference.
89: * you can learn more about red and ns from Sally's home page at
90: * http://www-nrg.ee.lbl.gov/floyd/
91: *
92: * most of the red parameter values are fixed in this implementation
93: * to prevent fixed-point overflow/underflow.
94: * if you change the parameters, watch out for overflow/underflow!
95: *
96: * the parameters used are recommended values by Sally.
97: * the corresponding ns config looks:
98: * q_weight=0.00195
99: * minthresh=5 maxthresh=15 queue-size=60
100: * linterm=30
101: * dropmech=drop-tail
102: * bytes=false (can't be handled by 32-bit fixed-point)
103: * doubleq=false dqthresh=false
104: * wait=true
105: */
106: /*
107: * alternative red parameters for a slow link.
108: *
109: * assume the queue length becomes from zero to L and keeps L, it takes
110: * N packets for q_avg to reach 63% of L.
111: * when q_weight is 0.002, N is about 500 packets.
112: * for a slow link like dial-up, 500 packets takes more than 1 minute!
113: * when q_weight is 0.008, N is about 127 packets.
114: * when q_weight is 0.016, N is about 63 packets.
115: * bursts of 50 packets are allowed for 0.002, bursts of 25 packets
116: * are allowed for 0.016.
117: * see Sally's paper for more details.
118: */
119: /* normal red parameters */
120: #define W_WEIGHT 512 /* inverse of weight of EWMA (511/512) */
121: /* q_weight = 0.00195 */
122:
123: /* red parameters for a slow link */
124: #define W_WEIGHT_1 128 /* inverse of weight of EWMA (127/128) */
125: /* q_weight = 0.0078125 */
126:
127: /* red parameters for a very slow link (e.g., dialup) */
128: #define W_WEIGHT_2 64 /* inverse of weight of EWMA (63/64) */
129: /* q_weight = 0.015625 */
130:
131: /* fixed-point uses 12-bit decimal places */
132: #define FP_SHIFT 12 /* fixed-point shift */
133:
134: /* red parameters for drop probability */
135: #define INV_P_MAX 10 /* inverse of max drop probability */
136: #define TH_MIN 5 /* min threshold */
137: #define TH_MAX 15 /* max threshold */
138:
139: #define RED_LIMIT 60 /* default max queue length */
140: #define RED_STATS /* collect statistics */
141:
142: /*
143: * our default policy for forced-drop is drop-tail.
144: * (in altq-1.1.2 or earlier, the default was random-drop.
145: * but it makes more sense to punish the cause of the surge.)
146: * to switch to the random-drop policy, define "RED_RANDOM_DROP".
147: */
148:
149: /* default red parameter values */
150: static int default_th_min = TH_MIN;
151: static int default_th_max = TH_MAX;
152: static int default_inv_pmax = INV_P_MAX;
153:
154: /*
155: * red support routines
156: */
157: red_t *
158: red_alloc(int weight, int inv_pmax, int th_min, int th_max, int flags,
159: int pkttime)
160: {
161: red_t *rp;
162: int w, i;
163: int npkts_per_sec;
164:
165: MALLOC(rp, red_t *, sizeof(red_t), M_DEVBUF, M_WAITOK);
166: if (rp == NULL)
167: return (NULL);
168: bzero(rp, sizeof(red_t));
169:
170: rp->red_avg = 0;
171: rp->red_idle = 1;
172:
173: if (weight == 0)
174: rp->red_weight = W_WEIGHT;
175: else
176: rp->red_weight = weight;
177: if (inv_pmax == 0)
178: rp->red_inv_pmax = default_inv_pmax;
179: else
180: rp->red_inv_pmax = inv_pmax;
181: if (th_min == 0)
182: rp->red_thmin = default_th_min;
183: else
184: rp->red_thmin = th_min;
185: if (th_max == 0)
186: rp->red_thmax = default_th_max;
187: else
188: rp->red_thmax = th_max;
189:
190: rp->red_flags = flags;
191:
192: if (pkttime == 0)
193: /* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */
194: rp->red_pkttime = 800;
195: else
196: rp->red_pkttime = pkttime;
197:
198: if (weight == 0) {
199: /* when the link is very slow, adjust red parameters */
200: npkts_per_sec = 1000000 / rp->red_pkttime;
201: if (npkts_per_sec < 50) {
202: /* up to about 400Kbps */
203: rp->red_weight = W_WEIGHT_2;
204: } else if (npkts_per_sec < 300) {
205: /* up to about 2.4Mbps */
206: rp->red_weight = W_WEIGHT_1;
207: }
208: }
209:
210: /* calculate wshift. weight must be power of 2 */
211: w = rp->red_weight;
212: for (i = 0; w > 1; i++)
213: w = w >> 1;
214: rp->red_wshift = i;
215: w = 1 << rp->red_wshift;
216: if (w != rp->red_weight) {
217: printf("invalid weight value %d for red! use %d\n",
218: rp->red_weight, w);
219: rp->red_weight = w;
220: }
221:
222: /*
223: * thmin_s and thmax_s are scaled versions of th_min and th_max
224: * to be compared with avg.
225: */
226: rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT);
227: rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT);
228:
229: /*
230: * precompute probability denominator
231: * probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point
232: */
233: rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin)
234: * rp->red_inv_pmax) << FP_SHIFT;
235:
236: /* allocate weight table */
237: rp->red_wtab = wtab_alloc(rp->red_weight);
238:
239: microtime(&rp->red_last);
240: return (rp);
241: }
242:
243: void
244: red_destroy(red_t *rp)
245: {
246: wtab_destroy(rp->red_wtab);
247: FREE(rp, M_DEVBUF);
248: }
249:
250: void
251: red_getstats(red_t *rp, struct redstats *sp)
252: {
253: sp->q_avg = rp->red_avg >> rp->red_wshift;
254: sp->xmit_cnt = rp->red_stats.xmit_cnt;
255: sp->drop_cnt = rp->red_stats.drop_cnt;
256: sp->drop_forced = rp->red_stats.drop_forced;
257: sp->drop_unforced = rp->red_stats.drop_unforced;
258: sp->marked_packets = rp->red_stats.marked_packets;
259: }
260:
261: int
262: red_addq(red_t *rp, class_queue_t *q, struct mbuf *m,
263: struct altq_pktattr *pktattr)
264: {
265: int avg, droptype;
266: int n;
267:
268: avg = rp->red_avg;
269:
270: /*
271: * if we were idle, we pretend that n packets arrived during
272: * the idle period.
273: */
274: if (rp->red_idle) {
275: struct timeval now;
276: int t;
277:
278: rp->red_idle = 0;
279: microtime(&now);
280: t = (now.tv_sec - rp->red_last.tv_sec);
281: if (t > 60) {
282: /*
283: * being idle for more than 1 minute, set avg to zero.
284: * this prevents t from overflow.
285: */
286: avg = 0;
287: } else {
288: t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec);
289: n = t / rp->red_pkttime - 1;
290:
291: /* the following line does (avg = (1 - Wq)^n * avg) */
292: if (n > 0)
293: avg = (avg >> FP_SHIFT) *
294: pow_w(rp->red_wtab, n);
295: }
296: }
297:
298: /* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */
299: avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift);
300: rp->red_avg = avg; /* save the new value */
301:
302: /*
303: * red_count keeps a tally of arriving traffic that has not
304: * been dropped.
305: */
306: rp->red_count++;
307:
308: /* see if we drop early */
309: droptype = DTYPE_NODROP;
310: if (avg >= rp->red_thmin_s && qlen(q) > 1) {
311: if (avg >= rp->red_thmax_s) {
312: /* avg >= th_max: forced drop */
313: droptype = DTYPE_FORCED;
314: } else if (rp->red_old == 0) {
315: /* first exceeds th_min */
316: rp->red_count = 1;
317: rp->red_old = 1;
318: } else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift,
319: rp->red_probd, rp->red_count)) {
320: /* mark or drop by red */
321: if ((rp->red_flags & REDF_ECN) &&
322: mark_ecn(m, pktattr, rp->red_flags)) {
323: /* successfully marked. do not drop. */
324: rp->red_count = 0;
325: #ifdef RED_STATS
326: rp->red_stats.marked_packets++;
327: #endif
328: } else {
329: /* unforced drop by red */
330: droptype = DTYPE_EARLY;
331: }
332: }
333: } else {
334: /* avg < th_min */
335: rp->red_old = 0;
336: }
337:
338: /*
339: * if the queue length hits the hard limit, it's a forced drop.
340: */
341: if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q))
342: droptype = DTYPE_FORCED;
343:
344: #ifdef RED_RANDOM_DROP
345: /* if successful or forced drop, enqueue this packet. */
346: if (droptype != DTYPE_EARLY)
347: _addq(q, m);
348: #else
349: /* if successful, enqueue this packet. */
350: if (droptype == DTYPE_NODROP)
351: _addq(q, m);
352: #endif
353: if (droptype != DTYPE_NODROP) {
354: if (droptype == DTYPE_EARLY) {
355: /* drop the incoming packet */
356: #ifdef RED_STATS
357: rp->red_stats.drop_unforced++;
358: #endif
359: } else {
360: /* forced drop, select a victim packet in the queue. */
361: #ifdef RED_RANDOM_DROP
362: m = _getq_random(q);
363: #endif
364: #ifdef RED_STATS
365: rp->red_stats.drop_forced++;
366: #endif
367: }
368: #ifdef RED_STATS
369: PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m));
370: #endif
371: rp->red_count = 0;
372: m_freem(m);
373: return (-1);
374: }
375: /* successfully queued */
376: #ifdef RED_STATS
377: PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m));
378: #endif
379: return (0);
380: }
381:
382: /*
383: * early-drop probability is calculated as follows:
384: * prob = p_max * (avg - th_min) / (th_max - th_min)
385: * prob_a = prob / (2 - count*prob)
386: * = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min))
387: * here prob_a increases as successive undrop count increases.
388: * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)),
389: * becomes 1 when (count >= (2 / prob))).
390: */
391: int
392: drop_early(int fp_len, int fp_probd, int count)
393: {
394: int d; /* denominator of drop-probability */
395:
396: d = fp_probd - count * fp_len;
397: if (d <= 0)
398: /* count exceeds the hard limit: drop or mark */
399: return (1);
400:
401: /*
402: * now the range of d is [1..600] in fixed-point. (when
403: * th_max-th_min=10 and p_max=1/30)
404: * drop probability = (avg - TH_MIN) / d
405: */
406:
407: if ((random() % d) < fp_len) {
408: /* drop or mark */
409: return (1);
410: }
411: /* no drop/mark */
412: return (0);
413: }
414:
415: /*
416: * try to mark CE bit to the packet.
417: * returns 1 if successfully marked, 0 otherwise.
418: */
419: int
420: mark_ecn(struct mbuf *m, struct altq_pktattr *pktattr, int flags)
421: {
422: struct mbuf *m0;
423: void *hdr;
424:
425: hdr = m->m_pkthdr.pf.hdr;
426:
427: /* verify that pattr_hdr is within the mbuf data */
428: for (m0 = m; m0 != NULL; m0 = m0->m_next)
429: if (((caddr_t)(hdr) >= m0->m_data) &&
430: ((caddr_t)(hdr) < m0->m_data + m0->m_len))
431: break;
432: if (m0 == NULL) {
433: /* ick, tag info is stale */
434: return (0);
435: }
436:
437: switch (((struct ip *)hdr)->ip_v) {
438: case 4:
439: if (flags & REDF_ECN4) {
440: struct ip *ip = hdr;
441: u_int8_t otos;
442: int sum;
443:
444: if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT)
445: return (0); /* not-ECT */
446: if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
447: return (1); /* already marked */
448:
449: /*
450: * ecn-capable but not marked,
451: * mark CE and update checksum
452: */
453: otos = ip->ip_tos;
454: ip->ip_tos |= IPTOS_ECN_CE;
455: /*
456: * update checksum (from RFC1624)
457: * HC' = ~(~HC + ~m + m')
458: */
459: sum = ~ntohs(ip->ip_sum) & 0xffff;
460: sum += (~otos & 0xffff) + ip->ip_tos;
461: sum = (sum >> 16) + (sum & 0xffff);
462: sum += (sum >> 16); /* add carry */
463: ip->ip_sum = htons(~sum & 0xffff);
464: return (1);
465: }
466: break;
467: #ifdef INET6
468: case 6:
469: if (flags & REDF_ECN6) {
470: struct ip6_hdr *ip6 = hdr;
471: u_int32_t flowlabel;
472:
473: flowlabel = ntohl(ip6->ip6_flow);
474: if ((flowlabel >> 28) != 6)
475: return (0); /* version mismatch! */
476: if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
477: (IPTOS_ECN_NOTECT << 20))
478: return (0); /* not-ECT */
479: if ((flowlabel & (IPTOS_ECN_MASK << 20)) ==
480: (IPTOS_ECN_CE << 20))
481: return (1); /* already marked */
482: /*
483: * ecn-capable but not marked, mark CE
484: */
485: flowlabel |= (IPTOS_ECN_CE << 20);
486: ip6->ip6_flow = htonl(flowlabel);
487: return (1);
488: }
489: break;
490: #endif /* INET6 */
491: }
492:
493: /* not marked */
494: return (0);
495: }
496:
497: struct mbuf *
498: red_getq(rp, q)
499: red_t *rp;
500: class_queue_t *q;
501: {
502: struct mbuf *m;
503:
504: if ((m = _getq(q)) == NULL) {
505: if (rp->red_idle == 0) {
506: rp->red_idle = 1;
507: microtime(&rp->red_last);
508: }
509: return NULL;
510: }
511:
512: rp->red_idle = 0;
513: return (m);
514: }
515:
516: /*
517: * helper routine to calibrate avg during idle.
518: * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point
519: * here Wq = 1/weight and the code assumes Wq is close to zero.
520: *
521: * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point.
522: */
523: static struct wtab *wtab_list = NULL; /* pointer to wtab list */
524:
525: struct wtab *
526: wtab_alloc(int weight)
527: {
528: struct wtab *w;
529: int i;
530:
531: for (w = wtab_list; w != NULL; w = w->w_next)
532: if (w->w_weight == weight) {
533: w->w_refcount++;
534: return (w);
535: }
536:
537: MALLOC(w, struct wtab *, sizeof(struct wtab), M_DEVBUF, M_WAITOK);
538: if (w == NULL)
539: panic("wtab_alloc: malloc failed!");
540: bzero(w, sizeof(struct wtab));
541: w->w_weight = weight;
542: w->w_refcount = 1;
543: w->w_next = wtab_list;
544: wtab_list = w;
545:
546: /* initialize the weight table */
547: w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight;
548: for (i = 1; i < 32; i++) {
549: w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT;
550: if (w->w_tab[i] == 0 && w->w_param_max == 0)
551: w->w_param_max = 1 << i;
552: }
553:
554: return (w);
555: }
556:
557: int
558: wtab_destroy(struct wtab *w)
559: {
560: struct wtab *prev;
561:
562: if (--w->w_refcount > 0)
563: return (0);
564:
565: if (wtab_list == w)
566: wtab_list = w->w_next;
567: else for (prev = wtab_list; prev->w_next != NULL; prev = prev->w_next)
568: if (prev->w_next == w) {
569: prev->w_next = w->w_next;
570: break;
571: }
572:
573: FREE(w, M_DEVBUF);
574: return (0);
575: }
576:
577: int32_t
578: pow_w(struct wtab *w, int n)
579: {
580: int i, bit;
581: int32_t val;
582:
583: if (n >= w->w_param_max)
584: return (0);
585:
586: val = 1 << FP_SHIFT;
587: if (n <= 0)
588: return (val);
589:
590: bit = 1;
591: i = 0;
592: while (n) {
593: if (n & bit) {
594: val = (val * w->w_tab[i]) >> FP_SHIFT;
595: n &= ~bit;
596: }
597: i++;
598: bit <<= 1;
599: }
600: return (val);
601: }
CVSweb