Annotation of sys/kern/vfs_bio.c, Revision 1.1.1.1
/*	$OpenBSD: vfs_bio.c,v 1.99 2007/08/07 04:32:45 beck Exp $	*/
/*	$NetBSD: vfs_bio.c,v 1.44 1996/06/11 11:15:36 pk Exp $	*/

/*-
 * Copyright (c) 1994 Christopher G. Demetriou
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 * (c) UNIX System Laboratories, Inc.
 * All or some portions of this file are derived from material licensed
 * to the University of California by American Telephone and Telegraph
 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
 * the permission of UNIX System Laboratories, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)vfs_bio.c	8.6 (Berkeley) 1/11/94
 */

/*
 * Some references:
 *	Bach: The Design of the UNIX Operating System (Prentice Hall, 1986)
 *	Leffler, et al.: The Design and Implementation of the 4.3BSD
 *		UNIX Operating System (Addison-Wesley, 1989)
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/malloc.h>
#include <sys/pool.h>
#include <sys/resourcevar.h>
#include <sys/conf.h>
#include <sys/kernel.h>

#include <uvm/uvm_extern.h>

#include <miscfs/specfs/specdev.h>

/*
 * Definitions for the buffer hash lists.
 */
#define BUFHASH(dvp, lbn) \
        (&bufhashtbl[((long)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
u_long bufhash;

/*
 * Insq/Remq for the buffer hash lists.
 */
#define binshash(bp, dp)	LIST_INSERT_HEAD(dp, bp, b_hash)
#define bremhash(bp)		LIST_REMOVE(bp, b_hash)

/*
 * Definitions for the buffer free lists.
 */
#define BQUEUES		6	/* number of free buffer queues */

#define BQ_DIRTY	0	/* LRU queue with dirty buffers */

TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
int bqpages[BQUEUES];		/* pages allocated, per queue */
int bqpagelow;
int needbuffer;
struct bio_ops bioops;

/*
 * Buffer pool for I/O buffers.
 */
struct pool bufpool;
struct vm_map *buf_map;
struct bufhead bufhead = LIST_HEAD_INITIALIZER(bufhead);
struct buf *buf_get(size_t);
struct buf *buf_stub(struct vnode *, daddr64_t);
void buf_put(struct buf *);

/*
 * Insq/Remq for the buffer free lists.
 */
#define binsheadfree(bp, dp)	TAILQ_INSERT_HEAD(dp, bp, b_freelist)
#define binstailfree(bp, dp)	TAILQ_INSERT_TAIL(dp, bp, b_freelist)

struct buf *bio_doread(struct vnode *, daddr64_t, int, int);
struct buf *getnewbuf(size_t, int, int, int *);
void buf_init(struct buf *, int);
void bread_cluster_callback(struct buf *);

/*
 * We keep a few counters to monitor the utilization of the buffer cache:
 *
 *	numbufpages   - number of pages totally allocated.
 *	numdirtypages - number of pages on BQ_DIRTY queue.
 *	lodirtypages  - low water mark for buffer cleaning daemon.
 *	hidirtypages  - high water mark for buffer cleaning daemon.
 *	numfreepages  - number of pages on BQ_CLEAN and BQ_DIRTY queues. unused.
 *	numcleanpages - number of pages on BQ_CLEAN queue.
 *			Used to track the need to speed up the cleaner and
 *			as a reserve for special processes like the syncer.
 *	maxcleanpages - the highest page count on BQ_CLEAN.
 */
long numbufpages;
long numdirtypages;
long lodirtypages;
long hidirtypages;
long numfreepages;
long numcleanpages;
long locleanpages;
long hicleanpages;
long maxcleanpages;

struct proc *cleanerproc;
int bd_req;			/* Sleep point for cleaner daemon. */

int size2cqueue(int *size);

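/*
 * Round *size up to a power of two (at least PAGE_SIZE) and return the
 * index of the clean queue holding buffers of that size.  Queue 0 is
 * BQ_DIRTY; clean queues start at 1, which holds PAGE_SIZE buffers.
 */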
int
size2cqueue(int *size)
{
        int i = 0, q;
        int s = *size;

        s -= 1;
        while (s > 0) {
                s = s >> 1;
                i++;
        }
        if (i < PAGE_SHIFT) {
                i = PAGE_SHIFT;			/* < 4096 -> 4096 */
        }
        *size = 1 << i;
        q = (i + 1 - PAGE_SHIFT);		/* XXX 4096 is queue 1 */
        if (q >= BQUEUES)
                panic("queue %d > BQUEUES %d", q, BQUEUES);
        if (q == 0)
                panic("can't return dirty q");
        return (q);
}

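/*
 * Remove a buffer from the free list it is on, and deduct its pages
 * from the free/clean/dirty page counters.
 */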
void
bremfree(struct buf *bp)
{
        struct bqueues *dp = NULL;
        int queue;

        /*
         * We only calculate the head of the freelist when removing
         * the last element of the list as that is the only time that
         * it is needed (e.g. to reset the tail pointer).
         *
         * NB: This makes an assumption about how tailq's are implemented.
         */
        if (TAILQ_NEXT(bp, b_freelist) == NULL) {
                for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
                        if (dp->tqh_last == &TAILQ_NEXT(bp, b_freelist))
                                break;
                if (dp == &bufqueues[BQUEUES])
                        panic("bremfree: lost tail");
        }
        numfreepages -= btoc(bp->b_bufsize);
        if (!ISSET(bp->b_flags, B_DELWRI)) {
                int qs = bp->b_bufsize;

                queue = size2cqueue(&qs);
                numcleanpages -= btoc(bp->b_bufsize);
                bqpages[queue] -= btoc(bp->b_bufsize);
        } else
                numdirtypages -= btoc(bp->b_bufsize);
        TAILQ_REMOVE(dp, bp, b_freelist);
}

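/*
 * Initialize a freshly allocated buffer of the given size: zero the
 * structure, set up its list links and sync time, and credit its pages
 * to the global and per-queue page counters.
 */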
void
buf_init(struct buf *bp, int size)
{
        int npages, queue;

        splassert(IPL_BIO);

        npages = btoc(size);
        bzero((char *)bp, sizeof *bp);
        bp->b_vnbufs.le_next = NOLIST;
        bp->b_freelist.tqe_next = NOLIST;
        bp->b_synctime = time_uptime + 300;
        bp->b_dev = NODEV;
        queue = size2cqueue(&size);
        LIST_INIT(&bp->b_dep);
        numbufpages += npages;
        numfreepages += npages;
        numcleanpages += npages;
        bqpages[queue] += npages;
        if (maxcleanpages < numcleanpages)
                maxcleanpages = numcleanpages;
}

/*
 * This is a non-sleeping expanded equivalent of getblk() that allocates
 * only the buffer structure, and not its contents.
 */
struct buf *
buf_stub(struct vnode *vp, daddr64_t lblkno)
{
        struct buf *bp;
        int s;

        s = splbio();
        bp = pool_get(&bufpool, PR_NOWAIT);
        splx(s);

        if (bp == NULL)
                return (NULL);

        bzero((char *)bp, sizeof *bp);
        bp->b_vnbufs.le_next = NOLIST;
        bp->b_freelist.tqe_next = NOLIST;
        bp->b_synctime = time_uptime + 300;
        bp->b_dev = NODEV;
        bp->b_bufsize = 0;
        bp->b_data = NULL;
        bp->b_flags = B_BUSY;
        bp->b_blkno = bp->b_lblkno = lblkno;
        bp->b_iodone = NULL;
        bp->b_error = 0;
        bp->b_resid = 0;
        bp->b_bcount = 0;
        bp->b_dirtyoff = bp->b_dirtyend = 0;
        bp->b_validoff = bp->b_validend = 0;

        LIST_INIT(&bp->b_dep);

        s = splbio();
        LIST_INSERT_HEAD(&bufhead, bp, b_list);
        bgetvp(vp, bp);
        splx(s);

        return (bp);
}

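/*
 * Allocate a new buffer backed by fresh pages from buf_map, and place
 * it on the appropriate clean queue and the invalid hash.  Returns NULL
 * if the cache is already at its bufpages limit or buf_map is out of
 * space.
 */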
struct buf *
buf_get(size_t size)
{
        struct bqueues *dp;
        struct buf *bp;
        int npages;
        int queue, qs;
        void *data;

        splassert(IPL_BIO);

        KASSERT(size > 0);

        size = round_page(size);
        qs = size;
        queue = size2cqueue(&qs);
        npages = btoc(qs);

        if (numbufpages + npages > bufpages)
                return (NULL);

        bp = pool_get(&bufpool, PR_WAITOK);

        data = (void *)uvm_km_alloc(buf_map, qs);
        if (data == NULL) {
                pool_put(&bufpool, bp);
                return (NULL);
        }
        buf_init(bp, qs);
        bp->b_flags = B_INVAL;
        bp->b_bufsize = qs;
        bp->b_data = data;
        dp = &bufqueues[queue];
        binsheadfree(bp, dp);
        binshash(bp, &invalhash);
        LIST_INSERT_HEAD(&bufhead, bp, b_list);

        return (bp);
}

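/*
 * Free a buffer entirely: release its pages (if any) back to buf_map
 * and unhash it, take it off the buffer list, and return the struct buf
 * to the pool.
 */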
void
buf_put(struct buf *bp)
{
        splassert(IPL_BIO);
#ifdef DIAGNOSTIC
        if (bp->b_data != NULL)
                KASSERT(bp->b_bufsize > 0);
#endif
#ifdef QUEUE_MACRO_DEBUG
        if (bp->b_freelist.tqe_next != NOLIST &&
            bp->b_freelist.tqe_next != (void *)-1)
                panic("buf_put: still on the free list");

        if (bp->b_vnbufs.le_next != NOLIST &&
            bp->b_vnbufs.le_next != (void *)-1)
                panic("buf_put: still on the vnode list");
#endif
#ifdef DIAGNOSTIC
        if (!LIST_EMPTY(&bp->b_dep))
                panic("buf_put: b_dep is not empty");
#endif
        LIST_REMOVE(bp, b_list);

        if (bp->b_data != NULL) {
                bremhash(bp);
                numbufpages -= btoc(bp->b_bufsize);
                uvm_km_free(buf_map, (vaddr_t)bp->b_data, bp->b_bufsize);
        }

        pool_put(&bufpool, bp);
}

/*
 * Initialize buffers and hash links for buffers.
 */
void
bufinit(void)
{
        vaddr_t minaddr, maxaddr;
        struct bqueues *dp;

        pool_init(&bufpool, sizeof(struct buf), 0, 0, 0, "bufpl", NULL);
        pool_setipl(&bufpool, IPL_BIO);
        for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
                TAILQ_INIT(dp);
        minaddr = vm_map_min(kernel_map);
        buf_map = uvm_km_suballoc(kernel_map, &minaddr, &maxaddr,
            ptoa(bufpages), 0, FALSE, NULL);

        /*
         * XXX don't starve any one queue below 5% of the total number
         * of buffer cache pages.
         */
        bqpagelow = bufpages / 20;

        bufhashtbl = hashinit(bufpages / 4, M_CACHE, M_WAITOK, &bufhash);
        hidirtypages = (bufpages / 4) * 3;
        lodirtypages = bufpages / 2;

        /*
         * Reserve 5% of bufpages for the syncer's needs, but not more
         * than 25% and if possible not less than 2 * MAXBSIZE.  The
         * locleanpages value must not be too small.
         */
        hicleanpages = bufpages / 2;
        locleanpages = hicleanpages / 2;
        if (locleanpages < btoc(2 * MAXBSIZE))
                locleanpages = btoc(2 * MAXBSIZE);
        if (locleanpages > bufpages / 4)
                locleanpages = bufpages / 4;

        maxcleanpages = locleanpages;
}

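/*
 * Common code for bread() and breadn(): look the block up with getblk()
 * and start a (possibly asynchronous) read if it does not already hold
 * valid data.
 */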
struct buf *
bio_doread(struct vnode *vp, daddr64_t blkno, int size, int async)
{
        struct buf *bp;

        bp = getblk(vp, blkno, size, 0, 0);

        /*
         * If buffer does not have valid data, start a read.
         * Note that if buffer is B_INVAL, getblk() won't return it.
         * Therefore, it's valid if its I/O has completed or been delayed.
         */
        if (!ISSET(bp->b_flags, (B_DONE | B_DELWRI))) {
                SET(bp->b_flags, B_READ | async);
                VOP_STRATEGY(bp);

                /* Pay for the read. */
                curproc->p_stats->p_ru.ru_inblock++;	/* XXX */
        } else if (async) {
                brelse(bp);
        }

        return (bp);
}

/*
 * Read a disk block.
 * This algorithm is described in Bach (p.54).
 */
int
bread(struct vnode *vp, daddr64_t blkno, int size, struct ucred *cred,
    struct buf **bpp)
{
        struct buf *bp;

        /* Get buffer for block. */
        bp = *bpp = bio_doread(vp, blkno, size, 0);

        /* Wait for the read to complete, and return result. */
        return (biowait(bp));
}

/*
 * Read-ahead multiple disk blocks.  The first is sync, the rest async.
 * Trivial modification to the breada algorithm presented in Bach (p.55).
 */
int
breadn(struct vnode *vp, daddr64_t blkno, int size, daddr64_t rablks[],
    int rasizes[], int nrablks, struct ucred *cred, struct buf **bpp)
{
        struct buf *bp;
        int i;

        bp = *bpp = bio_doread(vp, blkno, size, 0);

        /*
         * For each of the read-ahead blocks, start a read, if necessary.
         */
        for (i = 0; i < nrablks; i++) {
                /* If it's in the cache, just go on to next one. */
                if (incore(vp, rablks[i]))
                        continue;

                /* Get a buffer for the read-ahead block */
                (void) bio_doread(vp, rablks[i], rasizes[i], B_ASYNC);
        }

        /* Otherwise, we had to start a read for it; wait until it's valid. */
        return (biowait(bp));
}

/*
 * Called from interrupt context.
 */
void
bread_cluster_callback(struct buf *bp)
{
        int i;
        struct buf **xbpp;

        xbpp = (struct buf **)bp->b_saveaddr;

        for (i = 0; xbpp[i] != 0; i++) {
                if (ISSET(bp->b_flags, B_ERROR))
                        SET(xbpp[i]->b_flags, B_INVAL | B_ERROR);
                biodone(xbpp[i]);
        }

        free(xbpp, M_TEMP);
        bp->b_data = NULL;
        buf_put(bp);
}

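/*
 * Read one block synchronously and, if the file layout permits, start a
 * single large asynchronous read covering the blocks that follow it.
 * Each read-ahead block is represented by a stub buffer (see buf_stub())
 * that shares the big buffer's pages; bread_cluster_callback() finishes
 * them when the cluster I/O completes.
 */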
int
bread_cluster(struct vnode *vp, daddr64_t blkno, int size, struct buf **rbpp)
{
        struct buf *bp, **xbpp;
        int howmany, i, maxra, inc;
        daddr64_t sblkno;
        size_t spill;

        *rbpp = bio_doread(vp, blkno, size, 0);

        if (size != round_page(size))
                return (biowait(*rbpp));

        if (VOP_BMAP(vp, blkno + 1, NULL, &sblkno, &maxra))
                return (biowait(*rbpp));

        maxra++;
        if (sblkno == -1 || maxra < 2)
                return (biowait(*rbpp));

        howmany = MAXPHYS / size;
        if (howmany > maxra)
                howmany = maxra;

        xbpp = malloc((howmany + 1) * sizeof(struct buf *), M_TEMP, M_NOWAIT);
        if (xbpp == NULL)
                return (biowait(*rbpp));

        for (i = 0; i < howmany; i++) {
                if (incore(vp, blkno + i + 1)) {
                        for (--i; i >= 0; i--) {
                                SET(xbpp[i]->b_flags, B_INVAL);
                                brelse(xbpp[i]);
                        }
                        free(xbpp, M_TEMP);
                        return (biowait(*rbpp));
                }
                xbpp[i] = buf_stub(vp, blkno + i + 1);
                if (xbpp[i] == NULL) {
                        for (--i; i >= 0; i--) {
                                SET(xbpp[i]->b_flags, B_INVAL);
                                brelse(xbpp[i]);
                        }
                        free(xbpp, M_TEMP);
                        return (biowait(*rbpp));
                }
        }

        xbpp[howmany] = 0;

        bp = getnewbuf(howmany * size, 0, 0, NULL);
        if (bp == NULL) {
                for (i = 0; i < howmany; i++) {
                        SET(xbpp[i]->b_flags, B_INVAL);
                        brelse(xbpp[i]);
                }
                free(xbpp, M_TEMP);
                return (biowait(*rbpp));
        }

        inc = btodb(size);

        for (i = 0; i < howmany; i++) {
                SET(xbpp[i]->b_flags, B_READ | B_ASYNC);
                binshash(xbpp[i], BUFHASH(vp, xbpp[i]->b_lblkno));
                xbpp[i]->b_blkno = sblkno + (i * inc);
                xbpp[i]->b_bufsize = xbpp[i]->b_bcount = size;
                xbpp[i]->b_data = bp->b_data + (i * size);
        }

        bp->b_blkno = sblkno;
        bp->b_lblkno = blkno + 1;
        SET(bp->b_flags, B_READ | B_ASYNC | B_CALL);
        bp->b_saveaddr = (void *)xbpp;
        bp->b_iodone = bread_cluster_callback;
        bp->b_vp = vp;
        spill = bp->b_bufsize - bp->b_bcount;
        if (spill) {
                uvm_km_free(buf_map, (vaddr_t)bp->b_data + bp->b_bcount,
                    spill);
                numbufpages -= atop(spill);
        }
        VOP_STRATEGY(bp);
        curproc->p_stats->p_ru.ru_inblock++;

        return (biowait(*rbpp));
}

/*
 * Block write.  Described in Bach (p.56).
 */
int
bwrite(struct buf *bp)
{
        int rv, async, wasdelayed, s;
        struct vnode *vp;
        struct mount *mp;

        vp = bp->b_vp;
        if (vp != NULL)
                mp = vp->v_type == VBLK ? vp->v_specmountpoint : vp->v_mount;
        else
                mp = NULL;

        /*
         * Remember buffer type, to switch on it later.  If the write was
         * synchronous, but the file system was mounted with MNT_ASYNC,
         * convert it to a delayed write.
         * XXX note that this relies on delayed tape writes being converted
         * to async, not sync writes (which is safe, but ugly).
         */
        async = ISSET(bp->b_flags, B_ASYNC);
        if (!async && mp && ISSET(mp->mnt_flag, MNT_ASYNC)) {
                bdwrite(bp);
                return (0);
        }

        /*
         * Collect statistics on synchronous and asynchronous writes.
         * Writes to block devices are charged to their associated
         * filesystem (if any).
         */
        if (mp != NULL) {
                if (async)
                        mp->mnt_stat.f_asyncwrites++;
                else
                        mp->mnt_stat.f_syncwrites++;
        }

        wasdelayed = ISSET(bp->b_flags, B_DELWRI);
        CLR(bp->b_flags, (B_READ | B_DONE | B_ERROR | B_DELWRI));

        s = splbio();

        /*
         * If not synchronous, pay for the I/O operation and make
         * sure the buf is on the correct vnode queue.  We have
         * to do this now, because if we don't, the vnode may not
         * be properly notified that its I/O has completed.
         */
        if (wasdelayed) {
                reassignbuf(bp);
        } else
                curproc->p_stats->p_ru.ru_oublock++;

        /* Initiate disk write.  Make sure the appropriate party is charged. */
        bp->b_vp->v_numoutput++;
        splx(s);
        SET(bp->b_flags, B_WRITEINPROG);
        VOP_STRATEGY(bp);

        if (async)
                return (0);

        /*
         * If I/O was synchronous, wait for it to complete.
         */
        rv = biowait(bp);

        /* Release the buffer. */
        brelse(bp);

        return (rv);
}

/*
 * Delayed write.
 *
 * The buffer is marked dirty, but is not queued for I/O.
 * This routine should be used when the buffer is expected
 * to be modified again soon, typically a small write that
 * partially fills a buffer.
 *
 * NB: magnetic tapes cannot be delayed; they must be
 * written in the order that the writes are requested.
 *
 * Described in Leffler, et al. (pp. 208-213).
 */
void
bdwrite(struct buf *bp)
{
        int s;

        /*
         * If the block hasn't been seen before:
         *	(1) Mark it as having been seen,
         *	(2) Charge for the write,
         *	(3) Make sure it's on its vnode's correct block list,
         *	(4) If a buffer is rewritten, move it to end of dirty list.
         */
        if (!ISSET(bp->b_flags, B_DELWRI)) {
                SET(bp->b_flags, B_DELWRI);
                bp->b_synctime = time_uptime + 35;
                s = splbio();
                reassignbuf(bp);
                splx(s);
                curproc->p_stats->p_ru.ru_oublock++;	/* XXX */
        } else {
                /*
                 * See if this buffer has slacked through the syncer,
                 * and enforce an async write upon it.
                 */
                if (bp->b_synctime < time_uptime) {
                        bawrite(bp);
                        return;
                }
        }

        /* If this is a tape block, write the block now. */
        if (major(bp->b_dev) < nblkdev &&
            bdevsw[major(bp->b_dev)].d_type == D_TAPE) {
                bawrite(bp);
                return;
        }

        /* Otherwise, the "write" is done, so mark and release the buffer. */
        CLR(bp->b_flags, B_NEEDCOMMIT);
        SET(bp->b_flags, B_DONE);
        brelse(bp);
}

/*
 * Asynchronous block write; just an asynchronous bwrite().
 */
void
bawrite(struct buf *bp)
{
        SET(bp->b_flags, B_ASYNC);
        VOP_BWRITE(bp);
}

/*
 * Must be called at splbio()
 */
void
buf_dirty(struct buf *bp)
{
        splassert(IPL_BIO);

        if (ISSET(bp->b_flags, B_DELWRI) == 0) {
                SET(bp->b_flags, B_DELWRI);
                bp->b_synctime = time_uptime + 35;
                reassignbuf(bp);
        }
}

/*
 * Must be called at splbio()
 */
void
buf_undirty(struct buf *bp)
{
        splassert(IPL_BIO);

        if (ISSET(bp->b_flags, B_DELWRI)) {
                CLR(bp->b_flags, B_DELWRI);
                reassignbuf(bp);
        }
}

/*
 * Release a buffer on to the free lists.
 * Described in Bach (p. 46).
 */
void
brelse(struct buf *bp)
{
        struct bqueues *bufq;
        int s;

        /* Block disk interrupts. */
        s = splbio();

        if (bp->b_data != NULL)
                KASSERT(bp->b_bufsize > 0);

        /*
         * Determine which queue the buffer should be on, then put it there.
         */

        /* If it's not cacheable, or an error, mark it invalid. */
        if (ISSET(bp->b_flags, (B_NOCACHE|B_ERROR)))
                SET(bp->b_flags, B_INVAL);

        if (ISSET(bp->b_flags, B_INVAL)) {
                int queue, qs;

                /*
                 * If the buffer is invalid, place it in the clean queue, so it
                 * can be reused.
                 */
                if (LIST_FIRST(&bp->b_dep) != NULL)
                        buf_deallocate(bp);

                if (ISSET(bp->b_flags, B_DELWRI)) {
                        CLR(bp->b_flags, B_DELWRI);
                }

                if (bp->b_vp)
                        brelvp(bp);

                /*
                 * If the buffer has no associated data, place it back in the
                 * pool.
                 */
                if (bp->b_data == NULL) {
                        buf_put(bp);
                        splx(s);
                        return;
                }

                qs = bp->b_bufsize;
                queue = size2cqueue(&qs);
                numcleanpages += btoc(bp->b_bufsize);
                bqpages[queue] += btoc(bp->b_bufsize);
                if (maxcleanpages < numcleanpages)
                        maxcleanpages = numcleanpages;
                binsheadfree(bp, &bufqueues[queue]);
        } else {
                /*
                 * It has valid data.  Put it on the end of the appropriate
                 * queue, so that it'll stick around for as long as possible.
                 */
                int queue, qs;

                numfreepages += btoc(bp->b_bufsize);
                qs = bp->b_bufsize;
                queue = size2cqueue(&qs);

                if (!ISSET(bp->b_flags, B_DELWRI)) {
                        numcleanpages += btoc(bp->b_bufsize);
                        bqpages[queue] += btoc(bp->b_bufsize);
                        if (maxcleanpages < numcleanpages)
                                maxcleanpages = numcleanpages;
                        bufq = &bufqueues[queue];
                } else {
                        numdirtypages += btoc(bp->b_bufsize);
                        bufq = &bufqueues[BQ_DIRTY];
                }
                if (ISSET(bp->b_flags, B_AGE)) {
                        binsheadfree(bp, bufq);
                        bp->b_synctime = time_uptime + 30;
                } else {
                        binstailfree(bp, bufq);
                        bp->b_synctime = time_uptime + 300;
                }
        }

        /* Unlock the buffer. */
        CLR(bp->b_flags, (B_AGE | B_ASYNC | B_BUSY | B_NOCACHE | B_DEFERRED));

        /* Wake up any processes waiting for any buffer to become free. */
        if (needbuffer) {
                needbuffer--;
                wakeup_one(&needbuffer);
        }

        /* Wake up any processes waiting for _this_ buffer to become free. */
        if (ISSET(bp->b_flags, B_WANTED)) {
                CLR(bp->b_flags, B_WANTED);
                wakeup(bp);
        }

        splx(s);
}

/*
 * Determine if a block is in the cache.  Just look on what would be its hash
 * chain.  If it's there, return a pointer to it, unless it's marked invalid.
 */
struct buf *
incore(struct vnode *vp, daddr64_t blkno)
{
        struct buf *bp;

        /* Search hash chain */
        LIST_FOREACH(bp, BUFHASH(vp, blkno), b_hash) {
                if (bp->b_lblkno == blkno && bp->b_vp == vp &&
                    !ISSET(bp->b_flags, B_INVAL))
                        return (bp);
        }

        return (NULL);
}

/*
 * Get a block of requested size that is associated with
 * a given vnode and block offset.  If it is found in the
 * block cache, mark it as having been found, make it busy
 * and return it.  Otherwise, return an empty block of the
 * correct size.  It is up to the caller to ensure that the
 * cached blocks are of the correct size.
 */
struct buf *
getblk(struct vnode *vp, daddr64_t blkno, int size, int slpflag, int slptimeo)
{
        struct bufhashhdr *bh;
        struct buf *bp, *nb = NULL;
        int s, error;

        /*
         * XXX
         * The following is an inlined version of 'incore()', but with
         * the 'invalid' test moved to after the 'busy' test.  It's
         * necessary because there are some cases in which the NFS
         * code sets B_INVAL prior to writing data to the server, but
         * in which the buffers actually contain valid data.  In this
         * case, we can't allow the system to allocate a new buffer for
         * the block until the write is finished.
         */
        bh = BUFHASH(vp, blkno);
start:
        LIST_FOREACH(bp, BUFHASH(vp, blkno), b_hash) {
                if (bp->b_lblkno != blkno || bp->b_vp != vp)
                        continue;

                s = splbio();
                if (ISSET(bp->b_flags, B_BUSY)) {
                        if (nb != NULL) {
                                SET(nb->b_flags, B_INVAL);
                                binshash(nb, &invalhash);
                                brelse(nb);
                                nb = NULL;
                        }
                        SET(bp->b_flags, B_WANTED);
                        error = tsleep(bp, slpflag | (PRIBIO + 1), "getblk",
                            slptimeo);
                        splx(s);
                        if (error)
                                return (NULL);
                        goto start;
                }

                if (!ISSET(bp->b_flags, B_INVAL)) {
                        SET(bp->b_flags, (B_BUSY | B_CACHE));
                        bremfree(bp);
                        splx(s);
                        break;
                }
                splx(s);
        }
        if (nb && bp) {
                SET(nb->b_flags, B_INVAL);
                binshash(nb, &invalhash);
                brelse(nb);
                nb = NULL;
        }
        if (bp == NULL && nb == NULL) {
                nb = getnewbuf(size, slpflag, slptimeo, &error);
                if (nb == NULL) {
                        if (error == ERESTART || error == EINTR)
                                return (NULL);
                }
                goto start;
        }
        if (nb) {
                bp = nb;
                binshash(bp, bh);
                bp->b_blkno = bp->b_lblkno = blkno;
                s = splbio();
                bgetvp(vp, bp);
                splx(s);
        }
        return (bp);
}

/*
 * Get an empty, disassociated buffer of given size.
 */
struct buf *
geteblk(int size)
{
        struct buf *bp;

        while ((bp = getnewbuf(size, 0, 0, NULL)) == NULL)
                ;
        SET(bp->b_flags, B_INVAL);
        binshash(bp, &invalhash);

        return (bp);
}

/*
 * Find a buffer which is available for use.
 */
struct buf *
getnewbuf(size_t size, int slpflag, int slptimeo, int *ep)
{
        struct buf *bp;
        int s, error, queue, qs;

#if 0		/* we would really like this but sblock update kills it */
        KASSERT(curproc != syncerproc && curproc != cleanerproc);
#endif

        s = splbio();
        /*
         * Wake up cleaner if we're getting low on pages.
         */
        if (numdirtypages >= hidirtypages || numcleanpages <= locleanpages)
                wakeup(&bd_req);

        /* we just ask. it can say no.. */
getsome:
        qs = size;
        queue = size2cqueue(&qs);
        bp = buf_get(qs);	/* XXX use qs instead and no need in buf_get? */
        if (bp == NULL) {
                /*
                 * No free ones, try to reuse a clean one of the same or
                 * larger size.
                 */
                do {
                        bp = TAILQ_FIRST(&bufqueues[queue]);
                        queue++;
                } while (bp == NULL && queue < BQUEUES);
        }
        if (bp == NULL) {
                /* we couldn't reuse a free one, nothing of the right size */
                /* XXX free 20 buffers per q - ugly hack should really
                 * reuse big ones without truncating. fix later
                 */
                int q, gotsome = 0;
                int freemax = 20;

                for (q = 1; q < BQUEUES; q++) {
                        int i = freemax;

                        while (bqpages[q] > bqpagelow &&
                            (bp = TAILQ_FIRST(&bufqueues[q])) && i--) {
                                gotsome++;
                                bremfree(bp);
                                if (LIST_FIRST(&bp->b_dep) != NULL)
                                        buf_deallocate(bp);

                                if (ISSET(bp->b_flags, B_DELWRI)) {
                                        CLR(bp->b_flags, B_DELWRI);
                                }

                                if (bp->b_vp)
                                        brelvp(bp);

                                buf_put(bp);
                        }
                }
                if (gotsome)
                        goto getsome;
        }
        if (bp == NULL) {
                /* wait for a free buffer of any kind */
                needbuffer++;
                error = tsleep(&needbuffer, slpflag | (PRIBIO + 1),
                    "getnewbuf", slptimeo);
                if (ep != NULL) {
                        *ep = error;
                        if (error) {
                                splx(s);
                                return (NULL);
                        }
                }
                goto getsome;
        }

        bremfree(bp);
        /* Buffer is no longer on free lists. */
        SET(bp->b_flags, B_BUSY);

#ifdef DIAGNOSTIC
        if (ISSET(bp->b_flags, B_DELWRI))
                panic("Dirty buffer on BQ_CLEAN");
#endif

        /* disassociate us from our vnode, if we had one... */
        if (bp->b_vp)
                brelvp(bp);

        splx(s);

#ifdef DIAGNOSTIC
        /* CLEAN buffers must have no dependencies */
        if (LIST_FIRST(&bp->b_dep) != NULL)
                panic("BQ_CLEAN has buffer with dependencies");
#endif

        /* clear out various other fields */
        bp->b_flags = B_BUSY;
        bp->b_dev = NODEV;
        bp->b_blkno = bp->b_lblkno = 0;
        bp->b_iodone = NULL;
        bp->b_error = 0;
        bp->b_resid = 0;
        bp->b_bcount = size;
        bp->b_dirtyoff = bp->b_dirtyend = 0;
        bp->b_validoff = bp->b_validend = 0;

        bremhash(bp);
        return (bp);
}

/*
 * Buffer cleaning daemon.
 */
void
buf_daemon(struct proc *p)
{
        struct timeval starttime, timediff;
        struct buf *bp;
        int s;

        cleanerproc = curproc;

        s = splbio();
        for (;;) {
                if (!numdirtypages ||
                    (numdirtypages < hidirtypages && !needbuffer))
                        tsleep(&bd_req, PRIBIO - 7, "cleaner", 0);

                getmicrouptime(&starttime);

                while ((bp = TAILQ_FIRST(&bufqueues[BQ_DIRTY]))) {
                        struct timeval tv;

                        if (numdirtypages < lodirtypages && !needbuffer)
                                break;

                        bremfree(bp);
                        SET(bp->b_flags, B_BUSY);
                        splx(s);

                        if (ISSET(bp->b_flags, B_INVAL)) {
                                brelse(bp);
                                s = splbio();
                                continue;
                        }
#ifdef DIAGNOSTIC
                        if (!ISSET(bp->b_flags, B_DELWRI))
                                panic("Clean buffer on BQ_DIRTY");
#endif
                        if (LIST_FIRST(&bp->b_dep) != NULL &&
                            !ISSET(bp->b_flags, B_DEFERRED) &&
                            buf_countdeps(bp, 0, 0)) {
                                SET(bp->b_flags, B_DEFERRED);
                                s = splbio();
                                numfreepages += btoc(bp->b_bufsize);
                                numdirtypages += btoc(bp->b_bufsize);
                                binstailfree(bp, &bufqueues[BQ_DIRTY]);
                                CLR(bp->b_flags, B_BUSY);
                                continue;
                        }

                        bawrite(bp);

                        /* Never allow processing to run for more than 1 sec */
                        getmicrouptime(&tv);
                        timersub(&tv, &starttime, &timediff);
                        if (timediff.tv_sec)
                                break;

                        s = splbio();
                }
        }
}

/*
 * Wait for operations on the buffer to complete.
 * When they do, extract and return the I/O's error value.
 */
int
biowait(struct buf *bp)
{
        int s;

        s = splbio();
        while (!ISSET(bp->b_flags, B_DONE))
                tsleep(bp, PRIBIO + 1, "biowait", 0);
        splx(s);

        /* check for interruption of I/O (e.g. via NFS), then errors. */
        if (ISSET(bp->b_flags, B_EINTR)) {
                CLR(bp->b_flags, B_EINTR);
                return (EINTR);
        }

        if (ISSET(bp->b_flags, B_ERROR))
                return (bp->b_error ? bp->b_error : EIO);
        else
                return (0);
}

/*
 * Mark I/O complete on a buffer.
 *
 * If a callback has been requested, e.g. the pageout
 * daemon, do so.  Otherwise, awaken waiting processes.
 *
 * [ Leffler, et al., says on p.247:
 *     "This routine wakes up the blocked process, frees the buffer
 *     for an asynchronous write, or, for a request by the pagedaemon
 *     process, invokes a procedure specified in the buffer structure" ]
 *
 * In real life, the pagedaemon (or other system processes) wants
 * to do async stuff too, and doesn't want the buffer brelse()'d.
 * (for swap pager, that puts swap buffers on the free lists (!!!),
 * for the vn device, that puts malloc'd buffers on the free lists!)
 *
 * Must be called at splbio().
 */
void
biodone(struct buf *bp)
{
        splassert(IPL_BIO);

        if (ISSET(bp->b_flags, B_DONE))
                panic("biodone already");
        SET(bp->b_flags, B_DONE);		/* note that it's done */

        if (LIST_FIRST(&bp->b_dep) != NULL)
                buf_complete(bp);

        if (!ISSET(bp->b_flags, B_READ)) {
                CLR(bp->b_flags, B_WRITEINPROG);
                vwakeup(bp->b_vp);
        }

        if (ISSET(bp->b_flags, B_CALL)) {	/* if necessary, call out */
                CLR(bp->b_flags, B_CALL);	/* but note callout done */
                (*bp->b_iodone)(bp);
        } else {
                if (ISSET(bp->b_flags, B_ASYNC)) {	/* if async, release it */
                        brelse(bp);
                } else {			/* or just wakeup the buffer */
                        CLR(bp->b_flags, B_WANTED);
                        wakeup(bp);
                }
        }
}

#if 1
void
vfs_bufstats(void)
{
        return;
}
/* #ifdef DDB */
#else
/*
 * Print out statistics on the current allocation of the buffer pool.
 * Can be enabled to print out on every ``sync'' by setting "syncprt"
 * in vfs_syscalls.c using sysctl.
 */
void
vfs_bufstats(void)
{
        int s, i, j, count;
        struct buf *bp;
        struct bqueues *dp;
        int counts[MAXBSIZE/PAGE_SIZE+1];
        int totals[BQUEUES];
        long ptotals[BQUEUES];
        long pages;
        static char *bname[BQUEUES] = { "CLEAN", "DIRTY", "EMPTY" };

        s = splbio();
        for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
                count = 0;
                pages = 0;
                for (j = 0; j <= MAXBSIZE/PAGE_SIZE; j++)
                        counts[j] = 0;
                TAILQ_FOREACH(bp, dp, b_freelist) {
                        counts[bp->b_bufsize/PAGE_SIZE]++;
                        count++;
                        pages += btoc(bp->b_bufsize);
                }
                totals[i] = count;
                ptotals[i] = pages;
                printf("%s: total-%d(%ld pages)", bname[i], count, pages);
                for (j = 0; j <= MAXBSIZE/PAGE_SIZE; j++)
                        if (counts[j] != 0)
                                printf(", %d-%d", j * PAGE_SIZE, counts[j]);
                printf("\n");
        }
        if ((ptotals[BQ_CLEAN] + ptotals[BQ_DIRTY]) != numfreepages)
                printf("numfreepages counter wrong: %ld != %ld\n",
                    numfreepages, ptotals[BQ_CLEAN] + ptotals[BQ_DIRTY]);
        if (ptotals[BQ_CLEAN] != numcleanpages)
                printf("numcleanpages counter wrong: %ld != %ld\n",
                    numcleanpages, ptotals[BQ_CLEAN]);
        else
                printf("numcleanpages: %ld\n", numcleanpages);
        if (numdirtypages != ptotals[BQ_DIRTY])
                printf("numdirtypages counter wrong: %ld != %ld\n",
                    numdirtypages, ptotals[BQ_DIRTY]);
        else
                printf("numdirtypages: %ld\n", numdirtypages);

        printf("syncer eating up to %ld pages from %ld reserved\n",
            maxcleanpages - hicleanpages, locleanpages);
        splx(s);
}
#endif /* DEBUG */