Annotation of sys/dev/raidframe/rf_driver.c, Revision 1.1.1.1
1.1 nbrk 1: /* $OpenBSD: rf_driver.c,v 1.11 2002/12/16 07:01:03 tdeval Exp $ */
2: /* $NetBSD: rf_driver.c,v 1.37 2000/06/04 02:05:13 oster Exp $ */
3:
4: /*
5: * Copyright (c) 1999 The NetBSD Foundation, Inc.
6: * All rights reserved.
7: *
8: * This code is derived from software contributed to The NetBSD Foundation
9: * by Greg Oster
10: *
11: * Redistribution and use in source and binary forms, with or without
12: * modification, are permitted provided that the following conditions
13: * are met:
14: * 1. Redistributions of source code must retain the above copyright
15: * notice, this list of conditions and the following disclaimer.
16: * 2. Redistributions in binary form must reproduce the above copyright
17: * notice, this list of conditions and the following disclaimer in the
18: * documentation and/or other materials provided with the distribution.
19: * 3. All advertising materials mentioning features or use of this software
20: * must display the following acknowledgement:
21: * This product includes software developed by the NetBSD
22: * Foundation, Inc. and its contributors.
23: * 4. Neither the name of The NetBSD Foundation nor the names of its
24: * contributors may be used to endorse or promote products derived
25: * from this software without specific prior written permission.
26: *
27: * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
28: * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
29: * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30: * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
31: * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32: * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33: * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34: * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35: * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36: * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37: * POSSIBILITY OF SUCH DAMAGE.
38: */
39:
40: /*
41: * Copyright (c) 1995 Carnegie-Mellon University.
42: * All rights reserved.
43: *
44: * Author: Mark Holland, Khalil Amiri, Claudson Bornstein,
45: * William V. Courtright II, Robby Findler, Daniel Stodolsky,
46: * Rachad Youssef, Jim Zelenka
47: *
48: * Permission to use, copy, modify and distribute this software and
49: * its documentation is hereby granted, provided that both the copyright
50: * notice and this permission notice appear in all copies of the
51: * software, derivative works or modified versions, and any portions
52: * thereof, and that both notices appear in supporting documentation.
53: *
54: * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
55: * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
56: * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
57: *
58: * Carnegie Mellon requests users of this software to return to
59: *
60: * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
61: * School of Computer Science
62: * Carnegie Mellon University
63: * Pittsburgh PA 15213-3890
64: *
65: * any improvements or extensions that they make and grant Carnegie the
66: * rights to redistribute these changes.
67: */
68:
69: /*****************************************************************************
70: *
71: * rf_driver.c -- Main setup, teardown, and access routines for the RAID
72: * driver
73: *
74: * All routines are prefixed with rf_ (RAIDframe), to avoid conflicts.
75: *
76: *****************************************************************************/
77:
78: #include <sys/types.h>
79: #include <sys/param.h>
80: #include <sys/systm.h>
81: #include <sys/ioctl.h>
82: #include <sys/fcntl.h>
83: #ifdef __NetBSD__
84: #include <sys/vnode.h>
85: #endif
86:
87:
88: #include "rf_archs.h"
89: #include "rf_threadstuff.h"
90:
91:
92: #include <sys/errno.h>
93:
94: #include "rf_raid.h"
95: #include "rf_dag.h"
96: #include "rf_aselect.h"
97: #include "rf_diskqueue.h"
98: #include "rf_parityscan.h"
99: #include "rf_alloclist.h"
100: #include "rf_dagutils.h"
101: #include "rf_utils.h"
102: #include "rf_etimer.h"
103: #include "rf_acctrace.h"
104: #include "rf_configure.h"
105: #include "rf_general.h"
106: #include "rf_desc.h"
107: #include "rf_states.h"
108: #include "rf_freelist.h"
109: #include "rf_decluster.h"
110: #include "rf_map.h"
111: #include "rf_revent.h"
112: #include "rf_callback.h"
113: #include "rf_engine.h"
114: #include "rf_memchunk.h"
115: #include "rf_mcpair.h"
116: #include "rf_nwayxor.h"
117: #include "rf_debugprint.h"
118: #include "rf_copyback.h"
119: #include "rf_driver.h"
120: #include "rf_options.h"
121: #include "rf_shutdown.h"
122: #include "rf_kintf.h"
123:
124: #include <sys/buf.h>
125:
126: /* rad == RF_RaidAccessDesc_t */
127: static RF_FreeList_t *rf_rad_freelist;
128: #define RF_MAX_FREE_RAD 128
129: #define RF_RAD_INC 16
130: #define RF_RAD_INITIAL 32
131:
132: /* Debug variables. */
133: char rf_panicbuf[2048]; /*
134: * A buffer to hold an error msg when we panic.
135: */
136:
137: /* Main configuration routines. */
138: static int raidframe_booted = 0;
139:
140: void rf_ConfigureDebug(RF_Config_t *);
141: void rf_set_debug_option(char *, long);
142: void rf_UnconfigureArray(void);
143: int rf_init_rad(RF_RaidAccessDesc_t *);
144: void rf_clean_rad(RF_RaidAccessDesc_t *);
145: void rf_ShutdownRDFreeList(void *);
146: int rf_ConfigureRDFreeList(RF_ShutdownList_t **);
147:
148: RF_DECLARE_MUTEX(rf_printf_mutex); /*
149: * Debug only: Avoids interleaved
150: * printfs by different stripes.
151: */
152:
/*
 * Quiescence hand-shake: the waiter sleeps on the address of the array's
 * accesses_suspended field and the signaller wakes that same address.
 *
 * WAIT_FOR_QUIESCENCE deliberately carries no trailing semicolon so that
 * it expands to a single expression and stays safe inside if/else.
 */
#define	SIGNAL_QUIESCENT_COND(_raid_)					\
	wakeup(&((_raid_)->accesses_suspended))
#define	WAIT_FOR_QUIESCENCE(_raid_)					\
	tsleep(&((_raid_)->accesses_suspended), PRIBIO, "RAIDframe quiesce", 0)

/*
 * Fail a buffer: flag it B_ERROR, report the whole transfer as residual,
 * record the error code and complete it with biodone().
 * Note that "bp" is evaluated more than once.
 */
#define	IO_BUF_ERR(bp, err)						\
do {									\
	(bp)->b_flags |= B_ERROR;					\
	(bp)->b_resid = (bp)->b_bcount;					\
	(bp)->b_error = (err);						\
	biodone(bp);							\
} while (0)
164:
165: static int configureCount = 0; /* Number of active configurations. */
166: static int isconfigged = 0; /*
167: * Is basic RAIDframe (non per-array)
168: * stuff configured ?
169: */
170: RF_DECLARE_STATIC_MUTEX(configureMutex); /*
171: * Used to lock the
172: * configuration stuff.
173: */
174: static RF_ShutdownList_t *globalShutdown; /* Non array-specific stuff. */
175: int rf_ConfigureRDFreeList(RF_ShutdownList_t **);
176:
177:
178: /* Called at system boot time. */
179: int
180: rf_BootRaidframe(void)
181: {
182: int rc;
183:
184: if (raidframe_booted)
185: return (EBUSY);
186: raidframe_booted = 1;
187:
188: rc = rf_mutex_init(&configureMutex);
189: if (rc) {
190: RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d.\n",
191: __FILE__, __LINE__, rc);
192: RF_PANIC();
193: }
194: configureCount = 0;
195: isconfigged = 0;
196: globalShutdown = NULL;
197: return (0);
198: }
199:
200:
201: /*
202: * This function is really just for debugging user-level stuff: It
203: * frees up all memory, other RAIDframe resources that might otherwise
204: * be kept around. This is used with systems like "sentinel" to detect
205: * memory leaks.
206: */
207: int
208: rf_UnbootRaidframe(void)
209: {
210: int rc;
211:
212: RF_LOCK_MUTEX(configureMutex);
213: if (configureCount) {
214: RF_UNLOCK_MUTEX(configureMutex);
215: return (EBUSY);
216: }
217: raidframe_booted = 0;
218: RF_UNLOCK_MUTEX(configureMutex);
219: rc = rf_mutex_destroy(&configureMutex);
220: if (rc) {
221: RF_ERRORMSG3("Unable to destroy mutex file %s line %d"
222: " rc=%d.\n", __FILE__, __LINE__, rc);
223: RF_PANIC();
224: }
225: return (0);
226: }
227:
228:
229: /*
230: * Called whenever an array is shutdown.
231: */
232: void
233: rf_UnconfigureArray(void)
234: {
235: int rc;
236:
237: RF_LOCK_MUTEX(configureMutex);
238: if (--configureCount == 0) { /*
239: * If no active configurations, shut
240: * everything down.
241: */
242: isconfigged = 0;
243:
244: rc = rf_ShutdownList(&globalShutdown);
245: if (rc) {
246: RF_ERRORMSG1("RAIDFRAME: unable to do global shutdown,"
247: " rc=%d.\n", rc);
248: }
249:
250: /*
251: * We must wait until now, because the AllocList module
252: * uses the DebugMem module.
253: */
254: if (rf_memDebug)
255: rf_print_unfreed();
256: }
257: RF_UNLOCK_MUTEX(configureMutex);
258: }
259:
260:
261: /*
262: * Called to shut down an array.
263: */
264: int
265: rf_Shutdown(RF_Raid_t *raidPtr)
266: {
267: if (!raidPtr->valid) {
268: RF_ERRORMSG("Attempt to shut down unconfigured RAIDframe"
269: " driver. Aborting shutdown.\n");
270: return (EINVAL);
271: }
272: /*
273: * Wait for outstanding IOs to land.
274: * As described in rf_raid.h, we use the rad_freelist lock
275: * to protect the per-array info about outstanding descs,
276: * since we need to do freelist locking anyway, and this
277: * cuts down on the amount of serialization we've got going
278: * on.
279: */
280: RF_FREELIST_DO_LOCK(rf_rad_freelist);
281: if (raidPtr->waitShutdown) {
282: RF_FREELIST_DO_UNLOCK(rf_rad_freelist);
283: return (EBUSY);
284: }
285: raidPtr->waitShutdown = 1;
286: while (raidPtr->nAccOutstanding) {
287: RF_WAIT_COND(raidPtr->outstandingCond, RF_FREELIST_MUTEX_OF(rf_rad_freelist));
288: }
289: RF_FREELIST_DO_UNLOCK(rf_rad_freelist);
290:
291: /* Wait for any parity re-writes to stop... */
292: while (raidPtr->parity_rewrite_in_progress) {
293: printf("Waiting for parity re-write to exit...\n");
294: tsleep(&raidPtr->parity_rewrite_in_progress, PRIBIO,
295: "rfprwshutdown", 0);
296: }
297:
298: raidPtr->valid = 0;
299:
300: rf_update_component_labels(raidPtr, RF_FINAL_COMPONENT_UPDATE);
301:
302: rf_UnconfigureVnodes(raidPtr);
303:
304: rf_ShutdownList(&raidPtr->shutdownList);
305:
306: rf_UnconfigureArray();
307:
308: return (0);
309: }
310:
/*
 * Run one global (non per-array) configuration routine "f" against
 * globalShutdown.  Expects a local "rc" and configureMutex to be held.
 * On failure it reports the error, runs the global shutdown list, drops
 * the configuration reference, releases the mutex and returns rc from
 * the enclosing function.
 */
#define	DO_INIT_CONFIGURE(f)						\
do {									\
	rc = f (&globalShutdown);					\
	if (rc != 0) {							\
		RF_ERRORMSG2("RAIDFRAME: failed %s with %d.\n",		\
		    RF_STRING(f), rc);					\
		rf_ShutdownList(&globalShutdown);			\
		configureCount--;					\
		RF_UNLOCK_MUTEX(configureMutex);			\
		return (rc);						\
	}								\
} while (0)
323:
/*
 * Helpers for the per-array half of rf_Configure().  All of them expect
 * the locals "raidPtr", "cfgPtr" and "rc" to be in scope.
 */

/* Undo a partially configured array. */
#define	DO_RAID_FAIL()							\
do {									\
	rf_UnconfigureVnodes(raidPtr);					\
	rf_ShutdownList(&raidPtr->shutdownList);			\
	rf_UnconfigureArray();						\
} while (0)

/* Run per-array configuration routine "f"; bail out of the caller on error. */
#define	DO_RAID_INIT_CONFIGURE(f)					\
do {									\
	rc = (f)(&raidPtr->shutdownList, raidPtr, cfgPtr);		\
	if (rc != 0) {							\
		RF_ERRORMSG2("RAIDFRAME: failed %s with %d.\n",		\
		    RF_STRING(f), rc);					\
		DO_RAID_FAIL();						\
		return (rc);						\
	}								\
} while (0)

/* Create a mutex on the array's shutdown list; bail out on error. */
#define	DO_RAID_MUTEX(_m_)						\
do {									\
	rc = rf_create_managed_mutex(&raidPtr->shutdownList, (_m_));	\
	if (rc != 0) {							\
		RF_ERRORMSG3("Unable to init mutex file %s line %d"	\
		    " rc=%d.\n", __FILE__, __LINE__, rc);		\
		DO_RAID_FAIL();						\
		return (rc);						\
	}								\
} while (0)

/* Create a condition on the array's shutdown list; bail out on error. */
#define	DO_RAID_COND(_c_)						\
do {									\
	rc = rf_create_managed_cond(&raidPtr->shutdownList, (_c_));	\
	if (rc != 0) {							\
		RF_ERRORMSG3("Unable to init cond file %s line %d"	\
		    " rc=%d.\n", __FILE__, __LINE__, rc);		\
		DO_RAID_FAIL();						\
		return (rc);						\
	}								\
} while (0)
363:
364: int
365: rf_Configure(RF_Raid_t *raidPtr, RF_Config_t *cfgPtr, RF_AutoConfig_t *ac)
366: {
367: RF_RowCol_t row, col;
368: int i, rc;
369:
370: /*
371: * XXX This check can probably be removed now, since
372: * RAIDFRAME_CONFIGURE now checks to make sure that the
373: * RAID set is not already valid.
374: */
375: if (raidPtr->valid) {
376: RF_ERRORMSG("RAIDframe configuration not shut down."
377: " Aborting configure.\n");
378: return (EINVAL);
379: }
380: RF_LOCK_MUTEX(configureMutex);
381: configureCount++;
382: if (isconfigged == 0) {
383: rc = rf_create_managed_mutex(&globalShutdown, &rf_printf_mutex);
384: if (rc) {
385: RF_ERRORMSG3("Unable to init mutex file %s line %d"
386: " rc=%d.\n", __FILE__, __LINE__, rc);
387: rf_ShutdownList(&globalShutdown);
388: return (rc);
389: }
390: /* Initialize globals. */
391: #ifdef RAIDDEBUG
392: printf("RAIDFRAME: protectedSectors is %ld.\n",
393: rf_protectedSectors);
394: #endif /* RAIDDEBUG */
395:
396: rf_clear_debug_print_buffer();
397:
398: DO_INIT_CONFIGURE(rf_ConfigureAllocList);
399:
400: /*
401: * Yes, this does make debugging general to the whole
402: * system instead of being array specific. Bummer, drag.
403: */
404: rf_ConfigureDebug(cfgPtr);
405: DO_INIT_CONFIGURE(rf_ConfigureDebugMem);
406: DO_INIT_CONFIGURE(rf_ConfigureAccessTrace);
407: DO_INIT_CONFIGURE(rf_ConfigureMapModule);
408: DO_INIT_CONFIGURE(rf_ConfigureReconEvent);
409: DO_INIT_CONFIGURE(rf_ConfigureCallback);
410: DO_INIT_CONFIGURE(rf_ConfigureMemChunk);
411: DO_INIT_CONFIGURE(rf_ConfigureRDFreeList);
412: DO_INIT_CONFIGURE(rf_ConfigureNWayXor);
413: DO_INIT_CONFIGURE(rf_ConfigureStripeLockFreeList);
414: DO_INIT_CONFIGURE(rf_ConfigureMCPair);
415: DO_INIT_CONFIGURE(rf_ConfigureDAGs);
416: DO_INIT_CONFIGURE(rf_ConfigureDAGFuncs);
417: DO_INIT_CONFIGURE(rf_ConfigureDebugPrint);
418: DO_INIT_CONFIGURE(rf_ConfigureReconstruction);
419: DO_INIT_CONFIGURE(rf_ConfigureCopyback);
420: DO_INIT_CONFIGURE(rf_ConfigureDiskQueueSystem);
421: isconfigged = 1;
422: }
423: RF_UNLOCK_MUTEX(configureMutex);
424:
425: DO_RAID_MUTEX(&raidPtr->mutex);
426: /*
427: * Set up the cleanup list. Do this after ConfigureDebug so that
428: * value of memDebug will be set.
429: */
430:
431: rf_MakeAllocList(raidPtr->cleanupList);
432: if (raidPtr->cleanupList == NULL) {
433: DO_RAID_FAIL();
434: return (ENOMEM);
435: }
436: rc = rf_ShutdownCreate(&raidPtr->shutdownList,
437: (void (*) (void *)) rf_FreeAllocList, raidPtr->cleanupList);
438: if (rc) {
439: RF_ERRORMSG3("Unable to add to shutdown list file %s line %d"
440: " rc=%d.\n", __FILE__, __LINE__, rc);
441: DO_RAID_FAIL();
442: return (rc);
443: }
444: raidPtr->numRow = cfgPtr->numRow;
445: raidPtr->numCol = cfgPtr->numCol;
446: raidPtr->numSpare = cfgPtr->numSpare;
447:
448: /*
449: * XXX We don't even pretend to support more than one row in the
450: * kernel...
451: */
452: if (raidPtr->numRow != 1) {
453: RF_ERRORMSG("Only one row supported in kernel.\n");
454: DO_RAID_FAIL();
455: return (EINVAL);
456: }
457: RF_CallocAndAdd(raidPtr->status, raidPtr->numRow,
458: sizeof(RF_RowStatus_t), (RF_RowStatus_t *), raidPtr->cleanupList);
459: if (raidPtr->status == NULL) {
460: DO_RAID_FAIL();
461: return (ENOMEM);
462: }
463: RF_CallocAndAdd(raidPtr->reconControl, raidPtr->numRow,
464: sizeof(RF_ReconCtrl_t *), (RF_ReconCtrl_t **), raidPtr->cleanupList);
465: if (raidPtr->reconControl == NULL) {
466: DO_RAID_FAIL();
467: return (ENOMEM);
468: }
469: for (i = 0; i < raidPtr->numRow; i++) {
470: raidPtr->status[i] = rf_rs_optimal;
471: raidPtr->reconControl[i] = NULL;
472: }
473:
474: DO_RAID_INIT_CONFIGURE(rf_ConfigureEngine);
475: DO_RAID_INIT_CONFIGURE(rf_ConfigureStripeLocks);
476:
477: DO_RAID_COND(&raidPtr->outstandingCond);
478:
479: raidPtr->nAccOutstanding = 0;
480: raidPtr->waitShutdown = 0;
481:
482: DO_RAID_MUTEX(&raidPtr->access_suspend_mutex);
483: DO_RAID_COND(&raidPtr->quiescent_cond);
484:
485: DO_RAID_COND(&raidPtr->waitForReconCond);
486:
487: DO_RAID_MUTEX(&raidPtr->recon_done_proc_mutex);
488:
489: if (ac != NULL) {
490: /*
491: * We have an AutoConfig structure... Don't do the
492: * normal disk configuration... call the auto config
493: * stuff.
494: */
495: rf_AutoConfigureDisks(raidPtr, cfgPtr, ac);
496: } else {
497: DO_RAID_INIT_CONFIGURE(rf_ConfigureDisks);
498: DO_RAID_INIT_CONFIGURE(rf_ConfigureSpareDisks);
499: }
500: /*
501: * Do this after ConfigureDisks & ConfigureSpareDisks to be sure
502: * devno is set.
503: */
504: DO_RAID_INIT_CONFIGURE(rf_ConfigureDiskQueues);
505:
506: DO_RAID_INIT_CONFIGURE(rf_ConfigureLayout);
507:
508: DO_RAID_INIT_CONFIGURE(rf_ConfigurePSStatus);
509:
510: for (row = 0; row < raidPtr->numRow; row++) {
511: for (col = 0; col < raidPtr->numCol; col++) {
512: /*
513: * XXX Better distribution.
514: */
515: raidPtr->hist_diskreq[row][col] = 0;
516: }
517: }
518:
519: raidPtr->numNewFailures = 0;
520: raidPtr->copyback_in_progress = 0;
521: raidPtr->parity_rewrite_in_progress = 0;
522: raidPtr->recon_in_progress = 0;
523: raidPtr->maxOutstanding = cfgPtr->maxOutstandingDiskReqs;
524:
525: /*
526: * Autoconfigure and root_partition will actually get filled in
527: * after the config is done.
528: */
529: raidPtr->autoconfigure = 0;
530: raidPtr->root_partition = 0;
531: raidPtr->last_unit = raidPtr->raidid;
532: raidPtr->config_order = 0;
533:
534: if (rf_keepAccTotals) {
535: raidPtr->keep_acc_totals = 1;
536: }
537: rf_StartUserStats(raidPtr);
538:
539: raidPtr->valid = 1;
540: return (0);
541: }
542:
543: int
544: rf_init_rad(RF_RaidAccessDesc_t *desc)
545: {
546: int rc;
547:
548: rc = rf_mutex_init(&desc->mutex);
549: if (rc) {
550: RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d.\n", __FILE__,
551: __LINE__, rc);
552: return (rc);
553: }
554: rc = rf_cond_init(&desc->cond);
555: if (rc) {
556: RF_ERRORMSG3("Unable to init cond file %s line %d rc=%d.\n", __FILE__,
557: __LINE__, rc);
558: rf_mutex_destroy(&desc->mutex);
559: return (rc);
560: }
561: return (0);
562: }
563:
564: void
565: rf_clean_rad(RF_RaidAccessDesc_t *desc)
566: {
567: rf_mutex_destroy(&desc->mutex);
568: rf_cond_destroy(&desc->cond);
569: }
570:
571: void
572: rf_ShutdownRDFreeList(void *ignored)
573: {
574: RF_FREELIST_DESTROY_CLEAN(rf_rad_freelist, next,
575: (RF_RaidAccessDesc_t *), rf_clean_rad);
576: }
577:
578: int
579: rf_ConfigureRDFreeList(RF_ShutdownList_t **listp)
580: {
581: int rc;
582:
583: RF_FREELIST_CREATE(rf_rad_freelist, RF_MAX_FREE_RAD,
584: RF_RAD_INC, sizeof(RF_RaidAccessDesc_t));
585: if (rf_rad_freelist == NULL) {
586: return (ENOMEM);
587: }
588: rc = rf_ShutdownCreate(listp, rf_ShutdownRDFreeList, NULL);
589: if (rc) {
590: RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d.\n", __FILE__,
591: __LINE__, rc);
592: rf_ShutdownRDFreeList(NULL);
593: return (rc);
594: }
595: RF_FREELIST_PRIME_INIT(rf_rad_freelist, RF_RAD_INITIAL, next,
596: (RF_RaidAccessDesc_t *), rf_init_rad);
597: return (0);
598: }
599:
600: RF_RaidAccessDesc_t *
601: rf_AllocRaidAccDesc(
602: RF_Raid_t *raidPtr,
603: RF_IoType_t type,
604: RF_RaidAddr_t raidAddress,
605: RF_SectorCount_t numBlocks,
606: caddr_t bufPtr,
607: void *bp,
608: RF_DagHeader_t **paramDAG,
609: RF_AccessStripeMapHeader_t **paramASM,
610: RF_RaidAccessFlags_t flags,
611: void (*cbF) (struct buf *),
612: void *cbA,
613: RF_AccessState_t *states
614: )
615: {
616: RF_RaidAccessDesc_t *desc;
617:
618: RF_FREELIST_GET_INIT_NOUNLOCK(rf_rad_freelist, desc, next,
619: (RF_RaidAccessDesc_t *), rf_init_rad);
620: if (raidPtr->waitShutdown) {
621: /*
622: * Actually, we're shutting the array down. Free the desc
623: * and return NULL.
624: */
625: RF_FREELIST_DO_UNLOCK(rf_rad_freelist);
626: RF_FREELIST_FREE_CLEAN(rf_rad_freelist, desc, next,
627: rf_clean_rad);
628: return (NULL);
629: }
630: raidPtr->nAccOutstanding++;
631: RF_FREELIST_DO_UNLOCK(rf_rad_freelist);
632:
633: desc->raidPtr = (void *) raidPtr;
634: desc->type = type;
635: desc->raidAddress = raidAddress;
636: desc->numBlocks = numBlocks;
637: desc->bufPtr = bufPtr;
638: desc->bp = bp;
639: desc->paramDAG = paramDAG;
640: desc->paramASM = paramASM;
641: desc->flags = flags;
642: desc->states = states;
643: desc->state = 0;
644:
645: desc->status = 0;
646: bzero((char *) &desc->tracerec, sizeof(RF_AccTraceEntry_t));
647: desc->callbackFunc = (void (*) (RF_CBParam_t)) cbF; /* XXX */
648: desc->callbackArg = cbA;
649: desc->next = NULL;
650: desc->head = desc;
651: desc->numPending = 0;
652: desc->cleanupList = NULL;
653: rf_MakeAllocList(desc->cleanupList);
654: return (desc);
655: }
656:
657: void
658: rf_FreeRaidAccDesc(RF_RaidAccessDesc_t * desc)
659: {
660: RF_Raid_t *raidPtr = desc->raidPtr;
661:
662: RF_ASSERT(desc);
663:
664: rf_FreeAllocList(desc->cleanupList);
665: RF_FREELIST_FREE_CLEAN_NOUNLOCK(rf_rad_freelist, desc, next, rf_clean_rad);
666: raidPtr->nAccOutstanding--;
667: if (raidPtr->waitShutdown) {
668: RF_SIGNAL_COND(raidPtr->outstandingCond);
669: }
670: RF_FREELIST_DO_UNLOCK(rf_rad_freelist);
671: }
672:
673:
674: /********************************************************************
675: * Main routine for performing an access.
676: * Accesses are retried until a DAG can not be selected. This occurs
677: * when either the DAG library is incomplete or there are too many
678: * failures in a parity group.
679: ********************************************************************/
680: int
681: rf_DoAccess(
682: RF_Raid_t *raidPtr,
683: RF_IoType_t type, /* Should be read or write. */
684: int async_flag, /*
685: * Should be RF_TRUE
686: * or RF_FALSE.
687: */
688: RF_RaidAddr_t raidAddress,
689: RF_SectorCount_t numBlocks,
690: caddr_t bufPtr,
691: void *bp_in, /*
692: * It's a buf pointer.
693: * void * to facilitate
694: * ignoring it outside
695: * the kernel.
696: */
697: RF_DagHeader_t **paramDAG,
698: RF_AccessStripeMapHeader_t **paramASM,
699: RF_RaidAccessFlags_t flags,
700: RF_RaidAccessDesc_t **paramDesc,
701: void (*cbF) (struct buf *),
702: void *cbA
703: )
704: {
705: RF_RaidAccessDesc_t *desc;
706: caddr_t lbufPtr = bufPtr;
707: struct buf *bp = (struct buf *) bp_in;
708:
709: raidAddress += rf_raidSectorOffset;
710:
711: if (!raidPtr->valid) {
712: RF_ERRORMSG("RAIDframe driver not successfully configured."
713: " Rejecting access.\n");
714: IO_BUF_ERR(bp, EINVAL);
715: return (EINVAL);
716: }
717:
718: if (rf_accessDebug) {
719:
720: printf("logBytes is: %d %d %d.\n", raidPtr->raidid,
721: raidPtr->logBytesPerSector,
722: (int) rf_RaidAddressToByte(raidPtr, numBlocks));
723: printf("raid%d: %s raidAddr %d (stripeid %d-%d) numBlocks %d (%d bytes) buf 0x%lx.\n", raidPtr->raidid,
724: (type == RF_IO_TYPE_READ) ? "READ" : "WRITE", (int) raidAddress,
725: (int) rf_RaidAddressToStripeID(&raidPtr->Layout, raidAddress),
726: (int) rf_RaidAddressToStripeID(&raidPtr->Layout, raidAddress + numBlocks - 1),
727: (int) numBlocks,
728: (int) rf_RaidAddressToByte(raidPtr, numBlocks),
729: (long) bufPtr);
730: }
731: if (raidAddress + numBlocks > raidPtr->totalSectors) {
732:
733: printf("DoAccess: raid addr %lu too large to access %lu sectors. Max legal addr is %lu.\n",
734: (u_long) raidAddress, (u_long) numBlocks, (u_long) raidPtr->totalSectors);
735:
736: IO_BUF_ERR(bp, ENOSPC);
737: return (ENOSPC);
738: }
739: desc = rf_AllocRaidAccDesc(raidPtr, type, raidAddress,
740: numBlocks, lbufPtr, bp, paramDAG, paramASM,
741: flags, cbF, cbA, raidPtr->Layout.map->states);
742:
743: if (desc == NULL) {
744: return (ENOMEM);
745: }
746: RF_ETIMER_START(desc->tracerec.tot_timer);
747:
748: desc->async_flag = async_flag;
749:
750: rf_ContinueRaidAccess(desc);
751:
752: return (0);
753: }
754:
755:
756: /* Force the array into reconfigured mode without doing reconstruction. */
757: int
758: rf_SetReconfiguredMode(RF_Raid_t *raidPtr, int row, int col)
759: {
760: if (!(raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
761: printf("Can't set reconfigured mode in dedicated-spare"
762: " array.\n");
763: RF_PANIC();
764: }
765: RF_LOCK_MUTEX(raidPtr->mutex);
766: raidPtr->numFailures++;
767: raidPtr->Disks[row][col].status = rf_ds_dist_spared;
768: raidPtr->status[row] = rf_rs_reconfigured;
769: rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE);
770: /*
771: * Install spare table only if declustering + distributed sparing
772: * architecture.
773: */
774: if (raidPtr->Layout.map->flags & RF_BD_DECLUSTERED)
775: rf_InstallSpareTable(raidPtr, row, col);
776: RF_UNLOCK_MUTEX(raidPtr->mutex);
777: return (0);
778: }
779:
780: extern int fail_row, fail_col, fail_time;
781: extern int delayed_recon;
782:
783: int
784: rf_FailDisk(RF_Raid_t *raidPtr, int frow, int fcol, int initRecon)
785: {
786: printf("raid%d: Failing disk r%d c%d.\n", raidPtr->raidid, frow, fcol);
787: RF_LOCK_MUTEX(raidPtr->mutex);
788: raidPtr->numFailures++;
789: raidPtr->Disks[frow][fcol].status = rf_ds_failed;
790: raidPtr->status[frow] = rf_rs_degraded;
791: rf_update_component_labels(raidPtr, RF_NORMAL_COMPONENT_UPDATE);
792: RF_UNLOCK_MUTEX(raidPtr->mutex);
793: if (initRecon)
794: rf_ReconstructFailedDisk(raidPtr, frow, fcol);
795: return (0);
796: }
797:
798:
799: /*
800: * Releases a thread that is waiting for the array to become quiesced.
801: * access_suspend_mutex should be locked upon calling this.
802: */
803: void
804: rf_SignalQuiescenceLock(RF_Raid_t *raidPtr, RF_RaidReconDesc_t *reconDesc)
805: {
806: if (rf_quiesceDebug) {
807: printf("raid%d: Signalling quiescence lock.\n",
808: raidPtr->raidid);
809: }
810: raidPtr->access_suspend_release = 1;
811:
812: if (raidPtr->waiting_for_quiescence) {
813: SIGNAL_QUIESCENT_COND(raidPtr);
814: }
815: }
816:
817:
818: /*
819: * Suspends all new requests to the array. No effect on accesses that are
820: * in flight.
821: */
822: int
823: rf_SuspendNewRequestsAndWait(RF_Raid_t *raidPtr)
824: {
825: if (rf_quiesceDebug)
826: printf("Suspending new reqs.\n");
827:
828: RF_LOCK_MUTEX(raidPtr->access_suspend_mutex);
829: raidPtr->accesses_suspended++;
830: raidPtr->waiting_for_quiescence = (raidPtr->accs_in_flight == 0) ? 0 : 1;
831:
832: if (raidPtr->waiting_for_quiescence) {
833: raidPtr->access_suspend_release = 0;
834: while (!raidPtr->access_suspend_release) {
835: printf("Suspending: Waiting for Quiescence.\n");
836: WAIT_FOR_QUIESCENCE(raidPtr);
837: raidPtr->waiting_for_quiescence = 0;
838: }
839: }
840: printf("Quiescence reached...\n");
841:
842: RF_UNLOCK_MUTEX(raidPtr->access_suspend_mutex);
843: return (raidPtr->waiting_for_quiescence);
844: }
845:
846:
847: /* Wake up everyone waiting for quiescence to be released. */
848: void
849: rf_ResumeNewRequests(RF_Raid_t *raidPtr)
850: {
851: RF_CallbackDesc_t *t, *cb;
852:
853: if (rf_quiesceDebug)
854: printf("Resuming new reqs.\n");
855:
856: RF_LOCK_MUTEX(raidPtr->access_suspend_mutex);
857: raidPtr->accesses_suspended--;
858: if (raidPtr->accesses_suspended == 0)
859: cb = raidPtr->quiesce_wait_list;
860: else
861: cb = NULL;
862: raidPtr->quiesce_wait_list = NULL;
863: RF_UNLOCK_MUTEX(raidPtr->access_suspend_mutex);
864:
865: while (cb) {
866: t = cb;
867: cb = cb->next;
868: (t->callbackFunc) (t->callbackArg);
869: rf_FreeCallbackDesc(t);
870: }
871: }
872:
873:
874: /*****************************************************************************
875: *
876: * Debug routines.
877: *
878: *****************************************************************************/
879:
880: void
881: rf_set_debug_option(char *name, long val)
882: {
883: RF_DebugName_t *p;
884:
885: for (p = rf_debugNames; p->name; p++) {
886: if (!strcmp(p->name, name)) {
887: *(p->ptr) = val;
888: printf("[Set debug variable %s to %ld]\n", name, val);
889: return;
890: }
891: }
892: RF_ERRORMSG1("Unknown debug string \"%s\"\n", name);
893: }
894:
895:
896: /* Would like to use sscanf here, but apparently not available in kernel. */
897: /*ARGSUSED*/
898: void
899: rf_ConfigureDebug(RF_Config_t *cfgPtr)
900: {
901: char *val_p, *name_p, *white_p;
902: long val;
903: int i;
904:
905: rf_ResetDebugOptions();
906: for (i = 0; cfgPtr->debugVars[i][0] && i < RF_MAXDBGV; i++) {
907: name_p = rf_find_non_white(&cfgPtr->debugVars[i][0]);
908: white_p = rf_find_white(name_p); /*
909: * Skip to start of 2nd
910: * word.
911: */
912: val_p = rf_find_non_white(white_p);
913: if (*val_p == '0' && *(val_p + 1) == 'x')
914: val = rf_htoi(val_p + 2);
915: else
916: val = rf_atoi(val_p);
917: *white_p = '\0';
918: rf_set_debug_option(name_p, val);
919: }
920: }
921:
922:
923: /* Performance monitoring stuff. */
924:
925: #if !defined(_KERNEL) && !defined(SIMULATE)
926:
927: /*
928: * Throughput stats currently only used in user-level RAIDframe.
929: */
930:
931: int
932: rf_InitThroughputStats(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
933: RF_Config_t *cfgPtr)
934: {
935: int rc;
936:
937: /* These used by user-level RAIDframe only. */
938: rc = rf_create_managed_mutex(listp, &raidPtr->throughputstats.mutex);
939: if (rc) {
940: RF_ERRORMSG3("Unable to init mutex file %s line %d rc=%d.\n",
941: __FILE__, __LINE__, rc);
942: return (rc);
943: }
944: raidPtr->throughputstats.sum_io_us = 0;
945: raidPtr->throughputstats.num_ios = 0;
946: raidPtr->throughputstats.num_out_ios = 0;
947: return (0);
948: }
949:
950: void
951: rf_StartThroughputStats(RF_Raid_t *raidPtr)
952: {
953: RF_LOCK_MUTEX(raidPtr->throughputstats.mutex);
954: raidPtr->throughputstats.num_ios++;
955: raidPtr->throughputstats.num_out_ios++;
956: if (raidPtr->throughputstats.num_out_ios == 1)
957: RF_GETTIME(raidPtr->throughputstats.start);
958: RF_UNLOCK_MUTEX(raidPtr->throughputstats.mutex);
959: }
960:
961: void
962: rf_StopThroughputStats(RF_Raid_t *raidPtr)
963: {
964: struct timeval diff;
965:
966: RF_LOCK_MUTEX(raidPtr->throughputstats.mutex);
967: raidPtr->throughputstats.num_out_ios--;
968: if (raidPtr->throughputstats.num_out_ios == 0) {
969: RF_GETTIME(raidPtr->throughputstats.stop);
970: RF_TIMEVAL_DIFF(&raidPtr->throughputstats.start,
971: &raidPtr->throughputstats.stop, &diff);
972: raidPtr->throughputstats.sum_io_us += RF_TIMEVAL_TO_US(diff);
973: }
974: RF_UNLOCK_MUTEX(raidPtr->throughputstats.mutex);
975: }
976:
977: void
978: rf_PrintThroughputStats(RF_Raid_t *raidPtr)
979: {
980: RF_ASSERT(raidPtr->throughputstats.num_out_ios == 0);
981: if (raidPtr->throughputstats.sum_io_us != 0) {
982: printf("[Througphut: %8.2f IOs/second]\n",
983: raidPtr->throughputstats.num_ios /
984: (raidPtr->throughputstats.sum_io_us / 1000000.0));
985: }
986: }
987:
988: #endif /* !_KERNEL && !SIMULATE */
989:
990: void
991: rf_StartUserStats(RF_Raid_t *raidPtr)
992: {
993: RF_GETTIME(raidPtr->userstats.start);
994: raidPtr->userstats.sum_io_us = 0;
995: raidPtr->userstats.num_ios = 0;
996: raidPtr->userstats.num_sect_moved = 0;
997: }
998:
999: void
1000: rf_StopUserStats(RF_Raid_t *raidPtr)
1001: {
1002: RF_GETTIME(raidPtr->userstats.stop);
1003: }
1004:
1005: void
1006: rf_UpdateUserStats(
1007: RF_Raid_t *raidPtr,
1008: int rt, /* Response time in us. */
1009: int numsect /* Number of sectors for this access. */
1010: )
1011: {
1012: raidPtr->userstats.sum_io_us += rt;
1013: raidPtr->userstats.num_ios++;
1014: raidPtr->userstats.num_sect_moved += numsect;
1015: }
1016:
1017: void
1018: rf_PrintUserStats(RF_Raid_t *raidPtr)
1019: {
1020: long elapsed_us, mbs, mbs_frac;
1021: struct timeval diff;
1022:
1023: RF_TIMEVAL_DIFF(&raidPtr->userstats.start, &raidPtr->userstats.stop,
1024: &diff);
1025: elapsed_us = RF_TIMEVAL_TO_US(diff);
1026:
1027: /* 2000 sectors per megabyte, 10000000 microseconds per second. */
1028: if (elapsed_us)
1029: mbs = (raidPtr->userstats.num_sect_moved / 2000) /
1030: (elapsed_us / 1000000);
1031: else
1032: mbs = 0;
1033:
1034: /* This computes only the first digit of the fractional mb/s moved. */
1035: if (elapsed_us) {
1036: mbs_frac = ((raidPtr->userstats.num_sect_moved / 200) /
1037: (elapsed_us / 1000000)) - (mbs * 10);
1038: } else {
1039: mbs_frac = 0;
1040: }
1041:
1042: printf("Number of I/Os: %ld\n",
1043: raidPtr->userstats.num_ios);
1044: printf("Elapsed time (us): %ld\n",
1045: elapsed_us);
1046: printf("User I/Os per second: %ld\n",
1047: RF_DB0_CHECK(raidPtr->userstats.num_ios, (elapsed_us / 1000000)));
1048: printf("Average user response time: %ld us\n",
1049: RF_DB0_CHECK(raidPtr->userstats.sum_io_us,
1050: raidPtr->userstats.num_ios));
1051: printf("Total sectors moved: %ld\n",
1052: raidPtr->userstats.num_sect_moved);
1053: printf("Average access size (sect): %ld\n",
1054: RF_DB0_CHECK(raidPtr->userstats.num_sect_moved,
1055: raidPtr->userstats.num_ios));
1056: printf("Achieved data rate: %ld.%ld MB/sec\n",
1057: mbs, mbs_frac);
1058: }
CVSweb