Annotation of sys/dev/raidframe/rf_dagutils.c, Revision 1.1.1.1
1.1 nbrk 1: /* $OpenBSD: rf_dagutils.c,v 1.4 2002/12/16 07:01:03 tdeval Exp $ */
2: /* $NetBSD: rf_dagutils.c,v 1.6 1999/12/09 02:26:09 oster Exp $ */
3:
4: /*
5: * Copyright (c) 1995 Carnegie-Mellon University.
6: * All rights reserved.
7: *
8: * Authors: Mark Holland, William V. Courtright II, Jim Zelenka
9: *
10: * Permission to use, copy, modify and distribute this software and
11: * its documentation is hereby granted, provided that both the copyright
12: * notice and this permission notice appear in all copies of the
13: * software, derivative works or modified versions, and any portions
14: * thereof, and that both notices appear in supporting documentation.
15: *
16: * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
17: * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
18: * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
19: *
20: * Carnegie Mellon requests users of this software to return to
21: *
22: * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
23: * School of Computer Science
24: * Carnegie Mellon University
25: * Pittsburgh PA 15213-3890
26: *
27: * any improvements or extensions that they make and grant Carnegie the
28: * rights to redistribute these changes.
29: */
30:
31: /*****************************************************************************
32: *
33: * rf_dagutils.c -- Utility routines for manipulating dags.
34: *
35: *****************************************************************************/
36:
37: #include "rf_archs.h"
38: #include "rf_types.h"
39: #include "rf_threadstuff.h"
40: #include "rf_raid.h"
41: #include "rf_dag.h"
42: #include "rf_dagutils.h"
43: #include "rf_dagfuncs.h"
44: #include "rf_general.h"
45: #include "rf_freelist.h"
46: #include "rf_map.h"
47: #include "rf_shutdown.h"
48:
49: #define SNUM_DIFF(_a_,_b_) (((_a_)>(_b_))?((_a_)-(_b_)):((_b_)-(_a_)))
50:
51: RF_RedFuncs_t rf_xorFuncs = {
52: rf_RegularXorFunc, "Reg Xr", rf_SimpleXorFunc, "Simple Xr"
53: };
54:
55: RF_RedFuncs_t rf_xorRecoveryFuncs = {
56: rf_RecoveryXorFunc, "Recovery Xr", rf_RecoveryXorFunc, "Recovery Xr"
57: };
58:
59: void rf_RecurPrintDAG(RF_DagNode_t *, int, int);
60: void rf_PrintDAG(RF_DagHeader_t *);
61: int rf_ValidateBranch(RF_DagNode_t *, int *, int *, RF_DagNode_t **, int);
62: void rf_ValidateBranchVisitedBits(RF_DagNode_t *, int, int);
63: void rf_ValidateVisitedBits(RF_DagHeader_t *);
64:
65: /*****************************************************************************
66: *
67: * InitNode - Initialize a dag node.
68: *
69: * The size of the propList array is always the same as that of the
70: * successors array.
71: *
72: *****************************************************************************/
73: void
74: rf_InitNode(
75: RF_DagNode_t *node,
76: RF_NodeStatus_t initstatus,
77: int commit,
78: int (*doFunc) (RF_DagNode_t *),
79: int (*undoFunc) (RF_DagNode_t *node),
80: int (*wakeFunc) (RF_DagNode_t *node, int),
81: int nSucc,
82: int nAnte,
83: int nParam,
84: int nResult,
85: RF_DagHeader_t *hdr,
86: char *name,
87: RF_AllocListElem_t *alist
88: )
89: {
90: void **ptrs;
91: int nptrs;
92:
93: if (nAnte > RF_MAX_ANTECEDENTS)
94: RF_PANIC();
95: node->status = initstatus;
96: node->commitNode = commit;
97: node->doFunc = doFunc;
98: node->undoFunc = undoFunc;
99: node->wakeFunc = wakeFunc;
100: node->numParams = nParam;
101: node->numResults = nResult;
102: node->numAntecedents = nAnte;
103: node->numAntDone = 0;
104: node->next = NULL;
105: node->numSuccedents = nSucc;
106: node->name = name;
107: node->dagHdr = hdr;
108: node->visited = 0;
109:
110: /* Allocate all the pointers with one call to malloc. */
111: nptrs = nSucc + nAnte + nResult + nSucc;
112:
113: if (nptrs <= RF_DAG_PTRCACHESIZE) {
114: /*
115: * The dag_ptrs field of the node is basically some scribble
116: * space to be used here. We could get rid of it, and always
117: * allocate the range of pointers, but that's expensive. So,
118: * we pick a "common case" size for the pointer cache.
119: * Hopefully, we'll find that:
120: * (1) Generally, nptrs doesn't exceed RF_DAG_PTRCACHESIZE by
121: * only a little bit (least efficient case).
122: * (2) Generally, ntprs isn't a lot less than
123: * RF_DAG_PTRCACHESIZE (wasted memory).
124: */
125: ptrs = (void **) node->dag_ptrs;
126: } else {
127: RF_CallocAndAdd(ptrs, nptrs, sizeof(void *), (void **), alist);
128: }
129: node->succedents = (nSucc) ? (RF_DagNode_t **) ptrs : NULL;
130: node->antecedents = (nAnte) ? (RF_DagNode_t **) (ptrs + nSucc) : NULL;
131: node->results = (nResult) ? (void **) (ptrs + nSucc + nAnte) : NULL;
132: node->propList = (nSucc) ? (RF_PropHeader_t **)
133: (ptrs + nSucc + nAnte + nResult) : NULL;
134:
135: if (nParam) {
136: if (nParam <= RF_DAG_PARAMCACHESIZE) {
137: node->params = (RF_DagParam_t *) node->dag_params;
138: } else {
139: RF_CallocAndAdd(node->params, nParam,
140: sizeof(RF_DagParam_t), (RF_DagParam_t *), alist);
141: }
142: } else {
143: node->params = NULL;
144: }
145: }
146:
147:
148:
149: /*****************************************************************************
150: *
151: * Allocation and deallocation routines.
152: *
153: *****************************************************************************/
154:
155: void
156: rf_FreeDAG(RF_DagHeader_t *dag_h)
157: {
158: RF_AccessStripeMapHeader_t *asmap, *t_asmap;
159: RF_DagHeader_t *nextDag;
160: int i;
161:
162: while (dag_h) {
163: nextDag = dag_h->next;
164: for (i = 0; dag_h->memChunk[i] && i < RF_MAXCHUNKS; i++) {
165: /* Release mem chunks. */
166: rf_ReleaseMemChunk(dag_h->memChunk[i]);
167: dag_h->memChunk[i] = NULL;
168: }
169:
170: RF_ASSERT(i == dag_h->chunkIndex);
171: if (dag_h->xtraChunkCnt > 0) {
172: /* Free xtraMemChunks. */
173: for (i = 0; dag_h->xtraMemChunk[i] &&
174: i < dag_h->xtraChunkIndex; i++) {
175: rf_ReleaseMemChunk(dag_h->xtraMemChunk[i]);
176: dag_h->xtraMemChunk[i] = NULL;
177: }
178: RF_ASSERT(i == dag_h->xtraChunkIndex);
179: /* Free ptrs to xtraMemChunks. */
180: RF_Free(dag_h->xtraMemChunk, dag_h->xtraChunkCnt *
181: sizeof(RF_ChunkDesc_t *));
182: }
183: rf_FreeAllocList(dag_h->allocList);
184: for (asmap = dag_h->asmList; asmap;) {
185: t_asmap = asmap;
186: asmap = asmap->next;
187: rf_FreeAccessStripeMap(t_asmap);
188: }
189: rf_FreeDAGHeader(dag_h);
190: dag_h = nextDag;
191: }
192: }
193:
194: RF_PropHeader_t *
195: rf_MakePropListEntry(RF_DagHeader_t *dag_h, int resultNum, int paramNum,
196: RF_PropHeader_t *next, RF_AllocListElem_t *allocList)
197: {
198: RF_PropHeader_t *p;
199:
200: RF_CallocAndAdd(p, 1, sizeof(RF_PropHeader_t), (RF_PropHeader_t *),
201: allocList);
202: p->resultNum = resultNum;
203: p->paramNum = paramNum;
204: p->next = next;
205: return (p);
206: }
207:
208: static RF_FreeList_t *rf_dagh_freelist;
209:
210: #define RF_MAX_FREE_DAGH 128
211: #define RF_DAGH_INC 16
212: #define RF_DAGH_INITIAL 32
213:
214: void rf_ShutdownDAGs(void *);
215: void
216: rf_ShutdownDAGs(void *ignored)
217: {
218: RF_FREELIST_DESTROY(rf_dagh_freelist, next, (RF_DagHeader_t *));
219: }
220:
221: int
222: rf_ConfigureDAGs(RF_ShutdownList_t **listp)
223: {
224: int rc;
225:
226: RF_FREELIST_CREATE(rf_dagh_freelist, RF_MAX_FREE_DAGH, RF_DAGH_INC,
227: sizeof(RF_DagHeader_t));
228: if (rf_dagh_freelist == NULL)
229: return (ENOMEM);
230: rc = rf_ShutdownCreate(listp, rf_ShutdownDAGs, NULL);
231: if (rc) {
232: RF_ERRORMSG3("Unable to add to shutdown list file %s line"
233: " %d rc=%d\n", __FILE__, __LINE__, rc);
234: rf_ShutdownDAGs(NULL);
235: return (rc);
236: }
237: RF_FREELIST_PRIME(rf_dagh_freelist, RF_DAGH_INITIAL, next,
238: (RF_DagHeader_t *));
239: return (0);
240: }
241:
242: RF_DagHeader_t *
243: rf_AllocDAGHeader(void)
244: {
245: RF_DagHeader_t *dh;
246:
247: RF_FREELIST_GET(rf_dagh_freelist, dh, next, (RF_DagHeader_t *));
248: if (dh) {
249: bzero((char *) dh, sizeof(RF_DagHeader_t));
250: }
251: return (dh);
252: }
253:
254: void
255: rf_FreeDAGHeader(RF_DagHeader_t *dh)
256: {
257: RF_FREELIST_FREE(rf_dagh_freelist, dh, next);
258: }
259:
260: /* Allocate a buffer big enough to hold the data described by pda. */
261: void *
262: rf_AllocBuffer(RF_Raid_t *raidPtr, RF_DagHeader_t *dag_h,
263: RF_PhysDiskAddr_t *pda, RF_AllocListElem_t *allocList)
264: {
265: char *p;
266:
267: RF_MallocAndAdd(p, pda->numSector << raidPtr->logBytesPerSector,
268: (char *), allocList);
269: return ((void *) p);
270: }
271:
272:
273: /*****************************************************************************
274: *
275: * Debug routines.
276: *
277: *****************************************************************************/
278:
279: char *
280: rf_NodeStatusString(RF_DagNode_t *node)
281: {
282: switch (node->status) {
283: case rf_wait:
284: return ("wait");
285: case rf_fired:
286: return ("fired");
287: case rf_good:
288: return ("good");
289: case rf_bad:
290: return ("bad");
291: default:
292: return ("?");
293: }
294: }
295:
296: void
297: rf_PrintNodeInfoString(RF_DagNode_t *node)
298: {
299: RF_PhysDiskAddr_t *pda;
300: int (*df) (RF_DagNode_t *) = node->doFunc;
301: int i, lk, unlk;
302: void *bufPtr;
303:
304: if ((df == rf_DiskReadFunc) || (df == rf_DiskWriteFunc) ||
305: (df == rf_DiskReadMirrorIdleFunc) ||
306: (df == rf_DiskReadMirrorPartitionFunc)) {
307: pda = (RF_PhysDiskAddr_t *) node->params[0].p;
308: bufPtr = (void *) node->params[1].p;
309: lk = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
310: unlk = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
311: RF_ASSERT(!(lk && unlk));
312: printf("r %d c %d offs %ld nsect %d buf 0x%lx %s\n", pda->row,
313: pda->col, (long) pda->startSector, (int) pda->numSector,
314: (long) bufPtr, (lk) ? "LOCK" : ((unlk) ? "UNLK" : " "));
315: return;
316: }
317: if (df == rf_DiskUnlockFunc) {
318: pda = (RF_PhysDiskAddr_t *) node->params[0].p;
319: lk = RF_EXTRACT_LOCK_FLAG(node->params[3].v);
320: unlk = RF_EXTRACT_UNLOCK_FLAG(node->params[3].v);
321: RF_ASSERT(!(lk && unlk));
322: printf("r %d c %d %s\n", pda->row, pda->col,
323: (lk) ? "LOCK" : ((unlk) ? "UNLK" : "nop"));
324: return;
325: }
326: if ((df == rf_SimpleXorFunc) || (df == rf_RegularXorFunc)
327: || (df == rf_RecoveryXorFunc)) {
328: printf("result buf 0x%lx\n", (long) node->results[0]);
329: for (i = 0; i < node->numParams - 1; i += 2) {
330: pda = (RF_PhysDiskAddr_t *) node->params[i].p;
331: bufPtr = (RF_PhysDiskAddr_t *) node->params[i + 1].p;
332: printf(" buf 0x%lx r%d c%d offs %ld nsect %d\n",
333: (long) bufPtr, pda->row, pda->col,
334: (long) pda->startSector, (int) pda->numSector);
335: }
336: return;
337: }
338: #if RF_INCLUDE_PARITYLOGGING > 0
339: if (df == rf_ParityLogOverwriteFunc || df == rf_ParityLogUpdateFunc) {
340: for (i = 0; i < node->numParams - 1; i += 2) {
341: pda = (RF_PhysDiskAddr_t *) node->params[i].p;
342: bufPtr = (RF_PhysDiskAddr_t *) node->params[i + 1].p;
343: printf(" r%d c%d offs %ld nsect %d buf 0x%lx\n",
344: pda->row, pda->col, (long) pda->startSector,
345: (int) pda->numSector, (long) bufPtr);
346: }
347: return;
348: }
349: #endif /* RF_INCLUDE_PARITYLOGGING > 0 */
350:
351: if ((df == rf_TerminateFunc) || (df == rf_NullNodeFunc)) {
352: printf("\n");
353: return;
354: }
355: printf("?\n");
356: }
357:
358: void
359: rf_RecurPrintDAG(RF_DagNode_t *node, int depth, int unvisited)
360: {
361: char *anttype;
362: int i;
363:
364: node->visited = (unvisited) ? 0 : 1;
365: printf("(%d) %d C%d %s: %s,s%d %d/%d,a%d/%d,p%d,r%d S{", depth,
366: node->nodeNum, node->commitNode, node->name,
367: rf_NodeStatusString(node), node->numSuccedents,
368: node->numSuccFired, node->numSuccDone,
369: node->numAntecedents, node->numAntDone,
370: node->numParams, node->numResults);
371: for (i = 0; i < node->numSuccedents; i++) {
372: printf("%d%s", node->succedents[i]->nodeNum,
373: ((i == node->numSuccedents - 1) ? "\0" : " "));
374: }
375: printf("} A{");
376: for (i = 0; i < node->numAntecedents; i++) {
377: switch (node->antType[i]) {
378: case rf_trueData:
379: anttype = "T";
380: break;
381: case rf_antiData:
382: anttype = "A";
383: break;
384: case rf_outputData:
385: anttype = "O";
386: break;
387: case rf_control:
388: anttype = "C";
389: break;
390: default:
391: anttype = "?";
392: break;
393: }
394: printf("%d(%s)%s", node->antecedents[i]->nodeNum, anttype,
395: (i == node->numAntecedents - 1) ? "\0" : " ");
396: }
397: printf("}; ");
398: rf_PrintNodeInfoString(node);
399: for (i = 0; i < node->numSuccedents; i++) {
400: if (node->succedents[i]->visited == unvisited)
401: rf_RecurPrintDAG(node->succedents[i], depth + 1,
402: unvisited);
403: }
404: }
405:
406: void
407: rf_PrintDAG(RF_DagHeader_t *dag_h)
408: {
409: int unvisited, i;
410: char *status;
411:
412: /* Set dag status. */
413: switch (dag_h->status) {
414: case rf_enable:
415: status = "enable";
416: break;
417: case rf_rollForward:
418: status = "rollForward";
419: break;
420: case rf_rollBackward:
421: status = "rollBackward";
422: break;
423: default:
424: status = "illegal !";
425: break;
426: }
427: /* Find out if visited bits are currently set or cleared. */
428: unvisited = dag_h->succedents[0]->visited;
429:
430: printf("DAG type: %s\n", dag_h->creator);
431: printf("format is (depth) num commit type: status,nSucc nSuccFired/n"
432: "SuccDone,nAnte/nAnteDone,nParam,nResult S{x} A{x(type)}; info\n");
433: printf("(0) %d Hdr: %s, s%d, (commit %d/%d) S{", dag_h->nodeNum,
434: status, dag_h->numSuccedents, dag_h->numCommitNodes,
435: dag_h->numCommits);
436: for (i = 0; i < dag_h->numSuccedents; i++) {
437: printf("%d%s", dag_h->succedents[i]->nodeNum,
438: ((i == dag_h->numSuccedents - 1) ? "\0" : " "));
439: }
440: printf("};\n");
441: for (i = 0; i < dag_h->numSuccedents; i++) {
442: if (dag_h->succedents[i]->visited == unvisited)
443: rf_RecurPrintDAG(dag_h->succedents[i], 1, unvisited);
444: }
445: }
446:
447: /* Assign node numbers. */
448: int
449: rf_AssignNodeNums(RF_DagHeader_t *dag_h)
450: {
451: int unvisited, i, nnum;
452: RF_DagNode_t *node;
453:
454: nnum = 0;
455: unvisited = dag_h->succedents[0]->visited;
456:
457: dag_h->nodeNum = nnum++;
458: for (i = 0; i < dag_h->numSuccedents; i++) {
459: node = dag_h->succedents[i];
460: if (node->visited == unvisited) {
461: nnum = rf_RecurAssignNodeNums(dag_h->succedents[i],
462: nnum, unvisited);
463: }
464: }
465: return (nnum);
466: }
467:
468: int
469: rf_RecurAssignNodeNums(RF_DagNode_t *node, int num, int unvisited)
470: {
471: int i;
472:
473: node->visited = (unvisited) ? 0 : 1;
474:
475: node->nodeNum = num++;
476: for (i = 0; i < node->numSuccedents; i++) {
477: if (node->succedents[i]->visited == unvisited) {
478: num = rf_RecurAssignNodeNums(node->succedents[i],
479: num, unvisited);
480: }
481: }
482: return (num);
483: }
484:
485: /* Set the header pointers in each node to "newptr". */
486: void
487: rf_ResetDAGHeaderPointers(RF_DagHeader_t *dag_h, RF_DagHeader_t *newptr)
488: {
489: int i;
490:
491: for (i = 0; i < dag_h->numSuccedents; i++)
492: if (dag_h->succedents[i]->dagHdr != newptr)
493: rf_RecurResetDAGHeaderPointers(dag_h->succedents[i],
494: newptr);
495: }
496:
497: void
498: rf_RecurResetDAGHeaderPointers(RF_DagNode_t *node, RF_DagHeader_t *newptr)
499: {
500: int i;
501:
502: node->dagHdr = newptr;
503: for (i = 0; i < node->numSuccedents; i++)
504: if (node->succedents[i]->dagHdr != newptr)
505: rf_RecurResetDAGHeaderPointers(node->succedents[i],
506: newptr);
507: }
508:
509: void
510: rf_PrintDAGList(RF_DagHeader_t *dag_h)
511: {
512: int i = 0;
513:
514: for (; dag_h; dag_h = dag_h->next) {
515: rf_AssignNodeNums(dag_h);
516: printf("\n\nDAG %d IN LIST:\n", i++);
517: rf_PrintDAG(dag_h);
518: }
519: }
520:
521: int
522: rf_ValidateBranch(RF_DagNode_t *node, int *scount, int *acount,
523: RF_DagNode_t **nodes, int unvisited)
524: {
525: int i, retcode = 0;
526:
527: /* Construct an array of node pointers indexed by node num. */
528: node->visited = (unvisited) ? 0 : 1;
529: nodes[node->nodeNum] = node;
530:
531: if (node->next != NULL) {
532: printf("INVALID DAG: next pointer in node is not NULL.\n");
533: retcode = 1;
534: }
535: if (node->status != rf_wait) {
536: printf("INVALID DAG: Node status is not wait.\n");
537: retcode = 1;
538: }
539: if (node->numAntDone != 0) {
540: printf("INVALID DAG: numAntDone is not zero.\n");
541: retcode = 1;
542: }
543: if (node->doFunc == rf_TerminateFunc) {
544: if (node->numSuccedents != 0) {
545: printf("INVALID DAG: Terminator node has"
546: " succedents.\n");
547: retcode = 1;
548: }
549: } else {
550: if (node->numSuccedents == 0) {
551: printf("INVALID DAG: Non-terminator node has no"
552: " succedents\n");
553: retcode = 1;
554: }
555: }
556: for (i = 0; i < node->numSuccedents; i++) {
557: if (!node->succedents[i]) {
558: printf("INVALID DAG: succedent %d of node %s"
559: " is NULL.\n", i, node->name);
560: retcode = 1;
561: }
562: scount[node->succedents[i]->nodeNum]++;
563: }
564: for (i = 0; i < node->numAntecedents; i++) {
565: if (!node->antecedents[i]) {
566: printf("INVALID DAG: antecedent %d of node %s is"
567: " NULL.\n", i, node->name);
568: retcode = 1;
569: }
570: acount[node->antecedents[i]->nodeNum]++;
571: }
572: for (i = 0; i < node->numSuccedents; i++) {
573: if (node->succedents[i]->visited == unvisited) {
574: if (rf_ValidateBranch(node->succedents[i], scount,
575: acount, nodes, unvisited)) {
576: retcode = 1;
577: }
578: }
579: }
580: return (retcode);
581: }
582:
583: void
584: rf_ValidateBranchVisitedBits(RF_DagNode_t *node, int unvisited, int rl)
585: {
586: int i;
587:
588: RF_ASSERT(node->visited == unvisited);
589: for (i = 0; i < node->numSuccedents; i++) {
590: if (node->succedents[i] == NULL) {
591: printf("node=%lx node->succedents[%d] is NULL.\n",
592: (long) node, i);
593: RF_ASSERT(0);
594: }
595: rf_ValidateBranchVisitedBits(node->succedents[i],
596: unvisited, rl + 1);
597: }
598: }
599:
600: /*
601: * NOTE: Never call this on a big dag, because it is exponential
602: * in execution time.
603: */
604: void
605: rf_ValidateVisitedBits(RF_DagHeader_t *dag)
606: {
607: int i, unvisited;
608:
609: unvisited = dag->succedents[0]->visited;
610:
611: for (i = 0; i < dag->numSuccedents; i++) {
612: if (dag->succedents[i] == NULL) {
613: printf("dag=%lx dag->succedents[%d] is NULL.\n",
614: (long) dag, i);
615: RF_ASSERT(0);
616: }
617: rf_ValidateBranchVisitedBits(dag->succedents[i], unvisited, 0);
618: }
619: }
620:
621: /*
622: * Validate a DAG. _at entry_ verify that:
623: * -- numNodesCompleted is zero
624: * -- node queue is null
625: * -- dag status is rf_enable
626: * -- next pointer is null on every node
627: * -- all nodes have status wait
628: * -- numAntDone is zero in all nodes
629: * -- terminator node has zero successors
630: * -- no other node besides terminator has zero successors
631: * -- no successor or antecedent pointer in a node is NULL
632: * -- number of times that each node appears as a successor of another node
633: * is equal to the antecedent count on that node
634: * -- number of times that each node appears as an antecedent of another node
635: * is equal to the succedent count on that node
636: * -- what else ?
637: */
638: int
639: rf_ValidateDAG(RF_DagHeader_t *dag_h)
640: {
641: int i, nodecount;
642: int *scount, *acount; /* Per-node successor and antecedent counts. */
643: RF_DagNode_t **nodes; /* Array of ptrs to nodes in dag. */
644: int retcode = 0;
645: int unvisited;
646: int commitNodeCount = 0;
647:
648: if (rf_validateVisitedDebug)
649: rf_ValidateVisitedBits(dag_h);
650:
651: if (dag_h->numNodesCompleted != 0) {
652: printf("INVALID DAG: num nodes completed is %d, should be 0.\n",
653: dag_h->numNodesCompleted);
654: retcode = 1;
655: goto validate_dag_bad;
656: }
657: if (dag_h->status != rf_enable) {
658: printf("INVALID DAG: not enabled.\n");
659: retcode = 1;
660: goto validate_dag_bad;
661: }
662: if (dag_h->numCommits != 0) {
663: printf("INVALID DAG: numCommits != 0 (%d)\n",
664: dag_h->numCommits);
665: retcode = 1;
666: goto validate_dag_bad;
667: }
668: if (dag_h->numSuccedents != 1) {
669: /* Currently, all dags must have only one succedent. */
670: printf("INVALID DAG: numSuccedents != 1 (%d).\n",
671: dag_h->numSuccedents);
672: retcode = 1;
673: goto validate_dag_bad;
674: }
675: nodecount = rf_AssignNodeNums(dag_h);
676:
677: unvisited = dag_h->succedents[0]->visited;
678:
679: RF_Calloc(scount, nodecount, sizeof(int), (int *));
680: RF_Calloc(acount, nodecount, sizeof(int), (int *));
681: RF_Calloc(nodes, nodecount, sizeof(RF_DagNode_t *), (RF_DagNode_t **));
682: for (i = 0; i < dag_h->numSuccedents; i++) {
683: if ((dag_h->succedents[i]->visited == unvisited)
684: && rf_ValidateBranch(dag_h->succedents[i], scount,
685: acount, nodes, unvisited)) {
686: retcode = 1;
687: }
688: }
689: /* Start at 1 to skip the header node. */
690: for (i = 1; i < nodecount; i++) {
691: if (nodes[i]->commitNode)
692: commitNodeCount++;
693: if (nodes[i]->doFunc == NULL) {
694: printf("INVALID DAG: node %s has an undefined"
695: " doFunc.\n", nodes[i]->name);
696: retcode = 1;
697: goto validate_dag_out;
698: }
699: if (nodes[i]->undoFunc == NULL) {
700: printf("INVALID DAG: node %s has an undefined"
701: " doFunc.\n", nodes[i]->name);
702: retcode = 1;
703: goto validate_dag_out;
704: }
705: if (nodes[i]->numAntecedents != scount[nodes[i]->nodeNum]) {
706: printf("INVALID DAG: node %s has %d antecedents but"
707: " appears as a succedent %d times.\n",
708: nodes[i]->name, nodes[i]->numAntecedents,
709: scount[nodes[i]->nodeNum]);
710: retcode = 1;
711: goto validate_dag_out;
712: }
713: if (nodes[i]->numSuccedents != acount[nodes[i]->nodeNum]) {
714: printf("INVALID DAG: node %s has %d succedents but"
715: " appears as an antecedent %d times.\n",
716: nodes[i]->name, nodes[i]->numSuccedents,
717: acount[nodes[i]->nodeNum]);
718: retcode = 1;
719: goto validate_dag_out;
720: }
721: }
722:
723: if (dag_h->numCommitNodes != commitNodeCount) {
724: printf("INVALID DAG: incorrect commit node count. "
725: "hdr->numCommitNodes (%d) found (%d) commit nodes"
726: " in graph.\n",
727: dag_h->numCommitNodes, commitNodeCount);
728: retcode = 1;
729: goto validate_dag_out;
730: }
731:
732: validate_dag_out:
733: RF_Free(scount, nodecount * sizeof(int));
734: RF_Free(acount, nodecount * sizeof(int));
735: RF_Free(nodes, nodecount * sizeof(RF_DagNode_t *));
736: if (retcode)
737: rf_PrintDAGList(dag_h);
738:
739: if (rf_validateVisitedDebug)
740: rf_ValidateVisitedBits(dag_h);
741:
742: return (retcode);
743:
744: validate_dag_bad:
745: rf_PrintDAGList(dag_h);
746: return (retcode);
747: }
748:
749:
750: /*****************************************************************************
751: *
752: * Misc construction routines.
753: *
754: *****************************************************************************/
755:
756: void
757: rf_redirect_asm(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap)
758: {
759: int ds = (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) ? 1 : 0;
760: int row = asmap->physInfo->row;
761: int fcol = raidPtr->reconControl[row]->fcol;
762: int srow = raidPtr->reconControl[row]->spareRow;
763: int scol = raidPtr->reconControl[row]->spareCol;
764: RF_PhysDiskAddr_t *pda;
765:
766: RF_ASSERT(raidPtr->status[row] == rf_rs_reconstructing);
767: for (pda = asmap->physInfo; pda; pda = pda->next) {
768: if (pda->col == fcol) {
769: if (rf_dagDebug) {
770: if (!rf_CheckRUReconstructed(
771: raidPtr->reconControl[row]->reconMap,
772: pda->startSector)) {
773: RF_PANIC();
774: }
775: }
776: /*printf("Remapped data for large write\n");*/
777: if (ds) {
778: raidPtr->Layout.map->MapSector(raidPtr,
779: pda->raidAddress, &pda->row, &pda->col,
780: &pda->startSector, RF_REMAP);
781: } else {
782: pda->row = srow;
783: pda->col = scol;
784: }
785: }
786: }
787: for (pda = asmap->parityInfo; pda; pda = pda->next) {
788: if (pda->col == fcol) {
789: if (rf_dagDebug) {
790: if (!rf_CheckRUReconstructed(
791: raidPtr->reconControl[row]->reconMap,
792: pda->startSector)) {
793: RF_PANIC();
794: }
795: }
796: }
797: if (ds) {
798: (raidPtr->Layout.map->MapParity) (raidPtr,
799: pda->raidAddress, &pda->row, &pda->col,
800: &pda->startSector, RF_REMAP);
801: } else {
802: pda->row = srow;
803: pda->col = scol;
804: }
805: }
806: }
807:
808:
809: /*
810: * This routine allocates read buffers and generates stripe maps for the
811: * regions of the array from the start of the stripe to the start of the
812: * access, and from the end of the access to the end of the stripe. It also
813: * computes and returns the number of DAG nodes needed to read all this data.
814: * Note that this routine does the wrong thing if the access is fully
815: * contained within one stripe unit, so we RF_ASSERT against this case at the
816: * start.
817: */
818: void
819: rf_MapUnaccessedPortionOfStripe(
820: RF_Raid_t *raidPtr,
821: RF_RaidLayout_t *layoutPtr, /* in: layout information */
822: RF_AccessStripeMap_t *asmap, /* in: access stripe map */
823: RF_DagHeader_t *dag_h, /* in: header of the dag */
824: /* to create */
825: RF_AccessStripeMapHeader_t **new_asm_h, /* in: ptr to array of 2 */
826: /* headers, to be */
827: /* filled in */
828: int *nRodNodes, /* out: num nodes to be */
829: /* generated to read */
830: /* unaccessed data */
831: char **sosBuffer, /* out: pointers to newly */
832: /* allocated buffer */
833: char **eosBuffer,
834: RF_AllocListElem_t *allocList
835: )
836: {
837: RF_RaidAddr_t sosRaidAddress, eosRaidAddress;
838: RF_SectorNum_t sosNumSector, eosNumSector;
839:
840: RF_ASSERT(asmap->numStripeUnitsAccessed > (layoutPtr->numDataCol / 2));
841: /*
842: * Generate an access map for the region of the array from start of
843: * stripe to start of access.
844: */
845: new_asm_h[0] = new_asm_h[1] = NULL;
846: *nRodNodes = 0;
847: if (!rf_RaidAddressStripeAligned(layoutPtr, asmap->raidAddress)) {
848: sosRaidAddress = rf_RaidAddressOfPrevStripeBoundary(layoutPtr,
849: asmap->raidAddress);
850: sosNumSector = asmap->raidAddress - sosRaidAddress;
851: RF_MallocAndAdd(*sosBuffer, rf_RaidAddressToByte(raidPtr,
852: sosNumSector), (char *), allocList);
853: new_asm_h[0] = rf_MapAccess(raidPtr, sosRaidAddress,
854: sosNumSector, *sosBuffer, RF_DONT_REMAP);
855: new_asm_h[0]->next = dag_h->asmList;
856: dag_h->asmList = new_asm_h[0];
857: *nRodNodes += new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
858:
859: RF_ASSERT(new_asm_h[0]->stripeMap->next == NULL);
860: /* We're totally within one stripe here. */
861: if (asmap->flags & RF_ASM_REDIR_LARGE_WRITE)
862: rf_redirect_asm(raidPtr, new_asm_h[0]->stripeMap);
863: }
864: /*
865: * Generate an access map for the region of the array from end of
866: * access to end of stripe.
867: */
868: if (!rf_RaidAddressStripeAligned(layoutPtr, asmap->endRaidAddress)) {
869: eosRaidAddress = asmap->endRaidAddress;
870: eosNumSector = rf_RaidAddressOfNextStripeBoundary(layoutPtr,
871: eosRaidAddress) - eosRaidAddress;
872: RF_MallocAndAdd(*eosBuffer, rf_RaidAddressToByte(raidPtr,
873: eosNumSector), (char *), allocList);
874: new_asm_h[1] = rf_MapAccess(raidPtr, eosRaidAddress,
875: eosNumSector, *eosBuffer, RF_DONT_REMAP);
876: new_asm_h[1]->next = dag_h->asmList;
877: dag_h->asmList = new_asm_h[1];
878: *nRodNodes += new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
879:
880: RF_ASSERT(new_asm_h[1]->stripeMap->next == NULL);
881: /* We're totally within one stripe here. */
882: if (asmap->flags & RF_ASM_REDIR_LARGE_WRITE)
883: rf_redirect_asm(raidPtr, new_asm_h[1]->stripeMap);
884: }
885: }
886:
887:
888: /* Returns non-zero if the indicated ranges of stripe unit offsets overlap. */
889: int
890: rf_PDAOverlap(RF_RaidLayout_t *layoutPtr, RF_PhysDiskAddr_t *src,
891: RF_PhysDiskAddr_t *dest)
892: {
893: RF_SectorNum_t soffs =
894: rf_StripeUnitOffset(layoutPtr, src->startSector);
895: RF_SectorNum_t doffs =
896: rf_StripeUnitOffset(layoutPtr, dest->startSector);
897: /* Use -1 to be sure we stay within SU. */
898: RF_SectorNum_t send =
899: rf_StripeUnitOffset(layoutPtr, src->startSector +
900: src->numSector - 1);
901: RF_SectorNum_t dend =
902: rf_StripeUnitOffset(layoutPtr, dest->startSector +
903: dest->numSector - 1);
904:
905: return ((RF_MAX(soffs, doffs) <= RF_MIN(send, dend)) ? 1 : 0);
906: }
907:
908:
909: /*
910: * GenerateFailedAccessASMs
911: *
912: * This routine figures out what portion of the stripe needs to be read
913: * to effect the degraded read or write operation. It's primary function
914: * is to identify everything required to recover the data, and then
915: * eliminate anything that is already being accessed by the user.
916: *
917: * The main result is two new ASMs, one for the region from the start of the
918: * stripe to the start of the access, and one for the region from the end of
919: * the access to the end of the stripe. These ASMs describe everything that
920: * needs to be read to effect the degraded access. Other results are:
921: * nXorBufs -- The total number of buffers that need to be XORed together
922: * to recover the lost data,
923: * rpBufPtr -- Ptr to a newly-allocated buffer to hold the parity. If NULL
924: * at entry, not allocated.
925: * overlappingPDAs --
926: * Describes which of the non-failed PDAs, in the user access,
927: * overlap data that needs to be read to effect recovery.
928: * overlappingPDAs[i]==1 if and only if, neglecting the failed
929: * PDA, the i'th pda in the input asm overlaps data that needs
930: * to be read for recovery.
931: */
932: /* in: asmap - ASM for the actual access, one stripe only. */
933: /* in: faildPDA - Which component of the access has failed. */
934: /* in: dag_h - Header of the DAG we're going to create. */
935: /* out: new_asm_h - The two new ASMs. */
936: /* out: nXorBufs - The total number of xor bufs required. */
937: /* out: rpBufPtr - A buffer for the parity read. */
938: void
939: rf_GenerateFailedAccessASMs(
940: RF_Raid_t *raidPtr,
941: RF_AccessStripeMap_t *asmap,
942: RF_PhysDiskAddr_t *failedPDA,
943: RF_DagHeader_t *dag_h,
944: RF_AccessStripeMapHeader_t **new_asm_h,
945: int *nXorBufs,
946: char **rpBufPtr,
947: char *overlappingPDAs,
948: RF_AllocListElem_t *allocList
949: )
950: {
951: RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
952:
953: /* s=start, e=end, s=stripe, a=access, f=failed, su=stripe unit */
954: RF_RaidAddr_t sosAddr, sosEndAddr, eosStartAddr, eosAddr;
955:
956: RF_SectorCount_t numSect[2], numParitySect;
957: RF_PhysDiskAddr_t *pda;
958: char *rdBuf, *bufP;
959: int foundit, i;
960:
961: bufP = NULL;
962: foundit = 0;
963: /*
964: * First compute the following raid addresses:
965: * - Start of stripe
966: * - (sosAddr) MIN(start of access, start of failed SU)
967: * - (sosEndAddr) MAX(end of access, end of failed SU)
968: * - (eosStartAddr) end of stripe (i.e. start of next stripe)
969: * (eosAddr)
970: */
971: sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr,
972: asmap->raidAddress);
973: sosEndAddr = RF_MIN(asmap->raidAddress,
974: rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr,
975: failedPDA->raidAddress));
976: eosStartAddr = RF_MAX(asmap->endRaidAddress,
977: rf_RaidAddressOfNextStripeUnitBoundary(layoutPtr,
978: failedPDA->raidAddress));
979: eosAddr = rf_RaidAddressOfNextStripeBoundary(layoutPtr,
980: asmap->raidAddress);
981:
982: /*
983: * Now generate access stripe maps for each of the above regions of
984: * the stripe. Use a dummy (NULL) buf ptr for now.
985: */
986:
987: new_asm_h[0] = (sosAddr != sosEndAddr) ?
988: rf_MapAccess(raidPtr, sosAddr, sosEndAddr - sosAddr, NULL,
989: RF_DONT_REMAP) : NULL;
990: new_asm_h[1] = (eosStartAddr != eosAddr) ?
991: rf_MapAccess(raidPtr, eosStartAddr, eosAddr - eosStartAddr, NULL,
992: RF_DONT_REMAP) : NULL;
993:
994: /*
995: * Walk through the PDAs and range-restrict each SU to the region of
996: * the SU touched on the failed PDA. Also compute total data buffer
997: * space requirements in this step. Ignore the parity for now.
998: */
999:
1000: numSect[0] = numSect[1] = 0;
1001: if (new_asm_h[0]) {
1002: new_asm_h[0]->next = dag_h->asmList;
1003: dag_h->asmList = new_asm_h[0];
1004: for (pda = new_asm_h[0]->stripeMap->physInfo; pda;
1005: pda = pda->next) {
1006: rf_RangeRestrictPDA(raidPtr, failedPDA, pda,
1007: RF_RESTRICT_NOBUFFER, 0);
1008: numSect[0] += pda->numSector;
1009: }
1010: }
1011: if (new_asm_h[1]) {
1012: new_asm_h[1]->next = dag_h->asmList;
1013: dag_h->asmList = new_asm_h[1];
1014: for (pda = new_asm_h[1]->stripeMap->physInfo;
1015: pda; pda = pda->next) {
1016: rf_RangeRestrictPDA(raidPtr, failedPDA, pda,
1017: RF_RESTRICT_NOBUFFER, 0);
1018: numSect[1] += pda->numSector;
1019: }
1020: }
1021: numParitySect = failedPDA->numSector;
1022:
1023: /*
1024: * Allocate buffer space for the data & parity we have to read to
1025: * recover from the failure.
1026: */
1027:
1028: if (numSect[0] + numSect[1] + ((rpBufPtr) ? numParitySect : 0)) {
1029: /* Don't allocate parity buf if not needed. */
1030: RF_MallocAndAdd(rdBuf, rf_RaidAddressToByte(raidPtr,
1031: numSect[0] + numSect[1] + numParitySect), (char *),
1032: allocList);
1033: bufP = rdBuf;
1034: if (rf_degDagDebug)
1035: printf("Newly allocated buffer (%d bytes) is 0x%lx\n",
1036: (int) rf_RaidAddressToByte(raidPtr,
1037: numSect[0] + numSect[1] + numParitySect),
1038: (unsigned long) bufP);
1039: }
1040: /*
1041: * Now walk through the pdas one last time and assign buffer pointers
1042: * (ugh!). Again, ignore the parity. Also, count nodes to find out
1043: * how many bufs need to be xored together.
1044: */
1045: (*nXorBufs) = 1; /* In read case, 1 is for parity. */
1046: /* In write case, 1 is for failed data. */
1047: if (new_asm_h[0]) {
1048: for (pda = new_asm_h[0]->stripeMap->physInfo; pda;
1049: pda = pda->next) {
1050: pda->bufPtr = bufP;
1051: bufP += rf_RaidAddressToByte(raidPtr, pda->numSector);
1052: }
1053: *nXorBufs += new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
1054: }
1055: if (new_asm_h[1]) {
1056: for (pda = new_asm_h[1]->stripeMap->physInfo; pda;
1057: pda = pda->next) {
1058: pda->bufPtr = bufP;
1059: bufP += rf_RaidAddressToByte(raidPtr, pda->numSector);
1060: }
1061: (*nXorBufs) += new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
1062: }
1063: if (rpBufPtr)
1064: /* The rest of the buffer is for parity. */
1065: *rpBufPtr = bufP;
1066:
1067: /*
1068: * The last step is to figure out how many more distinct buffers need
1069: * to get xor'd to produce the missing unit. there's one for each
1070: * user-data read node that overlaps the portion of the failed unit
1071: * being accessed.
1072: */
1073:
1074: for (foundit = i = 0, pda = asmap->physInfo;
1075: pda; i++, pda = pda->next) {
1076: if (pda == failedPDA) {
1077: i--;
1078: foundit = 1;
1079: continue;
1080: }
1081: if (rf_PDAOverlap(layoutPtr, pda, failedPDA)) {
1082: overlappingPDAs[i] = 1;
1083: (*nXorBufs)++;
1084: }
1085: }
1086: if (!foundit) {
1087: RF_ERRORMSG("GenerateFailedAccessASMs: did not find failedPDA"
1088: " in asm list.\n");
1089: RF_ASSERT(0);
1090: }
1091: if (rf_degDagDebug) {
1092: if (new_asm_h[0]) {
1093: printf("First asm:\n");
1094: rf_PrintFullAccessStripeMap(new_asm_h[0], 1);
1095: }
1096: if (new_asm_h[1]) {
1097: printf("Second asm:\n");
1098: rf_PrintFullAccessStripeMap(new_asm_h[1], 1);
1099: }
1100: }
1101: }
1102:
1103:
1104: /*
1105: * Adjust the offset and number of sectors in the destination pda so that
1106: * it covers at most the region of the SU covered by the source PDA. This
1107: * is exclusively a restriction: the number of sectors indicated by the
1108: * target PDA can only shrink.
1109: *
1110: * For example: s = sectors within SU indicated by source PDA
1111: * d = sectors within SU indicated by dest PDA
1112: * r = results, stored in dest PDA
1113: *
1114: * |--------------- one stripe unit ---------------------|
1115: * | sssssssssssssssssssssssssssssssss |
1116: * | ddddddddddddddddddddddddddddddddddddddddddddd |
1117: * | rrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrrr |
1118: *
1119: * Another example:
1120: *
1121: * |--------------- one stripe unit ---------------------|
1122: * | sssssssssssssssssssssssssssssssss |
1123: * | ddddddddddddddddddddddd |
1124: * | rrrrrrrrrrrrrrrr |
1125: *
1126: */
1127: void
1128: rf_RangeRestrictPDA(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *src,
1129: RF_PhysDiskAddr_t *dest, int dobuffer, int doraidaddr)
1130: {
1131: RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
1132: RF_SectorNum_t soffs =
1133: rf_StripeUnitOffset(layoutPtr, src->startSector);
1134: RF_SectorNum_t doffs =
1135: rf_StripeUnitOffset(layoutPtr, dest->startSector);
1136: RF_SectorNum_t send =
1137: rf_StripeUnitOffset(layoutPtr, src->startSector +
1138: src->numSector - 1); /* Use -1 to be sure we stay within SU. */
1139: RF_SectorNum_t dend =
1140: rf_StripeUnitOffset(layoutPtr, dest->startSector +
1141: dest->numSector - 1);
1142: RF_SectorNum_t subAddr =
1143: rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr,
1144: dest->startSector); /* Stripe unit boundary. */
1145:
1146: dest->startSector = subAddr + RF_MAX(soffs, doffs);
1147: dest->numSector = subAddr + RF_MIN(send, dend) + 1 - dest->startSector;
1148:
1149: if (dobuffer)
1150: dest->bufPtr += (soffs > doffs) ?
1151: rf_RaidAddressToByte(raidPtr, soffs - doffs) : 0;
1152: if (doraidaddr) {
1153: dest->raidAddress =
1154: rf_RaidAddressOfPrevStripeUnitBoundary(layoutPtr,
1155: dest->raidAddress) +
1156: rf_StripeUnitOffset(layoutPtr, dest->startSector);
1157: }
1158: }
1159:
1160: /*
1161: * Want the highest of these primes to be the largest one
1162: * less than the max expected number of columns (won't hurt
1163: * to be too small or too large, but won't be optimal, either)
1164: * --jimz
1165: */
1166: #define NLOWPRIMES 8
1167: static int lowprimes[NLOWPRIMES] = {2, 3, 5, 7, 11, 13, 17, 19};
1168:
1169: /*****************************************************************************
1170: * Compute the workload shift factor. (chained declustering)
1171: *
1172: * Return nonzero if access should shift to secondary, otherwise,
1173: * access is to primary.
1174: *****************************************************************************/
1175: int
1176: rf_compute_workload_shift(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda)
1177: {
1178: /*
1179: * Variables:
1180: * d = Column of disk containing primary.
1181: * f = Column of failed disk.
1182: * n = Number of disks in array.
1183: * sd = "shift distance"
1184: * (number of columns that d is to the right of f).
1185: * row = Row of array the access is in.
1186: * v = Numerator of redirection ratio.
1187: * k = Denominator of redirection ratio.
1188: */
1189: RF_RowCol_t d, f, sd, row, n;
1190: int k, v, ret, i;
1191:
1192: row = pda->row;
1193: n = raidPtr->numCol;
1194:
1195: /* Assign column of primary copy to d. */
1196: d = pda->col;
1197:
1198: /* Assign column of dead disk to f. */
1199: for (f = 0; ((!RF_DEAD_DISK(raidPtr->Disks[row][f].status)) &&
1200: (f < n)); f++);
1201:
1202: RF_ASSERT(f < n);
1203: RF_ASSERT(f != d);
1204:
1205: sd = (f > d) ? (n + d - f) : (d - f);
1206: RF_ASSERT(sd < n);
1207:
1208: /*
1209: * v of every k accesses should be redirected.
1210: *
1211: * v/k := (n-1-sd)/(n-1)
1212: */
1213: v = (n - 1 - sd);
1214: k = (n - 1);
1215:
1216: #if 1
1217: /*
1218: * XXX
1219: * Is this worth it ?
1220: *
1221: * Now reduce the fraction, by repeatedly factoring
1222: * out primes (just like they teach in elementary school !).
1223: */
1224: for (i = 0; i < NLOWPRIMES; i++) {
1225: if (lowprimes[i] > v)
1226: break;
1227: while (((v % lowprimes[i]) == 0) && ((k % lowprimes[i]) == 0)) {
1228: v /= lowprimes[i];
1229: k /= lowprimes[i];
1230: }
1231: }
1232: #endif
1233:
1234: raidPtr->hist_diskreq[row][d]++;
1235: if (raidPtr->hist_diskreq[row][d] > v) {
1236: ret = 0; /* Do not redirect. */
1237: } else {
1238: ret = 1; /* Redirect. */
1239: }
1240:
1241: #if 0
1242: printf("d=%d f=%d sd=%d v=%d k=%d ret=%d h=%d\n", d, f, sd, v, k, ret,
1243: raidPtr->hist_diskreq[row][d]);
1244: #endif
1245:
1246: if (raidPtr->hist_diskreq[row][d] >= k) {
1247: /* Reset counter. */
1248: raidPtr->hist_diskreq[row][d] = 0;
1249: }
1250: return (ret);
1251: }
1252:
1253: /*
1254: * Disk selection routines.
1255: */
1256:
1257: /*
1258: * Select the disk with the shortest queue from a mirror pair.
1259: * Both the disk I/Os queued in RAIDframe as well as those at the physical
1260: * disk are counted as members of the "queue".
1261: */
1262: void
1263: rf_SelectMirrorDiskIdle(RF_DagNode_t *node)
1264: {
1265: RF_Raid_t *raidPtr = (RF_Raid_t *) node->dagHdr->raidPtr;
1266: RF_RowCol_t rowData, colData, rowMirror, colMirror;
1267: int dataQueueLength, mirrorQueueLength, usemirror;
1268: RF_PhysDiskAddr_t *data_pda = (RF_PhysDiskAddr_t *) node->params[0].p;
1269: RF_PhysDiskAddr_t *mirror_pda = (RF_PhysDiskAddr_t *) node->params[4].p;
1270: RF_PhysDiskAddr_t *tmp_pda;
1271: RF_RaidDisk_t **disks = raidPtr->Disks;
1272: RF_DiskQueue_t **dqs = raidPtr->Queues, *dataQueue, *mirrorQueue;
1273:
1274: /* Return the [row col] of the disk with the shortest queue. */
1275: rowData = data_pda->row;
1276: colData = data_pda->col;
1277: rowMirror = mirror_pda->row;
1278: colMirror = mirror_pda->col;
1279: dataQueue = &(dqs[rowData][colData]);
1280: mirrorQueue = &(dqs[rowMirror][colMirror]);
1281:
1282: #ifdef RF_LOCK_QUEUES_TO_READ_LEN
1283: RF_LOCK_QUEUE_MUTEX(dataQueue, "SelectMirrorDiskIdle");
1284: #endif /* RF_LOCK_QUEUES_TO_READ_LEN */
1285: dataQueueLength = dataQueue->queueLength + dataQueue->numOutstanding;
1286: #ifdef RF_LOCK_QUEUES_TO_READ_LEN
1287: RF_UNLOCK_QUEUE_MUTEX(dataQueue, "SelectMirrorDiskIdle");
1288: RF_LOCK_QUEUE_MUTEX(mirrorQueue, "SelectMirrorDiskIdle");
1289: #endif /* RF_LOCK_QUEUES_TO_READ_LEN */
1290: mirrorQueueLength = mirrorQueue->queueLength +
1291: mirrorQueue->numOutstanding;
1292: #ifdef RF_LOCK_QUEUES_TO_READ_LEN
1293: RF_UNLOCK_QUEUE_MUTEX(mirrorQueue, "SelectMirrorDiskIdle");
1294: #endif /* RF_LOCK_QUEUES_TO_READ_LEN */
1295:
1296: usemirror = 0;
1297: if (RF_DEAD_DISK(disks[rowMirror][colMirror].status)) {
1298: usemirror = 0;
1299: } else
1300: if (RF_DEAD_DISK(disks[rowData][colData].status)) {
1301: usemirror = 1;
1302: } else
1303: if (raidPtr->parity_good == RF_RAID_DIRTY) {
1304: /* Trust only the main disk. */
1305: usemirror = 0;
1306: } else
1307: if (dataQueueLength < mirrorQueueLength) {
1308: usemirror = 0;
1309: } else
1310: if (mirrorQueueLength < dataQueueLength) {
1311: usemirror = 1;
1312: } else {
1313: /* Queues are equal length. */
1314: /* Attempt cleverness. */
1315: if (SNUM_DIFF(dataQueue
1316: ->last_deq_sector, data_pda
1317: ->startSector) <=
1318: SNUM_DIFF(mirrorQueue
1319: ->last_deq_sector, mirror_pda
1320: ->startSector)) {
1321: usemirror = 0;
1322: } else {
1323: usemirror = 1;
1324: }
1325: }
1326:
1327: if (usemirror) {
1328: /* Use mirror (parity) disk, swap params 0 & 4. */
1329: tmp_pda = data_pda;
1330: node->params[0].p = mirror_pda;
1331: node->params[4].p = tmp_pda;
1332: } else {
1333: /* Use data disk, leave param 0 unchanged. */
1334: }
1335: /*printf("dataQueueLength %d, mirrorQueueLength %d\n", dataQueueLength,
1336: mirrorQueueLength);*/
1337: }
1338:
1339: /*
1340: * Do simple partitioning. This assumes that
1341: * the data and parity disks are laid out identically.
1342: */
1343: void
1344: rf_SelectMirrorDiskPartition(RF_DagNode_t *node)
1345: {
1346: RF_Raid_t *raidPtr = (RF_Raid_t *) node->dagHdr->raidPtr;
1347: RF_RowCol_t rowData, colData, rowMirror, colMirror;
1348: RF_PhysDiskAddr_t *data_pda = (RF_PhysDiskAddr_t *) node->params[0].p;
1349: RF_PhysDiskAddr_t *mirror_pda = (RF_PhysDiskAddr_t *) node->params[4].p;
1350: RF_PhysDiskAddr_t *tmp_pda;
1351: RF_RaidDisk_t **disks = raidPtr->Disks;
1352: RF_DiskQueue_t **dqs = raidPtr->Queues, *dataQueue, *mirrorQueue;
1353: int usemirror;
1354:
1355: /* Return the [row col] of the disk with the shortest queue. */
1356: rowData = data_pda->row;
1357: colData = data_pda->col;
1358: rowMirror = mirror_pda->row;
1359: colMirror = mirror_pda->col;
1360: dataQueue = &(dqs[rowData][colData]);
1361: mirrorQueue = &(dqs[rowMirror][colMirror]);
1362:
1363: usemirror = 0;
1364: if (RF_DEAD_DISK(disks[rowMirror][colMirror].status)) {
1365: usemirror = 0;
1366: } else
1367: if (RF_DEAD_DISK(disks[rowData][colData].status)) {
1368: usemirror = 1;
1369: } else
1370: if (raidPtr->parity_good == RF_RAID_DIRTY) {
1371: /* Trust only the main disk. */
1372: usemirror = 0;
1373: } else
1374: if (data_pda->startSector <
1375: (disks[rowData][colData].numBlocks / 2)) {
1376: usemirror = 0;
1377: } else {
1378: usemirror = 1;
1379: }
1380:
1381: if (usemirror) {
1382: /* Use mirror (parity) disk, swap params 0 & 4. */
1383: tmp_pda = data_pda;
1384: node->params[0].p = mirror_pda;
1385: node->params[4].p = tmp_pda;
1386: } else {
1387: /* Use data disk, leave param 0 unchanged. */
1388: }
1389: }
CVSweb