Annotation of sys/dev/raidframe/rf_dagdegrd.c, Revision 1.1.1.1
1.1 nbrk 1: /* $OpenBSD: rf_dagdegrd.c,v 1.6 2006/07/09 22:10:05 mk Exp $ */
2: /* $NetBSD: rf_dagdegrd.c,v 1.5 2000/01/07 03:40:57 oster Exp $ */
3:
4: /*
5: * Copyright (c) 1995 Carnegie-Mellon University.
6: * All rights reserved.
7: *
8: * Author: Mark Holland, Daniel Stodolsky, William V. Courtright II
9: *
10: * Permission to use, copy, modify and distribute this software and
11: * its documentation is hereby granted, provided that both the copyright
12: * notice and this permission notice appear in all copies of the
13: * software, derivative works or modified versions, and any portions
14: * thereof, and that both notices appear in supporting documentation.
15: *
16: * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
17: * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
18: * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
19: *
20: * Carnegie Mellon requests users of this software to return to
21: *
22: * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
23: * School of Computer Science
24: * Carnegie Mellon University
25: * Pittsburgh PA 15213-3890
26: *
27: * any improvements or extensions that they make and grant Carnegie the
28: * rights to redistribute these changes.
29: */
30:
31: /*
32: * rf_dagdegrd.c
33: *
34: * Code for creating degraded read DAGs.
35: */
36:
37: #include "rf_types.h"
38: #include "rf_raid.h"
39: #include "rf_dag.h"
40: #include "rf_dagutils.h"
41: #include "rf_dagfuncs.h"
42: #include "rf_debugMem.h"
43: #include "rf_memchunk.h"
44: #include "rf_general.h"
45: #include "rf_dagdegrd.h"
46:
47:
48: /*****************************************************************************
49: *
50: * General comments on DAG creation:
51: *
52: * All DAGs in this file use roll-away error recovery. Each DAG has a single
53: * commit node, usually called "Cmt". If an error occurs before the Cmt node
54: * is reached, the execution engine will halt forward execution and work
55: * backward through the graph, executing the undo functions. Assuming that
56: * each node in the graph prior to the Cmt node is undoable and atomic - or -
57: * does not make changes to permanent state, the graph will fail atomically.
58: * If an error occurs after the Cmt node executes, the engine will roll-forward
59: * through the graph, blindly executing nodes until it reaches the end.
60: * If a graph reaches the end, it is assumed to have completed successfully.
61: *
62: * A graph has only 1 Cmt node.
63: *
64: *****************************************************************************/
65:
66:
67: /*****************************************************************************
68: *
69: * The following wrappers map the standard DAG creation interface to the
70: * DAG creation routines. Additionally, these wrappers enable experimentation
71: * with new DAG structures by providing an extra level of indirection, allowing
72: * the DAG creation routines to be replaced at this single point.
73: *
74: *****************************************************************************/
75:
76: void
77: rf_CreateRaidFiveDegradedReadDAG(
78: RF_Raid_t *raidPtr,
79: RF_AccessStripeMap_t *asmap,
80: RF_DagHeader_t *dag_h,
81: void *bp,
82: RF_RaidAccessFlags_t flags,
83: RF_AllocListElem_t *allocList)
84: {
85: rf_CreateDegradedReadDAG(raidPtr, asmap, dag_h, bp, flags, allocList,
86: &rf_xorRecoveryFuncs);
87: }
88:
89:
90: /*****************************************************************************
91: *
92: * DAG creation code begins here.
93: *
94: *****************************************************************************/
95:
96:
97: /*****************************************************************************
98: * Create a degraded read DAG for RAID level 1.
99: *
100: * Hdr -> Nil -> R(p/s)d -> Commit -> Trm
101: *
102: * The "Rd" node reads data from the surviving disk in the mirror pair.
103: * Rpd - read of primary copy
104: * Rsd - read of secondary copy
105: *
106: * Parameters: raidPtr - description of the physical array
107: * asmap - logical & physical addresses for this access
108: * bp - buffer ptr (for holding write data)
109: * flags - general flags (e.g. disk locking)
110: * allocList - list of memory allocated in DAG creation
111: *****************************************************************************/
112:
113: void
114: rf_CreateRaidOneDegradedReadDAG(
115: RF_Raid_t *raidPtr,
116: RF_AccessStripeMap_t *asmap,
117: RF_DagHeader_t *dag_h,
118: void *bp,
119: RF_RaidAccessFlags_t flags,
120: RF_AllocListElem_t *allocList)
121: {
122: RF_DagNode_t *nodes, *rdNode, *blockNode, *commitNode, *termNode;
123: RF_StripeNum_t parityStripeID;
124: RF_ReconUnitNum_t which_ru;
125: RF_PhysDiskAddr_t *pda;
126: int useMirror, i;
127:
128: useMirror = 0;
129: parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout),
130: asmap->raidAddress, &which_ru);
131: if (rf_dagDebug) {
132: printf("[Creating RAID level 1 degraded read DAG]\n");
133: }
134: dag_h->creator = "RaidOneDegradedReadDAG";
135: /* Alloc the Wnd nodes and the Wmir node. */
136: if (asmap->numDataFailed == 0)
137: useMirror = RF_FALSE;
138: else
139: useMirror = RF_TRUE;
140:
141: /* Total number of nodes = 1 + (block + commit + terminator). */
142: RF_CallocAndAdd(nodes, 4, sizeof(RF_DagNode_t), (RF_DagNode_t *),
143: allocList);
144: i = 0;
145: rdNode = &nodes[i];
146: i++;
147: blockNode = &nodes[i];
148: i++;
149: commitNode = &nodes[i];
150: i++;
151: termNode = &nodes[i];
152: i++;
153:
154: /*
155: * This dag can not commit until the commit node is reached. Errors
156: * prior to the commit point imply the dag has failed and must be
157: * retried.
158: */
159: dag_h->numCommitNodes = 1;
160: dag_h->numCommits = 0;
161: dag_h->numSuccedents = 1;
162:
163: /* Initialize the block, commit, and terminator nodes. */
164: rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
165: rf_NullNodeUndoFunc, NULL, 1, 0, 0, 0, dag_h, "Nil", allocList);
166: rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc,
167: rf_NullNodeUndoFunc, NULL, 1, 1, 0, 0, dag_h, "Cmt", allocList);
168: rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc,
169: rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
170:
171: pda = asmap->physInfo;
172: RF_ASSERT(pda != NULL);
173: /* parityInfo must describe entire parity unit. */
174: RF_ASSERT(asmap->parityInfo->next == NULL);
175:
176: /* Initialize the data node. */
177: if (!useMirror) {
178: /* Read primary copy of data. */
179: rf_InitNode(rdNode, rf_wait, RF_FALSE, rf_DiskReadFunc,
180: rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0,
181: dag_h, "Rpd", allocList);
182: rdNode->params[0].p = pda;
183: rdNode->params[1].p = pda->bufPtr;
184: rdNode->params[2].v = parityStripeID;
185: rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
186: 0, 0, which_ru);
187: } else {
188: /* Read secondary copy of data. */
189: rf_InitNode(rdNode, rf_wait, RF_FALSE, rf_DiskReadFunc,
190: rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0,
191: dag_h, "Rsd", allocList);
192: rdNode->params[0].p = asmap->parityInfo;
193: rdNode->params[1].p = pda->bufPtr;
194: rdNode->params[2].v = parityStripeID;
195: rdNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,
196: 0, 0, which_ru);
197: }
198:
199: /* Connect header to block node. */
200: RF_ASSERT(dag_h->numSuccedents == 1);
201: RF_ASSERT(blockNode->numAntecedents == 0);
202: dag_h->succedents[0] = blockNode;
203:
204: /* Connect block node to rdnode. */
205: RF_ASSERT(blockNode->numSuccedents == 1);
206: RF_ASSERT(rdNode->numAntecedents == 1);
207: blockNode->succedents[0] = rdNode;
208: rdNode->antecedents[0] = blockNode;
209: rdNode->antType[0] = rf_control;
210:
211: /* Connect rdnode to commit node. */
212: RF_ASSERT(rdNode->numSuccedents == 1);
213: RF_ASSERT(commitNode->numAntecedents == 1);
214: rdNode->succedents[0] = commitNode;
215: commitNode->antecedents[0] = rdNode;
216: commitNode->antType[0] = rf_control;
217:
218: /* Connect commit node to terminator. */
219: RF_ASSERT(commitNode->numSuccedents == 1);
220: RF_ASSERT(termNode->numAntecedents == 1);
221: RF_ASSERT(termNode->numSuccedents == 0);
222: commitNode->succedents[0] = termNode;
223: termNode->antecedents[0] = commitNode;
224: termNode->antType[0] = rf_control;
225: }
226:
227:
228: /*****************************************************************************
229: *
230: * Create a DAG to perform a degraded-mode read of data within one stripe.
231: * This DAG is as follows:
232: *
233: * Hdr -> Block -> Rud -> Xor -> Cmt -> T
234: * -> Rrd ->
235: * -> Rp -->
236: *
237: * Each R node is a successor of the Block node.
238: * Each R node has a single successor arc, which goes to X.
239: * There is one Rud for each chunk of surviving user data requested by the
240: * user, and one Rrd for each chunk of surviving user data _not_ being read by
241: * the user.
242: * R = read, ud = user data, rd = recovery (surviving) data, p = parity
243: * X = XOR, C = Commit, T = terminate
244: *
245: * The block node guarantees a single source node.
246: *
247: * Note: The target buffer for the XOR node is set to the actual user buffer
248: * where the failed data is supposed to end up. This buffer is zero'd by the
249: * code here. Thus, if you create a degraded read dag, use it, and then
250: * re-use, you have to be sure to zero the target buffer prior to the re-use.
251: *
252: * The recfunc argument at the end specifies the name and function used for
253: * the redundancy recovery function.
254: *
255: *****************************************************************************/
256:
257: void
258: rf_CreateDegradedReadDAG(
259: RF_Raid_t *raidPtr,
260: RF_AccessStripeMap_t *asmap,
261: RF_DagHeader_t *dag_h,
262: void *bp,
263: RF_RaidAccessFlags_t flags,
264: RF_AllocListElem_t *allocList,
265: RF_RedFuncs_t *recFunc)
266: {
267: RF_DagNode_t *nodes, *rudNodes, *rrdNodes, *xorNode, *blockNode;
268: RF_DagNode_t *commitNode, *rpNode, *termNode;
269: int nNodes, nRrdNodes, nRudNodes, nXorBufs, i;
270: int j, paramNum;
271: RF_SectorCount_t sectorsPerSU;
272: RF_ReconUnitNum_t which_ru;
273: char *overlappingPDAs; /* A temporary array of flags. */
274: RF_AccessStripeMapHeader_t *new_asm_h[2];
275: RF_PhysDiskAddr_t *pda, *parityPDA;
276: RF_StripeNum_t parityStripeID;
277: RF_PhysDiskAddr_t *failedPDA;
278: RF_RaidLayout_t *layoutPtr;
279: char *rpBuf;
280:
281: layoutPtr = &(raidPtr->Layout);
282: /*
283: * failedPDA points to the pda within the asm that targets
284: * the failed disk.
285: */
286: failedPDA = asmap->failedPDAs[0];
287: parityStripeID = rf_RaidAddressToParityStripeID(layoutPtr,
288: asmap->raidAddress, &which_ru);
289: sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
290:
291: if (rf_dagDebug) {
292: printf("[Creating degraded read DAG]\n");
293: }
294: RF_ASSERT(asmap->numDataFailed == 1);
295: dag_h->creator = "DegradedReadDAG";
296:
297: /*
298: * Generate two ASMs identifying the surviving data we need
299: * in order to recover the lost data.
300: */
301:
302: /* overlappingPDAs array must be zero'd. */
303: RF_Calloc(overlappingPDAs, asmap->numStripeUnitsAccessed,
304: sizeof(char), (char *));
305: rf_GenerateFailedAccessASMs(raidPtr, asmap, failedPDA, dag_h,
306: new_asm_h, &nXorBufs, &rpBuf, overlappingPDAs, allocList);
307:
308: /*
309: * Create all the nodes at once.
310: *
311: * -1 because no access is generated for the failed pda.
312: */
313: nRudNodes = asmap->numStripeUnitsAccessed - 1;
314: nRrdNodes = ((new_asm_h[0]) ?
315: new_asm_h[0]->stripeMap->numStripeUnitsAccessed : 0) +
316: ((new_asm_h[1]) ?
317: new_asm_h[1]->stripeMap->numStripeUnitsAccessed : 0);
318: nNodes = 5 + nRudNodes + nRrdNodes; /*
319: * lock, unlock, xor, Rp,
320: * Rud, Rrd
321: */
322: RF_CallocAndAdd(nodes, nNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *),
323: allocList);
324: i = 0;
325: blockNode = &nodes[i];
326: i++;
327: commitNode = &nodes[i];
328: i++;
329: xorNode = &nodes[i];
330: i++;
331: rpNode = &nodes[i];
332: i++;
333: termNode = &nodes[i];
334: i++;
335: rudNodes = &nodes[i];
336: i += nRudNodes;
337: rrdNodes = &nodes[i];
338: i += nRrdNodes;
339: RF_ASSERT(i == nNodes);
340:
341: /* Initialize nodes. */
342: dag_h->numCommitNodes = 1;
343: dag_h->numCommits = 0;
344: /*
345: * This dag can not commit until the commit node is reached.
346: * Errors prior to the commit point imply the dag has failed.
347: */
348: dag_h->numSuccedents = 1;
349:
350: rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
351: rf_NullNodeUndoFunc, NULL, nRudNodes + nRrdNodes + 1, 0, 0, 0,
352: dag_h, "Nil", allocList);
353: rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc,
354: rf_NullNodeUndoFunc, NULL, 1, 1, 0, 0, dag_h, "Cmt", allocList);
355: rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc,
356: rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
357: rf_InitNode(xorNode, rf_wait, RF_FALSE, recFunc->simple,
358: rf_NullNodeUndoFunc, NULL, 1, nRudNodes + nRrdNodes + 1,
359: 2 * nXorBufs + 2, 1, dag_h, recFunc->SimpleName, allocList);
360:
361: /* Fill in the Rud nodes. */
362: for (pda = asmap->physInfo, i = 0; i < nRudNodes;
363: i++, pda = pda->next) {
364: if (pda == failedPDA) {
365: i--;
366: continue;
367: }
368: rf_InitNode(&rudNodes[i], rf_wait, RF_FALSE, rf_DiskReadFunc,
369: rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0,
370: dag_h, "Rud", allocList);
371: RF_ASSERT(pda);
372: rudNodes[i].params[0].p = pda;
373: rudNodes[i].params[1].p = pda->bufPtr;
374: rudNodes[i].params[2].v = parityStripeID;
375: rudNodes[i].params[3].v =
376: RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
377: }
378:
379: /* Fill in the Rrd nodes. */
380: i = 0;
381: if (new_asm_h[0]) {
382: for (pda = new_asm_h[0]->stripeMap->physInfo;
383: i < new_asm_h[0]->stripeMap->numStripeUnitsAccessed;
384: i++, pda = pda->next) {
385: rf_InitNode(&rrdNodes[i], rf_wait, RF_FALSE,
386: rf_DiskReadFunc, rf_DiskReadUndoFunc,
387: rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
388: "Rrd", allocList);
389: RF_ASSERT(pda);
390: rrdNodes[i].params[0].p = pda;
391: rrdNodes[i].params[1].p = pda->bufPtr;
392: rrdNodes[i].params[2].v = parityStripeID;
393: rrdNodes[i].params[3].v =
394: RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0,
395: which_ru);
396: }
397: }
398: if (new_asm_h[1]) {
399: for (j = 0, pda = new_asm_h[1]->stripeMap->physInfo;
400: j < new_asm_h[1]->stripeMap->numStripeUnitsAccessed;
401: j++, pda = pda->next) {
402: rf_InitNode(&rrdNodes[i + j], rf_wait, RF_FALSE,
403: rf_DiskReadFunc, rf_DiskReadUndoFunc,
404: rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
405: "Rrd", allocList);
406: RF_ASSERT(pda);
407: rrdNodes[i + j].params[0].p = pda;
408: rrdNodes[i + j].params[1].p = pda->bufPtr;
409: rrdNodes[i + j].params[2].v = parityStripeID;
410: rrdNodes[i + j].params[3].v =
411: RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0,
412: which_ru);
413: }
414: }
415: /* Make a PDA for the parity unit. */
416: RF_MallocAndAdd(parityPDA, sizeof(RF_PhysDiskAddr_t),
417: (RF_PhysDiskAddr_t *), allocList);
418: parityPDA->row = asmap->parityInfo->row;
419: parityPDA->col = asmap->parityInfo->col;
420: parityPDA->startSector = ((asmap->parityInfo->startSector /
421: sectorsPerSU) * sectorsPerSU) +
422: (failedPDA->startSector % sectorsPerSU);
423: parityPDA->numSector = failedPDA->numSector;
424:
425: /* Initialize the Rp node. */
426: rf_InitNode(rpNode, rf_wait, RF_FALSE, rf_DiskReadFunc,
427: rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0, dag_h,
428: "Rp ", allocList);
429: rpNode->params[0].p = parityPDA;
430: rpNode->params[1].p = rpBuf;
431: rpNode->params[2].v = parityStripeID;
432: rpNode->params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0,
433: which_ru);
434:
435: /*
436: * The last and nastiest step is to assign all
437: * the parameters of the Xor node.
438: */
439: paramNum = 0;
440: for (i = 0; i < nRrdNodes; i++) {
441: /* All the Rrd nodes need to be xored together. */
442: xorNode->params[paramNum++] = rrdNodes[i].params[0];
443: xorNode->params[paramNum++] = rrdNodes[i].params[1];
444: }
445: for (i = 0; i < nRudNodes; i++) {
446: /* Any Rud nodes that overlap the failed access need to be
447: * xored in. */
448: if (overlappingPDAs[i]) {
449: RF_MallocAndAdd(pda, sizeof(RF_PhysDiskAddr_t),
450: (RF_PhysDiskAddr_t *), allocList);
451: bcopy((char *) rudNodes[i].params[0].p, (char *) pda,
452: sizeof(RF_PhysDiskAddr_t));
453: rf_RangeRestrictPDA(raidPtr, failedPDA, pda,
454: RF_RESTRICT_DOBUFFER, 0);
455: xorNode->params[paramNum++].p = pda;
456: xorNode->params[paramNum++].p = pda->bufPtr;
457: }
458: }
459: RF_Free(overlappingPDAs, asmap->numStripeUnitsAccessed * sizeof(char));
460:
461: /* Install parity pda as last set of params to be xor'd. */
462: xorNode->params[paramNum++].p = parityPDA;
463: xorNode->params[paramNum++].p = rpBuf;
464:
465: /*
466: * The last 2 params to the recovery xor node are
467: * the failed PDA and the raidPtr.
468: */
469: xorNode->params[paramNum++].p = failedPDA;
470: xorNode->params[paramNum++].p = raidPtr;
471: RF_ASSERT(paramNum == 2 * nXorBufs + 2);
472:
473: /*
474: * The xor node uses results[0] as the target buffer.
475: * Set pointer and zero the buffer. In the kernel, this
476: * may be a user buffer in which case we have to remap it.
477: */
478: xorNode->results[0] = failedPDA->bufPtr;
479: RF_BZERO(bp, failedPDA->bufPtr, rf_RaidAddressToByte(raidPtr,
480: failedPDA->numSector));
481:
482: /* Connect nodes to form graph. */
483: /* Connect the header to the block node. */
484: RF_ASSERT(dag_h->numSuccedents == 1);
485: RF_ASSERT(blockNode->numAntecedents == 0);
486: dag_h->succedents[0] = blockNode;
487:
488: /* Connect the block node to the read nodes. */
489: RF_ASSERT(blockNode->numSuccedents == (1 + nRrdNodes + nRudNodes));
490: RF_ASSERT(rpNode->numAntecedents == 1);
491: blockNode->succedents[0] = rpNode;
492: rpNode->antecedents[0] = blockNode;
493: rpNode->antType[0] = rf_control;
494: for (i = 0; i < nRrdNodes; i++) {
495: RF_ASSERT(rrdNodes[i].numSuccedents == 1);
496: blockNode->succedents[1 + i] = &rrdNodes[i];
497: rrdNodes[i].antecedents[0] = blockNode;
498: rrdNodes[i].antType[0] = rf_control;
499: }
500: for (i = 0; i < nRudNodes; i++) {
501: RF_ASSERT(rudNodes[i].numSuccedents == 1);
502: blockNode->succedents[1 + nRrdNodes + i] = &rudNodes[i];
503: rudNodes[i].antecedents[0] = blockNode;
504: rudNodes[i].antType[0] = rf_control;
505: }
506:
507: /* Connect the read nodes to the xor node. */
508: RF_ASSERT(xorNode->numAntecedents == (1 + nRrdNodes + nRudNodes));
509: RF_ASSERT(rpNode->numSuccedents == 1);
510: rpNode->succedents[0] = xorNode;
511: xorNode->antecedents[0] = rpNode;
512: xorNode->antType[0] = rf_trueData;
513: for (i = 0; i < nRrdNodes; i++) {
514: RF_ASSERT(rrdNodes[i].numSuccedents == 1);
515: rrdNodes[i].succedents[0] = xorNode;
516: xorNode->antecedents[1 + i] = &rrdNodes[i];
517: xorNode->antType[1 + i] = rf_trueData;
518: }
519: for (i = 0; i < nRudNodes; i++) {
520: RF_ASSERT(rudNodes[i].numSuccedents == 1);
521: rudNodes[i].succedents[0] = xorNode;
522: xorNode->antecedents[1 + nRrdNodes + i] = &rudNodes[i];
523: xorNode->antType[1 + nRrdNodes + i] = rf_trueData;
524: }
525:
526: /* Connect the xor node to the commit node. */
527: RF_ASSERT(xorNode->numSuccedents == 1);
528: RF_ASSERT(commitNode->numAntecedents == 1);
529: xorNode->succedents[0] = commitNode;
530: commitNode->antecedents[0] = xorNode;
531: commitNode->antType[0] = rf_control;
532:
533: /* Connect the termNode to the commit node. */
534: RF_ASSERT(commitNode->numSuccedents == 1);
535: RF_ASSERT(termNode->numAntecedents == 1);
536: RF_ASSERT(termNode->numSuccedents == 0);
537: commitNode->succedents[0] = termNode;
538: termNode->antType[0] = rf_control;
539: termNode->antecedents[0] = commitNode;
540: }
541:
542:
543: /*****************************************************************************
544: * Create a degraded read DAG for Chained Declustering.
545: *
546: * Hdr -> Nil -> R(p/s)d -> Cmt -> Trm
547: *
548: * The "Rd" node reads data from the surviving disk in the mirror pair
549: * Rpd - read of primary copy
550: * Rsd - read of secondary copy
551: *
552: * Parameters: raidPtr - description of the physical array
553: * asmap - logical & physical addresses for this access
554: * bp - buffer ptr (for holding write data)
555: * flags - general flags (e.g. disk locking)
556: * allocList - list of memory allocated in DAG creation
557: *****************************************************************************/
558:
559: void
560: rf_CreateRaidCDegradedReadDAG(
561: RF_Raid_t *raidPtr,
562: RF_AccessStripeMap_t *asmap,
563: RF_DagHeader_t *dag_h,
564: void *bp,
565: RF_RaidAccessFlags_t flags,
566: RF_AllocListElem_t *allocList
567: )
568: {
569: RF_DagNode_t *nodes, *rdNode, *blockNode, *commitNode, *termNode;
570: RF_StripeNum_t parityStripeID;
571: int useMirror, i, shiftable;
572: RF_ReconUnitNum_t which_ru;
573: RF_PhysDiskAddr_t *pda;
574:
575: if ((asmap->numDataFailed + asmap->numParityFailed) == 0) {
576: shiftable = RF_TRUE;
577: } else {
578: shiftable = RF_FALSE;
579: }
580: useMirror = 0;
581: parityStripeID = rf_RaidAddressToParityStripeID(&(raidPtr->Layout),
582: asmap->raidAddress, &which_ru);
583:
584: if (rf_dagDebug) {
585: printf("[Creating RAID C degraded read DAG]\n");
586: }
587: dag_h->creator = "RaidCDegradedReadDAG";
588: /* Alloc the Wnd nodes and the Wmir node. */
589: if (asmap->numDataFailed == 0)
590: useMirror = RF_FALSE;
591: else
592: useMirror = RF_TRUE;
593:
594: /* total number of nodes = 1 + (block + commit + terminator) */
595: RF_CallocAndAdd(nodes, 4, sizeof(RF_DagNode_t), (RF_DagNode_t *),
596: allocList);
597: i = 0;
598: rdNode = &nodes[i];
599: i++;
600: blockNode = &nodes[i];
601: i++;
602: commitNode = &nodes[i];
603: i++;
604: termNode = &nodes[i];
605: i++;
606:
607: /*
608: * This dag can not commit until the commit node is reached.
609: * Errors prior to the commit point imply the dag has failed
610: * and must be retried.
611: */
612: dag_h->numCommitNodes = 1;
613: dag_h->numCommits = 0;
614: dag_h->numSuccedents = 1;
615:
616: /* initialize the block, commit, and terminator nodes */
617: rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
618: rf_NullNodeUndoFunc, NULL, 1, 0, 0, 0, dag_h, "Nil", allocList);
619: rf_InitNode(commitNode, rf_wait, RF_TRUE, rf_NullNodeFunc,
620: rf_NullNodeUndoFunc, NULL, 1, 1, 0, 0, dag_h, "Cmt", allocList);
621: rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc,
622: rf_TerminateUndoFunc, NULL, 0, 1, 0, 0, dag_h, "Trm", allocList);
623:
624: pda = asmap->physInfo;
625: RF_ASSERT(pda != NULL);
626: /* ParityInfo must describe entire parity unit. */
627: RF_ASSERT(asmap->parityInfo->next == NULL);
628:
629: /* Initialize the data node. */
630: if (!useMirror) {
631: rf_InitNode(rdNode, rf_wait, RF_FALSE, rf_DiskReadFunc,
632: rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0,
633: dag_h, "Rpd", allocList);
634: if (shiftable && rf_compute_workload_shift(raidPtr, pda)) {
635: /* Shift this read to the next disk in line. */
636: rdNode->params[0].p = asmap->parityInfo;
637: rdNode->params[1].p = pda->bufPtr;
638: rdNode->params[2].v = parityStripeID;
639: rdNode->params[3].v = RF_CREATE_PARAM3(
640: RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
641: } else {
642: /* Read primary copy. */
643: rdNode->params[0].p = pda;
644: rdNode->params[1].p = pda->bufPtr;
645: rdNode->params[2].v = parityStripeID;
646: rdNode->params[3].v = RF_CREATE_PARAM3(
647: RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
648: }
649: } else {
650: /* Read secondary copy of data. */
651: rf_InitNode(rdNode, rf_wait, RF_FALSE, rf_DiskReadFunc,
652: rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 1, 1, 4, 0,
653: dag_h, "Rsd", allocList);
654: rdNode->params[0].p = asmap->parityInfo;
655: rdNode->params[1].p = pda->bufPtr;
656: rdNode->params[2].v = parityStripeID;
657: rdNode->params[3].v =
658: RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);
659: }
660:
661: /* Connect header to block node. */
662: RF_ASSERT(dag_h->numSuccedents == 1);
663: RF_ASSERT(blockNode->numAntecedents == 0);
664: dag_h->succedents[0] = blockNode;
665:
666: /* Connect block node to rdnode. */
667: RF_ASSERT(blockNode->numSuccedents == 1);
668: RF_ASSERT(rdNode->numAntecedents == 1);
669: blockNode->succedents[0] = rdNode;
670: rdNode->antecedents[0] = blockNode;
671: rdNode->antType[0] = rf_control;
672:
673: /* Connect rdnode to commit node. */
674: RF_ASSERT(rdNode->numSuccedents == 1);
675: RF_ASSERT(commitNode->numAntecedents == 1);
676: rdNode->succedents[0] = commitNode;
677: commitNode->antecedents[0] = rdNode;
678: commitNode->antType[0] = rf_control;
679:
680: /* Connect commit node to terminator. */
681: RF_ASSERT(commitNode->numSuccedents == 1);
682: RF_ASSERT(termNode->numAntecedents == 1);
683: RF_ASSERT(termNode->numSuccedents == 0);
684: commitNode->succedents[0] = termNode;
685: termNode->antecedents[0] = commitNode;
686: termNode->antType[0] = rf_control;
687: }
688:
689: /*
690: * XXX move this elsewhere ?
691: */
692: void
693: rf_DD_GenerateFailedAccessASMs(
694: RF_Raid_t *raidPtr,
695: RF_AccessStripeMap_t *asmap,
696: RF_PhysDiskAddr_t **pdap,
697: int *nNodep,
698: RF_PhysDiskAddr_t **pqpdap,
699: int *nPQNodep,
700: RF_AllocListElem_t *allocList
701: )
702: {
703: RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
704: int PDAPerDisk, i;
705: RF_SectorCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
706: int numDataCol = layoutPtr->numDataCol;
707: int state;
708: RF_SectorNum_t suoff, suend;
709: unsigned firstDataCol, napdas, count;
710: RF_SectorNum_t fone_start, fone_end, ftwo_start = 0, ftwo_end = 0;
711: RF_PhysDiskAddr_t *fone = asmap->failedPDAs[0];
712: RF_PhysDiskAddr_t *ftwo = asmap->failedPDAs[1];
713: RF_PhysDiskAddr_t *pda_p;
714: RF_PhysDiskAddr_t *phys_p;
715: RF_RaidAddr_t sosAddr;
716:
717: /*
718: * Determine how many pda's we will have to generate per unaccessed
719: * stripe. If there is only one failed data unit, it is one; if two,
720: * possibly two, depending whether they overlap.
721: */
722:
723: fone_start = rf_StripeUnitOffset(layoutPtr, fone->startSector);
724: fone_end = fone_start + fone->numSector;
725:
726: #define CONS_PDA(if,start,num) do { \
727: pda_p->row = asmap->if->row; \
728: pda_p->col = asmap->if->col; \
729: pda_p->startSector = ((asmap->if->startSector / secPerSU) * \
730: secPerSU) + start; \
731: pda_p->numSector = num; \
732: pda_p->next = NULL; \
733: RF_MallocAndAdd(pda_p->bufPtr, \
734: rf_RaidAddressToByte(raidPtr,num),(char *), allocList); \
735: } while (0)
736:
737: if (asmap->numDataFailed == 1) {
738: PDAPerDisk = 1;
739: state = 1;
740: RF_MallocAndAdd(*pqpdap, 2 * sizeof(RF_PhysDiskAddr_t),
741: (RF_PhysDiskAddr_t *), allocList);
742: pda_p = *pqpdap;
743: /* Build p. */
744: CONS_PDA(parityInfo, fone_start, fone->numSector);
745: pda_p->type = RF_PDA_TYPE_PARITY;
746: pda_p++;
747: /* Build q. */
748: CONS_PDA(qInfo, fone_start, fone->numSector);
749: pda_p->type = RF_PDA_TYPE_Q;
750: } else {
751: ftwo_start = rf_StripeUnitOffset(layoutPtr, ftwo->startSector);
752: ftwo_end = ftwo_start + ftwo->numSector;
753: if (fone->numSector + ftwo->numSector > secPerSU) {
754: PDAPerDisk = 1;
755: state = 2;
756: RF_MallocAndAdd(*pqpdap, 2 * sizeof(RF_PhysDiskAddr_t),
757: (RF_PhysDiskAddr_t *), allocList);
758: pda_p = *pqpdap;
759: CONS_PDA(parityInfo, 0, secPerSU);
760: pda_p->type = RF_PDA_TYPE_PARITY;
761: pda_p++;
762: CONS_PDA(qInfo, 0, secPerSU);
763: pda_p->type = RF_PDA_TYPE_Q;
764: } else {
765: PDAPerDisk = 2;
766: state = 3;
767: /* Four of them, fone, then ftwo. */
768: RF_MallocAndAdd(*pqpdap, 4 * sizeof(RF_PhysDiskAddr_t),
769: (RF_PhysDiskAddr_t *), allocList);
770: pda_p = *pqpdap;
771: CONS_PDA(parityInfo, fone_start, fone->numSector);
772: pda_p->type = RF_PDA_TYPE_PARITY;
773: pda_p++;
774: CONS_PDA(qInfo, fone_start, fone->numSector);
775: pda_p->type = RF_PDA_TYPE_Q;
776: pda_p++;
777: CONS_PDA(parityInfo, ftwo_start, ftwo->numSector);
778: pda_p->type = RF_PDA_TYPE_PARITY;
779: pda_p++;
780: CONS_PDA(qInfo, ftwo_start, ftwo->numSector);
781: pda_p->type = RF_PDA_TYPE_Q;
782: }
783: }
784: /* Figure out number of nonaccessed pda. */
785: napdas = PDAPerDisk * (numDataCol - asmap->numStripeUnitsAccessed -
786: (ftwo == NULL ? 1 : 0));
787: *nPQNodep = PDAPerDisk;
788:
789: /*
790: * Sweep over the over accessed pda's, figuring out the number of
791: * additional pda's to generate. Of course, skip the failed ones.
792: */
793:
794: count = 0;
795: for (pda_p = asmap->physInfo; pda_p; pda_p = pda_p->next) {
796: if ((pda_p == fone) || (pda_p == ftwo))
797: continue;
798: suoff = rf_StripeUnitOffset(layoutPtr, pda_p->startSector);
799: suend = suoff + pda_p->numSector;
800: switch (state) {
801: case 1: /* One failed PDA to overlap. */
802: /*
803: * If a PDA doesn't contain the failed unit, it can
804: * only miss the start or end, not both.
805: */
806: if ((suoff > fone_start) || (suend < fone_end))
807: count++;
808: break;
809: case 2: /* Whole stripe. */
810: if (suoff) /* Leak at begining. */
811: count++;
812: if (suend < numDataCol) /* Leak at end. */
813: count++;
814: break;
815: case 3: /* Two disjoint units. */
816: if ((suoff > fone_start) || (suend < fone_end))
817: count++;
818: if ((suoff > ftwo_start) || (suend < ftwo_end))
819: count++;
820: break;
821: default:
822: RF_PANIC();
823: }
824: }
825:
826: napdas += count;
827: *nNodep = napdas;
828: if (napdas == 0)
829: return; /* short circuit */
830:
831: /* Allocate up our list of pda's. */
832:
833: RF_CallocAndAdd(pda_p, napdas, sizeof(RF_PhysDiskAddr_t),
834: (RF_PhysDiskAddr_t *), allocList);
835: *pdap = pda_p;
836:
837: /* Link them together. */
838: for (i = 0; i < (napdas - 1); i++)
839: pda_p[i].next = pda_p + (i + 1);
840:
841: /* March through the one's up to the first accessed disk. */
842: firstDataCol = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
843: asmap->physInfo->raidAddress) % numDataCol;
844: sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr,
845: asmap->raidAddress);
846: for (i = 0; i < firstDataCol; i++) {
847: if ((pda_p - (*pdap)) == napdas)
848: continue;
849: pda_p->type = RF_PDA_TYPE_DATA;
850: pda_p->raidAddress = sosAddr + (i * secPerSU);
851: (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress,
852: &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
853: /* Skip over dead disks. */
854: if (RF_DEAD_DISK(raidPtr->Disks[pda_p->row][pda_p->col].status))
855: continue;
856: switch (state) {
857: case 1: /* Fone. */
858: pda_p->numSector = fone->numSector;
859: pda_p->raidAddress += fone_start;
860: pda_p->startSector += fone_start;
861: RF_MallocAndAdd(pda_p->bufPtr,
862: rf_RaidAddressToByte(raidPtr, pda_p->numSector),
863: (char *), allocList);
864: break;
865: case 2: /* Full stripe. */
866: pda_p->numSector = secPerSU;
867: RF_MallocAndAdd(pda_p->bufPtr,
868: rf_RaidAddressToByte(raidPtr, secPerSU),
869: (char *), allocList);
870: break;
871: case 3: /* Two slabs. */
872: pda_p->numSector = fone->numSector;
873: pda_p->raidAddress += fone_start;
874: pda_p->startSector += fone_start;
875: RF_MallocAndAdd(pda_p->bufPtr,
876: rf_RaidAddressToByte(raidPtr, pda_p->numSector),
877: (char *), allocList);
878: pda_p++;
879: pda_p->type = RF_PDA_TYPE_DATA;
880: pda_p->raidAddress = sosAddr + (i * secPerSU);
881: (raidPtr->Layout.map->MapSector) (raidPtr,
882: pda_p->raidAddress, &(pda_p->row), &(pda_p->col),
883: &(pda_p->startSector), 0);
884: pda_p->numSector = ftwo->numSector;
885: pda_p->raidAddress += ftwo_start;
886: pda_p->startSector += ftwo_start;
887: RF_MallocAndAdd(pda_p->bufPtr,
888: rf_RaidAddressToByte(raidPtr, pda_p->numSector),
889: (char *), allocList);
890: break;
891: default:
892: RF_PANIC();
893: }
894: pda_p++;
895: }
896:
897: /* March through the touched stripe units. */
898: for (phys_p = asmap->physInfo; phys_p; phys_p = phys_p->next, i++) {
899: if ((phys_p == asmap->failedPDAs[0]) ||
900: (phys_p == asmap->failedPDAs[1]))
901: continue;
902: suoff = rf_StripeUnitOffset(layoutPtr, phys_p->startSector);
903: suend = suoff + phys_p->numSector;
904: switch (state) {
905: case 1: /* Single buffer. */
906: if (suoff > fone_start) {
907: RF_ASSERT(suend >= fone_end);
908: /*
909: * The data read starts after the mapped
910: * access, snip off the begining.
911: */
912: pda_p->numSector = suoff - fone_start;
913: pda_p->raidAddress = sosAddr + (i * secPerSU)
914: + fone_start;
915: (raidPtr->Layout.map->MapSector) (raidPtr,
916: pda_p->raidAddress, &(pda_p->row),
917: &(pda_p->col), &(pda_p->startSector), 0);
918: RF_MallocAndAdd(pda_p->bufPtr,
919: rf_RaidAddressToByte(raidPtr,
920: pda_p->numSector), (char *), allocList);
921: pda_p++;
922: }
923: if (suend < fone_end) {
924: RF_ASSERT(suoff <= fone_start);
925: /*
926: * The data read stops before the end of the
927: * failed access, extend.
928: */
929: pda_p->numSector = fone_end - suend;
930: pda_p->raidAddress = sosAddr + (i * secPerSU)
931: + suend; /* off by one? */
932: (raidPtr->Layout.map->MapSector) (raidPtr,
933: pda_p->raidAddress, &(pda_p->row),
934: &(pda_p->col), &(pda_p->startSector), 0);
935: RF_MallocAndAdd(pda_p->bufPtr,
936: rf_RaidAddressToByte(raidPtr,
937: pda_p->numSector), (char *), allocList);
938: pda_p++;
939: }
940: break;
941: case 2: /* Whole stripe unit. */
942: RF_ASSERT((suoff == 0) || (suend == secPerSU));
943: if (suend < secPerSU) {
944: /* Short read, snip from end on. */
945: pda_p->numSector = secPerSU - suend;
946: pda_p->raidAddress = sosAddr + (i * secPerSU)
947: + suend; /* off by one? */
948: (raidPtr->Layout.map->MapSector) (raidPtr,
949: pda_p->raidAddress, &(pda_p->row),
950: &(pda_p->col), &(pda_p->startSector), 0);
951: RF_MallocAndAdd(pda_p->bufPtr,
952: rf_RaidAddressToByte(raidPtr,
953: pda_p->numSector), (char *), allocList);
954: pda_p++;
955: } else
956: if (suoff > 0) {
957: /* Short at front. */
958: pda_p->numSector = suoff;
959: pda_p->raidAddress = sosAddr +
960: (i * secPerSU);
961: (raidPtr->Layout.map->MapSector)
962: (raidPtr, pda_p->raidAddress,
963: &(pda_p->row), &(pda_p->col),
964: &(pda_p->startSector), 0);
965: RF_MallocAndAdd(pda_p->bufPtr,
966: rf_RaidAddressToByte(raidPtr,
967: pda_p->numSector), (char *),
968: allocList);
969: pda_p++;
970: }
971: break;
972: case 3: /* Two nonoverlapping failures. */
973: if ((suoff > fone_start) || (suend < fone_end)) {
974: if (suoff > fone_start) {
975: RF_ASSERT(suend >= fone_end);
976: /*
977: * The data read starts after the
978: * mapped access, snip off the
979: * begining.
980: */
981: pda_p->numSector = suoff - fone_start;
982: pda_p->raidAddress = sosAddr +
983: (i * secPerSU) + fone_start;
984: (raidPtr->Layout.map->MapSector)
985: (raidPtr, pda_p->raidAddress,
986: &(pda_p->row), &(pda_p->col),
987: &(pda_p->startSector), 0);
988: RF_MallocAndAdd(pda_p->bufPtr,
989: rf_RaidAddressToByte(raidPtr,
990: pda_p->numSector), (char *),
991: allocList);
992: pda_p++;
993: }
994: if (suend < fone_end) {
995: RF_ASSERT(suoff <= fone_start);
996: /*
997: * The data read stops before the end
998: * of the failed access, extend.
999: */
1000: pda_p->numSector = fone_end - suend;
1001: pda_p->raidAddress = sosAddr +
1002: (i * secPerSU) +
1003: suend; /* Off by one ? */
1004: (raidPtr->Layout.map->MapSector)
1005: (raidPtr, pda_p->raidAddress,
1006: &(pda_p->row), &(pda_p->col),
1007: &(pda_p->startSector), 0);
1008: RF_MallocAndAdd(pda_p->bufPtr,
1009: rf_RaidAddressToByte(raidPtr,
1010: pda_p->numSector), (char *),
1011: allocList);
1012: pda_p++;
1013: }
1014: }
1015: if ((suoff > ftwo_start) || (suend < ftwo_end)) {
1016: if (suoff > ftwo_start) {
1017: RF_ASSERT(suend >= ftwo_end);
1018: /*
1019: * The data read starts after the
1020: * mapped access, snip off the
1021: * begining.
1022: */
1023: pda_p->numSector = suoff - ftwo_start;
1024: pda_p->raidAddress = sosAddr +
1025: (i * secPerSU) + ftwo_start;
1026: (raidPtr->Layout.map->MapSector)
1027: (raidPtr, pda_p->raidAddress,
1028: &(pda_p->row), &(pda_p->col),
1029: &(pda_p->startSector), 0);
1030: RF_MallocAndAdd(pda_p->bufPtr,
1031: rf_RaidAddressToByte(raidPtr,
1032: pda_p->numSector), (char *),
1033: allocList);
1034: pda_p++;
1035: }
1036: if (suend < ftwo_end) {
1037: RF_ASSERT(suoff <= ftwo_start);
1038: /*
1039: * The data read stops before the end
1040: * of the failed access, extend.
1041: */
1042: pda_p->numSector = ftwo_end - suend;
1043: pda_p->raidAddress = sosAddr +
1044: (i * secPerSU) +
1045: suend; /* Off by one ? */
1046: (raidPtr->Layout.map->MapSector)
1047: (raidPtr, pda_p->raidAddress,
1048: &(pda_p->row), &(pda_p->col),
1049: &(pda_p->startSector), 0);
1050: RF_MallocAndAdd(pda_p->bufPtr,
1051: rf_RaidAddressToByte(raidPtr,
1052: pda_p->numSector), (char *),
1053: allocList);
1054: pda_p++;
1055: }
1056: }
1057: break;
1058: default:
1059: RF_PANIC();
1060: }
1061: }
1062:
1063: /* After the last accessed disk. */
1064: for (; i < numDataCol; i++) {
1065: if ((pda_p - (*pdap)) == napdas)
1066: continue;
1067: pda_p->type = RF_PDA_TYPE_DATA;
1068: pda_p->raidAddress = sosAddr + (i * secPerSU);
1069: (raidPtr->Layout.map->MapSector) (raidPtr, pda_p->raidAddress,
1070: &(pda_p->row), &(pda_p->col), &(pda_p->startSector), 0);
1071: /* Skip over dead disks. */
1072: if (RF_DEAD_DISK(raidPtr->Disks[pda_p->row][pda_p->col].status))
1073: continue;
1074: switch (state) {
1075: case 1: /* Fone. */
1076: pda_p->numSector = fone->numSector;
1077: pda_p->raidAddress += fone_start;
1078: pda_p->startSector += fone_start;
1079: RF_MallocAndAdd(pda_p->bufPtr,
1080: rf_RaidAddressToByte(raidPtr, pda_p->numSector),
1081: (char *), allocList);
1082: break;
1083: case 2: /* Full stripe. */
1084: pda_p->numSector = secPerSU;
1085: RF_MallocAndAdd(pda_p->bufPtr,
1086: rf_RaidAddressToByte(raidPtr, secPerSU),
1087: (char *), allocList);
1088: break;
1089: case 3: /* Two slabs. */
1090: pda_p->numSector = fone->numSector;
1091: pda_p->raidAddress += fone_start;
1092: pda_p->startSector += fone_start;
1093: RF_MallocAndAdd(pda_p->bufPtr,
1094: rf_RaidAddressToByte(raidPtr, pda_p->numSector),
1095: (char *), allocList);
1096: pda_p++;
1097: pda_p->type = RF_PDA_TYPE_DATA;
1098: pda_p->raidAddress = sosAddr + (i * secPerSU);
1099: (raidPtr->Layout.map->MapSector) (raidPtr,
1100: pda_p->raidAddress, &(pda_p->row), &(pda_p->col),
1101: &(pda_p->startSector), 0);
1102: pda_p->numSector = ftwo->numSector;
1103: pda_p->raidAddress += ftwo_start;
1104: pda_p->startSector += ftwo_start;
1105: RF_MallocAndAdd(pda_p->bufPtr,
1106: rf_RaidAddressToByte(raidPtr, pda_p->numSector),
1107: (char *), allocList);
1108: break;
1109: default:
1110: RF_PANIC();
1111: }
1112: pda_p++;
1113: }
1114:
1115: RF_ASSERT(pda_p - *pdap == napdas);
1116: return;
1117: }
1118:
/*
 * INIT_DISK_NODE(dnode, label)
 *
 * Initialise `dnode' as a disk-read node called `label' and hook it into
 * the DAG under construction: its single antecedent is the block node
 * (control dependency) and its two successors are the unblock node and
 * the recovery node.  The counts handed to rf_InitNode() are, in order,
 * 2 successors, 1 antecedent, 4 params and 0 results.
 *
 * The expansion captures blockNode, unblockNode, recoveryNode, dag_h and
 * allocList from the invoking scope, and evaluates `dnode' several times,
 * so the argument must be free of side effects.
 */
#define INIT_DISK_NODE(dnode, label)					\
do {									\
	rf_InitNode(dnode, rf_wait, RF_FALSE, rf_DiskReadFunc,		\
	    rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2, 1, 4, 0,	\
	    dag_h, label, allocList);					\
	(dnode)->antecedents[0] = blockNode;				\
	(dnode)->antType[0] = rf_control;				\
	(dnode)->succedents[0] = unblockNode;				\
	(dnode)->succedents[1] = recoveryNode;				\
} while (0)
1128:
/*
 * DISK_NODE_PARAMS(dnode, addr)
 *
 * Fill in the four parameters of the disk-read node `dnode' (a node
 * object, not a pointer) from the physical disk address `addr':
 *   params[0].p  the pda itself
 *   params[1].p  the pda's data buffer (bufPtr)
 *   params[2].v  parityStripeID
 *   params[3].v  RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru)
 *
 * parityStripeID and which_ru are taken from the invoking scope.  Both
 * arguments are evaluated more than once.
 */
#define DISK_NODE_PARAMS(dnode, addr)					\
do {									\
	(dnode).params[0].p = (addr);					\
	(dnode).params[1].p = (addr)->bufPtr;				\
	(dnode).params[2].v = parityStripeID;				\
	(dnode).params[3].v =						\
	    RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, 0, 0, which_ru);	\
} while (0)
1136:
1137: void
1138: rf_DoubleDegRead(
1139: RF_Raid_t *raidPtr,
1140: RF_AccessStripeMap_t *asmap,
1141: RF_DagHeader_t *dag_h,
1142: void *bp,
1143: RF_RaidAccessFlags_t flags,
1144: RF_AllocListElem_t *allocList,
1145: char *redundantReadNodeName,
1146: char *recoveryNodeName,
1147: int (*recovFunc) (RF_DagNode_t *)
1148: )
1149: {
1150: RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
1151: RF_DagNode_t *nodes, *rudNodes, *rrdNodes, *recoveryNode, *blockNode,
1152: *unblockNode, *rpNodes, *rqNodes, *termNode;
1153: RF_PhysDiskAddr_t *pda, *pqPDAs;
1154: RF_PhysDiskAddr_t *npdas;
1155: int nNodes, nRrdNodes, nRudNodes, i;
1156: RF_ReconUnitNum_t which_ru;
1157: int nReadNodes, nPQNodes;
1158: RF_PhysDiskAddr_t *failedPDA = asmap->failedPDAs[0];
1159: RF_PhysDiskAddr_t *failedPDAtwo = asmap->failedPDAs[1];
1160: RF_StripeNum_t parityStripeID = rf_RaidAddressToParityStripeID(
1161: layoutPtr, asmap->raidAddress, &which_ru);
1162:
1163: if (rf_dagDebug)
1164: printf("[Creating Double Degraded Read DAG]\n");
1165: rf_DD_GenerateFailedAccessASMs(raidPtr, asmap, &npdas, &nRrdNodes,
1166: &pqPDAs, &nPQNodes, allocList);
1167:
1168: nRudNodes = asmap->numStripeUnitsAccessed - (asmap->numDataFailed);
1169: nReadNodes = nRrdNodes + nRudNodes + 2 * nPQNodes;
1170: nNodes = 4 /* Block, unblock, recovery, term. */ + nReadNodes;
1171:
1172: RF_CallocAndAdd(nodes, nNodes, sizeof(RF_DagNode_t), (RF_DagNode_t *),
1173: allocList);
1174: i = 0;
1175: blockNode = &nodes[i];
1176: i += 1;
1177: unblockNode = &nodes[i];
1178: i += 1;
1179: recoveryNode = &nodes[i];
1180: i += 1;
1181: termNode = &nodes[i];
1182: i += 1;
1183: rudNodes = &nodes[i];
1184: i += nRudNodes;
1185: rrdNodes = &nodes[i];
1186: i += nRrdNodes;
1187: rpNodes = &nodes[i];
1188: i += nPQNodes;
1189: rqNodes = &nodes[i];
1190: i += nPQNodes;
1191: RF_ASSERT(i == nNodes);
1192:
1193: dag_h->numSuccedents = 1;
1194: dag_h->succedents[0] = blockNode;
1195: dag_h->creator = "DoubleDegRead";
1196: dag_h->numCommits = 0;
1197: dag_h->numCommitNodes = 1; /* Unblock. */
1198:
1199: rf_InitNode(termNode, rf_wait, RF_FALSE, rf_TerminateFunc,
1200: rf_TerminateUndoFunc, NULL, 0, 2, 0, 0, dag_h, "Trm", allocList);
1201: termNode->antecedents[0] = unblockNode;
1202: termNode->antType[0] = rf_control;
1203: termNode->antecedents[1] = recoveryNode;
1204: termNode->antType[1] = rf_control;
1205:
1206: /*
1207: * Init the block and unblock nodes.
1208: * The block node has all nodes except itself, unblock and
1209: * recovery as successors.
1210: * Similarly for predecessors of the unblock.
1211: */
1212: rf_InitNode(blockNode, rf_wait, RF_FALSE, rf_NullNodeFunc,
1213: rf_NullNodeUndoFunc, NULL, nReadNodes, 0, 0, 0, dag_h,
1214: "Nil", allocList);
1215: rf_InitNode(unblockNode, rf_wait, RF_TRUE, rf_NullNodeFunc,
1216: rf_NullNodeUndoFunc, NULL, 1, nReadNodes, 0, 0, dag_h,
1217: "Nil", allocList);
1218:
1219: for (i = 0; i < nReadNodes; i++) {
1220: blockNode->succedents[i] = rudNodes + i;
1221: unblockNode->antecedents[i] = rudNodes + i;
1222: unblockNode->antType[i] = rf_control;
1223: }
1224: unblockNode->succedents[0] = termNode;
1225:
1226: /*
1227: * The recovery node has all the reads as predecessors, and the term
1228: * node as successors. It gets a pda as a param from each of the read
1229: * nodes plus the raidPtr. For each failed unit is has a result pda.
1230: */
1231: rf_InitNode(recoveryNode, rf_wait, RF_FALSE, recovFunc,
1232: rf_NullNodeUndoFunc, NULL,
1233: 1, /* succesors */
1234: nReadNodes, /* preds */
1235: nReadNodes + 2, /* params */
1236: asmap->numDataFailed, /* results */
1237: dag_h, recoveryNodeName, allocList);
1238:
1239: recoveryNode->succedents[0] = termNode;
1240: for (i = 0; i < nReadNodes; i++) {
1241: recoveryNode->antecedents[i] = rudNodes + i;
1242: recoveryNode->antType[i] = rf_trueData;
1243: }
1244:
1245: /*
1246: * Build the read nodes, then come back and fill in recovery params
1247: * and results.
1248: */
1249: pda = asmap->physInfo;
1250: for (i = 0; i < nRudNodes; pda = pda->next) {
1251: if ((pda == failedPDA) || (pda == failedPDAtwo))
1252: continue;
1253: INIT_DISK_NODE(rudNodes + i, "Rud");
1254: RF_ASSERT(pda);
1255: DISK_NODE_PARAMS(rudNodes[i], pda);
1256: i++;
1257: }
1258:
1259: pda = npdas;
1260: for (i = 0; i < nRrdNodes; i++, pda = pda->next) {
1261: INIT_DISK_NODE(rrdNodes + i, "Rrd");
1262: RF_ASSERT(pda);
1263: DISK_NODE_PARAMS(rrdNodes[i], pda);
1264: }
1265:
1266: /* Redundancy pdas. */
1267: pda = pqPDAs;
1268: INIT_DISK_NODE(rpNodes, "Rp");
1269: RF_ASSERT(pda);
1270: DISK_NODE_PARAMS(rpNodes[0], pda);
1271: pda++;
1272: INIT_DISK_NODE(rqNodes, redundantReadNodeName);
1273: RF_ASSERT(pda);
1274: DISK_NODE_PARAMS(rqNodes[0], pda);
1275: if (nPQNodes == 2) {
1276: pda++;
1277: INIT_DISK_NODE(rpNodes + 1, "Rp");
1278: RF_ASSERT(pda);
1279: DISK_NODE_PARAMS(rpNodes[1], pda);
1280: pda++;
1281: INIT_DISK_NODE(rqNodes + 1, redundantReadNodeName);
1282: RF_ASSERT(pda);
1283: DISK_NODE_PARAMS(rqNodes[1], pda);
1284: }
1285: /* Fill in recovery node params. */
1286: for (i = 0; i < nReadNodes; i++)
1287: recoveryNode->params[i] = rudNodes[i].params[0]; /* pda */
1288: recoveryNode->params[i++].p = (void *) raidPtr;
1289: recoveryNode->params[i++].p = (void *) asmap;
1290: recoveryNode->results[0] = failedPDA;
1291: if (asmap->numDataFailed == 2)
1292: recoveryNode->results[1] = failedPDAtwo;
1293:
1294: /* Zero fill the target data buffers ? */
1295: }
CVSweb