/*	$OpenBSD: rf_pqdegdags.c,v 1.5 2002/12/16 07:01:04 tdeval Exp $	*/
/*	$NetBSD: rf_pqdegdags.c,v 1.5 1999/08/15 02:36:40 oster Exp $	*/

/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Daniel Stodolsky
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * rf_pqdegdags.c
 * Degraded mode dags for double fault cases.
 */


#include "rf_archs.h"

#if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)

#include "rf_types.h"
#include "rf_raid.h"
#include "rf_dag.h"
#include "rf_dagdegrd.h"
#include "rf_dagdegwr.h"
#include "rf_dagfuncs.h"
#include "rf_dagutils.h"
#include "rf_etimer.h"
#include "rf_acctrace.h"
#include "rf_general.h"
#include "rf_pqdegdags.h"
#include "rf_pq.h"

void rf_applyPDA(RF_Raid_t *, RF_PhysDiskAddr_t *, RF_PhysDiskAddr_t *,
    RF_PhysDiskAddr_t *, void *);

/*
 * Two data drives have failed, and we are doing a read that covers one
 * of them. We may also be reading some of the surviving drives.
 */


/*****************************************************************************
 *
 * Creates a DAG to perform a degraded-mode read of data within one stripe.
 * This DAG is as follows:
 *
 *                                  Hdr
 *                                   |
 *                                 Block
 *               /  /              /   \              \  \
 *             Rud ... Rud       Rrd ... Rrd          Rp  Rq
 *             | \     | \       | \     | \          | \ | \
 *
 *             |                                          |
 *          Unblock                                       X
 *                \                                      /
 *                 ------------------ T -----------------
 *
 * Each R node is a successor of the L node.
 * One successor arc from each R node goes to U, and the other to X.
 * There is one Rud for each chunk of surviving user data requested by the
 * user, and one Rrd for each chunk of surviving user data _not_ being read
 * by the user.
 * R = read, ud = user data, rd = recovery (surviving) data, p = P data,
 * q = Q data, X = pq recovery node, T = terminate
 *
 * The block & unblock nodes are leftovers from a previous version. They
 * do nothing, but I haven't deleted them because it would be a tremendous
 * effort to put them back in.
 *
 * Note: The target buffer for the XOR node is set to the actual user buffer
 * where the failed data is supposed to end up. This buffer is zero'd by the
 * code here. Thus, if you create a degraded read dag, use it, and then
 * re-use it, you have to be sure to zero the target buffer prior to the
 * re-use.
 *
 * Every buffer read is passed to the pq recovery node, whose job it is to
 * sort out what's needed and what's not.
 *****************************************************************************/

/* Init a disk node with 2 successors and one predecessor. */
#define	INIT_DISK_NODE(node, name)					\
do {									\
	rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc,		\
	    rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2, 1, 4, 0,	\
	    dag_h, name, allocList);					\
	(node)->succedents[0] = unblockNode;				\
	(node)->succedents[1] = recoveryNode;				\
	(node)->antecedents[0] = blockNode;				\
	(node)->antType[0] = rf_control;				\
} while (0)

#define	DISK_NODE_PARAMS(_node_, _p_)					\
do {									\
	(_node_).params[0].p = _p_;					\
	(_node_).params[1].p = (_p_)->bufPtr;				\
	(_node_).params[2].v = parityStripeID;				\
	(_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY,	\
	    0, 0, which_ru);						\
} while (0)

#define	DISK_NODE_PDA(node)	((node)->params[0].p)
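
/*
 * Illustrative sketch (not part of the original driver, kept under
 * "#if 0"): how a DAG construction routine would typically use the two
 * macros above to wire one read node into the graph. The function and
 * parameter names here are hypothetical; a real DAG builder declares
 * these as locals while assembling the graph.
 */
#if 0
static void
sketch_setup_read_node(RF_DagNode_t *node, RF_DagNode_t *blockNode,
    RF_DagNode_t *unblockNode, RF_DagNode_t *recoveryNode,
    RF_DagHeader_t *dag_h, RF_AllocListElem_t *allocList,
    RF_PhysDiskAddr_t *pda, RF_StripeNum_t parityStripeID,
    RF_ReconUnitNum_t which_ru)
{
	/* Wire block -> node -> {unblock, recovery} and set the I/O type. */
	INIT_DISK_NODE(node, "Rud");
	/* Attach the disk address, buffer, stripe ID and priority. */
	DISK_NODE_PARAMS(*node, pda);
}
#endif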

RF_CREATE_DAG_FUNC_DECL(rf_PQ_DoubleDegRead)
{
	rf_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList,
	    "Rq", "PQ Recovery", rf_PQDoubleRecoveryFunc);
}

void
rf_applyPDA(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda,
    RF_PhysDiskAddr_t *ppda, RF_PhysDiskAddr_t *qpda, void *bp)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_RaidAddr_t s0off = rf_StripeUnitOffset(layoutPtr, ppda->startSector);
	RF_SectorCount_t s0len = ppda->numSector, len;
	RF_SectorNum_t suoffset;
	unsigned coeff;
	char *pbuf = ppda->bufPtr;
	char *qbuf = qpda->bufPtr;
	char *buf;
	int delta;

	suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
	len = pda->numSector;
	/* See if pda intersects a recovery pda. */
	if ((suoffset < s0off + s0len) && (suoffset + len > s0off)) {
		buf = pda->bufPtr;
		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
		    pda->raidAddress);
		coeff = (coeff % raidPtr->Layout.numDataCol);

		if (suoffset < s0off) {
			delta = s0off - suoffset;
			buf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
			    delta);
			suoffset = s0off;
			len -= delta;
		}
		if (suoffset > s0off) {
			delta = suoffset - s0off;
			pbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
			    delta);
			qbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
			    delta);
		}
		if ((suoffset + len) > (s0len + s0off))
			len = s0len + s0off - suoffset;

		/* Src, dest, len. */
		rf_bxor(buf, pbuf, rf_RaidAddressToByte(raidPtr, len), bp);

		/* Dest, src, len, coeff. */
		rf_IncQ((unsigned long *) qbuf, (unsigned long *) buf,
		    rf_RaidAddressToByte(raidPtr, len), coeff);
	}
}
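
/*
 * Illustrative sketch (not part of the original driver, kept under
 * "#if 0"): the byte-level arithmetic behind rf_applyPDA(). rf_bxor()
 * folds the data into the P buffer, and rf_IncQ() folds the data,
 * scaled by a per-column coefficient, into the Q buffer. The helper
 * names are hypothetical, and the sketch assumes the textbook GF(2^8)
 * formulation with generator polynomial x^8+x^4+x^3+x^2+1 (0x11d) and
 * g = 2; the driver's actual field representation lives in rf_pq.c and
 * may differ in detail.
 */
#if 0
static unsigned char
sketch_gf256_mul(unsigned char a, unsigned char b)
{
	unsigned char r = 0;
	int i;

	for (i = 0; i < 8; i++) {
		if (b & 1)
			r ^= a;
		b >>= 1;
		/* Multiply a by x, reducing modulo x^8+x^4+x^3+x^2+1. */
		a = (a << 1) ^ ((a & 0x80) ? 0x1d : 0);
	}
	return (r);
}

static void
sketch_apply(unsigned char *p, unsigned char *q, const unsigned char *d,
    unsigned coeff, int len)
{
	unsigned char gc = 1;
	unsigned i;
	int j;

	/* gc = g^coeff, the column's Q coefficient. */
	for (i = 0; i < coeff; i++)
		gc = sketch_gf256_mul(gc, 2);
	for (j = 0; j < len; j++) {
		p[j] ^= d[j];				/* rf_bxor() */
		q[j] ^= sketch_gf256_mul(gc, d[j]);	/* rf_IncQ() */
	}
}
#endif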


/*
 * Recover data in the case of a double failure. There can be two
 * result buffers, one for each chunk of data trying to be recovered.
 * The params are pda's that have not been range restricted or otherwise
 * politely massaged - this should be done here. The last params are the
 * pdas of P and Q, followed by the raidPtr. The list can look like
 *
 *	pda, pda, ..., p pda, q pda, raidptr, asm
 *
 * or
 *
 *	pda, pda, ..., p_1 pda, p_2 pda, q_1 pda, q_2 pda, raidptr, asm
 *
 * depending on whether two chunks of recovery data were required.
 *
 * The second condition only arises if there are two failed buffers
 * whose lengths do not add up to a stripe unit.
 */

int
rf_PQDoubleRecoveryFunc(RF_DagNode_t *node)
{
	int np = node->numParams;
	RF_AccessStripeMap_t *asmap =
	    (RF_AccessStripeMap_t *) node->params[np - 1].p;
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	int d, i;
	unsigned coeff;
	RF_RaidAddr_t sosAddr, suoffset;
	RF_SectorCount_t len, secPerSU = layoutPtr->sectorsPerStripeUnit;
	int two = 0;
	RF_PhysDiskAddr_t *ppda, *ppda2, *qpda, *qpda2, *pda, npda;
	char *buf;
	int numDataCol = layoutPtr->numDataCol;
	RF_Etimer_t timer;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;

	RF_ETIMER_START(timer);

	if (asmap->failedPDAs[1] &&
	    (asmap->failedPDAs[1]->numSector +
	     asmap->failedPDAs[0]->numSector < secPerSU)) {
		RF_ASSERT(0);
		ppda = node->params[np - 6].p;
		ppda2 = node->params[np - 5].p;
		qpda = node->params[np - 4].p;
		qpda2 = node->params[np - 3].p;
		d = (np - 6);
		two = 1;
	} else {
		ppda = node->params[np - 4].p;
		qpda = node->params[np - 3].p;
		d = (np - 4);
	}

	for (i = 0; i < d; i++) {
		pda = node->params[i].p;
		buf = pda->bufPtr;
		suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
		len = pda->numSector;
		coeff = rf_RaidAddressToStripeUnitID(layoutPtr,
		    pda->raidAddress);
		/* Compute the data unit offset within the column. */
		coeff = (coeff % raidPtr->Layout.numDataCol);
		/* Apply the pda to the first (and possibly only) P/Q pair. */
		rf_applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
		/* The second recovery pair, when present, gets it too. */
		if (two)
			rf_applyPDA(raidPtr, pda, ppda2, qpda2,
			    node->dagHdr->bp);
	}

	/*
	 * Ok, we got the parity back to the point where we can recover. We
	 * now need to determine the coeff of the columns that need to be
	 * recovered. Note that at most a single stripe unit needs to be
	 * recovered here.
	 */

	if (asmap->failedPDAs[1] == NULL) {
		/* Only a single stripe unit to recover. */
		pda = asmap->failedPDAs[0];
		sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr,
		    asmap->raidAddress);
		/* Need to determine the column of the other failed disk. */
		coeff = rf_RaidAddressToStripeUnitID(layoutPtr,
		    pda->raidAddress);
		/* Compute the data unit offset within the column. */
		coeff = (coeff % raidPtr->Layout.numDataCol);
		for (i = 0; i < numDataCol; i++) {
			npda.raidAddress = sosAddr + (i * secPerSU);
			(raidPtr->Layout.map->MapSector) (raidPtr,
			    npda.raidAddress, &(npda.row), &(npda.col),
			    &(npda.startSector), 0);
			/* Skip over dead disks. */
			if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col]
			    .status))
				if (i != coeff)
					break;
		}
		RF_ASSERT(i < numDataCol);
		RF_ASSERT(two == 0);
		/*
		 * Recover the data. Since we need only to recover one
		 * column, we overwrite the parity with the other one.
		 */
		if (coeff < i)	/* Recovering 'a'. */
			rf_PQ_recover((unsigned long *) ppda->bufPtr,
			    (unsigned long *) qpda->bufPtr,
			    (unsigned long *) pda->bufPtr,
			    (unsigned long *) ppda->bufPtr,
			    rf_RaidAddressToByte(raidPtr, pda->numSector),
			    coeff, i);
		else		/* Recovering 'b'. */
			rf_PQ_recover((unsigned long *) ppda->bufPtr,
			    (unsigned long *) qpda->bufPtr,
			    (unsigned long *) ppda->bufPtr,
			    (unsigned long *) pda->bufPtr,
			    rf_RaidAddressToByte(raidPtr, pda->numSector),
			    i, coeff);
	} else
		RF_PANIC();

	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	if (tracerec)
		tracerec->q_us += RF_ETIMER_VAL_US(timer);
	rf_GenericWakeupFunc(node, 0);
	return (0);
}
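
/*
 * Illustrative sketch (not part of the original driver, kept under
 * "#if 0"): the algebra rf_PQ_recover() implements for the two-erasure
 * case above. Once every surviving data unit has been folded out, the
 * partial parities reduce to
 *
 *	P' = Da + Db
 *	Q' = g^a * Da + g^b * Db	(all arithmetic in GF(2^8))
 *
 * so
 *
 *	Db = (Q' + g^a * P') / (g^a + g^b)
 *	Da = P' + Db
 *
 * The helper names are hypothetical. sketch_gf256_mul() is the
 * multiplication routine from the sketch further up, inversion uses
 * Fermat's little theorem (x^-1 = x^254 in GF(2^8)), and unlike
 * rf_PQ_recover(), which takes the column indices themselves, this
 * sketch takes the field coefficients g^a and g^b directly.
 */
#if 0
static unsigned char
sketch_gf256_inv(unsigned char x)
{
	unsigned char r = x;
	int i;

	/* Square-and-multiply: raises x to the 254th power. */
	for (i = 0; i < 6; i++)
		r = sketch_gf256_mul(sketch_gf256_mul(r, r), x);
	return (sketch_gf256_mul(r, r));
}

static void
sketch_pq_recover(unsigned char *pp, unsigned char *qp,
    unsigned char *abuf, unsigned char *bbuf, int len,
    unsigned char ga, unsigned char gb)
{
	unsigned char denom = sketch_gf256_inv(ga ^ gb);
	unsigned char pj, db;
	int j;

	for (j = 0; j < len; j++) {
		pj = pp[j];	/* read first: outputs may alias P or Q */
		db = sketch_gf256_mul(denom,
		    qp[j] ^ sketch_gf256_mul(ga, pj));
		bbuf[j] = db;
		abuf[j] = pj ^ db;
	}
}
#endif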

int
rf_PQWriteDoubleRecoveryFunc(RF_DagNode_t *node)
{
	/*
	 * The situation:
	 *
	 * We are doing a write that hits only one failed data unit. The other
	 * failed data unit is not being overwritten, so we need to generate
	 * it.
	 *
	 * For the moment, we assume all the nonfailed data being written is in
	 * the shadow of the failed data unit. (i.e., either a single data
	 * unit write or the entire failed stripe unit is being overwritten.)
	 *
	 * Recovery strategy: apply the recovery data to the parity and Q.
	 * Use P & Q to recover the second failed data unit in P. Zero fill
	 * Q, then apply the recovered data to it. Then apply the data being
	 * written to the failed drive. Then walk through the surviving
	 * drives, applying new data when it exists, otherwise the recovery
	 * data. Quite a mess.
	 *
	 *
	 * The params:
	 *
	 *	read pda0, read pda1, ..., read pda (numDataCol-3),
	 *	write pda0, ..., write pda (numStripeUnitAccess - numDataFailed),
	 *	failed pda, raidPtr, asmap
	 */

	int np = node->numParams;
	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *)
	    node->params[np - 1].p;
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	int i;
	RF_RaidAddr_t sosAddr;
	unsigned coeff;
	RF_StripeCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
	RF_PhysDiskAddr_t *ppda, *qpda, *pda, npda;
	int numDataCol = layoutPtr->numDataCol;
	RF_Etimer_t timer;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;

	RF_ASSERT(node->numResults == 2);
	RF_ASSERT(asmap->failedPDAs[1] == NULL);
	RF_ETIMER_START(timer);
	ppda = node->results[0];
	qpda = node->results[1];
	/* Apply the recovery data. */
	for (i = 0; i < numDataCol - 2; i++)
		rf_applyPDA(raidPtr, node->params[i].p, ppda, qpda,
		    node->dagHdr->bp);

	/* Determine the other failed data unit. */
	pda = asmap->failedPDAs[0];
	sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr,
	    asmap->raidAddress);
	/* Need to determine the column of the other failed disk. */
	coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
	/* Compute the data unit offset within the column. */
	coeff = (coeff % raidPtr->Layout.numDataCol);
	for (i = 0; i < numDataCol; i++) {
		npda.raidAddress = sosAddr + (i * secPerSU);
		(raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress,
		    &(npda.row), &(npda.col), &(npda.startSector), 0);
		/* Skip over dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
			if (i != coeff)
				break;
	}
	RF_ASSERT(i < numDataCol);
	/*
	 * Recover the data. The column we want to recover, we write over the
	 * parity. The column we don't care about, we dump in q.
	 */
	if (coeff < i)		/* Recovering 'a'. */
		rf_PQ_recover((unsigned long *) ppda->bufPtr,
		    (unsigned long *) qpda->bufPtr,
		    (unsigned long *) ppda->bufPtr,
		    (unsigned long *) qpda->bufPtr,
		    rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
	else			/* Recovering 'b'. */
		rf_PQ_recover((unsigned long *) ppda->bufPtr,
		    (unsigned long *) qpda->bufPtr,
		    (unsigned long *) qpda->bufPtr,
		    (unsigned long *) ppda->bufPtr,
		    rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);

	/* OK. The valid data is in P. Zero fill Q, then inc it into it. */
	bzero(qpda->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector));
	rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr,
	    rf_RaidAddressToByte(raidPtr, qpda->numSector), i);

	/* Now apply all the write data to the buffer. */
	/*
	 * Single stripe unit write case: The failed data is the only thing
	 * we are writing.
	 */
	RF_ASSERT(asmap->numStripeUnitsAccessed == 1);
	/* Dest, src, len, coeff. */
	rf_IncQ((unsigned long *) qpda->bufPtr,
	    (unsigned long *) asmap->failedPDAs[0]->bufPtr,
	    rf_RaidAddressToByte(raidPtr, qpda->numSector), coeff);
	rf_bxor(asmap->failedPDAs[0]->bufPtr, ppda->bufPtr,
	    rf_RaidAddressToByte(raidPtr, ppda->numSector), node->dagHdr->bp);

	/* Now apply all the recovery data. */
	for (i = 0; i < numDataCol - 2; i++)
		rf_applyPDA(raidPtr, node->params[i].p, ppda, qpda,
		    node->dagHdr->bp);

	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	if (tracerec)
		tracerec->q_us += RF_ETIMER_VAL_US(timer);

	rf_GenericWakeupFunc(node, 0);
	return (0);
}
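
/*
 * Illustrative sketch (not part of the original driver, kept under
 * "#if 0"): the "zero fill Q, then inc it into it" step above. Because
 * rf_IncQ() accumulates with XOR, zeroing the Q buffer and then folding
 * in a buffer D with coefficient i amounts to computing Q = g^i * D
 * from scratch. Names are hypothetical; sketch_gf256_mul() is from the
 * sketch further up.
 */
#if 0
static void
sketch_rebuild_q(unsigned char *q, const unsigned char *d,
    unsigned coeff, int len)
{
	unsigned char gc = 1;
	unsigned i;
	int j;

	for (i = 0; i < coeff; i++)
		gc = sketch_gf256_mul(gc, 2);		/* gc = g^coeff */
	for (j = 0; j < len; j++)
		q[j] = sketch_gf256_mul(gc, d[j]);	/* bzero + rf_IncQ */
}
#endif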

RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDLargeWrite)
{
	RF_PANIC();
}


/*
 * Two lost data unit write case.
 *
 * There are really two cases here:
 *
 * (1) The write completely covers the two lost data units.
 *     In that case, a reconstruct write that doesn't write the
 *     failed data units will do the correct thing. So in this case,
 *     the dag looks like
 *
 *         Full stripe read of surviving data units (not being overwritten)
 *         Write new data (ignoring failed units)
 *         Compute P&Q
 *         Write P&Q
 *
 * (2) The write does not completely cover both failed data units
 *     (but touches at least one of them). Then we need to do the
 *     equivalent of a reconstruct read to recover the missing data
 *     unit from the other stripe.
 *
 *     For any data we are writing that is not in the "shadow"
 *     of the failed units, we need to do a four cycle update.
 *     PANIC on this case. For now.
 */

RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateWriteDAG)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_SectorCount_t sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
	int sum;
	int nf = asmap->numDataFailed;

	sum = asmap->failedPDAs[0]->numSector;
	if (nf == 2)
		sum += asmap->failedPDAs[1]->numSector;

	if ((nf == 2) && (sum == (2 * sectorsPerSU))) {
		/* Large write case. */
		rf_PQ_DDLargeWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
		return;
	}
	if ((nf == asmap->numStripeUnitsAccessed) || (sum >= sectorsPerSU)) {
		/*
		 * Small write case; no user data falls outside the shadow
		 * of the failed units.
		 */
		rf_PQ_DDSimpleSmallWrite(raidPtr, asmap, dag_h, bp, flags,
		    allocList);
		return;
	}
	RF_PANIC();
}

RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDSimpleSmallWrite)
{
	rf_DoubleDegSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList,
	    "Rq", "Wq", "PQ Recovery", rf_PQWriteDoubleRecoveryFunc);
}

#endif	/* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) */