Annotation of sys/dev/raidframe/rf_pqdegdags.c, Revision 1.1
1.1 ! nbrk 1: /* $OpenBSD: rf_pqdegdags.c,v 1.5 2002/12/16 07:01:04 tdeval Exp $ */
! 2: /* $NetBSD: rf_pqdegdags.c,v 1.5 1999/08/15 02:36:40 oster Exp $ */
! 3:
! 4: /*
! 5: * Copyright (c) 1995 Carnegie-Mellon University.
! 6: * All rights reserved.
! 7: *
! 8: * Author: Daniel Stodolsky
! 9: *
! 10: * Permission to use, copy, modify and distribute this software and
! 11: * its documentation is hereby granted, provided that both the copyright
! 12: * notice and this permission notice appear in all copies of the
! 13: * software, derivative works or modified versions, and any portions
! 14: * thereof, and that both notices appear in supporting documentation.
! 15: *
! 16: * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
! 17: * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
! 18: * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
! 19: *
! 20: * Carnegie Mellon requests users of this software to return to
! 21: *
! 22: * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
! 23: * School of Computer Science
! 24: * Carnegie Mellon University
! 25: * Pittsburgh PA 15213-3890
! 26: *
! 27: * any improvements or extensions that they make and grant Carnegie the
! 28: * rights to redistribute these changes.
! 29: */
! 30:
! 31: /*
! 32: * rf_pqdegdags.c
! 33: * Degraded mode dags for double fault cases.
! 34: */
! 35:
! 36:
! 37: #include "rf_archs.h"
! 38:
! 39: #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)
! 40:
! 41: #include "rf_types.h"
! 42: #include "rf_raid.h"
! 43: #include "rf_dag.h"
! 44: #include "rf_dagdegrd.h"
! 45: #include "rf_dagdegwr.h"
! 46: #include "rf_dagfuncs.h"
! 47: #include "rf_dagutils.h"
! 48: #include "rf_etimer.h"
! 49: #include "rf_acctrace.h"
! 50: #include "rf_general.h"
! 51: #include "rf_pqdegdags.h"
! 52: #include "rf_pq.h"
! 53:
! 54: void rf_applyPDA(RF_Raid_t *, RF_PhysDiskAddr_t *, RF_PhysDiskAddr_t *,
! 55: RF_PhysDiskAddr_t *, void *);
! 56:
! 57: /*
! 58: * Two data drives have failed, and we are doing a read that covers one of them.
! 59: * We may also be reading some of the surviving drives.
! 60: */
! 61:
! 62:
! 63: /*****************************************************************************
! 64: *
! 65: * Creates a DAG to perform a degraded-mode read of data within one stripe.
! 66: * This DAG is as follows:
! 67: *
! 68: * Hdr
! 69: * |
! 70: * Block
! 71: * / / \ \ \ \
! 72: * Rud ... Rud Rrd ... Rrd Rp Rq
! 73: * | \ | \ | \ | \ | \ | \
! 74: *
! 75: * | |
! 76: * Unblock X
! 77: * \ /
! 78: * ------ T ------
! 79: *
! 80: * Each R node is a successor of the L node.
! 81: * One successor arc from each R node goes to U, and the other to X.
! 82: * There is one Rud for each chunk of surviving user data requested by the
! 83: * user, and one Rrd for each chunk of surviving user data _not_ being read
! 84: * by the user.
! 85: * R = read, ud = user data, rd = recovery (surviving) data, p = P data,
! 86: * q = Qdata, X = pq recovery node, T = terminate
! 87: *
! 88: * The block & unblock nodes are leftovers from a previous version. They
! 89: * do nothing, but I haven't deleted them because it would be a tremendous
! 90: * effort to put them back in.
! 91: *
! 92: * Note: The target buffer for the XOR node is set to the actual user buffer
! 93: * where the failed data is supposed to end up. This buffer is zero'd by the
! 94: * code here. Thus, if you create a degraded read dag, use it, and then
! 95: * re-use. You have to be sure to zero the target buffer prior to the re-use.
! 96: *
! 97: * Every buffer read is passed to the pq recovery node, whose job it is to
! 98: * sort out what's needed and what's not.
! 99: *****************************************************************************/
! 100:
/*
 * Init a disk node with 2 successors and one predecessor.
 *
 * Expands only in a scope where dag_h, allocList, unblockNode,
 * recoveryNode and blockNode are visible (the DAG-creation functions).
 * Successor 0 feeds the unblock node, successor 1 the PQ recovery node.
 */
#define INIT_DISK_NODE(node,name) \
do { \
	rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, \
	    rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2, 1, 4, 0, \
	    dag_h, name, allocList); \
	(node)->succedents[0] = unblockNode; \
	(node)->succedents[1] = recoveryNode; \
	(node)->antecedents[0] = blockNode; \
	(node)->antType[0] = rf_control; \
} while (0)

/*
 * Fill in the four standard disk-node params: the pda, its buffer,
 * the parity stripe ID, and the packed priority/reconstruct-unit word.
 * parityStripeID and which_ru must be in scope at the expansion site.
 */
#define DISK_NODE_PARAMS(_node_,_p_) \
do { \
	(_node_).params[0].p = _p_ ; \
	(_node_).params[1].p = (_p_)->bufPtr; \
	(_node_).params[2].v = parityStripeID; \
	(_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, \
	    0, 0, which_ru); \
} while (0)

/* The physical disk address stored in a disk node (params[0]). */
#define DISK_NODE_PDA(node) ((node)->params[0].p)
! 123:
/*
 * Build the double-degraded read DAG by delegating to the generic
 * builder, supplying the PQ-specific Q-read node name, recovery node
 * name, and the PQ recovery function defined below.
 */
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DoubleDegRead)
{
	rf_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList,
	    "Rq", "PQ Recovery", rf_PQDoubleRecoveryFunc);
}
! 129:
/*
 * Apply one surviving-data pda to the P and Q recovery buffers.
 *
 * ppda and qpda describe congruent stripe-unit ranges (same offset and
 * length).  Only the region where pda overlaps that range is applied:
 * the data is XORed into the parity buffer (rf_bxor) and folded into
 * the Q buffer (rf_IncQ) with the data-column coefficient derived from
 * the pda's raid address.  If there is no overlap, this is a no-op.
 */
void
rf_applyPDA(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda,
    RF_PhysDiskAddr_t *ppda, RF_PhysDiskAddr_t *qpda, void *bp)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_RaidAddr_t s0off = rf_StripeUnitOffset(layoutPtr, ppda->startSector);
	RF_SectorCount_t s0len = ppda->numSector, len;
	RF_SectorNum_t suoffset;
	unsigned coeff;
	char *pbuf = ppda->bufPtr;
	char *qbuf = qpda->bufPtr;
	char *buf;
	int delta;

	suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
	len = pda->numSector;
	/* See if pda intersects a recovery pda. */
	if ((suoffset < s0off + s0len) && (suoffset + len > s0off)) {
		buf = pda->bufPtr;
		/* Data-unit column coefficient used for the Q update. */
		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
		    pda->raidAddress);
		coeff = (coeff % raidPtr->Layout.numDataCol);

		/* pda begins before the recovery range: clip its head. */
		if (suoffset < s0off) {
			delta = s0off - suoffset;
			/*
			 * NOTE(review): the data buffer is advanced by
			 * rf_RaidAddressToStripeUnitID(delta), not by a byte
			 * count (rf_RaidAddressToByte()); the same scaling is
			 * used on pbuf/qbuf below.  A byte offset looks
			 * intended -- confirm against rf_pq.c before changing.
			 */
			buf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
			    delta);
			suoffset = s0off;
			len -= delta;
		}
		/* pda begins inside the range: advance the P/Q buffers. */
		if (suoffset > s0off) {
			delta = suoffset - s0off;
			pbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
			    delta);
			qbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
			    delta);
		}
		/* pda runs past the recovery range: clip its tail. */
		if ((suoffset + len) > (s0len + s0off))
			len = s0len + s0off - suoffset;

		/* Src, dest, len. */
		rf_bxor(buf, pbuf, rf_RaidAddressToByte(raidPtr, len), bp);

		/* Dest, src, len, coeff. */
		rf_IncQ((unsigned long *) qbuf, (unsigned long *) buf,
		    rf_RaidAddressToByte(raidPtr, len), coeff);
	}
}
! 178:
! 179:
! 180: /*
! 181: * Recover data in the case of a double failure. There can be two
! 182: * result buffers, one for each chunk of data trying to be recovered.
! 183: * The params are pda's that have not been range restricted or otherwise
! 184: * politely massaged - this should be done here. The last params are the
! 185: * pdas of P and Q, followed by the raidPtr. The list can look like
! 186: *
! 187: * pda, pda, ..., p pda, q pda, raidptr, asm
! 188: *
! 189: * or
! 190: *
! 191: * pda, pda, ..., p_1 pda, p_2 pda, q_1 pda, q_2 pda, raidptr, asm
! 192: *
! 193: * depending on whether two chunks of recovery data were required.
! 194: *
! 195: * The second condition only arises if there are two failed buffers
! 196: * whose lengths do not add up a stripe unit.
! 197: */
! 198:
! 199: int
! 200: rf_PQDoubleRecoveryFunc(RF_DagNode_t *node)
! 201: {
! 202: int np = node->numParams;
! 203: RF_AccessStripeMap_t *asmap =
! 204: (RF_AccessStripeMap_t *) node->params[np - 1].p;
! 205: RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
! 206: RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
! 207: int d, i;
! 208: unsigned coeff;
! 209: RF_RaidAddr_t sosAddr, suoffset;
! 210: RF_SectorCount_t len, secPerSU = layoutPtr->sectorsPerStripeUnit;
! 211: int two = 0;
! 212: RF_PhysDiskAddr_t *ppda, *ppda2, *qpda, *qpda2, *pda, npda;
! 213: char *buf;
! 214: int numDataCol = layoutPtr->numDataCol;
! 215: RF_Etimer_t timer;
! 216: RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
! 217:
! 218: RF_ETIMER_START(timer);
! 219:
! 220: if (asmap->failedPDAs[1] &&
! 221: (asmap->failedPDAs[1]->numSector +
! 222: asmap->failedPDAs[0]->numSector < secPerSU)) {
! 223: RF_ASSERT(0);
! 224: ppda = node->params[np - 6].p;
! 225: ppda2 = node->params[np - 5].p;
! 226: qpda = node->params[np - 4].p;
! 227: qpda2 = node->params[np - 3].p;
! 228: d = (np - 6);
! 229: two = 1;
! 230: } else {
! 231: ppda = node->params[np - 4].p;
! 232: qpda = node->params[np - 3].p;
! 233: d = (np - 4);
! 234: }
! 235:
! 236: for (i = 0; i < d; i++) {
! 237: pda = node->params[i].p;
! 238: buf = pda->bufPtr;
! 239: suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
! 240: len = pda->numSector;
! 241: coeff = rf_RaidAddressToStripeUnitID(layoutPtr,
! 242: pda->raidAddress);
! 243: /* Compute the data unit offset within the column. */
! 244: coeff = (coeff % raidPtr->Layout.numDataCol);
! 245: /* See if pda intersects a recovery pda. */
! 246: rf_applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
! 247: if (two)
! 248: rf_applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp);
! 249: }
! 250:
! 251: /*
! 252: * Ok, we got the parity back to the point where we can recover. We
! 253: * now need to determine the coeff of the columns that need to be
! 254: * recovered. We can also only need to recover a single stripe unit.
! 255: */
! 256:
! 257: if (asmap->failedPDAs[1] == NULL) { /*
! 258: * Only a single stripe unit
! 259: * to recover.
! 260: */
! 261: pda = asmap->failedPDAs[0];
! 262: sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr,
! 263: asmap->raidAddress);
! 264: /* Need to determine the column of the other failed disk. */
! 265: coeff = rf_RaidAddressToStripeUnitID(layoutPtr,
! 266: pda->raidAddress);
! 267: /* Compute the data unit offset within the column. */
! 268: coeff = (coeff % raidPtr->Layout.numDataCol);
! 269: for (i = 0; i < numDataCol; i++) {
! 270: npda.raidAddress = sosAddr + (i * secPerSU);
! 271: (raidPtr->Layout.map->MapSector) (raidPtr,
! 272: npda.raidAddress, &(npda.row), &(npda.col),
! 273: &(npda.startSector), 0);
! 274: /* Skip over dead disks. */
! 275: if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col]
! 276: .status))
! 277: if (i != coeff)
! 278: break;
! 279: }
! 280: RF_ASSERT(i < numDataCol);
! 281: RF_ASSERT(two == 0);
! 282: /*
! 283: * Recover the data. Since we need only to recover one
! 284: * column, we overwrite the parity with the other one.
! 285: */
! 286: if (coeff < i) /* Recovering 'a'. */
! 287: rf_PQ_recover((unsigned long *) ppda->bufPtr,
! 288: (unsigned long *) qpda->bufPtr,
! 289: (unsigned long *) pda->bufPtr,
! 290: (unsigned long *) ppda->bufPtr,
! 291: rf_RaidAddressToByte(raidPtr, pda->numSector),
! 292: coeff, i);
! 293: else /* Recovering 'b'. */
! 294: rf_PQ_recover((unsigned long *) ppda->bufPtr,
! 295: (unsigned long *) qpda->bufPtr,
! 296: (unsigned long *) ppda->bufPtr,
! 297: (unsigned long *) pda->bufPtr,
! 298: rf_RaidAddressToByte(raidPtr, pda->numSector),
! 299: i, coeff);
! 300: } else
! 301: RF_PANIC();
! 302:
! 303: RF_ETIMER_STOP(timer);
! 304: RF_ETIMER_EVAL(timer);
! 305: if (tracerec)
! 306: tracerec->q_us += RF_ETIMER_VAL_US(timer);
! 307: rf_GenericWakeupFunc(node, 0);
! 308: return (0);
! 309: }
! 310:
/*
 * Recovery node for a double-degraded write that overwrites exactly one
 * of the two failed data units.  Returns 0 after waking the node.
 */
int
rf_PQWriteDoubleRecoveryFunc(RF_DagNode_t *node)
{
	/*
	 * The situation:
	 *
	 * We are doing a write that hits only one failed data unit. The other
	 * failed data unit is not being overwritten, so we need to generate
	 * it.
	 *
	 * For the moment, we assume all the nonfailed data being written is in
	 * the shadow of the failed data unit. (i.e., either a single data
	 * unit write or the entire failed stripe unit is being overwritten.)
	 *
	 * Recovery strategy: apply the recovery data to the parity and Q.
	 * Use P & Q to recover the second failed data unit in P. Zero fill
	 * Q, then apply the recovered data to P. Then apply the data being
	 * written to the failed drive. Then walk through the surviving drives,
	 * applying new data when it exists, othewise the recovery data.
	 * Quite a mess.
	 *
	 *
	 * The params:
	 *
	 *     read pda0, read pda1, ..., read pda (numDataCol-3),
	 *     write pda0, ..., write pda (numStripeUnitAccess - numDataFailed),
	 *     failed pda, raidPtr, asmap
	 */

	int np = node->numParams;
	RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *)
	    node->params[np - 1].p;
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout);
	int i;
	RF_RaidAddr_t sosAddr;
	unsigned coeff;
	RF_StripeCount_t secPerSU = layoutPtr->sectorsPerStripeUnit;
	RF_PhysDiskAddr_t *ppda, *qpda, *pda, npda;
	int numDataCol = layoutPtr->numDataCol;
	RF_Etimer_t timer;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;

	/* This node only handles the single-failed-pda-written case. */
	RF_ASSERT(node->numResults == 2);
	RF_ASSERT(asmap->failedPDAs[1] == NULL);
	RF_ETIMER_START(timer);
	/* results[0]/[1] are the P and Q recovery buffers. */
	ppda = node->results[0];
	qpda = node->results[1];
	/* apply the recovery data */
	for (i = 0; i < numDataCol - 2; i++)
		rf_applyPDA(raidPtr, node->params[i].p, ppda, qpda,
		    node->dagHdr->bp);

	/* Determine the other failed data unit. */
	pda = asmap->failedPDAs[0];
	sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr,
	    asmap->raidAddress);
	/* Need to determine the column of the other failed disk. */
	coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress);
	/* Compute the data unit offset within the column. */
	coeff = (coeff % raidPtr->Layout.numDataCol);
	/* Walk the stripe to find the other dead data column (ends with
	 * i = its column index; the loop assert below requires a hit). */
	for (i = 0; i < numDataCol; i++) {
		npda.raidAddress = sosAddr + (i * secPerSU);
		(raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress,
		    &(npda.row), &(npda.col), &(npda.startSector), 0);
		/* Skip over dead disks. */
		if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status))
			if (i != coeff)
				break;
	}
	RF_ASSERT(i < numDataCol);
	/*
	 * Recover the data. The column we want to recover, we write over the
	 * parity. The column we don't care about, we dump in q.
	 */
	if (coeff < i)		/* Recovering 'a'. */
		rf_PQ_recover((unsigned long *) ppda->bufPtr,
		    (unsigned long *) qpda->bufPtr,
		    (unsigned long *) ppda->bufPtr,
		    (unsigned long *) qpda->bufPtr,
		    rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i);
	else			/* Recovering 'b'. */
		rf_PQ_recover((unsigned long *) ppda->bufPtr,
		    (unsigned long *) qpda->bufPtr,
		    (unsigned long *) qpda->bufPtr,
		    (unsigned long *) ppda->bufPtr,
		    rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff);

	/* OK. The valid data is in P. Zero fill Q, then inc it into it. */
	bzero(qpda->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector));
	/* i is the recovered column's coefficient here. */
	rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr,
	    rf_RaidAddressToByte(raidPtr, qpda->numSector), i);

	/* Now apply all the write data to the buffer. */
	/*
	 * Single stripe unit write case: The failed data is the only thing
	 * we are writing.
	 */
	RF_ASSERT(asmap->numStripeUnitsAccessed == 1);
	/* Dest, src, len, coeff. */
	rf_IncQ((unsigned long *) qpda->bufPtr,
	    (unsigned long *) asmap->failedPDAs[0]->bufPtr,
	    rf_RaidAddressToByte(raidPtr, qpda->numSector), coeff);
	rf_bxor(asmap->failedPDAs[0]->bufPtr, ppda->bufPtr,
	    rf_RaidAddressToByte(raidPtr, ppda->numSector), node->dagHdr->bp);

	/* Now apply all the recovery data. */
	for (i = 0; i < numDataCol - 2; i++)
		rf_applyPDA(raidPtr, node->params[i].p, ppda, qpda,
		    node->dagHdr->bp);

	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	if (tracerec)
		tracerec->q_us += RF_ETIMER_VAL_US(timer);

	rf_GenericWakeupFunc(node, 0);
	return (0);
}
! 430:
/*
 * Double-degraded large (reconstruct) write DAG: not implemented.
 * rf_PQ_200_CreateWriteDAG can route here, so reaching this panics.
 */
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDLargeWrite)
{
	RF_PANIC();
}
! 435:
! 436:
! 437: /*
! 438: * Two lost data unit write case.
! 439: *
! 440: * There are really two cases here:
! 441: *
! 442: * (1) The write completely covers the two lost data units.
! 443: * In that case, a reconstruct write that doesn't write the
! 444: * failed data units will do the correct thing. So in this case,
! 445: * the dag looks like
! 446: *
! 447: * Full stripe read of surviving data units (not being overwritten)
! 448: * Write new data (ignoring failed units)
! 449: * Compute P&Q
! 450: * Write P&Q
! 451: *
! 452: *
! 453: * (2) The write does not completely cover both failed data units
! 454: * (but touches at least one of them). Then we need to do the
! 455: * equivalent of a reconstruct read to recover the missing data
! 456: * unit from the other stripe.
! 457: *
! 458: * For any data we are writing that is not in the "shadow"
! 459: * of the failed units, we need to do a four cycle update.
! 460: * PANIC on this case. For now.
! 461: *
! 462: */
! 463:
! 464: RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateWriteDAG)
! 465: {
! 466: RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
! 467: RF_SectorCount_t sectorsPerSU = layoutPtr->sectorsPerStripeUnit;
! 468: int sum;
! 469: int nf = asmap->numDataFailed;
! 470:
! 471: sum = asmap->failedPDAs[0]->numSector;
! 472: if (nf == 2)
! 473: sum += asmap->failedPDAs[1]->numSector;
! 474:
! 475: if ((nf == 2) && (sum == (2 * sectorsPerSU))) {
! 476: /* Large write case. */
! 477: rf_PQ_DDLargeWrite(raidPtr, asmap, dag_h, bp, flags, allocList);
! 478: return;
! 479: }
! 480: if ((nf == asmap->numStripeUnitsAccessed) || (sum >= sectorsPerSU)) {
! 481: /* Small write case, no user data not in shadow. */
! 482: rf_PQ_DDSimpleSmallWrite(raidPtr, asmap, dag_h, bp, flags,
! 483: allocList);
! 484: return;
! 485: }
! 486: RF_PANIC();
! 487: }
! 488:
/*
 * Build the double-degraded small-write DAG by delegating to the
 * generic builder with the PQ-specific Q read/write node names and
 * the PQ write recovery function defined above.
 */
RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDSimpleSmallWrite)
{
	rf_DoubleDegSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList,
	    "Rq", "Wq", "PQ Recovery", rf_PQWriteDoubleRecoveryFunc);
}
! 494:
! 495: #endif /* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) */
CVSweb