/* $OpenBSD: rf_pqdegdags.c,v 1.5 2002/12/16 07:01:04 tdeval Exp $ */ /* $NetBSD: rf_pqdegdags.c,v 1.5 1999/08/15 02:36:40 oster Exp $ */ /* * Copyright (c) 1995 Carnegie-Mellon University. * All rights reserved. * * Author: Daniel Stodolsky * * Permission to use, copy, modify and distribute this software and * its documentation is hereby granted, provided that both the copyright * notice and this permission notice appear in all copies of the * software, derivative works or modified versions, and any portions * thereof, and that both notices appear in supporting documentation. * * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. * * Carnegie Mellon requests users of this software to return to * * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU * School of Computer Science * Carnegie Mellon University * Pittsburgh PA 15213-3890 * * any improvements or extensions that they make and grant Carnegie the * rights to redistribute these changes. */ /* * rf_pqdegdags.c * Degraded mode dags for double fault cases. */ #include "rf_archs.h" #if (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) #include "rf_types.h" #include "rf_raid.h" #include "rf_dag.h" #include "rf_dagdegrd.h" #include "rf_dagdegwr.h" #include "rf_dagfuncs.h" #include "rf_dagutils.h" #include "rf_etimer.h" #include "rf_acctrace.h" #include "rf_general.h" #include "rf_pqdegdags.h" #include "rf_pq.h" void rf_applyPDA(RF_Raid_t *, RF_PhysDiskAddr_t *, RF_PhysDiskAddr_t *, RF_PhysDiskAddr_t *, void *); /* * Two data drives have failed, and we are doing a read that covers one of them. * We may also be reading some of the surviving drives. */ /***************************************************************************** * * Creates a DAG to perform a degraded-mode read of data within one stripe. * This DAG is as follows: * * Hdr * | * Block * / / \ \ \ \ * Rud ... Rud Rrd ... Rrd Rp Rq * | \ | \ | \ | \ | \ | \ * * | | * Unblock X * \ / * ------ T ------ * * Each R node is a successor of the L node. * One successor arc from each R node goes to U, and the other to X. * There is one Rud for each chunk of surviving user data requested by the * user, and one Rrd for each chunk of surviving user data _not_ being read * by the user. * R = read, ud = user data, rd = recovery (surviving) data, p = P data, * q = Qdata, X = pq recovery node, T = terminate * * The block & unblock nodes are leftovers from a previous version. They * do nothing, but I haven't deleted them because it would be a tremendous * effort to put them back in. * * Note: The target buffer for the XOR node is set to the actual user buffer * where the failed data is supposed to end up. This buffer is zero'd by the * code here. Thus, if you create a degraded read dag, use it, and then * re-use. You have to be sure to zero the target buffer prior to the re-use. * * Every buffer read is passed to the pq recovery node, whose job it is to * sort out what's needed and what's not. *****************************************************************************/ /* Init a disk node with 2 successors and one predecessor. */ #define INIT_DISK_NODE(node,name) \ do { \ rf_InitNode(node, rf_wait, RF_FALSE, rf_DiskReadFunc, \ rf_DiskReadUndoFunc, rf_GenericWakeupFunc, 2, 1, 4, 0, \ dag_h, name, allocList); \ (node)->succedents[0] = unblockNode; \ (node)->succedents[1] = recoveryNode; \ (node)->antecedents[0] = blockNode; \ (node)->antType[0] = rf_control; \ } while (0) #define DISK_NODE_PARAMS(_node_,_p_) \ do { \ (_node_).params[0].p = _p_ ; \ (_node_).params[1].p = (_p_)->bufPtr; \ (_node_).params[2].v = parityStripeID; \ (_node_).params[3].v = RF_CREATE_PARAM3(RF_IO_NORMAL_PRIORITY, \ 0, 0, which_ru); \ } while (0) #define DISK_NODE_PDA(node) ((node)->params[0].p) RF_CREATE_DAG_FUNC_DECL(rf_PQ_DoubleDegRead) { rf_DoubleDegRead(raidPtr, asmap, dag_h, bp, flags, allocList, "Rq", "PQ Recovery", rf_PQDoubleRecoveryFunc); } void rf_applyPDA(RF_Raid_t *raidPtr, RF_PhysDiskAddr_t *pda, RF_PhysDiskAddr_t *ppda, RF_PhysDiskAddr_t *qpda, void *bp) { RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); RF_RaidAddr_t s0off = rf_StripeUnitOffset(layoutPtr, ppda->startSector); RF_SectorCount_t s0len = ppda->numSector, len; RF_SectorNum_t suoffset; unsigned coeff; char *pbuf = ppda->bufPtr; char *qbuf = qpda->bufPtr; char *buf; int delta; suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector); len = pda->numSector; /* See if pda intersects a recovery pda. */ if ((suoffset < s0off + s0len) && (suoffset + len > s0off)) { buf = pda->bufPtr; coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), pda->raidAddress); coeff = (coeff % raidPtr->Layout.numDataCol); if (suoffset < s0off) { delta = s0off - suoffset; buf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta); suoffset = s0off; len -= delta; } if (suoffset > s0off) { delta = suoffset - s0off; pbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta); qbuf += rf_RaidAddressToStripeUnitID(&(raidPtr->Layout), delta); } if ((suoffset + len) > (s0len + s0off)) len = s0len + s0off - suoffset; /* Src, dest, len. */ rf_bxor(buf, pbuf, rf_RaidAddressToByte(raidPtr, len), bp); /* Dest, src, len, coeff. */ rf_IncQ((unsigned long *) qbuf, (unsigned long *) buf, rf_RaidAddressToByte(raidPtr, len), coeff); } } /* * Recover data in the case of a double failure. There can be two * result buffers, one for each chunk of data trying to be recovered. * The params are pda's that have not been range restricted or otherwise * politely massaged - this should be done here. The last params are the * pdas of P and Q, followed by the raidPtr. The list can look like * * pda, pda, ..., p pda, q pda, raidptr, asm * * or * * pda, pda, ..., p_1 pda, p_2 pda, q_1 pda, q_2 pda, raidptr, asm * * depending on whether two chunks of recovery data were required. * * The second condition only arises if there are two failed buffers * whose lengths do not add up a stripe unit. */ int rf_PQDoubleRecoveryFunc(RF_DagNode_t *node) { int np = node->numParams; RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p; RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p; RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout); int d, i; unsigned coeff; RF_RaidAddr_t sosAddr, suoffset; RF_SectorCount_t len, secPerSU = layoutPtr->sectorsPerStripeUnit; int two = 0; RF_PhysDiskAddr_t *ppda, *ppda2, *qpda, *qpda2, *pda, npda; char *buf; int numDataCol = layoutPtr->numDataCol; RF_Etimer_t timer; RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; RF_ETIMER_START(timer); if (asmap->failedPDAs[1] && (asmap->failedPDAs[1]->numSector + asmap->failedPDAs[0]->numSector < secPerSU)) { RF_ASSERT(0); ppda = node->params[np - 6].p; ppda2 = node->params[np - 5].p; qpda = node->params[np - 4].p; qpda2 = node->params[np - 3].p; d = (np - 6); two = 1; } else { ppda = node->params[np - 4].p; qpda = node->params[np - 3].p; d = (np - 4); } for (i = 0; i < d; i++) { pda = node->params[i].p; buf = pda->bufPtr; suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector); len = pda->numSector; coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress); /* Compute the data unit offset within the column. */ coeff = (coeff % raidPtr->Layout.numDataCol); /* See if pda intersects a recovery pda. */ rf_applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp); if (two) rf_applyPDA(raidPtr, pda, ppda, qpda, node->dagHdr->bp); } /* * Ok, we got the parity back to the point where we can recover. We * now need to determine the coeff of the columns that need to be * recovered. We can also only need to recover a single stripe unit. */ if (asmap->failedPDAs[1] == NULL) { /* * Only a single stripe unit * to recover. */ pda = asmap->failedPDAs[0]; sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress); /* Need to determine the column of the other failed disk. */ coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress); /* Compute the data unit offset within the column. */ coeff = (coeff % raidPtr->Layout.numDataCol); for (i = 0; i < numDataCol; i++) { npda.raidAddress = sosAddr + (i * secPerSU); (raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0); /* Skip over dead disks. */ if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col] .status)) if (i != coeff) break; } RF_ASSERT(i < numDataCol); RF_ASSERT(two == 0); /* * Recover the data. Since we need only to recover one * column, we overwrite the parity with the other one. */ if (coeff < i) /* Recovering 'a'. */ rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) pda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i); else /* Recovering 'b'. */ rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) pda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff); } else RF_PANIC(); RF_ETIMER_STOP(timer); RF_ETIMER_EVAL(timer); if (tracerec) tracerec->q_us += RF_ETIMER_VAL_US(timer); rf_GenericWakeupFunc(node, 0); return (0); } int rf_PQWriteDoubleRecoveryFunc(RF_DagNode_t *node) { /* * The situation: * * We are doing a write that hits only one failed data unit. The other * failed data unit is not being overwritten, so we need to generate * it. * * For the moment, we assume all the nonfailed data being written is in * the shadow of the failed data unit. (i.e., either a single data * unit write or the entire failed stripe unit is being overwritten.) * * Recovery strategy: apply the recovery data to the parity and Q. * Use P & Q to recover the second failed data unit in P. Zero fill * Q, then apply the recovered data to P. Then apply the data being * written to the failed drive. Then walk through the surviving drives, * applying new data when it exists, othewise the recovery data. * Quite a mess. * * * The params: * * read pda0, read pda1, ..., read pda (numDataCol-3), * write pda0, ..., write pda (numStripeUnitAccess - numDataFailed), * failed pda, raidPtr, asmap */ int np = node->numParams; RF_AccessStripeMap_t *asmap = (RF_AccessStripeMap_t *) node->params[np - 1].p; RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 2].p; RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & (raidPtr->Layout); int i; RF_RaidAddr_t sosAddr; unsigned coeff; RF_StripeCount_t secPerSU = layoutPtr->sectorsPerStripeUnit; RF_PhysDiskAddr_t *ppda, *qpda, *pda, npda; int numDataCol = layoutPtr->numDataCol; RF_Etimer_t timer; RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec; RF_ASSERT(node->numResults == 2); RF_ASSERT(asmap->failedPDAs[1] == NULL); RF_ETIMER_START(timer); ppda = node->results[0]; qpda = node->results[1]; /* apply the recovery data */ for (i = 0; i < numDataCol - 2; i++) rf_applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp); /* Determine the other failed data unit. */ pda = asmap->failedPDAs[0]; sosAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, asmap->raidAddress); /* Need to determine the column of the other failed disk. */ coeff = rf_RaidAddressToStripeUnitID(layoutPtr, pda->raidAddress); /* Compute the data unit offset within the column. */ coeff = (coeff % raidPtr->Layout.numDataCol); for (i = 0; i < numDataCol; i++) { npda.raidAddress = sosAddr + (i * secPerSU); (raidPtr->Layout.map->MapSector) (raidPtr, npda.raidAddress, &(npda.row), &(npda.col), &(npda.startSector), 0); /* Skip over dead disks. */ if (RF_DEAD_DISK(raidPtr->Disks[npda.row][npda.col].status)) if (i != coeff) break; } RF_ASSERT(i < numDataCol); /* * Recover the data. The column we want to recover, we write over the * parity. The column we don't care about, we dump in q. */ if (coeff < i) /* Recovering 'a'. */ rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), coeff, i); else /* Recovering 'b'. */ rf_PQ_recover((unsigned long *) ppda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, pda->numSector), i, coeff); /* OK. The valid data is in P. Zero fill Q, then inc it into it. */ bzero(qpda->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector)); rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) ppda->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), i); /* Now apply all the write data to the buffer. */ /* * Single stripe unit write case: The failed data is the only thing * we are writing. */ RF_ASSERT(asmap->numStripeUnitsAccessed == 1); /* Dest, src, len, coeff. */ rf_IncQ((unsigned long *) qpda->bufPtr, (unsigned long *) asmap->failedPDAs[0]->bufPtr, rf_RaidAddressToByte(raidPtr, qpda->numSector), coeff); rf_bxor(asmap->failedPDAs[0]->bufPtr, ppda->bufPtr, rf_RaidAddressToByte(raidPtr, ppda->numSector), node->dagHdr->bp); /* Now apply all the recovery data. */ for (i = 0; i < numDataCol - 2; i++) rf_applyPDA(raidPtr, node->params[i].p, ppda, qpda, node->dagHdr->bp); RF_ETIMER_STOP(timer); RF_ETIMER_EVAL(timer); if (tracerec) tracerec->q_us += RF_ETIMER_VAL_US(timer); rf_GenericWakeupFunc(node, 0); return (0); } RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDLargeWrite) { RF_PANIC(); } /* * Two lost data unit write case. * * There are really two cases here: * * (1) The write completely covers the two lost data units. * In that case, a reconstruct write that doesn't write the * failed data units will do the correct thing. So in this case, * the dag looks like * * Full stripe read of surviving data units (not being overwritten) * Write new data (ignoring failed units) * Compute P&Q * Write P&Q * * * (2) The write does not completely cover both failed data units * (but touches at least one of them). Then we need to do the * equivalent of a reconstruct read to recover the missing data * unit from the other stripe. * * For any data we are writing that is not in the "shadow" * of the failed units, we need to do a four cycle update. * PANIC on this case. For now. * */ RF_CREATE_DAG_FUNC_DECL(rf_PQ_200_CreateWriteDAG) { RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); RF_SectorCount_t sectorsPerSU = layoutPtr->sectorsPerStripeUnit; int sum; int nf = asmap->numDataFailed; sum = asmap->failedPDAs[0]->numSector; if (nf == 2) sum += asmap->failedPDAs[1]->numSector; if ((nf == 2) && (sum == (2 * sectorsPerSU))) { /* Large write case. */ rf_PQ_DDLargeWrite(raidPtr, asmap, dag_h, bp, flags, allocList); return; } if ((nf == asmap->numStripeUnitsAccessed) || (sum >= sectorsPerSU)) { /* Small write case, no user data not in shadow. */ rf_PQ_DDSimpleSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList); return; } RF_PANIC(); } RF_CREATE_DAG_FUNC_DECL(rf_PQ_DDSimpleSmallWrite) { rf_DoubleDegSmallWrite(raidPtr, asmap, dag_h, bp, flags, allocList, "Rq", "Wq", "PQ Recovery", rf_PQWriteDoubleRecoveryFunc); } #endif /* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) */