[BACK]Return to rf_pq.c CVS log [TXT][DIR] Up to [local] / sys / dev / raidframe

File: [local] / sys / dev / raidframe / rf_pq.c (download)

Revision 1.1, Tue Mar 4 16:09:50 2008 UTC (16 years, 2 months ago) by nbrk
Branch point for: MAIN

Initial revision

/*	$OpenBSD: rf_pq.c,v 1.6 2002/12/16 07:01:04 tdeval Exp $	*/
/*	$NetBSD: rf_pq.c,v 1.7 2000/01/07 03:41:02 oster Exp $	*/

/*
 * Copyright (c) 1995 Carnegie-Mellon University.
 * All rights reserved.
 *
 * Author: Daniel Stodolsky
 *
 * Permission to use, copy, modify and distribute this software and
 * its documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
 * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie the
 * rights to redistribute these changes.
 */

/*
 * Code for RAID level 6 (P + Q) disk array architecture.
 */

#include "rf_archs.h"
#include "rf_types.h"
#include "rf_raid.h"
#include "rf_dag.h"
#include "rf_dagffrd.h"
#include "rf_dagffwr.h"
#include "rf_dagdegrd.h"
#include "rf_dagdegwr.h"
#include "rf_dagutils.h"
#include "rf_dagfuncs.h"
#include "rf_etimer.h"
#include "rf_pqdeg.h"
#include "rf_general.h"
#include "rf_map.h"
#include "rf_pq.h"

RF_RedFuncs_t rf_pFuncs = {
	rf_RegularONPFunc, "Regular Old-New P",
	rf_SimpleONPFunc, "Simple Old-New P"
};
RF_RedFuncs_t rf_pRecoveryFuncs = {
	rf_RecoveryPFunc, "Recovery P Func",
	rf_RecoveryPFunc, "Recovery P Func"
};

int
rf_RegularONPFunc(RF_DagNode_t *node)
{
	return (rf_RegularXorFunc(node));
}


/*
 * Same as simpleONQ func, but the coefficient is always 1.
 */

int
rf_SimpleONPFunc(RF_DagNode_t *node)
{
	return (rf_SimpleXorFunc(node));
}

int
rf_RecoveryPFunc(RF_DagNode_t *node)
{
	return (rf_RecoveryXorFunc(node));
}

int
rf_RegularPFunc(RF_DagNode_t *node)
{
	return (rf_RegularXorFunc(node));
}


#if	(RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0)

void rf_QDelta(char *dest, char *obuf, char *nbuf, unsigned length,
	unsigned char coeff);
void rf_InvertQ(unsigned long *qbuf, unsigned long *abuf, unsigned length,
	unsigned coeff);

RF_RedFuncs_t rf_qFuncs = {
	rf_RegularONQFunc, "Regular Old-New Q",
	rf_SimpleONQFunc, "Simple Old-New Q"
};
RF_RedFuncs_t rf_qRecoveryFuncs = {
	rf_RecoveryQFunc, "Recovery Q Func",
	rf_RecoveryQFunc, "Recovery Q Func"
};
RF_RedFuncs_t rf_pqRecoveryFuncs = {
	rf_RecoveryPQFunc, "Recovery PQ Func",
	rf_RecoveryPQFunc, "Recovery PQ Func"
};

void
rf_PQDagSelect(RF_Raid_t *raidPtr, RF_IoType_t type,
    RF_AccessStripeMap_t *asmap, RF_VoidFuncPtr *createFunc)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	unsigned ndfail = asmap->numDataFailed;
	unsigned npfail = asmap->numParityFailed;
	unsigned ntfail = npfail + ndfail;

	RF_ASSERT(RF_IO_IS_R_OR_W(type));
	if (ntfail > 2) {
		RF_ERRORMSG("more than two disks failed in a single group !"
		            "  Aborting I/O operation.\n");
		 /* *infoFunc = */ *createFunc = NULL;
		return;
	}
	/* Ok, we can do this I/O. */
	if (type == RF_IO_TYPE_READ) {
		switch (ndfail) {
		case 0:
			/* Fault free read. */
			*createFunc = (RF_VoidFuncPtr)
			    rf_CreateFaultFreeReadDAG;	/* Same as raid 5. */
			break;
		case 1:
			/* Lost a single data unit. */
			/*
			 * Two cases:
			 * (1) Parity is not lost. Do a normal raid 5
			 *     reconstruct read.
			 * (2) Parity is lost. Do a reconstruct read using "q".
			 */
			if (ntfail == 2) {	/* Also lost redundancy. */
				if (asmap->failedPDAs[1]->type ==
				    RF_PDA_TYPE_PARITY)
					*createFunc = (RF_VoidFuncPtr)
					    rf_PQ_110_CreateReadDAG;
				else
					*createFunc = (RF_VoidFuncPtr)
					    rf_PQ_101_CreateReadDAG;
			} else {
				/*
				 * P and Q are ok. But is there a failure in
				 * some unaccessed data unit ?
				 */
				if (rf_NumFailedDataUnitsInStripe(raidPtr,
				    asmap) == 2)
					*createFunc = (RF_VoidFuncPtr)
					    rf_PQ_200_CreateReadDAG;
				else
					*createFunc = (RF_VoidFuncPtr)
					    rf_PQ_100_CreateReadDAG;
			}
			break;
		case 2:
			/* Lost two data units. */
			/* *infoFunc = rf_PQOneTwo; */
			*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateReadDAG;
			break;
		}
		return;
	}
	/* A write. */
	switch (ntfail) {
	case 0:		/* Fault free. */
		if (rf_suppressLocksAndLargeWrites ||
		    (((asmap->numStripeUnitsAccessed <=
		       (layoutPtr->numDataCol / 2)) &&
		      (layoutPtr->numDataCol != 1)) ||
		     (asmap->parityInfo->next != NULL) ||
		     (asmap->qInfo->next != NULL) ||
		     rf_CheckStripeForFailures(raidPtr, asmap))) {

			*createFunc = (RF_VoidFuncPtr) rf_PQCreateSmallWriteDAG;
		} else {
			*createFunc = (RF_VoidFuncPtr) rf_PQCreateLargeWriteDAG;
		}
		break;

	case 1:		/* Single disk fault. */
		if (npfail == 1) {
			RF_ASSERT((asmap->failedPDAs[0]->type ==
			    RF_PDA_TYPE_PARITY) ||
			    (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q));
			if (asmap->failedPDAs[0]->type == RF_PDA_TYPE_Q) {
				/*
				 * Q died, treat like normal mode raid5 write.
				 */
				if (((asmap->numStripeUnitsAccessed <=
				      (layoutPtr->numDataCol / 2)) ||
				     (asmap->numStripeUnitsAccessed == 1)) ||
				    rf_NumFailedDataUnitsInStripe(raidPtr,
				     asmap))
					*createFunc = (RF_VoidFuncPtr)
					    rf_PQ_001_CreateSmallWriteDAG;
				else
					*createFunc = (RF_VoidFuncPtr)
					    rf_PQ_001_CreateLargeWriteDAG;
			} else {/* Parity died, small write only updating Q. */
				if (((asmap->numStripeUnitsAccessed <=
				      (layoutPtr->numDataCol / 2)) ||
				     (asmap->numStripeUnitsAccessed == 1)) ||
				    rf_NumFailedDataUnitsInStripe(raidPtr,
				     asmap))
					*createFunc = (RF_VoidFuncPtr)
					    rf_PQ_010_CreateSmallWriteDAG;
				else
					*createFunc = (RF_VoidFuncPtr)
					    rf_PQ_010_CreateLargeWriteDAG;
			}
		} else {	/*
				 * Data missing. Do a P reconstruct write if
				 * only a single data unit is lost in the
				 * stripe, otherwise a PQ reconstruct write.
				 */
			if (rf_NumFailedDataUnitsInStripe(raidPtr, asmap) == 2)
				*createFunc = (RF_VoidFuncPtr)
				    rf_PQ_200_CreateWriteDAG;
			else
				*createFunc = (RF_VoidFuncPtr)
				    rf_PQ_100_CreateWriteDAG;
		}
		break;

	case 2:		/* Two disk faults. */
		switch (npfail) {
		case 2:	/* Both p and q dead. */
			*createFunc = (RF_VoidFuncPtr) rf_PQ_011_CreateWriteDAG;
			break;
		case 1:	/* Either p or q and dead data. */
			RF_ASSERT(asmap->failedPDAs[0]->type ==
			          RF_PDA_TYPE_DATA);
			RF_ASSERT((asmap->failedPDAs[1]->type ==
			           RF_PDA_TYPE_PARITY) ||
			          (asmap->failedPDAs[1]->type ==
			           RF_PDA_TYPE_Q));
			if (asmap->failedPDAs[1]->type == RF_PDA_TYPE_Q)
				*createFunc = (RF_VoidFuncPtr)
				    rf_PQ_101_CreateWriteDAG;
			else
				*createFunc = (RF_VoidFuncPtr)
				    rf_PQ_110_CreateWriteDAG;
			break;
		case 0:	/* Double data loss. */
			*createFunc = (RF_VoidFuncPtr) rf_PQ_200_CreateWriteDAG;
			break;
		}
		break;

	default:	/* More than 2 disk faults. */
		*createFunc = NULL;
		RF_PANIC();
	}
	return;
}


/*
 * Used as a stop gap info function.
 */
#if 0
void
rf_PQOne(RF_Raid_t *raidPtr, int *nSucc, int *nAnte,
    RF_AccessStripeMap_t *asmap)
{
	*nSucc = *nAnte = 1;
}

void
rf_PQOneTwo(RF_Raid_t *raidPtr, int *nSucc, int *nAnte,
    RF_AccessStripeMap_t *asmap)
{
	*nSucc = 1;
	*nAnte = 2;
}
#endif

RF_CREATE_DAG_FUNC_DECL(rf_PQCreateLargeWriteDAG)
{
	rf_CommonCreateLargeWriteDAG(raidPtr, asmap, dag_h, bp, flags,
	    allocList, 2, rf_RegularPQFunc, RF_FALSE);
}

int
rf_RegularONQFunc(RF_DagNode_t *node)
{
	int np = node->numParams;
	int d;
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
	int i;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_Etimer_t timer;
	char *qbuf, *qpbuf;
	char *obuf, *nbuf;
	RF_PhysDiskAddr_t *old, *new;
	unsigned long coeff;
	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;

	RF_ETIMER_START(timer);

	d = (np - 3) / 4;
	RF_ASSERT(4 * d + 3 == np);
	qbuf = (char *) node->params[2 * d + 1].p;	/* Q buffer. */
	for (i = 0; i < d; i++) {
		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
		obuf = (char *) node->params[2 * i + 1].p;
		new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p;
		nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p;
		RF_ASSERT(new->numSector == old->numSector);
		RF_ASSERT(new->raidAddress == old->raidAddress);
		/*
		 * The stripe unit within the stripe tells us the coefficient
		 * to use for the multiply.
		 */
		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
		    new->raidAddress);
		/*
		 * Compute the data unit offset within the column, then add
		 * one.
		 */
		coeff = (coeff % raidPtr->Layout.numDataCol);
		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr,
		    old->startSector % secPerSU);
		rf_QDelta(qpbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr,
		    old->numSector), coeff);
	}

	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->q_us += RF_ETIMER_VAL_US(timer);
	rf_GenericWakeupFunc(node, 0);	/*
					 * Call wake func explicitly since no
					 * I/O in this node.
					 */
	return (0);
}


/*
 * See the SimpleXORFunc for the difference between a simple and regular func.
 * These Q functions should be used for
 *	new q = Q(data, old data, old q)
 * style updates and not for
 *	q = (new data, new data, ...)
 * computations.
 *
 * The simple q takes 2(2d+1)+1 params, where d is the number
 * of stripes written. The order of params is
 *   old data pda_0, old data buffer_0, old data pda_1, old data buffer_1, ...
 *   old data pda_d, old data buffer_d
 *   [2d] old q pda_0, old q buffer
 *   [2d_2] new data pda_0, new data buffer_0, ...
 *   new data pda_d, new data buffer_d
 *   raidPtr
 */

int
rf_SimpleONQFunc(RF_DagNode_t *node)
{
	int np = node->numParams;
	int d;
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
	int i;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_Etimer_t timer;
	char *qbuf;
	char *obuf, *nbuf;
	RF_PhysDiskAddr_t *old, *new;
	unsigned long coeff;

	RF_ETIMER_START(timer);

	d = (np - 3) / 4;
	RF_ASSERT(4 * d + 3 == np);
	qbuf = (char *) node->params[2 * d + 1].p;	/* Q buffer. */
	for (i = 0; i < d; i++) {
		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
		obuf = (char *) node->params[2 * i + 1].p;
		new = (RF_PhysDiskAddr_t *) node->params[2 * (d + 1 + i)].p;
		nbuf = (char *) node->params[2 * (d + 1 + i) + 1].p;
		RF_ASSERT(new->numSector == old->numSector);
		RF_ASSERT(new->raidAddress == old->raidAddress);
		/*
		 * The stripe unit within the stripe tells us the coefficient
		 * to use for the multiply.
		 */
		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
		    new->raidAddress);
		/*
		 * Compute the data unit offset within the column, then add
		 * one.
		 */
		coeff = (coeff % raidPtr->Layout.numDataCol);
		rf_QDelta(qbuf, obuf, nbuf, rf_RaidAddressToByte(raidPtr,
		    old->numSector), coeff);
	}

	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->q_us += RF_ETIMER_VAL_US(timer);
	rf_GenericWakeupFunc(node, 0);	/*
					 * Call wake func explicitly since no
					 * I/O in this node.
					 */
	return (0);
}

RF_CREATE_DAG_FUNC_DECL(rf_PQCreateSmallWriteDAG)
{
	rf_CommonCreateSmallWriteDAG(raidPtr, asmap, dag_h, bp, flags,
	    allocList, &rf_pFuncs, &rf_qFuncs);
}


void rf_RegularQSubr(RF_DagNode_t *, char *);

void
rf_RegularQSubr(RF_DagNode_t *node, char *qbuf)
{
	int np = node->numParams;
	int d;
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	int i;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_Etimer_t timer;
	char *obuf, *qpbuf;
	RF_PhysDiskAddr_t *old;
	unsigned long coeff;

	RF_ETIMER_START(timer);

	d = (np - 1) / 2;
	RF_ASSERT(2 * d + 1 == np);
	for (i = 0; i < d; i++) {
		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
		obuf = (char *) node->params[2 * i + 1].p;
		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
		    old->raidAddress);
		/*
		 * Compute the data unit offset within the column, then add
		 * one.
		 */
		coeff = (coeff % raidPtr->Layout.numDataCol);
		/*
		 * The input buffers may not all be aligned with the start of
		 * the stripe. So shift by their sector offset within the
		 * stripe unit.
		 */
		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr,
		    old->startSector % secPerSU);
		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf,
		    rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
	}

	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->q_us += RF_ETIMER_VAL_US(timer);
}


/*
 * Used in degraded writes.
 */

void rf_DegrQSubr(RF_DagNode_t *);

void
rf_DegrQSubr(RF_DagNode_t *node)
{
	int np = node->numParams;
	int d;
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_Etimer_t timer;
	char *qbuf = node->results[1];
	char *obuf, *qpbuf;
	RF_PhysDiskAddr_t *old;
	unsigned long coeff;
	unsigned fail_start;
	int i, j;

	old = (RF_PhysDiskAddr_t *) node->params[np - 2].p;
	fail_start = old->startSector % secPerSU;

	RF_ETIMER_START(timer);

	d = (np - 2) / 2;
	RF_ASSERT(2 * d + 2 == np);
	for (i = 0; i < d; i++) {
		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
		obuf = (char *) node->params[2 * i + 1].p;
		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
		    old->raidAddress);
		/*
		 * Compute the data unit offset within the column, then add
		 * one.
		 */
		coeff = (coeff % raidPtr->Layout.numDataCol);
		/*
		 * The input buffers may not all be aligned with the start of
		 * the stripe. So shift by their sector offset within the
		 * stripe unit.
		 */
		j = old->startSector % secPerSU;
		RF_ASSERT(j >= fail_start);
		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start);
		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf,
		    rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
	}

	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->q_us += RF_ETIMER_VAL_US(timer);
}


/*
 * Called by large write code to compute the new parity and the new q.
 *
 * Structure of the params:
 *
 *   pda_0, buffer_0, pda_1 , buffer_1, ..., pda_d, buffer_d (d = numDataCol)
 *   raidPtr
 *
 * For a total of 2d+1 arguments.
 * The result buffers results[0], results[1] are the buffers for the p and q,
 * respectively.
 *
 * We compute Q first, then compute P. The P calculation may try to reuse
 * one of the input buffers for its output, so if we computed P first, we would
 * corrupt the input for the q calculation.
 */

int
rf_RegularPQFunc(RF_DagNode_t *node)
{
	rf_RegularQSubr(node, node->results[1]);
	return (rf_RegularXorFunc(node));	/* Does the wakeup. */
}

int
rf_RegularQFunc(RF_DagNode_t *node)
{
	/* Almost ... adjust Qsubr args. */
	rf_RegularQSubr(node, node->results[0]);
	rf_GenericWakeupFunc(node, 0);	/*
					 * Call wake func explicitly since no
					 * I/O in this node.
					 */
	return (0);
}


/*
 * Called by singly degraded write code to compute the new parity and
 * the new q.
 *
 * Structure of the params:
 *
 *   pda_0, buffer_0, pda_1 , buffer_1, ..., pda_d, buffer_d
 *   failedPDA raidPtr
 *
 * for a total of 2d+2 arguments.
 * The result buffers results[0], results[1] are the buffers for the parity
 * and q, respectively.
 *
 * We compute Q first, then compute parity. The parity calculation may try
 * to reuse one of the input buffers for its output, so if we computed parity
 * first, we would corrupt the input for the q calculation.
 *
 * We treat this identically to the regularPQ case, ignoring the failedPDA
 * extra argument.
 */

void
rf_Degraded_100_PQFunc(RF_DagNode_t *node)
{
	int     np = node->numParams;

	RF_ASSERT(np >= 2);
	rf_DegrQSubr(node);
	rf_RecoveryXorFunc(node);
}


/*
 * The two below are used when reading a stripe with a single lost data unit.
 * The parameters are
 *
 *  pda_0, buffer_0, ..., pda_n, buffer_n, P pda, P buffer, failedPDA, raidPtr
 *
 * and results[0] contains the data buffer, which is originally zero-filled.
 */

/*
 * This Q func is used by the degraded-mode dag functions to recover lost data.
 * The second-to-last parameter is the PDA for the failed portion of the
 * access. The code here looks at this PDA and assumes that the xor target
 * buffer is equal in size to the number of sectors in the failed PDA. It then
 * uses the other PDAs in the parameter list to determine where within the
 * target buffer the corresponding data should be xored.
 *
 * Recall the basic equation is
 *
 *     Q = (data_1 + 2 * data_2 ... + k * data_k) mod 256
 *
 * so to recover data_j we need
 *
 *    J data_j = (Q - data_1 - 2 data_2 ... - k * data_k) mod 256
 *
 * So the coefficient for each buffer is (255 - data_col), and j should be
 * initialized by copying Q into it. Then we need to do a table lookup to
 * convert to solve
 *   data_j /= J
 *
 */

int
rf_RecoveryQFunc(RF_DagNode_t *node)
{
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
	RF_RaidLayout_t *layoutPtr = (RF_RaidLayout_t *) & raidPtr->Layout;
	RF_PhysDiskAddr_t *failedPDA =
	    (RF_PhysDiskAddr_t *) node->params[node->numParams - 2].p;
	int i;
	RF_PhysDiskAddr_t *pda;
	RF_RaidAddr_t suoffset;
	RF_RaidAddr_t failedSUOffset =
	    rf_StripeUnitOffset(layoutPtr, failedPDA->startSector);
	char *srcbuf, *destbuf;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_Etimer_t timer;
	unsigned long coeff;

	RF_ETIMER_START(timer);
	/* Start by copying Q into the buffer. */
	bcopy(node->params[node->numParams - 3].p, node->results[0],
	    rf_RaidAddressToByte(raidPtr, failedPDA->numSector));
	for (i = 0; i < node->numParams - 4; i += 2) {
		RF_ASSERT(node->params[i + 1].p != node->results[0]);
		pda = (RF_PhysDiskAddr_t *) node->params[i].p;
		srcbuf = (char *) node->params[i + 1].p;
		suoffset = rf_StripeUnitOffset(layoutPtr, pda->startSector);
		destbuf = ((char *) node->results[0]) +
		    rf_RaidAddressToByte(raidPtr, suoffset - failedSUOffset);
		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
		    pda->raidAddress);
		/* Compute the data unit offset within the column. */
		coeff = (coeff % raidPtr->Layout.numDataCol);
		rf_IncQ((unsigned long *) destbuf, (unsigned long *) srcbuf,
		    rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
	}
	/* Do the nasty inversion now. */
	coeff = (rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
	    failedPDA->startSector) % raidPtr->Layout.numDataCol);
	rf_InvertQ(node->results[0], node->results[0],
	    rf_RaidAddressToByte(raidPtr, pda->numSector), coeff);
	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->q_us += RF_ETIMER_VAL_US(timer);
	rf_GenericWakeupFunc(node, 0);
	return (0);
}

int
rf_RecoveryPQFunc(RF_DagNode_t *node)
{
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[node->numParams - 1].p;
	printf("raid%d: Recovery from PQ not implemented.\n", raidPtr->raidid);
	return (1);
}


/*
 * Degraded write Q subroutine.
 * Used when P is dead.
 * Large-write style Q computation.
 * Parameters:
 *
 * (pda, buf), (pda, buf), ..., (failedPDA, bufPtr), failedPDA, raidPtr.
 *
 * We ignore failedPDA.
 *
 * This is a "simple style" recovery func.
 */

void
rf_PQ_DegradedWriteQFunc(RF_DagNode_t *node)
{
	int np = node->numParams;
	int d;
	RF_Raid_t *raidPtr = (RF_Raid_t *) node->params[np - 1].p;
	unsigned secPerSU = raidPtr->Layout.sectorsPerStripeUnit;
	RF_AccTraceEntry_t *tracerec = node->dagHdr->tracerec;
	RF_Etimer_t timer;
	char *qbuf = node->results[0];
	char *obuf, *qpbuf;
	RF_PhysDiskAddr_t *old;
	unsigned long coeff;
	int fail_start, i, j;

	old = (RF_PhysDiskAddr_t *) node->params[np - 2].p;
	fail_start = old->startSector % secPerSU;

	RF_ETIMER_START(timer);

	d = (np - 2) / 2;
	RF_ASSERT(2 * d + 2 == np);

	for (i = 0; i < d; i++) {
		old = (RF_PhysDiskAddr_t *) node->params[2 * i].p;
		obuf = (char *) node->params[2 * i + 1].p;
		coeff = rf_RaidAddressToStripeUnitID(&(raidPtr->Layout),
		    old->raidAddress);
		/*
		 * Compute the data unit offset within the column, then add
		 * one.
		 */
		coeff = (coeff % raidPtr->Layout.numDataCol);
		j = old->startSector % secPerSU;
		RF_ASSERT(j >= fail_start);
		qpbuf = qbuf + rf_RaidAddressToByte(raidPtr, j - fail_start);
		rf_IncQ((unsigned long *) qpbuf, (unsigned long *) obuf,
		    rf_RaidAddressToByte(raidPtr, old->numSector), coeff);
	}

	RF_ETIMER_STOP(timer);
	RF_ETIMER_EVAL(timer);
	tracerec->q_us += RF_ETIMER_VAL_US(timer);
	rf_GenericWakeupFunc(node, 0);
}


/* Q computations. */

/*
 * Coeff - colummn;
 *
 * Compute  dest ^= qfor[28-coeff][rn[coeff+1] a]
 *
 * On 5-bit basis;
 * Length in bytes;
 */

void
rf_IncQ(unsigned long *dest, unsigned long *buf, unsigned length,
    unsigned coeff)
{
	unsigned long a, d, new;
	unsigned long a1, a2;
	unsigned int *q = &(rf_qfor[28 - coeff][0]);
	unsigned r = rf_rn[coeff + 1];

#define	EXTRACT(a,i)	((a >> (5L*i)) & 0x1f)
#define	INSERT(a,i)	(a << (5L*i))

	length /= 8;
	/* 13 5 bit quants in a 64 bit word. */
	while (length) {
		a = *buf++;
		d = *dest;
		a1 = EXTRACT(a, 0) ^ r;
		a2 = EXTRACT(a, 1) ^ r;
		new = INSERT(a2, 1) | a1;
		a1 = EXTRACT(a, 2) ^ r;
		a2 = EXTRACT(a, 3) ^ r;
		a1 = q[a1];
		a2 = q[a2];
		new = new | INSERT(a1, 2) | INSERT(a2, 3);
		a1 = EXTRACT(a, 4) ^ r;
		a2 = EXTRACT(a, 5) ^ r;
		a1 = q[a1];
		a2 = q[a2];
		new = new | INSERT(a1, 4) | INSERT(a2, 5);
		a1 = EXTRACT(a, 5) ^ r;
		a2 = EXTRACT(a, 6) ^ r;
		a1 = q[a1];
		a2 = q[a2];
		new = new | INSERT(a1, 5) | INSERT(a2, 6);
#if	RF_LONGSHIFT > 2
		a1 = EXTRACT(a, 7) ^ r;
		a2 = EXTRACT(a, 8) ^ r;
		a1 = q[a1];
		a2 = q[a2];
		new = new | INSERT(a1, 7) | INSERT(a2, 8);
		a1 = EXTRACT(a, 9) ^ r;
		a2 = EXTRACT(a, 10) ^ r;
		a1 = q[a1];
		a2 = q[a2];
		new = new | INSERT(a1, 9) | INSERT(a2, 10);
		a1 = EXTRACT(a, 11) ^ r;
		a2 = EXTRACT(a, 12) ^ r;
		a1 = q[a1];
		a2 = q[a2];
		new = new | INSERT(a1, 11) | INSERT(a2, 12);
#endif	/* RF_LONGSHIFT > 2 */
		d ^= new;
		*dest++ = d;
		length--;
	}
}


/*
 * Compute.
 *
 * dest ^= rf_qfor[28-coeff][rf_rn[coeff+1] (old^new)]
 *
 * On a five bit basis.
 * Optimization: compute old ^ new on 64 bit basis.
 *
 * Length in bytes.
 */

void
rf_QDelta(char *dest, char *obuf, char *nbuf, unsigned length,
    unsigned char coeff)
{
	unsigned long a, d, new;
	unsigned long a1, a2;
	unsigned int *q = &(rf_qfor[28 - coeff][0]);
	unsigned int r = rf_rn[coeff + 1];

	r = a1 = a2 = new = d = a = 0; /* XXX For now... */
	q = NULL; /* XXX For now */

#ifdef	_KERNEL
	/*
	 * PQ in kernel currently not supported because the encoding/decoding
	 * table is not present.
	 */
	bzero(dest, length);
#else	/* _KERNEL */
	/* This code probably doesn't work and should be rewritten. -wvcii */
	/* 13 5 bit quants in a 64 bit word. */
	length /= 8;
	while (length) {
		a = *obuf++;	/*
				 * XXX Need to reorg to avoid cache conflicts.
				 */
		a ^= *nbuf++;
		d = *dest;
		a1 = EXTRACT(a, 0) ^ r;
		a2 = EXTRACT(a, 1) ^ r;
		a1 = q[a1];
		a2 = q[a2];
		new = INSERT(a2, 1) | a1;
		a1 = EXTRACT(a, 2) ^ r;
		a2 = EXTRACT(a, 3) ^ r;
		a1 = q[a1];
		a2 = q[a2];
		new = new | INSERT(a1, 2) | INSERT(a2, 3);
		a1 = EXTRACT(a, 4) ^ r;
		a2 = EXTRACT(a, 5) ^ r;
		a1 = q[a1];
		a2 = q[a2];
		new = new | INSERT(a1, 4) | INSERT(a2, 5);
		a1 = EXTRACT(a, 5) ^ r;
		a2 = EXTRACT(a, 6) ^ r;
		a1 = q[a1];
		a2 = q[a2];
		new = new | INSERT(a1, 5) | INSERT(a2, 6);
#if	RF_LONGSHIFT > 2
		a1 = EXTRACT(a, 7) ^ r;
		a2 = EXTRACT(a, 8) ^ r;
		a1 = q[a1];
		a2 = q[a2];
		new = new | INSERT(a1, 7) | INSERT(a2, 8);
		a1 = EXTRACT(a, 9) ^ r;
		a2 = EXTRACT(a, 10) ^ r;
		a1 = q[a1];
		a2 = q[a2];
		new = new | INSERT(a1, 9) | INSERT(a2, 10);
		a1 = EXTRACT(a, 11) ^ r;
		a2 = EXTRACT(a, 12) ^ r;
		a1 = q[a1];
		a2 = q[a2];
		new = new | INSERT(a1, 11) | INSERT(a2, 12);
#endif	/* RF_LONGSHIFT > 2 */
		d ^= new;
		*dest++ = d;
		length--;
	}
#endif	/* _KERNEL */
}


/*
 * Recover columns a and b from the given p and q into
 * bufs abuf and bbuf. All bufs are word aligned.
 * Length is in bytes.
 */

/*
 * XXX
 *
 * Everything about this seems wrong.
 */

void
rf_PQ_recover(unsigned long *pbuf, unsigned long *qbuf, unsigned long *abuf,
    unsigned long *bbuf, unsigned length, unsigned coeff_a, unsigned coeff_b)
{
	unsigned long p, q, a, a0, a1;
	int col = (29 * coeff_a) + coeff_b;
	unsigned char *q0 = &(rf_qinv[col][0]);

	length /= 8;
	while (length) {
		p = *pbuf++;
		q = *qbuf++;
		a0 = EXTRACT(p, 0);
		a1 = EXTRACT(q, 0);
		a = q0[a0 << 5 | a1];

#define	MF(i)								\
do {									\
	a0 = EXTRACT(p, i);						\
	a1 = EXTRACT(q, i);						\
	a  = a | INSERT(q0[a0<<5 | a1], i);				\
} while (0)

		MF(1);
		MF(2);
		MF(3);
		MF(4);
		MF(5);
		MF(6);
#if 0
		MF(7);
		MF(8);
		MF(9);
		MF(10);
		MF(11);
		MF(12);
#endif	/* 0 */
		*abuf++ = a;
		*bbuf++ = a ^ p;
		length--;
	}
}


/*
 * Lost parity and a data column. Recover that data column.
 * Assume col coeff is lost. Let q the contents of Q after
 * all surviving data columns have been q-xored out of it.
 * Then we have the equation
 *
 *   q[28-coeff][a_i ^ r_i+1] = q
 *
 * but q is cyclic with period 31.
 * So q[3+coeff][q[28-coeff][a_i ^ r_{i+1}]] =
 *    q[31][a_i ^ r_{i+1}] = a_i ^ r_{i+1} .
 *
 * so a_i = r_{coeff+1} ^ q[3+coeff][q]
 *
 * The routine is passed q buffer and the buffer
 * the data is to be recoverd into. They can be the same.
 */

void
rf_InvertQ(unsigned long *qbuf, unsigned long *abuf, unsigned length,
    unsigned coeff)
{
	unsigned long a, new;
	unsigned long a1, a2;
	unsigned int *q = &(rf_qfor[3 + coeff][0]);
	unsigned r = rf_rn[coeff + 1];

	/* 13 5 bit quants in a 64 bit word. */
	length /= 8;
	while (length) {
		a = *qbuf++;
		a1 = EXTRACT(a, 0);
		a2 = EXTRACT(a, 1);
		a1 = r ^ q[a1];
		a2 = r ^ q[a2];
		new = INSERT(a2, 1) | a1;

#define	M(i,j)								\
do {									\
	a1 = EXTRACT(a, i);						\
	a2 = EXTRACT(a, j);						\
	a1 = r ^ q[a1];							\
	a2 = r ^ q[a2];							\
	new = new | INSERT(a1, i) | INSERT(a2, j);			\
} while (0)

		M(2, 3);
		M(4, 5);
		M(5, 6);
#if	RF_LONGSHIFT > 2
		M(7, 8);
		M(9, 10);
		M(11, 12);
#endif	/* RF_LONGSHIFT > 2 */
		*abuf++ = new;
		length--;
	}
}
#endif	/* (RF_INCLUDE_DECL_PQ > 0) || (RF_INCLUDE_RAID6 > 0) */