Annotation of sys/dev/raidframe/rf_reconstruct.c, Revision 1.1
1.1 ! nbrk 1: /* $OpenBSD: rf_reconstruct.c,v 1.16 2007/06/05 00:38:22 deraadt Exp $ */
! 2: /* $NetBSD: rf_reconstruct.c,v 1.26 2000/06/04 02:05:13 oster Exp $ */
! 3:
! 4: /*
! 5: * Copyright (c) 1995 Carnegie-Mellon University.
! 6: * All rights reserved.
! 7: *
! 8: * Author: Mark Holland
! 9: *
! 10: * Permission to use, copy, modify and distribute this software and
! 11: * its documentation is hereby granted, provided that both the copyright
! 12: * notice and this permission notice appear in all copies of the
! 13: * software, derivative works or modified versions, and any portions
! 14: * thereof, and that both notices appear in supporting documentation.
! 15: *
! 16: * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
! 17: * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
! 18: * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
! 19: *
! 20: * Carnegie Mellon requests users of this software to return to
! 21: *
! 22: * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
! 23: * School of Computer Science
! 24: * Carnegie Mellon University
! 25: * Pittsburgh PA 15213-3890
! 26: *
! 27: * any improvements or extensions that they make and grant Carnegie the
! 28: * rights to redistribute these changes.
! 29: */
! 30:
! 31: /**************************************************************
! 32: *
! 33: * rf_reconstruct.c -- Code to perform on-line reconstruction.
! 34: *
! 35: **************************************************************/
! 36:
! 37: #include "rf_types.h"
! 38: #include <sys/time.h>
! 39: #include <sys/buf.h>
! 40: #include <sys/errno.h>
! 41:
! 42: #include <sys/types.h>
! 43: #include <sys/param.h>
! 44: #include <sys/systm.h>
! 45: #include <sys/proc.h>
! 46: #include <sys/ioctl.h>
! 47: #include <sys/fcntl.h>
! 48: #if __NETBSD__
! 49: #include <sys/vnode.h>
! 50: #endif
! 51:
! 52: #include "rf_raid.h"
! 53: #include "rf_reconutil.h"
! 54: #include "rf_revent.h"
! 55: #include "rf_reconbuffer.h"
! 56: #include "rf_acctrace.h"
! 57: #include "rf_etimer.h"
! 58: #include "rf_dag.h"
! 59: #include "rf_desc.h"
! 60: #include "rf_general.h"
! 61: #include "rf_freelist.h"
! 62: #include "rf_debugprint.h"
! 63: #include "rf_driver.h"
! 64: #include "rf_utils.h"
! 65: #include "rf_shutdown.h"
! 66:
! 67: #include "rf_kintf.h"
! 68:
! 69: /*
! 70: * Setting these to -1 causes them to be set to their default values if not set
! 71: * by debug options.
! 72: */
! 73:
/*
 * Debug tracing helpers for the reconstruction code.
 *
 * Each DprintfN(s, ...) forwards the format string `s' and N arguments to
 * rf_debug_printf() when rf_reconDebug is non-zero.  rf_debug_printf() is
 * always called with eight argument slots; unused slots are passed as NULL.
 * DDprintf1/DDprintf2 expand exactly like Dprintf1/Dprintf2.
 *
 * RF_RECON_DARG() converts one argument to the `void *' form expected by
 * rf_debug_printf().  The argument is fully parenthesised before the cast
 * so that an expression argument (e.g. `x < 0' or `a + b') is evaluated as
 * a whole rather than having the cast bind to its first operand only.
 */
#define	RF_RECON_DARG(x)	((void *)((unsigned long)(x)))

#define	Dprintf(s)							\
do {									\
	if (rf_reconDebug)						\
		rf_debug_printf(s,					\
		    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL);	\
} while (0)
#define	Dprintf1(s,a)							\
do {									\
	if (rf_reconDebug)						\
		rf_debug_printf(s,					\
		    RF_RECON_DARG(a),					\
		    NULL, NULL, NULL, NULL, NULL, NULL, NULL);		\
} while (0)
#define	Dprintf2(s,a,b)							\
do {									\
	if (rf_reconDebug)						\
		rf_debug_printf(s,					\
		    RF_RECON_DARG(a),					\
		    RF_RECON_DARG(b),					\
		    NULL, NULL, NULL, NULL, NULL, NULL);		\
} while (0)
#define	Dprintf3(s,a,b,c)						\
do {									\
	if (rf_reconDebug)						\
		rf_debug_printf(s,					\
		    RF_RECON_DARG(a),					\
		    RF_RECON_DARG(b),					\
		    RF_RECON_DARG(c),					\
		    NULL, NULL, NULL, NULL, NULL);			\
} while (0)
#define	Dprintf4(s,a,b,c,d)						\
do {									\
	if (rf_reconDebug)						\
		rf_debug_printf(s,					\
		    RF_RECON_DARG(a),					\
		    RF_RECON_DARG(b),					\
		    RF_RECON_DARG(c),					\
		    RF_RECON_DARG(d),					\
		    NULL, NULL, NULL, NULL);				\
} while (0)
#define	Dprintf5(s,a,b,c,d,e)						\
do {									\
	if (rf_reconDebug)						\
		rf_debug_printf(s,					\
		    RF_RECON_DARG(a),					\
		    RF_RECON_DARG(b),					\
		    RF_RECON_DARG(c),					\
		    RF_RECON_DARG(d),					\
		    RF_RECON_DARG(e),					\
		    NULL, NULL, NULL);					\
} while (0)
#define	Dprintf6(s,a,b,c,d,e,f)						\
do {									\
	if (rf_reconDebug)						\
		rf_debug_printf(s,					\
		    RF_RECON_DARG(a),					\
		    RF_RECON_DARG(b),					\
		    RF_RECON_DARG(c),					\
		    RF_RECON_DARG(d),					\
		    RF_RECON_DARG(e),					\
		    RF_RECON_DARG(f),					\
		    NULL, NULL);					\
} while (0)
#define	Dprintf7(s,a,b,c,d,e,f,g)					\
do {									\
	if (rf_reconDebug)						\
		rf_debug_printf(s,					\
		    RF_RECON_DARG(a),					\
		    RF_RECON_DARG(b),					\
		    RF_RECON_DARG(c),					\
		    RF_RECON_DARG(d),					\
		    RF_RECON_DARG(e),					\
		    RF_RECON_DARG(f),					\
		    RF_RECON_DARG(g),					\
		    NULL);						\
} while (0)

#define	DDprintf1(s,a)							\
do {									\
	if (rf_reconDebug)						\
		rf_debug_printf(s,					\
		    RF_RECON_DARG(a),					\
		    NULL, NULL, NULL, NULL, NULL, NULL, NULL);		\
} while (0)
#define	DDprintf2(s,a,b)						\
do {									\
	if (rf_reconDebug)						\
		rf_debug_printf(s,					\
		    RF_RECON_DARG(a),					\
		    RF_RECON_DARG(b),					\
		    NULL, NULL, NULL, NULL, NULL, NULL);		\
} while (0)
! 166:
! 167: static RF_FreeList_t *rf_recond_freelist;
! 168: #define RF_MAX_FREE_RECOND 4
! 169: #define RF_RECOND_INC 1
! 170:
! 171: RF_RaidReconDesc_t *rf_AllocRaidReconDesc(RF_Raid_t *,
! 172: RF_RowCol_t, RF_RowCol_t, RF_RaidDisk_t *, int,
! 173: RF_RowCol_t, RF_RowCol_t);
! 174: int rf_ProcessReconEvent(RF_Raid_t *, RF_RowCol_t, RF_ReconEvent_t *);
! 175: int rf_IssueNextReadRequest(RF_Raid_t *, RF_RowCol_t, RF_RowCol_t);
! 176: int rf_TryToRead(RF_Raid_t *, RF_RowCol_t, RF_RowCol_t);
! 177: int rf_ComputePSDiskOffsets(RF_Raid_t *, RF_StripeNum_t,
! 178: RF_RowCol_t, RF_RowCol_t, RF_SectorNum_t *, RF_SectorNum_t *,
! 179: RF_RowCol_t *, RF_RowCol_t *, RF_SectorNum_t *);
! 180: int rf_ReconReadDoneProc(void *, int);
! 181: int rf_ReconWriteDoneProc(void *, int);
! 182: void rf_CheckForNewMinHeadSep(RF_Raid_t *, RF_RowCol_t, RF_HeadSepLimit_t);
! 183: int rf_CheckHeadSeparation(RF_Raid_t *, RF_PerDiskReconCtrl_t *,
! 184: RF_RowCol_t, RF_RowCol_t, RF_HeadSepLimit_t, RF_ReconUnitNum_t);
! 185: void rf_ForceReconReadDoneProc(void *, int);
! 186: void rf_ShutdownReconstruction(void *);
! 187:
/*
 * These functions are inlined on gcc. If they are used more than
 * once, it is strongly advised to un-inline them.
 */
! 192: void rf_FreeReconDesc(RF_RaidReconDesc_t *);
! 193: int rf_IssueNextWriteRequest(RF_Raid_t *, RF_RowCol_t);
! 194: int rf_CheckForcedOrBlockedReconstruction(RF_Raid_t *,
! 195: RF_ReconParityStripeStatus_t *, RF_PerDiskReconCtrl_t *,
! 196: RF_RowCol_t, RF_RowCol_t, RF_StripeNum_t, RF_ReconUnitNum_t);
! 197: void rf_SignalReconDone(RF_Raid_t *);
! 198:
! 199: struct RF_ReconDoneProc_s {
! 200: void (*proc) (RF_Raid_t *, void *);
! 201: void *arg;
! 202: RF_ReconDoneProc_t *next;
! 203: };
! 204:
! 205: static RF_FreeList_t *rf_rdp_freelist;
! 206: #define RF_MAX_FREE_RDP 4
! 207: #define RF_RDP_INC 1
! 208:
! 209: void
! 210: rf_SignalReconDone(RF_Raid_t *raidPtr)
! 211: {
! 212: RF_ReconDoneProc_t *p;
! 213:
! 214: RF_LOCK_MUTEX(raidPtr->recon_done_proc_mutex);
! 215: for (p = raidPtr->recon_done_procs; p; p = p->next) {
! 216: p->proc(raidPtr, p->arg);
! 217: }
! 218: RF_UNLOCK_MUTEX(raidPtr->recon_done_proc_mutex);
! 219: }
! 220:
! 221: int
! 222: rf_RegisterReconDoneProc(RF_Raid_t *raidPtr, void (*proc) (RF_Raid_t *, void *),
! 223: void *arg, RF_ReconDoneProc_t **handlep)
! 224: {
! 225: RF_ReconDoneProc_t *p;
! 226:
! 227: RF_FREELIST_GET(rf_rdp_freelist, p, next, (RF_ReconDoneProc_t *));
! 228: if (p == NULL)
! 229: return (ENOMEM);
! 230: p->proc = proc;
! 231: p->arg = arg;
! 232: RF_LOCK_MUTEX(raidPtr->recon_done_proc_mutex);
! 233: p->next = raidPtr->recon_done_procs;
! 234: raidPtr->recon_done_procs = p;
! 235: RF_UNLOCK_MUTEX(raidPtr->recon_done_proc_mutex);
! 236: if (handlep)
! 237: *handlep = p;
! 238: return (0);
! 239: }
! 240:
/*****************************************************************************
 *
 * Configuration and shutdown of the reconstruction module.
 *
 * Configuration sets up the parameters that will be used by the
 * reconstruction process. Currently there are none, except for those that
 * the layout-specific configuration routine (e.g. rf_ConfigureDeclustered)
 * sets up.
 *
 * In the kernel, we fire off the recon thread.
 *
 *****************************************************************************/
! 250: void
! 251: rf_ShutdownReconstruction(void *ignored)
! 252: {
! 253: RF_FREELIST_DESTROY(rf_recond_freelist, next, (RF_RaidReconDesc_t *));
! 254: RF_FREELIST_DESTROY(rf_rdp_freelist, next, (RF_ReconDoneProc_t *));
! 255: }
! 256:
! 257: int
! 258: rf_ConfigureReconstruction(RF_ShutdownList_t **listp)
! 259: {
! 260: int rc;
! 261:
! 262: RF_FREELIST_CREATE(rf_recond_freelist, RF_MAX_FREE_RECOND,
! 263: RF_RECOND_INC, sizeof(RF_RaidReconDesc_t));
! 264: if (rf_recond_freelist == NULL)
! 265: return (ENOMEM);
! 266: RF_FREELIST_CREATE(rf_rdp_freelist, RF_MAX_FREE_RDP,
! 267: RF_RDP_INC, sizeof(RF_ReconDoneProc_t));
! 268: if (rf_rdp_freelist == NULL) {
! 269: RF_FREELIST_DESTROY(rf_recond_freelist, next,
! 270: (RF_RaidReconDesc_t *));
! 271: return (ENOMEM);
! 272: }
! 273: rc = rf_ShutdownCreate(listp, rf_ShutdownReconstruction, NULL);
! 274: if (rc) {
! 275: RF_ERRORMSG3("Unable to add to shutdown list file %s line %d"
! 276: " rc=%d.\n", __FILE__, __LINE__, rc);
! 277: rf_ShutdownReconstruction(NULL);
! 278: return (rc);
! 279: }
! 280: return (0);
! 281: }
! 282:
! 283: RF_RaidReconDesc_t *
! 284: rf_AllocRaidReconDesc(RF_Raid_t *raidPtr, RF_RowCol_t row, RF_RowCol_t col,
! 285: RF_RaidDisk_t *spareDiskPtr, int numDisksDone, RF_RowCol_t srow,
! 286: RF_RowCol_t scol)
! 287: {
! 288:
! 289: RF_RaidReconDesc_t *reconDesc;
! 290:
! 291: RF_FREELIST_GET(rf_recond_freelist, reconDesc, next,
! 292: (RF_RaidReconDesc_t *));
! 293:
! 294: reconDesc->raidPtr = raidPtr;
! 295: reconDesc->row = row;
! 296: reconDesc->col = col;
! 297: reconDesc->spareDiskPtr = spareDiskPtr;
! 298: reconDesc->numDisksDone = numDisksDone;
! 299: reconDesc->srow = srow;
! 300: reconDesc->scol = scol;
! 301: reconDesc->state = 0;
! 302: reconDesc->next = NULL;
! 303:
! 304: return (reconDesc);
! 305: }
! 306:
! 307: void
! 308: rf_FreeReconDesc(RF_RaidReconDesc_t *reconDesc)
! 309: {
! 310: #if RF_RECON_STATS > 0
! 311: printf("RAIDframe: %qu recon event waits, %qu recon delays.\n",
! 312: reconDesc->numReconEventWaits, reconDesc->numReconExecDelays);
! 313: #endif /* RF_RECON_STATS > 0 */
! 314:
! 315: printf("RAIDframe: %qu max exec ticks.\n",
! 316: reconDesc->maxReconExecTicks);
! 317:
! 318: #if (RF_RECON_STATS > 0) || defined(_KERNEL)
! 319: printf("\n");
! 320: #endif /* (RF_RECON_STATS > 0) || _KERNEL */
! 321: RF_FREELIST_FREE(rf_recond_freelist, reconDesc, next);
! 322: }
! 323:
! 324:
! 325: /*****************************************************************************
! 326: *
! 327: * Primary routine to reconstruct a failed disk. This should be called from
! 328: * within its own thread. It won't return until reconstruction completes,
! 329: * fails, or is aborted.
! 330: *
! 331: *****************************************************************************/
! 332: int
! 333: rf_ReconstructFailedDisk(RF_Raid_t *raidPtr, RF_RowCol_t row, RF_RowCol_t col)
! 334: {
! 335: RF_LayoutSW_t *lp;
! 336: int rc;
! 337:
! 338: lp = raidPtr->Layout.map;
! 339: if (lp->SubmitReconBuffer) {
! 340: /*
! 341: * The current infrastructure only supports reconstructing one
! 342: * disk at a time for each array.
! 343: */
! 344: RF_LOCK_MUTEX(raidPtr->mutex);
! 345: while (raidPtr->reconInProgress) {
! 346: RF_WAIT_COND(raidPtr->waitForReconCond, raidPtr->mutex);
! 347: }
! 348: raidPtr->reconInProgress++;
! 349: RF_UNLOCK_MUTEX(raidPtr->mutex);
! 350: rc = rf_ReconstructFailedDiskBasic(raidPtr, row, col);
! 351: RF_LOCK_MUTEX(raidPtr->mutex);
! 352: raidPtr->reconInProgress--;
! 353: RF_UNLOCK_MUTEX(raidPtr->mutex);
! 354: } else {
! 355: RF_ERRORMSG1("RECON: no way to reconstruct failed disk for"
! 356: " arch %c.\n", lp->parityConfig);
! 357: rc = EIO;
! 358: }
! 359: RF_SIGNAL_COND(raidPtr->waitForReconCond);
! 360: wakeup(&raidPtr->waitForReconCond); /*
! 361: * XXX Methinks this will be
! 362: * needed at some point... GO
! 363: */
! 364: return (rc);
! 365: }
! 366:
! 367: int
! 368: rf_ReconstructFailedDiskBasic(RF_Raid_t *raidPtr, RF_RowCol_t row,
! 369: RF_RowCol_t col)
! 370: {
! 371: RF_ComponentLabel_t c_label;
! 372: RF_RaidDisk_t *spareDiskPtr = NULL;
! 373: RF_RaidReconDesc_t *reconDesc;
! 374: RF_RowCol_t srow, scol;
! 375: int numDisksDone = 0, rc;
! 376:
! 377: /* First look for a spare drive onto which to reconstruct the data. */
! 378: /*
! 379: * Spare disk descriptors are stored in row 0. This may have to
! 380: * change eventually.
! 381: */
! 382:
! 383: RF_LOCK_MUTEX(raidPtr->mutex);
! 384: RF_ASSERT(raidPtr->Disks[row][col].status == rf_ds_failed);
! 385:
! 386: if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
! 387: if (raidPtr->status[row] != rf_rs_degraded) {
! 388: RF_ERRORMSG2("Unable to reconstruct disk at row %d"
! 389: " col %d because status not degraded.\n", row, col);
! 390: RF_UNLOCK_MUTEX(raidPtr->mutex);
! 391: return (EINVAL);
! 392: }
! 393: srow = row;
! 394: scol = (-1);
! 395: } else {
! 396: srow = 0;
! 397: for (scol = raidPtr->numCol;
! 398: scol < raidPtr->numCol + raidPtr->numSpare; scol++) {
! 399: if (raidPtr->Disks[srow][scol].status == rf_ds_spare) {
! 400: spareDiskPtr = &raidPtr->Disks[srow][scol];
! 401: spareDiskPtr->status = rf_ds_used_spare;
! 402: break;
! 403: }
! 404: }
! 405: if (!spareDiskPtr) {
! 406: RF_ERRORMSG2("Unable to reconstruct disk at row %d"
! 407: " col %d because no spares are available.\n",
! 408: row, col);
! 409: RF_UNLOCK_MUTEX(raidPtr->mutex);
! 410: return (ENOSPC);
! 411: }
! 412: printf("RECON: initiating reconstruction on row %d col %d"
! 413: " -> spare at row %d col %d.\n", row, col, srow, scol);
! 414: }
! 415: RF_UNLOCK_MUTEX(raidPtr->mutex);
! 416:
! 417: reconDesc = rf_AllocRaidReconDesc((void *) raidPtr, row, col,
! 418: spareDiskPtr, numDisksDone, srow, scol);
! 419: raidPtr->reconDesc = (void *) reconDesc;
! 420: #if RF_RECON_STATS > 0
! 421: reconDesc->hsStallCount = 0;
! 422: reconDesc->numReconExecDelays = 0;
! 423: reconDesc->numReconEventWaits = 0;
! 424: #endif /* RF_RECON_STATS > 0 */
! 425: reconDesc->reconExecTimerRunning = 0;
! 426: reconDesc->reconExecTicks = 0;
! 427: reconDesc->maxReconExecTicks = 0;
! 428: rc = rf_ContinueReconstructFailedDisk(reconDesc);
! 429:
! 430: if (!rc) {
! 431: /* Fix up the component label. */
! 432: /* Don't actually need the read here... */
! 433: raidread_component_label(
! 434: raidPtr->raid_cinfo[srow][scol].ci_dev,
! 435: raidPtr->raid_cinfo[srow][scol].ci_vp,
! 436: &c_label);
! 437:
! 438: raid_init_component_label(raidPtr, &c_label);
! 439: c_label.row = row;
! 440: c_label.column = col;
! 441: c_label.clean = RF_RAID_DIRTY;
! 442: c_label.status = rf_ds_optimal;
! 443:
! 444: /* XXXX MORE NEEDED HERE. */
! 445:
! 446: raidwrite_component_label(
! 447: raidPtr->raid_cinfo[srow][scol].ci_dev,
! 448: raidPtr->raid_cinfo[srow][scol].ci_vp,
! 449: &c_label);
! 450:
! 451: }
! 452: return (rc);
! 453: }
! 454:
! 455: /*
! 456: *
! 457: * Allow reconstructing a disk in-place -- i.e. component /dev/sd2e goes AWOL,
! 458: * and you don't get a spare until the next Monday. With this function
! 459: * (and hot-swappable drives) you can now put your new disk containing
! 460: * /dev/sd2e on the bus, scsictl it alive, and then use raidctl(8) to
! 461: * rebuild the data "on the spot".
! 462: *
! 463: */
! 464:
! 465: int
! 466: rf_ReconstructInPlace(RF_Raid_t *raidPtr, RF_RowCol_t row, RF_RowCol_t col)
! 467: {
! 468: RF_RaidDisk_t *spareDiskPtr = NULL;
! 469: RF_RaidReconDesc_t *reconDesc;
! 470: RF_LayoutSW_t *lp;
! 471: RF_RaidDisk_t *badDisk;
! 472: RF_ComponentLabel_t c_label;
! 473: int numDisksDone = 0, rc;
! 474: struct partinfo dpart;
! 475: struct vnode *vp;
! 476: struct vattr va;
! 477: struct proc *proc;
! 478: int retcode;
! 479: int ac;
! 480:
! 481: lp = raidPtr->Layout.map;
! 482: if (lp->SubmitReconBuffer) {
! 483: /*
! 484: * The current infrastructure only supports reconstructing one
! 485: * disk at a time for each array.
! 486: */
! 487: RF_LOCK_MUTEX(raidPtr->mutex);
! 488: if ((raidPtr->Disks[row][col].status == rf_ds_optimal) &&
! 489: (raidPtr->numFailures > 0)) {
! 490: /* XXX 0 above shouldn't be constant !!! */
! 491: /*
! 492: * Some component other than this has failed.
! 493: * Let's not make things worse than they already
! 494: * are...
! 495: */
! 496: #ifdef RAIDDEBUG
! 497: printf("RAIDFRAME: Unable to reconstruct to disk at:\n"
! 498: " Row: %d Col: %d Too many failures.\n",
! 499: row, col);
! 500: #endif /* RAIDDEBUG */
! 501: RF_UNLOCK_MUTEX(raidPtr->mutex);
! 502: return (EINVAL);
! 503: }
! 504: if (raidPtr->Disks[row][col].status == rf_ds_reconstructing) {
! 505: #ifdef RAIDDEBUG
! 506: printf("RAIDFRAME: Unable to reconstruct to disk at:\n"
! 507: " Row: %d Col: %d Reconstruction already"
! 508: " occurring !\n", row, col);
! 509: #endif /* RAIDDEBUG */
! 510:
! 511: RF_UNLOCK_MUTEX(raidPtr->mutex);
! 512: return (EINVAL);
! 513: }
! 514:
! 515:
! 516: if (raidPtr->Disks[row][col].status != rf_ds_failed) {
! 517: /* "It's gone..." */
! 518: raidPtr->numFailures++;
! 519: raidPtr->Disks[row][col].status = rf_ds_failed;
! 520: raidPtr->status[row] = rf_rs_degraded;
! 521: rf_update_component_labels(raidPtr,
! 522: RF_NORMAL_COMPONENT_UPDATE);
! 523: }
! 524:
! 525: while (raidPtr->reconInProgress) {
! 526: RF_WAIT_COND(raidPtr->waitForReconCond, raidPtr->mutex);
! 527: }
! 528:
! 529: raidPtr->reconInProgress++;
! 530:
! 531: /*
! 532: * First look for a spare drive onto which to reconstruct
! 533: * the data. Spare disk descriptors are stored in row 0.
! 534: * This may have to change eventually.
! 535: */
! 536:
! 537: /*
! 538: * Actually, we don't care if it's failed or not...
! 539: * On a RAID set with correct parity, this function
! 540: * should be callable on any component without ill effects.
! 541: */
! 542: /*
! 543: * RF_ASSERT(raidPtr->Disks[row][col].status == rf_ds_failed);
! 544: */
! 545:
! 546: if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
! 547: RF_ERRORMSG2("Unable to reconstruct to disk at row %d"
! 548: " col %d: operation not supported for"
! 549: " RF_DISTRIBUTE_SPARE.\n", row, col);
! 550:
! 551: raidPtr->reconInProgress--;
! 552: RF_UNLOCK_MUTEX(raidPtr->mutex);
! 553: return (EINVAL);
! 554: }
! 555:
! 556: /*
! 557: * XXX Need goop here to see if the disk is alive,
! 558: * and, if not, make it so...
! 559: */
! 560:
! 561: badDisk = &raidPtr->Disks[row][col];
! 562:
! 563: proc = raidPtr->recon_thread;
! 564:
! 565: /*
! 566: * This device may have been opened successfully the
! 567: * first time. Close it before trying to open it again...
! 568: */
! 569:
! 570: if (raidPtr->raid_cinfo[row][col].ci_vp != NULL) {
! 571: printf("Closing the opened device: %s\n",
! 572: raidPtr->Disks[row][col].devname);
! 573: vp = raidPtr->raid_cinfo[row][col].ci_vp;
! 574: ac = raidPtr->Disks[row][col].auto_configured;
! 575: rf_close_component(raidPtr, vp, ac);
! 576: raidPtr->raid_cinfo[row][col].ci_vp = NULL;
! 577: }
! 578: /*
! 579: * Note that this disk was *not* auto_configured (any longer).
! 580: */
! 581: raidPtr->Disks[row][col].auto_configured = 0;
! 582:
! 583: printf("About to (re-)open the device for rebuilding: %s\n",
! 584: raidPtr->Disks[row][col].devname);
! 585:
! 586: retcode = raidlookup(raidPtr->Disks[row][col].devname,
! 587: proc, &vp);
! 588:
! 589: if (retcode) {
! 590: printf("raid%d: rebuilding: raidlookup on device: %s"
! 591: " failed: %d !\n", raidPtr->raidid,
! 592: raidPtr->Disks[row][col].devname, retcode);
! 593:
! 594: /*
! 595: * XXX the component isn't responding properly...
! 596: * Must still be dead :-(
! 597: */
! 598: raidPtr->reconInProgress--;
! 599: RF_UNLOCK_MUTEX(raidPtr->mutex);
! 600: return(retcode);
! 601:
! 602: } else {
! 603:
! 604: /*
! 605: * Ok, so we can at least do a lookup...
! 606: * How about actually getting a vp for it ?
! 607: */
! 608:
! 609: if ((retcode =
! 610: VOP_GETATTR(vp, &va, proc->p_ucred, proc)) != 0) {
! 611: raidPtr->reconInProgress--;
! 612: RF_UNLOCK_MUTEX(raidPtr->mutex);
! 613: return(retcode);
! 614: }
! 615: retcode = VOP_IOCTL(vp, DIOCGPART, (caddr_t) & dpart,
! 616: FREAD, proc->p_ucred, proc);
! 617: if (retcode) {
! 618: raidPtr->reconInProgress--;
! 619: RF_UNLOCK_MUTEX(raidPtr->mutex);
! 620: return(retcode);
! 621: }
! 622: raidPtr->Disks[row][col].blockSize =
! 623: dpart.disklab->d_secsize;
! 624:
! 625: raidPtr->Disks[row][col].numBlocks =
! 626: DL_GETPSIZE(dpart.part) - rf_protectedSectors;
! 627:
! 628: raidPtr->raid_cinfo[row][col].ci_vp = vp;
! 629: raidPtr->raid_cinfo[row][col].ci_dev = va.va_rdev;
! 630:
! 631: raidPtr->Disks[row][col].dev = va.va_rdev;
! 632:
! 633: /*
! 634: * We allow the user to specify that only a
! 635: * fraction of the disks should be used this is
! 636: * just for debug: it speeds up the parity scan.
! 637: */
! 638: raidPtr->Disks[row][col].numBlocks =
! 639: raidPtr->Disks[row][col].numBlocks *
! 640: rf_sizePercentage / 100;
! 641: }
! 642:
! 643: spareDiskPtr = &raidPtr->Disks[row][col];
! 644: spareDiskPtr->status = rf_ds_used_spare;
! 645:
! 646: printf("RECON: Initiating in-place reconstruction on\n");
! 647: printf(" row %d col %d -> spare at row %d col %d.\n",
! 648: row, col, row, col);
! 649:
! 650: RF_UNLOCK_MUTEX(raidPtr->mutex);
! 651:
! 652: reconDesc = rf_AllocRaidReconDesc((void *) raidPtr, row, col,
! 653: spareDiskPtr, numDisksDone, row, col);
! 654: raidPtr->reconDesc = (void *) reconDesc;
! 655: #if RF_RECON_STATS > 0
! 656: reconDesc->hsStallCount = 0;
! 657: reconDesc->numReconExecDelays = 0;
! 658: reconDesc->numReconEventWaits = 0;
! 659: #endif /* RF_RECON_STATS > 0 */
! 660: reconDesc->reconExecTimerRunning = 0;
! 661: reconDesc->reconExecTicks = 0;
! 662: reconDesc->maxReconExecTicks = 0;
! 663: rc = rf_ContinueReconstructFailedDisk(reconDesc);
! 664:
! 665: RF_LOCK_MUTEX(raidPtr->mutex);
! 666: raidPtr->reconInProgress--;
! 667: RF_UNLOCK_MUTEX(raidPtr->mutex);
! 668:
! 669: } else {
! 670: RF_ERRORMSG1("RECON: no way to reconstruct failed disk for"
! 671: " arch %c.\n", lp->parityConfig);
! 672: rc = EIO;
! 673: }
! 674: RF_LOCK_MUTEX(raidPtr->mutex);
! 675:
! 676: if (!rc) {
! 677: /*
! 678: * Need to set these here, as at this point it'll be claiming
! 679: * that the disk is in rf_ds_spared ! But we know better :-)
! 680: */
! 681:
! 682: raidPtr->Disks[row][col].status = rf_ds_optimal;
! 683: raidPtr->status[row] = rf_rs_optimal;
! 684:
! 685: /* Fix up the component label. */
! 686: /* Don't actually need the read here... */
! 687: raidread_component_label(
! 688: raidPtr->raid_cinfo[row][col].ci_dev,
! 689: raidPtr->raid_cinfo[row][col].ci_vp,
! 690: &c_label);
! 691:
! 692: raid_init_component_label(raidPtr, &c_label);
! 693:
! 694: c_label.row = row;
! 695: c_label.column = col;
! 696:
! 697: raidwrite_component_label(raidPtr->raid_cinfo[row][col].ci_dev,
! 698: raidPtr->raid_cinfo[row][col].ci_vp, &c_label);
! 699:
! 700: }
! 701: RF_UNLOCK_MUTEX(raidPtr->mutex);
! 702: RF_SIGNAL_COND(raidPtr->waitForReconCond);
! 703: wakeup(&raidPtr->waitForReconCond);
! 704: return (rc);
! 705: }
! 706:
! 707:
! 708: int
! 709: rf_ContinueReconstructFailedDisk(RF_RaidReconDesc_t *reconDesc)
! 710: {
! 711: RF_Raid_t *raidPtr = reconDesc->raidPtr;
! 712: RF_RowCol_t row = reconDesc->row;
! 713: RF_RowCol_t col = reconDesc->col;
! 714: RF_RowCol_t srow = reconDesc->srow;
! 715: RF_RowCol_t scol = reconDesc->scol;
! 716: RF_ReconMap_t *mapPtr;
! 717:
! 718: RF_ReconEvent_t *event;
! 719: struct timeval etime, elpsd;
! 720: unsigned long xor_s, xor_resid_us;
! 721: int retcode, i, ds;
! 722:
! 723: switch (reconDesc->state) {
! 724: case 0:
! 725: raidPtr->accumXorTimeUs = 0;
! 726:
! 727: /* Create one trace record per physical disk. */
! 728: RF_Malloc(raidPtr->recon_tracerecs, raidPtr->numCol *
! 729: sizeof(RF_AccTraceEntry_t), (RF_AccTraceEntry_t *));
! 730:
! 731: /*
! 732: * Quiesce the array prior to starting recon. This is needed
! 733: * to assure no nasty interactions with pending user writes.
! 734: * We need to do this before we change the disk or row status.
! 735: */
! 736: reconDesc->state = 1;
! 737:
! 738: Dprintf("RECON: begin request suspend.\n");
! 739: retcode = rf_SuspendNewRequestsAndWait(raidPtr);
! 740: Dprintf("RECON: end request suspend.\n");
! 741: rf_StartUserStats(raidPtr); /*
! 742: * Zero out the stats kept on
! 743: * user accs.
! 744: */
! 745: /* Fall through to state 1. */
! 746: case 1:
! 747: RF_LOCK_MUTEX(raidPtr->mutex);
! 748:
! 749: /*
! 750: * Create the reconstruction control pointer and install it in
! 751: * the right slot.
! 752: */
! 753: raidPtr->reconControl[row] =
! 754: rf_MakeReconControl(reconDesc, row, col, srow, scol);
! 755: mapPtr = raidPtr->reconControl[row]->reconMap;
! 756: raidPtr->status[row] = rf_rs_reconstructing;
! 757: raidPtr->Disks[row][col].status = rf_ds_reconstructing;
! 758: raidPtr->Disks[row][col].spareRow = srow;
! 759: raidPtr->Disks[row][col].spareCol = scol;
! 760:
! 761: RF_UNLOCK_MUTEX(raidPtr->mutex);
! 762:
! 763: RF_GETTIME(raidPtr->reconControl[row]->starttime);
! 764:
! 765: /*
! 766: * Now start up the actual reconstruction: issue a read for
! 767: * each surviving disk.
! 768: */
! 769:
! 770: reconDesc->numDisksDone = 0;
! 771: for (i = 0; i < raidPtr->numCol; i++) {
! 772: if (i != col) {
! 773: /*
! 774: * Find and issue the next I/O on the
! 775: * indicated disk.
! 776: */
! 777: if (rf_IssueNextReadRequest(raidPtr, row, i)) {
! 778: Dprintf2("RECON: done issuing for r%d"
! 779: " c%d.\n", row, i);
! 780: reconDesc->numDisksDone++;
! 781: }
! 782: }
! 783: }
! 784:
! 785: reconDesc->state = 2;
! 786:
! 787: case 2:
! 788: Dprintf("RECON: resume requests.\n");
! 789: rf_ResumeNewRequests(raidPtr);
! 790:
! 791: reconDesc->state = 3;
! 792:
! 793: case 3:
! 794:
! 795: /*
! 796: * Process reconstruction events until all disks report that
! 797: * they've completed all work.
! 798: */
! 799: mapPtr = raidPtr->reconControl[row]->reconMap;
! 800:
! 801: while (reconDesc->numDisksDone < raidPtr->numCol - 1) {
! 802:
! 803: event = rf_GetNextReconEvent(reconDesc, row,
! 804: (void (*) (void *)) rf_ContinueReconstructFailedDisk,
! 805: reconDesc);
! 806: RF_ASSERT(event);
! 807:
! 808: if (rf_ProcessReconEvent(raidPtr, row, event))
! 809: reconDesc->numDisksDone++;
! 810: raidPtr->reconControl[row]->numRUsTotal =
! 811: mapPtr->totalRUs;
! 812: raidPtr->reconControl[row]->numRUsComplete =
! 813: mapPtr->totalRUs -
! 814: rf_UnitsLeftToReconstruct(mapPtr);
! 815:
! 816: raidPtr->reconControl[row]->percentComplete =
! 817: (raidPtr->reconControl[row]->numRUsComplete * 100 /
! 818: raidPtr->reconControl[row]->numRUsTotal);
! 819: if (rf_prReconSched) {
! 820: rf_PrintReconSchedule(
! 821: raidPtr->reconControl[row]->reconMap,
! 822: &(raidPtr->reconControl[row]->starttime));
! 823: }
! 824: }
! 825:
! 826: reconDesc->state = 4;
! 827:
! 828: case 4:
! 829: mapPtr = raidPtr->reconControl[row]->reconMap;
! 830: if (rf_reconDebug) {
! 831: printf("RECON: all reads completed.\n");
! 832: }
! 833: /*
! 834: * At this point all the reads have completed. We now wait
! 835: * for any pending writes to complete, and then we're done.
! 836: */
! 837:
! 838: while (rf_UnitsLeftToReconstruct(
! 839: raidPtr->reconControl[row]->reconMap) > 0) {
! 840:
! 841: event = rf_GetNextReconEvent(reconDesc, row,
! 842: (void (*) (void *)) rf_ContinueReconstructFailedDisk,
! 843: reconDesc);
! 844: RF_ASSERT(event);
! 845:
! 846: /* Ignore return code. */
! 847: (void) rf_ProcessReconEvent(raidPtr, row, event);
! 848: raidPtr->reconControl[row]->percentComplete =
! 849: 100 - (rf_UnitsLeftToReconstruct(mapPtr) * 100 /
! 850: mapPtr->totalRUs);
! 851: if (rf_prReconSched) {
! 852: rf_PrintReconSchedule(
! 853: raidPtr->reconControl[row]->reconMap,
! 854: &(raidPtr->reconControl[row]->starttime));
! 855: }
! 856: }
! 857: reconDesc->state = 5;
! 858:
! 859: case 5:
! 860: /*
! 861: * Success: mark the dead disk as reconstructed. We quiesce
! 862: * the array here to assure no nasty interactions with pending
! 863: * user accesses, when we free up the psstatus structure as
! 864: * part of FreeReconControl().
! 865: */
! 866:
! 867: reconDesc->state = 6;
! 868:
! 869: retcode = rf_SuspendNewRequestsAndWait(raidPtr);
! 870: rf_StopUserStats(raidPtr);
! 871: rf_PrintUserStats(raidPtr); /*
! 872: * Print out the stats on user
! 873: * accs accumulated during
! 874: * recon.
! 875: */
! 876:
! 877: /* Fall through to state 6. */
! 878: case 6:
! 879: RF_LOCK_MUTEX(raidPtr->mutex);
! 880: raidPtr->numFailures--;
! 881: ds = (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE);
! 882: raidPtr->Disks[row][col].status = (ds) ? rf_ds_dist_spared :
! 883: rf_ds_spared;
! 884: raidPtr->status[row] = (ds) ? rf_rs_reconfigured :
! 885: rf_rs_optimal;
! 886: RF_UNLOCK_MUTEX(raidPtr->mutex);
! 887: RF_GETTIME(etime);
! 888: RF_TIMEVAL_DIFF(&(raidPtr->reconControl[row]->starttime),
! 889: &etime, &elpsd);
! 890:
! 891: /*
! 892: * XXX -- Why is state 7 different from state 6 if there is no
! 893: * return() here ? -- XXX Note that I set elpsd above & use it
! 894: * below, so if you put a return here you'll have to fix this.
! 895: * (also, FreeReconControl is called below).
! 896: */
! 897:
! 898: case 7:
! 899:
! 900: rf_ResumeNewRequests(raidPtr);
! 901:
! 902: printf("Reconstruction of disk at row %d col %d completed.\n",
! 903: row, col);
! 904: xor_s = raidPtr->accumXorTimeUs / 1000000;
! 905: xor_resid_us = raidPtr->accumXorTimeUs % 1000000;
! 906: printf("Recon time was %d.%06d seconds, accumulated XOR time"
! 907: " was %ld us (%ld.%06ld).\n", (int) elpsd.tv_sec,
! 908: (int) elpsd.tv_usec, raidPtr->accumXorTimeUs, xor_s,
! 909: xor_resid_us);
! 910: printf(" (start time %d sec %d usec, end time %d sec %d"
! 911: " usec)\n",
! 912: (int) raidPtr->reconControl[row]->starttime.tv_sec,
! 913: (int) raidPtr->reconControl[row]->starttime.tv_usec,
! 914: (int) etime.tv_sec, (int) etime.tv_usec);
! 915:
! 916: #if RF_RECON_STATS > 0
! 917: printf("Total head-sep stall count was %d.\n",
! 918: (int) reconDesc->hsStallCount);
! 919: #endif /* RF_RECON_STATS > 0 */
! 920: rf_FreeReconControl(raidPtr, row);
! 921: RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol *
! 922: sizeof(RF_AccTraceEntry_t));
! 923: rf_FreeReconDesc(reconDesc);
! 924:
! 925: }
! 926:
! 927: rf_SignalReconDone(raidPtr);
! 928: return (0);
! 929: }
! 930:
! 931:
! 932: /*****************************************************************************
! 933: * Do the right thing upon each reconstruction event.
! 934: * Returns nonzero if and only if there is nothing left unread on the
! 935: * indicated disk.
! 936: *****************************************************************************/
! 937: int
! 938: rf_ProcessReconEvent(RF_Raid_t *raidPtr, RF_RowCol_t frow,
! 939: RF_ReconEvent_t *event)
! 940: {
! 941: int retcode = 0, submitblocked;
! 942: RF_ReconBuffer_t *rbuf;
! 943: RF_SectorCount_t sectorsPerRU;
! 944:
! 945: Dprintf1("RECON: rf_ProcessReconEvent type %d.\n", event->type);
! 946:
! 947: switch (event->type) {
! 948:
! 949: /* A read I/O has completed. */
! 950: case RF_REVENT_READDONE:
! 951: rbuf = raidPtr->reconControl[frow]
! 952: ->perDiskInfo[event->col].rbuf;
! 953: Dprintf3("RECON: READDONE EVENT: row %d col %d psid %ld.\n",
! 954: frow, event->col, rbuf->parityStripeID);
! 955: Dprintf7("RECON: done read psid %ld buf %lx %02x %02x %02x"
! 956: " %02x %02x.\n", rbuf->parityStripeID, rbuf->buffer,
! 957: rbuf->buffer[0] & 0xff, rbuf->buffer[1] & 0xff,
! 958: rbuf->buffer[2] & 0xff, rbuf->buffer[3] & 0xff,
! 959: rbuf->buffer[4] & 0xff);
! 960: rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
! 961: submitblocked = rf_SubmitReconBuffer(rbuf, 0, 0);
! 962: Dprintf1("RECON: submitblocked=%d.\n", submitblocked);
! 963: if (!submitblocked)
! 964: retcode = rf_IssueNextReadRequest(raidPtr, frow,
! 965: event->col);
! 966: break;
! 967:
! 968: /* A write I/O has completed. */
! 969: case RF_REVENT_WRITEDONE:
! 970: if (rf_floatingRbufDebug) {
! 971: rf_CheckFloatingRbufCount(raidPtr, 1);
! 972: }
! 973: sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit *
! 974: raidPtr->Layout.SUsPerRU;
! 975: rbuf = (RF_ReconBuffer_t *) event->arg;
! 976: rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
! 977: Dprintf3("RECON: WRITEDONE EVENT: psid %d ru %d"
! 978: " (%d %% complete).\n",
! 979: rbuf->parityStripeID, rbuf->which_ru,
! 980: raidPtr->reconControl[frow]->percentComplete);
! 981: rf_ReconMapUpdate(raidPtr, raidPtr->reconControl[frow]
! 982: ->reconMap, rbuf->failedDiskSectorOffset,
! 983: rbuf->failedDiskSectorOffset + sectorsPerRU - 1);
! 984: rf_RemoveFromActiveReconTable(raidPtr, frow,
! 985: rbuf->parityStripeID, rbuf->which_ru);
! 986:
! 987: if (rbuf->type == RF_RBUF_TYPE_FLOATING) {
! 988: RF_LOCK_MUTEX(raidPtr->reconControl[frow]->rb_mutex);
! 989: raidPtr->numFullReconBuffers--;
! 990: rf_ReleaseFloatingReconBuffer(raidPtr, frow, rbuf);
! 991: RF_UNLOCK_MUTEX(raidPtr->reconControl[frow]->rb_mutex);
! 992: } else
! 993: if (rbuf->type == RF_RBUF_TYPE_FORCED)
! 994: rf_FreeReconBuffer(rbuf);
! 995: else
! 996: RF_ASSERT(0);
! 997: break;
! 998:
! 999: /* A buffer-stall condition has been cleared. */
! 1000: case RF_REVENT_BUFCLEAR:
! 1001: Dprintf2("RECON: BUFCLEAR EVENT: row %d col %d.\n", frow,
! 1002: event->col);
! 1003: submitblocked = rf_SubmitReconBuffer(raidPtr
! 1004: ->reconControl[frow]->perDiskInfo[event->col].rbuf, 0,
! 1005: (int) (long) event->arg);
! 1006: RF_ASSERT(!submitblocked); /*
! 1007: * We wouldn't have gotten the
! 1008: * BUFCLEAR event if we
! 1009: * couldn't submit.
! 1010: */
! 1011: retcode = rf_IssueNextReadRequest(raidPtr, frow, event->col);
! 1012: break;
! 1013:
! 1014: /* A user-write reconstruction blockage has been cleared. */
! 1015: case RF_REVENT_BLOCKCLEAR:
! 1016: DDprintf2("RECON: BLOCKCLEAR EVENT: row %d col %d.\n",
! 1017: frow, event->col);
! 1018: retcode = rf_TryToRead(raidPtr, frow, event->col);
! 1019: break;
! 1020:
! 1021: /*
! 1022: * A max-head-separation reconstruction blockage has been
! 1023: * cleared.
! 1024: */
! 1025: case RF_REVENT_HEADSEPCLEAR:
! 1026: Dprintf2("RECON: HEADSEPCLEAR EVENT: row %d col %d.\n",
! 1027: frow, event->col);
! 1028: retcode = rf_TryToRead(raidPtr, frow, event->col);
! 1029: break;
! 1030:
! 1031: /* A buffer has become ready to write. */
! 1032: case RF_REVENT_BUFREADY:
! 1033: Dprintf2("RECON: BUFREADY EVENT: row %d col %d.\n",
! 1034: frow, event->col);
! 1035: retcode = rf_IssueNextWriteRequest(raidPtr, frow);
! 1036: if (rf_floatingRbufDebug) {
! 1037: rf_CheckFloatingRbufCount(raidPtr, 1);
! 1038: }
! 1039: break;
! 1040:
! 1041: /*
! 1042: * We need to skip the current RU entirely because it got
! 1043: * recon'd while we were waiting for something else to happen.
! 1044: */
! 1045: case RF_REVENT_SKIP:
! 1046: DDprintf2("RECON: SKIP EVENT: row %d col %d.\n",
! 1047: frow, event->col);
! 1048: retcode = rf_IssueNextReadRequest(raidPtr, frow, event->col);
! 1049: break;
! 1050:
! 1051: /*
! 1052: * A forced-reconstruction read access has completed. Just
! 1053: * submit the buffer.
! 1054: */
! 1055: case RF_REVENT_FORCEDREADDONE:
! 1056: rbuf = (RF_ReconBuffer_t *) event->arg;
! 1057: rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
! 1058: DDprintf2("RECON: FORCEDREADDONE EVENT: row %d col %d.\n",
! 1059: frow, event->col);
! 1060: submitblocked = rf_SubmitReconBuffer(rbuf, 1, 0);
! 1061: RF_ASSERT(!submitblocked);
! 1062: break;
! 1063:
! 1064: default:
! 1065: RF_PANIC();
! 1066: }
! 1067: rf_FreeReconEventDesc(event);
! 1068: return (retcode);
! 1069: }
! 1070:
! 1071: /*****************************************************************************
! 1072: *
! 1073: * Find the next thing that's needed on the indicated disk, and issue
! 1074: * a read request for it. We assume that the reconstruction buffer
! 1075: * associated with this process is free to receive the data. If
! 1076: * reconstruction is blocked on the indicated RU, we issue a
! 1077: * blockage-release request instead of a physical disk read request.
! 1078: * If the current disk gets too far ahead of the others, we issue a
! 1079: * head-separation wait request and return.
! 1080: *
! 1081: * ctrl->{ru_count, curPSID, diskOffset} and
! 1082: * rbuf->failedDiskSectorOffset are maintained to point to the unit
! 1083: * we're currently accessing. Note that this deviates from the
! 1084: * standard C idiom of having counters point to the next thing to be
! 1085: * accessed. This allows us to easily retry when we're blocked by
! 1086: * head separation or reconstruction-blockage events.
! 1087: *
! 1088: * Returns nonzero if and only if there is nothing left unread on the
! 1089: * indicated disk.
! 1090: *
! 1091: *****************************************************************************/
! 1092: int
! 1093: rf_IssueNextReadRequest(RF_Raid_t *raidPtr, RF_RowCol_t row, RF_RowCol_t col)
! 1094: {
! 1095: RF_PerDiskReconCtrl_t *ctrl =
! 1096: &raidPtr->reconControl[row]->perDiskInfo[col];
! 1097: RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
! 1098: RF_ReconBuffer_t *rbuf = ctrl->rbuf;
! 1099: RF_ReconUnitCount_t RUsPerPU =
! 1100: layoutPtr->SUsPerPU / layoutPtr->SUsPerRU;
! 1101: RF_SectorCount_t sectorsPerRU =
! 1102: layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU;
! 1103: int do_new_check = 0, retcode = 0, status;
! 1104:
! 1105: /*
! 1106: * If we are currently the slowest disk, mark that we have to do a new
! 1107: * check.
! 1108: */
! 1109: if (ctrl->headSepCounter <=
! 1110: raidPtr->reconControl[row]->minHeadSepCounter)
! 1111: do_new_check = 1;
! 1112:
! 1113: while (1) {
! 1114:
! 1115: ctrl->ru_count++;
! 1116: if (ctrl->ru_count < RUsPerPU) {
! 1117: ctrl->diskOffset += sectorsPerRU;
! 1118: rbuf->failedDiskSectorOffset += sectorsPerRU;
! 1119: } else {
! 1120: ctrl->curPSID++;
! 1121: ctrl->ru_count = 0;
! 1122: /* code left over from when head-sep was based on
! 1123: * parity stripe id */
! 1124: if (ctrl->curPSID >=
! 1125: raidPtr->reconControl[row]->lastPSID) {
! 1126: rf_CheckForNewMinHeadSep(raidPtr, row,
! 1127: ++(ctrl->headSepCounter));
! 1128: return (1); /* Finito ! */
! 1129: }
! 1130: /*
! 1131: * Find the disk offsets of the start of the parity
! 1132: * stripe on both the current disk and the failed
! 1133: * disk. Skip this entire parity stripe if either disk
! 1134: * does not appear in the indicated PS.
! 1135: */
! 1136: status = rf_ComputePSDiskOffsets(raidPtr,
! 1137: ctrl->curPSID, row, col, &ctrl->diskOffset,
! 1138: &rbuf->failedDiskSectorOffset, &rbuf->spRow,
! 1139: &rbuf->spCol, &rbuf->spOffset);
! 1140: if (status) {
! 1141: ctrl->ru_count = RUsPerPU - 1;
! 1142: continue;
! 1143: }
! 1144: }
! 1145: rbuf->which_ru = ctrl->ru_count;
! 1146:
! 1147: /* Skip this RU if it's already been reconstructed. */
! 1148: if (rf_CheckRUReconstructed(raidPtr->reconControl[row]
! 1149: ->reconMap, rbuf->failedDiskSectorOffset)) {
! 1150: Dprintf2("Skipping psid %ld ru %d: already"
! 1151: " reconstructed.\n", ctrl->curPSID, ctrl->ru_count);
! 1152: continue;
! 1153: }
! 1154: break;
! 1155: }
! 1156: ctrl->headSepCounter++;
! 1157: if (do_new_check) /* Update min if needed. */
! 1158: rf_CheckForNewMinHeadSep(raidPtr, row, ctrl->headSepCounter);
! 1159:
! 1160:
! 1161: /*
! 1162: * At this point, we have definitely decided what to do, and we have
! 1163: * only to see if we can actually do it now.
! 1164: */
! 1165: rbuf->parityStripeID = ctrl->curPSID;
! 1166: rbuf->which_ru = ctrl->ru_count;
! 1167: bzero((char *) &raidPtr->recon_tracerecs[col],
! 1168: sizeof(raidPtr->recon_tracerecs[col]));
! 1169: raidPtr->recon_tracerecs[col].reconacc = 1;
! 1170: RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer);
! 1171: retcode = rf_TryToRead(raidPtr, row, col);
! 1172: return (retcode);
! 1173: }
! 1174:
! 1175: /*
! 1176: * Tries to issue the next read on the indicated disk. We may be
! 1177: * blocked by (a) the heads being too far apart, or (b) recon on the
! 1178: * indicated RU being blocked due to a write by a user thread. In
! 1179: * this case, we issue a head-sep or blockage wait request, which will
! 1180: * cause this same routine to be invoked again later when the blockage
! 1181: * has cleared.
! 1182: */
! 1183:
! 1184: int
! 1185: rf_TryToRead(RF_Raid_t *raidPtr, RF_RowCol_t row, RF_RowCol_t col)
! 1186: {
! 1187: RF_PerDiskReconCtrl_t *ctrl =
! 1188: &raidPtr->reconControl[row]->perDiskInfo[col];
! 1189: RF_SectorCount_t sectorsPerRU =
! 1190: raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;
! 1191: RF_StripeNum_t psid = ctrl->curPSID;
! 1192: RF_ReconUnitNum_t which_ru = ctrl->ru_count;
! 1193: RF_DiskQueueData_t *req;
! 1194: int status, created = 0;
! 1195: RF_ReconParityStripeStatus_t *pssPtr;
! 1196:
! 1197: /*
! 1198: * If the current disk is too far ahead of the others, issue a
! 1199: * head-separation wait and return.
! 1200: */
! 1201: if (rf_CheckHeadSeparation(raidPtr, ctrl, row, col,
! 1202: ctrl->headSepCounter, which_ru))
! 1203: return (0);
! 1204: RF_LOCK_PSS_MUTEX(raidPtr, row, psid);
! 1205: pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl[row]
! 1206: ->pssTable, psid, which_ru, RF_PSS_CREATE, &created);
! 1207:
! 1208: /*
! 1209: * If recon is blocked on the indicated parity stripe, issue a
! 1210: * block-wait request and return. This also must mark the indicated RU
! 1211: * in the stripe as under reconstruction if not blocked.
! 1212: */
! 1213: status = rf_CheckForcedOrBlockedReconstruction(raidPtr, pssPtr, ctrl,
! 1214: row, col, psid, which_ru);
! 1215: if (status == RF_PSS_RECON_BLOCKED) {
! 1216: Dprintf2("RECON: Stalling psid %ld ru %d: recon blocked.\n",
! 1217: psid, which_ru);
! 1218: goto out;
! 1219: } else
! 1220: if (status == RF_PSS_FORCED_ON_WRITE) {
! 1221: rf_CauseReconEvent(raidPtr, row, col, NULL,
! 1222: RF_REVENT_SKIP);
! 1223: goto out;
! 1224: }
! 1225: /*
! 1226: * Make one last check to be sure that the indicated RU didn't get
! 1227: * reconstructed while we were waiting for something else to happen.
! 1228: * This is unfortunate in that it causes us to make this check twice
! 1229: * in the normal case. Might want to make some attempt to re-work
! 1230: * this so that we only do this check if we've definitely blocked on
! 1231: * one of the above checks. When this condition is detected, we may
! 1232: * have just created a bogus status entry, which we need to delete.
! 1233: */
! 1234: if (rf_CheckRUReconstructed(raidPtr->reconControl[row]->reconMap,
! 1235: ctrl->rbuf->failedDiskSectorOffset)) {
! 1236: Dprintf2("RECON: Skipping psid %ld ru %d: prior recon after"
! 1237: " stall.\n", psid, which_ru);
! 1238: if (created)
! 1239: rf_PSStatusDelete(raidPtr,
! 1240: raidPtr->reconControl[row]->pssTable, pssPtr);
! 1241: rf_CauseReconEvent(raidPtr, row, col, NULL, RF_REVENT_SKIP);
! 1242: goto out;
! 1243: }
! 1244: /* Found something to read. Issue the I/O. */
! 1245: Dprintf5("RECON: Read for psid %ld on row %d col %d offset %ld"
! 1246: " buf %lx.\n", psid, row, col, ctrl->diskOffset,
! 1247: ctrl->rbuf->buffer);
! 1248: RF_ETIMER_STOP(raidPtr->recon_tracerecs[col].recon_timer);
! 1249: RF_ETIMER_EVAL(raidPtr->recon_tracerecs[col].recon_timer);
! 1250: raidPtr->recon_tracerecs[col].specific.recon.recon_start_to_fetch_us =
! 1251: RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[col].recon_timer);
! 1252: RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer);
! 1253:
! 1254: /*
! 1255: * Should be ok to use a NULL proc pointer here, all the bufs we use
! 1256: * should be in kernel space.
! 1257: */
! 1258: req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, ctrl->diskOffset,
! 1259: sectorsPerRU, ctrl->rbuf->buffer, psid, which_ru,
! 1260: rf_ReconReadDoneProc, (void *) ctrl, NULL,
! 1261: &raidPtr->recon_tracerecs[col], (void *) raidPtr, 0, NULL);
! 1262:
! 1263: RF_ASSERT(req); /* XXX -- Fix this. -- XXX */
! 1264:
! 1265: ctrl->rbuf->arg = (void *) req;
! 1266: rf_DiskIOEnqueue(&raidPtr->Queues[row][col], req, RF_IO_RECON_PRIORITY);
! 1267: pssPtr->issued[col] = 1;
! 1268:
! 1269: out:
! 1270: RF_UNLOCK_PSS_MUTEX(raidPtr, row, psid);
! 1271: return (0);
! 1272: }
! 1273:
! 1274:
! 1275: /*
! 1276: * Given a parity stripe ID, we want to find out whether both the
! 1277: * current disk and the failed disk exist in that parity stripe. If
! 1278: * not, we want to skip this whole PS. If so, we want to find the
! 1279: * disk offset of the start of the PS on both the current disk and the
! 1280: * failed disk.
! 1281: *
! 1282: * This works by getting a list of disks comprising the indicated
! 1283: * parity stripe, and searching the list for the current and failed
! 1284: * disks. Once we've decided they both exist in the parity stripe, we
! 1285: * need to decide whether each is data or parity, so that we'll know
! 1286: * which mapping function to call to get the corresponding disk
! 1287: * offsets.
! 1288: *
! 1289: * This is kind of unpleasant, but doing it this way allows the
! 1290: * reconstruction code to use parity stripe IDs rather than physical
! 1291: * disks address to march through the failed disk, which greatly
! 1292: * simplifies a lot of code, as well as eliminating the need for a
! 1293: * reverse-mapping function. I also think it will execute faster,
! 1294: * since the calls to the mapping module are kept to a minimum.
! 1295: *
! 1296: * ASSUMES THAT THE STRIPE IDENTIFIER IDENTIFIES THE DISKS COMPRISING
! 1297: * THE STRIPE IN THE CORRECT ORDER.
! 1298: */
! 1299:
! 1300: int
! 1301: rf_ComputePSDiskOffsets(
! 1302: RF_Raid_t *raidPtr, /* RAID descriptor. */
! 1303: RF_StripeNum_t psid, /* Parity stripe identifier. */
! 1304: RF_RowCol_t row, /*
! 1305: * Row and column of disk to find
! 1306: * the offsets for.
! 1307: */
! 1308: RF_RowCol_t col,
! 1309: RF_SectorNum_t *outDiskOffset,
! 1310: RF_SectorNum_t *outFailedDiskSectorOffset,
! 1311: RF_RowCol_t *spRow, /*
! 1312: * OUT: Row,col of spare unit for
! 1313: * failed unit.
! 1314: */
! 1315: RF_RowCol_t *spCol,
! 1316: RF_SectorNum_t *spOffset /*
! 1317: * OUT: Offset into disk containing
! 1318: * spare unit.
! 1319: */
! 1320: )
! 1321: {
! 1322: RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
! 1323: RF_RowCol_t fcol = raidPtr->reconControl[row]->fcol;
! 1324: RF_RaidAddr_t sosRaidAddress; /* start-of-stripe */
! 1325: RF_RowCol_t *diskids;
! 1326: u_int i, j, k, i_offset, j_offset;
! 1327: RF_RowCol_t prow, pcol;
! 1328: int testcol, testrow;
! 1329: RF_RowCol_t stripe;
! 1330: RF_SectorNum_t poffset;
! 1331: char i_is_parity = 0, j_is_parity = 0;
! 1332: RF_RowCol_t stripeWidth =
! 1333: layoutPtr->numDataCol + layoutPtr->numParityCol;
! 1334:
! 1335: /* Get a listing of the disks comprising that stripe. */
! 1336: sosRaidAddress = rf_ParityStripeIDToRaidAddress(layoutPtr, psid);
! 1337: (layoutPtr->map->IdentifyStripe) (raidPtr, sosRaidAddress, &diskids,
! 1338: &stripe);
! 1339: RF_ASSERT(diskids);
! 1340:
! 1341: /*
! 1342: * Reject this entire parity stripe if it does not contain the
! 1343: * indicated disk or it does not contain the failed disk.
! 1344: */
! 1345: if (row != stripe)
! 1346: goto skipit;
! 1347: for (i = 0; i < stripeWidth; i++) {
! 1348: if (col == diskids[i])
! 1349: break;
! 1350: }
! 1351: if (i == stripeWidth)
! 1352: goto skipit;
! 1353: for (j = 0; j < stripeWidth; j++) {
! 1354: if (fcol == diskids[j])
! 1355: break;
! 1356: }
! 1357: if (j == stripeWidth) {
! 1358: goto skipit;
! 1359: }
! 1360: /* Find out which disk the parity is on. */
! 1361: (layoutPtr->map->MapParity) (raidPtr, sosRaidAddress, &prow, &pcol,
! 1362: &poffset, RF_DONT_REMAP);
! 1363:
! 1364: /* Find out if either the current RU or the failed RU is parity. */
! 1365: /*
! 1366: * Also, if the parity occurs in this stripe prior to the data and/or
! 1367: * failed col, we need to decrement i and/or j.
! 1368: */
! 1369: for (k = 0; k < stripeWidth; k++)
! 1370: if (diskids[k] == pcol)
! 1371: break;
! 1372: RF_ASSERT(k < stripeWidth);
! 1373: i_offset = i;
! 1374: j_offset = j;
! 1375: if (k < i)
! 1376: i_offset--;
! 1377: else
! 1378: if (k == i) {
! 1379: i_is_parity = 1;
! 1380: i_offset = 0;
! 1381: } /*
! 1382: * Set offsets to zero to disable multiply
! 1383: * below.
! 1384: */
! 1385: if (k < j)
! 1386: j_offset--;
! 1387: else
! 1388: if (k == j) {
! 1389: j_is_parity = 1;
! 1390: j_offset = 0;
! 1391: }
! 1392: /*
! 1393: * At this point, [ij]_is_parity tells us whether the [current,failed]
! 1394: * disk is parity at the start of this RU, and, if data, "[ij]_offset"
! 1395: * tells us how far into the stripe the [current,failed] disk is.
! 1396: */
! 1397:
! 1398: /*
! 1399: * Call the mapping routine to get the offset into the current disk,
! 1400: * repeat for failed disk.
! 1401: */
! 1402: if (i_is_parity)
! 1403: layoutPtr->map->MapParity(raidPtr, sosRaidAddress + i_offset *
! 1404: layoutPtr->sectorsPerStripeUnit, &testrow, &testcol,
! 1405: outDiskOffset, RF_DONT_REMAP);
! 1406: else
! 1407: layoutPtr->map->MapSector(raidPtr, sosRaidAddress + i_offset *
! 1408: layoutPtr->sectorsPerStripeUnit, &testrow, &testcol,
! 1409: outDiskOffset, RF_DONT_REMAP);
! 1410:
! 1411: RF_ASSERT(row == testrow && col == testcol);
! 1412:
! 1413: if (j_is_parity)
! 1414: layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset *
! 1415: layoutPtr->sectorsPerStripeUnit, &testrow, &testcol,
! 1416: outFailedDiskSectorOffset, RF_DONT_REMAP);
! 1417: else
! 1418: layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset *
! 1419: layoutPtr->sectorsPerStripeUnit, &testrow, &testcol,
! 1420: outFailedDiskSectorOffset, RF_DONT_REMAP);
! 1421: RF_ASSERT(row == testrow && fcol == testcol);
! 1422:
! 1423: /* Now locate the spare unit for the failed unit. */
! 1424: if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) {
! 1425: if (j_is_parity)
! 1426: layoutPtr->map->MapParity(raidPtr, sosRaidAddress +
! 1427: j_offset * layoutPtr->sectorsPerStripeUnit, spRow,
! 1428: spCol, spOffset, RF_REMAP);
! 1429: else
! 1430: layoutPtr->map->MapSector(raidPtr, sosRaidAddress +
! 1431: j_offset * layoutPtr->sectorsPerStripeUnit, spRow,
! 1432: spCol, spOffset, RF_REMAP);
! 1433: } else {
! 1434: *spRow = raidPtr->reconControl[row]->spareRow;
! 1435: *spCol = raidPtr->reconControl[row]->spareCol;
! 1436: *spOffset = *outFailedDiskSectorOffset;
! 1437: }
! 1438:
! 1439: return (0);
! 1440:
! 1441: skipit:
! 1442: Dprintf3("RECON: Skipping psid %ld: nothing needed from r%d c%d.\n",
! 1443: psid, row, col);
! 1444: return (1);
! 1445: }
! 1446:
! 1447:
! 1448: /*
! 1449: * This is called when a buffer has become ready to write to the replacement
! 1450: * disk.
! 1451: */
! 1452: int
! 1453: rf_IssueNextWriteRequest(RF_Raid_t *raidPtr, RF_RowCol_t row)
! 1454: {
! 1455: RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
! 1456: RF_SectorCount_t sectorsPerRU =
! 1457: layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU;
! 1458: RF_RowCol_t fcol = raidPtr->reconControl[row]->fcol;
! 1459: RF_ReconBuffer_t *rbuf;
! 1460: RF_DiskQueueData_t *req;
! 1461:
! 1462: rbuf = rf_GetFullReconBuffer(raidPtr->reconControl[row]);
! 1463: RF_ASSERT(rbuf); /*
! 1464: * There must be one available, or we wouldn't
! 1465: * have gotten the event that sent us here.
! 1466: */
! 1467: RF_ASSERT(rbuf->pssPtr);
! 1468:
! 1469: rbuf->pssPtr->writeRbuf = rbuf;
! 1470: rbuf->pssPtr = NULL;
! 1471:
! 1472: Dprintf7("RECON: New write (r %d c %d offs %d) for psid %ld ru %d"
! 1473: " (failed disk offset %ld) buf %lx.\n",
! 1474: rbuf->spRow, rbuf->spCol, rbuf->spOffset, rbuf->parityStripeID,
! 1475: rbuf->which_ru, rbuf->failedDiskSectorOffset, rbuf->buffer);
! 1476: Dprintf6("RECON: new write psid %ld %02x %02x %02x %02x %02x.\n",
! 1477: rbuf->parityStripeID, rbuf->buffer[0] & 0xff,
! 1478: rbuf->buffer[1] & 0xff, rbuf->buffer[2] & 0xff,
! 1479: rbuf->buffer[3] & 0xff, rbuf->buffer[4] & 0xff);
! 1480:
! 1481: /*
! 1482: * Should be ok to use a NULL b_proc here b/c all addrs should be in
! 1483: * kernel space.
! 1484: */
! 1485: req = rf_CreateDiskQueueData(RF_IO_TYPE_WRITE, rbuf->spOffset,
! 1486: sectorsPerRU, rbuf->buffer, rbuf->parityStripeID, rbuf->which_ru,
! 1487: rf_ReconWriteDoneProc, (void *) rbuf, NULL,
! 1488: &raidPtr->recon_tracerecs[fcol], (void *) raidPtr, 0, NULL);
! 1489:
! 1490: RF_ASSERT(req); /* XXX -- Fix this. -- XXX */
! 1491:
! 1492: rbuf->arg = (void *) req;
! 1493: rf_DiskIOEnqueue(&raidPtr->Queues[rbuf->spRow][rbuf->spCol], req,
! 1494: RF_IO_RECON_PRIORITY);
! 1495:
! 1496: return (0);
! 1497: }
! 1498:
! 1499: /*
! 1500: * This gets called upon the completion of a reconstruction read
! 1501: * operation. The arg is a pointer to the per-disk reconstruction
! 1502: * control structure for the process that just finished a read.
! 1503: *
! 1504: * Called at interrupt context in the kernel, so don't do anything
! 1505: * illegal here.
! 1506: */
! 1507: int
! 1508: rf_ReconReadDoneProc(void *arg, int status)
! 1509: {
! 1510: RF_PerDiskReconCtrl_t *ctrl = (RF_PerDiskReconCtrl_t *) arg;
! 1511: RF_Raid_t *raidPtr = ctrl->reconCtrl->reconDesc->raidPtr;
! 1512:
! 1513: if (status) {
! 1514: /*
! 1515: * XXX
! 1516: */
! 1517: printf("Recon read failed !\n");
! 1518: RF_PANIC();
! 1519: }
! 1520: RF_ETIMER_STOP(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
! 1521: RF_ETIMER_EVAL(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
! 1522: raidPtr->recon_tracerecs[ctrl->col].specific.recon.
! 1523: recon_fetch_to_return_us =
! 1524: RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
! 1525: RF_ETIMER_START(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
! 1526:
! 1527: rf_CauseReconEvent(raidPtr, ctrl->row, ctrl->col, NULL,
! 1528: RF_REVENT_READDONE);
! 1529: return (0);
! 1530: }
! 1531:
! 1532:
! 1533: /*
! 1534: * This gets called upon the completion of a reconstruction write operation.
! 1535: * The arg is a pointer to the rbuf that was just written.
! 1536: *
! 1537: * Called at interrupt context in the kernel, so don't do anything illegal here.
! 1538: */
! 1539: int
! 1540: rf_ReconWriteDoneProc(void *arg, int status)
! 1541: {
! 1542: RF_ReconBuffer_t *rbuf = (RF_ReconBuffer_t *) arg;
! 1543:
! 1544: Dprintf2("Reconstruction completed on psid %ld ru %d.\n",
! 1545: rbuf->parityStripeID, rbuf->which_ru);
! 1546: if (status) {
! 1547: /* fprintf(stderr, "Recon write failed !\n"); */
! 1548: printf("Recon write failed !\n");
! 1549: RF_PANIC();
! 1550: }
! 1551: rf_CauseReconEvent((RF_Raid_t *) rbuf->raidPtr, rbuf->row, rbuf->col,
! 1552: arg, RF_REVENT_WRITEDONE);
! 1553: return (0);
! 1554: }
! 1555:
! 1556:
! 1557: /*
! 1558: * Computes a new minimum head sep, and wakes up anyone who needs to
! 1559: * be woken as a result.
! 1560: */
! 1561: void
! 1562: rf_CheckForNewMinHeadSep(RF_Raid_t *raidPtr, RF_RowCol_t row,
! 1563: RF_HeadSepLimit_t hsCtr)
! 1564: {
! 1565: RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl[row];
! 1566: RF_HeadSepLimit_t new_min;
! 1567: RF_RowCol_t i;
! 1568: RF_CallbackDesc_t *p;
! 1569: /* From the definition of a minimum. */
! 1570: RF_ASSERT(hsCtr >= reconCtrlPtr->minHeadSepCounter);
! 1571:
! 1572:
! 1573: RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
! 1574:
! 1575: new_min = ~(1L << (8 * sizeof(long) - 1)); /* 0x7FFF....FFF */
! 1576: for (i = 0; i < raidPtr->numCol; i++)
! 1577: if (i != reconCtrlPtr->fcol) {
! 1578: if (reconCtrlPtr->perDiskInfo[i].headSepCounter <
! 1579: new_min)
! 1580: new_min =
! 1581: reconCtrlPtr->perDiskInfo[i].headSepCounter;
! 1582: }
! 1583: /* Set the new minimum and wake up anyone who can now run again. */
! 1584: if (new_min != reconCtrlPtr->minHeadSepCounter) {
! 1585: reconCtrlPtr->minHeadSepCounter = new_min;
! 1586: Dprintf1("RECON: new min head pos counter val is %ld.\n",
! 1587: new_min);
! 1588: while (reconCtrlPtr->headSepCBList) {
! 1589: if (reconCtrlPtr->headSepCBList->callbackArg.v >
! 1590: new_min)
! 1591: break;
! 1592: p = reconCtrlPtr->headSepCBList;
! 1593: reconCtrlPtr->headSepCBList = p->next;
! 1594: p->next = NULL;
! 1595: rf_CauseReconEvent(raidPtr, p->row, p->col, NULL,
! 1596: RF_REVENT_HEADSEPCLEAR);
! 1597: rf_FreeCallbackDesc(p);
! 1598: }
! 1599:
! 1600: }
! 1601: RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
! 1602: }
! 1603:
! 1604: /*
! 1605: * Checks to see that the maximum head separation will not be violated
! 1606: * if we initiate a reconstruction I/O on the indicated disk.
! 1607: * Limiting the maximum head separation between two disks eliminates
! 1608: * the nasty buffer-stall conditions that occur when one disk races
! 1609: * ahead of the others and consumes all of the floating recon buffers.
! 1610: * This code is complex and unpleasant but it's necessary to avoid
! 1611: * some very nasty, albeit fairly rare, reconstruction behavior.
! 1612: *
! 1613: * Returns non-zero if and only if we have to stop working on the
! 1614: * indicated disk due to a head-separation delay.
! 1615: */
! 1616: int
! 1617: rf_CheckHeadSeparation(
! 1618: RF_Raid_t *raidPtr,
! 1619: RF_PerDiskReconCtrl_t *ctrl,
! 1620: RF_RowCol_t row,
! 1621: RF_RowCol_t col,
! 1622: RF_HeadSepLimit_t hsCtr,
! 1623: RF_ReconUnitNum_t which_ru
! 1624: )
! 1625: {
! 1626: RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl[row];
! 1627: RF_CallbackDesc_t *cb, *p, *pt;
! 1628: int retval = 0;
! 1629:
! 1630: /*
! 1631: * If we're too far ahead of the slowest disk, stop working on this
! 1632: * disk until the slower ones catch up. We do this by scheduling a
! 1633: * wakeup callback for the time when the slowest disk has caught up.
! 1634: * We define "caught up" with 20% hysteresis, i.e. the head separation
! 1635: * must have fallen to at most 80% of the max allowable head
! 1636: * separation before we'll wake up.
! 1637: */
! 1638: RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
! 1639: if ((raidPtr->headSepLimit >= 0) &&
! 1640: ((ctrl->headSepCounter - reconCtrlPtr->minHeadSepCounter) >
! 1641: raidPtr->headSepLimit)) {
! 1642: Dprintf6("raid%d: RECON: head sep stall: row %d col %d hsCtr"
! 1643: " %ld minHSCtr %ld limit %ld.\n",
! 1644: raidPtr->raidid, row, col, ctrl->headSepCounter,
! 1645: reconCtrlPtr->minHeadSepCounter, raidPtr->headSepLimit);
! 1646: cb = rf_AllocCallbackDesc();
! 1647: /*
! 1648: * The minHeadSepCounter value we have to get to before we'll
! 1649: * wake up. Build in 20% hysteresis.
! 1650: */
! 1651: cb->callbackArg.v = (ctrl->headSepCounter -
! 1652: raidPtr->headSepLimit + raidPtr->headSepLimit / 5);
! 1653: cb->row = row;
! 1654: cb->col = col;
! 1655: cb->next = NULL;
! 1656:
! 1657: /*
! 1658: * Insert this callback descriptor into the sorted list of
! 1659: * pending head-sep callbacks.
! 1660: */
! 1661: p = reconCtrlPtr->headSepCBList;
! 1662: if (!p)
! 1663: reconCtrlPtr->headSepCBList = cb;
! 1664: else
! 1665: if (cb->callbackArg.v < p->callbackArg.v) {
! 1666: cb->next = reconCtrlPtr->headSepCBList;
! 1667: reconCtrlPtr->headSepCBList = cb;
! 1668: } else {
! 1669: for (pt = p, p = p->next;
! 1670: p && (p->callbackArg.v < cb->callbackArg.v);
! 1671: pt = p, p = p->next);
! 1672: cb->next = p;
! 1673: pt->next = cb;
! 1674: }
! 1675: retval = 1;
! 1676: #if RF_RECON_STATS > 0
! 1677: ctrl->reconCtrl->reconDesc->hsStallCount++;
! 1678: #endif /* RF_RECON_STATS > 0 */
! 1679: }
! 1680: RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
! 1681:
! 1682: return (retval);
! 1683: }
! 1684:
! 1685:
! 1686:
! 1687: /*
! 1688: * Checks to see if reconstruction has been either forced or blocked
! 1689: * by a user operation. If forced, we skip this RU entirely. Else if
! 1690: * blocked, put ourselves on the wait list. Else return 0.
! 1691: *
! 1692: * ASSUMES THE PSS MUTEX IS LOCKED UPON ENTRY.
! 1693: */
! 1694: int
! 1695: rf_CheckForcedOrBlockedReconstruction(
! 1696: RF_Raid_t *raidPtr,
! 1697: RF_ReconParityStripeStatus_t *pssPtr,
! 1698: RF_PerDiskReconCtrl_t *ctrl,
! 1699: RF_RowCol_t row,
! 1700: RF_RowCol_t col,
! 1701: RF_StripeNum_t psid,
! 1702: RF_ReconUnitNum_t which_ru
! 1703: )
! 1704: {
! 1705: RF_CallbackDesc_t *cb;
! 1706: int retcode = 0;
! 1707:
! 1708: if ((pssPtr->flags & RF_PSS_FORCED_ON_READ) ||
! 1709: (pssPtr->flags & RF_PSS_FORCED_ON_WRITE))
! 1710: retcode = RF_PSS_FORCED_ON_WRITE;
! 1711: else
! 1712: if (pssPtr->flags & RF_PSS_RECON_BLOCKED) {
! 1713: Dprintf4("RECON: row %d col %d blocked at psid %ld"
! 1714: " ru %d.\n", row, col, psid, which_ru);
! 1715: cb = rf_AllocCallbackDesc(); /*
! 1716: * Append ourselves to
! 1717: * the blockage-wait
! 1718: * list.
! 1719: */
! 1720: cb->row = row;
! 1721: cb->col = col;
! 1722: cb->next = pssPtr->blockWaitList;
! 1723: pssPtr->blockWaitList = cb;
! 1724: retcode = RF_PSS_RECON_BLOCKED;
! 1725: }
! 1726: if (!retcode)
! 1727: pssPtr->flags |= RF_PSS_UNDER_RECON; /*
! 1728: * Mark this RU as under
! 1729: * reconstruction.
! 1730: */
! 1731:
! 1732: return (retcode);
! 1733: }
! 1734:
! 1735:
! 1736: /*
! 1737: * If reconstruction is currently ongoing for the indicated stripeID,
! 1738: * reconstruction is forced to completion and we return non-zero to
! 1739: * indicate that the caller must wait. If not, then reconstruction is
! 1740: * blocked on the indicated stripe and the routine returns zero. If
! 1741: * and only if we return non-zero, we'll cause the cbFunc to get
! 1742: * invoked with the cbArg when the reconstruction has completed.
! 1743: */
! 1744: int
! 1745: rf_ForceOrBlockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
! 1746: void (*cbFunc) (RF_Raid_t *, void *), void *cbArg)
! 1747: {
! 1748: RF_RowCol_t row = asmap->physInfo->row; /*
! 1749: * Which row of the array
! 1750: * we're working on.
! 1751: */
! 1752: RF_StripeNum_t stripeID = asmap->stripeID; /*
! 1753: * The stripe ID we're
! 1754: * forcing recon on.
! 1755: */
! 1756: RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit *
! 1757: raidPtr->Layout.SUsPerRU; /* Num sects in one RU. */
! 1758: RF_ReconParityStripeStatus_t *pssPtr; /*
! 1759: * A pointer to the parity
! 1760: * stripe status structure.
! 1761: */
! 1762: RF_StripeNum_t psid; /* Parity stripe id. */
! 1763: RF_SectorNum_t offset, fd_offset; /*
! 1764: * Disk offset, failed-disk
! 1765: * offset.
! 1766: */
! 1767: RF_RowCol_t *diskids;
! 1768: RF_RowCol_t stripe;
! 1769: RF_ReconUnitNum_t which_ru; /* RU within parity stripe. */
! 1770: RF_RowCol_t fcol, diskno, i;
! 1771: RF_ReconBuffer_t *new_rbuf; /* Ptr to newly allocated rbufs. */
! 1772: RF_DiskQueueData_t *req; /* Disk I/O req to be enqueued. */
! 1773: RF_CallbackDesc_t *cb;
! 1774: int created = 0, nPromoted;
! 1775:
! 1776: psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID,
! 1777: &which_ru);
! 1778:
! 1779: RF_LOCK_PSS_MUTEX(raidPtr, row, psid);
! 1780:
! 1781: pssPtr = rf_LookupRUStatus(raidPtr,
! 1782: raidPtr->reconControl[row]->pssTable, psid, which_ru,
! 1783: RF_PSS_CREATE | RF_PSS_RECON_BLOCKED, &created);
! 1784:
! 1785: /* If recon is not ongoing on this PS, just return. */
! 1786: if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) {
! 1787: RF_UNLOCK_PSS_MUTEX(raidPtr, row, psid);
! 1788: return (0);
! 1789: }
! 1790: /*
! 1791: * Otherwise, we have to wait for reconstruction to complete on this
! 1792: * RU.
! 1793: */
! 1794: /*
! 1795: * In order to avoid waiting for a potentially large number of
! 1796: * low-priority accesses to complete, we force a normal-priority (i.e.
! 1797: * not low-priority) reconstruction on this RU.
! 1798: */
! 1799: if (!(pssPtr->flags & RF_PSS_FORCED_ON_WRITE) &&
! 1800: !(pssPtr->flags & RF_PSS_FORCED_ON_READ)) {
! 1801: DDprintf1("Forcing recon on psid %ld.\n", psid);
! 1802: /* Mark this RU as under forced recon. */
! 1803: pssPtr->flags |= RF_PSS_FORCED_ON_WRITE;
! 1804: /* Clear the blockage that we just set. */
! 1805: pssPtr->flags &= ~RF_PSS_RECON_BLOCKED;
! 1806: fcol = raidPtr->reconControl[row]->fcol;
! 1807:
! 1808: /*
! 1809: * Get a listing of the disks comprising the indicated stripe.
! 1810: */
! 1811: (raidPtr->Layout.map->IdentifyStripe) (raidPtr,
! 1812: asmap->raidAddress, &diskids, &stripe);
! 1813: RF_ASSERT(row == stripe);
! 1814:
! 1815: /*
! 1816: * For previously issued reads, elevate them to normal
! 1817: * priority. If the I/O has already completed, it won't be
! 1818: * found in the queue, and hence this will be a no-op. For
! 1819: * unissued reads, allocate buffers and issue new reads. The
! 1820: * fact that we've set the FORCED bit means that the regular
! 1821: * recon procs will not re-issue these reqs.
! 1822: */
! 1823: for (i = 0; i < raidPtr->Layout.numDataCol +
! 1824: raidPtr->Layout.numParityCol; i++)
! 1825: if ((diskno = diskids[i]) != fcol) {
! 1826: if (pssPtr->issued[diskno]) {
! 1827: nPromoted = rf_DiskIOPromote(&raidPtr
! 1828: ->Queues[row][diskno], psid,
! 1829: which_ru);
! 1830: if (rf_reconDebug && nPromoted)
! 1831: printf("raid%d: promoted read"
! 1832: " from row %d col %d.\n",
! 1833: raidPtr->raidid, row,
! 1834: diskno);
! 1835: } else {
! 1836: /* Create new buf. */
! 1837: new_rbuf = rf_MakeReconBuffer(raidPtr,
! 1838: row, diskno, RF_RBUF_TYPE_FORCED);
! 1839: /* Find offsets & spare location. */
! 1840: rf_ComputePSDiskOffsets(raidPtr, psid,
! 1841: row, diskno, &offset, &fd_offset,
! 1842: &new_rbuf->spRow, &new_rbuf->spCol,
! 1843: &new_rbuf->spOffset);
! 1844: new_rbuf->parityStripeID = psid;
! 1845: /* Fill in the buffer. */
! 1846: new_rbuf->which_ru = which_ru;
! 1847: new_rbuf->failedDiskSectorOffset =
! 1848: fd_offset;
! 1849: new_rbuf->priority =
! 1850: RF_IO_NORMAL_PRIORITY;
! 1851:
! 1852: /*
! 1853: * Use NULL b_proc b/c all addrs
! 1854: * should be in kernel space.
! 1855: */
! 1856: req = rf_CreateDiskQueueData(
! 1857: RF_IO_TYPE_READ, offset +
! 1858: which_ru * sectorsPerRU,
! 1859: sectorsPerRU, new_rbuf->buffer,
! 1860: psid, which_ru, (int (*)
! 1861: (void *, int))
! 1862: rf_ForceReconReadDoneProc,
! 1863: (void *) new_rbuf, NULL,
! 1864: NULL, (void *) raidPtr, 0, NULL);
! 1865:
! 1866: RF_ASSERT(req); /*
! 1867: * XXX -- Fix this. --
! 1868: * XXX
! 1869: */
! 1870:
! 1871: new_rbuf->arg = req;
! 1872: /* Enqueue the I/O. */
! 1873: rf_DiskIOEnqueue(&raidPtr
! 1874: ->Queues[row][diskno], req,
! 1875: RF_IO_NORMAL_PRIORITY);
! 1876: Dprintf3("raid%d: Issued new read req"
! 1877: " on row %d col %d.\n",
! 1878: raidPtr->raidid, row, diskno);
! 1879: }
! 1880: }
! 1881: /*
! 1882: * If the write is sitting in the disk queue, elevate its
! 1883: * priority.
! 1884: */
! 1885: if (rf_DiskIOPromote(&raidPtr->Queues[row][fcol],
! 1886: psid, which_ru))
! 1887: printf("raid%d: promoted write to row %d col %d.\n",
! 1888: raidPtr->raidid, row, fcol);
! 1889: }
! 1890: /*
! 1891: * Install a callback descriptor to be invoked when recon completes on
! 1892: * this parity stripe.
! 1893: */
! 1894: cb = rf_AllocCallbackDesc();
! 1895: /*
! 1896: * XXX The following is bogus... These functions don't really match !!!
! 1897: * GO
! 1898: */
! 1899: cb->callbackFunc = (void (*) (RF_CBParam_t)) cbFunc;
! 1900: cb->callbackArg.p = (void *) cbArg;
! 1901: cb->next = pssPtr->procWaitList;
! 1902: pssPtr->procWaitList = cb;
! 1903: DDprintf2("raid%d: Waiting for forced recon on psid %ld.\n",
! 1904: raidPtr->raidid, psid);
! 1905:
! 1906: RF_UNLOCK_PSS_MUTEX(raidPtr, row, psid);
! 1907: return (1);
! 1908: }
! 1909:
! 1910:
! 1911: /*
! 1912: * Called upon the completion of a forced reconstruction read.
! 1913: * All we do is schedule the FORCEDREADONE event.
! 1914: * Called at interrupt context in the kernel, so don't do anything illegal here.
! 1915: */
! 1916: void
! 1917: rf_ForceReconReadDoneProc(void *arg, int status)
! 1918: {
! 1919: RF_ReconBuffer_t *rbuf = arg;
! 1920:
! 1921: if (status) {
! 1922: /* fprintf(stderr, "Forced recon read failed !\n"); */
! 1923: printf("Forced recon read failed !\n");
! 1924: RF_PANIC();
! 1925: }
! 1926: rf_CauseReconEvent((RF_Raid_t *) rbuf->raidPtr, rbuf->row, rbuf->col,
! 1927: (void *) rbuf, RF_REVENT_FORCEDREADDONE);
! 1928: }
! 1929:
! 1930:
! 1931: /* Releases a block on the reconstruction of the indicated stripe. */
! 1932: int
! 1933: rf_UnblockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap)
! 1934: {
! 1935: RF_RowCol_t row = asmap->origRow;
! 1936: RF_StripeNum_t stripeID = asmap->stripeID;
! 1937: RF_ReconParityStripeStatus_t *pssPtr;
! 1938: RF_ReconUnitNum_t which_ru;
! 1939: RF_StripeNum_t psid;
! 1940: int created = 0;
! 1941: RF_CallbackDesc_t *cb;
! 1942:
! 1943: psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID,
! 1944: &which_ru);
! 1945: RF_LOCK_PSS_MUTEX(raidPtr, row, psid);
! 1946: pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl[row]
! 1947: ->pssTable, psid, which_ru, RF_PSS_NONE, &created);
! 1948:
! 1949: /*
! 1950: * When recon is forced, the pss desc can get deleted before we get
! 1951: * back to unblock recon. But, this can _only_ happen when recon is
! 1952: * forced. It would be good to put some kind of sanity check here, but
! 1953: * how to decide if recon was just forced or not ?
! 1954: */
! 1955: if (!pssPtr) {
! 1956: /*
! 1957: * printf("Warning: no pss descriptor upon unblock on psid %ld"
! 1958: * " RU %d.\n", psid, which_ru);
! 1959: */
! 1960: if (rf_reconDebug || rf_pssDebug)
! 1961: printf("Warning: no pss descriptor upon unblock on"
! 1962: " psid %ld RU %d.\n", (long) psid, which_ru);
! 1963: goto out;
! 1964: }
! 1965: pssPtr->blockCount--;
! 1966: Dprintf3("raid%d: unblocking recon on psid %ld: blockcount is %d.\n",
! 1967: raidPtr->raidid, psid, pssPtr->blockCount);
! 1968: if (pssPtr->blockCount == 0) {
! 1969: /* If recon blockage has been released. */
! 1970:
! 1971: /*
! 1972: * Unblock recon before calling CauseReconEvent in case
! 1973: * CauseReconEvent causes us to try to issue a new read before
! 1974: * returning here.
! 1975: */
! 1976: pssPtr->flags &= ~RF_PSS_RECON_BLOCKED;
! 1977:
! 1978:
! 1979: while (pssPtr->blockWaitList) {
! 1980: /*
! 1981: * Spin through the block-wait list and
! 1982: * release all the waiters.
! 1983: */
! 1984: cb = pssPtr->blockWaitList;
! 1985: pssPtr->blockWaitList = cb->next;
! 1986: cb->next = NULL;
! 1987: rf_CauseReconEvent(raidPtr, cb->row, cb->col, NULL,
! 1988: RF_REVENT_BLOCKCLEAR);
! 1989: rf_FreeCallbackDesc(cb);
! 1990: }
! 1991: if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) {
! 1992: /* If no recon was requested while recon was blocked. */
! 1993: rf_PSStatusDelete(raidPtr, raidPtr->reconControl[row]
! 1994: ->pssTable, pssPtr);
! 1995: }
! 1996: }
! 1997: out:
! 1998: RF_UNLOCK_PSS_MUTEX(raidPtr, row, psid);
! 1999: return (0);
! 2000: }
CVSweb