Annotation of sys/dev/raidframe/rf_reconstruct.c, Revision 1.1.1.1
1.1 nbrk 1: /* $OpenBSD: rf_reconstruct.c,v 1.16 2007/06/05 00:38:22 deraadt Exp $ */
2: /* $NetBSD: rf_reconstruct.c,v 1.26 2000/06/04 02:05:13 oster Exp $ */
3:
4: /*
5: * Copyright (c) 1995 Carnegie-Mellon University.
6: * All rights reserved.
7: *
8: * Author: Mark Holland
9: *
10: * Permission to use, copy, modify and distribute this software and
11: * its documentation is hereby granted, provided that both the copyright
12: * notice and this permission notice appear in all copies of the
13: * software, derivative works or modified versions, and any portions
14: * thereof, and that both notices appear in supporting documentation.
15: *
16: * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
17: * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
18: * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
19: *
20: * Carnegie Mellon requests users of this software to return to
21: *
22: * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
23: * School of Computer Science
24: * Carnegie Mellon University
25: * Pittsburgh PA 15213-3890
26: *
27: * any improvements or extensions that they make and grant Carnegie the
28: * rights to redistribute these changes.
29: */
30:
31: /**************************************************************
32: *
33: * rf_reconstruct.c -- Code to perform on-line reconstruction.
34: *
35: **************************************************************/
36:
37: #include "rf_types.h"
38: #include <sys/time.h>
39: #include <sys/buf.h>
40: #include <sys/errno.h>
41:
42: #include <sys/types.h>
43: #include <sys/param.h>
44: #include <sys/systm.h>
45: #include <sys/proc.h>
46: #include <sys/ioctl.h>
47: #include <sys/fcntl.h>
48: #if __NETBSD__
49: #include <sys/vnode.h>
50: #endif
51:
52: #include "rf_raid.h"
53: #include "rf_reconutil.h"
54: #include "rf_revent.h"
55: #include "rf_reconbuffer.h"
56: #include "rf_acctrace.h"
57: #include "rf_etimer.h"
58: #include "rf_dag.h"
59: #include "rf_desc.h"
60: #include "rf_general.h"
61: #include "rf_freelist.h"
62: #include "rf_debugprint.h"
63: #include "rf_driver.h"
64: #include "rf_utils.h"
65: #include "rf_shutdown.h"
66:
67: #include "rf_kintf.h"
68:
69: /*
70: * Setting these to -1 causes them to be set to their default values if not set
71: * by debug options.
72: */
73:
/*
 * Debug printf wrappers.  Each DprintfN forwards its N value arguments
 * to rf_debug_printf() (which always takes a format plus eight pointer
 * slots) when rf_reconDebug is enabled; DDprintfN is the same gate.
 * Values are cast through unsigned long so that integers survive the
 * trip through the void * varargs slots.
 *
 * Every macro argument is fully parenthesized so that an expression
 * argument (e.g. "x + y") is cast as a whole, not just its first
 * operand (the original cast "(unsigned long)a" bound only to the
 * first token of the expansion).
 */
#define Dprintf(s)							\
do {									\
	if (rf_reconDebug)						\
		rf_debug_printf(s,					\
		    NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL);	\
} while (0)
#define Dprintf1(s,a)							\
do {									\
	if (rf_reconDebug)						\
		rf_debug_printf(s,					\
		    (void *)((unsigned long)(a)),			\
		    NULL, NULL, NULL, NULL, NULL, NULL, NULL);		\
} while (0)
#define Dprintf2(s,a,b)							\
do {									\
	if (rf_reconDebug)						\
		rf_debug_printf(s,					\
		    (void *)((unsigned long)(a)),			\
		    (void *)((unsigned long)(b)),			\
		    NULL, NULL, NULL, NULL, NULL, NULL);		\
} while (0)
#define Dprintf3(s,a,b,c)						\
do {									\
	if (rf_reconDebug)						\
		rf_debug_printf(s,					\
		    (void *)((unsigned long)(a)),			\
		    (void *)((unsigned long)(b)),			\
		    (void *)((unsigned long)(c)),			\
		    NULL, NULL, NULL, NULL, NULL);			\
} while (0)
#define Dprintf4(s,a,b,c,d)						\
do {									\
	if (rf_reconDebug)						\
		rf_debug_printf(s,					\
		    (void *)((unsigned long)(a)),			\
		    (void *)((unsigned long)(b)),			\
		    (void *)((unsigned long)(c)),			\
		    (void *)((unsigned long)(d)),			\
		    NULL, NULL, NULL, NULL);				\
} while (0)
#define Dprintf5(s,a,b,c,d,e)						\
do {									\
	if (rf_reconDebug)						\
		rf_debug_printf(s,					\
		    (void *)((unsigned long)(a)),			\
		    (void *)((unsigned long)(b)),			\
		    (void *)((unsigned long)(c)),			\
		    (void *)((unsigned long)(d)),			\
		    (void *)((unsigned long)(e)),			\
		    NULL, NULL, NULL);					\
} while (0)
#define Dprintf6(s,a,b,c,d,e,f)						\
do {									\
	if (rf_reconDebug)						\
		rf_debug_printf(s,					\
		    (void *)((unsigned long)(a)),			\
		    (void *)((unsigned long)(b)),			\
		    (void *)((unsigned long)(c)),			\
		    (void *)((unsigned long)(d)),			\
		    (void *)((unsigned long)(e)),			\
		    (void *)((unsigned long)(f)),			\
		    NULL, NULL);					\
} while (0)
#define Dprintf7(s,a,b,c,d,e,f,g)					\
do {									\
	if (rf_reconDebug)						\
		rf_debug_printf(s,					\
		    (void *)((unsigned long)(a)),			\
		    (void *)((unsigned long)(b)),			\
		    (void *)((unsigned long)(c)),			\
		    (void *)((unsigned long)(d)),			\
		    (void *)((unsigned long)(e)),			\
		    (void *)((unsigned long)(f)),			\
		    (void *)((unsigned long)(g)),			\
		    NULL);						\
} while (0)

#define DDprintf1(s,a)							\
do {									\
	if (rf_reconDebug)						\
		rf_debug_printf(s,					\
		    (void *)((unsigned long)(a)),			\
		    NULL, NULL, NULL, NULL, NULL, NULL, NULL);		\
} while (0)
#define DDprintf2(s,a,b)						\
do {									\
	if (rf_reconDebug)						\
		rf_debug_printf(s,					\
		    (void *)((unsigned long)(a)),			\
		    (void *)((unsigned long)(b)),			\
		    NULL, NULL, NULL, NULL, NULL, NULL);		\
} while (0)
166:
/* Freelist of reconstruction descriptors (rf_AllocRaidReconDesc). */
static RF_FreeList_t *rf_recond_freelist;
#define RF_MAX_FREE_RECOND 4
#define RF_RECOND_INC 1

RF_RaidReconDesc_t *rf_AllocRaidReconDesc(RF_Raid_t *,
	RF_RowCol_t, RF_RowCol_t, RF_RaidDisk_t *, int,
	RF_RowCol_t, RF_RowCol_t);
int  rf_ProcessReconEvent(RF_Raid_t *, RF_RowCol_t, RF_ReconEvent_t *);
int  rf_IssueNextReadRequest(RF_Raid_t *, RF_RowCol_t, RF_RowCol_t);
int  rf_TryToRead(RF_Raid_t *, RF_RowCol_t, RF_RowCol_t);
int  rf_ComputePSDiskOffsets(RF_Raid_t *, RF_StripeNum_t,
	RF_RowCol_t, RF_RowCol_t, RF_SectorNum_t *, RF_SectorNum_t *,
	RF_RowCol_t *, RF_RowCol_t *, RF_SectorNum_t *);
int  rf_ReconReadDoneProc(void *, int);
int  rf_ReconWriteDoneProc(void *, int);
void rf_CheckForNewMinHeadSep(RF_Raid_t *, RF_RowCol_t, RF_HeadSepLimit_t);
int  rf_CheckHeadSeparation(RF_Raid_t *, RF_PerDiskReconCtrl_t *,
	RF_RowCol_t, RF_RowCol_t, RF_HeadSepLimit_t, RF_ReconUnitNum_t);
void rf_ForceReconReadDoneProc(void *, int);
void rf_ShutdownReconstruction(void *);

/*
 * These functions are inlined on gcc.  If they are used more than
 * once, it is strongly advised to un-inline them.
 */
void rf_FreeReconDesc(RF_RaidReconDesc_t *);
int  rf_IssueNextWriteRequest(RF_Raid_t *, RF_RowCol_t);
int  rf_CheckForcedOrBlockedReconstruction(RF_Raid_t *,
	RF_ReconParityStripeStatus_t *, RF_PerDiskReconCtrl_t *,
	RF_RowCol_t, RF_RowCol_t, RF_StripeNum_t, RF_ReconUnitNum_t);
void rf_SignalReconDone(RF_Raid_t *);

/* One registered reconstruction-completion callback (singly linked). */
struct RF_ReconDoneProc_s {
	void (*proc) (RF_Raid_t *, void *);	/* callback to invoke */
	void *arg;				/* opaque callback argument */
	RF_ReconDoneProc_t *next;		/* next entry in list */
};

/* Freelist of recon-done callback entries (rf_RegisterReconDoneProc). */
static RF_FreeList_t *rf_rdp_freelist;
#define RF_MAX_FREE_RDP 4
#define RF_RDP_INC 1
208:
209: void
210: rf_SignalReconDone(RF_Raid_t *raidPtr)
211: {
212: RF_ReconDoneProc_t *p;
213:
214: RF_LOCK_MUTEX(raidPtr->recon_done_proc_mutex);
215: for (p = raidPtr->recon_done_procs; p; p = p->next) {
216: p->proc(raidPtr, p->arg);
217: }
218: RF_UNLOCK_MUTEX(raidPtr->recon_done_proc_mutex);
219: }
220:
221: int
222: rf_RegisterReconDoneProc(RF_Raid_t *raidPtr, void (*proc) (RF_Raid_t *, void *),
223: void *arg, RF_ReconDoneProc_t **handlep)
224: {
225: RF_ReconDoneProc_t *p;
226:
227: RF_FREELIST_GET(rf_rdp_freelist, p, next, (RF_ReconDoneProc_t *));
228: if (p == NULL)
229: return (ENOMEM);
230: p->proc = proc;
231: p->arg = arg;
232: RF_LOCK_MUTEX(raidPtr->recon_done_proc_mutex);
233: p->next = raidPtr->recon_done_procs;
234: raidPtr->recon_done_procs = p;
235: RF_UNLOCK_MUTEX(raidPtr->recon_done_proc_mutex);
236: if (handlep)
237: *handlep = p;
238: return (0);
239: }
240:
241: /*****************************************************************************
242: *
243: * Sets up the parameters that will be used by the reconstruction process.
244: * Currently there are none, except for those that the layout-specific
245: * configuration (e.g. rf_ConfigureDeclustered) routine sets up.
246: *
247: * In the kernel, we fire off the recon thread.
248: *
249: *****************************************************************************/
/*
 * Shutdown hook: release the two freelists allocated by
 * rf_ConfigureReconstruction().  The argument is unused (the
 * rf_ShutdownCreate() hook signature requires it).
 */
void
rf_ShutdownReconstruction(void *ignored)
{
	RF_FREELIST_DESTROY(rf_recond_freelist, next, (RF_RaidReconDesc_t *));
	RF_FREELIST_DESTROY(rf_rdp_freelist, next, (RF_ReconDoneProc_t *));
}
256:
257: int
258: rf_ConfigureReconstruction(RF_ShutdownList_t **listp)
259: {
260: int rc;
261:
262: RF_FREELIST_CREATE(rf_recond_freelist, RF_MAX_FREE_RECOND,
263: RF_RECOND_INC, sizeof(RF_RaidReconDesc_t));
264: if (rf_recond_freelist == NULL)
265: return (ENOMEM);
266: RF_FREELIST_CREATE(rf_rdp_freelist, RF_MAX_FREE_RDP,
267: RF_RDP_INC, sizeof(RF_ReconDoneProc_t));
268: if (rf_rdp_freelist == NULL) {
269: RF_FREELIST_DESTROY(rf_recond_freelist, next,
270: (RF_RaidReconDesc_t *));
271: return (ENOMEM);
272: }
273: rc = rf_ShutdownCreate(listp, rf_ShutdownReconstruction, NULL);
274: if (rc) {
275: RF_ERRORMSG3("Unable to add to shutdown list file %s line %d"
276: " rc=%d.\n", __FILE__, __LINE__, rc);
277: rf_ShutdownReconstruction(NULL);
278: return (rc);
279: }
280: return (0);
281: }
282:
283: RF_RaidReconDesc_t *
284: rf_AllocRaidReconDesc(RF_Raid_t *raidPtr, RF_RowCol_t row, RF_RowCol_t col,
285: RF_RaidDisk_t *spareDiskPtr, int numDisksDone, RF_RowCol_t srow,
286: RF_RowCol_t scol)
287: {
288:
289: RF_RaidReconDesc_t *reconDesc;
290:
291: RF_FREELIST_GET(rf_recond_freelist, reconDesc, next,
292: (RF_RaidReconDesc_t *));
293:
294: reconDesc->raidPtr = raidPtr;
295: reconDesc->row = row;
296: reconDesc->col = col;
297: reconDesc->spareDiskPtr = spareDiskPtr;
298: reconDesc->numDisksDone = numDisksDone;
299: reconDesc->srow = srow;
300: reconDesc->scol = scol;
301: reconDesc->state = 0;
302: reconDesc->next = NULL;
303:
304: return (reconDesc);
305: }
306:
/*
 * Print per-reconstruction statistics and return the descriptor to
 * the freelist.  Called once, after the reconstruction it describes
 * has finished (see rf_ContinueReconstructFailedDisk(), state 7).
 */
void
rf_FreeReconDesc(RF_RaidReconDesc_t *reconDesc)
{
#if RF_RECON_STATS > 0
	printf("RAIDframe: %qu recon event waits, %qu recon delays.\n",
	    reconDesc->numReconEventWaits, reconDesc->numReconExecDelays);
#endif	/* RF_RECON_STATS > 0 */

	printf("RAIDframe: %qu max exec ticks.\n",
	    reconDesc->maxReconExecTicks);

#if (RF_RECON_STATS > 0) || defined(_KERNEL)
	printf("\n");
#endif	/* (RF_RECON_STATS > 0) || _KERNEL */
	RF_FREELIST_FREE(rf_recond_freelist, reconDesc, next);
}
323:
324:
325: /*****************************************************************************
326: *
327: * Primary routine to reconstruct a failed disk. This should be called from
328: * within its own thread. It won't return until reconstruction completes,
329: * fails, or is aborted.
330: *
331: *****************************************************************************/
332: int
333: rf_ReconstructFailedDisk(RF_Raid_t *raidPtr, RF_RowCol_t row, RF_RowCol_t col)
334: {
335: RF_LayoutSW_t *lp;
336: int rc;
337:
338: lp = raidPtr->Layout.map;
339: if (lp->SubmitReconBuffer) {
340: /*
341: * The current infrastructure only supports reconstructing one
342: * disk at a time for each array.
343: */
344: RF_LOCK_MUTEX(raidPtr->mutex);
345: while (raidPtr->reconInProgress) {
346: RF_WAIT_COND(raidPtr->waitForReconCond, raidPtr->mutex);
347: }
348: raidPtr->reconInProgress++;
349: RF_UNLOCK_MUTEX(raidPtr->mutex);
350: rc = rf_ReconstructFailedDiskBasic(raidPtr, row, col);
351: RF_LOCK_MUTEX(raidPtr->mutex);
352: raidPtr->reconInProgress--;
353: RF_UNLOCK_MUTEX(raidPtr->mutex);
354: } else {
355: RF_ERRORMSG1("RECON: no way to reconstruct failed disk for"
356: " arch %c.\n", lp->parityConfig);
357: rc = EIO;
358: }
359: RF_SIGNAL_COND(raidPtr->waitForReconCond);
360: wakeup(&raidPtr->waitForReconCond); /*
361: * XXX Methinks this will be
362: * needed at some point... GO
363: */
364: return (rc);
365: }
366:
/*
 * Reconstruct the failed disk at [row][col] onto a spare.  With
 * distributed sparing the data goes to the distributed spare space of
 * the same row; otherwise the first available dedicated spare in row 0
 * is claimed.  On success the new component's label is rewritten.
 *
 * Returns 0 on success, EINVAL if the row is not degraded (distributed
 * case), ENOSPC if no spare is available, or the error from
 * rf_ContinueReconstructFailedDisk().
 *
 * Caller must hold the per-array reconstruction slot (reconInProgress),
 * as rf_ReconstructFailedDisk() does.
 */
int
rf_ReconstructFailedDiskBasic(RF_Raid_t *raidPtr, RF_RowCol_t row,
    RF_RowCol_t col)
{
	RF_ComponentLabel_t c_label;
	RF_RaidDisk_t *spareDiskPtr = NULL;
	RF_RaidReconDesc_t *reconDesc;
	RF_RowCol_t srow, scol;
	int numDisksDone = 0, rc;

	/* First look for a spare drive onto which to reconstruct the data. */
	/*
	 * Spare disk descriptors are stored in row 0. This may have to
	 * change eventually.
	 */

	RF_LOCK_MUTEX(raidPtr->mutex);
	RF_ASSERT(raidPtr->Disks[row][col].status == rf_ds_failed);

	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
		/* Distributed sparing: scol == -1 marks "no physical spare". */
		if (raidPtr->status[row] != rf_rs_degraded) {
			RF_ERRORMSG2("Unable to reconstruct disk at row %d"
			    " col %d because status not degraded.\n", row, col);
			RF_UNLOCK_MUTEX(raidPtr->mutex);
			return (EINVAL);
		}
		srow = row;
		scol = (-1);
	} else {
		/* Dedicated spares live past numCol in row 0. */
		srow = 0;
		for (scol = raidPtr->numCol;
		    scol < raidPtr->numCol + raidPtr->numSpare; scol++) {
			if (raidPtr->Disks[srow][scol].status == rf_ds_spare) {
				spareDiskPtr = &raidPtr->Disks[srow][scol];
				spareDiskPtr->status = rf_ds_used_spare;
				break;
			}
		}
		if (!spareDiskPtr) {
			RF_ERRORMSG2("Unable to reconstruct disk at row %d"
			    " col %d because no spares are available.\n",
			    row, col);
			RF_UNLOCK_MUTEX(raidPtr->mutex);
			return (ENOSPC);
		}
		printf("RECON: initiating reconstruction on row %d col %d"
		    " -> spare at row %d col %d.\n", row, col, srow, scol);
	}
	RF_UNLOCK_MUTEX(raidPtr->mutex);

	/*
	 * NOTE(review): the allocation result is used unchecked here;
	 * an OOM would fault below — confirm whether that is acceptable
	 * kernel policy for this path.
	 */
	reconDesc = rf_AllocRaidReconDesc((void *) raidPtr, row, col,
	    spareDiskPtr, numDisksDone, srow, scol);
	raidPtr->reconDesc = (void *) reconDesc;
#if RF_RECON_STATS > 0
	reconDesc->hsStallCount = 0;
	reconDesc->numReconExecDelays = 0;
	reconDesc->numReconEventWaits = 0;
#endif	/* RF_RECON_STATS > 0 */
	reconDesc->reconExecTimerRunning = 0;
	reconDesc->reconExecTicks = 0;
	reconDesc->maxReconExecTicks = 0;
	rc = rf_ContinueReconstructFailedDisk(reconDesc);

	if (!rc) {
		/* Fix up the component label. */
		/* Don't actually need the read here... */
		raidread_component_label(
		    raidPtr->raid_cinfo[srow][scol].ci_dev,
		    raidPtr->raid_cinfo[srow][scol].ci_vp,
		    &c_label);

		raid_init_component_label(raidPtr, &c_label);
		c_label.row = row;
		c_label.column = col;
		c_label.clean = RF_RAID_DIRTY;
		c_label.status = rf_ds_optimal;

		/* XXXX MORE NEEDED HERE. */

		raidwrite_component_label(
		    raidPtr->raid_cinfo[srow][scol].ci_dev,
		    raidPtr->raid_cinfo[srow][scol].ci_vp,
		    &c_label);

	}
	return (rc);
}
454:
455: /*
456: *
457: * Allow reconstructing a disk in-place -- i.e. component /dev/sd2e goes AWOL,
458: * and you don't get a spare until the next Monday. With this function
459: * (and hot-swappable drives) you can now put your new disk containing
460: * /dev/sd2e on the bus, scsictl it alive, and then use raidctl(8) to
461: * rebuild the data "on the spot".
462: *
463: */
464:
/*
 * Reconstruct a component "in place": the replacement disk occupies
 * the same [row][col] slot as the failed one (no dedicated spare is
 * consumed).  The component device is closed, re-looked-up and
 * re-opened, its geometry re-read via DIOCGPART, and then the normal
 * reconstruction machinery is run with the slot acting as its own
 * spare.  On success the component is marked optimal and its label
 * rewritten.
 *
 * Returns 0 on success, EINVAL for disallowed states (another disk
 * failed, recon already running, distributed sparing), EIO when the
 * layout has no reconstruction support, or an error from the vnode
 * operations.
 */
int
rf_ReconstructInPlace(RF_Raid_t *raidPtr, RF_RowCol_t row, RF_RowCol_t col)
{
	RF_RaidDisk_t *spareDiskPtr = NULL;
	RF_RaidReconDesc_t *reconDesc;
	RF_LayoutSW_t *lp;
	RF_RaidDisk_t *badDisk;	/* NOTE(review): set below but never read. */
	RF_ComponentLabel_t c_label;
	int numDisksDone = 0, rc;
	struct partinfo dpart;
	struct vnode *vp;
	struct vattr va;
	struct proc *proc;
	int retcode;
	int ac;

	lp = raidPtr->Layout.map;
	if (lp->SubmitReconBuffer) {
		/*
		 * The current infrastructure only supports reconstructing one
		 * disk at a time for each array.
		 */
		RF_LOCK_MUTEX(raidPtr->mutex);
		if ((raidPtr->Disks[row][col].status == rf_ds_optimal) &&
		    (raidPtr->numFailures > 0)) {
			/* XXX 0 above shouldn't be constant !!! */
			/*
			 * Some component other than this has failed.
			 * Let's not make things worse than they already
			 * are...
			 */
#ifdef RAIDDEBUG
			printf("RAIDFRAME: Unable to reconstruct to disk at:\n"
			    " Row: %d Col: %d Too many failures.\n",
			    row, col);
#endif	/* RAIDDEBUG */
			RF_UNLOCK_MUTEX(raidPtr->mutex);
			return (EINVAL);
		}
		if (raidPtr->Disks[row][col].status == rf_ds_reconstructing) {
#ifdef RAIDDEBUG
			printf("RAIDFRAME: Unable to reconstruct to disk at:\n"
			    " Row: %d Col: %d Reconstruction already"
			    " occurring !\n", row, col);
#endif	/* RAIDDEBUG */

			RF_UNLOCK_MUTEX(raidPtr->mutex);
			return (EINVAL);
		}


		if (raidPtr->Disks[row][col].status != rf_ds_failed) {
			/* "It's gone..."  Mark it failed so the rebuild
			 * machinery treats the slot consistently. */
			raidPtr->numFailures++;
			raidPtr->Disks[row][col].status = rf_ds_failed;
			raidPtr->status[row] = rf_rs_degraded;
			rf_update_component_labels(raidPtr,
			    RF_NORMAL_COMPONENT_UPDATE);
		}

		/* Serialize: only one reconstruction per array. */
		while (raidPtr->reconInProgress) {
			RF_WAIT_COND(raidPtr->waitForReconCond, raidPtr->mutex);
		}

		raidPtr->reconInProgress++;

		/*
		 * First look for a spare drive onto which to reconstruct
		 * the data.  Spare disk descriptors are stored in row 0.
		 * This may have to change eventually.
		 */

		/*
		 * Actually, we don't care if it's failed or not...
		 * On a RAID set with correct parity, this function
		 * should be callable on any component without ill effects.
		 */
		/*
		 * RF_ASSERT(raidPtr->Disks[row][col].status == rf_ds_failed);
		 */

		if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
			RF_ERRORMSG2("Unable to reconstruct to disk at row %d"
			    " col %d: operation not supported for"
			    " RF_DISTRIBUTE_SPARE.\n", row, col);

			raidPtr->reconInProgress--;
			RF_UNLOCK_MUTEX(raidPtr->mutex);
			return (EINVAL);
		}

		/*
		 * XXX Need goop here to see if the disk is alive,
		 * and, if not, make it so...
		 */

		badDisk = &raidPtr->Disks[row][col];

		proc = raidPtr->recon_thread;

		/*
		 * This device may have been opened successfully the
		 * first time. Close it before trying to open it again...
		 */

		if (raidPtr->raid_cinfo[row][col].ci_vp != NULL) {
			printf("Closing the opened device: %s\n",
			    raidPtr->Disks[row][col].devname);
			vp = raidPtr->raid_cinfo[row][col].ci_vp;
			ac = raidPtr->Disks[row][col].auto_configured;
			rf_close_component(raidPtr, vp, ac);
			raidPtr->raid_cinfo[row][col].ci_vp = NULL;
		}
		/*
		 * Note that this disk was *not* auto_configured (any longer).
		 */
		raidPtr->Disks[row][col].auto_configured = 0;

		printf("About to (re-)open the device for rebuilding: %s\n",
		    raidPtr->Disks[row][col].devname);

		retcode = raidlookup(raidPtr->Disks[row][col].devname,
		    proc, &vp);

		if (retcode) {
			printf("raid%d: rebuilding: raidlookup on device: %s"
			    " failed: %d !\n", raidPtr->raidid,
			    raidPtr->Disks[row][col].devname, retcode);

			/*
			 * XXX the component isn't responding properly...
			 * Must still be dead :-(
			 */
			raidPtr->reconInProgress--;
			RF_UNLOCK_MUTEX(raidPtr->mutex);
			return(retcode);

		} else {

			/*
			 * Ok, so we can at least do a lookup...
			 * How about actually getting a vp for it ?
			 */

			if ((retcode =
			    VOP_GETATTR(vp, &va, proc->p_ucred, proc)) != 0) {
				raidPtr->reconInProgress--;
				RF_UNLOCK_MUTEX(raidPtr->mutex);
				return(retcode);
			}
			/* Ask the disk driver for the partition geometry. */
			retcode = VOP_IOCTL(vp, DIOCGPART, (caddr_t) & dpart,
			    FREAD, proc->p_ucred, proc);
			if (retcode) {
				raidPtr->reconInProgress--;
				RF_UNLOCK_MUTEX(raidPtr->mutex);
				return(retcode);
			}
			raidPtr->Disks[row][col].blockSize =
			    dpart.disklab->d_secsize;

			/* Reserve rf_protectedSectors for the label area. */
			raidPtr->Disks[row][col].numBlocks =
			    DL_GETPSIZE(dpart.part) - rf_protectedSectors;

			raidPtr->raid_cinfo[row][col].ci_vp = vp;
			raidPtr->raid_cinfo[row][col].ci_dev = va.va_rdev;

			raidPtr->Disks[row][col].dev = va.va_rdev;

			/*
			 * We allow the user to specify that only a
			 * fraction of the disks should be used this is
			 * just for debug: it speeds up the parity scan.
			 */
			raidPtr->Disks[row][col].numBlocks =
			    raidPtr->Disks[row][col].numBlocks *
			    rf_sizePercentage / 100;
		}

		/* The slot acts as its own spare for the rebuild. */
		spareDiskPtr = &raidPtr->Disks[row][col];
		spareDiskPtr->status = rf_ds_used_spare;

		printf("RECON: Initiating in-place reconstruction on\n");
		printf(" row %d col %d -> spare at row %d col %d.\n",
		    row, col, row, col);

		RF_UNLOCK_MUTEX(raidPtr->mutex);

		reconDesc = rf_AllocRaidReconDesc((void *) raidPtr, row, col,
		    spareDiskPtr, numDisksDone, row, col);
		raidPtr->reconDesc = (void *) reconDesc;
#if RF_RECON_STATS > 0
		reconDesc->hsStallCount = 0;
		reconDesc->numReconExecDelays = 0;
		reconDesc->numReconEventWaits = 0;
#endif	/* RF_RECON_STATS > 0 */
		reconDesc->reconExecTimerRunning = 0;
		reconDesc->reconExecTicks = 0;
		reconDesc->maxReconExecTicks = 0;
		rc = rf_ContinueReconstructFailedDisk(reconDesc);

		RF_LOCK_MUTEX(raidPtr->mutex);
		raidPtr->reconInProgress--;
		RF_UNLOCK_MUTEX(raidPtr->mutex);

	} else {
		RF_ERRORMSG1("RECON: no way to reconstruct failed disk for"
		    " arch %c.\n", lp->parityConfig);
		rc = EIO;
	}
	RF_LOCK_MUTEX(raidPtr->mutex);

	if (!rc) {
		/*
		 * Need to set these here, as at this point it'll be claiming
		 * that the disk is in rf_ds_spared ! But we know better :-)
		 */

		raidPtr->Disks[row][col].status = rf_ds_optimal;
		raidPtr->status[row] = rf_rs_optimal;

		/* Fix up the component label. */
		/* Don't actually need the read here... */
		raidread_component_label(
		    raidPtr->raid_cinfo[row][col].ci_dev,
		    raidPtr->raid_cinfo[row][col].ci_vp,
		    &c_label);

		raid_init_component_label(raidPtr, &c_label);

		c_label.row = row;
		c_label.column = col;

		raidwrite_component_label(raidPtr->raid_cinfo[row][col].ci_dev,
		    raidPtr->raid_cinfo[row][col].ci_vp, &c_label);

	}
	RF_UNLOCK_MUTEX(raidPtr->mutex);
	RF_SIGNAL_COND(raidPtr->waitForReconCond);
	wakeup(&raidPtr->waitForReconCond);
	return (rc);
}
706:
707:
/*
 * The reconstruction state machine.  Drives a rebuild through its
 * states (stored in reconDesc->state so the function can be re-entered
 * as an event continuation — see rf_GetNextReconEvent() below):
 *
 *   0  quiesce the array and allocate trace records
 *   1  install recon control, mark disk/row reconstructing, issue the
 *      initial read on every surviving disk
 *   2  resume user requests
 *   3  process events until all disks have been fully read
 *   4  process events until all pending writes have completed
 *   5  re-quiesce the array and dump user stats
 *   6  mark the disk spared/dist-spared and compute elapsed time
 *   7  resume requests, print stats, free recon state
 *
 * Cases deliberately fall through; states 0/5 set ->state before
 * blocking so a re-entry resumes in the right place.  Returns 0.
 */
int
rf_ContinueReconstructFailedDisk(RF_RaidReconDesc_t *reconDesc)
{
	RF_Raid_t *raidPtr = reconDesc->raidPtr;
	RF_RowCol_t row = reconDesc->row;
	RF_RowCol_t col = reconDesc->col;
	RF_RowCol_t srow = reconDesc->srow;
	RF_RowCol_t scol = reconDesc->scol;
	RF_ReconMap_t *mapPtr;

	RF_ReconEvent_t *event;
	struct timeval etime, elpsd;
	unsigned long xor_s, xor_resid_us;
	int retcode, i, ds;

	switch (reconDesc->state) {
	case 0:
		raidPtr->accumXorTimeUs = 0;

		/* Create one trace record per physical disk. */
		RF_Malloc(raidPtr->recon_tracerecs, raidPtr->numCol *
		    sizeof(RF_AccTraceEntry_t), (RF_AccTraceEntry_t *));

		/*
		 * Quiesce the array prior to starting recon. This is needed
		 * to assure no nasty interactions with pending user writes.
		 * We need to do this before we change the disk or row status.
		 */
		reconDesc->state = 1;

		Dprintf("RECON: begin request suspend.\n");
		/* NOTE(review): retcode is not checked here — confirm
		 * rf_SuspendNewRequestsAndWait() cannot fail on this path. */
		retcode = rf_SuspendNewRequestsAndWait(raidPtr);
		Dprintf("RECON: end request suspend.\n");
		rf_StartUserStats(raidPtr);	/*
						 * Zero out the stats kept on
						 * user accs.
						 */
		/* Fall through to state 1. */
	case 1:
		/* FALLTHROUGH */
		RF_LOCK_MUTEX(raidPtr->mutex);

		/*
		 * Create the reconstruction control pointer and install it in
		 * the right slot.
		 */
		raidPtr->reconControl[row] =
		    rf_MakeReconControl(reconDesc, row, col, srow, scol);
		mapPtr = raidPtr->reconControl[row]->reconMap;
		raidPtr->status[row] = rf_rs_reconstructing;
		raidPtr->Disks[row][col].status = rf_ds_reconstructing;
		raidPtr->Disks[row][col].spareRow = srow;
		raidPtr->Disks[row][col].spareCol = scol;

		RF_UNLOCK_MUTEX(raidPtr->mutex);

		RF_GETTIME(raidPtr->reconControl[row]->starttime);

		/*
		 * Now start up the actual reconstruction: issue a read for
		 * each surviving disk.
		 */

		reconDesc->numDisksDone = 0;
		for (i = 0; i < raidPtr->numCol; i++) {
			if (i != col) {
				/*
				 * Find and issue the next I/O on the
				 * indicated disk.  A nonzero return means
				 * that disk has nothing (more) to read.
				 */
				if (rf_IssueNextReadRequest(raidPtr, row, i)) {
					Dprintf2("RECON: done issuing for r%d"
					    " c%d.\n", row, i);
					reconDesc->numDisksDone++;
				}
			}
		}

		reconDesc->state = 2;
		/* FALLTHROUGH */

	case 2:
		Dprintf("RECON: resume requests.\n");
		rf_ResumeNewRequests(raidPtr);

		reconDesc->state = 3;
		/* FALLTHROUGH */

	case 3:

		/*
		 * Process reconstruction events until all disks report that
		 * they've completed all work.
		 */
		mapPtr = raidPtr->reconControl[row]->reconMap;

		while (reconDesc->numDisksDone < raidPtr->numCol - 1) {

			event = rf_GetNextReconEvent(reconDesc, row,
			    (void (*) (void *)) rf_ContinueReconstructFailedDisk,
			    reconDesc);
			RF_ASSERT(event);

			if (rf_ProcessReconEvent(raidPtr, row, event))
				reconDesc->numDisksDone++;
			/* Refresh progress counters for status reporting. */
			raidPtr->reconControl[row]->numRUsTotal =
			    mapPtr->totalRUs;
			raidPtr->reconControl[row]->numRUsComplete =
			    mapPtr->totalRUs -
			    rf_UnitsLeftToReconstruct(mapPtr);

			raidPtr->reconControl[row]->percentComplete =
			    (raidPtr->reconControl[row]->numRUsComplete * 100 /
			     raidPtr->reconControl[row]->numRUsTotal);
			if (rf_prReconSched) {
				rf_PrintReconSchedule(
				    raidPtr->reconControl[row]->reconMap,
				    &(raidPtr->reconControl[row]->starttime));
			}
		}

		reconDesc->state = 4;
		/* FALLTHROUGH */

	case 4:
		mapPtr = raidPtr->reconControl[row]->reconMap;
		if (rf_reconDebug) {
			printf("RECON: all reads completed.\n");
		}
		/*
		 * At this point all the reads have completed. We now wait
		 * for any pending writes to complete, and then we're done.
		 */

		while (rf_UnitsLeftToReconstruct(
		    raidPtr->reconControl[row]->reconMap) > 0) {

			event = rf_GetNextReconEvent(reconDesc, row,
			    (void (*) (void *)) rf_ContinueReconstructFailedDisk,
			    reconDesc);
			RF_ASSERT(event);

			/* Ignore return code. */
			(void) rf_ProcessReconEvent(raidPtr, row, event);
			raidPtr->reconControl[row]->percentComplete =
			    100 - (rf_UnitsLeftToReconstruct(mapPtr) * 100 /
			    mapPtr->totalRUs);
			if (rf_prReconSched) {
				rf_PrintReconSchedule(
				    raidPtr->reconControl[row]->reconMap,
				    &(raidPtr->reconControl[row]->starttime));
			}
		}
		reconDesc->state = 5;
		/* FALLTHROUGH */

	case 5:
		/*
		 * Success: mark the dead disk as reconstructed. We quiesce
		 * the array here to assure no nasty interactions with pending
		 * user accesses, when we free up the psstatus structure as
		 * part of FreeReconControl().
		 */

		reconDesc->state = 6;

		retcode = rf_SuspendNewRequestsAndWait(raidPtr);
		rf_StopUserStats(raidPtr);
		rf_PrintUserStats(raidPtr);	/*
						 * Print out the stats on user
						 * accs accumulated during
						 * recon.
						 */

		/* Fall through to state 6. */
	case 6:
		RF_LOCK_MUTEX(raidPtr->mutex);
		raidPtr->numFailures--;
		ds = (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE);
		raidPtr->Disks[row][col].status = (ds) ? rf_ds_dist_spared :
		    rf_ds_spared;
		raidPtr->status[row] = (ds) ? rf_rs_reconfigured :
		    rf_rs_optimal;
		RF_UNLOCK_MUTEX(raidPtr->mutex);
		RF_GETTIME(etime);
		RF_TIMEVAL_DIFF(&(raidPtr->reconControl[row]->starttime),
		    &etime, &elpsd);

		/*
		 * XXX -- Why is state 7 different from state 6 if there is no
		 * return() here ? -- XXX Note that I set elpsd above & use it
		 * below, so if you put a return here you'll have to fix this.
		 * (also, FreeReconControl is called below).
		 */

	case 7:
		/* FALLTHROUGH from state 6 (see note above). */

		rf_ResumeNewRequests(raidPtr);

		printf("Reconstruction of disk at row %d col %d completed.\n",
		    row, col);
		xor_s = raidPtr->accumXorTimeUs / 1000000;
		xor_resid_us = raidPtr->accumXorTimeUs % 1000000;
		printf("Recon time was %d.%06d seconds, accumulated XOR time"
		    " was %ld us (%ld.%06ld).\n", (int) elpsd.tv_sec,
		    (int) elpsd.tv_usec, raidPtr->accumXorTimeUs, xor_s,
		    xor_resid_us);
		printf(" (start time %d sec %d usec, end time %d sec %d"
		    " usec)\n",
		    (int) raidPtr->reconControl[row]->starttime.tv_sec,
		    (int) raidPtr->reconControl[row]->starttime.tv_usec,
		    (int) etime.tv_sec, (int) etime.tv_usec);

#if RF_RECON_STATS > 0
		printf("Total head-sep stall count was %d.\n",
		    (int) reconDesc->hsStallCount);
#endif	/* RF_RECON_STATS > 0 */
		rf_FreeReconControl(raidPtr, row);
		RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol *
		    sizeof(RF_AccTraceEntry_t));
		rf_FreeReconDesc(reconDesc);

	}

	rf_SignalReconDone(raidPtr);
	return (0);
}
930:
931:
932: /*****************************************************************************
933: * Do the right thing upon each reconstruction event.
934: * Returns nonzero if and only if there is nothing left unread on the
935: * indicated disk.
936: *****************************************************************************/
937: int
938: rf_ProcessReconEvent(RF_Raid_t *raidPtr, RF_RowCol_t frow,
939: RF_ReconEvent_t *event)
940: {
941: int retcode = 0, submitblocked;
942: RF_ReconBuffer_t *rbuf;
943: RF_SectorCount_t sectorsPerRU;
944:
945: Dprintf1("RECON: rf_ProcessReconEvent type %d.\n", event->type);
946:
947: switch (event->type) {
948:
949: /* A read I/O has completed. */
950: case RF_REVENT_READDONE:
951: rbuf = raidPtr->reconControl[frow]
952: ->perDiskInfo[event->col].rbuf;
953: Dprintf3("RECON: READDONE EVENT: row %d col %d psid %ld.\n",
954: frow, event->col, rbuf->parityStripeID);
955: Dprintf7("RECON: done read psid %ld buf %lx %02x %02x %02x"
956: " %02x %02x.\n", rbuf->parityStripeID, rbuf->buffer,
957: rbuf->buffer[0] & 0xff, rbuf->buffer[1] & 0xff,
958: rbuf->buffer[2] & 0xff, rbuf->buffer[3] & 0xff,
959: rbuf->buffer[4] & 0xff);
960: rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
961: submitblocked = rf_SubmitReconBuffer(rbuf, 0, 0);
962: Dprintf1("RECON: submitblocked=%d.\n", submitblocked);
963: if (!submitblocked)
964: retcode = rf_IssueNextReadRequest(raidPtr, frow,
965: event->col);
966: break;
967:
968: /* A write I/O has completed. */
969: case RF_REVENT_WRITEDONE:
970: if (rf_floatingRbufDebug) {
971: rf_CheckFloatingRbufCount(raidPtr, 1);
972: }
973: sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit *
974: raidPtr->Layout.SUsPerRU;
975: rbuf = (RF_ReconBuffer_t *) event->arg;
976: rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
977: Dprintf3("RECON: WRITEDONE EVENT: psid %d ru %d"
978: " (%d %% complete).\n",
979: rbuf->parityStripeID, rbuf->which_ru,
980: raidPtr->reconControl[frow]->percentComplete);
981: rf_ReconMapUpdate(raidPtr, raidPtr->reconControl[frow]
982: ->reconMap, rbuf->failedDiskSectorOffset,
983: rbuf->failedDiskSectorOffset + sectorsPerRU - 1);
984: rf_RemoveFromActiveReconTable(raidPtr, frow,
985: rbuf->parityStripeID, rbuf->which_ru);
986:
987: if (rbuf->type == RF_RBUF_TYPE_FLOATING) {
988: RF_LOCK_MUTEX(raidPtr->reconControl[frow]->rb_mutex);
989: raidPtr->numFullReconBuffers--;
990: rf_ReleaseFloatingReconBuffer(raidPtr, frow, rbuf);
991: RF_UNLOCK_MUTEX(raidPtr->reconControl[frow]->rb_mutex);
992: } else
993: if (rbuf->type == RF_RBUF_TYPE_FORCED)
994: rf_FreeReconBuffer(rbuf);
995: else
996: RF_ASSERT(0);
997: break;
998:
999: /* A buffer-stall condition has been cleared. */
1000: case RF_REVENT_BUFCLEAR:
1001: Dprintf2("RECON: BUFCLEAR EVENT: row %d col %d.\n", frow,
1002: event->col);
1003: submitblocked = rf_SubmitReconBuffer(raidPtr
1004: ->reconControl[frow]->perDiskInfo[event->col].rbuf, 0,
1005: (int) (long) event->arg);
1006: RF_ASSERT(!submitblocked); /*
1007: * We wouldn't have gotten the
1008: * BUFCLEAR event if we
1009: * couldn't submit.
1010: */
1011: retcode = rf_IssueNextReadRequest(raidPtr, frow, event->col);
1012: break;
1013:
1014: /* A user-write reconstruction blockage has been cleared. */
1015: case RF_REVENT_BLOCKCLEAR:
1016: DDprintf2("RECON: BLOCKCLEAR EVENT: row %d col %d.\n",
1017: frow, event->col);
1018: retcode = rf_TryToRead(raidPtr, frow, event->col);
1019: break;
1020:
1021: /*
1022: * A max-head-separation reconstruction blockage has been
1023: * cleared.
1024: */
1025: case RF_REVENT_HEADSEPCLEAR:
1026: Dprintf2("RECON: HEADSEPCLEAR EVENT: row %d col %d.\n",
1027: frow, event->col);
1028: retcode = rf_TryToRead(raidPtr, frow, event->col);
1029: break;
1030:
1031: /* A buffer has become ready to write. */
1032: case RF_REVENT_BUFREADY:
1033: Dprintf2("RECON: BUFREADY EVENT: row %d col %d.\n",
1034: frow, event->col);
1035: retcode = rf_IssueNextWriteRequest(raidPtr, frow);
1036: if (rf_floatingRbufDebug) {
1037: rf_CheckFloatingRbufCount(raidPtr, 1);
1038: }
1039: break;
1040:
1041: /*
1042: * We need to skip the current RU entirely because it got
1043: * recon'd while we were waiting for something else to happen.
1044: */
1045: case RF_REVENT_SKIP:
1046: DDprintf2("RECON: SKIP EVENT: row %d col %d.\n",
1047: frow, event->col);
1048: retcode = rf_IssueNextReadRequest(raidPtr, frow, event->col);
1049: break;
1050:
1051: /*
1052: * A forced-reconstruction read access has completed. Just
1053: * submit the buffer.
1054: */
1055: case RF_REVENT_FORCEDREADDONE:
1056: rbuf = (RF_ReconBuffer_t *) event->arg;
1057: rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg);
1058: DDprintf2("RECON: FORCEDREADDONE EVENT: row %d col %d.\n",
1059: frow, event->col);
1060: submitblocked = rf_SubmitReconBuffer(rbuf, 1, 0);
1061: RF_ASSERT(!submitblocked);
1062: break;
1063:
1064: default:
1065: RF_PANIC();
1066: }
1067: rf_FreeReconEventDesc(event);
1068: return (retcode);
1069: }
1070:
1071: /*****************************************************************************
1072: *
1073: * Find the next thing that's needed on the indicated disk, and issue
1074: * a read request for it. We assume that the reconstruction buffer
1075: * associated with this process is free to receive the data. If
1076: * reconstruction is blocked on the indicated RU, we issue a
1077: * blockage-release request instead of a physical disk read request.
1078: * If the current disk gets too far ahead of the others, we issue a
1079: * head-separation wait request and return.
1080: *
1081: * ctrl->{ru_count, curPSID, diskOffset} and
1082: * rbuf->failedDiskSectorOffset are maintained to point to the unit
1083: * we're currently accessing. Note that this deviates from the
1084: * standard C idiom of having counters point to the next thing to be
1085: * accessed. This allows us to easily retry when we're blocked by
1086: * head separation or reconstruction-blockage events.
1087: *
1088: * Returns nonzero if and only if there is nothing left unread on the
1089: * indicated disk.
1090: *
1091: *****************************************************************************/
/*
 * Advance this disk's per-disk counters (ru_count/curPSID/diskOffset)
 * to the next reconstruction unit that still needs to be read, then
 * attempt to issue the read via rf_TryToRead().  Returns 1 if and only
 * if nothing is left unread on the indicated disk.
 */
int
rf_IssueNextReadRequest(RF_Raid_t *raidPtr, RF_RowCol_t row, RF_RowCol_t col)
{
	RF_PerDiskReconCtrl_t *ctrl =
	    &raidPtr->reconControl[row]->perDiskInfo[col];
	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
	RF_ReconBuffer_t *rbuf = ctrl->rbuf;
	/* Number of reconstruction units per parity unit. */
	RF_ReconUnitCount_t RUsPerPU =
	    layoutPtr->SUsPerPU / layoutPtr->SUsPerRU;
	RF_SectorCount_t sectorsPerRU =
	    layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU;
	int do_new_check = 0, retcode = 0, status;

	/*
	 * If we are currently the slowest disk, mark that we have to do a new
	 * check.
	 */
	if (ctrl->headSepCounter <=
	    raidPtr->reconControl[row]->minHeadSepCounter)
		do_new_check = 1;

	while (1) {

		ctrl->ru_count++;
		if (ctrl->ru_count < RUsPerPU) {
			/*
			 * Still inside the current parity stripe: step one
			 * RU forward on both the current and failed disks.
			 */
			ctrl->diskOffset += sectorsPerRU;
			rbuf->failedDiskSectorOffset += sectorsPerRU;
		} else {
			ctrl->curPSID++;
			ctrl->ru_count = 0;
			/* code left over from when head-sep was based on
			 * parity stripe id */
			if (ctrl->curPSID >=
			    raidPtr->reconControl[row]->lastPSID) {
				rf_CheckForNewMinHeadSep(raidPtr, row,
				    ++(ctrl->headSepCounter));
				return (1);	/* Finito ! */
			}
			/*
			 * Find the disk offsets of the start of the parity
			 * stripe on both the current disk and the failed
			 * disk. Skip this entire parity stripe if either disk
			 * does not appear in the indicated PS.
			 */
			status = rf_ComputePSDiskOffsets(raidPtr,
			    ctrl->curPSID, row, col, &ctrl->diskOffset,
			    &rbuf->failedDiskSectorOffset, &rbuf->spRow,
			    &rbuf->spCol, &rbuf->spOffset);
			if (status) {
				/*
				 * Force the next loop iteration to advance
				 * straight to the following parity stripe.
				 */
				ctrl->ru_count = RUsPerPU - 1;
				continue;
			}
		}
		rbuf->which_ru = ctrl->ru_count;

		/* Skip this RU if it's already been reconstructed. */
		if (rf_CheckRUReconstructed(raidPtr->reconControl[row]
		    ->reconMap, rbuf->failedDiskSectorOffset)) {
			Dprintf2("Skipping psid %ld ru %d: already"
			    " reconstructed.\n", ctrl->curPSID, ctrl->ru_count);
			continue;
		}
		break;
	}
	ctrl->headSepCounter++;
	if (do_new_check)	/* Update min if needed. */
		rf_CheckForNewMinHeadSep(raidPtr, row, ctrl->headSepCounter);


	/*
	 * At this point, we have definitely decided what to do, and we have
	 * only to see if we can actually do it now.
	 */
	rbuf->parityStripeID = ctrl->curPSID;
	rbuf->which_ru = ctrl->ru_count;
	/* Reset this column's trace record and start timing the access. */
	bzero((char *) &raidPtr->recon_tracerecs[col],
	    sizeof(raidPtr->recon_tracerecs[col]));
	raidPtr->recon_tracerecs[col].reconacc = 1;
	RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer);
	retcode = rf_TryToRead(raidPtr, row, col);
	return (retcode);
}
1174:
1175: /*
1176: * Tries to issue the next read on the indicated disk. We may be
1177: * blocked by (a) the heads being too far apart, or (b) recon on the
1178: * indicated RU being blocked due to a write by a user thread. In
1179: * this case, we issue a head-sep or blockage wait request, which will
1180: * cause this same routine to be invoked again later when the blockage
1181: * has cleared.
1182: */
1183:
/*
 * Attempt to issue the reconstruction read previously selected by
 * rf_IssueNextReadRequest() on disk (row, col).  Always returns 0:
 * when blocked (head separation, user write, or prior recon), a
 * wakeup event is scheduled instead and this routine will be invoked
 * again later.  Holds the PSS mutex for the stripe while it runs.
 */
int
rf_TryToRead(RF_Raid_t *raidPtr, RF_RowCol_t row, RF_RowCol_t col)
{
	RF_PerDiskReconCtrl_t *ctrl =
	    &raidPtr->reconControl[row]->perDiskInfo[col];
	RF_SectorCount_t sectorsPerRU =
	    raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU;
	RF_StripeNum_t psid = ctrl->curPSID;
	RF_ReconUnitNum_t which_ru = ctrl->ru_count;
	RF_DiskQueueData_t *req;
	int status, created = 0;
	RF_ReconParityStripeStatus_t *pssPtr;

	/*
	 * If the current disk is too far ahead of the others, issue a
	 * head-separation wait and return.
	 */
	if (rf_CheckHeadSeparation(raidPtr, ctrl, row, col,
	    ctrl->headSepCounter, which_ru))
		return (0);
	RF_LOCK_PSS_MUTEX(raidPtr, row, psid);
	/* Look up (creating if absent) the status entry for this RU. */
	pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl[row]
	    ->pssTable, psid, which_ru, RF_PSS_CREATE, &created);

	/*
	 * If recon is blocked on the indicated parity stripe, issue a
	 * block-wait request and return. This also must mark the indicated RU
	 * in the stripe as under reconstruction if not blocked.
	 */
	status = rf_CheckForcedOrBlockedReconstruction(raidPtr, pssPtr, ctrl,
	    row, col, psid, which_ru);
	if (status == RF_PSS_RECON_BLOCKED) {
		Dprintf2("RECON: Stalling psid %ld ru %d: recon blocked.\n",
		    psid, which_ru);
		goto out;
	} else
		if (status == RF_PSS_FORCED_ON_WRITE) {
			/* A forced recon already covers this RU; skip it. */
			rf_CauseReconEvent(raidPtr, row, col, NULL,
			    RF_REVENT_SKIP);
			goto out;
		}
	/*
	 * Make one last check to be sure that the indicated RU didn't get
	 * reconstructed while we were waiting for something else to happen.
	 * This is unfortunate in that it causes us to make this check twice
	 * in the normal case. Might want to make some attempt to re-work
	 * this so that we only do this check if we've definitely blocked on
	 * one of the above checks. When this condition is detected, we may
	 * have just created a bogus status entry, which we need to delete.
	 */
	if (rf_CheckRUReconstructed(raidPtr->reconControl[row]->reconMap,
	    ctrl->rbuf->failedDiskSectorOffset)) {
		Dprintf2("RECON: Skipping psid %ld ru %d: prior recon after"
		    " stall.\n", psid, which_ru);
		if (created)
			rf_PSStatusDelete(raidPtr,
			    raidPtr->reconControl[row]->pssTable, pssPtr);
		rf_CauseReconEvent(raidPtr, row, col, NULL, RF_REVENT_SKIP);
		goto out;
	}
	/* Found something to read. Issue the I/O. */
	Dprintf5("RECON: Read for psid %ld on row %d col %d offset %ld"
	    " buf %lx.\n", psid, row, col, ctrl->diskOffset,
	    ctrl->rbuf->buffer);
	/* Charge the elapsed time to the start-to-fetch bucket. */
	RF_ETIMER_STOP(raidPtr->recon_tracerecs[col].recon_timer);
	RF_ETIMER_EVAL(raidPtr->recon_tracerecs[col].recon_timer);
	raidPtr->recon_tracerecs[col].specific.recon.recon_start_to_fetch_us =
	    RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[col].recon_timer);
	RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer);

	/*
	 * Should be ok to use a NULL proc pointer here, all the bufs we use
	 * should be in kernel space.
	 */
	req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, ctrl->diskOffset,
	    sectorsPerRU, ctrl->rbuf->buffer, psid, which_ru,
	    rf_ReconReadDoneProc, (void *) ctrl, NULL,
	    &raidPtr->recon_tracerecs[col], (void *) raidPtr, 0, NULL);

	RF_ASSERT(req);	/* XXX -- Fix this. -- XXX */

	ctrl->rbuf->arg = (void *) req;
	rf_DiskIOEnqueue(&raidPtr->Queues[row][col], req, RF_IO_RECON_PRIORITY);
	/* Remember that the read for this column has been issued. */
	pssPtr->issued[col] = 1;

out:
	RF_UNLOCK_PSS_MUTEX(raidPtr, row, psid);
	return (0);
}
1273:
1274:
1275: /*
1276: * Given a parity stripe ID, we want to find out whether both the
1277: * current disk and the failed disk exist in that parity stripe. If
1278: * not, we want to skip this whole PS. If so, we want to find the
1279: * disk offset of the start of the PS on both the current disk and the
1280: * failed disk.
1281: *
1282: * This works by getting a list of disks comprising the indicated
1283: * parity stripe, and searching the list for the current and failed
1284: * disks. Once we've decided they both exist in the parity stripe, we
1285: * need to decide whether each is data or parity, so that we'll know
1286: * which mapping function to call to get the corresponding disk
1287: * offsets.
1288: *
1289: * This is kind of unpleasant, but doing it this way allows the
1290: * reconstruction code to use parity stripe IDs rather than physical
1291: * disks address to march through the failed disk, which greatly
1292: * simplifies a lot of code, as well as eliminating the need for a
1293: * reverse-mapping function. I also think it will execute faster,
1294: * since the calls to the mapping module are kept to a minimum.
1295: *
1296: * ASSUMES THAT THE STRIPE IDENTIFIER IDENTIFIES THE DISKS COMPRISING
1297: * THE STRIPE IN THE CORRECT ORDER.
1298: */
1299:
/*
 * Compute the disk offsets of the start of parity stripe `psid' on
 * both the indicated disk (row,col) and the failed disk, plus the
 * location of the spare unit for the failed unit.  Returns 1 (skip
 * this PS) when either disk does not appear in the stripe, else 0.
 */
int
rf_ComputePSDiskOffsets(
	RF_Raid_t *raidPtr,	/* RAID descriptor. */
	RF_StripeNum_t psid,	/* Parity stripe identifier. */
	RF_RowCol_t row,	/*
				 * Row and column of disk to find
				 * the offsets for.
				 */
	RF_RowCol_t col,
	RF_SectorNum_t *outDiskOffset,
	RF_SectorNum_t *outFailedDiskSectorOffset,
	RF_RowCol_t *spRow,	/*
				 * OUT: Row,col of spare unit for
				 * failed unit.
				 */
	RF_RowCol_t *spCol,
	RF_SectorNum_t *spOffset	/*
					 * OUT: Offset into disk containing
					 * spare unit.
					 */
)
{
	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
	RF_RowCol_t fcol = raidPtr->reconControl[row]->fcol;
	RF_RaidAddr_t sosRaidAddress;	/* start-of-stripe */
	RF_RowCol_t *diskids;
	/*
	 * i: position of (row,col) in the stripe; j: position of the failed
	 * disk; k: position of the parity disk.
	 */
	u_int i, j, k, i_offset, j_offset;
	RF_RowCol_t prow, pcol;
	int testcol, testrow;
	RF_RowCol_t stripe;
	RF_SectorNum_t poffset;
	char i_is_parity = 0, j_is_parity = 0;
	RF_RowCol_t stripeWidth =
	    layoutPtr->numDataCol + layoutPtr->numParityCol;

	/* Get a listing of the disks comprising that stripe. */
	sosRaidAddress = rf_ParityStripeIDToRaidAddress(layoutPtr, psid);
	(layoutPtr->map->IdentifyStripe) (raidPtr, sosRaidAddress, &diskids,
	    &stripe);
	RF_ASSERT(diskids);

	/*
	 * Reject this entire parity stripe if it does not contain the
	 * indicated disk or it does not contain the failed disk.
	 */
	if (row != stripe)
		goto skipit;
	for (i = 0; i < stripeWidth; i++) {
		if (col == diskids[i])
			break;
	}
	if (i == stripeWidth)
		goto skipit;
	for (j = 0; j < stripeWidth; j++) {
		if (fcol == diskids[j])
			break;
	}
	if (j == stripeWidth) {
		goto skipit;
	}
	/* Find out which disk the parity is on. */
	(layoutPtr->map->MapParity) (raidPtr, sosRaidAddress, &prow, &pcol,
	    &poffset, RF_DONT_REMAP);

	/* Find out if either the current RU or the failed RU is parity. */
	/*
	 * Also, if the parity occurs in this stripe prior to the data and/or
	 * failed col, we need to decrement i and/or j.
	 */
	for (k = 0; k < stripeWidth; k++)
		if (diskids[k] == pcol)
			break;
	RF_ASSERT(k < stripeWidth);
	i_offset = i;
	j_offset = j;
	if (k < i)
		i_offset--;
	else
		if (k == i) {
			i_is_parity = 1;
			i_offset = 0;
		}	/*
			 * Set offsets to zero to disable multiply
			 * below.
			 */
	if (k < j)
		j_offset--;
	else
		if (k == j) {
			j_is_parity = 1;
			j_offset = 0;
		}
	/*
	 * At this point, [ij]_is_parity tells us whether the [current,failed]
	 * disk is parity at the start of this RU, and, if data, "[ij]_offset"
	 * tells us how far into the stripe the [current,failed] disk is.
	 */

	/*
	 * Call the mapping routine to get the offset into the current disk,
	 * repeat for failed disk.
	 */
	if (i_is_parity)
		layoutPtr->map->MapParity(raidPtr, sosRaidAddress + i_offset *
		    layoutPtr->sectorsPerStripeUnit, &testrow, &testcol,
		    outDiskOffset, RF_DONT_REMAP);
	else
		layoutPtr->map->MapSector(raidPtr, sosRaidAddress + i_offset *
		    layoutPtr->sectorsPerStripeUnit, &testrow, &testcol,
		    outDiskOffset, RF_DONT_REMAP);

	/* Sanity check: the mapping must land back on (row,col). */
	RF_ASSERT(row == testrow && col == testcol);

	if (j_is_parity)
		layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset *
		    layoutPtr->sectorsPerStripeUnit, &testrow, &testcol,
		    outFailedDiskSectorOffset, RF_DONT_REMAP);
	else
		layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset *
		    layoutPtr->sectorsPerStripeUnit, &testrow, &testcol,
		    outFailedDiskSectorOffset, RF_DONT_REMAP);
	RF_ASSERT(row == testrow && fcol == testcol);

	/* Now locate the spare unit for the failed unit. */
	if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) {
		/* Distributed sparing: remap to the spare location. */
		if (j_is_parity)
			layoutPtr->map->MapParity(raidPtr, sosRaidAddress +
			    j_offset * layoutPtr->sectorsPerStripeUnit, spRow,
			    spCol, spOffset, RF_REMAP);
		else
			layoutPtr->map->MapSector(raidPtr, sosRaidAddress +
			    j_offset * layoutPtr->sectorsPerStripeUnit, spRow,
			    spCol, spOffset, RF_REMAP);
	} else {
		/* Dedicated spare: same offset on the spare disk. */
		*spRow = raidPtr->reconControl[row]->spareRow;
		*spCol = raidPtr->reconControl[row]->spareCol;
		*spOffset = *outFailedDiskSectorOffset;
	}

	return (0);

skipit:
	Dprintf3("RECON: Skipping psid %ld: nothing needed from r%d c%d.\n",
	    psid, row, col);
	return (1);
}
1446:
1447:
1448: /*
1449: * This is called when a buffer has become ready to write to the replacement
1450: * disk.
1451: */
/*
 * Pull a full reconstruction buffer off this row's list and enqueue a
 * write of its contents to the spare location of the failed unit.
 * Always returns 0.
 */
int
rf_IssueNextWriteRequest(RF_Raid_t *raidPtr, RF_RowCol_t row)
{
	RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
	RF_SectorCount_t sectorsPerRU =
	    layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU;
	RF_RowCol_t fcol = raidPtr->reconControl[row]->fcol;
	RF_ReconBuffer_t *rbuf;
	RF_DiskQueueData_t *req;

	rbuf = rf_GetFullReconBuffer(raidPtr->reconControl[row]);
	RF_ASSERT(rbuf);	/*
				 * There must be one available, or we wouldn't
				 * have gotten the event that sent us here.
				 */
	RF_ASSERT(rbuf->pssPtr);

	/* Record which rbuf is being written, then detach it from the pss. */
	rbuf->pssPtr->writeRbuf = rbuf;
	rbuf->pssPtr = NULL;

	Dprintf7("RECON: New write (r %d c %d offs %d) for psid %ld ru %d"
	    " (failed disk offset %ld) buf %lx.\n",
	    rbuf->spRow, rbuf->spCol, rbuf->spOffset, rbuf->parityStripeID,
	    rbuf->which_ru, rbuf->failedDiskSectorOffset, rbuf->buffer);
	Dprintf6("RECON: new write psid %ld %02x %02x %02x %02x %02x.\n",
	    rbuf->parityStripeID, rbuf->buffer[0] & 0xff,
	    rbuf->buffer[1] & 0xff, rbuf->buffer[2] & 0xff,
	    rbuf->buffer[3] & 0xff, rbuf->buffer[4] & 0xff);

	/*
	 * Should be ok to use a NULL b_proc here b/c all addrs should be in
	 * kernel space.
	 */
	req = rf_CreateDiskQueueData(RF_IO_TYPE_WRITE, rbuf->spOffset,
	    sectorsPerRU, rbuf->buffer, rbuf->parityStripeID, rbuf->which_ru,
	    rf_ReconWriteDoneProc, (void *) rbuf, NULL,
	    &raidPtr->recon_tracerecs[fcol], (void *) raidPtr, 0, NULL);

	RF_ASSERT(req);	/* XXX -- Fix this. -- XXX */

	rbuf->arg = (void *) req;
	rf_DiskIOEnqueue(&raidPtr->Queues[rbuf->spRow][rbuf->spCol], req,
	    RF_IO_RECON_PRIORITY);

	return (0);
}
1498:
1499: /*
1500: * This gets called upon the completion of a reconstruction read
1501: * operation. The arg is a pointer to the per-disk reconstruction
1502: * control structure for the process that just finished a read.
1503: *
1504: * Called at interrupt context in the kernel, so don't do anything
1505: * illegal here.
1506: */
/*
 * Completion callback for a (non-forced) reconstruction read: record
 * fetch timing in the per-column trace record, then post a READDONE
 * event to the reconstruction thread.  The arg is the per-disk recon
 * control structure.  Runs at interrupt context; a failed read
 * currently panics (see XXX below).
 */
int
rf_ReconReadDoneProc(void *arg, int status)
{
	RF_PerDiskReconCtrl_t *ctrl = (RF_PerDiskReconCtrl_t *) arg;
	RF_Raid_t *raidPtr = ctrl->reconCtrl->reconDesc->raidPtr;

	if (status) {
		/*
		 * XXX
		 */
		printf("Recon read failed !\n");
		RF_PANIC();
	}
	/* Charge the elapsed time to the fetch-to-return bucket. */
	RF_ETIMER_STOP(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
	RF_ETIMER_EVAL(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
	raidPtr->recon_tracerecs[ctrl->col].specific.recon.
	    recon_fetch_to_return_us =
	    RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[ctrl->col].recon_timer);
	RF_ETIMER_START(raidPtr->recon_tracerecs[ctrl->col].recon_timer);

	rf_CauseReconEvent(raidPtr, ctrl->row, ctrl->col, NULL,
	    RF_REVENT_READDONE);
	return (0);
}
1531:
1532:
1533: /*
1534: * This gets called upon the completion of a reconstruction write operation.
1535: * The arg is a pointer to the rbuf that was just written.
1536: *
1537: * Called at interrupt context in the kernel, so don't do anything illegal here.
1538: */
1539: int
1540: rf_ReconWriteDoneProc(void *arg, int status)
1541: {
1542: RF_ReconBuffer_t *rbuf = (RF_ReconBuffer_t *) arg;
1543:
1544: Dprintf2("Reconstruction completed on psid %ld ru %d.\n",
1545: rbuf->parityStripeID, rbuf->which_ru);
1546: if (status) {
1547: /* fprintf(stderr, "Recon write failed !\n"); */
1548: printf("Recon write failed !\n");
1549: RF_PANIC();
1550: }
1551: rf_CauseReconEvent((RF_Raid_t *) rbuf->raidPtr, rbuf->row, rbuf->col,
1552: arg, RF_REVENT_WRITEDONE);
1553: return (0);
1554: }
1555:
1556:
1557: /*
1558: * Computes a new minimum head sep, and wakes up anyone who needs to
1559: * be woken as a result.
1560: */
1561: void
1562: rf_CheckForNewMinHeadSep(RF_Raid_t *raidPtr, RF_RowCol_t row,
1563: RF_HeadSepLimit_t hsCtr)
1564: {
1565: RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl[row];
1566: RF_HeadSepLimit_t new_min;
1567: RF_RowCol_t i;
1568: RF_CallbackDesc_t *p;
1569: /* From the definition of a minimum. */
1570: RF_ASSERT(hsCtr >= reconCtrlPtr->minHeadSepCounter);
1571:
1572:
1573: RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
1574:
1575: new_min = ~(1L << (8 * sizeof(long) - 1)); /* 0x7FFF....FFF */
1576: for (i = 0; i < raidPtr->numCol; i++)
1577: if (i != reconCtrlPtr->fcol) {
1578: if (reconCtrlPtr->perDiskInfo[i].headSepCounter <
1579: new_min)
1580: new_min =
1581: reconCtrlPtr->perDiskInfo[i].headSepCounter;
1582: }
1583: /* Set the new minimum and wake up anyone who can now run again. */
1584: if (new_min != reconCtrlPtr->minHeadSepCounter) {
1585: reconCtrlPtr->minHeadSepCounter = new_min;
1586: Dprintf1("RECON: new min head pos counter val is %ld.\n",
1587: new_min);
1588: while (reconCtrlPtr->headSepCBList) {
1589: if (reconCtrlPtr->headSepCBList->callbackArg.v >
1590: new_min)
1591: break;
1592: p = reconCtrlPtr->headSepCBList;
1593: reconCtrlPtr->headSepCBList = p->next;
1594: p->next = NULL;
1595: rf_CauseReconEvent(raidPtr, p->row, p->col, NULL,
1596: RF_REVENT_HEADSEPCLEAR);
1597: rf_FreeCallbackDesc(p);
1598: }
1599:
1600: }
1601: RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);
1602: }
1603:
1604: /*
1605: * Checks to see that the maximum head separation will not be violated
1606: * if we initiate a reconstruction I/O on the indicated disk.
1607: * Limiting the maximum head separation between two disks eliminates
1608: * the nasty buffer-stall conditions that occur when one disk races
1609: * ahead of the others and consumes all of the floating recon buffers.
1610: * This code is complex and unpleasant but it's necessary to avoid
1611: * some very nasty, albeit fairly rare, reconstruction behavior.
1612: *
1613: * Returns non-zero if and only if we have to stop working on the
1614: * indicated disk due to a head-separation delay.
1615: */
/*
 * Decide whether disk (row, col) must stall for head separation.
 * If so, allocate a callback descriptor, insert it (sorted by wakeup
 * value) into the head-sep callback list, and return 1; otherwise
 * return 0.  The 20% hysteresis is built into the wakeup value.
 */
int
rf_CheckHeadSeparation(
	RF_Raid_t *raidPtr,
	RF_PerDiskReconCtrl_t *ctrl,
	RF_RowCol_t row,
	RF_RowCol_t col,
	RF_HeadSepLimit_t hsCtr,
	RF_ReconUnitNum_t which_ru
)
{
	RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl[row];
	RF_CallbackDesc_t *cb, *p, *pt;
	int retval = 0;

	/*
	 * If we're too far ahead of the slowest disk, stop working on this
	 * disk until the slower ones catch up. We do this by scheduling a
	 * wakeup callback for the time when the slowest disk has caught up.
	 * We define "caught up" with 20% hysteresis, i.e. the head separation
	 * must have fallen to at most 80% of the max allowable head
	 * separation before we'll wake up.
	 */
	RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex);
	if ((raidPtr->headSepLimit >= 0) &&
	    ((ctrl->headSepCounter - reconCtrlPtr->minHeadSepCounter) >
	     raidPtr->headSepLimit)) {
		Dprintf6("raid%d: RECON: head sep stall: row %d col %d hsCtr"
		    " %ld minHSCtr %ld limit %ld.\n",
		    raidPtr->raidid, row, col, ctrl->headSepCounter,
		    reconCtrlPtr->minHeadSepCounter, raidPtr->headSepLimit);
		cb = rf_AllocCallbackDesc();
		/*
		 * The minHeadSepCounter value we have to get to before we'll
		 * wake up. Build in 20% hysteresis.
		 */
		cb->callbackArg.v = (ctrl->headSepCounter -
		    raidPtr->headSepLimit + raidPtr->headSepLimit / 5);
		cb->row = row;
		cb->col = col;
		cb->next = NULL;

		/*
		 * Insert this callback descriptor into the sorted list of
		 * pending head-sep callbacks.
		 */
		p = reconCtrlPtr->headSepCBList;
		if (!p)
			/* Empty list: we become the whole list. */
			reconCtrlPtr->headSepCBList = cb;
		else
			if (cb->callbackArg.v < p->callbackArg.v) {
				/* Smaller than the head: prepend. */
				cb->next = reconCtrlPtr->headSepCBList;
				reconCtrlPtr->headSepCBList = cb;
			} else {
				/* Walk to the insertion point (pt trails p). */
				for (pt = p, p = p->next;
				    p && (p->callbackArg.v < cb->callbackArg.v);
				    pt = p, p = p->next);
				cb->next = p;
				pt->next = cb;
			}
		retval = 1;
#if RF_RECON_STATS > 0
		ctrl->reconCtrl->reconDesc->hsStallCount++;
#endif	/* RF_RECON_STATS > 0 */
	}
	RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex);

	return (retval);
}
1684:
1685:
1686:
1687: /*
1688: * Checks to see if reconstruction has been either forced or blocked
1689: * by a user operation. If forced, we skip this RU entirely. Else if
1690: * blocked, put ourselves on the wait list. Else return 0.
1691: *
1692: * ASSUMES THE PSS MUTEX IS LOCKED UPON ENTRY.
1693: */
/*
 * Check whether reconstruction of this RU has been forced or blocked
 * by a user operation.  Returns RF_PSS_FORCED_ON_WRITE if forced
 * (note: a stripe forced on read also reports RF_PSS_FORCED_ON_WRITE
 * here; callers treat any forced state as "skip this RU"),
 * RF_PSS_RECON_BLOCKED if blocked (after queueing ourselves on the
 * blockage-wait list), or 0 after marking the RU as under recon.
 * ASSUMES THE PSS MUTEX IS LOCKED UPON ENTRY.
 */
int
rf_CheckForcedOrBlockedReconstruction(
	RF_Raid_t *raidPtr,
	RF_ReconParityStripeStatus_t *pssPtr,
	RF_PerDiskReconCtrl_t *ctrl,
	RF_RowCol_t row,
	RF_RowCol_t col,
	RF_StripeNum_t psid,
	RF_ReconUnitNum_t which_ru
)
{
	RF_CallbackDesc_t *cb;
	int retcode = 0;

	if ((pssPtr->flags & RF_PSS_FORCED_ON_READ) ||
	    (pssPtr->flags & RF_PSS_FORCED_ON_WRITE))
		retcode = RF_PSS_FORCED_ON_WRITE;
	else
		if (pssPtr->flags & RF_PSS_RECON_BLOCKED) {
			Dprintf4("RECON: row %d col %d blocked at psid %ld"
			    " ru %d.\n", row, col, psid, which_ru);
			cb = rf_AllocCallbackDesc();	/*
							 * Append ourselves to
							 * the blockage-wait
							 * list.
							 */
			cb->row = row;
			cb->col = col;
			cb->next = pssPtr->blockWaitList;
			pssPtr->blockWaitList = cb;
			retcode = RF_PSS_RECON_BLOCKED;
		}
	if (!retcode)
		pssPtr->flags |= RF_PSS_UNDER_RECON;	/*
							 * Mark this RU as under
							 * reconstruction.
							 */

	return (retcode);
}
1734:
1735:
1736: /*
1737: * If reconstruction is currently ongoing for the indicated stripeID,
1738: * reconstruction is forced to completion and we return non-zero to
1739: * indicate that the caller must wait. If not, then reconstruction is
1740: * blocked on the indicated stripe and the routine returns zero. If
1741: * and only if we return non-zero, we'll cause the cbFunc to get
1742: * invoked with the cbArg when the reconstruction has completed.
1743: */
/*
 * If reconstruction is ongoing on the stripe covering asmap, force it
 * to completion at normal priority (promoting already-queued recon
 * I/Os and issuing fresh normal-priority reads for the rest), install
 * (cbFunc, cbArg) to be invoked when recon on the stripe completes,
 * and return 1 (caller must wait).  Otherwise leave recon blocked on
 * the stripe and return 0.
 */
int
rf_ForceOrBlockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap,
    void (*cbFunc) (RF_Raid_t *, void *), void *cbArg)
{
	RF_RowCol_t row = asmap->physInfo->row;	/*
						 * Which row of the array
						 * we're working on.
						 */
	RF_StripeNum_t stripeID = asmap->stripeID;	/*
							 * The stripe ID we're
							 * forcing recon on.
							 */
	RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit *
	    raidPtr->Layout.SUsPerRU;	/* Num sects in one RU. */
	RF_ReconParityStripeStatus_t *pssPtr;	/*
						 * A pointer to the parity
						 * stripe status structure.
						 */
	RF_StripeNum_t psid;	/* Parity stripe id. */
	RF_SectorNum_t offset, fd_offset;	/*
						 * Disk offset, failed-disk
						 * offset.
						 */
	RF_RowCol_t *diskids;
	RF_RowCol_t stripe;
	RF_ReconUnitNum_t which_ru;	/* RU within parity stripe. */
	RF_RowCol_t fcol, diskno, i;
	RF_ReconBuffer_t *new_rbuf;	/* Ptr to newly allocated rbufs. */
	RF_DiskQueueData_t *req;	/* Disk I/O req to be enqueued. */
	RF_CallbackDesc_t *cb;
	int created = 0, nPromoted;

	psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID,
	    &which_ru);

	RF_LOCK_PSS_MUTEX(raidPtr, row, psid);

	/*
	 * Look up the pss entry, creating it (already marked blocked) if it
	 * does not exist yet.
	 */
	pssPtr = rf_LookupRUStatus(raidPtr,
	    raidPtr->reconControl[row]->pssTable, psid, which_ru,
	    RF_PSS_CREATE | RF_PSS_RECON_BLOCKED, &created);

	/* If recon is not ongoing on this PS, just return. */
	if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) {
		RF_UNLOCK_PSS_MUTEX(raidPtr, row, psid);
		return (0);
	}
	/*
	 * Otherwise, we have to wait for reconstruction to complete on this
	 * RU.
	 */
	/*
	 * In order to avoid waiting for a potentially large number of
	 * low-priority accesses to complete, we force a normal-priority (i.e.
	 * not low-priority) reconstruction on this RU.
	 */
	if (!(pssPtr->flags & RF_PSS_FORCED_ON_WRITE) &&
	    !(pssPtr->flags & RF_PSS_FORCED_ON_READ)) {
		DDprintf1("Forcing recon on psid %ld.\n", psid);
		/* Mark this RU as under forced recon. */
		pssPtr->flags |= RF_PSS_FORCED_ON_WRITE;
		/* Clear the blockage that we just set. */
		pssPtr->flags &= ~RF_PSS_RECON_BLOCKED;
		fcol = raidPtr->reconControl[row]->fcol;

		/*
		 * Get a listing of the disks comprising the indicated stripe.
		 */
		(raidPtr->Layout.map->IdentifyStripe) (raidPtr,
		    asmap->raidAddress, &diskids, &stripe);
		RF_ASSERT(row == stripe);

		/*
		 * For previously issued reads, elevate them to normal
		 * priority. If the I/O has already completed, it won't be
		 * found in the queue, and hence this will be a no-op. For
		 * unissued reads, allocate buffers and issue new reads. The
		 * fact that we've set the FORCED bit means that the regular
		 * recon procs will not re-issue these reqs.
		 */
		for (i = 0; i < raidPtr->Layout.numDataCol +
		    raidPtr->Layout.numParityCol; i++)
			if ((diskno = diskids[i]) != fcol) {
				if (pssPtr->issued[diskno]) {
					nPromoted = rf_DiskIOPromote(&raidPtr
					    ->Queues[row][diskno], psid,
					    which_ru);
					if (rf_reconDebug && nPromoted)
						printf("raid%d: promoted read"
						    " from row %d col %d.\n",
						    raidPtr->raidid, row,
						    diskno);
				} else {
					/* Create new buf. */
					new_rbuf = rf_MakeReconBuffer(raidPtr,
					    row, diskno, RF_RBUF_TYPE_FORCED);
					/* Find offsets & spare location. */
					rf_ComputePSDiskOffsets(raidPtr, psid,
					    row, diskno, &offset, &fd_offset,
					    &new_rbuf->spRow, &new_rbuf->spCol,
					    &new_rbuf->spOffset);
					new_rbuf->parityStripeID = psid;
					/* Fill in the buffer. */
					new_rbuf->which_ru = which_ru;
					new_rbuf->failedDiskSectorOffset =
					    fd_offset;
					new_rbuf->priority =
					    RF_IO_NORMAL_PRIORITY;

					/*
					 * Use NULL b_proc b/c all addrs
					 * should be in kernel space.
					 */
					req = rf_CreateDiskQueueData(
					    RF_IO_TYPE_READ, offset +
					    which_ru * sectorsPerRU,
					    sectorsPerRU, new_rbuf->buffer,
					    psid, which_ru, (int (*)
					    (void *, int))
					    rf_ForceReconReadDoneProc,
					    (void *) new_rbuf, NULL,
					    NULL, (void *) raidPtr, 0, NULL);

					RF_ASSERT(req);	/*
							 * XXX -- Fix this. --
							 * XXX
							 */

					new_rbuf->arg = req;
					/* Enqueue the I/O. */
					rf_DiskIOEnqueue(&raidPtr
					    ->Queues[row][diskno], req,
					    RF_IO_NORMAL_PRIORITY);
					Dprintf3("raid%d: Issued new read req"
					    " on row %d col %d.\n",
					    raidPtr->raidid, row, diskno);
				}
			}
		/*
		 * If the write is sitting in the disk queue, elevate its
		 * priority.
		 */
		if (rf_DiskIOPromote(&raidPtr->Queues[row][fcol],
		    psid, which_ru))
			printf("raid%d: promoted write to row %d col %d.\n",
			    raidPtr->raidid, row, fcol);
	}
	/*
	 * Install a callback descriptor to be invoked when recon completes on
	 * this parity stripe.
	 */
	cb = rf_AllocCallbackDesc();
	/*
	 * XXX The following is bogus... These functions don't really match !!!
	 * GO
	 */
	cb->callbackFunc = (void (*) (RF_CBParam_t)) cbFunc;
	cb->callbackArg.p = (void *) cbArg;
	cb->next = pssPtr->procWaitList;
	pssPtr->procWaitList = cb;
	DDprintf2("raid%d: Waiting for forced recon on psid %ld.\n",
	    raidPtr->raidid, psid);

	RF_UNLOCK_PSS_MUTEX(raidPtr, row, psid);
	return (1);
}
1909:
1910:
1911: /*
1912: * Called upon the completion of a forced reconstruction read.
1913: * All we do is schedule the FORCEDREADONE event.
1914: * Called at interrupt context in the kernel, so don't do anything illegal here.
1915: */
1916: void
1917: rf_ForceReconReadDoneProc(void *arg, int status)
1918: {
1919: RF_ReconBuffer_t *rbuf = arg;
1920:
1921: if (status) {
1922: /* fprintf(stderr, "Forced recon read failed !\n"); */
1923: printf("Forced recon read failed !\n");
1924: RF_PANIC();
1925: }
1926: rf_CauseReconEvent((RF_Raid_t *) rbuf->raidPtr, rbuf->row, rbuf->col,
1927: (void *) rbuf, RF_REVENT_FORCEDREADDONE);
1928: }
1929:
1930:
1931: /* Releases a block on the reconstruction of the indicated stripe. */
1932: int
1933: rf_UnblockRecon(RF_Raid_t *raidPtr, RF_AccessStripeMap_t *asmap)
1934: {
1935: RF_RowCol_t row = asmap->origRow;
1936: RF_StripeNum_t stripeID = asmap->stripeID;
1937: RF_ReconParityStripeStatus_t *pssPtr;
1938: RF_ReconUnitNum_t which_ru;
1939: RF_StripeNum_t psid;
1940: int created = 0;
1941: RF_CallbackDesc_t *cb;
1942:
1943: psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID,
1944: &which_ru);
1945: RF_LOCK_PSS_MUTEX(raidPtr, row, psid);
1946: pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl[row]
1947: ->pssTable, psid, which_ru, RF_PSS_NONE, &created);
1948:
1949: /*
1950: * When recon is forced, the pss desc can get deleted before we get
1951: * back to unblock recon. But, this can _only_ happen when recon is
1952: * forced. It would be good to put some kind of sanity check here, but
1953: * how to decide if recon was just forced or not ?
1954: */
1955: if (!pssPtr) {
1956: /*
1957: * printf("Warning: no pss descriptor upon unblock on psid %ld"
1958: * " RU %d.\n", psid, which_ru);
1959: */
1960: if (rf_reconDebug || rf_pssDebug)
1961: printf("Warning: no pss descriptor upon unblock on"
1962: " psid %ld RU %d.\n", (long) psid, which_ru);
1963: goto out;
1964: }
1965: pssPtr->blockCount--;
1966: Dprintf3("raid%d: unblocking recon on psid %ld: blockcount is %d.\n",
1967: raidPtr->raidid, psid, pssPtr->blockCount);
1968: if (pssPtr->blockCount == 0) {
1969: /* If recon blockage has been released. */
1970:
1971: /*
1972: * Unblock recon before calling CauseReconEvent in case
1973: * CauseReconEvent causes us to try to issue a new read before
1974: * returning here.
1975: */
1976: pssPtr->flags &= ~RF_PSS_RECON_BLOCKED;
1977:
1978:
1979: while (pssPtr->blockWaitList) {
1980: /*
1981: * Spin through the block-wait list and
1982: * release all the waiters.
1983: */
1984: cb = pssPtr->blockWaitList;
1985: pssPtr->blockWaitList = cb->next;
1986: cb->next = NULL;
1987: rf_CauseReconEvent(raidPtr, cb->row, cb->col, NULL,
1988: RF_REVENT_BLOCKCLEAR);
1989: rf_FreeCallbackDesc(cb);
1990: }
1991: if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) {
1992: /* If no recon was requested while recon was blocked. */
1993: rf_PSStatusDelete(raidPtr, raidPtr->reconControl[row]
1994: ->pssTable, pssPtr);
1995: }
1996: }
1997: out:
1998: RF_UNLOCK_PSS_MUTEX(raidPtr, row, psid);
1999: return (0);
2000: }