/* Annotation of sys/dev/raidframe/rf_decluster.c, Revision 1.1.1.1 (CVSweb artifact) */
1.1 nbrk 1: /* $OpenBSD: rf_decluster.c,v 1.5 2002/12/16 07:01:03 tdeval Exp $ */
2: /* $NetBSD: rf_decluster.c,v 1.5 2000/03/07 01:54:29 oster Exp $ */
3:
4: /*
5: * Copyright (c) 1995 Carnegie-Mellon University.
6: * All rights reserved.
7: *
8: * Author: Mark Holland
9: *
10: * Permission to use, copy, modify and distribute this software and
11: * its documentation is hereby granted, provided that both the copyright
12: * notice and this permission notice appear in all copies of the
13: * software, derivative works or modified versions, and any portions
14: * thereof, and that both notices appear in supporting documentation.
15: *
16: * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
17: * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND
18: * FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
19: *
20: * Carnegie Mellon requests users of this software to return to
21: *
22: * Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU
23: * School of Computer Science
24: * Carnegie Mellon University
25: * Pittsburgh PA 15213-3890
26: *
27: * any improvements or extensions that they make and grant Carnegie the
28: * rights to redistribute these changes.
29: */
30:
31: /*****************************************************************************
32: *
33: * rf_decluster.c -- Code related to the declustered layout.
34: *
35: * Created 10-21-92 (MCH)
36: *
37: * Nov 93: Adding support for distributed sparing. This code is a little
38: * complex; the basic layout used is as follows:
39: * Let F = (v-1)/GCD(r,v-1). The spare space for each set of
40: * F consecutive fulltables is grouped together and placed after
41: * that set of tables.
42: * +-------------------------------+
43: * | F fulltables |
44: * | Spare Space |
45: * | F fulltables |
46: * | Spare Space |
47: * | ... |
48: * +-------------------------------+
49: *
50: *****************************************************************************/
51:
52: #include "rf_types.h"
53: #include "rf_raid.h"
54: #include "rf_raidframe.h"
55: #include "rf_configure.h"
56: #include "rf_decluster.h"
57: #include "rf_debugMem.h"
58: #include "rf_utils.h"
59: #include "rf_alloclist.h"
60: #include "rf_general.h"
61: #include "rf_shutdown.h"
62:
63: extern int rf_copyback_in_progress; /* Debug only. */
64:
65: /* Found in rf_kintf.c */
66: int rf_GetSpareTableFromDaemon(RF_SparetWait_t *);
67:
68: /* Configuration code. */
69:
/*
 * rf_ConfigureDeclustered
 *
 * Configure the declustered layout: parse the layout-specific portion of
 * the user configuration (the block design parameters and the layout
 * table), size each disk -- including distributed spare space when the
 * layout's RF_DISTRIBUTE_SPARE flag is set -- and build the lookup tables
 * (LayoutTable, OffsetTable, BlockTable) used by the mapping routines
 * later in this file.
 *
 * Returns 0 on success, ENOMEM if a table allocation fails, or EINVAL
 * when the block design does not fit the array geometry.
 */
int
rf_ConfigureDeclustered(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
    RF_Config_t *cfgPtr)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	int b, v, k, r, lambda;	/* block design params */
	int i, j;
	RF_RowCol_t *first_avail_slot;
	RF_StripeCount_t complete_FT_count, numCompleteFullTablesPerDisk;
	RF_DeclusteredConfigInfo_t *info;
	RF_StripeCount_t PUsPerDisk, spareRegionDepthInPUs,
	    numCompleteSpareRegionsPerDisk, extraPUsPerDisk;
	RF_StripeCount_t totSparePUsPerDisk;
	RF_SectorNum_t diskOffsetOfLastFullTableInSUs;
	RF_SectorCount_t SpareSpaceInSUs;
	char *cfgBuf = (char *) (cfgPtr->layoutSpecific);
	RF_StripeNum_t l, SUID;

	SUID = l = 0;
	numCompleteSpareRegionsPerDisk = 0;

	/* 1. Create layout specific structure. */
	RF_MallocAndAdd(info, sizeof(RF_DeclusteredConfigInfo_t),
	    (RF_DeclusteredConfigInfo_t *), raidPtr->cleanupList);
	if (info == NULL)
		return (ENOMEM);
	layoutPtr->layoutSpecificInfo = (void *) info;
	info->SpareTable = NULL;

	/*
	 * 2. Extract parameters from the config structure.
	 *
	 * The layout-specific buffer starts with the sparemap file name
	 * (fixed RF_SPAREMAP_NAME_LEN bytes, meaningful only with
	 * distributed sparing), followed by six raw ints: b, v, k, r,
	 * lambda, and the no-rotate flag.
	 */
	if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) {
		bcopy(cfgBuf, info->sparemap_fname, RF_SPAREMAP_NAME_LEN);
	}
	cfgBuf += RF_SPAREMAP_NAME_LEN;

	b = *((int *) cfgBuf);		/* Blocks per table. */
	cfgBuf += sizeof(int);
	v = *((int *) cfgBuf);		/* Table element count (must == numCol). */
	cfgBuf += sizeof(int);
	k = *((int *) cfgBuf);		/* Group size (parity repetitions). */
	cfgBuf += sizeof(int);
	r = *((int *) cfgBuf);		/* Used to size spare space below. */
	cfgBuf += sizeof(int);
	lambda = *((int *) cfgBuf);	/* Stored as info->Lambda. */
	cfgBuf += sizeof(int);
	raidPtr->noRotate = *((int *) cfgBuf);
	cfgBuf += sizeof(int);

	/*
	 * The sparemaps are generated assuming that parity is rotated, so we
	 * issue a warning if both distributed sparing and no-rotate are on at
	 * the same time.
	 */
	if ((layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) &&
	    raidPtr->noRotate) {
		RF_ERRORMSG("Warning: distributed sparing specified without"
		    " parity rotation.\n");
	}
	if (raidPtr->numCol != v) {
		RF_ERRORMSG2("RAID: config error: table element count (%d)"
		    " not equal to no. of cols (%d).\n", v, raidPtr->numCol);
		return (EINVAL);
	}
	/* 3. Set up the values used in the mapping code. */
	info->BlocksPerTable = b;
	info->Lambda = lambda;
	info->NumParityReps = info->groupSize = k;
	/* b blks, k-1 SUs each. */
	info->SUsPerTable = b * (k - 1) * layoutPtr->SUsPerPU;
	info->SUsPerFullTable = k * info->SUsPerTable;	/* rot k times */
	info->PUsPerBlock = k - 1;
	info->SUsPerBlock = info->PUsPerBlock * layoutPtr->SUsPerPU;
	info->TableDepthInPUs = (b * k) / v;
	/* k repetitions. */
	info->FullTableDepthInPUs = info->TableDepthInPUs * k;

	/* Used only in distributed sparing case. */
	/* (v-1)/gcd fulltables. */
	info->FullTablesPerSpareRegion = (v - 1) / rf_gcd(r, v - 1);
	info->TablesPerSpareRegion = k * info->FullTablesPerSpareRegion;
	info->SpareSpaceDepthPerRegionInSUs = (r * info->TablesPerSpareRegion /
	    (v - 1)) * layoutPtr->SUsPerPU;

	/* Check to make sure the block design is sufficiently small. */
	if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
		if (info->FullTableDepthInPUs * layoutPtr->SUsPerPU +
		    info->SpareSpaceDepthPerRegionInSUs >
		    layoutPtr->stripeUnitsPerDisk) {
			RF_ERRORMSG3("RAID: config error: Full Table depth"
			    " (%d) + Spare Space (%d) larger than disk size"
			    " (%d) (BD too big).\n",
			    (int) info->FullTableDepthInPUs,
			    (int) info->SpareSpaceDepthPerRegionInSUs,
			    (int) layoutPtr->stripeUnitsPerDisk);
			return (EINVAL);
		}
	} else {
		if (info->TableDepthInPUs * layoutPtr->SUsPerPU >
		    layoutPtr->stripeUnitsPerDisk) {
			RF_ERRORMSG2("RAID: config error: Table depth (%d)"
			    " larger than disk size (%d) (BD too big).\n",
			    (int) (info->TableDepthInPUs * layoutPtr->SUsPerPU),
			    (int) layoutPtr->stripeUnitsPerDisk);
			return (EINVAL);
		}
	}


	/*
	 * Compute the size of each disk, and the number of tables in the last
	 * fulltable (which need not be complete).
	 */
	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {

		PUsPerDisk = layoutPtr->stripeUnitsPerDisk /
		    layoutPtr->SUsPerPU;
		/* Data/parity PUs plus one spare PU per (v-1) in a region. */
		spareRegionDepthInPUs =
		    (info->TablesPerSpareRegion * info->TableDepthInPUs +
		    (info->TablesPerSpareRegion * info->TableDepthInPUs) /
		    (v - 1));
		info->SpareRegionDepthInSUs =
		    spareRegionDepthInPUs * layoutPtr->SUsPerPU;

		numCompleteSpareRegionsPerDisk =
		    PUsPerDisk / spareRegionDepthInPUs;
		info->NumCompleteSRs = numCompleteSpareRegionsPerDisk;
		extraPUsPerDisk = PUsPerDisk % spareRegionDepthInPUs;

		/*
		 * Assume conservatively that we need the full amount of spare
		 * space in one region in order to provide spares for the
		 * partial spare region at the end of the array. We set "i"
		 * to the number of tables in the partial spare region. This
		 * may actually include some fulltables.
		 */
		extraPUsPerDisk -= (info->SpareSpaceDepthPerRegionInSUs /
		    layoutPtr->SUsPerPU);
		if (extraPUsPerDisk <= 0)
			i = 0;
		else
			i = extraPUsPerDisk / info->TableDepthInPUs;

		complete_FT_count = raidPtr->numRow *
		    (numCompleteSpareRegionsPerDisk *
		    (info->TablesPerSpareRegion / k) + i / k);
		info->FullTableLimitSUID =
		    complete_FT_count * info->SUsPerFullTable;
		info->ExtraTablesPerDisk = i % k;

		/*
		 * Note that in the last spare region, the spare space is
		 * complete even though data/parity space is not.
		 */
		totSparePUsPerDisk = (numCompleteSpareRegionsPerDisk + 1) *
		    (info->SpareSpaceDepthPerRegionInSUs / layoutPtr->SUsPerPU);
		info->TotSparePUsPerDisk = totSparePUsPerDisk;

		layoutPtr->stripeUnitsPerDisk =
		    ((complete_FT_count / raidPtr->numRow) *
		    info->FullTableDepthInPUs +	/* data & parity space */
		    info->ExtraTablesPerDisk * info->TableDepthInPUs +
		    totSparePUsPerDisk	/* spare space */
		    ) * layoutPtr->SUsPerPU;
		/* (k-1)/k of the data/parity space holds user data. */
		layoutPtr->dataStripeUnitsPerDisk =
		    (complete_FT_count * info->FullTableDepthInPUs +
		    info->ExtraTablesPerDisk * info->TableDepthInPUs) *
		    layoutPtr->SUsPerPU * (k - 1) / k;

	} else {
		/*
		 * Non-dist spare case: force each disk to contain an
		 * integral number of tables.
		 */
		layoutPtr->stripeUnitsPerDisk /=
		    (info->TableDepthInPUs * layoutPtr->SUsPerPU);
		layoutPtr->stripeUnitsPerDisk *=
		    (info->TableDepthInPUs * layoutPtr->SUsPerPU);

		/*
		 * Compute the number of tables in the last fulltable, which
		 * need not be complete.
		 */
		complete_FT_count =
		    ((layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU) /
		    info->FullTableDepthInPUs) * raidPtr->numRow;

		info->FullTableLimitSUID =
		    complete_FT_count * info->SUsPerFullTable;
		info->ExtraTablesPerDisk =
		    ((layoutPtr->stripeUnitsPerDisk / layoutPtr->SUsPerPU) /
		    info->TableDepthInPUs) % k;
	}

	raidPtr->sectorsPerDisk = layoutPtr->stripeUnitsPerDisk *
	    layoutPtr->sectorsPerStripeUnit;

	/*
	 * Find the disk offset of the stripe unit where the last fulltable
	 * starts.
	 */
	numCompleteFullTablesPerDisk = complete_FT_count / raidPtr->numRow;
	diskOffsetOfLastFullTableInSUs = numCompleteFullTablesPerDisk *
	    info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
		SpareSpaceInSUs = numCompleteSpareRegionsPerDisk *
		    info->SpareSpaceDepthPerRegionInSUs;
		diskOffsetOfLastFullTableInSUs += SpareSpaceInSUs;
		info->DiskOffsetOfLastSpareSpaceChunkInSUs =
		    diskOffsetOfLastFullTableInSUs + info->ExtraTablesPerDisk *
		    info->TableDepthInPUs * layoutPtr->SUsPerPU;
	}
	info->DiskOffsetOfLastFullTableInSUs = diskOffsetOfLastFullTableInSUs;
	info->numCompleteFullTablesPerDisk = numCompleteFullTablesPerDisk;

	/* 4. Create and initialize the lookup tables. */
	info->LayoutTable = rf_make_2d_array(b, k, raidPtr->cleanupList);
	if (info->LayoutTable == NULL)
		return (ENOMEM);
	info->OffsetTable = rf_make_2d_array(b, k, raidPtr->cleanupList);
	if (info->OffsetTable == NULL)
		return (ENOMEM);
	info->BlockTable = rf_make_2d_array(info->TableDepthInPUs *
	    layoutPtr->SUsPerPU, raidPtr->numCol, raidPtr->cleanupList);
	if (info->BlockTable == NULL)
		return (ENOMEM);

	first_avail_slot = rf_make_1d_array(v, NULL);
	if (first_avail_slot == NULL)
		return (ENOMEM);

	/* The b*k layout table entries follow the ints in the config buf. */
	for (i = 0; i < b; i++)
		for (j = 0; j < k; j++)
			info->LayoutTable[i][j] = *cfgBuf++;

	/* Initialize the offset table. */
	for (i = 0; i < b; i++)
		for (j = 0; j < k; j++) {
			info->OffsetTable[i][j] =
			    first_avail_slot[info->LayoutTable[i][j]];
			first_avail_slot[info->LayoutTable[i][j]]++;
		}

	/* Initialize the block table. */
	for (SUID = l = 0; l < layoutPtr->SUsPerPU; l++) {
		for (i = 0; i < b; i++) {
			for (j = 0; j < k; j++) {
				info->BlockTable[(info->OffsetTable[i][j] *
				    layoutPtr->SUsPerPU) + l]
				    [info->LayoutTable[i][j]] = SUID;
			}
			SUID++;
		}
	}

	rf_free_1d_array(first_avail_slot, v);

	/* 5. Set up the remaining redundant-but-useful parameters. */

	raidPtr->totalSectors = (k * complete_FT_count + raidPtr->numRow *
	    info->ExtraTablesPerDisk) * info->SUsPerTable *
	    layoutPtr->sectorsPerStripeUnit;
	layoutPtr->numStripe = (raidPtr->totalSectors /
	    layoutPtr->sectorsPerStripeUnit) / (k - 1);

	/*
	 * Strange evaluation order below to try and minimize overflow
	 * problems.
	 */

	layoutPtr->dataSectorsPerStripe =
	    (k - 1) * layoutPtr->sectorsPerStripeUnit;
	layoutPtr->bytesPerStripeUnit = layoutPtr->sectorsPerStripeUnit <<
	    raidPtr->logBytesPerSector;
	layoutPtr->numDataCol = k - 1;
	layoutPtr->numParityCol = 1;

	return (0);
}
348:
349: /* Declustering with distributed sparing. */
350: void rf_ShutdownDeclusteredDS(RF_ThreadArg_t);
351: void
352: rf_ShutdownDeclusteredDS(RF_ThreadArg_t arg)
353: {
354: RF_DeclusteredConfigInfo_t *info;
355: RF_Raid_t *raidPtr;
356:
357: raidPtr = (RF_Raid_t *) arg;
358: info =
359: (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
360: if (info->SpareTable)
361: rf_FreeSpareTable(raidPtr);
362: }
363:
364: int
365: rf_ConfigureDeclusteredDS(RF_ShutdownList_t **listp, RF_Raid_t *raidPtr,
366: RF_Config_t *cfgPtr)
367: {
368: int rc;
369:
370: rc = rf_ConfigureDeclustered(listp, raidPtr, cfgPtr);
371: if (rc)
372: return (rc);
373:
374: rc = rf_ShutdownCreate(listp, rf_ShutdownDeclusteredDS, raidPtr);
375: if (rc) {
376: RF_ERRORMSG1("Got %d adding shutdown event for"
377: " DeclusteredDS.\n", rc);
378: rf_ShutdownDeclusteredDS(raidPtr);
379: return (rc);
380: }
381:
382: return (0);
383: }
384:
/*
 * Map a RAID address to the physical location (row, col, diskSector) of
 * the corresponding data stripe unit under the declustered layout. The
 * address is decomposed fulltable -> table -> block -> offset via the
 * block design tables built at configure time. If "remap" is nonzero,
 * the access is retargeted at the distributed spare space instead of
 * the unit's home location.
 */
void
rf_MapSectorDeclustered(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector,
    RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_DeclusteredConfigInfo_t *info =
	    (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
	RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit;
	RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset;
	RF_StripeNum_t BlockID, BlockOffset, RepIndex;
	RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable;
	RF_StripeCount_t fulltable_depth =
	    info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
	RF_StripeNum_t base_suid = 0, outSU, SpareRegion = 0, SpareSpace = 0;

	/* Handle the (possibly partial) last fulltable specially. */
	rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable,
	    &fulltable_depth, &base_suid);

	/* Fulltable ID within array (across rows). */
	FullTableID = SUID / sus_per_fulltable;
	if (raidPtr->numRow == 1)
		*row = 0;	/* Avoid a mod and a div in the common case. */
	else {
		*row = FullTableID % raidPtr->numRow;
		/* Convert to fulltable ID on this disk. */
		FullTableID /= raidPtr->numRow;
	}
	if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) {
		SpareRegion = FullTableID / info->FullTablesPerSpareRegion;
		SpareSpace = SpareRegion * info->SpareSpaceDepthPerRegionInSUs;
	}
	/* Decompose the fulltable offset into table/block/offset. */
	FullTableOffset = SUID % sus_per_fulltable;
	TableID = FullTableOffset / info->SUsPerTable;
	TableOffset = FullTableOffset - TableID * info->SUsPerTable;
	BlockID = TableOffset / info->PUsPerBlock;
	BlockOffset = TableOffset - BlockID * info->PUsPerBlock;
	BlockID %= info->BlocksPerTable;
	/* RepIndex marks the parity position; skip over it when rotating. */
	RepIndex = info->PUsPerBlock - TableID;
	if (!raidPtr->noRotate)
		BlockOffset += ((BlockOffset >= RepIndex) ? 1 : 0);
	*col = info->LayoutTable[BlockID][BlockOffset];

	/* Remap to distributed spare space if indicated. */
	if (remap) {
		/* Only valid while reconstructing/spared (or copyback). */
		RF_ASSERT(raidPtr->Disks[*row][*col].status ==
		    rf_ds_reconstructing ||
		    raidPtr->Disks[*row][*col].status == rf_ds_dist_spared ||
		    (rf_copyback_in_progress &&
		    raidPtr->Disks[*row][*col].status == rf_ds_optimal));
		rf_remap_to_spare_space(layoutPtr, info, *row, FullTableID,
		    TableID, BlockID, (base_suid) ? 1 : 0, SpareRegion, col,
		    &outSU);
	} else {

		outSU = base_suid;
		outSU += FullTableID * fulltable_depth;
		/* Offset to start of FT. */
		outSU += SpareSpace;
		/* Skip rsvd spare space. */
		outSU += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU;
		/* Offset to start of table. */
		outSU += info->OffsetTable[BlockID][BlockOffset] *
		    layoutPtr->SUsPerPU;
		/* Offset to the PU. */
	}
	outSU += TableOffset / (info->BlocksPerTable * info->PUsPerBlock);
	/* offs to the SU within a PU */

	/*
	 * Convert SUs to sectors, and, if not aligned to SU boundary, add in
	 * offset to sector.
	 */
	*diskSector = outSU * layoutPtr->sectorsPerStripeUnit +
	    (raidSector % layoutPtr->sectorsPerStripeUnit);

	RF_ASSERT(*col != -1);
}
462:
/*
 * Prototyping this inexplicably causes the compile of the layout table
 * (rf_layout.c) to fail.
 */
/*
 * Map a RAID address to the physical location (row, col, diskSector) of
 * the PARITY unit of its stripe. Identical decomposition to
 * rf_MapSectorDeclustered() above, except that the column and stripe-unit
 * offset are taken from RepIndex (the parity position within the block)
 * rather than from BlockOffset.
 */
void
rf_MapParityDeclustered(RF_Raid_t *raidPtr, RF_RaidAddr_t raidSector,
    RF_RowCol_t *row, RF_RowCol_t *col, RF_SectorNum_t *diskSector, int remap)
{
	RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
	RF_DeclusteredConfigInfo_t *info =
	    (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
	RF_StripeNum_t SUID = raidSector / layoutPtr->sectorsPerStripeUnit;
	RF_StripeNum_t FullTableID, FullTableOffset, TableID, TableOffset;
	RF_StripeNum_t BlockID, BlockOffset, RepIndex;
	RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable;
	RF_StripeCount_t fulltable_depth =
	    info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
	RF_StripeNum_t base_suid = 0, outSU, SpareRegion = 0, SpareSpace = 0;

	rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable,
	    &fulltable_depth, &base_suid);

	/* Compute row & (possibly) spare space exactly as before. */
	FullTableID = SUID / sus_per_fulltable;
	if (raidPtr->numRow == 1)
		*row = 0;	/* Avoid a mod and a div in the common case. */
	else {
		*row = FullTableID % raidPtr->numRow;
		/* Convert to fulltable ID on this disk. */
		FullTableID /= raidPtr->numRow;
	}
	if ((raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE)) {
		SpareRegion = FullTableID / info->FullTablesPerSpareRegion;
		SpareSpace = SpareRegion * info->SpareSpaceDepthPerRegionInSUs;
	}
	/* Compute BlockID and RepIndex exactly as before. */
	FullTableOffset = SUID % sus_per_fulltable;
	TableID = FullTableOffset / info->SUsPerTable;
	TableOffset = FullTableOffset - TableID * info->SUsPerTable;
	/*TableOffset = FullTableOffset % info->SUsPerTable;*/
	/*BlockID = (TableOffset / info->PUsPerBlock) %
	 *info->BlocksPerTable;*/
	BlockID = TableOffset / info->PUsPerBlock;
	/*BlockOffset = TableOffset % info->PUsPerBlock;*/
	BlockOffset = TableOffset - BlockID * info->PUsPerBlock;
	BlockID %= info->BlocksPerTable;

	/* The parity block is in the position indicated by RepIndex. */
	RepIndex = (raidPtr->noRotate) ?
	    info->PUsPerBlock : info->PUsPerBlock - TableID;
	*col = info->LayoutTable[BlockID][RepIndex];

	if (remap) {
		/* Only valid while reconstructing/spared (or copyback). */
		RF_ASSERT(raidPtr->Disks[*row][*col].status ==
		    rf_ds_reconstructing ||
		    raidPtr->Disks[*row][*col].status == rf_ds_dist_spared ||
		    (rf_copyback_in_progress &&
		    raidPtr->Disks[*row][*col].status == rf_ds_optimal));
		rf_remap_to_spare_space(layoutPtr, info, *row, FullTableID,
		    TableID, BlockID, (base_suid) ? 1 : 0, SpareRegion, col,
		    &outSU);
	} else {

		/*
		 * Compute sector as before, except use RepIndex instead of
		 * BlockOffset.
		 */
		outSU = base_suid;
		outSU += FullTableID * fulltable_depth;
		outSU += SpareSpace;	/* skip rsvd spare space */
		outSU += TableID * info->TableDepthInPUs * layoutPtr->SUsPerPU;
		outSU += info->OffsetTable[BlockID][RepIndex] *
		    layoutPtr->SUsPerPU;
	}

	/* Offset to the SU within a PU. */
	outSU += TableOffset / (info->BlocksPerTable * info->PUsPerBlock);
	*diskSector = outSU * layoutPtr->sectorsPerStripeUnit +
	    (raidSector % layoutPtr->sectorsPerStripeUnit);

	RF_ASSERT(*col != -1);
}
544:
545: /*
546: * Return an array of ints identifying the disks that comprise the stripe
547: * containing the indicated address.
548: * The caller must _never_ attempt to modify this array.
549: */
550: void
551: rf_IdentifyStripeDeclustered(RF_Raid_t *raidPtr, RF_RaidAddr_t addr,
552: RF_RowCol_t **diskids, RF_RowCol_t *outRow)
553: {
554: RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout);
555: RF_DeclusteredConfigInfo_t *info =
556: (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
557: RF_StripeCount_t sus_per_fulltable = info->SUsPerFullTable;
558: RF_StripeCount_t fulltable_depth =
559: info->FullTableDepthInPUs * layoutPtr->SUsPerPU;
560: RF_StripeNum_t base_suid = 0;
561: RF_StripeNum_t SUID = rf_RaidAddressToStripeUnitID(layoutPtr, addr);
562: RF_StripeNum_t stripeID, FullTableID;
563: int tableOffset;
564:
565: rf_decluster_adjust_params(layoutPtr, &SUID, &sus_per_fulltable,
566: &fulltable_depth, &base_suid);
567: /* Fulltable ID within array (across rows). */
568: FullTableID = SUID / sus_per_fulltable;
569: *outRow = FullTableID % raidPtr->numRow;
570: /* Find stripe offset into array. */
571: stripeID = rf_StripeUnitIDToStripeID(layoutPtr, SUID);
572: /* Find offset into block design table. */
573: tableOffset = (stripeID % info->BlocksPerTable);
574: *diskids = info->LayoutTable[tableOffset];
575: }
576:
577: /*
578: * This returns the default head-separation limit, measured in
579: * "required units for reconstruction". Each time a disk fetches
580: * a unit, it bumps a counter. The head-sep code prohibits any disk
581: * from getting more than headSepLimit counter values ahead of any
582: * other.
583: *
584: * We assume here that the number of floating recon buffers is already
585: * set. There are r stripes to be reconstructed in each table, and so
586: * if we have a total of B buffers, we can have at most B/r tables
587: * under recon at any one time. In each table, lambda units are required
588: * from each disk, so given B buffers, the head sep limit has to be
589: * (lambda*B)/r units. We subtract one to avoid weird boundary cases.
590: *
591: * For example, suppose we are given 50 buffers, r=19, and lambda=4 as in
592: * the 20.5 design. There are 19 stripes/table to be reconstructed, so
593: * we can have 50/19 tables concurrently under reconstruction, which means
594: * we can allow the fastest disk to get 50/19 tables ahead of the slower
595: * disk. There are lambda "required units" for each disk, so the fastest
596: * disk can get 4*50/19 = 10 counter values ahead of the slowest.
597: *
598: * If numBufsToAccumulate is not 1, we need to limit the head sep further
599: * because multiple bufs will be required for each stripe under recon.
600: */
601: RF_HeadSepLimit_t
602: rf_GetDefaultHeadSepLimitDeclustered(RF_Raid_t *raidPtr)
603: {
604: RF_DeclusteredConfigInfo_t *info =
605: (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
606:
607: return (info->Lambda * raidPtr->numFloatingReconBufs /
608: info->TableDepthInPUs / rf_numBufsToAccumulate);
609: }
610:
611: /*
612: * Return the default number of recon buffers to use. The value
613: * is somewhat arbitrary... It's intended to be large enough to
614: * allow for a reasonably large head-sep limit, but small enough
615: * that you don't use up all your system memory with buffers.
616: */
617: int
618: rf_GetDefaultNumFloatingReconBuffersDeclustered(RF_Raid_t *raidPtr)
619: {
620: return (100 * rf_numBufsToAccumulate);
621: }
622:
623: /*
624: * Sectors in the last fulltable of the array need to be handled
625: * specially since this fulltable can be incomplete. This function
626: * changes the values of certain params to handle this.
627: *
628: * The idea here is that MapSector et. al. figure out which disk the
629: * addressed unit lives on by computing the modulos of the unit number
630: * with the number of units per fulltable, table, etc. In the last
631: * fulltable, there are fewer units per fulltable, so we need to adjust
632: * the number of user data units per fulltable to reflect this.
633: *
634: * So, we (1) convert the fulltable size and depth parameters to
635: * the size of the partial fulltable at the end, (2) compute the
636: * disk sector offset where this fulltable starts, and (3) convert
637: * the users stripe unit number from an offset into the array to
638: * an offset into the last fulltable.
639: */
640: void
641: rf_decluster_adjust_params(RF_RaidLayout_t *layoutPtr, RF_StripeNum_t *SUID,
642: RF_StripeCount_t *sus_per_fulltable, RF_StripeCount_t *fulltable_depth,
643: RF_StripeNum_t *base_suid)
644: {
645: RF_DeclusteredConfigInfo_t *info =
646: (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
647:
648: if (*SUID >= info->FullTableLimitSUID) {
649: /* New full table size is size of last full table on disk. */
650: *sus_per_fulltable =
651: info->ExtraTablesPerDisk * info->SUsPerTable;
652:
653: /* New full table depth is corresponding depth. */
654: *fulltable_depth =
655: info->ExtraTablesPerDisk * info->TableDepthInPUs *
656: layoutPtr->SUsPerPU;
657:
658: /* Set up the new base offset. */
659: *base_suid = info->DiskOffsetOfLastFullTableInSUs;
660:
661: /*
662: * Convert user's array address to an offset into the last
663: * fulltable.
664: */
665: *SUID -= info->FullTableLimitSUID;
666: }
667: }
668:
669: /*
670: * Map a stripe ID to a parity stripe ID.
671: * See comment above RaidAddressToParityStripeID in layout.c.
672: */
673: void
674: rf_MapSIDToPSIDDeclustered(RF_RaidLayout_t *layoutPtr, RF_StripeNum_t stripeID,
675: RF_StripeNum_t *psID, RF_ReconUnitNum_t *which_ru)
676: {
677: RF_DeclusteredConfigInfo_t *info;
678:
679: info = (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
680:
681: *psID = (stripeID / (layoutPtr->SUsPerPU * info->BlocksPerTable)) *
682: info->BlocksPerTable + (stripeID % info->BlocksPerTable);
683: *which_ru = (stripeID % (info->BlocksPerTable * layoutPtr->SUsPerPU)) /
684: info->BlocksPerTable;
685: RF_ASSERT((*which_ru) < layoutPtr->SUsPerPU / layoutPtr->SUsPerRU);
686: }
687:
/*
 * Called from MapSector and MapParity to retarget an access at the spare unit.
 * Modifies the "col" and "outSU" parameters only.
 *
 * FullTableID/TableID/BlockID identify the failed unit within the layout;
 * base_suid is nonzero iff rf_decluster_adjust_params() relocated the access
 * into the last (partial) fulltable; SpareRegion selects whose spare space
 * is used. The spare disk and offset come from the SpareTable installed via
 * rf_SetSpareTable()/rf_InstallSpareTable().
 */
void
rf_remap_to_spare_space(RF_RaidLayout_t *layoutPtr,
    RF_DeclusteredConfigInfo_t *info, RF_RowCol_t row,
    RF_StripeNum_t FullTableID, RF_StripeNum_t TableID, RF_SectorNum_t BlockID,
    RF_StripeNum_t base_suid, RF_StripeNum_t SpareRegion, RF_RowCol_t *outCol,
    RF_StripeNum_t *outSU)
{
	RF_StripeNum_t ftID, spareTableStartSU, TableInSpareRegion,
	    lastSROffset, which_ft;

	/*
	 * Note that FullTableID and hence SpareRegion may have gotten
	 * tweaked by rf_decluster_adjust_params. We detect this by
	 * noticing that base_suid is not 0.
	 */
	if (base_suid == 0) {
		ftID = FullTableID;
	} else {
		/*
		 * There may be > 1.0 full tables in the last (i.e. partial)
		 * spare region. Find out which of these we are in.
		 */
		lastSROffset = info->NumCompleteSRs *
		    info->SpareRegionDepthInSUs;
		which_ft =
		    (info->DiskOffsetOfLastFullTableInSUs - lastSROffset) /
		    (info->FullTableDepthInPUs * layoutPtr->SUsPerPU);

		/* Compute the actual full table ID. */
		ftID = info->DiskOffsetOfLastFullTableInSUs /
		    (info->FullTableDepthInPUs * layoutPtr->SUsPerPU) +
		    which_ft;
		SpareRegion = info->NumCompleteSRs;
	}
	TableInSpareRegion = (ftID * info->NumParityReps + TableID) %
	    info->TablesPerSpareRegion;

	/* The spare table tells us which disk holds the spare unit. */
	*outCol = info->SpareTable[TableInSpareRegion][BlockID].spareDisk;
	RF_ASSERT(*outCol != -1);

	/*
	 * The last (partial) spare region's spare space sits after the
	 * extra tables; complete regions keep it at the region's tail.
	 */
	spareTableStartSU = (SpareRegion == info->NumCompleteSRs) ?
	    info->DiskOffsetOfLastFullTableInSUs + info->ExtraTablesPerDisk *
	    info->TableDepthInPUs * layoutPtr->SUsPerPU :
	    (SpareRegion + 1) * info->SpareRegionDepthInSUs -
	    info->SpareSpaceDepthPerRegionInSUs;
	*outSU = spareTableStartSU +
	    info->SpareTable[TableInSpareRegion][BlockID].spareBlockOffsetInSUs;
	if (*outSU >= layoutPtr->stripeUnitsPerDisk) {
		printf("rf_remap_to_spare_space: invalid remapped disk SU"
		    " offset %ld.\n", (long) *outSU);
	}
}
744:
/*
 * Ask the user-level daemon (via rf_GetSpareTableFromDaemon, in
 * rf_kintf.c) to compute the spare table for failed disk "fcol".
 * The request carries the layout geometry the daemon needs.
 *
 * NOTE(review): "frow" is accepted but never used here -- confirm whether
 * it is needed. The RF_Malloc result is not NULL-checked, and "req" is
 * not freed in this function; presumably ownership passes to the daemon
 * path -- verify against rf_GetSpareTableFromDaemon().
 */
int
rf_InstallSpareTable(RF_Raid_t *raidPtr, RF_RowCol_t frow, RF_RowCol_t fcol)
{
	RF_DeclusteredConfigInfo_t *info =
	    (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
	RF_SparetWait_t *req;
	int retcode;

	RF_Malloc(req, sizeof(*req), (RF_SparetWait_t *));
	req->C = raidPtr->numCol;
	req->G = raidPtr->Layout.numDataCol + raidPtr->Layout.numParityCol;
	req->fcol = fcol;
	req->SUsPerPU = raidPtr->Layout.SUsPerPU;
	req->TablesPerSpareRegion = info->TablesPerSpareRegion;
	req->BlocksPerTable = info->BlocksPerTable;
	req->TableDepthInPUs = info->TableDepthInPUs;
	req->SpareSpaceDepthPerRegionInSUs =
	    info->SpareSpaceDepthPerRegionInSUs;

	retcode = rf_GetSpareTableFromDaemon(req);
	RF_ASSERT(!retcode);
	/* XXX -- Fix this to recover gracefully. -- XXX */

	return (retcode);
}
770:
771: /*
772: * Invoked via ioctl to install a spare table in the kernel.
773: */
774: int
775: rf_SetSpareTable(RF_Raid_t *raidPtr, void *data)
776: {
777: RF_DeclusteredConfigInfo_t *info =
778: (RF_DeclusteredConfigInfo_t *) raidPtr->Layout.layoutSpecificInfo;
779: RF_SpareTableEntry_t **ptrs;
780: int i, retcode;
781:
782: /*
783: * What we need to copyin is a 2-d array, so first copyin the user
784: * pointers to the rows in the table.
785: */
786: RF_Malloc(ptrs, info->TablesPerSpareRegion *
787: sizeof(RF_SpareTableEntry_t *), (RF_SpareTableEntry_t **));
788: retcode = copyin((caddr_t) data, (caddr_t) ptrs,
789: info->TablesPerSpareRegion * sizeof(RF_SpareTableEntry_t *));
790:
791: if (retcode)
792: return (retcode);
793:
794: /* Now allocate kernel space for the row pointers. */
795: RF_Malloc(info->SpareTable, info->TablesPerSpareRegion *
796: sizeof(RF_SpareTableEntry_t *), (RF_SpareTableEntry_t **));
797:
798: /*
799: * Now allocate kernel space for each row in the table, and copy it in
800: * from user space. */
801: for (i = 0; i < info->TablesPerSpareRegion; i++) {
802: RF_Malloc(info->SpareTable[i], info->BlocksPerTable *
803: sizeof(RF_SpareTableEntry_t), (RF_SpareTableEntry_t *));
804: retcode = copyin(ptrs[i], info->SpareTable[i],
805: info->BlocksPerTable * sizeof(RF_SpareTableEntry_t));
806: if (retcode) {
807: /* Blow off the memory we have allocated. */
808: info->SpareTable = NULL;
809: return (retcode);
810: }
811: }
812:
813: /* Free up the temporary array we used. */
814: RF_Free(ptrs, info->TablesPerSpareRegion *
815: sizeof(RF_SpareTableEntry_t *));
816:
817: return (0);
818: }
819:
820: RF_ReconUnitCount_t
821: rf_GetNumSpareRUsDeclustered(RF_Raid_t *raidPtr)
822: {
823: RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
824:
825: return (((RF_DeclusteredConfigInfo_t *)
826: layoutPtr->layoutSpecificInfo)->TotSparePUsPerDisk);
827: }
828:
829:
830: void
831: rf_FreeSpareTable(RF_Raid_t *raidPtr)
832: {
833: long i;
834: RF_RaidLayout_t *layoutPtr = &raidPtr->Layout;
835: RF_DeclusteredConfigInfo_t *info =
836: (RF_DeclusteredConfigInfo_t *) layoutPtr->layoutSpecificInfo;
837: RF_SpareTableEntry_t **table = info->SpareTable;
838:
839: for (i = 0; i < info->TablesPerSpareRegion; i++) {
840: RF_Free(table[i], info->BlocksPerTable *
841: sizeof(RF_SpareTableEntry_t));
842: }
843: RF_Free(table, info->TablesPerSpareRegion *
844: sizeof(RF_SpareTableEntry_t *));
845: info->SpareTable = (RF_SpareTableEntry_t **) NULL;
846: }
/* CVSweb page footer (extraction artifact) */