author | 1999-02-16 00:02:21 +0000
---|---
committer | 1999-02-16 00:02:21 +0000
commit | 107d1fd7ea8c7c9cb6d214c802eb1b5fac9fcf43 (patch)
tree | 098b83104ca2c705e48751ac59cd919aa52dc650 /sys/dev/raidframe/rf_reconstruct.c
parent | Add a RaidFrame bucket type (diff)
download | wireguard-openbsd-107d1fd7ea8c7c9cb6d214c802eb1b5fac9fcf43.tar.xz, wireguard-openbsd-107d1fd7ea8c7c9cb6d214c802eb1b5fac9fcf43.zip
Merge from NetBSD, mostly indentation
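As the commit message says, the merge is mostly indentation: the hunks below reflow the file into NetBSD's KNF layout (return type on its own line, old-style parameter declarations below the function name, one tab per indent level, parenthesized return values). The following is a minimal, self-contained sketch of that layout for readers unfamiliar with the style; the type and function names are invented for illustration and are not part of RAIDframe.

```c
/* Stand-in types for illustration only; not RAIDframe code. */
#include <stdio.h>

typedef int RowCol_t;		/* stand-in for RF_RowCol_t */

static int example_next_unit(RowCol_t, RowCol_t);

/*
 * KNF layout as applied by the merge: return type on its own line,
 * K&R-style parameter declarations, tab indentation, and parenthesized
 * return values.
 */
static int
example_next_unit(row, col)
	RowCol_t row;
	RowCol_t col;
{
	if (row < 0 || col < 0)
		return (-1);
	return (row * 8 + col);
}

int
main(void)
{
	printf("%d\n", example_next_unit(1, 2));
	return (0);
}
```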
Diffstat (limited to 'sys/dev/raidframe/rf_reconstruct.c')
-rw-r--r-- | sys/dev/raidframe/rf_reconstruct.c | 2454 |
1 file changed, 1129 insertions, 1325 deletions
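Among the code the diff below reindents is the reconstruction-completion callback machinery (SignalReconDone() and rf_RegisterReconDoneProc()): callers register (proc, arg) pairs on a mutex-protected singly linked list, and every entry is invoked when reconstruction finishes. Here is a rough, self-contained sketch of that pattern, with malloc(3) and pthreads standing in for RAIDframe's freelist allocator and RF_LOCK_MUTEX()/RF_UNLOCK_MUTEX(); the struct and function names are simplified stand-ins, not the RAIDframe API.

```c
/* Simplified stand-ins; not the RAIDframe API. */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct array;					/* stand-in for RF_Raid_t */

struct recon_done_proc {
	void (*proc)(struct array *, void *);	/* completion callback */
	void *arg;				/* opaque argument for proc */
	struct recon_done_proc *next;
};

static struct recon_done_proc *done_procs;	/* registered callbacks */
static pthread_mutex_t done_mtx = PTHREAD_MUTEX_INITIALIZER;

/* Register a callback to run when reconstruction completes. */
static int
register_recon_done(void (*proc)(struct array *, void *), void *arg)
{
	struct recon_done_proc *p;

	p = malloc(sizeof(*p));
	if (p == NULL)
		return (ENOMEM);
	p->proc = proc;
	p->arg = arg;
	pthread_mutex_lock(&done_mtx);
	p->next = done_procs;			/* push onto the list head */
	done_procs = p;
	pthread_mutex_unlock(&done_mtx);
	return (0);
}

/* Invoke every registered callback, holding the list lock while walking. */
static void
signal_recon_done(struct array *raid)
{
	struct recon_done_proc *p;

	pthread_mutex_lock(&done_mtx);
	for (p = done_procs; p != NULL; p = p->next)
		p->proc(raid, p->arg);
	pthread_mutex_unlock(&done_mtx);
}

static void
report_done(struct array *raid, void *arg)
{
	(void)raid;
	printf("reconstruction finished: %s\n", (const char *)arg);
}

int
main(void)
{
	if (register_recon_done(report_done, "spare rebuilt") != 0)
		return (1);
	signal_recon_done(NULL);		/* fire the callbacks */
	return (0);
}
```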
diff --git a/sys/dev/raidframe/rf_reconstruct.c b/sys/dev/raidframe/rf_reconstruct.c index 7df351a7ec0..a3f7085241f 100644 --- a/sys/dev/raidframe/rf_reconstruct.c +++ b/sys/dev/raidframe/rf_reconstruct.c @@ -1,5 +1,5 @@ -/* $OpenBSD: rf_reconstruct.c,v 1.1 1999/01/11 14:29:46 niklas Exp $ */ -/* $NetBSD: rf_reconstruct.c,v 1.1 1998/11/13 04:20:33 oster Exp $ */ +/* $OpenBSD: rf_reconstruct.c,v 1.2 1999/02/16 00:03:22 niklas Exp $ */ +/* $NetBSD: rf_reconstruct.c,v 1.4 1999/02/05 00:06:16 oster Exp $ */ /* * Copyright (c) 1995 Carnegie-Mellon University. * All rights reserved. @@ -33,194 +33,9 @@ * ************************************************************/ -/* - * : - * Log: rf_reconstruct.c,v - * Revision 1.65 1996/08/06 22:24:56 jimz - * get rid of sys/buf.h on linux - * - * Revision 1.64 1996/07/30 04:28:53 jimz - * include rf_types.h first - * - * Revision 1.63 1996/07/27 23:36:08 jimz - * Solaris port of simulator - * - * Revision 1.62 1996/07/17 21:00:58 jimz - * clean up timer interface, tracing - * - * Revision 1.61 1996/07/15 05:40:41 jimz - * some recon datastructure cleanup - * better handling of multiple failures - * added undocumented double-recon test - * - * Revision 1.60 1996/07/15 02:57:18 jimz - * added debugging (peek at first couple bytes of recon buffers - * as they go by) - * - * Revision 1.59 1996/07/13 00:00:59 jimz - * sanitized generalized reconstruction architecture - * cleaned up head sep, rbuf problems - * - * Revision 1.58 1996/07/11 19:08:00 jimz - * generalize reconstruction mechanism - * allow raid1 reconstructs via copyback (done with array - * quiesced, not online, therefore not disk-directed) - * - * Revision 1.57 1996/06/17 14:38:33 jimz - * properly #if out RF_DEMO code - * fix bug in MakeConfig that was causing weird behavior - * in configuration routines (config was not zeroed at start) - * clean up genplot handling of stacks - * - * Revision 1.56 1996/06/17 03:24:59 jimz - * include shutdown.h for define of now-macroized ShutdownCreate - * - * Revision 1.55 1996/06/11 10:58:36 jimz - * get rid of simulator-testcode artifacts - * add generic ReconDoneProc mechanism instead - * - * Revision 1.54 1996/06/10 14:18:58 jimz - * move user, throughput stats into per-array structure - * - * Revision 1.53 1996/06/10 11:55:47 jimz - * Straightened out some per-array/not-per-array distinctions, fixed - * a couple bugs related to confusion. Added shutdown lists. Removed - * layout shutdown function (now subsumed by shutdown lists). - * - * Revision 1.52 1996/06/09 02:36:46 jimz - * lots of little crufty cleanup- fixup whitespace - * issues, comment #ifdefs, improve typing in some - * places (esp size-related) - * - * Revision 1.51 1996/06/07 22:26:27 jimz - * type-ify which_ru (RF_ReconUnitNum_t) - * - * Revision 1.50 1996/06/07 21:33:04 jimz - * begin using consistent types for sector numbers, - * stripe numbers, row+col numbers, recon unit numbers - * - * Revision 1.49 1996/06/06 01:24:36 jimz - * don't get rid of reconCtrlPtr until we're done with it - * - * Revision 1.48 1996/06/05 18:06:02 jimz - * Major code cleanup. The Great Renaming is now done. - * Better modularity. Better typing. Fixed a bunch of - * synchronization bugs. Made a lot of global stuff - * per-desc or per-array. Removed dead code. 
- * - * Revision 1.47 1996/06/03 23:28:26 jimz - * more bugfixes - * check in tree to sync for IPDS runs with current bugfixes - * there still may be a problem with threads in the script test - * getting I/Os stuck- not trivially reproducible (runs ~50 times - * in a row without getting stuck) - * - * Revision 1.46 1996/06/02 17:31:48 jimz - * Moved a lot of global stuff into array structure, where it belongs. - * Fixed up paritylogging, pss modules in this manner. Some general - * code cleanup. Removed lots of dead code, some dead files. - * - * Revision 1.45 1996/05/31 22:26:54 jimz - * fix a lot of mapping problems, memory allocation problems - * found some weird lock issues, fixed 'em - * more code cleanup - * - * Revision 1.44 1996/05/30 23:22:16 jimz - * bugfixes of serialization, timing problems - * more cleanup - * - * Revision 1.43 1996/05/30 11:29:41 jimz - * Numerous bug fixes. Stripe lock release code disagreed with the taking code - * about when stripes should be locked (I made it consistent: no parity, no lock) - * There was a lot of extra serialization of I/Os which I've removed- a lot of - * it was to calculate values for the cache code, which is no longer with us. - * More types, function, macro cleanup. Added code to properly quiesce the array - * on shutdown. Made a lot of stuff array-specific which was (bogusly) general - * before. Fixed memory allocation, freeing bugs. - * - * Revision 1.42 1996/05/27 18:56:37 jimz - * more code cleanup - * better typing - * compiles in all 3 environments - * - * Revision 1.41 1996/05/24 22:17:04 jimz - * continue code + namespace cleanup - * typed a bunch of flags - * - * Revision 1.40 1996/05/24 04:40:40 jimz - * don't do demoMode stuff in kernel - * - * Revision 1.39 1996/05/24 01:59:45 jimz - * another checkpoint in code cleanup for release - * time to sync kernel tree - * - * Revision 1.38 1996/05/23 21:46:35 jimz - * checkpoint in code cleanup (release prep) - * lots of types, function names have been fixed - * - * Revision 1.37 1996/05/23 00:33:23 jimz - * code cleanup: move all debug decls to rf_options.c, all extern - * debug decls to rf_options.h, all debug vars preceded by rf_ - * - * Revision 1.36 1996/05/18 19:51:34 jimz - * major code cleanup- fix syntax, make some types consistent, - * add prototypes, clean out dead code, et cetera - * - * Revision 1.35 1996/05/01 16:28:16 jimz - * don't include ccmn.h - * - * Revision 1.34 1995/12/12 18:10:06 jimz - * MIN -> RF_MIN, MAX -> RF_MAX, ASSERT -> RF_ASSERT - * fix 80-column brain damage in comments - * - * Revision 1.33 1995/12/06 15:05:09 root - * added copyright info - * - * Revision 1.32 1995/11/17 19:04:11 wvcii - * added prototyping to ComputePSDiskOffsets - * prow and pcol now type int (were u_int) - * - * Revision 1.31 1995/11/17 01:39:35 amiri - * isolated some demo related stuff - * - * Revision 1.30 1995/10/18 19:33:14 amiri - * removed fflush (stdin/stdout) calls from ReconstructFailedDisk - * - * Revision 1.29 1995/10/11 10:20:33 jimz - * #if 0'd problem code for sigmetrics - * - * Revision 1.28 1995/10/10 23:18:15 amiri - * added fflushes to stdin/stdout before requesting - * input in demo mode. - * - * Revision 1.27 1995/10/10 19:24:47 amiri - * took out update_mode (for demo) from - * KERNEL source. - * - * Revision 1.26 1995/10/09 23:35:48 amiri - * added support for more meters in recon. 
demo - * - * Revision 1.25 1995/07/03 18:14:30 holland - * changed the way the number of floating recon bufs & - * the head sep limit get set - * - * Revision 1.24 1995/07/02 15:07:42 holland - * bug fixes related to getting distributed sparing numbers - * - * Revision 1.23 1995/06/23 13:36:36 robby - * updeated to prototypes in rf_layout.h - * -*/ - -#ifdef _KERNEL -#define KERNEL -#endif - #include "rf_types.h" #include <sys/time.h> -#ifndef LINUX #include <sys/buf.h> -#endif /* !LINUX */ #include <sys/errno.h> #include "rf_raid.h" #include "rf_reconutil.h" @@ -240,13 +55,7 @@ #include "rf_shutdown.h" #include "rf_sys.h" -#if RF_DEMO > 0 -#include "rf_demo.h" -#endif /* RF_DEMO > 0 */ - -#ifdef KERNEL #include "rf_kintf.h" -#endif /* KERNEL */ /* setting these to -1 causes them to be set to their default values if not set by debug options */ @@ -269,88 +78,94 @@ #define DDprintf7(s,a,b,c,d,e,f,g) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),(void *)((unsigned long)g),NULL) #define DDprintf8(s,a,b,c,d,e,f,g,h) if (rf_reconDebug) rf_debug_printf(s,(void *)((unsigned long)a),(void *)((unsigned long)b),(void *)((unsigned long)c),(void *)((unsigned long)d),(void *)((unsigned long)e),(void *)((unsigned long)f),(void *)((unsigned long)g),(void *)((unsigned long)h)) -#ifdef KERNEL -static RF_Thread_t recon_thr_handle; -static int recon_thread_initialized = 0; -#endif /* KERNEL */ +static RF_Thread_t recon_thr_handle; +static int recon_thread_initialized = 0; static RF_FreeList_t *rf_recond_freelist; #define RF_MAX_FREE_RECOND 4 #define RF_RECOND_INC 1 -static RF_RaidReconDesc_t *AllocRaidReconDesc(RF_Raid_t *raidPtr, - RF_RowCol_t row, RF_RowCol_t col, RF_RaidDisk_t *spareDiskPtr, - int numDisksDone, RF_RowCol_t srow, RF_RowCol_t scol); -static void FreeReconDesc(RF_RaidReconDesc_t *reconDesc); -static int ProcessReconEvent(RF_Raid_t *raidPtr, RF_RowCol_t frow, - RF_ReconEvent_t *event); -static int IssueNextReadRequest(RF_Raid_t *raidPtr, RF_RowCol_t row, - RF_RowCol_t col); -static int TryToRead(RF_Raid_t *raidPtr, RF_RowCol_t row, RF_RowCol_t col); -static int ComputePSDiskOffsets(RF_Raid_t *raidPtr, RF_StripeNum_t psid, - RF_RowCol_t row, RF_RowCol_t col, RF_SectorNum_t *outDiskOffset, - RF_SectorNum_t *outFailedDiskSectorOffset, RF_RowCol_t *spRow, - RF_RowCol_t *spCol, RF_SectorNum_t *spOffset); -static int IssueNextWriteRequest(RF_Raid_t *raidPtr, RF_RowCol_t row); +static RF_RaidReconDesc_t * +AllocRaidReconDesc(RF_Raid_t * raidPtr, + RF_RowCol_t row, RF_RowCol_t col, RF_RaidDisk_t * spareDiskPtr, + int numDisksDone, RF_RowCol_t srow, RF_RowCol_t scol); +static void FreeReconDesc(RF_RaidReconDesc_t * reconDesc); +static int +ProcessReconEvent(RF_Raid_t * raidPtr, RF_RowCol_t frow, + RF_ReconEvent_t * event); +static int +IssueNextReadRequest(RF_Raid_t * raidPtr, RF_RowCol_t row, + RF_RowCol_t col); +static int TryToRead(RF_Raid_t * raidPtr, RF_RowCol_t row, RF_RowCol_t col); +static int +ComputePSDiskOffsets(RF_Raid_t * raidPtr, RF_StripeNum_t psid, + RF_RowCol_t row, RF_RowCol_t col, RF_SectorNum_t * outDiskOffset, + RF_SectorNum_t * outFailedDiskSectorOffset, RF_RowCol_t * spRow, + RF_RowCol_t * spCol, RF_SectorNum_t * spOffset); +static int IssueNextWriteRequest(RF_Raid_t * raidPtr, RF_RowCol_t row); static int ReconReadDoneProc(void *arg, int status); static int ReconWriteDoneProc(void *arg, int status); -static void 
CheckForNewMinHeadSep(RF_Raid_t *raidPtr, RF_RowCol_t row, - RF_HeadSepLimit_t hsCtr); -static int CheckHeadSeparation(RF_Raid_t *raidPtr, RF_PerDiskReconCtrl_t *ctrl, - RF_RowCol_t row, RF_RowCol_t col, RF_HeadSepLimit_t hsCtr, - RF_ReconUnitNum_t which_ru); -static int CheckForcedOrBlockedReconstruction(RF_Raid_t *raidPtr, - RF_ReconParityStripeStatus_t *pssPtr, RF_PerDiskReconCtrl_t *ctrl, - RF_RowCol_t row, RF_RowCol_t col, RF_StripeNum_t psid, - RF_ReconUnitNum_t which_ru); +static void +CheckForNewMinHeadSep(RF_Raid_t * raidPtr, RF_RowCol_t row, + RF_HeadSepLimit_t hsCtr); +static int +CheckHeadSeparation(RF_Raid_t * raidPtr, RF_PerDiskReconCtrl_t * ctrl, + RF_RowCol_t row, RF_RowCol_t col, RF_HeadSepLimit_t hsCtr, + RF_ReconUnitNum_t which_ru); +static int +CheckForcedOrBlockedReconstruction(RF_Raid_t * raidPtr, + RF_ReconParityStripeStatus_t * pssPtr, RF_PerDiskReconCtrl_t * ctrl, + RF_RowCol_t row, RF_RowCol_t col, RF_StripeNum_t psid, + RF_ReconUnitNum_t which_ru); static void ForceReconReadDoneProc(void *arg, int status); static void rf_ShutdownReconstruction(void *); struct RF_ReconDoneProc_s { - void (*proc)(RF_Raid_t *, void *); - void *arg; - RF_ReconDoneProc_t *next; + void (*proc) (RF_Raid_t *, void *); + void *arg; + RF_ReconDoneProc_t *next; }; static RF_FreeList_t *rf_rdp_freelist; #define RF_MAX_FREE_RDP 4 #define RF_RDP_INC 1 -static void SignalReconDone(RF_Raid_t *raidPtr) +static void +SignalReconDone(RF_Raid_t * raidPtr) { - RF_ReconDoneProc_t *p; + RF_ReconDoneProc_t *p; - RF_LOCK_MUTEX(raidPtr->recon_done_proc_mutex); - for(p=raidPtr->recon_done_procs;p;p=p->next) { - p->proc(raidPtr, p->arg); - } - RF_UNLOCK_MUTEX(raidPtr->recon_done_proc_mutex); + RF_LOCK_MUTEX(raidPtr->recon_done_proc_mutex); + for (p = raidPtr->recon_done_procs; p; p = p->next) { + p->proc(raidPtr, p->arg); + } + RF_UNLOCK_MUTEX(raidPtr->recon_done_proc_mutex); } -int rf_RegisterReconDoneProc( - RF_Raid_t *raidPtr, - void (*proc)(RF_Raid_t *, void *), - void *arg, - RF_ReconDoneProc_t **handlep) +int +rf_RegisterReconDoneProc( + RF_Raid_t * raidPtr, + void (*proc) (RF_Raid_t *, void *), + void *arg, + RF_ReconDoneProc_t ** handlep) { - RF_ReconDoneProc_t *p; - - RF_FREELIST_GET(rf_rdp_freelist,p,next,(RF_ReconDoneProc_t *)); - if (p == NULL) - return(ENOMEM); - p->proc = proc; - p->arg = arg; - RF_LOCK_MUTEX(raidPtr->recon_done_proc_mutex); - p->next = raidPtr->recon_done_procs; - raidPtr->recon_done_procs = p; - RF_UNLOCK_MUTEX(raidPtr->recon_done_proc_mutex); - if (handlep) - *handlep = p; - return(0); + RF_ReconDoneProc_t *p; + + RF_FREELIST_GET(rf_rdp_freelist, p, next, (RF_ReconDoneProc_t *)); + if (p == NULL) + return (ENOMEM); + p->proc = proc; + p->arg = arg; + RF_LOCK_MUTEX(raidPtr->recon_done_proc_mutex); + p->next = raidPtr->recon_done_procs; + raidPtr->recon_done_procs = p; + RF_UNLOCK_MUTEX(raidPtr->recon_done_proc_mutex); + if (handlep) + *handlep = p; + return (0); } - /***************************************************************************************** * * sets up the parameters that will be used by the reconstruction process @@ -360,88 +175,86 @@ int rf_RegisterReconDoneProc( * in the kernel, we fire off the recon thread. 
* ****************************************************************************************/ -static void rf_ShutdownReconstruction(ignored) - void *ignored; +static void +rf_ShutdownReconstruction(ignored) + void *ignored; { - RF_FREELIST_DESTROY(rf_recond_freelist,next,(RF_RaidReconDesc_t *)); - RF_FREELIST_DESTROY(rf_rdp_freelist,next,(RF_ReconDoneProc_t *)); + RF_FREELIST_DESTROY(rf_recond_freelist, next, (RF_RaidReconDesc_t *)); + RF_FREELIST_DESTROY(rf_rdp_freelist, next, (RF_ReconDoneProc_t *)); } -int rf_ConfigureReconstruction(listp) - RF_ShutdownList_t **listp; +int +rf_ConfigureReconstruction(listp) + RF_ShutdownList_t **listp; { - int rc; - - RF_FREELIST_CREATE(rf_recond_freelist, RF_MAX_FREE_RECOND, - RF_RECOND_INC, sizeof(RF_RaidReconDesc_t)); - if (rf_recond_freelist == NULL) - return(ENOMEM); - RF_FREELIST_CREATE(rf_rdp_freelist, RF_MAX_FREE_RDP, - RF_RDP_INC, sizeof(RF_ReconDoneProc_t)); - if (rf_rdp_freelist == NULL) { - RF_FREELIST_DESTROY(rf_recond_freelist,next,(RF_RaidReconDesc_t *)); - return(ENOMEM); - } - rc = rf_ShutdownCreate(listp, rf_ShutdownReconstruction, NULL); - if (rc) { - RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", - __FILE__, __LINE__, rc); - rf_ShutdownReconstruction(NULL); - return(rc); - } - -#ifdef KERNEL - if (!recon_thread_initialized) { - RF_CREATE_THREAD(recon_thr_handle, rf_ReconKernelThread, NULL); - recon_thread_initialized = 1; - } -#endif /* KERNEL */ - - return(0); + int rc; + + RF_FREELIST_CREATE(rf_recond_freelist, RF_MAX_FREE_RECOND, + RF_RECOND_INC, sizeof(RF_RaidReconDesc_t)); + if (rf_recond_freelist == NULL) + return (ENOMEM); + RF_FREELIST_CREATE(rf_rdp_freelist, RF_MAX_FREE_RDP, + RF_RDP_INC, sizeof(RF_ReconDoneProc_t)); + if (rf_rdp_freelist == NULL) { + RF_FREELIST_DESTROY(rf_recond_freelist, next, (RF_RaidReconDesc_t *)); + return (ENOMEM); + } + rc = rf_ShutdownCreate(listp, rf_ShutdownReconstruction, NULL); + if (rc) { + RF_ERRORMSG3("Unable to add to shutdown list file %s line %d rc=%d\n", + __FILE__, __LINE__, rc); + rf_ShutdownReconstruction(NULL); + return (rc); + } + if (!recon_thread_initialized) { + RF_CREATE_THREAD(recon_thr_handle, rf_ReconKernelThread, NULL); + recon_thread_initialized = 1; + } + return (0); } -static RF_RaidReconDesc_t *AllocRaidReconDesc(raidPtr, row, col, spareDiskPtr, numDisksDone, srow, scol) - RF_Raid_t *raidPtr; - RF_RowCol_t row; - RF_RowCol_t col; - RF_RaidDisk_t *spareDiskPtr; - int numDisksDone; - RF_RowCol_t srow; - RF_RowCol_t scol; +static RF_RaidReconDesc_t * +AllocRaidReconDesc(raidPtr, row, col, spareDiskPtr, numDisksDone, srow, scol) + RF_Raid_t *raidPtr; + RF_RowCol_t row; + RF_RowCol_t col; + RF_RaidDisk_t *spareDiskPtr; + int numDisksDone; + RF_RowCol_t srow; + RF_RowCol_t scol; { - - RF_RaidReconDesc_t *reconDesc; - - RF_FREELIST_GET(rf_recond_freelist,reconDesc,next,(RF_RaidReconDesc_t *)); - - reconDesc->raidPtr = raidPtr; - reconDesc->row = row; - reconDesc->col = col; - reconDesc->spareDiskPtr=spareDiskPtr; - reconDesc->numDisksDone=numDisksDone; - reconDesc->srow=srow; - reconDesc->scol=scol; - reconDesc->state = 0; - reconDesc->next = NULL; - - return(reconDesc); + + RF_RaidReconDesc_t *reconDesc; + + RF_FREELIST_GET(rf_recond_freelist, reconDesc, next, (RF_RaidReconDesc_t *)); + + reconDesc->raidPtr = raidPtr; + reconDesc->row = row; + reconDesc->col = col; + reconDesc->spareDiskPtr = spareDiskPtr; + reconDesc->numDisksDone = numDisksDone; + reconDesc->srow = srow; + reconDesc->scol = scol; + reconDesc->state = 0; + reconDesc->next = NULL; + + 
return (reconDesc); } -static void FreeReconDesc(reconDesc) - RF_RaidReconDesc_t *reconDesc; +static void +FreeReconDesc(reconDesc) + RF_RaidReconDesc_t *reconDesc; { #if RF_RECON_STATS > 0 - printf("RAIDframe: %lu recon event waits, %lu recon delays\n", - (long)reconDesc->numReconEventWaits, (long)reconDesc->numReconExecDelays); -#endif /* RF_RECON_STATS > 0 */ -#ifdef KERNEL - printf("RAIDframe: %lu max exec ticks\n", - (long)reconDesc->maxReconExecTicks); -#endif /* KERNEL */ + printf("RAIDframe: %lu recon event waits, %lu recon delays\n", + (long) reconDesc->numReconEventWaits, (long) reconDesc->numReconExecDelays); +#endif /* RF_RECON_STATS > 0 */ + printf("RAIDframe: %lu max exec ticks\n", + (long) reconDesc->maxReconExecTicks); #if (RF_RECON_STATS > 0) || defined(KERNEL) - printf("\n"); -#endif /* (RF_RECON_STATS > 0) || KERNEL */ - RF_FREELIST_FREE(rf_recond_freelist,reconDesc,next); + printf("\n"); +#endif /* (RF_RECON_STATS > 0) || KERNEL */ + RF_FREELIST_FREE(rf_recond_freelist, reconDesc, next); } @@ -451,476 +264,402 @@ static void FreeReconDesc(reconDesc) * within its own thread. It won't return until reconstruction completes, * fails, or is aborted. ****************************************************************************************/ -int rf_ReconstructFailedDisk(raidPtr, row, col) - RF_Raid_t *raidPtr; - RF_RowCol_t row; - RF_RowCol_t col; +int +rf_ReconstructFailedDisk(raidPtr, row, col) + RF_Raid_t *raidPtr; + RF_RowCol_t row; + RF_RowCol_t col; { -#ifdef SIMULATE - RF_PendingRecon_t *pend; - RF_RowCol_t r, c; -#endif /* SIMULATE */ - RF_LayoutSW_t *lp; - int rc; - - lp = raidPtr->Layout.map; - if (lp->SubmitReconBuffer) { - /* - * The current infrastructure only supports reconstructing one - * disk at a time for each array. - */ -#ifdef SIMULATE - if (raidPtr->reconInProgress) { - RF_Malloc(pend, sizeof(RF_PendingRecon_t), (RF_PendingRecon_t *)); - pend->row = row; - pend->col = col; - pend->next = raidPtr->pendingRecon; - raidPtr->pendingRecon = pend; - /* defer until current recon completes */ - return(0); - } - raidPtr->reconInProgress++; -#else /* SIMULATE */ - RF_LOCK_MUTEX(raidPtr->mutex); - while (raidPtr->reconInProgress) { - RF_WAIT_COND(raidPtr->waitForReconCond, raidPtr->mutex); - } - raidPtr->reconInProgress++; - RF_UNLOCK_MUTEX(raidPtr->mutex); -#endif /* SIMULATE */ - rc = rf_ReconstructFailedDiskBasic(raidPtr, row, col); - } - else { - RF_ERRORMSG1("RECON: no way to reconstruct failed disk for arch %c\n", - lp->parityConfig); - rc = EIO; - } -#ifdef SIMULATE - pend = raidPtr->pendingRecon; - if (pend) { - /* launch next recon */ - raidPtr->pendingRecon = pend->next; - r = pend->row; - c = pend->col; - RF_Free(pend, sizeof(RF_PendingRecon_t)); - return(rf_ReconstructFailedDisk(raidPtr, r, c)); - } -#else /* SIMULATE */ - RF_LOCK_MUTEX(raidPtr->mutex); - raidPtr->reconInProgress--; - RF_UNLOCK_MUTEX(raidPtr->mutex); - RF_SIGNAL_COND(raidPtr->waitForReconCond); -#if 1 -#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) - wakeup(&raidPtr->waitForReconCond); /* XXX Methinks this will be needed - at some point... GO*/ -#endif -#endif -#endif /* SIMULATE */ - return(rc); + RF_LayoutSW_t *lp; + int rc; + + lp = raidPtr->Layout.map; + if (lp->SubmitReconBuffer) { + /* + * The current infrastructure only supports reconstructing one + * disk at a time for each array. 
+ */ + RF_LOCK_MUTEX(raidPtr->mutex); + while (raidPtr->reconInProgress) { + RF_WAIT_COND(raidPtr->waitForReconCond, raidPtr->mutex); + } + raidPtr->reconInProgress++; + RF_UNLOCK_MUTEX(raidPtr->mutex); + rc = rf_ReconstructFailedDiskBasic(raidPtr, row, col); + } else { + RF_ERRORMSG1("RECON: no way to reconstruct failed disk for arch %c\n", + lp->parityConfig); + rc = EIO; + } + RF_LOCK_MUTEX(raidPtr->mutex); + raidPtr->reconInProgress--; + RF_UNLOCK_MUTEX(raidPtr->mutex); + RF_SIGNAL_COND(raidPtr->waitForReconCond); + wakeup(&raidPtr->waitForReconCond); /* XXX Methinks this will be + * needed at some point... GO */ + return (rc); } -int rf_ReconstructFailedDiskBasic(raidPtr, row, col) - RF_Raid_t *raidPtr; - RF_RowCol_t row; - RF_RowCol_t col; +int +rf_ReconstructFailedDiskBasic(raidPtr, row, col) + RF_Raid_t *raidPtr; + RF_RowCol_t row; + RF_RowCol_t col; { - RF_RaidDisk_t *spareDiskPtr = NULL; - RF_RaidReconDesc_t *reconDesc; - RF_RowCol_t srow, scol; - int numDisksDone=0, rc; - - /* first look for a spare drive onto which to reconstruct the data */ - /* spare disk descriptors are stored in row 0. This may have to change eventually */ - - RF_LOCK_MUTEX(raidPtr->mutex); - RF_ASSERT (raidPtr->Disks[row][col].status == rf_ds_failed); - - if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) { - if (raidPtr->status[row] != rf_rs_degraded) { - RF_ERRORMSG2("Unable to reconstruct disk at row %d col %d because status not degraded\n",row,col); - RF_UNLOCK_MUTEX(raidPtr->mutex); - return(EINVAL); - } - srow = row; - scol = (-1); - } - else { - srow = 0; - for (scol=raidPtr->numCol; scol<raidPtr->numCol + raidPtr->numSpare; scol++) { - if (raidPtr->Disks[srow][scol].status == rf_ds_spare) { - spareDiskPtr = &raidPtr->Disks[srow][scol]; - spareDiskPtr->status = rf_ds_used_spare; - break; - } - } - if (!spareDiskPtr) { - RF_ERRORMSG2("Unable to reconstruct disk at row %d col %d because no spares are available\n",row,col); - RF_UNLOCK_MUTEX(raidPtr->mutex); - return(ENOSPC); - } - -#if RF_DEMO > 0 - if (!rf_demoMode) { -#endif /* RF_DEMO > 0 */ - printf("RECON: initiating reconstruction on row %d col %d -> spare at row %d col %d\n",row, col, srow, scol); -#if RF_DEMO > 0 - } -#endif /* RF_DEMO > 0 */ - } - RF_UNLOCK_MUTEX(raidPtr->mutex); - - reconDesc = AllocRaidReconDesc((void *) raidPtr, row, col,spareDiskPtr, numDisksDone, srow , scol); - raidPtr->reconDesc = (void *) reconDesc; + RF_RaidDisk_t *spareDiskPtr = NULL; + RF_RaidReconDesc_t *reconDesc; + RF_RowCol_t srow, scol; + int numDisksDone = 0, rc; + + /* first look for a spare drive onto which to reconstruct the data */ + /* spare disk descriptors are stored in row 0. 
This may have to + * change eventually */ + + RF_LOCK_MUTEX(raidPtr->mutex); + RF_ASSERT(raidPtr->Disks[row][col].status == rf_ds_failed); + + if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) { + if (raidPtr->status[row] != rf_rs_degraded) { + RF_ERRORMSG2("Unable to reconstruct disk at row %d col %d because status not degraded\n", row, col); + RF_UNLOCK_MUTEX(raidPtr->mutex); + return (EINVAL); + } + srow = row; + scol = (-1); + } else { + srow = 0; + for (scol = raidPtr->numCol; scol < raidPtr->numCol + raidPtr->numSpare; scol++) { + if (raidPtr->Disks[srow][scol].status == rf_ds_spare) { + spareDiskPtr = &raidPtr->Disks[srow][scol]; + spareDiskPtr->status = rf_ds_used_spare; + break; + } + } + if (!spareDiskPtr) { + RF_ERRORMSG2("Unable to reconstruct disk at row %d col %d because no spares are available\n", row, col); + RF_UNLOCK_MUTEX(raidPtr->mutex); + return (ENOSPC); + } + printf("RECON: initiating reconstruction on row %d col %d -> spare at row %d col %d\n", row, col, srow, scol); + } + RF_UNLOCK_MUTEX(raidPtr->mutex); + + reconDesc = AllocRaidReconDesc((void *) raidPtr, row, col, spareDiskPtr, numDisksDone, srow, scol); + raidPtr->reconDesc = (void *) reconDesc; #if RF_RECON_STATS > 0 - reconDesc->hsStallCount = 0; - reconDesc->numReconExecDelays = 0; - reconDesc->numReconEventWaits = 0; -#endif /* RF_RECON_STATS > 0 */ -#ifdef KERNEL - reconDesc->reconExecTimerRunning = 0; - reconDesc->reconExecTicks = 0; - reconDesc->maxReconExecTicks = 0; -#endif /* KERNEL */ -#if RF_DEMO > 0 && !defined(SIMULATE) - if (rf_demoMode) { - char cbuf[10]; - printf("About to start reconstruction, hit return to continue:"); - gets(cbuf); - } -#endif /* RF_DEMO > 0 && !SIMULATE */ - rc = rf_ContinueReconstructFailedDisk(reconDesc); - return(rc); + reconDesc->hsStallCount = 0; + reconDesc->numReconExecDelays = 0; + reconDesc->numReconEventWaits = 0; +#endif /* RF_RECON_STATS > 0 */ + reconDesc->reconExecTimerRunning = 0; + reconDesc->reconExecTicks = 0; + reconDesc->maxReconExecTicks = 0; + rc = rf_ContinueReconstructFailedDisk(reconDesc); + return (rc); } -int rf_ContinueReconstructFailedDisk(reconDesc) - RF_RaidReconDesc_t *reconDesc; +int +rf_ContinueReconstructFailedDisk(reconDesc) + RF_RaidReconDesc_t *reconDesc; { - RF_Raid_t *raidPtr=reconDesc->raidPtr; - RF_RowCol_t row=reconDesc->row; - RF_RowCol_t col=reconDesc->col; - RF_RowCol_t srow=reconDesc->srow; - RF_RowCol_t scol=reconDesc->scol; - RF_ReconMap_t *mapPtr; - - RF_ReconEvent_t *event; - struct timeval etime, elpsd; - unsigned long xor_s, xor_resid_us; - int retcode,i, ds; - - switch (reconDesc->state) - { - - - case 0: - - raidPtr->accumXorTimeUs = 0; - - /* create one trace record per physical disk */ - RF_Malloc(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t), (RF_AccTraceEntry_t *)); - - /* quiesce the array prior to starting recon. this is needed to assure no nasty interactions - * with pending user writes. We need to do this before we change the disk or row status. 
- */ - reconDesc->state=1; - - Dprintf("RECON: begin request suspend\n"); - retcode = rf_SuspendNewRequestsAndWait(raidPtr); - Dprintf("RECON: end request suspend\n"); - rf_StartUserStats(raidPtr); /* zero out the stats kept on user accs */ - -#ifdef SIMULATE - if (retcode) return(0); -#endif /* SIMULATE */ - - /* fall through to state 1 */ - - case 1: - - RF_LOCK_MUTEX(raidPtr->mutex); - - /* create the reconstruction control pointer and install it in the right slot */ - raidPtr->reconControl[row] = rf_MakeReconControl(reconDesc, row, col, srow, scol); - mapPtr=raidPtr->reconControl[row]->reconMap; - raidPtr->status[row] = rf_rs_reconstructing; - raidPtr->Disks[row][col].status = rf_ds_reconstructing; - raidPtr->Disks[row][col].spareRow = srow; - raidPtr->Disks[row][col].spareCol = scol; - - RF_UNLOCK_MUTEX(raidPtr->mutex); - - RF_GETTIME(raidPtr->reconControl[row]->starttime); -#if RF_DEMO > 0 - if (rf_demoMode) { - rf_demo_update_mode(RF_DEMO_RECON); - rf_startup_recon_demo(rf_demoMeterVpos, raidPtr->numCol, - raidPtr->Layout.numDataCol+raidPtr->Layout.numParityCol, 0); - } -#endif /* RF_DEMO > 0 */ - - /* now start up the actual reconstruction: issue a read for each surviving disk */ - rf_start_cpu_monitor(); - reconDesc->numDisksDone = 0; - for (i=0; i<raidPtr->numCol; i++) { - if (i != col) { - /* find and issue the next I/O on the indicated disk */ - if (IssueNextReadRequest(raidPtr, row, i)) { - Dprintf2("RECON: done issuing for r%d c%d\n", row, i); - reconDesc->numDisksDone++; - } - } - } - - case 2: - Dprintf("RECON: resume requests\n"); - rf_ResumeNewRequests(raidPtr); - - - reconDesc->state=3; - - case 3: - - /* process reconstruction events until all disks report that they've completed all work */ - mapPtr=raidPtr->reconControl[row]->reconMap; - - - - while (reconDesc->numDisksDone < raidPtr->numCol-1) { - - event = rf_GetNextReconEvent(reconDesc, row, (void (*)(void *))rf_ContinueReconstructFailedDisk,reconDesc); -#ifdef SIMULATE - if (event==NULL) {return(0);} -#else /* SIMULATE */ - RF_ASSERT(event); -#endif /* SIMULATE */ - - if (ProcessReconEvent(raidPtr, row, event)) reconDesc->numDisksDone++; - raidPtr->reconControl[row]->percentComplete = 100 - (rf_UnitsLeftToReconstruct(mapPtr) * 100 / mapPtr->totalRUs); -#if RF_DEMO > 0 - if (rf_prReconSched || rf_demoMode) -#else /* RF_DEMO > 0 */ - if (rf_prReconSched) -#endif /* RF_DEMO > 0 */ - { - rf_PrintReconSchedule(raidPtr->reconControl[row]->reconMap, &(raidPtr->reconControl[row]->starttime)); - } - } - - - - reconDesc->state=4; - - - case 4: - mapPtr=raidPtr->reconControl[row]->reconMap; - if (rf_reconDebug) { - printf("RECON: all reads completed\n"); - } - - - - /* at this point all the reads have completed. 
We now wait for any pending writes - * to complete, and then we're done - */ - - while (rf_UnitsLeftToReconstruct(raidPtr->reconControl[row]->reconMap) > 0) { - - event = rf_GetNextReconEvent(reconDesc, row, (void (*)(void *))rf_ContinueReconstructFailedDisk,reconDesc); -#ifdef SIMULATE - if (event==NULL) {return(0);} -#else /* SIMULATE */ - RF_ASSERT(event); -#endif /* SIMULATE */ - - (void) ProcessReconEvent(raidPtr, row, event); /* ignore return code */ - raidPtr->reconControl[row]->percentComplete = 100 - (rf_UnitsLeftToReconstruct(mapPtr) * 100 / mapPtr->totalRUs); -#if RF_DEMO > 0 - if (rf_prReconSched || rf_demoMode) -#else /* RF_DEMO > 0 */ - if (rf_prReconSched) -#endif /* RF_DEMO > 0 */ - { - rf_PrintReconSchedule(raidPtr->reconControl[row]->reconMap, &(raidPtr->reconControl[row]->starttime)); - } - } - reconDesc->state=5; - - case 5: - rf_stop_cpu_monitor(); - - /* Success: mark the dead disk as reconstructed. We quiesce the array here to assure no - * nasty interactions with pending user accesses when we free up the psstatus structure - * as part of FreeReconControl() - */ - - - - reconDesc->state=6; - - retcode = rf_SuspendNewRequestsAndWait(raidPtr); - rf_StopUserStats(raidPtr); - rf_PrintUserStats(raidPtr); /* print out the stats on user accs accumulated during recon */ - -#ifdef SIMULATE - if (retcode) return(0); -#endif /* SIMULATE */ - - /* fall through to state 6 */ - case 6: - - - - RF_LOCK_MUTEX(raidPtr->mutex); - raidPtr->numFailures--; - ds = (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE); - raidPtr->Disks[row][col].status = (ds) ? rf_ds_dist_spared : rf_ds_spared; - raidPtr->status[row] = (ds) ? rf_rs_reconfigured : rf_rs_optimal; - RF_UNLOCK_MUTEX(raidPtr->mutex); - RF_GETTIME(etime); - RF_TIMEVAL_DIFF(&(raidPtr->reconControl[row]->starttime), &etime, &elpsd); - - /* XXX -- why is state 7 different from state 6 if there is no return() here? -- XXX - * Note that I set elpsd above & use it below, so if you put a return - * here you'll have to fix this. (also, FreeReconControl is called below) - */ - - case 7: - - rf_ResumeNewRequests(raidPtr); - -#if RF_DEMO > 0 - if (rf_demoMode) { - rf_finish_recon_demo(&elpsd); - } - else { -#endif /* RF_DEMO > 0 */ - printf("Reconstruction of disk at row %d col %d completed and spare disk reassigned\n", row, col); - xor_s = raidPtr->accumXorTimeUs/1000000; - xor_resid_us = raidPtr->accumXorTimeUs%1000000; - printf("Recon time was %d.%06d seconds, accumulated XOR time was %ld us (%ld.%06ld)\n", - (int)elpsd.tv_sec,(int)elpsd.tv_usec,raidPtr->accumXorTimeUs,xor_s,xor_resid_us); - printf(" (start time %d sec %d usec, end time %d sec %d usec)\n", - (int)raidPtr->reconControl[row]->starttime.tv_sec, - (int)raidPtr->reconControl[row]->starttime.tv_usec, - (int)etime.tv_sec, (int)etime.tv_usec); - rf_print_cpu_util("reconstruction"); + RF_Raid_t *raidPtr = reconDesc->raidPtr; + RF_RowCol_t row = reconDesc->row; + RF_RowCol_t col = reconDesc->col; + RF_RowCol_t srow = reconDesc->srow; + RF_RowCol_t scol = reconDesc->scol; + RF_ReconMap_t *mapPtr; + + RF_ReconEvent_t *event; + struct timeval etime, elpsd; + unsigned long xor_s, xor_resid_us; + int retcode, i, ds; + + switch (reconDesc->state) { + + + case 0: + + raidPtr->accumXorTimeUs = 0; + + /* create one trace record per physical disk */ + RF_Malloc(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t), (RF_AccTraceEntry_t *)); + + /* quiesce the array prior to starting recon. this is needed + * to assure no nasty interactions with pending user writes. 
+ * We need to do this before we change the disk or row status. */ + reconDesc->state = 1; + + Dprintf("RECON: begin request suspend\n"); + retcode = rf_SuspendNewRequestsAndWait(raidPtr); + Dprintf("RECON: end request suspend\n"); + rf_StartUserStats(raidPtr); /* zero out the stats kept on + * user accs */ + + /* fall through to state 1 */ + + case 1: + + RF_LOCK_MUTEX(raidPtr->mutex); + + /* create the reconstruction control pointer and install it in + * the right slot */ + raidPtr->reconControl[row] = rf_MakeReconControl(reconDesc, row, col, srow, scol); + mapPtr = raidPtr->reconControl[row]->reconMap; + raidPtr->status[row] = rf_rs_reconstructing; + raidPtr->Disks[row][col].status = rf_ds_reconstructing; + raidPtr->Disks[row][col].spareRow = srow; + raidPtr->Disks[row][col].spareCol = scol; + + RF_UNLOCK_MUTEX(raidPtr->mutex); + + RF_GETTIME(raidPtr->reconControl[row]->starttime); + + /* now start up the actual reconstruction: issue a read for + * each surviving disk */ + rf_start_cpu_monitor(); + reconDesc->numDisksDone = 0; + for (i = 0; i < raidPtr->numCol; i++) { + if (i != col) { + /* find and issue the next I/O on the + * indicated disk */ + if (IssueNextReadRequest(raidPtr, row, i)) { + Dprintf2("RECON: done issuing for r%d c%d\n", row, i); + reconDesc->numDisksDone++; + } + } + } + + case 2: + Dprintf("RECON: resume requests\n"); + rf_ResumeNewRequests(raidPtr); + + + reconDesc->state = 3; + + case 3: + + /* process reconstruction events until all disks report that + * they've completed all work */ + mapPtr = raidPtr->reconControl[row]->reconMap; + + + + while (reconDesc->numDisksDone < raidPtr->numCol - 1) { + + event = rf_GetNextReconEvent(reconDesc, row, (void (*) (void *)) rf_ContinueReconstructFailedDisk, reconDesc); + RF_ASSERT(event); + + if (ProcessReconEvent(raidPtr, row, event)) + reconDesc->numDisksDone++; + raidPtr->reconControl[row]->percentComplete = 100 - (rf_UnitsLeftToReconstruct(mapPtr) * 100 / mapPtr->totalRUs); + if (rf_prReconSched) { + rf_PrintReconSchedule(raidPtr->reconControl[row]->reconMap, &(raidPtr->reconControl[row]->starttime)); + } + } + + + + reconDesc->state = 4; + + + case 4: + mapPtr = raidPtr->reconControl[row]->reconMap; + if (rf_reconDebug) { + printf("RECON: all reads completed\n"); + } + /* at this point all the reads have completed. We now wait + * for any pending writes to complete, and then we're done */ + + while (rf_UnitsLeftToReconstruct(raidPtr->reconControl[row]->reconMap) > 0) { + + event = rf_GetNextReconEvent(reconDesc, row, (void (*) (void *)) rf_ContinueReconstructFailedDisk, reconDesc); + RF_ASSERT(event); + + (void) ProcessReconEvent(raidPtr, row, event); /* ignore return code */ + raidPtr->reconControl[row]->percentComplete = 100 - (rf_UnitsLeftToReconstruct(mapPtr) * 100 / mapPtr->totalRUs); + if (rf_prReconSched) { + rf_PrintReconSchedule(raidPtr->reconControl[row]->reconMap, &(raidPtr->reconControl[row]->starttime)); + } + } + reconDesc->state = 5; + + case 5: + rf_stop_cpu_monitor(); + + /* Success: mark the dead disk as reconstructed. 
We quiesce + * the array here to assure no nasty interactions with pending + * user accesses when we free up the psstatus structure as + * part of FreeReconControl() */ + + + + reconDesc->state = 6; + + retcode = rf_SuspendNewRequestsAndWait(raidPtr); + rf_StopUserStats(raidPtr); + rf_PrintUserStats(raidPtr); /* print out the stats on user + * accs accumulated during + * recon */ + + /* fall through to state 6 */ + case 6: + + + + RF_LOCK_MUTEX(raidPtr->mutex); + raidPtr->numFailures--; + ds = (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE); + raidPtr->Disks[row][col].status = (ds) ? rf_ds_dist_spared : rf_ds_spared; + raidPtr->status[row] = (ds) ? rf_rs_reconfigured : rf_rs_optimal; + RF_UNLOCK_MUTEX(raidPtr->mutex); + RF_GETTIME(etime); + RF_TIMEVAL_DIFF(&(raidPtr->reconControl[row]->starttime), &etime, &elpsd); + + /* XXX -- why is state 7 different from state 6 if there is no + * return() here? -- XXX Note that I set elpsd above & use it + * below, so if you put a return here you'll have to fix this. + * (also, FreeReconControl is called below) */ + + case 7: + + rf_ResumeNewRequests(raidPtr); + + printf("Reconstruction of disk at row %d col %d completed and spare disk reassigned\n", row, col); + xor_s = raidPtr->accumXorTimeUs / 1000000; + xor_resid_us = raidPtr->accumXorTimeUs % 1000000; + printf("Recon time was %d.%06d seconds, accumulated XOR time was %ld us (%ld.%06ld)\n", + (int) elpsd.tv_sec, (int) elpsd.tv_usec, raidPtr->accumXorTimeUs, xor_s, xor_resid_us); + printf(" (start time %d sec %d usec, end time %d sec %d usec)\n", + (int) raidPtr->reconControl[row]->starttime.tv_sec, + (int) raidPtr->reconControl[row]->starttime.tv_usec, + (int) etime.tv_sec, (int) etime.tv_usec); + rf_print_cpu_util("reconstruction"); #if RF_RECON_STATS > 0 - printf("Total head-sep stall count was %d\n", - (int)reconDesc->hsStallCount); -#endif /* RF_RECON_STATS > 0 */ -#if RF_DEMO > 0 - } -#endif /* RF_DEMO > 0 */ - rf_FreeReconControl(raidPtr, row); - RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t)); - FreeReconDesc(reconDesc); - - } - - SignalReconDone(raidPtr); - return (0); -} + printf("Total head-sep stall count was %d\n", + (int) reconDesc->hsStallCount); +#endif /* RF_RECON_STATS > 0 */ + rf_FreeReconControl(raidPtr, row); + RF_Free(raidPtr->recon_tracerecs, raidPtr->numCol * sizeof(RF_AccTraceEntry_t)); + FreeReconDesc(reconDesc); + } + + SignalReconDone(raidPtr); + return (0); +} /***************************************************************************************** * do the right thing upon each reconstruction event. 
* returns nonzero if and only if there is nothing left unread on the indicated disk ****************************************************************************************/ -static int ProcessReconEvent(raidPtr, frow, event) - RF_Raid_t *raidPtr; - RF_RowCol_t frow; - RF_ReconEvent_t *event; +static int +ProcessReconEvent(raidPtr, frow, event) + RF_Raid_t *raidPtr; + RF_RowCol_t frow; + RF_ReconEvent_t *event; { - int retcode = 0, submitblocked; - RF_ReconBuffer_t *rbuf; - RF_SectorCount_t sectorsPerRU; - - Dprintf1("RECON: ProcessReconEvent type %d\n", event->type); - switch(event->type) { - - /* a read I/O has completed */ - case RF_REVENT_READDONE: - rbuf = raidPtr->reconControl[frow]->perDiskInfo[event->col].rbuf; - Dprintf3("RECON: READDONE EVENT: row %d col %d psid %ld\n", - frow, event->col, rbuf->parityStripeID); - Dprintf7("RECON: done read psid %ld buf %lx %02x %02x %02x %02x %02x\n", - rbuf->parityStripeID, rbuf->buffer, rbuf->buffer[0]&0xff, rbuf->buffer[1]&0xff, - rbuf->buffer[2]&0xff, rbuf->buffer[3]&0xff, rbuf->buffer[4]&0xff); - rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg); - submitblocked = rf_SubmitReconBuffer(rbuf, 0, 0); - Dprintf1("RECON: submitblocked=%d\n", submitblocked); - if (!submitblocked) retcode = IssueNextReadRequest(raidPtr, frow, event->col); - break; - - /* a write I/O has completed */ - case RF_REVENT_WRITEDONE: - if (rf_floatingRbufDebug) { - rf_CheckFloatingRbufCount(raidPtr, 1); - } - sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU; - rbuf = (RF_ReconBuffer_t *) event->arg; - rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg); - Dprintf3("RECON: WRITEDONE EVENT: psid %d ru %d (%d %% complete)\n", - rbuf->parityStripeID, rbuf->which_ru, raidPtr->reconControl[frow]->percentComplete); - rf_ReconMapUpdate(raidPtr, raidPtr->reconControl[frow]->reconMap, - rbuf->failedDiskSectorOffset, rbuf->failedDiskSectorOffset + sectorsPerRU -1); - rf_RemoveFromActiveReconTable(raidPtr, frow, rbuf->parityStripeID, rbuf->which_ru); - - if (rbuf->type == RF_RBUF_TYPE_FLOATING) { - RF_LOCK_MUTEX(raidPtr->reconControl[frow]->rb_mutex); - raidPtr->numFullReconBuffers--; - rf_ReleaseFloatingReconBuffer(raidPtr, frow, rbuf); - RF_UNLOCK_MUTEX(raidPtr->reconControl[frow]->rb_mutex); - } else if (rbuf->type == RF_RBUF_TYPE_FORCED) rf_FreeReconBuffer(rbuf); - else RF_ASSERT(0); - break; - - case RF_REVENT_BUFCLEAR: /* A buffer-stall condition has been cleared */ - Dprintf2("RECON: BUFCLEAR EVENT: row %d col %d\n",frow, event->col); - submitblocked = rf_SubmitReconBuffer(raidPtr->reconControl[frow]->perDiskInfo[event->col].rbuf, 0, (int) (long)event->arg); - RF_ASSERT(!submitblocked); /* we wouldn't have gotten the BUFCLEAR event if we couldn't submit */ - retcode = IssueNextReadRequest(raidPtr, frow, event->col); - break; - - case RF_REVENT_BLOCKCLEAR: /* A user-write reconstruction blockage has been cleared */ - DDprintf2("RECON: BLOCKCLEAR EVENT: row %d col %d\n",frow, event->col); - retcode = TryToRead(raidPtr, frow, event->col); - break; - - case RF_REVENT_HEADSEPCLEAR: /* A max-head-separation reconstruction blockage has been cleared */ - Dprintf2("RECON: HEADSEPCLEAR EVENT: row %d col %d\n",frow, event->col); - retcode = TryToRead(raidPtr, frow, event->col); - break; - - /* a buffer has become ready to write */ - case RF_REVENT_BUFREADY: - Dprintf2("RECON: BUFREADY EVENT: row %d col %d\n",frow, event->col); - retcode = IssueNextWriteRequest(raidPtr, frow); - if (rf_floatingRbufDebug) { - rf_CheckFloatingRbufCount(raidPtr, 1); - } 
- break; - - /* we need to skip the current RU entirely because it got recon'd while we were waiting for something else to happen */ - case RF_REVENT_SKIP: - DDprintf2("RECON: SKIP EVENT: row %d col %d\n",frow, event->col); - retcode = IssueNextReadRequest(raidPtr, frow, event->col); - break; - - /* a forced-reconstruction read access has completed. Just submit the buffer */ - case RF_REVENT_FORCEDREADDONE: - rbuf = (RF_ReconBuffer_t *) event->arg; - rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg); - DDprintf2("RECON: FORCEDREADDONE EVENT: row %d col %d\n",frow, event->col); - submitblocked = rf_SubmitReconBuffer(rbuf, 1, 0); - RF_ASSERT(!submitblocked); - break; - - default: - RF_PANIC(); - } - rf_FreeReconEventDesc(event); - return(retcode); + int retcode = 0, submitblocked; + RF_ReconBuffer_t *rbuf; + RF_SectorCount_t sectorsPerRU; + + Dprintf1("RECON: ProcessReconEvent type %d\n", event->type); + switch (event->type) { + + /* a read I/O has completed */ + case RF_REVENT_READDONE: + rbuf = raidPtr->reconControl[frow]->perDiskInfo[event->col].rbuf; + Dprintf3("RECON: READDONE EVENT: row %d col %d psid %ld\n", + frow, event->col, rbuf->parityStripeID); + Dprintf7("RECON: done read psid %ld buf %lx %02x %02x %02x %02x %02x\n", + rbuf->parityStripeID, rbuf->buffer, rbuf->buffer[0] & 0xff, rbuf->buffer[1] & 0xff, + rbuf->buffer[2] & 0xff, rbuf->buffer[3] & 0xff, rbuf->buffer[4] & 0xff); + rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg); + submitblocked = rf_SubmitReconBuffer(rbuf, 0, 0); + Dprintf1("RECON: submitblocked=%d\n", submitblocked); + if (!submitblocked) + retcode = IssueNextReadRequest(raidPtr, frow, event->col); + break; + + /* a write I/O has completed */ + case RF_REVENT_WRITEDONE: + if (rf_floatingRbufDebug) { + rf_CheckFloatingRbufCount(raidPtr, 1); + } + sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU; + rbuf = (RF_ReconBuffer_t *) event->arg; + rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg); + Dprintf3("RECON: WRITEDONE EVENT: psid %d ru %d (%d %% complete)\n", + rbuf->parityStripeID, rbuf->which_ru, raidPtr->reconControl[frow]->percentComplete); + rf_ReconMapUpdate(raidPtr, raidPtr->reconControl[frow]->reconMap, + rbuf->failedDiskSectorOffset, rbuf->failedDiskSectorOffset + sectorsPerRU - 1); + rf_RemoveFromActiveReconTable(raidPtr, frow, rbuf->parityStripeID, rbuf->which_ru); + + if (rbuf->type == RF_RBUF_TYPE_FLOATING) { + RF_LOCK_MUTEX(raidPtr->reconControl[frow]->rb_mutex); + raidPtr->numFullReconBuffers--; + rf_ReleaseFloatingReconBuffer(raidPtr, frow, rbuf); + RF_UNLOCK_MUTEX(raidPtr->reconControl[frow]->rb_mutex); + } else + if (rbuf->type == RF_RBUF_TYPE_FORCED) + rf_FreeReconBuffer(rbuf); + else + RF_ASSERT(0); + break; + + case RF_REVENT_BUFCLEAR: /* A buffer-stall condition has been + * cleared */ + Dprintf2("RECON: BUFCLEAR EVENT: row %d col %d\n", frow, event->col); + submitblocked = rf_SubmitReconBuffer(raidPtr->reconControl[frow]->perDiskInfo[event->col].rbuf, 0, (int) (long) event->arg); + RF_ASSERT(!submitblocked); /* we wouldn't have gotten the + * BUFCLEAR event if we + * couldn't submit */ + retcode = IssueNextReadRequest(raidPtr, frow, event->col); + break; + + case RF_REVENT_BLOCKCLEAR: /* A user-write reconstruction + * blockage has been cleared */ + DDprintf2("RECON: BLOCKCLEAR EVENT: row %d col %d\n", frow, event->col); + retcode = TryToRead(raidPtr, frow, event->col); + break; + + case RF_REVENT_HEADSEPCLEAR: /* A max-head-separation + * reconstruction blockage has been + * cleared */ + 
Dprintf2("RECON: HEADSEPCLEAR EVENT: row %d col %d\n", frow, event->col); + retcode = TryToRead(raidPtr, frow, event->col); + break; + + /* a buffer has become ready to write */ + case RF_REVENT_BUFREADY: + Dprintf2("RECON: BUFREADY EVENT: row %d col %d\n", frow, event->col); + retcode = IssueNextWriteRequest(raidPtr, frow); + if (rf_floatingRbufDebug) { + rf_CheckFloatingRbufCount(raidPtr, 1); + } + break; + + /* we need to skip the current RU entirely because it got + * recon'd while we were waiting for something else to happen */ + case RF_REVENT_SKIP: + DDprintf2("RECON: SKIP EVENT: row %d col %d\n", frow, event->col); + retcode = IssueNextReadRequest(raidPtr, frow, event->col); + break; + + /* a forced-reconstruction read access has completed. Just + * submit the buffer */ + case RF_REVENT_FORCEDREADDONE: + rbuf = (RF_ReconBuffer_t *) event->arg; + rf_FreeDiskQueueData((RF_DiskQueueData_t *) rbuf->arg); + DDprintf2("RECON: FORCEDREADDONE EVENT: row %d col %d\n", frow, event->col); + submitblocked = rf_SubmitReconBuffer(rbuf, 1, 0); + RF_ASSERT(!submitblocked); + break; + + default: + RF_PANIC(); + } + rf_FreeReconEventDesc(event); + return (retcode); } /***************************************************************************************** @@ -934,145 +673,155 @@ static int ProcessReconEvent(raidPtr, frow, event) * * ctrl->{ru_count, curPSID, diskOffset} and rbuf->failedDiskSectorOffset are * maintained to point the the unit we're currently accessing. Note that this deviates - * from the standard C idiom of having counters point to the next thing to be + * from the standard C idiom of having counters point to the next thing to be * accessed. This allows us to easily retry when we're blocked by head separation * or reconstruction-blockage events. * * returns nonzero if and only if there is nothing left unread on the indicated disk ****************************************************************************************/ -static int IssueNextReadRequest(raidPtr, row, col) - RF_Raid_t *raidPtr; - RF_RowCol_t row; - RF_RowCol_t col; +static int +IssueNextReadRequest(raidPtr, row, col) + RF_Raid_t *raidPtr; + RF_RowCol_t row; + RF_RowCol_t col; { - RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl[row]->perDiskInfo[col]; - RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; - RF_ReconBuffer_t *rbuf = ctrl->rbuf; - RF_ReconUnitCount_t RUsPerPU = layoutPtr->SUsPerPU / layoutPtr->SUsPerRU; - RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU; - int do_new_check = 0, retcode = 0, status; - - /* if we are currently the slowest disk, mark that we have to do a new check */ - if (ctrl->headSepCounter <= raidPtr->reconControl[row]->minHeadSepCounter) do_new_check = 1; - - while (1) { - - ctrl->ru_count++; - if (ctrl->ru_count < RUsPerPU) { - ctrl->diskOffset += sectorsPerRU; - rbuf->failedDiskSectorOffset += sectorsPerRU; - } else { - ctrl->curPSID++; - ctrl->ru_count = 0; - /* code left over from when head-sep was based on parity stripe id */ - if (ctrl->curPSID >= raidPtr->reconControl[row]->lastPSID) { - CheckForNewMinHeadSep(raidPtr, row, ++(ctrl->headSepCounter)); - return(1); /* finito! */ - } - - /* find the disk offsets of the start of the parity stripe on both the current disk and the failed disk. 
- * skip this entire parity stripe if either disk does not appear in the indicated PS - */ - status = ComputePSDiskOffsets(raidPtr, ctrl->curPSID, row, col, &ctrl->diskOffset, &rbuf->failedDiskSectorOffset, - &rbuf->spRow, &rbuf->spCol, &rbuf->spOffset); - if (status) { - ctrl->ru_count = RUsPerPU-1; continue; - } - } - rbuf->which_ru = ctrl->ru_count; - - /* skip this RU if it's already been reconstructed */ - if (rf_CheckRUReconstructed(raidPtr->reconControl[row]->reconMap, rbuf->failedDiskSectorOffset)) { - Dprintf2("Skipping psid %ld ru %d: already reconstructed\n",ctrl->curPSID,ctrl->ru_count); - continue; - } - break; - } - ctrl->headSepCounter++; - if (do_new_check) CheckForNewMinHeadSep(raidPtr, row, ctrl->headSepCounter); /* update min if needed */ - - - /* at this point, we have definitely decided what to do, and we have only to see if we can actually do it now */ - rbuf->parityStripeID = ctrl->curPSID; - rbuf->which_ru = ctrl->ru_count; - bzero((char *)&raidPtr->recon_tracerecs[col], sizeof(raidPtr->recon_tracerecs[col])); - raidPtr->recon_tracerecs[col].reconacc = 1; - RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer); - retcode = TryToRead(raidPtr, row, col); - return(retcode); + RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl[row]->perDiskInfo[col]; + RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; + RF_ReconBuffer_t *rbuf = ctrl->rbuf; + RF_ReconUnitCount_t RUsPerPU = layoutPtr->SUsPerPU / layoutPtr->SUsPerRU; + RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU; + int do_new_check = 0, retcode = 0, status; + + /* if we are currently the slowest disk, mark that we have to do a new + * check */ + if (ctrl->headSepCounter <= raidPtr->reconControl[row]->minHeadSepCounter) + do_new_check = 1; + + while (1) { + + ctrl->ru_count++; + if (ctrl->ru_count < RUsPerPU) { + ctrl->diskOffset += sectorsPerRU; + rbuf->failedDiskSectorOffset += sectorsPerRU; + } else { + ctrl->curPSID++; + ctrl->ru_count = 0; + /* code left over from when head-sep was based on + * parity stripe id */ + if (ctrl->curPSID >= raidPtr->reconControl[row]->lastPSID) { + CheckForNewMinHeadSep(raidPtr, row, ++(ctrl->headSepCounter)); + return (1); /* finito! */ + } + /* find the disk offsets of the start of the parity + * stripe on both the current disk and the failed + * disk. 
skip this entire parity stripe if either disk + * does not appear in the indicated PS */ + status = ComputePSDiskOffsets(raidPtr, ctrl->curPSID, row, col, &ctrl->diskOffset, &rbuf->failedDiskSectorOffset, + &rbuf->spRow, &rbuf->spCol, &rbuf->spOffset); + if (status) { + ctrl->ru_count = RUsPerPU - 1; + continue; + } + } + rbuf->which_ru = ctrl->ru_count; + + /* skip this RU if it's already been reconstructed */ + if (rf_CheckRUReconstructed(raidPtr->reconControl[row]->reconMap, rbuf->failedDiskSectorOffset)) { + Dprintf2("Skipping psid %ld ru %d: already reconstructed\n", ctrl->curPSID, ctrl->ru_count); + continue; + } + break; + } + ctrl->headSepCounter++; + if (do_new_check) + CheckForNewMinHeadSep(raidPtr, row, ctrl->headSepCounter); /* update min if needed */ + + + /* at this point, we have definitely decided what to do, and we have + * only to see if we can actually do it now */ + rbuf->parityStripeID = ctrl->curPSID; + rbuf->which_ru = ctrl->ru_count; + bzero((char *) &raidPtr->recon_tracerecs[col], sizeof(raidPtr->recon_tracerecs[col])); + raidPtr->recon_tracerecs[col].reconacc = 1; + RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer); + retcode = TryToRead(raidPtr, row, col); + return (retcode); } - /* tries to issue the next read on the indicated disk. We may be blocked by (a) the heads being too * far apart, or (b) recon on the indicated RU being blocked due to a write by a user thread. * In this case, we issue a head-sep or blockage wait request, which will cause this same routine * to be invoked again later when the blockage has cleared. */ -static int TryToRead(raidPtr, row, col) - RF_Raid_t *raidPtr; - RF_RowCol_t row; - RF_RowCol_t col; +static int +TryToRead(raidPtr, row, col) + RF_Raid_t *raidPtr; + RF_RowCol_t row; + RF_RowCol_t col; { - RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl[row]->perDiskInfo[col]; - RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU; - RF_StripeNum_t psid = ctrl->curPSID; - RF_ReconUnitNum_t which_ru = ctrl->ru_count; - RF_DiskQueueData_t *req; - int status, created = 0; - RF_ReconParityStripeStatus_t *pssPtr; - - /* if the current disk is too far ahead of the others, issue a head-separation wait and return */ - if (CheckHeadSeparation(raidPtr, ctrl, row, col, ctrl->headSepCounter, which_ru)) return(0); - RF_LOCK_PSS_MUTEX(raidPtr, row, psid); - pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl[row]->pssTable, psid, which_ru, RF_PSS_CREATE, &created); - - /* if recon is blocked on the indicated parity stripe, issue a block-wait request and return. - * this also must mark the indicated RU in the stripe as under reconstruction if not blocked. - */ - status = CheckForcedOrBlockedReconstruction(raidPtr, pssPtr, ctrl, row, col, psid, which_ru); - if (status == RF_PSS_RECON_BLOCKED) { - Dprintf2("RECON: Stalling psid %ld ru %d: recon blocked\n",psid,which_ru); - goto out; - } else if (status == RF_PSS_FORCED_ON_WRITE) { - rf_CauseReconEvent(raidPtr, row, col, NULL, RF_REVENT_SKIP); - goto out; - } - - /* make one last check to be sure that the indicated RU didn't get reconstructed while - * we were waiting for something else to happen. This is unfortunate in that it causes - * us to make this check twice in the normal case. Might want to make some attempt to - * re-work this so that we only do this check if we've definitely blocked on one of the - * above checks. When this condition is detected, we may have just created a bogus - * status entry, which we need to delete. 
- */ - if (rf_CheckRUReconstructed(raidPtr->reconControl[row]->reconMap, ctrl->rbuf->failedDiskSectorOffset)) { - Dprintf2("RECON: Skipping psid %ld ru %d: prior recon after stall\n",psid,which_ru); - if (created) rf_PSStatusDelete(raidPtr, raidPtr->reconControl[row]->pssTable, pssPtr); - rf_CauseReconEvent(raidPtr, row, col, NULL, RF_REVENT_SKIP); - goto out; - } - - /* found something to read. issue the I/O */ - Dprintf5("RECON: Read for psid %ld on row %d col %d offset %ld buf %lx\n", - psid, row, col, ctrl->diskOffset, ctrl->rbuf->buffer); - RF_ETIMER_STOP(raidPtr->recon_tracerecs[col].recon_timer); - RF_ETIMER_EVAL(raidPtr->recon_tracerecs[col].recon_timer); - raidPtr->recon_tracerecs[col].specific.recon.recon_start_to_fetch_us = - RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[col].recon_timer); - RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer); - - /* should be ok to use a NULL proc pointer here, all the bufs we use should be in kernel space */ - req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, ctrl->diskOffset, sectorsPerRU, ctrl->rbuf->buffer, psid, which_ru, - ReconReadDoneProc, (void *) ctrl, NULL, &raidPtr->recon_tracerecs[col], (void *)raidPtr, 0, NULL); - - RF_ASSERT(req); /* XXX -- fix this -- XXX */ - - ctrl->rbuf->arg = (void *) req; - rf_DiskIOEnqueue(&raidPtr->Queues[row][col], req, RF_IO_RECON_PRIORITY); - pssPtr->issued[col] = 1; + RF_PerDiskReconCtrl_t *ctrl = &raidPtr->reconControl[row]->perDiskInfo[col]; + RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU; + RF_StripeNum_t psid = ctrl->curPSID; + RF_ReconUnitNum_t which_ru = ctrl->ru_count; + RF_DiskQueueData_t *req; + int status, created = 0; + RF_ReconParityStripeStatus_t *pssPtr; + + /* if the current disk is too far ahead of the others, issue a + * head-separation wait and return */ + if (CheckHeadSeparation(raidPtr, ctrl, row, col, ctrl->headSepCounter, which_ru)) + return (0); + RF_LOCK_PSS_MUTEX(raidPtr, row, psid); + pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl[row]->pssTable, psid, which_ru, RF_PSS_CREATE, &created); + + /* if recon is blocked on the indicated parity stripe, issue a + * block-wait request and return. this also must mark the indicated RU + * in the stripe as under reconstruction if not blocked. */ + status = CheckForcedOrBlockedReconstruction(raidPtr, pssPtr, ctrl, row, col, psid, which_ru); + if (status == RF_PSS_RECON_BLOCKED) { + Dprintf2("RECON: Stalling psid %ld ru %d: recon blocked\n", psid, which_ru); + goto out; + } else + if (status == RF_PSS_FORCED_ON_WRITE) { + rf_CauseReconEvent(raidPtr, row, col, NULL, RF_REVENT_SKIP); + goto out; + } + /* make one last check to be sure that the indicated RU didn't get + * reconstructed while we were waiting for something else to happen. + * This is unfortunate in that it causes us to make this check twice + * in the normal case. Might want to make some attempt to re-work + * this so that we only do this check if we've definitely blocked on + * one of the above checks. When this condition is detected, we may + * have just created a bogus status entry, which we need to delete. */ + if (rf_CheckRUReconstructed(raidPtr->reconControl[row]->reconMap, ctrl->rbuf->failedDiskSectorOffset)) { + Dprintf2("RECON: Skipping psid %ld ru %d: prior recon after stall\n", psid, which_ru); + if (created) + rf_PSStatusDelete(raidPtr, raidPtr->reconControl[row]->pssTable, pssPtr); + rf_CauseReconEvent(raidPtr, row, col, NULL, RF_REVENT_SKIP); + goto out; + } + /* found something to read. 
issue the I/O */ + Dprintf5("RECON: Read for psid %ld on row %d col %d offset %ld buf %lx\n", + psid, row, col, ctrl->diskOffset, ctrl->rbuf->buffer); + RF_ETIMER_STOP(raidPtr->recon_tracerecs[col].recon_timer); + RF_ETIMER_EVAL(raidPtr->recon_tracerecs[col].recon_timer); + raidPtr->recon_tracerecs[col].specific.recon.recon_start_to_fetch_us = + RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[col].recon_timer); + RF_ETIMER_START(raidPtr->recon_tracerecs[col].recon_timer); + + /* should be ok to use a NULL proc pointer here, all the bufs we use + * should be in kernel space */ + req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, ctrl->diskOffset, sectorsPerRU, ctrl->rbuf->buffer, psid, which_ru, + ReconReadDoneProc, (void *) ctrl, NULL, &raidPtr->recon_tracerecs[col], (void *) raidPtr, 0, NULL); + + RF_ASSERT(req); /* XXX -- fix this -- XXX */ + + ctrl->rbuf->arg = (void *) req; + rf_DiskIOEnqueue(&raidPtr->Queues[row][col], req, RF_IO_RECON_PRIORITY); + pssPtr->issued[col] = 1; out: - RF_UNLOCK_PSS_MUTEX(raidPtr, row, psid); - return(0); + RF_UNLOCK_PSS_MUTEX(raidPtr, row, psid); + return (0); } @@ -1096,231 +845,254 @@ out: * ASSUMES THAT THE STRIPE IDENTIFIER IDENTIFIES THE DISKS COMPRISING THE STRIPE * IN THE CORRECT ORDER */ -static int ComputePSDiskOffsets( - RF_Raid_t *raidPtr, /* raid descriptor */ - RF_StripeNum_t psid, /* parity stripe identifier */ - RF_RowCol_t row, /* row and column of disk to find the offsets for */ - RF_RowCol_t col, - RF_SectorNum_t *outDiskOffset, - RF_SectorNum_t *outFailedDiskSectorOffset, - RF_RowCol_t *spRow, /* OUT: row,col of spare unit for failed unit */ - RF_RowCol_t *spCol, - RF_SectorNum_t *spOffset) /* OUT: offset into disk containing spare unit */ -{ - RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; - RF_RowCol_t fcol = raidPtr->reconControl[row]->fcol; - RF_RaidAddr_t sosRaidAddress; /* start-of-stripe */ - RF_RowCol_t *diskids; - u_int i, j, k, i_offset, j_offset; - RF_RowCol_t prow, pcol; - int testcol, testrow; - RF_RowCol_t stripe; - RF_SectorNum_t poffset; - char i_is_parity=0, j_is_parity=0; - RF_RowCol_t stripeWidth = layoutPtr->numDataCol + layoutPtr->numParityCol; - - /* get a listing of the disks comprising that stripe */ - sosRaidAddress = rf_ParityStripeIDToRaidAddress(layoutPtr, psid); - (layoutPtr->map->IdentifyStripe)(raidPtr, sosRaidAddress, &diskids, &stripe); - RF_ASSERT(diskids); - - /* reject this entire parity stripe if it does not contain the indicated disk or it does not contain the failed disk */ - if (row != stripe) - goto skipit; - for (i=0; i<stripeWidth; i++) { - if (col == diskids[i]) - break; - } - if (i == stripeWidth) - goto skipit; - for (j=0; j<stripeWidth; j++) { - if (fcol == diskids[j]) - break; - } - if (j == stripeWidth) { - goto skipit; - } - - /* find out which disk the parity is on */ - (layoutPtr->map->MapParity)(raidPtr, sosRaidAddress, &prow, &pcol, &poffset, RF_DONT_REMAP); - - /* find out if either the current RU or the failed RU is parity */ - /* also, if the parity occurs in this stripe prior to the data and/or failed col, we need to decrement i and/or j */ - for (k=0; k<stripeWidth; k++) - if (diskids[k] == pcol) - break; - RF_ASSERT(k < stripeWidth); - i_offset = i; j_offset=j; - if (k < i) i_offset--; else if (k==i) {i_is_parity = 1; i_offset = 0;} /* set offsets to zero to disable multiply below */ - if (k < j) j_offset--; else if (k==j) {j_is_parity = 1; j_offset = 0;} - - /* at this point, [ij]_is_parity tells us whether the [current,failed] disk is parity at - * the start of this RU, and, if data, 
"[ij]_offset" tells us how far into the stripe - * the [current,failed] disk is. - */ - - /* call the mapping routine to get the offset into the current disk, repeat for failed disk. */ - if (i_is_parity) - layoutPtr->map->MapParity(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testrow, &testcol, outDiskOffset, RF_DONT_REMAP); - else - layoutPtr->map->MapSector(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testrow, &testcol, outDiskOffset, RF_DONT_REMAP); - - RF_ASSERT(row == testrow && col == testcol); - - if (j_is_parity) - layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testrow, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP); - else - layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testrow, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP); - RF_ASSERT(row == testrow && fcol == testcol); - - /* now locate the spare unit for the failed unit */ - if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) { - if (j_is_parity) - layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spRow, spCol, spOffset, RF_REMAP); - else - layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spRow, spCol, spOffset, RF_REMAP); - } else { - *spRow = raidPtr->reconControl[row]->spareRow; - *spCol = raidPtr->reconControl[row]->spareCol; - *spOffset = *outFailedDiskSectorOffset; - } - - return(0); +static int +ComputePSDiskOffsets( + RF_Raid_t * raidPtr, /* raid descriptor */ + RF_StripeNum_t psid, /* parity stripe identifier */ + RF_RowCol_t row, /* row and column of disk to find the offsets + * for */ + RF_RowCol_t col, + RF_SectorNum_t * outDiskOffset, + RF_SectorNum_t * outFailedDiskSectorOffset, + RF_RowCol_t * spRow, /* OUT: row,col of spare unit for failed unit */ + RF_RowCol_t * spCol, + RF_SectorNum_t * spOffset) +{ /* OUT: offset into disk containing spare unit */ + RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; + RF_RowCol_t fcol = raidPtr->reconControl[row]->fcol; + RF_RaidAddr_t sosRaidAddress; /* start-of-stripe */ + RF_RowCol_t *diskids; + u_int i, j, k, i_offset, j_offset; + RF_RowCol_t prow, pcol; + int testcol, testrow; + RF_RowCol_t stripe; + RF_SectorNum_t poffset; + char i_is_parity = 0, j_is_parity = 0; + RF_RowCol_t stripeWidth = layoutPtr->numDataCol + layoutPtr->numParityCol; + + /* get a listing of the disks comprising that stripe */ + sosRaidAddress = rf_ParityStripeIDToRaidAddress(layoutPtr, psid); + (layoutPtr->map->IdentifyStripe) (raidPtr, sosRaidAddress, &diskids, &stripe); + RF_ASSERT(diskids); + + /* reject this entire parity stripe if it does not contain the + * indicated disk or it does not contain the failed disk */ + if (row != stripe) + goto skipit; + for (i = 0; i < stripeWidth; i++) { + if (col == diskids[i]) + break; + } + if (i == stripeWidth) + goto skipit; + for (j = 0; j < stripeWidth; j++) { + if (fcol == diskids[j]) + break; + } + if (j == stripeWidth) { + goto skipit; + } + /* find out which disk the parity is on */ + (layoutPtr->map->MapParity) (raidPtr, sosRaidAddress, &prow, &pcol, &poffset, RF_DONT_REMAP); + + /* find out if either the current RU or the failed RU is parity */ + /* also, if the parity occurs in this stripe prior to the data and/or + * failed col, we need to decrement i and/or j */ + for (k = 0; k < stripeWidth; k++) + if (diskids[k] == pcol) + break; + RF_ASSERT(k < stripeWidth); + i_offset = i; + j_offset = j; + if 
(k < i) + i_offset--; + else + if (k == i) { + i_is_parity = 1; + i_offset = 0; + } /* set offsets to zero to disable multiply + * below */ + if (k < j) + j_offset--; + else + if (k == j) { + j_is_parity = 1; + j_offset = 0; + } + /* at this point, [ij]_is_parity tells us whether the [current,failed] + * disk is parity at the start of this RU, and, if data, "[ij]_offset" + * tells us how far into the stripe the [current,failed] disk is. */ + + /* call the mapping routine to get the offset into the current disk, + * repeat for failed disk. */ + if (i_is_parity) + layoutPtr->map->MapParity(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testrow, &testcol, outDiskOffset, RF_DONT_REMAP); + else + layoutPtr->map->MapSector(raidPtr, sosRaidAddress + i_offset * layoutPtr->sectorsPerStripeUnit, &testrow, &testcol, outDiskOffset, RF_DONT_REMAP); + + RF_ASSERT(row == testrow && col == testcol); + + if (j_is_parity) + layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testrow, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP); + else + layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, &testrow, &testcol, outFailedDiskSectorOffset, RF_DONT_REMAP); + RF_ASSERT(row == testrow && fcol == testcol); + + /* now locate the spare unit for the failed unit */ + if (layoutPtr->map->flags & RF_DISTRIBUTE_SPARE) { + if (j_is_parity) + layoutPtr->map->MapParity(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spRow, spCol, spOffset, RF_REMAP); + else + layoutPtr->map->MapSector(raidPtr, sosRaidAddress + j_offset * layoutPtr->sectorsPerStripeUnit, spRow, spCol, spOffset, RF_REMAP); + } else { + *spRow = raidPtr->reconControl[row]->spareRow; + *spCol = raidPtr->reconControl[row]->spareCol; + *spOffset = *outFailedDiskSectorOffset; + } + + return (0); skipit: - Dprintf3("RECON: Skipping psid %ld: nothing needed from r%d c%d\n", - psid, row, col); - return(1); + Dprintf3("RECON: Skipping psid %ld: nothing needed from r%d c%d\n", + psid, row, col); + return (1); } - /* this is called when a buffer has become ready to write to the replacement disk */ -static int IssueNextWriteRequest(raidPtr, row) - RF_Raid_t *raidPtr; - RF_RowCol_t row; +static int +IssueNextWriteRequest(raidPtr, row) + RF_Raid_t *raidPtr; + RF_RowCol_t row; { - RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; - RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU; - RF_RowCol_t fcol = raidPtr->reconControl[row]->fcol; - RF_ReconBuffer_t *rbuf; - RF_DiskQueueData_t *req; - - rbuf = rf_GetFullReconBuffer(raidPtr->reconControl[row]); - RF_ASSERT(rbuf); /* there must be one available, or we wouldn't have gotten the event that sent us here */ - RF_ASSERT(rbuf->pssPtr); - - rbuf->pssPtr->writeRbuf = rbuf; - rbuf->pssPtr = NULL; - - Dprintf7("RECON: New write (r %d c %d offs %d) for psid %ld ru %d (failed disk offset %ld) buf %lx\n", - rbuf->spRow, rbuf->spCol, rbuf->spOffset, rbuf->parityStripeID, - rbuf->which_ru, rbuf->failedDiskSectorOffset, rbuf->buffer); - Dprintf6("RECON: new write psid %ld %02x %02x %02x %02x %02x\n", - rbuf->parityStripeID, rbuf->buffer[0]&0xff, rbuf->buffer[1]&0xff, - rbuf->buffer[2]&0xff, rbuf->buffer[3]&0xff, rbuf->buffer[4]&0xff); - - /* should be ok to use a NULL b_proc here b/c all addrs should be in kernel space */ - req = rf_CreateDiskQueueData(RF_IO_TYPE_WRITE, rbuf->spOffset, - sectorsPerRU, rbuf->buffer, - rbuf->parityStripeID, rbuf->which_ru, - 
ReconWriteDoneProc, (void *) rbuf, NULL, - &raidPtr->recon_tracerecs[fcol], - (void *)raidPtr, 0, NULL); - - RF_ASSERT(req); /* XXX -- fix this -- XXX */ - - rbuf->arg = (void *) req; - rf_DiskIOEnqueue(&raidPtr->Queues[rbuf->spRow][rbuf->spCol], req, RF_IO_RECON_PRIORITY); - - return(0); + RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; + RF_SectorCount_t sectorsPerRU = layoutPtr->sectorsPerStripeUnit * layoutPtr->SUsPerRU; + RF_RowCol_t fcol = raidPtr->reconControl[row]->fcol; + RF_ReconBuffer_t *rbuf; + RF_DiskQueueData_t *req; + + rbuf = rf_GetFullReconBuffer(raidPtr->reconControl[row]); + RF_ASSERT(rbuf); /* there must be one available, or we wouldn't + * have gotten the event that sent us here */ + RF_ASSERT(rbuf->pssPtr); + + rbuf->pssPtr->writeRbuf = rbuf; + rbuf->pssPtr = NULL; + + Dprintf7("RECON: New write (r %d c %d offs %d) for psid %ld ru %d (failed disk offset %ld) buf %lx\n", + rbuf->spRow, rbuf->spCol, rbuf->spOffset, rbuf->parityStripeID, + rbuf->which_ru, rbuf->failedDiskSectorOffset, rbuf->buffer); + Dprintf6("RECON: new write psid %ld %02x %02x %02x %02x %02x\n", + rbuf->parityStripeID, rbuf->buffer[0] & 0xff, rbuf->buffer[1] & 0xff, + rbuf->buffer[2] & 0xff, rbuf->buffer[3] & 0xff, rbuf->buffer[4] & 0xff); + + /* should be ok to use a NULL b_proc here b/c all addrs should be in + * kernel space */ + req = rf_CreateDiskQueueData(RF_IO_TYPE_WRITE, rbuf->spOffset, + sectorsPerRU, rbuf->buffer, + rbuf->parityStripeID, rbuf->which_ru, + ReconWriteDoneProc, (void *) rbuf, NULL, + &raidPtr->recon_tracerecs[fcol], + (void *) raidPtr, 0, NULL); + + RF_ASSERT(req); /* XXX -- fix this -- XXX */ + + rbuf->arg = (void *) req; + rf_DiskIOEnqueue(&raidPtr->Queues[rbuf->spRow][rbuf->spCol], req, RF_IO_RECON_PRIORITY); + + return (0); } - /* this gets called upon the completion of a reconstruction read operation * the arg is a pointer to the per-disk reconstruction control structure * for the process that just finished a read. * * called at interrupt context in the kernel, so don't do anything illegal here. 
*/ -static int ReconReadDoneProc(arg, status) - void *arg; - int status; +static int +ReconReadDoneProc(arg, status) + void *arg; + int status; { - RF_PerDiskReconCtrl_t *ctrl = (RF_PerDiskReconCtrl_t *) arg; - RF_Raid_t *raidPtr = ctrl->reconCtrl->reconDesc->raidPtr; - - if (status) { - /* - * XXX - */ - printf("Recon read failed!\n"); - RF_PANIC(); - } - - RF_ETIMER_STOP(raidPtr->recon_tracerecs[ctrl->col].recon_timer); - RF_ETIMER_EVAL(raidPtr->recon_tracerecs[ctrl->col].recon_timer); - raidPtr->recon_tracerecs[ctrl->col].specific.recon.recon_fetch_to_return_us = - RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[ctrl->col].recon_timer); - RF_ETIMER_START(raidPtr->recon_tracerecs[ctrl->col].recon_timer); - - rf_CauseReconEvent(raidPtr, ctrl->row, ctrl->col, NULL, RF_REVENT_READDONE); - return(0); + RF_PerDiskReconCtrl_t *ctrl = (RF_PerDiskReconCtrl_t *) arg; + RF_Raid_t *raidPtr = ctrl->reconCtrl->reconDesc->raidPtr; + + if (status) { + /* + * XXX + */ + printf("Recon read failed!\n"); + RF_PANIC(); + } + RF_ETIMER_STOP(raidPtr->recon_tracerecs[ctrl->col].recon_timer); + RF_ETIMER_EVAL(raidPtr->recon_tracerecs[ctrl->col].recon_timer); + raidPtr->recon_tracerecs[ctrl->col].specific.recon.recon_fetch_to_return_us = + RF_ETIMER_VAL_US(raidPtr->recon_tracerecs[ctrl->col].recon_timer); + RF_ETIMER_START(raidPtr->recon_tracerecs[ctrl->col].recon_timer); + + rf_CauseReconEvent(raidPtr, ctrl->row, ctrl->col, NULL, RF_REVENT_READDONE); + return (0); } - /* this gets called upon the completion of a reconstruction write operation. * the arg is a pointer to the rbuf that was just written * * called at interrupt context in the kernel, so don't do anything illegal here. */ -static int ReconWriteDoneProc(arg, status) - void *arg; - int status; +static int +ReconWriteDoneProc(arg, status) + void *arg; + int status; { - RF_ReconBuffer_t *rbuf = (RF_ReconBuffer_t *) arg; + RF_ReconBuffer_t *rbuf = (RF_ReconBuffer_t *) arg; - Dprintf2("Reconstruction completed on psid %ld ru %d\n",rbuf->parityStripeID, rbuf->which_ru); - if (status) {printf("Recon write failed!\n"); /*fprintf(stderr,"Recon write failed!\n");*/ RF_PANIC();} - rf_CauseReconEvent((RF_Raid_t *) rbuf->raidPtr, rbuf->row, rbuf->col, arg, RF_REVENT_WRITEDONE); - return(0); + Dprintf2("Reconstruction completed on psid %ld ru %d\n", rbuf->parityStripeID, rbuf->which_ru); + if (status) { + printf("Recon write failed!\n"); /* fprintf(stderr,"Recon + * write failed!\n"); */ + RF_PANIC(); + } + rf_CauseReconEvent((RF_Raid_t *) rbuf->raidPtr, rbuf->row, rbuf->col, arg, RF_REVENT_WRITEDONE); + return (0); } /* computes a new minimum head sep, and wakes up anyone who needs to be woken as a result */ -static void CheckForNewMinHeadSep(raidPtr, row, hsCtr) - RF_Raid_t *raidPtr; - RF_RowCol_t row; - RF_HeadSepLimit_t hsCtr; +static void +CheckForNewMinHeadSep(raidPtr, row, hsCtr) + RF_Raid_t *raidPtr; + RF_RowCol_t row; + RF_HeadSepLimit_t hsCtr; { - RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl[row]; - RF_HeadSepLimit_t new_min; - RF_RowCol_t i; - RF_CallbackDesc_t *p; - RF_ASSERT(hsCtr >= reconCtrlPtr->minHeadSepCounter); /* from the definition of a minimum */ - - - RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex); - - new_min = ~ (1L<< (8*sizeof(long)-1)); /* 0x7FFF....FFF */ - for (i=0; i<raidPtr->numCol; i++) if (i != reconCtrlPtr->fcol) { - if (reconCtrlPtr->perDiskInfo[i].headSepCounter < new_min) new_min = reconCtrlPtr->perDiskInfo[i].headSepCounter; - } - - /* set the new minimum and wake up anyone who can now run again */ - if (new_min != 
reconCtrlPtr->minHeadSepCounter) { - reconCtrlPtr->minHeadSepCounter = new_min; - Dprintf1("RECON: new min head pos counter val is %ld\n",new_min); - while (reconCtrlPtr->headSepCBList) { - if (reconCtrlPtr->headSepCBList->callbackArg.v > new_min) break; - p = reconCtrlPtr->headSepCBList; - reconCtrlPtr->headSepCBList = p->next; - p->next = NULL; - rf_CauseReconEvent(raidPtr, p->row, p->col, NULL, RF_REVENT_HEADSEPCLEAR); - rf_FreeCallbackDesc(p); - } - - } - - RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex); -} + RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl[row]; + RF_HeadSepLimit_t new_min; + RF_RowCol_t i; + RF_CallbackDesc_t *p; + RF_ASSERT(hsCtr >= reconCtrlPtr->minHeadSepCounter); /* from the definition + * of a minimum */ + + + RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex); + + new_min = ~(1L << (8 * sizeof(long) - 1)); /* 0x7FFF....FFF */ + for (i = 0; i < raidPtr->numCol; i++) + if (i != reconCtrlPtr->fcol) { + if (reconCtrlPtr->perDiskInfo[i].headSepCounter < new_min) + new_min = reconCtrlPtr->perDiskInfo[i].headSepCounter; + } + /* set the new minimum and wake up anyone who can now run again */ + if (new_min != reconCtrlPtr->minHeadSepCounter) { + reconCtrlPtr->minHeadSepCounter = new_min; + Dprintf1("RECON: new min head pos counter val is %ld\n", new_min); + while (reconCtrlPtr->headSepCBList) { + if (reconCtrlPtr->headSepCBList->callbackArg.v > new_min) + break; + p = reconCtrlPtr->headSepCBList; + reconCtrlPtr->headSepCBList = p->next; + p->next = NULL; + rf_CauseReconEvent(raidPtr, p->row, p->col, NULL, RF_REVENT_HEADSEPCLEAR); + rf_FreeCallbackDesc(p); + } + } + RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex); +} /* checks to see that the maximum head separation will not be violated * if we initiate a reconstruction I/O on the indicated disk. Limiting the * maximum head separation between two disks eliminates the nasty buffer-stall @@ -1332,60 +1104,64 @@ static void CheckForNewMinHeadSep(raidPtr, row, hsCtr) * returns non-zero if and only if we have to stop working on the indicated disk * due to a head-separation delay. */ -static int CheckHeadSeparation( - RF_Raid_t *raidPtr, - RF_PerDiskReconCtrl_t *ctrl, - RF_RowCol_t row, - RF_RowCol_t col, - RF_HeadSepLimit_t hsCtr, - RF_ReconUnitNum_t which_ru) +static int +CheckHeadSeparation( + RF_Raid_t * raidPtr, + RF_PerDiskReconCtrl_t * ctrl, + RF_RowCol_t row, + RF_RowCol_t col, + RF_HeadSepLimit_t hsCtr, + RF_ReconUnitNum_t which_ru) { - RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl[row]; - RF_CallbackDesc_t *cb, *p, *pt; - int retval = 0, tid; - - /* if we're too far ahead of the slowest disk, stop working on this disk - * until the slower ones catch up. We do this by scheduling a wakeup callback - * for the time when the slowest disk has caught up. We define "caught up" - * with 20% hysteresis, i.e. the head separation must have fallen to at most - * 80% of the max allowable head separation before we'll wake up. - * - */ - rf_get_threadid(tid); - RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex); - if ((raidPtr->headSepLimit >= 0) && - ((ctrl->headSepCounter - reconCtrlPtr->minHeadSepCounter) > raidPtr->headSepLimit)) - { - Dprintf6("[%d] RECON: head sep stall: row %d col %d hsCtr %ld minHSCtr %ld limit %ld\n", - tid,row,col,ctrl->headSepCounter, reconCtrlPtr->minHeadSepCounter, raidPtr->headSepLimit); - cb = rf_AllocCallbackDesc(); - /* the minHeadSepCounter value we have to get to before we'll wake up. build in 20% hysteresis. 
*/ - cb->callbackArg.v = (ctrl->headSepCounter - raidPtr->headSepLimit + raidPtr->headSepLimit/5); - cb->row = row; cb->col = col; - cb->next = NULL; - - /* insert this callback descriptor into the sorted list of pending head-sep callbacks */ - p = reconCtrlPtr->headSepCBList; - if (!p) reconCtrlPtr->headSepCBList = cb; - else if (cb->callbackArg.v < p->callbackArg.v) { - cb->next = reconCtrlPtr->headSepCBList; - reconCtrlPtr->headSepCBList = cb; - } - else { - for (pt=p, p=p->next; p && (p->callbackArg.v < cb->callbackArg.v); pt=p,p=p->next); - cb->next = p; - pt->next = cb; - } - retval = 1; + RF_ReconCtrl_t *reconCtrlPtr = raidPtr->reconControl[row]; + RF_CallbackDesc_t *cb, *p, *pt; + int retval = 0, tid; + + /* if we're too far ahead of the slowest disk, stop working on this + * disk until the slower ones catch up. We do this by scheduling a + * wakeup callback for the time when the slowest disk has caught up. + * We define "caught up" with 20% hysteresis, i.e. the head separation + * must have fallen to at most 80% of the max allowable head + * separation before we'll wake up. + * + */ + rf_get_threadid(tid); + RF_LOCK_MUTEX(reconCtrlPtr->rb_mutex); + if ((raidPtr->headSepLimit >= 0) && + ((ctrl->headSepCounter - reconCtrlPtr->minHeadSepCounter) > raidPtr->headSepLimit)) { + Dprintf6("[%d] RECON: head sep stall: row %d col %d hsCtr %ld minHSCtr %ld limit %ld\n", + tid, row, col, ctrl->headSepCounter, reconCtrlPtr->minHeadSepCounter, raidPtr->headSepLimit); + cb = rf_AllocCallbackDesc(); + /* the minHeadSepCounter value we have to get to before we'll + * wake up. build in 20% hysteresis. */ + cb->callbackArg.v = (ctrl->headSepCounter - raidPtr->headSepLimit + raidPtr->headSepLimit / 5); + cb->row = row; + cb->col = col; + cb->next = NULL; + + /* insert this callback descriptor into the sorted list of + * pending head-sep callbacks */ + p = reconCtrlPtr->headSepCBList; + if (!p) + reconCtrlPtr->headSepCBList = cb; + else + if (cb->callbackArg.v < p->callbackArg.v) { + cb->next = reconCtrlPtr->headSepCBList; + reconCtrlPtr->headSepCBList = cb; + } else { + for (pt = p, p = p->next; p && (p->callbackArg.v < cb->callbackArg.v); pt = p, p = p->next); + cb->next = p; + pt->next = cb; + } + retval = 1; #if RF_RECON_STATS > 0 - ctrl->reconCtrl->reconDesc->hsStallCount++; -#endif /* RF_RECON_STATS > 0 */ - } - RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex); + ctrl->reconCtrl->reconDesc->hsStallCount++; +#endif /* RF_RECON_STATS > 0 */ + } + RF_UNLOCK_MUTEX(reconCtrlPtr->rb_mutex); - return(retval); + return (retval); } - /* checks to see if reconstruction has been either forced or blocked by a user operation. * if forced, we skip this RU entirely. * else if blocked, put ourselves on the wait list. 
@@ -1393,203 +1169,231 @@ static int CheckHeadSeparation( * * ASSUMES THE PSS MUTEX IS LOCKED UPON ENTRY */ -static int CheckForcedOrBlockedReconstruction( - RF_Raid_t *raidPtr, - RF_ReconParityStripeStatus_t *pssPtr, - RF_PerDiskReconCtrl_t *ctrl, - RF_RowCol_t row, - RF_RowCol_t col, - RF_StripeNum_t psid, - RF_ReconUnitNum_t which_ru) +static int +CheckForcedOrBlockedReconstruction( + RF_Raid_t * raidPtr, + RF_ReconParityStripeStatus_t * pssPtr, + RF_PerDiskReconCtrl_t * ctrl, + RF_RowCol_t row, + RF_RowCol_t col, + RF_StripeNum_t psid, + RF_ReconUnitNum_t which_ru) { - RF_CallbackDesc_t *cb; - int retcode = 0; - - if ((pssPtr->flags & RF_PSS_FORCED_ON_READ) || (pssPtr->flags & RF_PSS_FORCED_ON_WRITE)) retcode = RF_PSS_FORCED_ON_WRITE; - else if (pssPtr->flags & RF_PSS_RECON_BLOCKED) { - Dprintf4("RECON: row %d col %d blocked at psid %ld ru %d\n",row, col, psid, which_ru); - cb = rf_AllocCallbackDesc(); /* append ourselves to the blockage-wait list */ - cb->row = row; cb->col = col; - cb->next = pssPtr->blockWaitList; - pssPtr->blockWaitList = cb; - retcode = RF_PSS_RECON_BLOCKED; - } - - if (!retcode) pssPtr->flags |= RF_PSS_UNDER_RECON; /* mark this RU as under reconstruction */ - - return(retcode); + RF_CallbackDesc_t *cb; + int retcode = 0; + + if ((pssPtr->flags & RF_PSS_FORCED_ON_READ) || (pssPtr->flags & RF_PSS_FORCED_ON_WRITE)) + retcode = RF_PSS_FORCED_ON_WRITE; + else + if (pssPtr->flags & RF_PSS_RECON_BLOCKED) { + Dprintf4("RECON: row %d col %d blocked at psid %ld ru %d\n", row, col, psid, which_ru); + cb = rf_AllocCallbackDesc(); /* append ourselves to + * the blockage-wait + * list */ + cb->row = row; + cb->col = col; + cb->next = pssPtr->blockWaitList; + pssPtr->blockWaitList = cb; + retcode = RF_PSS_RECON_BLOCKED; + } + if (!retcode) + pssPtr->flags |= RF_PSS_UNDER_RECON; /* mark this RU as under + * reconstruction */ + + return (retcode); } - /* if reconstruction is currently ongoing for the indicated stripeID, reconstruction * is forced to completion and we return non-zero to indicate that the caller must * wait. If not, then reconstruction is blocked on the indicated stripe and the * routine returns zero. If and only if we return non-zero, we'll cause the cbFunc * to get invoked with the cbArg when the reconstruction has completed. 
*/ -int rf_ForceOrBlockRecon(raidPtr, asmap, cbFunc, cbArg) - RF_Raid_t *raidPtr; - RF_AccessStripeMap_t *asmap; - void (*cbFunc)(RF_Raid_t *,void *); - void *cbArg; +int +rf_ForceOrBlockRecon(raidPtr, asmap, cbFunc, cbArg) + RF_Raid_t *raidPtr; + RF_AccessStripeMap_t *asmap; + void (*cbFunc) (RF_Raid_t *, void *); + void *cbArg; { - RF_RowCol_t row = asmap->physInfo->row; /* which row of the array we're working on */ - RF_StripeNum_t stripeID = asmap->stripeID; /* the stripe ID we're forcing recon on */ - RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU; /* num sects in one RU */ - RF_ReconParityStripeStatus_t *pssPtr; /* a pointer to the parity stripe status structure */ - RF_StripeNum_t psid; /* parity stripe id */ - RF_SectorNum_t offset, fd_offset; /* disk offset, failed-disk offset */ - RF_RowCol_t *diskids; - RF_RowCol_t stripe; - int tid; - RF_ReconUnitNum_t which_ru; /* RU within parity stripe */ - RF_RowCol_t fcol, diskno, i; - RF_ReconBuffer_t *new_rbuf; /* ptr to newly allocated rbufs */ - RF_DiskQueueData_t *req; /* disk I/O req to be enqueued */ - RF_CallbackDesc_t *cb; - int created = 0, nPromoted; - - rf_get_threadid(tid); - psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru); - - RF_LOCK_PSS_MUTEX(raidPtr, row, psid); - - pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl[row]->pssTable, psid, which_ru, RF_PSS_CREATE|RF_PSS_RECON_BLOCKED, &created); - - /* if recon is not ongoing on this PS, just return */ - if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) { - RF_UNLOCK_PSS_MUTEX(raidPtr, row, psid); - return(0); - } - - /* otherwise, we have to wait for reconstruction to complete on this RU. */ - /* In order to avoid waiting for a potentially large number of low-priority accesses to - * complete, we force a normal-priority (i.e. not low-priority) reconstruction - * on this RU. - */ - if (!(pssPtr->flags & RF_PSS_FORCED_ON_WRITE) && !(pssPtr->flags & RF_PSS_FORCED_ON_READ)) { - DDprintf1("Forcing recon on psid %ld\n",psid); - pssPtr->flags |= RF_PSS_FORCED_ON_WRITE; /* mark this RU as under forced recon */ - pssPtr->flags &= ~RF_PSS_RECON_BLOCKED; /* clear the blockage that we just set */ - fcol = raidPtr->reconControl[row]->fcol; - - /* get a listing of the disks comprising the indicated stripe */ - (raidPtr->Layout.map->IdentifyStripe)(raidPtr, asmap->raidAddress, &diskids, &stripe); - RF_ASSERT(row == stripe); - - /* For previously issued reads, elevate them to normal priority. If the I/O has already completed, - * it won't be found in the queue, and hence this will be a no-op. - * For unissued reads, allocate buffers and issue new reads. 
The fact that we've set the - * FORCED bit means that the regular recon procs will not re-issue these reqs - */ - for (i=0; i<raidPtr->Layout.numDataCol+raidPtr->Layout.numParityCol; i++) if ( (diskno = diskids[i]) != fcol) { - if (pssPtr->issued[diskno]) { - nPromoted = rf_DiskIOPromote(&raidPtr->Queues[row][diskno], psid, which_ru); - if (rf_reconDebug && nPromoted) printf("[%d] promoted read from row %d col %d\n",tid,row,diskno); - } else { - new_rbuf = rf_MakeReconBuffer(raidPtr, row, diskno, RF_RBUF_TYPE_FORCED); /* create new buf */ - ComputePSDiskOffsets(raidPtr, psid, row, diskno, &offset, &fd_offset, - &new_rbuf->spRow, &new_rbuf->spCol, &new_rbuf->spOffset); /* find offsets & spare location */ - new_rbuf->parityStripeID = psid; /* fill in the buffer */ - new_rbuf->which_ru = which_ru; - new_rbuf->failedDiskSectorOffset = fd_offset; - new_rbuf->priority = RF_IO_NORMAL_PRIORITY; - - /* use NULL b_proc b/c all addrs should be in kernel space */ - req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, offset + which_ru * sectorsPerRU, sectorsPerRU, new_rbuf->buffer, - psid, which_ru, (int (*)(void *, int))ForceReconReadDoneProc, (void *) new_rbuf, NULL, - NULL,(void *)raidPtr, 0, NULL); - - RF_ASSERT(req); /* XXX -- fix this -- XXX */ - - new_rbuf->arg = req; - rf_DiskIOEnqueue(&raidPtr->Queues[row][diskno], req, RF_IO_NORMAL_PRIORITY); /* enqueue the I/O */ - Dprintf3("[%d] Issued new read req on row %d col %d\n",tid,row,diskno); - } - } - - /* if the write is sitting in the disk queue, elevate its priority */ - if (rf_DiskIOPromote(&raidPtr->Queues[row][fcol], psid, which_ru)) printf("[%d] promoted write to row %d col %d\n",tid,row,fcol); - } - - /* install a callback descriptor to be invoked when recon completes on this parity stripe. */ - cb = rf_AllocCallbackDesc(); - /* XXX the following is bogus.. These functions don't really match!! GO */ - cb->callbackFunc = (void (*)(RF_CBParam_t))cbFunc; - cb->callbackArg.p = (void *) cbArg; - cb->next = pssPtr->procWaitList; - pssPtr->procWaitList = cb; - DDprintf2("[%d] Waiting for forced recon on psid %ld\n",tid,psid); - - RF_UNLOCK_PSS_MUTEX(raidPtr, row, psid); - return(1); + RF_RowCol_t row = asmap->physInfo->row; /* which row of the array + * we're working on */ + RF_StripeNum_t stripeID = asmap->stripeID; /* the stripe ID we're + * forcing recon on */ + RF_SectorCount_t sectorsPerRU = raidPtr->Layout.sectorsPerStripeUnit * raidPtr->Layout.SUsPerRU; /* num sects in one RU */ + RF_ReconParityStripeStatus_t *pssPtr; /* a pointer to the parity + * stripe status structure */ + RF_StripeNum_t psid; /* parity stripe id */ + RF_SectorNum_t offset, fd_offset; /* disk offset, failed-disk + * offset */ + RF_RowCol_t *diskids; + RF_RowCol_t stripe; + int tid; + RF_ReconUnitNum_t which_ru; /* RU within parity stripe */ + RF_RowCol_t fcol, diskno, i; + RF_ReconBuffer_t *new_rbuf; /* ptr to newly allocated rbufs */ + RF_DiskQueueData_t *req;/* disk I/O req to be enqueued */ + RF_CallbackDesc_t *cb; + int created = 0, nPromoted; + + rf_get_threadid(tid); + psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru); + + RF_LOCK_PSS_MUTEX(raidPtr, row, psid); + + pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl[row]->pssTable, psid, which_ru, RF_PSS_CREATE | RF_PSS_RECON_BLOCKED, &created); + + /* if recon is not ongoing on this PS, just return */ + if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) { + RF_UNLOCK_PSS_MUTEX(raidPtr, row, psid); + return (0); + } + /* otherwise, we have to wait for reconstruction to complete on this + * RU. 
*/ + /* In order to avoid waiting for a potentially large number of + * low-priority accesses to complete, we force a normal-priority (i.e. + * not low-priority) reconstruction on this RU. */ + if (!(pssPtr->flags & RF_PSS_FORCED_ON_WRITE) && !(pssPtr->flags & RF_PSS_FORCED_ON_READ)) { + DDprintf1("Forcing recon on psid %ld\n", psid); + pssPtr->flags |= RF_PSS_FORCED_ON_WRITE; /* mark this RU as under + * forced recon */ + pssPtr->flags &= ~RF_PSS_RECON_BLOCKED; /* clear the blockage + * that we just set */ + fcol = raidPtr->reconControl[row]->fcol; + + /* get a listing of the disks comprising the indicated stripe */ + (raidPtr->Layout.map->IdentifyStripe) (raidPtr, asmap->raidAddress, &diskids, &stripe); + RF_ASSERT(row == stripe); + + /* For previously issued reads, elevate them to normal + * priority. If the I/O has already completed, it won't be + * found in the queue, and hence this will be a no-op. For + * unissued reads, allocate buffers and issue new reads. The + * fact that we've set the FORCED bit means that the regular + * recon procs will not re-issue these reqs */ + for (i = 0; i < raidPtr->Layout.numDataCol + raidPtr->Layout.numParityCol; i++) + if ((diskno = diskids[i]) != fcol) { + if (pssPtr->issued[diskno]) { + nPromoted = rf_DiskIOPromote(&raidPtr->Queues[row][diskno], psid, which_ru); + if (rf_reconDebug && nPromoted) + printf("[%d] promoted read from row %d col %d\n", tid, row, diskno); + } else { + new_rbuf = rf_MakeReconBuffer(raidPtr, row, diskno, RF_RBUF_TYPE_FORCED); /* create new buf */ + ComputePSDiskOffsets(raidPtr, psid, row, diskno, &offset, &fd_offset, + &new_rbuf->spRow, &new_rbuf->spCol, &new_rbuf->spOffset); /* find offsets & spare + * location */ + new_rbuf->parityStripeID = psid; /* fill in the buffer */ + new_rbuf->which_ru = which_ru; + new_rbuf->failedDiskSectorOffset = fd_offset; + new_rbuf->priority = RF_IO_NORMAL_PRIORITY; + + /* use NULL b_proc b/c all addrs + * should be in kernel space */ + req = rf_CreateDiskQueueData(RF_IO_TYPE_READ, offset + which_ru * sectorsPerRU, sectorsPerRU, new_rbuf->buffer, + psid, which_ru, (int (*) (void *, int)) ForceReconReadDoneProc, (void *) new_rbuf, NULL, + NULL, (void *) raidPtr, 0, NULL); + + RF_ASSERT(req); /* XXX -- fix this -- + * XXX */ + + new_rbuf->arg = req; + rf_DiskIOEnqueue(&raidPtr->Queues[row][diskno], req, RF_IO_NORMAL_PRIORITY); /* enqueue the I/O */ + Dprintf3("[%d] Issued new read req on row %d col %d\n", tid, row, diskno); + } + } + /* if the write is sitting in the disk queue, elevate its + * priority */ + if (rf_DiskIOPromote(&raidPtr->Queues[row][fcol], psid, which_ru)) + printf("[%d] promoted write to row %d col %d\n", tid, row, fcol); + } + /* install a callback descriptor to be invoked when recon completes on + * this parity stripe. */ + cb = rf_AllocCallbackDesc(); + /* XXX the following is bogus.. These functions don't really match!! + * GO */ + cb->callbackFunc = (void (*) (RF_CBParam_t)) cbFunc; + cb->callbackArg.p = (void *) cbArg; + cb->next = pssPtr->procWaitList; + pssPtr->procWaitList = cb; + DDprintf2("[%d] Waiting for forced recon on psid %ld\n", tid, psid); + + RF_UNLOCK_PSS_MUTEX(raidPtr, row, psid); + return (1); } - /* called upon the completion of a forced reconstruction read. * all we do is schedule the FORCEDREADONE event. * called at interrupt context in the kernel, so don't do anything illegal here. 
*/ -static void ForceReconReadDoneProc(arg, status) - void *arg; - int status; +static void +ForceReconReadDoneProc(arg, status) + void *arg; + int status; { - RF_ReconBuffer_t *rbuf = arg; + RF_ReconBuffer_t *rbuf = arg; - if (status) {printf("Forced recon read failed!\n"); /*fprintf(stderr,"Forced recon read failed!\n");*/ RF_PANIC();} - rf_CauseReconEvent((RF_Raid_t *) rbuf->raidPtr, rbuf->row, rbuf->col, (void *) rbuf, RF_REVENT_FORCEDREADDONE); + if (status) { + printf("Forced recon read failed!\n"); /* fprintf(stderr,"Forced + * recon read + * failed!\n"); */ + RF_PANIC(); + } + rf_CauseReconEvent((RF_Raid_t *) rbuf->raidPtr, rbuf->row, rbuf->col, (void *) rbuf, RF_REVENT_FORCEDREADDONE); } - /* releases a block on the reconstruction of the indicated stripe */ -int rf_UnblockRecon(raidPtr, asmap) - RF_Raid_t *raidPtr; - RF_AccessStripeMap_t *asmap; +int +rf_UnblockRecon(raidPtr, asmap) + RF_Raid_t *raidPtr; + RF_AccessStripeMap_t *asmap; { - RF_RowCol_t row = asmap->origRow; - RF_StripeNum_t stripeID = asmap->stripeID; - RF_ReconParityStripeStatus_t *pssPtr; - RF_ReconUnitNum_t which_ru; - RF_StripeNum_t psid; - int tid, created = 0; - RF_CallbackDesc_t *cb; - - rf_get_threadid(tid); - psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru); - RF_LOCK_PSS_MUTEX( raidPtr, row, psid); - pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl[row]->pssTable, psid, which_ru, RF_PSS_NONE, &created); - - /* When recon is forced, the pss desc can get deleted before we get back to unblock recon. - * But, this can _only_ happen when recon is forced. - * It would be good to put some kind of sanity check here, but how to decide if recon - * was just forced or not? - */ - if (!pssPtr) { - /*printf("Warning: no pss descriptor upon unblock on psid %ld RU %d\n",psid,which_ru);*/ - if (rf_reconDebug || rf_pssDebug) printf("Warning: no pss descriptor upon unblock on psid %ld RU %d\n",(long)psid,which_ru); - goto out; - } - - pssPtr->blockCount--; - Dprintf3("[%d] unblocking recon on psid %ld: blockcount is %d\n",tid,psid,pssPtr->blockCount); - if (pssPtr->blockCount == 0) { /* if recon blockage has been released */ - - /* unblock recon before calling CauseReconEvent in case CauseReconEvent causes us to - * try to issue a new read before returning here. - */ - pssPtr->flags &= ~RF_PSS_RECON_BLOCKED; - - - while (pssPtr->blockWaitList) { /* spin through the block-wait list and release all the waiters */ - cb = pssPtr->blockWaitList; - pssPtr->blockWaitList = cb->next; - cb->next = NULL; - rf_CauseReconEvent(raidPtr, cb->row, cb->col, NULL, RF_REVENT_BLOCKCLEAR); - rf_FreeCallbackDesc(cb); - } - if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) { /* if no recon was requested while recon was blocked */ - rf_PSStatusDelete(raidPtr, raidPtr->reconControl[row]->pssTable, pssPtr); - } - } - + RF_RowCol_t row = asmap->origRow; + RF_StripeNum_t stripeID = asmap->stripeID; + RF_ReconParityStripeStatus_t *pssPtr; + RF_ReconUnitNum_t which_ru; + RF_StripeNum_t psid; + int tid, created = 0; + RF_CallbackDesc_t *cb; + + rf_get_threadid(tid); + psid = rf_MapStripeIDToParityStripeID(&raidPtr->Layout, stripeID, &which_ru); + RF_LOCK_PSS_MUTEX(raidPtr, row, psid); + pssPtr = rf_LookupRUStatus(raidPtr, raidPtr->reconControl[row]->pssTable, psid, which_ru, RF_PSS_NONE, &created); + + /* When recon is forced, the pss desc can get deleted before we get + * back to unblock recon. But, this can _only_ happen when recon is + * forced. 
It would be good to put some kind of sanity check here, but + * how to decide if recon was just forced or not? */ + if (!pssPtr) { + /* printf("Warning: no pss descriptor upon unblock on psid %ld + * RU %d\n",psid,which_ru); */ + if (rf_reconDebug || rf_pssDebug) + printf("Warning: no pss descriptor upon unblock on psid %ld RU %d\n", (long) psid, which_ru); + goto out; + } + pssPtr->blockCount--; + Dprintf3("[%d] unblocking recon on psid %ld: blockcount is %d\n", tid, psid, pssPtr->blockCount); + if (pssPtr->blockCount == 0) { /* if recon blockage has been released */ + + /* unblock recon before calling CauseReconEvent in case + * CauseReconEvent causes us to try to issue a new read before + * returning here. */ + pssPtr->flags &= ~RF_PSS_RECON_BLOCKED; + + + while (pssPtr->blockWaitList) { /* spin through the block-wait + * list and release all the + * waiters */ + cb = pssPtr->blockWaitList; + pssPtr->blockWaitList = cb->next; + cb->next = NULL; + rf_CauseReconEvent(raidPtr, cb->row, cb->col, NULL, RF_REVENT_BLOCKCLEAR); + rf_FreeCallbackDesc(cb); + } + if (!(pssPtr->flags & RF_PSS_UNDER_RECON)) { /* if no recon was + * requested while recon + * was blocked */ + rf_PSStatusDelete(raidPtr, raidPtr->reconControl[row]->pssTable, pssPtr); + } + } out: - RF_UNLOCK_PSS_MUTEX( raidPtr, row, psid ); - return(0); + RF_UNLOCK_PSS_MUTEX(raidPtr, row, psid); + return (0); } |
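
Below the diff, two small stand-alone C sketches may help when reading the reformatted code above. Neither is part of the commit: every type, function, and constant name in them (disk_t, recon_state_t, NUM_COL, may_issue_read, update_min_head_sep, stripe_unit_offset) is invented for illustration, and the kernel's locking, callback descriptors, and sorted wait lists are reduced to plain arrays and scans. Only the policy itself is taken from the code in the diff.

The first sketch restates the head-separation throttling that TryToRead, CheckHeadSeparation, and CheckForNewMinHeadSep cooperate to implement: a disk stops issuing reconstruction reads once it is more than headSepLimit reconstruction units ahead of the slowest surviving disk, and it is woken with 20% hysteresis, i.e. only once the spread has fallen back to 80% of the limit.

/*
 * Minimal userland sketch of the head-separation policy; invented
 * names, no locking, no callback list.  Not the kernel implementation.
 */
#include <limits.h>
#include <stdio.h>

#define NUM_COL 5		/* surviving disks 0..3, failed column 4 */

typedef struct {
	long headSepCounter;	/* how far this disk has read ahead */
	int  stalled;		/* waiting for the slower disks */
	long wakeThreshold;	/* minimum counter that releases the stall */
} disk_t;

typedef struct {
	disk_t disks[NUM_COL];
	int    fcol;		/* failed column, excluded from the minimum */
	long   headSepLimit;	/* max allowed spread; negative disables */
	long   minHeadSep;	/* cached minimum over surviving disks */
} recon_state_t;

/*
 * Analogue of CheckHeadSeparation: may disk `col' issue its next read?
 * If it is more than headSepLimit ahead of the slowest surviving disk,
 * stall it and arm a wake threshold with 20% hysteresis, so it wakes
 * once the spread has fallen back to 80% of the limit.
 */
static int
may_issue_read(recon_state_t *rs, int col)
{
	disk_t *d = &rs->disks[col];

	if (rs->headSepLimit >= 0 &&
	    d->headSepCounter - rs->minHeadSep > rs->headSepLimit) {
		d->stalled = 1;
		d->wakeThreshold = d->headSepCounter -
		    rs->headSepLimit + rs->headSepLimit / 5;
		return 0;	/* caller must wait for a wake-up event */
	}
	return 1;
}

/*
 * Analogue of CheckForNewMinHeadSep: recompute the minimum counter over
 * the surviving disks and release any stalled disk whose threshold has
 * been reached.  The real code keeps waiters on a sorted callback list;
 * a plain scan is enough for the illustration.
 */
static void
update_min_head_sep(recon_state_t *rs)
{
	long new_min = LONG_MAX;
	int col;

	for (col = 0; col < NUM_COL; col++)
		if (col != rs->fcol && rs->disks[col].headSepCounter < new_min)
			new_min = rs->disks[col].headSepCounter;

	if (new_min == rs->minHeadSep)
		return;
	rs->minHeadSep = new_min;

	for (col = 0; col < NUM_COL; col++) {
		disk_t *d = &rs->disks[col];
		if (d->stalled && d->wakeThreshold <= new_min) {
			d->stalled = 0;
			printf("col %d released, min head sep now %ld\n",
			    col, new_min);
		}
	}
}

int
main(void)
{
	recon_state_t rs = { .fcol = 4, .headSepLimit = 10, .minHeadSep = 0 };
	int col;

	/* let column 0 race ahead until the policy stalls it */
	while (may_issue_read(&rs, 0))
		rs.disks[0].headSepCounter++;
	printf("col 0 stalled at counter %ld\n", rs.disks[0].headSepCounter);

	/* the slower surviving disks catch up; column 0 is released */
	for (col = 1; col < 4; col++)
		rs.disks[col].headSepCounter = 9;
	update_min_head_sep(&rs);
	return 0;
}

Built as an ordinary userland program, this stalls column 0 one unit past the limit and releases it as soon as the slower columns have caught up past the hysteresis threshold; the real code does the same with a per-array mutex and a sorted list of callback descriptors instead of the array scan used here.

The second sketch isolates the i_offset/j_offset arithmetic in ComputePSDiskOffsets, the least obvious part of that function: a disk's data stripe-unit offset is its position in the stripe's disk list, decremented by one when the parity disk appears earlier in the list, and the offset is forced to zero (to disable the multiply by sectorsPerStripeUnit) when the disk is itself the parity disk.

/*
 * Stand-alone restatement of the i_offset/j_offset adjustment in
 * ComputePSDiskOffsets; names are invented for the example.  idx is
 * the disk's position in the stripe's disk list and parity_idx the
 * parity disk's position.  Returns 1 (and a zero offset) if the disk
 * holds the parity for this stripe, else 0 and the disk's data
 * stripe-unit offset.
 */
#include <stdio.h>

static int
stripe_unit_offset(unsigned idx, unsigned parity_idx, unsigned *offset)
{
	if (parity_idx == idx) {
		*offset = 0;	/* "set offsets to zero to disable multiply" */
		return 1;
	}
	*offset = (parity_idx < idx) ? idx - 1 : idx;
	return 0;
}

int
main(void)
{
	unsigned off, idx;

	/* 4-wide stripe with the parity in column 2 */
	for (idx = 0; idx < 4; idx++) {
		int is_parity = stripe_unit_offset(idx, 2, &off);
		printf("disk %u: %s, offset %u\n",
		    idx, is_parity ? "parity" : "data", off);
	}
	return 0;
}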