diff options
author | 1999-07-30 14:45:31 +0000 | |
---|---|---|
committer | 1999-07-30 14:45:31 +0000 | |
commit | 16c3d6c44805f17bcf201fdae3a0c3b9f321ce72 (patch) | |
tree | e0cb70b347563c6decb963b06643841fb106face | |
parent | rename rl2->rln for sanity (diff) | |
download | wireguard-openbsd-16c3d6c44805f17bcf201fdae3a0c3b9f321ce72.tar.xz wireguard-openbsd-16c3d6c44805f17bcf201fdae3a0c3b9f321ce72.zip |
Update RAIDframe from NetBSD-current as of 1999/07/26.
Please note that you *must* follow the upgrade instructions at
http://www.cs.usask.ca/staff/oster/clabel_upgrade.html
before installing the new raidctl and new kernel using this code.
29 files changed, 3051 insertions, 850 deletions
diff --git a/sbin/raidctl/Makefile b/sbin/raidctl/Makefile index b2fbf126fd9..d87379a608d 100644 --- a/sbin/raidctl/Makefile +++ b/sbin/raidctl/Makefile @@ -1,13 +1,12 @@ -# $OpenBSD: Makefile,v 1.3 1999/05/21 17:59:54 deraadt Exp $ -# $NetBSD: Makefile,v 1.4 1999/02/04 14:50:31 oster Exp $ +# $OpenBSD: Makefile,v 1.4 1999/07/30 14:45:31 peter Exp $ +# $NetBSD: Makefile,v 1.5 1999/03/26 00:46:05 oster Exp $ PROG= raidctl SRCS= rf_configure.c rf_layout.c rf_strutils.c raidctl.c MAN= raidctl.8 LOOKHERE = ${.CURDIR}/../../sys/dev/raidframe -CFLAGS+= -DCSRG_BASED -DNARROWPROTO -DRF_UTILITY=1 -DSIMULATE=1 -CPPFLAGS+= -I${LOOKHERE} +CPPFLAGS+= -DRF_UTILITY=1 -I${LOOKHERE} .PATH: ${LOOKHERE} DPADD= ${LIBUTIL} diff --git a/sbin/raidctl/raidctl.8 b/sbin/raidctl/raidctl.8 index a7c8304e73d..244d32c92dc 100644 --- a/sbin/raidctl/raidctl.8 +++ b/sbin/raidctl/raidctl.8 @@ -1,6 +1,4 @@ -.\" $OpenBSD: raidctl.8,v 1.5 1999/07/03 02:11:08 aaron Exp $ -.\" -.\" $NetBSD: raidctl.8,v 1.3 1999/02/04 14:50:31 oster Exp $ +.\" $NetBSD: raidctl.8,v 1.8 1999/03/24 06:18:30 mycroft Exp $ .\" .\" Copyright (c) 1998 The NetBSD Foundation, Inc. .\" All rights reserved. @@ -39,58 +37,70 @@ .\" .\" Copyright (c) 1995 Carnegie-Mellon University. .\" All rights reserved. -.\" +.\" .\" Author: Mark Holland -.\" +.\" .\" Permission to use, copy, modify and distribute this software and .\" its documentation is hereby granted, provided that both the copyright .\" notice and this permission notice appear in all copies of the .\" software, derivative works or modified versions, and any portions .\" thereof, and that both notices appear in supporting documentation. -.\" +.\" .\" CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS" .\" CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND .\" FOR ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE. 
-.\" +.\" .\" Carnegie Mellon requests users of this software to return to -.\" +.\" .\" Software Distribution Coordinator or Software.Distribution@CS.CMU.EDU .\" School of Computer Science .\" Carnegie Mellon University .\" Pittsburgh PA 15213-3890 -.\" +.\" .\" any improvements or extensions that they make and grant Carnegie the .\" rights to redistribute these changes. -.\" +.\" .Dd November 6, 1998 .Dt RAIDCTL 8 -.Os +.Os NetBSD .Sh NAME .Nm raidctl .Nd configuration utility for the RAIDframe disk driver .Sh SYNOPSIS -.Nm raidctl +.Nm "" +.Fl a Ar component Ar dev +.Nm "" +.Fl B Ar dev +.Nm "" .Fl c Ar config_file Ar dev -.Nm raidctl -.Fl C Ar dev -.Nm raidctl +.Nm "" +.Fl C Ar config_file Ar dev +.Nm "" .Fl f Ar component Ar dev -.Nm raidctl +.Nm "" .Fl F Ar component Ar dev -.Nm raidctl -.Fl r Ar dev -.Nm raidctl -.Fl R Ar dev -.Nm raidctl -.Fl s Ar dev -.Nm raidctl +.Nm "" +.Fl g Ar component Ar dev +.Nm "" +.Fl i Ar dev +.Nm "" +.Fl I Ar serial_number Ar dev +.Nm "" +.Fl r Ar component Ar dev +.Nm "" +.Fl R Ar component Ar dev +.Nm "" +.Fl s Ar dev +.Nm "" +.Fl S Ar dev +.Nm "" .Fl u Ar dev .Sh DESCRIPTION -.Nm +.Nm "" is the user-land control program for .Xr raid 4 , -the RAIDframe disk device. -.Nm +the RAIDframe disk device. +.Nm "" is primarily used to dynamically configure and unconfigure RAIDframe disk devices. For more information about the RAIDframe disk device, see .Xr raid 4 . @@ -98,92 +108,118 @@ devices. For more information about the RAIDframe disk device, see This document assumes the reader has at least rudimentary knowledge of RAID and RAID concepts. .Pp -The command-line options for +The command-line options for .Nm are as follows: .Bl -tag -width indent +.It Fl a Ar component Ar dev +Add +.Ar component +as a hot spare for the device +.Ar dev . +.It Fl B Ar dev +Initiate a copyback of reconstructed data from a spare disk to +it's original disk. 
This is performed after a component has failed, +and the failed drive has been reconstructed onto a spare drive. .It Fl c Ar config_file Ar dev -Configure the RAIDframe device +Configure the RAIDframe device .Ar dev according to the configuration given in .Ar config_file . -A description of the contents of +A description of the contents of .Ar config_file is given later. -.It Fl C Ar dev -Initiate a copyback of reconstructed data from a spare disk to -its original disk. This is performed after a component has failed, -and the failed drive has been reconstructed onto a spare drive. +.It Fl C Ar config_file Ar dev +As for +.Ar -c , +but forces the configuration to take place. This is required the +first time a RAID set is configured. .It Fl f Ar component Ar dev -This marks the specified +This marks the specified .Ar component as having failed, but does not initiate a reconstruction of that -component. +component. .It Fl F Ar component Ar dev -Fails the specified +Fails the specified .Ar component -of the device, and immediately beginis a reconstruction of the failed -disk onto an available hot spare. This is the mechanism used to start +of the device, and immediately begin a reconstruction of the failed +disk onto an available hot spare. This is one of the mechanisms used to start the reconstruction process if a component does have a hardware failure. -.It Fl r Ar dev -Re-write the parity on the device. This -.Em must +.It Fl g Ar component Ar dev +Get the component label for the specified component. +.It Fl i Ar dev +Initialize (re-write) the parity on the device. This +.Ar MUST be done before the RAID device is labeled and before filesystems are created on the RAID device, and is normally used after -a system crash (and before a -.Xr fsck 8 ) Ns +a system crash (and before a +.Xr fsck 8 ) to ensure the integrity of the parity. -.It Fl R Ar dev -Check the status of component reconstruction. 
The output indicates -the amount of progress achieved in reconstructing a failed component. +.It Fl I Ar serial_number Ar dev +Initialize the component labels on each component of the device. +.Ar serial_number +is used as one of the keys in determining whether a +particular set of components belong to the same RAID set. While not +strictly enforced, different serial numbers should be used for +different RAID sets. +.It Fl r Ar component Ar dev +Remove the spare disk specified by +.Ar component +from the set of available spare components. +.It Fl R Ar component Ar dev +Fails the specified +.Ar component , +if necessary, and immediately begins a reconstruction back to +.Ar component . +This is another mechanism for starting the reconstruction process if a +component has a hardware failure. .It Fl s Ar dev Display the status of the RAIDframe device for each of the components -and spares. +and spares. +.It Fl S Ar dev +Check the status of component reconstruction. The output indicates +the amount of progress achieved in reconstructing a failed component. .It Fl u Ar dev Unconfigure the RAIDframe device. .El .Pp -The device used by +The device used by .Nm -is specified by -.Ar dev . +is specified by +.Ar dev . .Ar dev -may be either the full name of the device (e.g., -.Pa /dev/rraid0d -for the i386 architecture, and -.Pa /dev/rraid0c -for all others), -or just simply raid0 (for -.Pa /dev/rraid0d ) . +may be either the full name of the device, e.g. /dev/rraid0d, +for the i386 architecture, and /dev/rraid0c +for all others, or just simply raid0 (for /dev/rraid0d). .Pp The format of the configuration file is complex, and only an abbreviated treatment is given here. In the configuration -files, a +files, a .Sq # indicates the beginning of a comment. .Pp There are 4 required sections of a configuration file, and 2 -optional components. Each section begins with a -.Dq START , +optional components. 
Each section begins with a +.Sq START , followed by the section name, and the configuration parameters associated with that -section. The first section is the -.Dq array +section. The first section is the +.Sq array section, and it specifies -the number of rows, columns, and spare disks in the RAID array. For -example: +the number of rows, columns, and spare disks in the RAID set. For +example: .Bd -unfilled -offset indent START array 1 3 0 .Ed .Pp indicates an array with 1 row, 3 columns, and 0 spare disks. Note -that although multi-dimensional arrays may be specified, they are -.Em not +that although multi-dimensional arrays may be specified, they are +.Ar NOT supported in the driver. .Pp -The second section, the -.Dq disks +The second section, the +.Sq disks section, specifies the actual components of the device. For example: .Bd -unfilled -offset indent @@ -195,37 +231,38 @@ START disks .Pp specifies the three component disks to be used in the RAID device. If any of the specified drives cannot be found when the RAID device is -configured, then they will be marked as -.Dq failed , +configured, then they will be marked as +.Sq failed , and the system will -operate in degraded mode. Note that it is -.Em imperative +operate in degraded mode. Note that it is +.Ar imperative that the order of the components in the configuration file does not change between configurations of a RAID device. Changing the order of the components (at least at the time of this writing) will result in -data loss. +data loss. .Pp -The next section, -.Dq spare , -is optional, and if present specifies the devices to be used as -.Dq hot spares +The next section, which is the +.Sq spare +section, is optional, and, if +present, specifies the devices to be used as +.Sq hot spares -- devices which are on-line, but are not actively used by the RAID driver unless -one of the main components fail. 
A simple +.Sq spare section might be: .Bd -unfilled -offset indent -START spare +START spare /dev/sd3e .Ed .Pp for a configuration with a single spare component. If no spare drives -are to be used in the configuration, then the -.Dq spare -section may be omitted. +are to be used in the configuration, then the +.Sq spare +section may be omitted. .Pp -The next section is the -.Dq layout +The next section is the +.Sq layout section. This section describes the general layout parameters for the RAID device, and provides such information as sectors per stripe unit, stripe units per parity unit, @@ -238,7 +275,7 @@ START layout .Ed .Pp The sectors per stripe unit specifies, in blocks, the interleave -factor; i.e., the number of contiguous sectors to be written to each +factor; i.e. the number of contiguous sectors to be written to each component for a single stripe. Appropriate selection of this value (32 in this example) is the subject of much research in RAID architectures. The stripe units per parity unit and @@ -247,9 +284,9 @@ While certain values above 1 are permitted, a discussion of valid values and the consequences of using anything other than 1 are outside the scope of this document. The last value in this section (5 in this example) indicates the parity configuration desired. Valid entries -include: +include: .Bl -tag -width inde -.It 0 +.It 0 RAID level 0. No parity, only simple striping. .It 1 RAID level 1. Mirroring. @@ -262,13 +299,13 @@ all components. .El .Pp There are other valid entries here, including those for Even-Odd -parity, RAID level 5 with rotated sparing, Chained declustering, +parity, RAID level 5 with rotated sparing, Chained declustering, and Interleaved declustering, but as of this writing the code for -those parity operations has not been tested with -.Ox . +those parity operations has not been tested with +.Nx . .Pp -The next required section is the -.Dq queue +The next required section is the +.Sq queue section. 
This is most often specified as: .Bd -unfilled -offset indent @@ -276,41 +313,44 @@ START queue fifo 1 .Ed .Pp -where the queuing method is specified as FIFO (first-in, first-out), -and the size of the per-component queue is limited to 1 request. A +where the queueing method is specified as fifo (first-in, first-out), +and the size of the per-component queue is limited to 1 request. A value of 1 is quite conservative here, and values of 100 or more may -be used to increase the driver performance. +be used to increase the driver performance. Other queuing methods may also be specified, but a discussion of them is beyond the scope of this document. .Pp -The final section, the -.Dq debug +The final section, the +.Sq debug section, is optional. For more details on this the reader is referred to the RAIDframe documentation -discussed in the +discussed in the .Sx HISTORY section. + See .Sx EXAMPLES for a more complete configuration file example. + .Sh EXAMPLES + The examples in this section will focus on a RAID 5 configuration. Other RAID configurations will behave similarly. It is highly recommended that before using the RAID driver for real filesystems -that the system administrator(s) have used -.Em all -of the options for -.Nm , +that the system administrator(s) have used +.Ar all +of the options for +.Nm "" , and that they understand how the component reconstruction process works. While this example is not created as a tutorial, the steps shown here can be easily duplicated using four equal-sized partitions -from any number of disks (including all four from a single disk). +from any number of disks (including all four from a single disk). .Pp -The primary use of -.Nm +The primary uses of +.Nm "" is to configure and unconfigure -.Xr raid 4 -devices. +.Xr raid 4 +devices. 
To configure the device, a configuration file which looks something like: .Bd -unfilled -offset indent START array @@ -334,20 +374,14 @@ fifo 100 .Ed .Pp is first created. In short, this configuration file specifies a RAID -5 configuration consisting of the disks -.Pa /dev/sd1e , -.Pa /dev/sd2e , -and -.Pa /dev/sd3e , -with -.Pa /dev/sd4e -available as a -.Dq hot spare +5 configuration consisting of the components /dev/sd1e, +/dev/sd2e, and /dev/sd3e, with /dev/sd4e available as a +.Sq hot spare in case one of the three main drives should fail. If the above configuration is in a -file called -.Pa rfconfig , -raid device 0 can be configured with: +file called +.Sq rfconfig , +raid device 0 in the normal case can be configured with: .Bd -unfilled -offset indent raidctl -c rfconfig raid0 .Ed @@ -357,12 +391,69 @@ The above is equivalent to the following: raidctl -c rfconfig /dev/rraid0d .Ed .Pp -on the i386 architecture. On all other architectures, -.Pa /dev/rraid0c -is used in place of -.Pa /dev/rraid0d . +on the i386 architecture. On all other architectures, /dev/rraid0c +is used in place of /dev/rraid0d. +.Pp +A RAID set will not configure with +.Fl c +if the component labels are not correct. A +.Sq component label +contains important information about the component, including a +user-specified serial number, the row and column of that component in the RAID +set, and whether the data (and parity) on the component is +.Sq clean . +See +.Xr raid 4 +for more information about component labels. +.Pp +Since new RAID sets will not have correct component labels, the first +configuration of a RAID set must use +.Fl C +instead of +.Fl c : +.Bd -unfilled -offset indent +raidctl -C rfconfig raid0 +.Ed +.Pp +The +.Fl C +forces the configuration to succeed, even if any of the component +labels are incorrect. 
This option should not be used lightly in +situations other than initial configurations, as if +the system is refusing to configure a RAID set, there is probably a +very good reason for it. +.Pp +When the RAID set is configured for the first time, it is +necessary to initialize the component labels, and to initialize the +parity on the RAID set. Initializing the component labels is done with: +.Bd -unfilled -offset indent +raidctl -I 112341 raid0 +.Ed +.Pp +where +.Sq 112341 +is a user-specified serial number for the RAID set. Using different +serial numbers between RAID sets is strongly encouraged, as using the +same serial number for all RAID sets will only serve to decrease the +usefulness of the component label checking. .Pp -To see how the device is doing, the following will show the status: +Initializing the parity on the RAID set is done via: +.Bd -unfilled -offset indent +raidctl -i raid0 +.Ed +.Pp +Initializing the parity in this way may also be required after an +unclean shutdown. Once the parity is known to be correct, +it is then safe to perform +.Xr disklabel 8 , +.Xr newfs 8 , +or +.Xr fsck 8 +on the device or its filesystems, and then to mount the filesystems +for use. +.Pp +To see how the RAID set is doing, the following command can be used to +show the RAID set's status: .Bd -unfilled -offset indent raidctl -s raid0 .Ed @@ -374,23 +465,36 @@ Components: /dev/sd2e: optimal /dev/sd3e: optimal Spares: - /dev/sd4e [0][0]: spare + /dev/sd4e: spare .Ed .Pp -This indicates that all is well with the RAID array. If this is the first -time this RAID array has been configured, or the system is just being -brought up after an unclean shutdown, it is necessary to -ensure that the parity values are correct. This can be done via: +This indicates that all is well with the RAID set. 
+.Pp +To check the component label of /dev/sd1e, the following is used: .Bd -unfilled -offset indent -raidctl -r raid0 +raidctl -g /dev/sd1e raid0 .Ed .Pp -Once this is done, it is then safe to perform -.Xr disklabel 8 , Ns -.Xr newfs 8 , Ns -or -.Xr fsck 8 -on the device or its filesystems. +The output of this command will look something like: +.Bd -unfilled -offset indent +Component label for /dev/sd2e: +Version: 1 +Serial Number: 112341 +Mod counter: 6 +Row: 0 +Column: 1 +Num Rows: 1 +Num Columns: 3 +Clean: 0 +Status: optimal +.Ed +.Pp +For a component label to be considered valid, that particular +component label must be in agreement with the other component labels +in the set. For example, the serial number, 'modification counter', +number of rows and number of columns must all be in agreement. If any +of these are different, then the component is not considered to be +part of the set. .Pp If for some reason (perhaps to test reconstruction) it is necessary to pretend a drive @@ -400,7 +504,7 @@ raidctl -f /dev/sd2e raid0 .Ed .Pp The system will then be performing all operations in degraded mode, -where missing data is re-computed from existing data and the parity. +were missing data is re-computed from existing data and the parity. In this case, obtaining the status of raid0 will return: .Bd -unfilled -offset indent Components: @@ -408,19 +512,24 @@ Components: /dev/sd2e: failed /dev/sd3e: optimal Spares: - /dev/sd4e [0][0]: spare + /dev/sd4e: spare .Ed .Pp -Note that with the use of +Note that with the use of .Fl f a reconstruction has not been started. To both fail the disk and -start a reconstruction, the +start a reconstruction, the .Fl F -option must be used. (The +option must be used: +.Bd -unfilled -offset indent +raidctl -F /dev/sd2e raid0 +.Ed +.Pp +The .Fl f option may be used first, and then the .Fl F -option used later, on the same disk, if desired.) +option used later, on the same disk, if desired. 
Immediately after the reconstruction is started, the status will report: .Bd -unfilled -offset indent Components: @@ -428,12 +537,12 @@ Components: /dev/sd2e: reconstructing /dev/sd3e: optimal Spares: - /dev/sd4e [0][0]: used_spare + /dev/sd4e: used_spare .Ed .Pp This indicates that a reconstruction is in progress. To find out how -the reconstruction is progressing the -.Fl R +the reconstruction is progressing the +.Fl S option may be used. This will indicate the progress in terms of the percentage of the reconstruction that is completed. When the reconstruction is finished the @@ -445,39 +554,33 @@ Components: /dev/sd2e: spared /dev/sd3e: optimal Spares: - /dev/sd4e [0][0]: used_spare + /dev/sd4e: used_spare .Ed .Pp -At this point there are at least two options. First, if -.Pa /dev/sd2e -is known to be good (i.e., the failure was either caused by +At this point there are at least two options. First, if /dev/sd2e is +known to be good (i.e. the failure was either caused by .Fl f -or +or .Fl F , -or the failed disk was replaced), then a copyback of the data can -be initiated with the -.Fl C +or the failed disk was replaced), then a copyback of the data can +be initiated with the +.Fl B option. In this example, this would copy the entire contents of -.Pa /dev/sd4e -to -.Pa /dev/sd2e . -Once the copyback procedure is complete, the status of the device would be: +/dev/sd4e to /dev/sd2e. Once the copyback procedure is complete, the +status of the device would be: .Bd -unfilled -offset indent Components: /dev/sd1e: optimal /dev/sd2e: optimal /dev/sd3e: optimal Spares: - /dev/sd4e [0][0]: spare + /dev/sd4e: spare .Ed .Pp and the system is back to normal operation. .Pp -The second option after the reconstruction is to simply use -.Pa /dev/sd4e -in place of -.Pa /dev/sd2e -in the configuration file. For example, the +The second option after the reconstruction is to simply use /dev/sd4e +in place of /dev/sd2e in the configuration file. 
For example, the configuration file (in part) might now look like: .Bd -unfilled -offset indent START array @@ -489,21 +592,75 @@ START drives /dev/sd3e .Ed .Pp -This can be done as -.Pa /dev/sd4e -is completely interchangeable with -.Pa /dev/sd2e -at this point. Note that extreme care must be taken when +This can be done as /dev/sd4e is completely interchangeable with +/dev/sd2e at this point. Note that extreme care must be taken when changing the order of the drives in a configuration. This is one of the few instances where the devices and/or their orderings can be changed without loss of data! In general, the ordering of components -in a configuration file should -.Em never +in a configuration file should +.Ar never be changed. .Pp -The final operation performed by +If a component fails and there are no hot spares +available on-line, the status of the RAID set might look like: +.Bd -unfilled -offset indent +Components: + /dev/sd1e: optimal + /dev/sd2e: failed + /dev/sd3e: optimal +No spares. +.Ed +.Pp +In this case there are a number of options. The first option is to add a hot +spare using: +.Bd -unfilled -offset indent +raidctl -a /dev/sd4e raid0 +.Ed +.Pp +After the hot add, the status would then be: +.Bd -unfilled -offset indent +Components: + /dev/sd1e: optimal + /dev/sd2e: failed + /dev/sd3e: optimal +Spares: + /dev/sd4e: spare +.Ed +.Pp +Reconstruction could then take place using +.Fl F +as described above. +.Pp +A second option is to rebuild directly onto /dev/sd2e. Once the disk +containing /dev/sd2e has been replaced, one can simply use: +.Bd -unfilled -offset indent +raidctl -R /dev/sd2e raid0 +.Ed +.Pp +to rebuild the /dev/sd2e component. As the rebuilding is in progress, +the status will be: +.Bd -unfilled -offset indent +Components: + /dev/sd1e: optimal + /dev/sd2e: reconstructing + /dev/sd3e: optimal +No spares. 
+.Ed +.Pp +and when completed, will be: +.Bd -unfilled -offset indent +Components: + /dev/sd1e: optimal + /dev/sd2e: optimal + /dev/sd3e: optimal +No spares. +.Ed +.Pp + +.Pp +The final operation performed by .Nm -is to unconfigure a +is to unconfigure a .Xr raid 4 device. This is accomplished via a simple: .Bd -unfilled -offset indent @@ -516,12 +673,12 @@ Certain RAID levels (1, 4, 5, 6, and others) can protect against some data loss due to component failure. However the loss of two components of a RAID 4 or 5 system, or the loss of a single component of a RAID 0 system will result in the entire filesystem being lost. -RAID is -.Em not +RAID is +.Ar NOT a substitute for good backup practices. .Pp -Recomputation of parity -.Em must +Recomputation of parity +.Ar MUST be performed whenever there is a chance that it may have been compromised. This includes after system crashes, or before a RAID device has been used for the first time. Failure to keep parity @@ -529,20 +686,24 @@ correct will be catastrophic should a component ever fail -- it is better to use RAID 0 and get the additional space and speed, than it is to use parity, but not keep the parity correct. At least with RAID 0 there is no perception of increased data security. +.Pp .Sh FILES .Bl -tag -width /dev/XXrXraidX -compact .It Pa /dev/{,r}raid* -.Nm -device special files +.Cm raid +device special files. .El +.Pp .Sh SEE ALSO -.Xr ccd 4 , .Xr raid 4 , +.Xr ccd 4 , .Xr rc 8 +.Sh BUGS +Hot-spare removal is currently not available. .Sh HISTORY RAIDframe is a framework for rapid prototyping of RAID structures developed by the folks at the Parallel Data Laboratory at Carnegie -Mellon University (CMU). +Mellon University (CMU). A more complete description of the internals and functionality of RAIDframe is found in the paper "RAIDframe: A Rapid Prototyping Tool for RAID Systems", by William V. Courtright II, Garth Gibson, Mark @@ -558,7 +719,6 @@ is a complete re-write, and first appeared in .Nx 1.4 . 
.Sh COPYRIGHT .Bd -unfilled - The RAIDframe Copyright is as follows: Copyright (c) 1994-1996 Carnegie-Mellon University. @@ -583,5 +743,4 @@ Carnegie Mellon requests users of this software to return to any improvements or extensions that they make and grant Carnegie the rights to redistribute these changes. - .Ed diff --git a/sbin/raidctl/raidctl.c b/sbin/raidctl/raidctl.c index 95db0294c6a..6350be52c80 100644 --- a/sbin/raidctl/raidctl.c +++ b/sbin/raidctl/raidctl.c @@ -1,5 +1,5 @@ -/* $OpenBSD: raidctl.c,v 1.2 1999/02/16 21:51:39 niklas Exp $ */ -/* $NetBSD: raidctl.c,v 1.4 1999/02/04 14:50:31 oster Exp $ */ +/* $OpenBSD: raidctl.c,v 1.3 1999/07/30 14:45:32 peter Exp $ */ +/* $NetBSD: raidctl.c,v 1.6 1999/03/02 03:13:59 oster Exp $ */ /*- * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc. * All rights reserved. @@ -57,6 +57,11 @@ #include <errno.h> #include <sys/types.h> #include <string.h> +#ifdef NETBSD +#include <sys/disklabel.h> +#include <machine/disklabel.h> +#endif +#include <stdlib.h> #include <unistd.h> #include "rf_raidframe.h" @@ -65,11 +70,18 @@ extern char *__progname; int main __P((int, char *[])); static void do_ioctl __P((int, unsigned long, void *, char *)); -static void rf_configure __P((int, char*)); +static void rf_configure __P((int, char*, int)); static char *device_status __P((RF_DiskStatus_t)); static void rf_get_device_status __P((int)); +static void get_component_number __P((int, char *, int *, int *)); static void rf_fail_disk __P((int, char *, int)); static void usage __P((void)); +static void get_component_label __P((int, char *)); +static void set_component_label __P((int, char *)); +static void init_component_labels __P((int, int)); +static void add_hot_spare __P((int, char *)); +static void remove_hot_spare __P((int, char *)); +static void rebuild_in_place __P((int, char *)); int main(argc,argv) @@ -84,53 +96,93 @@ main(argc,argv) char config_filename[PATH_MAX]; char dev_name[PATH_MAX]; char name[PATH_MAX]; - char 
component_to_fail[PATH_MAX]; + char component[PATH_MAX]; int do_recon; int raidID; int rawpart; int recon_percent_done; - struct stat st; - int fd; + int serial_number; + struct stat st; + int fd; + int force; num_options = 0; action = 0; do_recon = 0; + force = 0; - while ((ch = getopt(argc, argv, "c:Cf:F:rRsu")) != -1) + while ((ch = getopt(argc, argv, "a:Bc:C:f:F:g:iI:l:r:R:sSu")) != -1) switch(ch) { + case 'a': + action = RAIDFRAME_ADD_HOT_SPARE; + strncpy(component, optarg, PATH_MAX); + num_options++; + break; + case 'B': + action = RAIDFRAME_COPYBACK; + num_options++; + break; case 'c': - strncpy(config_filename,optarg,PATH_MAX); action = RAIDFRAME_CONFIGURE; + strncpy(config_filename,optarg,PATH_MAX); + force = 0; num_options++; break; case 'C': - action = RAIDFRAME_COPYBACK; + strncpy(config_filename,optarg,PATH_MAX); + action = RAIDFRAME_CONFIGURE; + force = 1; num_options++; break; case 'f': action = RAIDFRAME_FAIL_DISK; + strncpy(component, optarg, PATH_MAX); do_recon = 0; - strncpy(component_to_fail, optarg, PATH_MAX); num_options++; break; case 'F': action = RAIDFRAME_FAIL_DISK; + strncpy(component, optarg, PATH_MAX); do_recon = 1; - strncpy(component_to_fail, optarg, PATH_MAX); num_options++; break; - case 'r': + case 'g': + action = RAIDFRAME_GET_COMPONENT_LABEL; + strncpy(component, optarg, PATH_MAX); + num_options++; + break; + case 'i': action = RAIDFRAME_REWRITEPARITY; num_options++; break; + case 'I': + action = RAIDFRAME_INIT_LABELS; + serial_number = atoi(optarg); + num_options++; + break; + case 'l': + action = RAIDFRAME_SET_COMPONENT_LABEL; + strncpy(component, optarg, PATH_MAX); + num_options++; + break; + case 'r': + action = RAIDFRAME_REMOVE_HOT_SPARE; + strncpy(component, optarg, PATH_MAX); + num_options++; + break; case 'R': - action = RAIDFRAME_CHECKRECON; + strncpy(component,optarg,PATH_MAX); + action = RAIDFRAME_REBUILD_IN_PLACE; num_options++; break; case 's': action = RAIDFRAME_GET_INFO; num_options++; break; + case 'S': + action = 
RAIDFRAME_CHECKRECON; + num_options++; + break; case 'u': action = RAIDFRAME_SHUTDOWN; num_options++; @@ -180,15 +232,30 @@ main(argc,argv) switch(action) { + case RAIDFRAME_ADD_HOT_SPARE: + add_hot_spare(fd,component); + break; + case RAIDFRAME_REMOVE_HOT_SPARE: + remove_hot_spare(fd,component); + break; case RAIDFRAME_CONFIGURE: - rf_configure(fd, config_filename); + rf_configure(fd, config_filename, force); break; case RAIDFRAME_COPYBACK: printf("Copyback.\n"); do_ioctl(fd, RAIDFRAME_COPYBACK, NULL, "RAIDFRAME_COPYBACK"); break; case RAIDFRAME_FAIL_DISK: - rf_fail_disk(fd,component_to_fail,do_recon); + rf_fail_disk(fd,component,do_recon); + break; + case RAIDFRAME_SET_COMPONENT_LABEL: + set_component_label(fd,component); + break; + case RAIDFRAME_GET_COMPONENT_LABEL: + get_component_label(fd,component); + break; + case RAIDFRAME_INIT_LABELS: + init_component_labels(fd,serial_number); break; case RAIDFRAME_REWRITEPARITY: printf("Initiating re-write of parity\n"); @@ -204,6 +271,9 @@ main(argc,argv) case RAIDFRAME_GET_INFO: rf_get_device_status(fd); break; + case RAIDFRAME_REBUILD_IN_PLACE: + rebuild_in_place(fd,component); + break; case RAIDFRAME_SHUTDOWN: do_ioctl(fd, RAIDFRAME_SHUTDOWN, NULL, "RAIDFRAME_SHUTDOWN"); break; @@ -230,9 +300,10 @@ do_ioctl(fd, command, arg, ioctl_name) static void -rf_configure(fd,config_file) +rf_configure(fd,config_file,force) int fd; char *config_file; + int force; { void *generic; RF_Config_t cfg; @@ -243,6 +314,8 @@ rf_configure(fd,config_file) exit(1); } + cfg.force = force; + /* Note the extra level of redirection needed here, since @@ -252,13 +325,7 @@ rf_configure(fd,config_file) */ generic = (void *) &cfg; - do_ioctl(fd,RAIDFRAME_CONFIGURE,&generic,"RAIDFRAME_CONFIGURE"); -#if 0 - if (ioctl(fd, RAIDFRAME_CONFIGURE, &generic) < 0) { - warn("ioctl (RAIDFRAME_CONFIGURE): failed\n"); - exit(1); - } -#endif + do_ioctl(fd, RAIDFRAME_CONFIGURE, &generic, "RAIDFRAME_CONFIGURE"); } static char * @@ -316,10 +383,8 @@ 
rf_get_device_status(fd) if (device_config.nspares > 0) { printf("Spares:\n"); for(i=0; i < device_config.nspares; i++) { - printf("%20s [%d][%d]: %s\n", + printf("%20s: %s\n", device_config.spares[i].devname, - device_config.spares[i].spareRow, - device_config.spares[i].spareCol, device_status(device_config.spares[i].status)); } } else { @@ -329,42 +394,55 @@ rf_get_device_status(fd) } static void -rf_fail_disk(fd, component_to_fail, do_recon) +get_component_number(fd, component_name, component_number, num_columns) int fd; - char *component_to_fail; - int do_recon; + char *component_name; + int *component_number; + int *num_columns; { - struct rf_recon_req recon_request; RF_DeviceConfig_t device_config; void *cfg_ptr; int i; int found; - int component_num; - component_num = -1; + *component_number = -1; /* Assuming a full path spec... */ cfg_ptr = &device_config; do_ioctl(fd, RAIDFRAME_GET_INFO, &cfg_ptr, "RAIDFRAME_GET_INFO"); + + *num_columns = device_config.cols; + found = 0; for(i=0; i < device_config.ndevs; i++) { - if (strncmp(component_to_fail, - device_config.devs[i].devname, + if (strncmp(component_name, device_config.devs[i].devname, PATH_MAX)==0) { found = 1; - component_num = i; + *component_number = i; } } if (!found) { - fprintf(stderr,"%s: %s is not a component %s", - __progname, component_to_fail, - "of this device\n"); + fprintf(stderr,"%s: %s is not a component %s", __progname, + component_name, "of this device\n"); exit(1); } +} - recon_request.row = component_num / device_config.cols; - recon_request.col = component_num % device_config.cols; +static void +rf_fail_disk(fd, component_to_fail, do_recon) + int fd; + char *component_to_fail; + int do_recon; +{ + struct rf_recon_req recon_request; + int component_num; + int num_cols; + + get_component_number(fd, component_to_fail, &component_num, &num_cols); + + recon_request.row = component_num / num_cols; + recon_request.col = component_num % num_cols; if (do_recon) { recon_request.flags = 
RF_FDFLAGS_RECON; } else { @@ -372,20 +450,170 @@ rf_fail_disk(fd, component_to_fail, do_recon) } do_ioctl(fd, RAIDFRAME_FAIL_DISK, &recon_request, "RAIDFRAME_FAIL_DISK"); +} + +static void +get_component_label(fd, component) + int fd; + char *component; +{ + RF_ComponentLabel_t component_label; + void *label_ptr; + int component_num; + int num_cols; + + get_component_number(fd, component, &component_num, &num_cols); + + memset( &component_label, 0, sizeof(RF_ComponentLabel_t)); + component_label.row = component_num / num_cols; + component_label.column = component_num % num_cols; + + label_ptr = &component_label; + do_ioctl( fd, RAIDFRAME_GET_COMPONENT_LABEL, &label_ptr, + "RAIDFRAME_GET_COMPONENT_LABEL"); + + printf("Component label for %s:\n",component); + printf("Version: %d\n",component_label.version); + printf("Serial Number: %d\n",component_label.serial_number); + printf("Mod counter: %d\n",component_label.mod_counter); + printf("Row: %d\n", component_label.row); + printf("Column: %d\n", component_label.column); + printf("Num Rows: %d\n", component_label.num_rows); + printf("Num Columns: %d\n", component_label.num_columns); + printf("Clean: %d\n", component_label.clean); + printf("Status: %s\n", device_status(component_label.status)); +} + +static void +set_component_label(fd, component) + int fd; + char *component; +{ + RF_ComponentLabel_t component_label; + int component_num; + int num_cols; + + get_component_number(fd, component, &component_num, &num_cols); + + /* XXX This is currently here for testing, and future expandability */ + + component_label.version = 1; + component_label.serial_number = 123456; + component_label.mod_counter = 0; + component_label.row = component_num / num_cols; + component_label.column = component_num % num_cols; + component_label.num_rows = 0; + component_label.num_columns = 5; + component_label.clean = 0; + component_label.status = 1; + + do_ioctl( fd, RAIDFRAME_SET_COMPONENT_LABEL, &component_label, + 
"RAIDFRAME_SET_COMPONENT_LABEL"); +} + + +static void +init_component_labels(fd, serial_number) + int fd; + int serial_number; +{ + RF_ComponentLabel_t component_label; + + component_label.version = 0; + component_label.serial_number = serial_number; + component_label.mod_counter = 0; + component_label.row = 0; + component_label.column = 0; + component_label.num_rows = 0; + component_label.num_columns = 0; + component_label.clean = 0; + component_label.status = 0; + + do_ioctl( fd, RAIDFRAME_INIT_LABELS, &component_label, + "RAIDFRAME_SET_COMPONENT_LABEL"); +} + +static void +add_hot_spare(fd, component) + int fd; + char *component; +{ + RF_SingleComponent_t hot_spare; + + hot_spare.row = 0; + hot_spare.column = 0; + strncpy(hot_spare.component_name, component, + sizeof(hot_spare.component_name)); + + do_ioctl( fd, RAIDFRAME_ADD_HOT_SPARE, &hot_spare, + "RAIDFRAME_ADD_HOT_SPARE"); +} + +static void +remove_hot_spare(fd, component) + int fd; + char *component; +{ + RF_SingleComponent_t hot_spare; + int component_num; + int num_cols; + + get_component_number(fd, component, &component_num, &num_cols); + + hot_spare.row = component_num / num_cols; + hot_spare.column = component_num % num_cols; + strncpy(hot_spare.component_name, component, + sizeof(hot_spare.component_name)); + + do_ioctl( fd, RAIDFRAME_REMOVE_HOT_SPARE, &hot_spare, + "RAIDFRAME_REMOVE_HOT_SPARE"); } static void +rebuild_in_place( fd, component ) + int fd; + char *component; +{ + RF_SingleComponent_t comp; + int component_num; + int num_cols; + + get_component_number(fd, component, &component_num, &num_cols); + + comp.row = 0; + comp.column = component_num; + strncpy(comp.component_name, component, sizeof(comp.component_name)); + + do_ioctl( fd, RAIDFRAME_REBUILD_IN_PLACE, &comp, + "RAIDFRAME_REBUILD_IN_PLACE"); +} + + +static void usage() { - fprintf(stderr, "usage: %s -c config_file dev\n", __progname); - fprintf(stderr, " %s -C dev\n", __progname); + fprintf(stderr, "usage: %s -a component dev\n", 
__progname); + fprintf(stderr, " %s -B dev\n", __progname); + fprintf(stderr, " %s -c config_file dev\n", __progname); + fprintf(stderr, " %s -C config_file dev\n", __progname); fprintf(stderr, " %s -f component dev\n", __progname); fprintf(stderr, " %s -F component dev\n", __progname); - fprintf(stderr, " %s -r dev\n", __progname); - fprintf(stderr, " %s -R dev\n", __progname); + fprintf(stderr, " %s -g component dev\n", __progname); + fprintf(stderr, " %s -i dev\n", __progname); + fprintf(stderr, " %s -I serial_number dev\n", __progname); + fprintf(stderr, " %s -r component dev\n", __progname); + fprintf(stderr, " %s -R component dev\n", __progname); fprintf(stderr, " %s -s dev\n", __progname); + fprintf(stderr, " %s -S dev\n", __progname); fprintf(stderr, " %s -u dev\n", __progname); +#if 0 + fprintf(stderr, "usage: %s %s\n", __progname, + "-a | -f | -F | -g | -r | -R component dev"); + fprintf(stderr, " %s -B | -i | -s | -S -u dev\n", __progname); + fprintf(stderr, " %s -c | -C config_file dev\n", __progname); + fprintf(stderr, " %s -I serial_number dev\n", __progname); +#endif exit(1); /* NOTREACHED */ } diff --git a/sbin/raidctl/rf_configure.c b/sbin/raidctl/rf_configure.c index ad79a9aac65..d741fab2292 100644 --- a/sbin/raidctl/rf_configure.c +++ b/sbin/raidctl/rf_configure.c @@ -1,6 +1,5 @@ -/* $OpenBSD: rf_configure.c,v 1.2 1999/02/16 21:51:39 niklas Exp $ */ - -/* $NetBSD: rf_configure.c,v 1.5 1999/02/04 14:50:31 oster Exp $ */ +/* $OpenBSD: rf_configure.c,v 1.3 1999/07/30 14:45:32 peter Exp $ */ +/* $NetBSD: rf_configure.c,v 1.6 1999/03/26 00:45:01 oster Exp $ */ /* * Copyright (c) 1995 Carnegie-Mellon University. * All rights reserved. @@ -83,10 +82,6 @@ that file here in userland.. 
GO -#ifndef SIMULATE -static unsigned int dev_name2num(char *s); -static unsigned int osf_dev_name2num(char *s); -#endif static int rf_search_file_for_start_of(char *string, char *buf, int len, FILE *fp); static int rf_get_next_nonblank_line(char *buf, int len, FILE *fp, @@ -190,15 +185,6 @@ int rf_MakeConfig(configname, cfgPtr) RF_ERRORMSG2("Config file error: unable to get device file for disk at row %d col %d\n",r,c); retcode = -1; goto out; } -#ifndef SIMULATE - val = dev_name2num(&cfgPtr->devnames[r][c][0]); - - if (val < 0) { - RF_ERRORMSG3("Config file error: can't get dev num (dev file '%s') for disk at row %d c %d\n", - &cfgPtr->devnames[r][c][0],r,c); - retcode = -1; goto out; - } else cfgPtr->devs[r][c] = val; -#endif /* !SIMULATE */ } } @@ -210,14 +196,6 @@ int rf_MakeConfig(configname, cfgPtr) RF_ERRORMSG1("Config file error: unable to get device file for spare disk %d\n",c); retcode = -1; goto out; } -#ifndef SIMULATE - val = dev_name2num(&cfgPtr->spare_names[c][0]); - if (val < 0) { - RF_ERRORMSG2("Config file error: can't get dev num (dev file '%s') for spare disk %d\n", - &cfgPtr->spare_names[c][0],c); - retcode = -1; goto out; - } else cfgPtr->spare_devs[c] = val; -#endif /* !SIMULATE */ } /* scan the file for the block related to layout */ @@ -368,47 +346,6 @@ int rf_MakeLayoutSpecificDeclustered(configfp, cfgPtr, arg) * utilities * ***************************************************************************/ -#ifndef SIMULATE -/* convert a device file name to a device number */ -static unsigned int dev_name2num(s) - char *s; -{ - struct stat buf; - - if (stat(s, &buf) < 0) return(osf_dev_name2num(s)); - else return(buf.st_rdev); -} - -/* converts an osf/1 style device name to a device number. We use this - * only if the stat of the device file fails. 
- */ -static unsigned int osf_dev_name2num(s) - char *s; -{ - int num; - char part_ch, lun_ch; - unsigned int bus, target, lun, part, dev_major; - - dev_major = RF_SCSI_DISK_MAJOR; - if (sscanf(s,"/dev/rrz%d%c", &num, &part_ch) == 2) { - bus = num>>3; - target = num & 0x7; - part = part_ch - 'a'; - lun = 0; - } else if (sscanf(s,"/dev/rrz%c%d%c", &lun_ch, &num, &part_ch) == 3) { - bus = num>>3; - target = num & 0x7; - part = part_ch - 'a'; - lun = lun_ch - 'a' + 1; - } else { - RF_ERRORMSG1("Unable to parse disk dev file name %s\n",s); - return(-1); - } - - return( (dev_major<<20) | (bus<<14) | (target<<10) | (lun<<6) | part ); -} -#endif - /* searches a file for a line that says "START string", where string is * specified as a parameter */ diff --git a/share/man/man4/raid.4 b/share/man/man4/raid.4 index 98a977f5757..dc53004db6d 100644 --- a/share/man/man4/raid.4 +++ b/share/man/man4/raid.4 @@ -1,4 +1,5 @@ -.\" $OpenBSD: raid.4,v 1.6 1999/07/09 13:35:48 aaron Exp $ +.\" $OpenBSD: raid.4,v 1.7 1999/07/30 14:45:31 peter Exp $ +.\" $NetBSD: raid.4,v 1.5 1999/03/16 01:19:17 garbled Exp $ .\" .\" .\" Copyright (c) 1998 The NetBSD Foundation, Inc. @@ -108,16 +109,41 @@ reconstructed from the data and parity present on the other components. This results in much slower data accesses, but does mean that a failure need not bring the system to a complete halt. .Pp +The RAID driver supports and enforces the use of +.Sq component labels . +A +.Sq component label +contains important information about the component, including a +user-specified serial number, the row and column of that component in the RAID +set, and whether the data (and parity) on the component is +.Sq clean . +If the driver determines that the labels are very inconsistent with +respect to each other (e.g. two or more serial numbers do not match) +or that the component label is not consistent with it's assigned place +in the set (e.g. 
the component label claims the component should be +the 3rd one a 6-disk set, but the RAID set has it as the 3rd component +in a 5-disk set) then the device will fail to configure. If the +driver determines that exactly one component label seems to be +incorrect, and the RAID set is being configured as a set that supports +a single failure, then the RAID set will be allowed to configure, but +the incorrectly labeled component will be marked as +.Sq failed , +and the RAID set will begin operation in degraded mode. +If all of the components are consistent among themselves, the RAID set +will configure normally. +.Pp The driver supports .Sq hot spares , disks which are on-line, but are not actively used in an existing filesystem. Should a disk fail, the -driver is capable of reconstructing the failed disk onto a hot spare. +driver is capable of reconstructing the failed disk onto a hot spare +or back onto a replacment drive. If the components are hot swapable, the failed disk can then be removed, a new disk put in it's place, and a copyback operation performed. The copyback operation, as its name indicates, will copy the reconstructed data from the hot spare to the previously failed -(and now replaced) disk. +(and now replaced) disk. Hot spares can also be hot-added using +.Xr raidctl 8 . .Pp If a component cannot be detected when the RAID device is configured, that component will be simply marked as 'failed'. @@ -130,7 +156,7 @@ is For any of the RAID flavours which have parity data, .Xr raidctl 8 must be used with the -.Fl r +.Fl i option to re-write the data when either a) a new RAID device is brought up for the first time or b) after an un-clean shutdown of a RAID device. 
By performing this on-demand recomputation of all parity diff --git a/sys/dev/raidframe/rf_configure.h b/sys/dev/raidframe/rf_configure.h index 81048bc43ba..c73b1e37760 100644 --- a/sys/dev/raidframe/rf_configure.h +++ b/sys/dev/raidframe/rf_configure.h @@ -1,5 +1,5 @@ -/* $OpenBSD: rf_configure.h,v 1.2 1999/02/16 00:02:26 niklas Exp $ */ -/* $NetBSD: rf_configure.h,v 1.3 1999/02/05 00:06:06 oster Exp $ */ +/* $OpenBSD: rf_configure.h,v 1.3 1999/07/30 14:45:32 peter Exp $ */ +/* $NetBSD: rf_configure.h,v 1.4 1999/03/02 03:18:49 oster Exp $ */ /* * Copyright (c) 1995 Carnegie-Mellon University. * All rights reserved. @@ -76,6 +76,13 @@ struct RF_Config_s { * layout-specific info */ void *layoutSpecific; /* a pointer to a layout-specific structure to * be copied in */ + int force; /* if !0, ignore many fatal + configuration conditions */ + /* + "force" is used to override cases where the component labels would + indicate that configuration should not proceed without user + intervention + */ }; #ifndef _KERNEL int rf_MakeConfig(char *configname, RF_Config_t * cfgPtr); diff --git a/sys/dev/raidframe/rf_copyback.c b/sys/dev/raidframe/rf_copyback.c index ba06d882559..82984d89dc0 100644 --- a/sys/dev/raidframe/rf_copyback.c +++ b/sys/dev/raidframe/rf_copyback.c @@ -1,5 +1,5 @@ -/* $OpenBSD: rf_copyback.c,v 1.2 1999/02/16 00:02:27 niklas Exp $ */ -/* $NetBSD: rf_copyback.c,v 1.3 1999/02/05 00:06:06 oster Exp $ */ +/* $OpenBSD: rf_copyback.c,v 1.3 1999/07/30 14:45:32 peter Exp $ */ +/* $NetBSD: rf_copyback.c,v 1.7 1999/03/02 03:18:49 oster Exp $ */ /* * Copyright (c) 1995 Carnegie-Mellon University. * All rights reserved. 
@@ -85,13 +85,17 @@ rf_ConfigureCopyback(listp) #include <sys/vnode.h> #endif +/* XXX these should be in a .h file somewhere */ int raidlookup __P((char *, struct proc *, struct vnode **)); +int raidwrite_component_label __P((dev_t, struct vnode *, RF_ComponentLabel_t *)); +int raidread_component_label __P((dev_t, struct vnode *, RF_ComponentLabel_t *)); /* do a complete copyback */ void rf_CopybackReconstructedData(raidPtr) RF_Raid_t *raidPtr; { + RF_ComponentLabel_t c_label; int done, retcode; RF_CopybackDesc_t *desc; RF_RowCol_t frow, fcol; @@ -131,8 +135,10 @@ rf_CopybackReconstructedData(raidPtr) if (raidPtr->raid_cinfo[frow][fcol].ci_vp != NULL) { printf("Closed the open device: %s\n", raidPtr->Disks[frow][fcol].devname); + VOP_UNLOCK(raidPtr->raid_cinfo[frow][fcol].ci_vp, 0, proc); (void) vn_close(raidPtr->raid_cinfo[frow][fcol].ci_vp, FREAD | FWRITE, proc->p_ucred, proc); + raidPtr->raid_cinfo[frow][fcol].ci_vp = NULL; } printf("About to (re-)open the device: %s\n", raidPtr->Disks[frow][fcol].devname); @@ -228,6 +234,26 @@ rf_CopybackReconstructedData(raidPtr) printf("COPYBACK: Beginning\n"); RF_GETTIME(desc->starttime); rf_ContinueCopyback(desc); + + /* Data has been restored. Fix up the component label. */ + /* Don't actually need the read here.. 
*/ + raidread_component_label( raidPtr->raid_cinfo[frow][fcol].ci_dev, + raidPtr->raid_cinfo[frow][fcol].ci_vp, + &c_label); + + c_label.version = RF_COMPONENT_LABEL_VERSION; + c_label.mod_counter = raidPtr->mod_counter; + c_label.serial_number = raidPtr->serial_number; + c_label.row = frow; + c_label.column = fcol; + c_label.num_rows = raidPtr->numRow; + c_label.num_columns = raidPtr->numCol; + c_label.clean = RF_RAID_DIRTY; + c_label.status = rf_ds_optimal; + + raidwrite_component_label( raidPtr->raid_cinfo[frow][fcol].ci_dev, + raidPtr->raid_cinfo[frow][fcol].ci_vp, + &c_label); } diff --git a/sys/dev/raidframe/rf_cvscan.c b/sys/dev/raidframe/rf_cvscan.c index 4076883cfa0..61876309b4c 100644 --- a/sys/dev/raidframe/rf_cvscan.c +++ b/sys/dev/raidframe/rf_cvscan.c @@ -1,4 +1,4 @@ -/* $OpenBSD: rf_cvscan.c,v 1.2 1999/02/16 00:02:28 niklas Exp $ */ +/* $OpenBSD: rf_cvscan.c,v 1.3 1999/07/30 14:45:32 peter Exp $ */ /* $NetBSD: rf_cvscan.c,v 1.4 1999/02/05 00:06:07 oster Exp $ */ /* * Copyright (c) 1995 Carnegie-Mellon University. @@ -354,11 +354,11 @@ rf_CvscanCreate(RF_SectorCount_t sectPerDisk, #if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) /* PrintCvscanQueue is not used, so we ignore it... 
*/ #else -+ static void -+ PrintCvscanQueue(RF_CvscanHeader_t * hdr) - { - RF_DiskQueueData_t *tmp; - +static void +PrintCvscanQueue(RF_CvscanHeader_t * hdr) +{ + RF_DiskQueueData_t *tmp; + printf("CVSCAN(%d,%d) at %d going %s\n", (int) hdr->range_for_avg, (int) hdr->change_penalty, diff --git a/sys/dev/raidframe/rf_dagfuncs.c b/sys/dev/raidframe/rf_dagfuncs.c index a4ea944ba05..af2160f683b 100644 --- a/sys/dev/raidframe/rf_dagfuncs.c +++ b/sys/dev/raidframe/rf_dagfuncs.c @@ -1,5 +1,5 @@ -/* $OpenBSD: rf_dagfuncs.c,v 1.2 1999/02/16 00:02:32 niklas Exp $ */ -/* $NetBSD: rf_dagfuncs.c,v 1.3 1999/02/05 00:06:08 oster Exp $ */ +/* $OpenBSD: rf_dagfuncs.c,v 1.3 1999/07/30 14:45:32 peter Exp $ */ +/* $NetBSD: rf_dagfuncs.c,v 1.4 1999/03/14 21:53:31 oster Exp $ */ /* * Copyright (c) 1995 Carnegie-Mellon University. * All rights reserved. @@ -507,6 +507,9 @@ rf_GenericWakeupFunc(node, status) node->status = rf_undone; break; default: + printf("rf_GenericWakeupFunc:"); + printf("node->status is %d,", node->status); + printf("status is %d \n", status); RF_PANIC(); break; } diff --git a/sys/dev/raidframe/rf_diskqueue.c b/sys/dev/raidframe/rf_diskqueue.c index 14bccb06ca2..d050089d166 100644 --- a/sys/dev/raidframe/rf_diskqueue.c +++ b/sys/dev/raidframe/rf_diskqueue.c @@ -1,5 +1,5 @@ -/* $OpenBSD: rf_diskqueue.c,v 1.2 1999/02/16 00:02:39 niklas Exp $ */ -/* $NetBSD: rf_diskqueue.c,v 1.6 1999/02/05 00:06:09 oster Exp $ */ +/* $OpenBSD: rf_diskqueue.c,v 1.3 1999/07/30 14:45:32 peter Exp $ */ +/* $NetBSD: rf_diskqueue.c,v 1.7 1999/06/04 01:51:00 oster Exp $ */ /* * Copyright (c) 1995 Carnegie-Mellon University. * All rights reserved. 
@@ -182,7 +182,12 @@ clean_dqd(dqd) free(dqd->bp, M_RAIDFRAME); } /* configures a single disk queue */ -static int +int config_disk_queue(RF_Raid_t *, RF_DiskQueue_t *, RF_RowCol_t, + RF_RowCol_t, RF_DiskQueueSW_t *, + RF_SectorCount_t, dev_t, int, + RF_ShutdownList_t **, + RF_AllocListElem_t *); +int config_disk_queue( RF_Raid_t * raidPtr, RF_DiskQueue_t * diskqueue, @@ -286,7 +291,10 @@ rf_ConfigureDiskQueues( } raidPtr->Queues = diskQueues; for (r = 0; r < raidPtr->numRow; r++) { - RF_CallocAndAdd(diskQueues[r], raidPtr->numCol + ((r == 0) ? raidPtr->numSpare : 0), sizeof(RF_DiskQueue_t), (RF_DiskQueue_t *), raidPtr->cleanupList); + RF_CallocAndAdd(diskQueues[r], raidPtr->numCol + + ((r == 0) ? RF_MAXSPARE : 0), + sizeof(RF_DiskQueue_t), (RF_DiskQueue_t *), + raidPtr->cleanupList); if (diskQueues[r] == NULL) return (ENOMEM); for (c = 0; c < raidPtr->numCol; c++) { diff --git a/sys/dev/raidframe/rf_disks.c b/sys/dev/raidframe/rf_disks.c index 20daa65340e..003b29e8ddf 100644 --- a/sys/dev/raidframe/rf_disks.c +++ b/sys/dev/raidframe/rf_disks.c @@ -1,5 +1,40 @@ -/* $OpenBSD: rf_disks.c,v 1.2 1999/02/16 00:02:40 niklas Exp $ */ -/* $NetBSD: rf_disks.c,v 1.5 1999/02/05 00:06:09 oster Exp $ */ +/* $OpenBSD: rf_disks.c,v 1.3 1999/07/30 14:45:32 peter Exp $ */ +/* $NetBSD: rf_disks.c,v 1.10 1999/06/04 02:02:39 oster Exp $ */ +/*- + * Copyright (c) 1999 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Greg Oster + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ /* * Copyright (c) 1995 Carnegie-Mellon University. * All rights reserved. 
@@ -53,62 +88,55 @@ #include <sys/vnode.h> #endif +/* XXX these should be in a header file somewhere */ int raidlookup __P((char *, struct proc * p, struct vnode **)); - +int raidwrite_component_label(dev_t, struct vnode *, RF_ComponentLabel_t *); +int raidread_component_label(dev_t, struct vnode *, RF_ComponentLabel_t *); +void rf_UnconfigureVnodes( RF_Raid_t * ); +int rf_CheckLabels( RF_Raid_t *, RF_Config_t *); #define DPRINTF6(a,b,c,d,e,f) if (rf_diskDebug) printf(a,b,c,d,e,f) #define DPRINTF7(a,b,c,d,e,f,g) if (rf_diskDebug) printf(a,b,c,d,e,f,g) -/**************************************************************************************** +/**************************************************************************** * * initialize the disks comprising the array * - * We want the spare disks to have regular row,col numbers so that we can easily - * substitue a spare for a failed disk. But, the driver code assumes throughout - * that the array contains numRow by numCol _non-spare_ disks, so it's not clear - * how to fit in the spares. This is an unfortunate holdover from raidSim. The - * quick and dirty fix is to make row zero bigger than the rest, and put all the - * spares in it. This probably needs to get changed eventually. + * We want the spare disks to have regular row,col numbers so that we can + * easily substitue a spare for a failed disk. But, the driver code assumes + * throughout that the array contains numRow by numCol _non-spare_ disks, so + * it's not clear how to fit in the spares. This is an unfortunate holdover + * from raidSim. The quick and dirty fix is to make row zero bigger than the + * rest, and put all the spares in it. This probably needs to get changed + * eventually. 
* - ***************************************************************************************/ + ****************************************************************************/ int -rf_ConfigureDisks( - RF_ShutdownList_t ** listp, - RF_Raid_t * raidPtr, - RF_Config_t * cfgPtr) +rf_ConfigureDisks( listp, raidPtr, cfgPtr ) + RF_ShutdownList_t **listp; + RF_Raid_t *raidPtr; + RF_Config_t *cfgPtr; { RF_RaidDisk_t **disks; RF_SectorCount_t min_numblks = (RF_SectorCount_t) 0x7FFFFFFFFFFFLL; RF_RowCol_t r, c; int bs, ret; unsigned i, count, foundone = 0, numFailuresThisRow; - RF_DiskOp_t *rdcap_op = NULL, *tur_op = NULL; int num_rows_done, num_cols_done; - - struct proc *proc = 0; -#if !defined(__NetBSD__) && !defined(__OpenBSD__) - ret = rf_SCSI_AllocReadCapacity(&rdcap_op); - if (ret) - goto fail; - ret = rf_SCSI_AllocTUR(&tur_op); - if (ret) - goto fail; -#endif /* !__NetBSD__ && !__OpenBSD__ */ + int force; num_rows_done = 0; num_cols_done = 0; - - - RF_CallocAndAdd(disks, raidPtr->numRow, sizeof(RF_RaidDisk_t *), (RF_RaidDisk_t **), raidPtr->cleanupList); + force = cfgPtr->force; + + RF_CallocAndAdd(disks, raidPtr->numRow, sizeof(RF_RaidDisk_t *), + (RF_RaidDisk_t **), raidPtr->cleanupList); if (disks == NULL) { ret = ENOMEM; goto fail; } raidPtr->Disks = disks; - - proc = raidPtr->proc; /* Blah XXX */ - /* get space for the device-specific stuff... */ RF_CallocAndAdd(raidPtr->raid_cinfo, raidPtr->numRow, sizeof(struct raidcinfo *), (struct raidcinfo **), @@ -119,7 +147,12 @@ rf_ConfigureDisks( } for (r = 0; r < raidPtr->numRow; r++) { numFailuresThisRow = 0; - RF_CallocAndAdd(disks[r], raidPtr->numCol + ((r == 0) ? raidPtr->numSpare : 0), sizeof(RF_RaidDisk_t), (RF_RaidDisk_t *), raidPtr->cleanupList); + /* We allocate RF_MAXSPARE on the first row so that we + have room to do hot-swapping of spares */ + RF_CallocAndAdd(disks[r], raidPtr->numCol + + ((r == 0) ? 
RF_MAXSPARE : 0), + sizeof(RF_RaidDisk_t), (RF_RaidDisk_t *), + raidPtr->cleanupList); if (disks[r] == NULL) { ret = ENOMEM; goto fail; @@ -134,11 +167,19 @@ rf_ConfigureDisks( goto fail; } for (c = 0; c < raidPtr->numCol; c++) { - ret = rf_ConfigureDisk(raidPtr, &cfgPtr->devnames[r][c][0], - &disks[r][c], rdcap_op, tur_op, - cfgPtr->devs[r][c], r, c); + ret = rf_ConfigureDisk(raidPtr, + &cfgPtr->devnames[r][c][0], + &disks[r][c], r, c); if (ret) goto fail; + + if (disks[r][c].status == rf_ds_optimal) { + raidread_component_label( + raidPtr->raid_cinfo[r][c].ci_dev, + raidPtr->raid_cinfo[r][c].ci_vp, + &raidPtr->raid_cinfo[r][c].ci_label); + } + if (disks[r][c].status != rf_ds_optimal) { numFailuresThisRow++; } else { @@ -148,23 +189,18 @@ rf_ConfigureDisks( r, c, disks[r][c].devname, (long int) disks[r][c].numBlocks, disks[r][c].blockSize, - (long int) disks[r][c].numBlocks * disks[r][c].blockSize / 1024 / 1024); + (long int) disks[r][c].numBlocks * + disks[r][c].blockSize / 1024 / 1024); } num_cols_done++; } /* XXX fix for n-fault tolerant */ + /* XXX this should probably check to see how many failures + we can handle for this configuration! 
*/ if (numFailuresThisRow > 0) raidPtr->status[r] = rf_rs_degraded; num_rows_done++; } -#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) - /* we do nothing */ -#else - rf_SCSI_FreeDiskOp(rdcap_op, 1); - rdcap_op = NULL; - rf_SCSI_FreeDiskOp(tur_op, 0); - tur_op = NULL; -#endif /* all disks must be the same size & have the same block size, bs must * be a power of 2 */ bs = 0; @@ -189,6 +225,18 @@ rf_ConfigureDisks( ret = EINVAL; goto fail; } + + if (rf_CheckLabels( raidPtr, cfgPtr )) { + printf("raid%d: There were fatal errors\n", raidPtr->raidid); + if (force != 0) { + printf("raid%d: Fatal errors being ignored.\n", + raidPtr->raidid); + } else { + ret = EINVAL; + goto fail; + } + } + for (r = 0; r < raidPtr->numRow; r++) { for (c = 0; c < raidPtr->numCol; c++) { if (disks[r][c].status == rf_ds_optimal) { @@ -213,99 +261,53 @@ rf_ConfigureDisks( return (0); fail: + rf_UnconfigureVnodes( raidPtr ); -#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) - - for (r = 0; r < raidPtr->numRow; r++) { - for (c = 0; c < raidPtr->numCol; c++) { - /* Cleanup.. */ -#ifdef DEBUG - printf("Cleaning up row: %d col: %d\n", r, c); -#endif - if (raidPtr->raid_cinfo[r][c].ci_vp) { - (void) vn_close(raidPtr->raid_cinfo[r][c].ci_vp, - FREAD | FWRITE, proc->p_ucred, proc); - } - } - } - /* Space allocated for raid_vpp will get cleaned up at some other - * point */ - /* XXX Need more #ifdefs in the above... */ - -#else - - if (rdcap_op) - rf_SCSI_FreeDiskOp(rdcap_op, 1); - if (tur_op) - rf_SCSI_FreeDiskOp(tur_op, 0); - -#endif return (ret); } -/**************************************************************************************** +/**************************************************************************** * set up the data structures describing the spare disks in the array * recall from the above comment that the spare disk descriptors are stored * in row zero, which is specially expanded to hold them. 
- ***************************************************************************************/ + ****************************************************************************/ int -rf_ConfigureSpareDisks( - RF_ShutdownList_t ** listp, - RF_Raid_t * raidPtr, - RF_Config_t * cfgPtr) +rf_ConfigureSpareDisks( listp, raidPtr, cfgPtr ) + RF_ShutdownList_t ** listp; + RF_Raid_t * raidPtr; + RF_Config_t * cfgPtr; { - char buf[256]; - int r, c, i, ret; - RF_DiskOp_t *rdcap_op = NULL, *tur_op = NULL; - unsigned bs; + int i, ret; + unsigned int bs; RF_RaidDisk_t *disks; int num_spares_done; - struct proc *proc; - -#if !defined(__NetBSD__) && !defined(__OpenBSD__) - ret = rf_SCSI_AllocReadCapacity(&rdcap_op); - if (ret) - goto fail; - ret = rf_SCSI_AllocTUR(&tur_op); - if (ret) - goto fail; -#endif /* !__NetBSD__ && !__OpenBSD__ */ - num_spares_done = 0; - proc = raidPtr->proc; /* The space for the spares should have already been allocated by * ConfigureDisks() */ disks = &raidPtr->Disks[0][raidPtr->numCol]; for (i = 0; i < raidPtr->numSpare; i++) { ret = rf_ConfigureDisk(raidPtr, &cfgPtr->spare_names[i][0], - &disks[i], rdcap_op, tur_op, - cfgPtr->spare_devs[i], 0, raidPtr->numCol + i); + &disks[i], 0, raidPtr->numCol + i); if (ret) goto fail; if (disks[i].status != rf_ds_optimal) { - RF_ERRORMSG1("Warning: spare disk %s failed TUR\n", buf); + RF_ERRORMSG1("Warning: spare disk %s failed TUR\n", + &cfgPtr->spare_names[i][0]); } else { disks[i].status = rf_ds_spare; /* change status to * spare */ DPRINTF6("Spare Disk %d: dev %s numBlocks %ld blockSize %d (%ld MB)\n", i, disks[i].devname, (long int) disks[i].numBlocks, disks[i].blockSize, - (long int) disks[i].numBlocks * disks[i].blockSize / 1024 / 1024); + (long int) disks[i].numBlocks * + disks[i].blockSize / 1024 / 1024); } num_spares_done++; } -#if (defined(__NetBSD__) || defined(__OpenBSD__)) && (_KERNEL) - -#else - rf_SCSI_FreeDiskOp(rdcap_op, 1); - rdcap_op = NULL; - rf_SCSI_FreeDiskOp(tur_op, 0); - tur_op = NULL; -#endif /* 
check sizes and block sizes on spare disks */ bs = 1 << raidPtr->logBytesPerSector; @@ -317,7 +319,8 @@ rf_ConfigureSpareDisks( } if (disks[i].numBlocks < raidPtr->sectorsPerDisk) { RF_ERRORMSG3("Spare disk %s (%d blocks) is too small to serve as a spare (need %ld blocks)\n", - disks[i].devname, disks[i].blockSize, (long int) raidPtr->sectorsPerDisk); + disks[i].devname, disks[i].blockSize, + (long int) raidPtr->sectorsPerDisk); ret = EINVAL; goto fail; } else @@ -331,43 +334,18 @@ rf_ConfigureSpareDisks( return (0); fail: -#if (defined(__NetBSD__) || defined(__OpenBSD__)) && defined(_KERNEL) /* Release the hold on the main components. We've failed to allocate - * a spare, and since we're failing, we need to free things.. */ - - for (r = 0; r < raidPtr->numRow; r++) { - for (c = 0; c < raidPtr->numCol; c++) { - /* Cleanup.. */ -#ifdef DEBUG - printf("Cleaning up row: %d col: %d\n", r, c); -#endif - if (raidPtr->raid_cinfo[r][c].ci_vp) { - (void) vn_close(raidPtr->raid_cinfo[r][c].ci_vp, - FREAD | FWRITE, proc->p_ucred, proc); - } - } - } + * a spare, and since we're failing, we need to free things.. - for (i = 0; i < raidPtr->numSpare; i++) { - /* Cleanup.. */ -#ifdef DEBUG - printf("Cleaning up spare: %d\n", i); -#endif - if (raidPtr->raid_cinfo[0][raidPtr->numCol + i].ci_vp) { - (void) vn_close(raidPtr->raid_cinfo[0][raidPtr->numCol + i].ci_vp, - FREAD | FWRITE, proc->p_ucred, proc); - } - } + XXX failing to allocate a spare is *not* that big of a deal... + We *can* survive without it, if need be, esp. if we get hot + adding working. + If we don't fail out here, then we need a way to remove this spare... + that should be easier to do here than if we are "live"... 
+ */ -#else - - if (rdcap_op) - rf_SCSI_FreeDiskOp(rdcap_op, 1); - if (tur_op) - rf_SCSI_FreeDiskOp(tur_op, 0); - -#endif + rf_UnconfigureVnodes( raidPtr ); return (ret); } @@ -376,13 +354,10 @@ fail: /* configure a single disk in the array */ int -rf_ConfigureDisk(raidPtr, buf, diskPtr, rdcap_op, tur_op, dev, row, col) - RF_Raid_t *raidPtr; /* We need this down here too!! GO */ +rf_ConfigureDisk(raidPtr, buf, diskPtr, row, col) + RF_Raid_t *raidPtr; char *buf; RF_RaidDisk_t *diskPtr; - RF_DiskOp_t *rdcap_op; - RF_DiskOp_t *tur_op; - dev_t dev; /* device number used only in kernel */ RF_RowCol_t row; RF_RowCol_t col; { @@ -403,42 +378,11 @@ rf_ConfigureDisk(raidPtr, buf, diskPtr, rdcap_op, tur_op, dev, row, col) } (void) strcpy(diskPtr->devname, p); -#if !defined(__NetBSD__) && !defined(__OpenBSD__) - /* get bus, target, lun */ - retcode = rf_extract_ids(p, &busid, &targid, &lun); - if (retcode) - return (retcode); - - /* required in kernel, nop at user level */ - retcode = rf_SCSI_OpenUnit(dev); - if (retcode) - return (retcode); - - diskPtr->dev = dev; - if (rf_SCSI_DoTUR(tur_op, (u_char) busid, (u_char) targid, (u_char) lun, dev)) { - RF_ERRORMSG1("Disk %s failed TUR. Marked as dead.\n", diskPtr->devname); - diskPtr->status = rf_ds_failed; - } else { - diskPtr->status = rf_ds_optimal; - retcode = rf_SCSI_DoReadCapacity(raidPtr, rdcap_op, busid, targid, lun, dev, - &diskPtr->numBlocks, &diskPtr->blockSize, diskPtr->devname); - if (retcode) - return (retcode); - - /* we allow the user to specify that only a fraction of the - * disks should be used this is just for debug: it speeds up - * the parity scan */ - diskPtr->numBlocks = diskPtr->numBlocks * rf_sizePercentage / 100; - } -#endif - proc = raidPtr->proc; /* XXX Yes, this is not nice.. */ /* Let's start by claiming the component is fine and well... */ - /* XXX not the case if the disk is toast.. 
*/ diskPtr->status = rf_ds_optimal; - raidPtr->raid_cinfo[row][col].ci_vp = NULL; raidPtr->raid_cinfo[row][col].ci_dev = NULL; @@ -446,7 +390,7 @@ rf_ConfigureDisk(raidPtr, buf, diskPtr, rdcap_op, tur_op, dev, row, col) if (error) { printf("raidlookup on device: %s failed!\n", diskPtr->devname); if (error == ENXIO) { - /* XXX the component isn't there... must be dead :-( */ + /* the component isn't there... must be dead :-( */ diskPtr->status = rf_ds_failed; } else { return (error); @@ -469,17 +413,484 @@ rf_ConfigureDisk(raidPtr, buf, diskPtr, rdcap_op, tur_op, dev, row, col) raidPtr->raid_cinfo[row][col].ci_vp = vp; raidPtr->raid_cinfo[row][col].ci_dev = va.va_rdev; -#if 0 - diskPtr->dev = dev; -#endif - - diskPtr->dev = va.va_rdev; /* XXX or the above? */ + diskPtr->dev = va.va_rdev; /* we allow the user to specify that only a fraction of the * disks should be used this is just for debug: it speeds up * the parity scan */ - diskPtr->numBlocks = diskPtr->numBlocks * rf_sizePercentage / 100; + diskPtr->numBlocks = diskPtr->numBlocks * + rf_sizePercentage / 100; + } + return (0); +} + +static void rf_print_label_status( RF_Raid_t *, int, int, char *, + RF_ComponentLabel_t *); + +static void +rf_print_label_status( raidPtr, row, column, dev_name, ci_label ) + RF_Raid_t *raidPtr; + int row; + int column; + char *dev_name; + RF_ComponentLabel_t *ci_label; +{ + + printf("raid%d: Component %s being configured at row: %d col: %d\n", + raidPtr->raidid, dev_name, row, column ); + printf(" Row: %d Column: %d Num Rows: %d Num Columns: %d\n", + ci_label->row, ci_label->column, + ci_label->num_rows, ci_label->num_columns); + printf(" Version: %d Serial Number: %d Mod Counter: %d\n", + ci_label->version, ci_label->serial_number, + ci_label->mod_counter); + printf(" Clean: %d Status: %d\n", + ci_label->clean, ci_label->status ); +} +static int rf_check_label_vitals( RF_Raid_t *, int, int, char *, + RF_ComponentLabel_t *, int, int ); +static int rf_check_label_vitals( raidPtr, 
row, column, dev_name, ci_label, + serial_number, mod_counter ) + RF_Raid_t *raidPtr; + int row; + int column; + char *dev_name; + RF_ComponentLabel_t *ci_label; + int serial_number; + int mod_counter; +{ + int fatal_error = 0; + + if (serial_number != ci_label->serial_number) { + printf("%s has a different serial number: %d %d\n", + dev_name, serial_number, ci_label->serial_number); + fatal_error = 1; + } + if (mod_counter != ci_label->mod_counter) { + printf("%s has a different modfication count: %d %d\n", + dev_name, mod_counter, ci_label->mod_counter); } + + if (row != ci_label->row) { + printf("Row out of alignment for: %s\n", dev_name); + fatal_error = 1; + } + if (column != ci_label->column) { + printf("Column out of alignment for: %s\n", dev_name); + fatal_error = 1; + } + if (raidPtr->numRow != ci_label->num_rows) { + printf("Number of rows do not match for: %s\n", dev_name); + fatal_error = 1; + } + if (raidPtr->numCol != ci_label->num_columns) { + printf("Number of columns do not match for: %s\n", dev_name); + fatal_error = 1; + } + if (ci_label->clean == 0) { + /* it's not clean, but that's not fatal */ + printf("%s is not clean!\n", dev_name); + } + return(fatal_error); +} + + +/* + + rf_CheckLabels() - check all the component labels for consistency. + Return an error if there is anything major amiss. + + */ + +int +rf_CheckLabels( raidPtr, cfgPtr ) + RF_Raid_t *raidPtr; + RF_Config_t *cfgPtr; +{ + int r,c; + char *dev_name; + RF_ComponentLabel_t *ci_label; + int serial_number = 0; + int mod_number = 0; + int fatal_error = 0; + int mod_values[4]; + int mod_count[4]; + int ser_values[4]; + int ser_count[4]; + int num_ser; + int num_mod; + int i; + int found; + int hosed_row; + int hosed_column; + int too_fatal; + int parity_good; + int force; + + hosed_row = -1; + hosed_column = -1; + too_fatal = 0; + force = cfgPtr->force; + + /* + We're going to try to be a little intelligent here. 
If one + component's label is bogus, and we can identify that it's the + *only* one that's gone, we'll mark it as "failed" and allow + the configuration to proceed. This will be the *only* case + that we'll proceed if there would be (otherwise) fatal errors. + + Basically we simply keep a count of how many components had + what serial number. If all but one agree, we simply mark + the disagreeing component as being failed, and allow + things to come up "normally". + + We do this first for serial numbers, and then for "mod_counter". + + */ + + num_ser = 0; + num_mod = 0; + for (r = 0; r < raidPtr->numRow && !fatal_error ; r++) { + for (c = 0; c < raidPtr->numCol; c++) { + ci_label = &raidPtr->raid_cinfo[r][c].ci_label; + found=0; + for(i=0;i<num_ser;i++) { + if (ser_values[i] == ci_label->serial_number) { + ser_count[i]++; + found=1; + break; + } + } + if (!found) { + ser_values[num_ser] = ci_label->serial_number; + ser_count[num_ser] = 1; + num_ser++; + if (num_ser>2) { + fatal_error = 1; + break; + } + } + found=0; + for(i=0;i<num_mod;i++) { + if (mod_values[i] == ci_label->mod_counter) { + mod_count[i]++; + found=1; + break; + } + } + if (!found) { + mod_values[num_mod] = ci_label->mod_counter; + mod_count[num_mod] = 1; + num_mod++; + if (num_mod>2) { + fatal_error = 1; + break; + } + } + } + } +#if DEBUG + printf("raid%d: Summary of serial numbers:\n", raidPtr->raidid); + for(i=0;i<num_ser;i++) { + printf("%d %d\n", ser_values[i], ser_count[i]); + } + printf("raid%d: Summary of mod counters:\n", raidPtr->raidid); + for(i=0;i<num_mod;i++) { + printf("%d %d\n", mod_values[i], mod_count[i]); + } +#endif + serial_number = ser_values[0]; + if (num_ser == 2) { + if ((ser_count[0] == 1) || (ser_count[1] == 1)) { + /* Locate the maverick component */ + if (ser_count[1] > ser_count[0]) { + serial_number = ser_values[1]; + } + for (r = 0; r < raidPtr->numRow; r++) { + for (c = 0; c < raidPtr->numCol; c++) { + ci_label = &raidPtr->raid_cinfo[r][c].ci_label; + if 
(serial_number != + ci_label->serial_number) { + hosed_row = r; + hosed_column = c; + break; + } + } + } + printf("Hosed component: %s\n", + &cfgPtr->devnames[hosed_row][hosed_column][0]); + if (!force) { + /* we'll fail this component, as if there are + other major errors, we arn't forcing things + and we'll abort the config anyways */ + raidPtr->Disks[hosed_row][hosed_column].status + = rf_ds_failed; + raidPtr->numFailures++; + raidPtr->status[hosed_row] = rf_rs_degraded; + } + } else { + too_fatal = 1; + } + if (cfgPtr->parityConfig == '0') { + /* We've identified two different serial numbers. + RAID 0 can't cope with that, so we'll punt */ + too_fatal = 1; + } + + } + + /* record the serial number for later. If we bail later, setting + this doesn't matter, otherwise we've got the best guess at the + correct serial number */ + raidPtr->serial_number = serial_number; + + mod_number = mod_values[0]; + if (num_mod == 2) { + if ((mod_count[0] == 1) || (mod_count[1] == 1)) { + /* Locate the maverick component */ + if (mod_count[1] > mod_count[0]) { + mod_number = mod_values[1]; + } else if (mod_count[1] < mod_count[0]) { + mod_number = mod_values[0]; + } else { + /* counts of different modification values + are the same. Assume greater value is + the correct one, all other things + considered */ + if (mod_values[0] > mod_values[1]) { + mod_number = mod_values[0]; + } else { + mod_number = mod_values[1]; + } + + } + for (r = 0; r < raidPtr->numRow && !too_fatal ; r++) { + for (c = 0; c < raidPtr->numCol; c++) { + ci_label = &raidPtr->raid_cinfo[r][c].ci_label; + if (mod_number != + ci_label->mod_counter) { + if ( ( hosed_row == r ) && + ( hosed_column == c )) { + /* same one. Can + deal with it. 
*/ + } else { + hosed_row = r; + hosed_column = c; + if (num_ser != 1) { + too_fatal = 1; + break; + } + } + } + } + } + printf("Hosed component: %s\n", + &cfgPtr->devnames[hosed_row][hosed_column][0]); + if (!force) { + /* we'll fail this component, as if there are + other major errors, we arn't forcing things + and we'll abort the config anyways */ + if (raidPtr->Disks[hosed_row][hosed_column].status != rf_ds_failed) { + raidPtr->Disks[hosed_row][hosed_column].status + = rf_ds_failed; + raidPtr->numFailures++; + raidPtr->status[hosed_row] = rf_rs_degraded; + } + } + } else { + too_fatal = 1; + } + if (cfgPtr->parityConfig == '0') { + /* We've identified two different mod counters. + RAID 0 can't cope with that, so we'll punt */ + too_fatal = 1; + } + } + + raidPtr->mod_counter = mod_number; + + if (too_fatal) { + /* we've had both a serial number mismatch, and a mod_counter + mismatch -- and they involved two different components!! + Bail -- make things fail so that the user must force + the issue... 
*/ + hosed_row = -1; + hosed_column = -1; + } + + if (num_ser > 2) { + printf("raid%d: Too many different serial numbers!\n", + raidPtr->raidid); + } + + if (num_mod > 2) { + printf("raid%d: Too many different mod counters!\n", + raidPtr->raidid); + } + + /* we start by assuming the parity will be good, and flee from + that notion at the slightest sign of trouble */ + + parity_good = RF_RAID_CLEAN; + for (r = 0; r < raidPtr->numRow; r++) { + for (c = 0; c < raidPtr->numCol; c++) { + dev_name = &cfgPtr->devnames[r][c][0]; + ci_label = &raidPtr->raid_cinfo[r][c].ci_label; + + if ((r == hosed_row) && (c == hosed_column)) { + printf("raid%d: Ignoring %s\n", + raidPtr->raidid, dev_name); + } else { + rf_print_label_status( raidPtr, r, c, + dev_name, ci_label ); + if (rf_check_label_vitals( raidPtr, r, c, + dev_name, ci_label, + serial_number, + mod_number )) { + fatal_error = 1; + } + if (ci_label->clean != RF_RAID_CLEAN) { + parity_good = RF_RAID_DIRTY; + } + } + } + } + if (fatal_error) { + parity_good = RF_RAID_DIRTY; + } + + /* we note the state of the parity */ + raidPtr->parity_good = parity_good; + + return(fatal_error); +} + +int config_disk_queue(RF_Raid_t *, RF_DiskQueue_t *, RF_RowCol_t, + RF_RowCol_t, RF_DiskQueueSW_t *, + RF_SectorCount_t, dev_t, int, + RF_ShutdownList_t **, + RF_AllocListElem_t *); + +int rf_add_hot_spare(RF_Raid_t *, RF_SingleComponent_t *); +int +rf_add_hot_spare(raidPtr, sparePtr) + RF_Raid_t *raidPtr; + RF_SingleComponent_t *sparePtr; +{ + RF_RaidDisk_t *disks; + RF_DiskQueue_t *spareQueues; + int ret; + unsigned int bs; + int spare_number; + + printf("Just in rf_add_hot_spare: %d\n",raidPtr->numSpare); + printf("Num col: %d\n",raidPtr->numCol); + if (raidPtr->numSpare >= RF_MAXSPARE) { + RF_ERRORMSG1("Too many spares: %d\n", raidPtr->numSpare); + return(EINVAL); + } + + RF_LOCK_MUTEX(raidPtr->mutex); + + /* the beginning of the spares... 
*/ + disks = &raidPtr->Disks[0][raidPtr->numCol]; + + spare_number = raidPtr->numSpare; + + ret = rf_ConfigureDisk(raidPtr, sparePtr->component_name, + &disks[spare_number], 0, + raidPtr->numCol + spare_number); + + if (ret) + goto fail; + if (disks[spare_number].status != rf_ds_optimal) { + RF_ERRORMSG1("Warning: spare disk %s failed TUR\n", + sparePtr->component_name); + ret=EINVAL; + goto fail; + } else { + disks[spare_number].status = rf_ds_spare; + DPRINTF6("Spare Disk %d: dev %s numBlocks %ld blockSize %d (%ld MB)\n", spare_number, + disks[spare_number].devname, + (long int) disks[spare_number].numBlocks, + disks[spare_number].blockSize, + (long int) disks[spare_number].numBlocks * + disks[spare_number].blockSize / 1024 / 1024); + } + + + /* check sizes and block sizes on the spare disk */ + bs = 1 << raidPtr->logBytesPerSector; + if (disks[spare_number].blockSize != bs) { + RF_ERRORMSG3("Block size of %d on spare disk %s is not the same as on other disks (%d)\n", disks[spare_number].blockSize, disks[spare_number].devname, bs); + ret = EINVAL; + goto fail; + } + if (disks[spare_number].numBlocks < raidPtr->sectorsPerDisk) { + RF_ERRORMSG3("Spare disk %s (%d blocks) is too small to serve as a spare (need %ld blocks)\n", + disks[spare_number].devname, + disks[spare_number].blockSize, + (long int) raidPtr->sectorsPerDisk); + ret = EINVAL; + goto fail; + } else { + if (disks[spare_number].numBlocks > + raidPtr->sectorsPerDisk) { + RF_ERRORMSG2("Warning: truncating spare disk %s to %ld blocks\n", disks[spare_number].devname, + (long int) raidPtr->sectorsPerDisk); + + disks[spare_number].numBlocks = raidPtr->sectorsPerDisk; + } + } + + spareQueues = &raidPtr->Queues[0][raidPtr->numCol]; + ret = config_disk_queue( raidPtr, &spareQueues[spare_number], + 0, raidPtr->numCol + spare_number, + raidPtr->Queues[0][0].qPtr, /* XXX */ + raidPtr->sectorsPerDisk, + raidPtr->Disks[0][raidPtr->numCol + spare_number].dev, + raidPtr->Queues[0][0].maxOutstanding, /* XXX */ + 
&raidPtr->shutdownList, + raidPtr->cleanupList); + + + raidPtr->numSpare++; + RF_UNLOCK_MUTEX(raidPtr->mutex); + return (0); + +fail: + RF_UNLOCK_MUTEX(raidPtr->mutex); + return(ret); +} + +int +rf_remove_hot_spare(raidPtr,sparePtr) + RF_Raid_t *raidPtr; + RF_SingleComponent_t *sparePtr; +{ + int spare_number; + + + if (raidPtr->numSpare==0) { + printf("No spares to remove!\n"); + return(EINVAL); + } + + spare_number = sparePtr->column; + + return(EINVAL); /* XXX not implemented yet */ +#if 0 + if (spare_number < 0 || spare_number > raidPtr->numSpare) { + return(EINVAL); + } + + /* verify that this spare isn't in use... */ + + /* it's gone.. */ + + raidPtr->numSpare--; + return (0); +#endif } diff --git a/sys/dev/raidframe/rf_disks.h b/sys/dev/raidframe/rf_disks.h index 7fc8f58502f..bb3e551396d 100644 --- a/sys/dev/raidframe/rf_disks.h +++ b/sys/dev/raidframe/rf_disks.h @@ -1,5 +1,5 @@ -/* $OpenBSD: rf_disks.h,v 1.2 1999/02/16 00:02:40 niklas Exp $ */ -/* $NetBSD: rf_disks.h,v 1.3 1999/02/05 00:06:09 oster Exp $ */ +/* $OpenBSD: rf_disks.h,v 1.3 1999/07/30 14:45:32 peter Exp $ */ +/* $NetBSD: rf_disks.h,v 1.4 1999/02/24 00:00:03 oster Exp $ */ /* * Copyright (c) 1995 Carnegie-Mellon University. * All rights reserved. @@ -68,8 +68,8 @@ struct RF_RaidDisk_s { RF_SectorCount_t numBlocks; /* number of blocks, obtained via READ * CAPACITY */ int blockSize; - /* XXX the folling is needed since we seem to need SIMULATE defined in - * order to get user-land stuff to compile, but we *don't* want this + /* XXX the following is needed since we seem to need SIMULATE defined + * in order to get user-land stuff to compile, but we *don't* want this * in the structure for the user-land utilities, as the kernel doesn't * know about it!! 
(and it messes up the size of the structure, so * there is a communication problem between the kernel and the @@ -100,7 +100,6 @@ rf_ConfigureSpareDisks(RF_ShutdownList_t ** listp, RF_Raid_t * raidPtr, RF_Config_t * cfgPtr); int rf_ConfigureDisk(RF_Raid_t * raidPtr, char *buf, RF_RaidDisk_t * diskPtr, - RF_DiskOp_t * rdcap_op, RF_DiskOp_t * tur_op, dev_t dev, RF_RowCol_t row, RF_RowCol_t col); #endif /* !_RF__RF_DISKS_H_ */ diff --git a/sys/dev/raidframe/rf_driver.c b/sys/dev/raidframe/rf_driver.c index 8b30b825bed..5581cbfd316 100644 --- a/sys/dev/raidframe/rf_driver.c +++ b/sys/dev/raidframe/rf_driver.c @@ -1,5 +1,41 @@ -/* $OpenBSD: rf_driver.c,v 1.2 1999/02/16 00:02:41 niklas Exp $ */ -/* $NetBSD: rf_driver.c,v 1.6 1999/02/05 00:06:10 oster Exp $ */ +/* $OpenBSD: rf_driver.c,v 1.3 1999/07/30 14:45:32 peter Exp $ */ +/* $NetBSD: rf_driver.c,v 1.12 1999/07/19 01:36:07 oster Exp $ */ +/*- + * Copyright (c) 1999 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Greg Oster + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the NetBSD + * Foundation, Inc. and its contributors. + * 4. 
Neither the name of The NetBSD Foundation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + /* * Copyright (c) 1995 Carnegie-Mellon University. * All rights reserved. @@ -118,6 +154,13 @@ static void clean_rad(RF_RaidAccessDesc_t *); static void rf_ShutdownRDFreeList(void *); static int rf_ConfigureRDFreeList(RF_ShutdownList_t **); +void rf_UnconfigureVnodes( RF_Raid_t * ); + +/* XXX move these to their own .h file! 
*/ +int raidwrite_component_label(dev_t, struct vnode *, RF_ComponentLabel_t *); +int raidread_component_label(dev_t, struct vnode *, RF_ComponentLabel_t *); +int raidmarkclean(dev_t dev, struct vnode *b_vp,int); +void rf_update_component_labels( RF_Raid_t *); RF_DECLARE_MUTEX(rf_printf_mutex) /* debug only: avoids interleaved * printfs by different stripes */ @@ -126,7 +169,7 @@ RF_DECLARE_GLOBAL_THREADID /* declarations for threadid.h */ #define SIGNAL_QUIESCENT_COND(_raid_) wakeup(&((_raid_)->accesses_suspended)) #define WAIT_FOR_QUIESCENCE(_raid_) \ - tsleep(&((_raid_)->accesses_suspended),PRIBIO|PCATCH,"raidframe quiesce", 0); + tsleep(&((_raid_)->accesses_suspended),PRIBIO,"raidframe quiesce", 0); #if DKUSAGE > 0 #define IO_BUF_ERR(bp, err, unit) { \ @@ -157,7 +200,8 @@ RF_DECLARE_STATIC_MUTEX(configureMutex) /* used to lock the configuration static int rf_ConfigureRDFreeList(RF_ShutdownList_t ** listp); /* called at system boot time */ - int rf_BootRaidframe() +int +rf_BootRaidframe() { int rc; @@ -247,10 +291,6 @@ int rf_Shutdown(raidPtr) RF_Raid_t *raidPtr; { - int r, c; - - struct proc *p; - if (!raidPtr->valid) { RF_ERRORMSG("Attempt to shut down unconfigured RAIDframe driver. Aborting shutdown\n"); return (EINVAL); @@ -276,6 +316,23 @@ rf_Shutdown(raidPtr) raidPtr->valid = 0; + rf_update_component_labels(raidPtr); + + rf_UnconfigureVnodes(raidPtr); + + rf_ShutdownList(&raidPtr->shutdownList); + + rf_UnconfigureArray(); + + return (0); +} + +void +rf_UnconfigureVnodes( raidPtr ) + RF_Raid_t *raidPtr; +{ + int r,c; + struct proc *p; /* We take this opportunity to close the vnodes like we should.. 
*/ @@ -285,8 +342,10 @@ rf_Shutdown(raidPtr) for (c = 0; c < raidPtr->numCol; c++) { printf("Closing vnode for row: %d col: %d\n", r, c); if (raidPtr->raid_cinfo[r][c].ci_vp) { - (void) vn_close(raidPtr->raid_cinfo[r][c].ci_vp, - FREAD | FWRITE, p->p_ucred, p); + VOP_UNLOCK(raidPtr->raid_cinfo[r][c].ci_vp, 0, p); + (void) vn_close(raidPtr->raid_cinfo[r][c].ci_vp, + FREAD | FWRITE, p->p_ucred, p); + raidPtr->raid_cinfo[r][c].ci_vp = NULL; } else { printf("vnode was NULL\n"); } @@ -296,20 +355,14 @@ rf_Shutdown(raidPtr) for (r = 0; r < raidPtr->numSpare; r++) { printf("Closing vnode for spare: %d\n", r); if (raidPtr->raid_cinfo[0][raidPtr->numCol + r].ci_vp) { + VOP_UNLOCK(raidPtr->raid_cinfo[0][raidPtr->numCol + r].ci_vp, 0, p); (void) vn_close(raidPtr->raid_cinfo[0][raidPtr->numCol + r].ci_vp, FREAD | FWRITE, p->p_ucred, p); + raidPtr->raid_cinfo[0][raidPtr->numCol + r].ci_vp = NULL; } else { printf("vnode was NULL\n"); } } - - - - rf_ShutdownList(&raidPtr->shutdownList); - - rf_UnconfigureArray(); - - return (0); } #define DO_INIT_CONFIGURE(f) { \ @@ -324,6 +377,7 @@ rf_Shutdown(raidPtr) } #define DO_RAID_FAIL() { \ + rf_UnconfigureVnodes(raidPtr); \ rf_ShutdownList(&raidPtr->shutdownList); \ rf_UnconfigureArray(); \ } diff --git a/sys/dev/raidframe/rf_engine.c b/sys/dev/raidframe/rf_engine.c index 36ae0642b3a..17fce1108a5 100644 --- a/sys/dev/raidframe/rf_engine.c +++ b/sys/dev/raidframe/rf_engine.c @@ -1,5 +1,5 @@ -/* $OpenBSD: rf_engine.c,v 1.2 1999/02/16 00:02:41 niklas Exp $ */ -/* $NetBSD: rf_engine.c,v 1.4 1999/02/05 00:06:11 oster Exp $ */ +/* $OpenBSD: rf_engine.c,v 1.3 1999/07/30 14:45:32 peter Exp $ */ +/* $NetBSD: rf_engine.c,v 1.5 1999/03/14 21:53:31 oster Exp $ */ /* * Copyright (c) 1995 Carnegie-Mellon University. * All rights reserved. 
@@ -89,7 +89,7 @@ static void DAGExecutionThread(RF_ThreadArg_t arg); */ #define DO_LOCK(_r_) { ks = splbio(); RF_LOCK_MUTEX((_r_)->node_queue_mutex); } #define DO_UNLOCK(_r_) { RF_UNLOCK_MUTEX((_r_)->node_queue_mutex); splx(ks); } -#define DO_WAIT(_r_) tsleep(&(_r_)->node_queue, PRIBIO | PCATCH, "raidframe nq",0) +#define DO_WAIT(_r_) tsleep(&(_r_)->node_queue, PRIBIO, "raidframe nq",0) #define DO_SIGNAL(_r_) wakeup(&(_r_)->node_queue) static void rf_ShutdownEngine(void *); diff --git a/sys/dev/raidframe/rf_kintf.h b/sys/dev/raidframe/rf_kintf.h index 819a7ff9e94..5b36487b095 100644 --- a/sys/dev/raidframe/rf_kintf.h +++ b/sys/dev/raidframe/rf_kintf.h @@ -1,5 +1,5 @@ -/* $OpenBSD: rf_kintf.h,v 1.2 1999/02/16 00:02:53 niklas Exp $ */ -/* $NetBSD: rf_kintf.h,v 1.3 1999/02/05 00:06:12 oster Exp $ */ +/* $OpenBSD: rf_kintf.h,v 1.3 1999/07/30 14:45:32 peter Exp $ */ +/* $NetBSD: rf_kintf.h,v 1.4 1999/03/09 03:52:41 oster Exp $ */ /* * rf_kintf.h * @@ -50,8 +50,7 @@ void rf_ReconKernelThread(void); int rf_GetSpareTableFromDaemon(RF_SparetWait_t * req); caddr_t rf_MapToKernelSpace(struct buf * bp, caddr_t addr); int rf_BzeroWithRemap(struct buf * bp, char *databuf, int len); -int -rf_DoAccessKernel(RF_Raid_t * raidPtr, struct buf * bp, +int rf_DoAccessKernel(RF_Raid_t * raidPtr, struct buf * bp, RF_RaidAccessFlags_t flags, void (*cbFunc) (struct buf *), void *cbArg); int rf_DispatchKernelIO(RF_DiskQueue_t * queue, RF_DiskQueueData_t * req); diff --git a/sys/dev/raidframe/rf_layout.c b/sys/dev/raidframe/rf_layout.c index babc1d691d9..3b337687a72 100644 --- a/sys/dev/raidframe/rf_layout.c +++ b/sys/dev/raidframe/rf_layout.c @@ -1,5 +1,5 @@ -/* $OpenBSD: rf_layout.c,v 1.2 1999/02/16 00:02:54 niklas Exp $ */ -/* $NetBSD: rf_layout.c,v 1.3 1999/02/05 00:06:12 oster Exp $ */ +/* $OpenBSD: rf_layout.c,v 1.3 1999/07/30 14:45:32 peter Exp $ */ +/* $NetBSD: rf_layout.c,v 1.4 1999/07/19 01:35:19 oster Exp $ */ /* * Copyright (c) 1995 Carnegie-Mellon University. 
* All rights reserved. @@ -437,6 +437,13 @@ rf_ConfigureLayout( layoutPtr->SUsPerRU = cfgPtr->SUsPerRU; parityConfig = cfgPtr->parityConfig; + if (layoutPtr->sectorsPerStripeUnit <= 0) { + RF_ERRORMSG2("raid%d: Invalid sectorsPerStripeUnit: %d\n", + raidPtr->raidid, + (int)layoutPtr->sectorsPerStripeUnit ); + return (EINVAL); + } + layoutPtr->stripeUnitsPerDisk = raidPtr->sectorsPerDisk / layoutPtr->sectorsPerStripeUnit; p = rf_GetLayout(parityConfig); diff --git a/sys/dev/raidframe/rf_mcpair.h b/sys/dev/raidframe/rf_mcpair.h index b3b91a2bb95..493d4450d5b 100644 --- a/sys/dev/raidframe/rf_mcpair.h +++ b/sys/dev/raidframe/rf_mcpair.h @@ -1,5 +1,5 @@ -/* $OpenBSD: rf_mcpair.h,v 1.2 1999/02/16 00:02:56 niklas Exp $ */ -/* $NetBSD: rf_mcpair.h,v 1.3 1999/02/05 00:06:13 oster Exp $ */ +/* $OpenBSD: rf_mcpair.h,v 1.3 1999/07/30 14:45:32 peter Exp $ */ +/* $NetBSD: rf_mcpair.h,v 1.4 1999/03/14 21:53:31 oster Exp $ */ /* * Copyright (c) 1995 Carnegie-Mellon University. * All rights reserved. @@ -43,7 +43,7 @@ struct RF_MCPair_s { int flag; RF_MCPair_t *next; }; -#define RF_WAIT_MCPAIR(_mcp) tsleep(&((_mcp)->flag), PRIBIO | PCATCH, "mcpair", 0) +#define RF_WAIT_MCPAIR(_mcp) tsleep(&((_mcp)->flag), PRIBIO, "mcpair", 0) int rf_ConfigureMCPair(RF_ShutdownList_t ** listp); RF_MCPair_t *rf_AllocMCPair(void); diff --git a/sys/dev/raidframe/rf_netbsd.h b/sys/dev/raidframe/rf_netbsd.h index 274c8193695..04ba6b1ef07 100644 --- a/sys/dev/raidframe/rf_netbsd.h +++ b/sys/dev/raidframe/rf_netbsd.h @@ -1,11 +1,11 @@ -/* $OpenBSD: rf_netbsd.h,v 1.2 1999/02/16 00:02:57 niklas Exp $ */ -/* $NetBSD: rf_netbsd.h,v 1.3 1999/02/05 00:06:13 oster Exp $ */ +/* $OpenBSD: rf_netbsd.h,v 1.3 1999/07/30 14:45:32 peter Exp $ */ +/* $NetBSD: rf_netbsd.h,v 1.6 1999/05/13 21:46:17 ad Exp $ */ /*- * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation - * by Greg Oster + * by Greg Oster; Jason R. 
Thorpe. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -36,43 +36,6 @@ * POSSIBILITY OF SUCH DAMAGE. */ -/*- - * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc. - * All rights reserved. - * - * This code is derived from software contributed to The NetBSD Foundation - * by Jason R. Thorpe. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the NetBSD - * Foundation, Inc. and its contributors. - * 4. Neither the name of The NetBSD Foundation nor the names of its - * contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS - * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED - * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS - * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR - * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF - * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS - * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN - * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGE. - */ - - #ifndef _RF__RF_NETBSDSTUFF_H_ #define _RF__RF_NETBSDSTUFF_H_ @@ -81,12 +44,49 @@ #include <sys/systm.h> #include <sys/namei.h> #include <sys/vnode.h> +#endif /* _KERNEL */ + +/* The per-component label information that the user can set */ +typedef struct RF_ComponentInfo_s { + int row; /* the row number of this component */ + int column; /* the column number of this component */ + int serial_number; /* a user-specified serial number for this + RAID set */ +} RF_ComponentInfo_t; +/* The per-component label information */ +typedef struct RF_ComponentLabel_s { + int version; /* The version of this label. */ + int serial_number; /* a user-specified serial number for this + RAID set */ + int mod_counter; /* modification counter. Changed (usually + by incrementing) every time the label + is changed */ + int row; /* the row number of this component */ + int column; /* the column number of this component */ + int num_rows; /* number of rows in this RAID set */ + int num_columns; /* number of columns in this RAID set */ + int clean; /* 1 when clean, 0 when dirty */ + int status; /* rf_ds_optimal, rf_ds_dist_spared, whatever. 
*/ +} RF_ComponentLabel_t; +typedef struct RF_SingleComponent_s { + int row; + int column; + char component_name[50]; /* name of the component */ +} RF_SingleComponent_t; + +#ifdef _KERNEL -struct raidcinfo { - struct vnode *ci_vp; /* device's vnode */ - dev_t ci_dev; /* XXX: device's dev_t */ +/* XXX this is *not* the place for these... */ +int rf_add_hot_spare(RF_Raid_t *raidPtr, RF_SingleComponent_t *sparePtr); +int rf_remove_hot_spare(RF_Raid_t *raidPtr, RF_SingleComponent_t *sparePtr); + + + struct raidcinfo { + struct vnode *ci_vp; /* component device's vnode */ + dev_t ci_dev; /* component device's dev_t */ + RF_ComponentLabel_t ci_label; /* components RAIDframe label */ #if 0 size_t ci_size; /* size */ char *ci_path; /* path to component */ diff --git a/sys/dev/raidframe/rf_netbsdkintf.c b/sys/dev/raidframe/rf_netbsdkintf.c index e0ca8caa0f1..a8fc295c0b1 100644 --- a/sys/dev/raidframe/rf_netbsdkintf.c +++ b/sys/dev/raidframe/rf_netbsdkintf.c @@ -1,5 +1,4 @@ -/* $OpenBSD: rf_netbsdkintf.c,v 1.2 1999/02/16 00:02:59 niklas Exp $ */ -/* $NetBSD: rf_netbsdkintf.c,v 1.10 1999/02/11 01:23:32 oster Exp $ */ +/* $NetBSD: rf_netbsdkintf.c,v 1.21 1999/07/21 03:15:26 oster Exp $ */ /*- * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc. * All rights reserved. 
@@ -178,15 +177,12 @@ static int rf_kbooted = RFK_BOOT_NONE; static RF_Raid_t **raidPtrs; /* global raid device descriptors */ -static int rf_pending_testaccs; - RF_DECLARE_STATIC_MUTEX(rf_sparet_wait_mutex) -RF_DECLARE_STATIC_MUTEX(rf_async_done_q_mutex) + static RF_SparetWait_t *rf_sparet_wait_queue; /* requests to install a * spare table */ static RF_SparetWait_t *rf_sparet_resp_queue; /* responses from * installation process */ -static struct rf_test_acc *rf_async_done_qh, *rf_async_done_qt; static struct rf_recon_req *recon_queue = NULL; /* used to communicate * reconstruction @@ -205,16 +201,17 @@ static void InitBP(struct buf * bp, struct vnode *, unsigned rw_flag, void (*cbFunc) (struct buf *), void *cbArg, int logBytesPerSector, struct proc * b_proc); -#define Dprintf0(s) if (rf_queueDebug) rf_debug_printf(s,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL) -#define Dprintf1(s,a) if (rf_queueDebug) rf_debug_printf(s,a,NULL,NULL,NULL,NULL,NULL,NULL,NULL) -#define Dprintf2(s,a,b) if (rf_queueDebug) rf_debug_printf(s,a,b,NULL,NULL,NULL,NULL,NULL,NULL) -#define Dprintf3(s,a,b,c) if (rf_queueDebug) rf_debug_printf(s,a,b,c,NULL,NULL,NULL,NULL,NULL) - +#define Dprintf0(s) if (rf_queueDebug) \ + rf_debug_printf(s,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL) +#define Dprintf1(s,a) if (rf_queueDebug) \ + rf_debug_printf(s,a,NULL,NULL,NULL,NULL,NULL,NULL,NULL) +#define Dprintf2(s,a,b) if (rf_queueDebug) \ + rf_debug_printf(s,a,b,NULL,NULL,NULL,NULL,NULL,NULL) +#define Dprintf3(s,a,b,c) if (rf_queueDebug) \ + rf_debug_printf(s,a,b,c,NULL,NULL,NULL,NULL,NULL) -/* this is so that we can compile under 2.0 as well as 3.2 */ -#ifndef proc_to_task -#define proc_to_task(x) ((x)->task) -#endif /* !proc_to_task */ +int raidmarkclean(dev_t dev, struct vnode *b_vp, int); +int raidmarkdirty(dev_t dev, struct vnode *b_vp, int); void raidattach __P((int)); int raidsize __P((dev_t)); @@ -231,6 +228,9 @@ int raidread __P((dev_t, struct uio *, int)); void raidstrategy __P((struct buf *)); int 
raiddump __P((dev_t, daddr_t, caddr_t, size_t)); +int raidwrite_component_label(dev_t, struct vnode *, RF_ComponentLabel_t *); +int raidread_component_label(dev_t, struct vnode *, RF_ComponentLabel_t *); +void rf_update_component_labels( RF_Raid_t *); /* * Pilfered from ccd.c */ @@ -239,8 +239,7 @@ struct raidbuf { struct buf rf_buf; /* new I/O buf. MUST BE FIRST!!! */ struct buf *rf_obp; /* ptr. to original I/O buf */ int rf_flags; /* misc. flags */ - RF_DiskQueueData_t *req; /* the request that this was - * part of.. */ + RF_DiskQueueData_t *req;/* the request that this was part of.. */ }; @@ -251,7 +250,6 @@ struct raidbuf { or if it should be used in conjunction with that... */ struct raid_softc { - int sc_unit;/* logical unit number */ int sc_flags; /* flags */ int sc_cflags; /* configuration flags */ size_t sc_size;/* size of the raid device */ @@ -270,6 +268,16 @@ struct raid_softc { #define raidunit(x) DISKUNIT(x) static int numraid = 0; +/* + * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device. + * Be aware that large numbers can allow the driver to consume a lot of + * kernel memory, especially on writes... 
+ */ + +#ifndef RAIDOUTSTANDING +#define RAIDOUTSTANDING 10 +#endif + #define RAIDLABELDEV(dev) \ (MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART)) @@ -285,12 +293,14 @@ static int raidlock __P((struct raid_softc *)); static void raidunlock __P((struct raid_softc *)); int raidlookup __P((char *, struct proc * p, struct vnode **)); +static void rf_markalldirty __P((RF_Raid_t *)); void raidattach(num) int num; { int raidID; + int i, rc; #ifdef DEBUG printf("raidattach: Asked for %d units\n", num); @@ -310,10 +320,23 @@ raidattach(num) if (raidPtrs == NULL) { panic("raidPtrs is NULL!!\n"); } - rf_kbooted = rf_boot(); - if (rf_kbooted) { - panic("Serious error booting RAID!!\n"); + + rc = rf_mutex_init(&rf_sparet_wait_mutex); + if (rc) { + RF_PANIC(); } + + rf_sparet_wait_queue = rf_sparet_resp_queue = NULL; + recon_queue = NULL; + + for (i = 0; i < numraid; i++) + raidPtrs[i] = NULL; + rc = rf_BootRaidframe(); + if (rc == 0) + printf("Kernelized RAIDframe activated\n"); + else + panic("Serious error booting RAID!!\n"); + rf_kbooted = RFK_BOOT_GOOD; /* put together some datastructures like the CCD device does.. This @@ -396,23 +419,8 @@ raidopen(dev, flags, fmt, p) struct raid_softc *rs; struct disklabel *lp; int part, pmask; - unsigned int raidID; - int rc; int error = 0; - /* This whole next chunk of code is somewhat suspect... Not sure it's - * needed here at all... 
XXX */ - - if (rf_kbooted == RFK_BOOT_NONE) { - printf("Doing restart on raidopen.\n"); - rf_kbooted = RFK_BOOT_GOOD; - rc = rf_boot(); - if (rc) { - rf_kbooted = RFK_BOOT_BAD; - printf("Someone is unhappy...\n"); - return (rc); - } - } if (unit >= numraid) return (ENXIO); rs = &raid_softc[unit]; @@ -421,13 +429,11 @@ raidopen(dev, flags, fmt, p) return (error); lp = rs->sc_dkdev.dk_label; - raidID = raidunit(dev); - part = DISKPART(dev); pmask = (1 << part); db1_printf(("Opening raid device number: %d partition: %d\n", - raidID, part)); + unit, part)); if ((rs->sc_flags & RAIDF_INITED) && @@ -457,6 +463,21 @@ raidopen(dev, flags, fmt, p) rs->sc_dkdev.dk_bopenmask |= pmask; break; } + + if ((rs->sc_dkdev.dk_openmask == 0) && + ((rs->sc_flags & RAIDF_INITED) != 0)) { + /* First one... mark things as dirty... Note that we *MUST* + have done a configure before this. I DO NOT WANT TO BE + SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED + THAT THEY BELONG TOGETHER!!!!! */ + /* XXX should check to see if we're only open for reading + here... If so, we needn't do this, but then need some + other way of keeping track of what's happened.. */ + + rf_markalldirty( raidPtrs[unit] ); + } + + rs->sc_dkdev.dk_openmask = rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask; @@ -499,6 +520,15 @@ raidclose(dev, flags, fmt, p) } rs->sc_dkdev.dk_openmask = rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask; + + if ((rs->sc_dkdev.dk_openmask == 0) && + ((rs->sc_flags & RAIDF_INITED) != 0)) { + /* Last one... device is not unconfigured yet. + Device shutdown has taken care of setting the + clean bits if RAIDF_INITED is not set + mark things as clean... 
*/ + rf_update_component_labels( raidPtrs[unit] ); + } raidunlock(rs); return (0); @@ -593,7 +623,6 @@ raidread(dev, uio, flags) { int unit = raidunit(dev); struct raid_softc *rs; - int result; int part; if (unit >= numraid) @@ -606,13 +635,7 @@ raidread(dev, uio, flags) db1_printf(("raidread: unit: %d partition: %d\n", unit, part)); -#if 0 return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio)); -#endif - result = physio(raidstrategy, NULL, dev, B_READ, minphys, uio); - db1_printf(("raidread done. Result is %d %d\n", - result, uio->uio_resid)); - return (result); } /* ARGSUSED */ @@ -634,7 +657,6 @@ raidwrite(dev, uio, flags) db1_printf(("raidwrite\n")); return (physio(raidstrategy, NULL, dev, B_WRITE, minphys, uio)); - } int @@ -659,16 +681,16 @@ raidioctl(dev, cmd, data, flag, p) RF_Config_t *k_cfg, *u_cfg; u_char *specific_buf; int retcode = 0; - int row; - struct rf_recon_req *rrcopy, *rr; -#if 0 - int nbytes, spl, rw, row; - struct rf_test_acc *ta; - struct buf *bp; - RF_SparetWait_t *waitreq; - struct rf_test_acc *ta_p, *ta_copy; -#endif + int column; + int s; + struct rf_recon_req *rrcopy, *rr; + RF_ComponentLabel_t *component_label; + RF_ComponentLabel_t ci_label; + RF_ComponentLabel_t **c_label_ptr; + RF_SingleComponent_t *sparePtr,*componentPtr; + RF_SingleComponent_t hot_spare; + RF_SingleComponent_t component; if (unit >= numraid) return (ENXIO); @@ -704,6 +726,12 @@ raidioctl(dev, cmd, data, flag, p) case RAIDFRAME_FAIL_DISK: case RAIDFRAME_COPYBACK: case RAIDFRAME_CHECKRECON: + case RAIDFRAME_GET_COMPONENT_LABEL: + case RAIDFRAME_SET_COMPONENT_LABEL: + case RAIDFRAME_ADD_HOT_SPARE: + case RAIDFRAME_REMOVE_HOT_SPARE: + case RAIDFRAME_INIT_LABELS: + case RAIDFRAME_REBUILD_IN_PLACE: if ((rs->sc_flags & RAIDF_INITED) == 0) return (ENXIO); } @@ -771,15 +799,17 @@ raidioctl(dev, cmd, data, flag, p) raidPtrs[unit]->proc = p; /* configure the system */ - rf_pending_testaccs = 0; - raidPtrs[unit]->raidid = unit; + retcode = rf_Configure(raidPtrs[unit], 
k_cfg); + /* allow this many simultaneous IO's to this RAID device */ + raidPtrs[unit]->openings = RAIDOUTSTANDING; if (retcode == 0) { retcode = raidinit(dev, raidPtrs[unit], unit); + rf_markalldirty( raidPtrs[unit] ); } /* free the buffers. No return code here. */ if (k_cfg->layoutSpecificSize) { @@ -789,6 +819,7 @@ raidioctl(dev, cmd, data, flag, p) db3_printf(("rf_ioctl: retcode=%d RAIDFRAME_CONFIGURE\n", retcode)); + return (retcode); /* shutdown the system */ @@ -810,18 +841,12 @@ raidioctl(dev, cmd, data, flag, p) raidunlock(rs); return (EBUSY); } - /* the intention here was to disallow shutdowns while - * raidframe is mounted, but it doesn't work because the - * shutdown ioctl calls rf_open */ - if (rf_pending_testaccs > 0) { - printf("RAIDFRAME: Can't shutdown because there are %d pending test accs\n", - rf_pending_testaccs); - return (EINVAL); - } + if (rf_debugKernelAccess) { printf("call shutdown\n"); } raidPtrs[unit]->proc = p; /* XXX necessary evil */ + retcode = rf_Shutdown(raidPtrs[unit]); db1_printf(("Done main shutdown\n")); @@ -839,18 +864,178 @@ raidioctl(dev, cmd, data, flag, p) return (retcode); + case RAIDFRAME_GET_COMPONENT_LABEL: + c_label_ptr = (RF_ComponentLabel_t **) data; + /* need to read the component label for the disk indicated + by row,column in component_label + XXX need to sanity check these values!!! 
+ */ + + /* For practice, let's get it directly fromdisk, rather + than from the in-core copy */ + RF_Malloc( component_label, sizeof( RF_ComponentLabel_t ), + (RF_ComponentLabel_t *)); + if (component_label == NULL) + return (ENOMEM); + + bzero((char *) component_label, sizeof(RF_ComponentLabel_t)); + + retcode = copyin( *c_label_ptr, component_label, + sizeof(RF_ComponentLabel_t)); + + if (retcode) { + return(retcode); + } + + row = component_label->row; + printf("Row: %d\n",row); + if (row > raidPtrs[unit]->numRow) { + row = 0; /* XXX */ + } + column = component_label->column; + printf("Column: %d\n",column); + if (column > raidPtrs[unit]->numCol) { + column = 0; /* XXX */ + } + + raidread_component_label( + raidPtrs[unit]->Disks[row][column].dev, + raidPtrs[unit]->raid_cinfo[row][column].ci_vp, + component_label ); + + retcode = copyout((caddr_t) component_label, + (caddr_t) *c_label_ptr, + sizeof(RF_ComponentLabel_t)); + RF_Free( component_label, sizeof(RF_ComponentLabel_t)); + return (retcode); + + case RAIDFRAME_SET_COMPONENT_LABEL: + component_label = (RF_ComponentLabel_t *) data; + + /* XXX check the label for valid stuff... */ + /* Note that some things *should not* get modified -- + the user should be re-initing the labels instead of + trying to patch things. 
+ */ + + printf("Got component label:\n"); + printf("Version: %d\n",component_label->version); + printf("Serial Number: %d\n",component_label->serial_number); + printf("Mod counter: %d\n",component_label->mod_counter); + printf("Row: %d\n", component_label->row); + printf("Column: %d\n", component_label->column); + printf("Num Rows: %d\n", component_label->num_rows); + printf("Num Columns: %d\n", component_label->num_columns); + printf("Clean: %d\n", component_label->clean); + printf("Status: %d\n", component_label->status); + + row = component_label->row; + column = component_label->column; + + if ((row < 0) || (row > raidPtrs[unit]->numRow) || + (column < 0) || (column > raidPtrs[unit]->numCol)) { + return(EINVAL); + } + + /* XXX this isn't allowed to do anything for now :-) */ +#if 0 + raidwrite_component_label( + raidPtrs[unit]->Disks[row][column].dev, + raidPtrs[unit]->raid_cinfo[row][column].ci_vp, + component_label ); +#endif + return (0); + + case RAIDFRAME_INIT_LABELS: + component_label = (RF_ComponentLabel_t *) data; + /* + we only want the serial number from + the above. We get all the rest of the information + from the config that was used to create this RAID + set. + */ + + raidPtrs[unit]->serial_number = component_label->serial_number; + /* current version number */ + ci_label.version = RF_COMPONENT_LABEL_VERSION; + ci_label.serial_number = component_label->serial_number; + ci_label.mod_counter = raidPtrs[unit]->mod_counter; + ci_label.num_rows = raidPtrs[unit]->numRow; + ci_label.num_columns = raidPtrs[unit]->numCol; + ci_label.clean = RF_RAID_DIRTY; /* not clean */ + ci_label.status = rf_ds_optimal; /* "It's good!" 
*/ + + for(row=0;row<raidPtrs[unit]->numRow;row++) { + ci_label.row = row; + for(column=0;column<raidPtrs[unit]->numCol;column++) { + ci_label.column = column; + raidwrite_component_label( + raidPtrs[unit]->Disks[row][column].dev, + raidPtrs[unit]->raid_cinfo[row][column].ci_vp, + &ci_label ); + } + } + + return (retcode); + + /* initialize all parity */ case RAIDFRAME_REWRITEPARITY: + + if (raidPtrs[unit]->Layout.map->faultsTolerated == 0) { + /* Parity for RAID 0 is trivially correct */ + raidPtrs[unit]->parity_good = RF_RAID_CLEAN; + return(0); + } + + /* borrow the thread of the requesting process */ + raidPtrs[unit]->proc = p; /* Blah... :-p GO */ + retcode = rf_RewriteParity(raidPtrs[unit]); + /* return I/O Error if the parity rewrite fails */ - if (raidPtrs[unit]->Layout.map->faultsTolerated == 0) - return (EINVAL); /* borrow the thread of the requesting process */ raidPtrs[unit]->proc = p; /* Blah... :-p GO */ retcode = rf_RewriteParity(raidPtrs[unit]); /* return I/O Error if the parity rewrite fails */ - if (retcode) + if (retcode) { retcode = EIO; + } else { + /* set the clean bit! If we shutdown correctly, + the clean bit on each component label will get + set */ + raidPtrs[unit]->parity_good = RF_RAID_CLEAN; + } + return (retcode); + + + case RAIDFRAME_ADD_HOT_SPARE: + sparePtr = (RF_SingleComponent_t *) data; + memcpy( &hot_spare, sparePtr, sizeof(RF_SingleComponent_t)); + printf("Adding spare\n"); + raidPtrs[unit]->proc = p; /* Blah... 
:-p GO */ + retcode = rf_add_hot_spare(raidPtrs[unit], &hot_spare); + return(retcode); + + case RAIDFRAME_REMOVE_HOT_SPARE: + return(retcode); + + case RAIDFRAME_REBUILD_IN_PLACE: + componentPtr = (RF_SingleComponent_t *) data; + memcpy( &component, componentPtr, + sizeof(RF_SingleComponent_t)); + row = component.row; + column = component.column; + printf("Rebuild: %d %d\n",row, column); + if ((row < 0) || (row > raidPtrs[unit]->numRow) || + (column < 0) || (column > raidPtrs[unit]->numCol)) { + return(EINVAL); + } + printf("Attempting a rebuild in place\n"); + s = splbio(); + raidPtrs[unit]->proc = p; /* Blah... :-p GO */ + retcode = rf_ReconstructInPlace(raidPtrs[unit], row, column); + splx(s); return (retcode); /* issue a test-unit-ready through raidframe to the indicated @@ -951,7 +1136,8 @@ raidioctl(dev, cmd, data, flag, p) || rr->col < 0 || rr->col >= raidPtrs[unit]->numCol) return (EINVAL); - printf("Failing the disk: row: %d col: %d\n", rr->row, rr->col); + printf("raid%d: Failing the disk: row: %d col: %d\n", + unit, rr->row, rr->col); /* make a copy of the recon request so that we don't rely on * the user's buffer */ @@ -972,7 +1158,9 @@ raidioctl(dev, cmd, data, flag, p) case RAIDFRAME_COPYBACK: /* borrow the current thread to get this done */ raidPtrs[unit]->proc = p; /* ICK.. but needed :-p GO */ + s = splbio(); rf_CopybackReconstructedData(raidPtrs[unit]); + splx(s); return (0); /* return the percentage completion of reconstruction */ @@ -1143,53 +1331,27 @@ raidinit(dev, raidPtr, unit) sprintf(rs->sc_xname, "raid%d", unit); /* XXX doesn't check bounds. */ rs->sc_dkdev.dk_name = rs->sc_xname; + /* disk_attach actually creates space for the CPU disklabel, among * other things, so it's critical to call this *BEFORE* we try putzing * with disklabels. */ + disk_attach(&rs->sc_dkdev); /* XXX There may be a weird interaction here between this, and * protectedSectors, as used in RAIDframe. 
*/ + rs->sc_size = raidPtr->totalSectors; rs->sc_dev = dev; + return (retcode); } - -/********************************************************* - * - * initialization code called at boot time (startup.c) - * - ********************************************************/ -int -rf_boot() -{ - int i, rc; - - rc = rf_mutex_init(&rf_sparet_wait_mutex); - if (rc) { - RF_PANIC(); - } - rc = rf_mutex_init(&rf_async_done_q_mutex); - if (rc) { - RF_PANIC(); - } - rf_sparet_wait_queue = rf_sparet_resp_queue = NULL; - recon_queue = NULL; - rf_async_done_qh = rf_async_done_qt = NULL; - for (i = 0; i < numraid; i++) - raidPtrs[i] = NULL; - rc = rf_BootRaidframe(); - if (rc == 0) - printf("Kernelized RAIDframe activated\n"); - else - rf_kbooted = RFK_BOOT_BAD; - return (rc); -} /* * This kernel thread never exits. It is created once, and persists * until the system reboots. */ + void rf_ReconKernelThread() { @@ -1205,7 +1367,8 @@ rf_ReconKernelThread() LOCK_RECON_Q_MUTEX(); while (!recon_queue) { UNLOCK_RECON_Q_MUTEX(); - tsleep(&recon_queue, PRIBIO | PCATCH, "raidframe recon", 0); + tsleep(&recon_queue, PRIBIO, + "raidframe recon", 0); LOCK_RECON_Q_MUTEX(); } req = recon_queue; @@ -1214,7 +1377,8 @@ rf_ReconKernelThread() /* * If flags specifies that we should start recon, this call - * will not return until reconstruction completes, fails, or is aborted. + * will not return until reconstruction completes, fails, + * or is aborted. */ rf_FailDisk((RF_Raid_t *) req->raidPtr, req->row, req->col, ((req->flags & RF_FDFLAGS_RECON) ? 
1 : 0)); @@ -1225,8 +1389,8 @@ rf_ReconKernelThread() /* wake up the daemon & tell it to get us a spare table * XXX * the entries in the queues should be tagged with the raidPtr - * so that in the extremely rare case that two recons happen at once, we know for - * which device were requesting a spare table + * so that in the extremely rare case that two recons happen at once, + * we know for which device were requesting a spare table * XXX */ int @@ -1242,10 +1406,12 @@ rf_GetSpareTableFromDaemon(req) /* mpsleep unlocks the mutex */ while (!rf_sparet_resp_queue) { - tsleep(&rf_sparet_resp_queue, PRIBIO | PCATCH, + tsleep(&rf_sparet_resp_queue, PRIBIO, "raidframe getsparetable", 0); #if 0 - mpsleep(&rf_sparet_resp_queue, PZERO, "sparet resp", 0, (void *) simple_lock_addr(rf_sparet_wait_mutex), MS_LOCK_SIMPLE); + mpsleep(&rf_sparet_resp_queue, PZERO, "sparet resp", 0, + (void *) simple_lock_addr(rf_sparet_wait_mutex), + MS_LOCK_SIMPLE); #endif } req = rf_sparet_resp_queue; @@ -1257,7 +1423,8 @@ rf_GetSpareTableFromDaemon(req) * alloc'd */ return (retcode); } -/* a wrapper around rf_DoAccess that extracts appropriate info from the bp & passes it down. +/* a wrapper around rf_DoAccess that extracts appropriate info from the + * bp & passes it down. * any calls originating in the kernel must use non-blocking I/O * do some extra sanity checking to return "appropriate" error values for * certain conditions (to make some standard utilities work) @@ -1334,12 +1501,24 @@ rf_DoAccessKernel(raidPtr, bp, flags, cbFunc, cbArg) } db1_printf(("Calling DoAccess..\n")); + + /* Put a throttle on the number of requests we handle simultanously */ + + RF_LOCK_MUTEX(raidPtr->mutex); + + while(raidPtr->openings <= 0) { + RF_UNLOCK_MUTEX(raidPtr->mutex); + (void)tsleep(&raidPtr->openings, PRIBIO, "rfdwait", 0); + RF_LOCK_MUTEX(raidPtr->mutex); + } + raidPtr->openings--; + + RF_UNLOCK_MUTEX(raidPtr->mutex); + /* - * XXX For now, all writes are sync + * Everything is async. 
*/ do_async = 1; - if ((bp->b_flags & B_READ) == 0) - do_async = 0; /* don't ever condition on bp->b_flags & B_WRITE. always condition on * B_READ instead */ @@ -1354,12 +1533,6 @@ rf_DoAccessKernel(raidPtr, bp, flags, cbFunc, cbArg) bp->b_data, (int) bp->b_resid)); #endif - /* - * If we requested sync I/O, sleep here. - */ - if ((retcode == 0) && (do_async == 0)) - tsleep(bp, PRIBIO, "raidsyncio", 0); - return (retcode); } /* invoke an I/O from kernel mode. Disk queue should be locked upon entry */ @@ -1394,7 +1567,7 @@ rf_DispatchKernelIO(queue, req) disk_busy(&rs->sc_dkdev); bp = req->bp; - +#if 1 /* XXX when there is a physical disk failure, someone is passing us a * buffer that contains old stuff!! Attempt to deal with this problem * without taking a performance hit... (not sure where the real bug @@ -1406,6 +1579,7 @@ rf_DispatchKernelIO(queue, req) if (bp->b_error != 0) { bp->b_error = 0; } +#endif raidbp = RAIDGETBUF(rs); raidbp->rf_flags = 0; /* XXX not really used anywhere... */ @@ -1549,6 +1723,7 @@ KernelWakeupFunc(vbp) rf_ds_failed; queue->raidPtr->status[queue->row] = rf_rs_degraded; queue->raidPtr->numFailures++; + /* XXX here we should bump the version number for each component, and write that data out */ } else { /* Disk is already dead... 
*/ /* printf("Disk already marked as dead!\n"); */ } @@ -1720,14 +1895,14 @@ raidgetdisklabel(dev) if (lp->d_secperunit != rs->sc_size) printf("WARNING: %s: " "total sector size in disklabel (%d) != " - "the size of raid (%d)\n", rs->sc_xname, - lp->d_secperunit, rs->sc_size); + "the size of raid (%ld)\n", rs->sc_xname, + lp->d_secperunit, (long) rs->sc_size); for (i = 0; i < lp->d_npartitions; i++) { pp = &lp->d_partitions[i]; if (pp->p_offset + pp->p_size > rs->sc_size) printf("WARNING: %s: end of partition `%c' " - "exceeds the size of raid (%d)\n", - rs->sc_xname, 'a' + i, rs->sc_size); + "exceeds the size of raid (%ld)\n", + rs->sc_xname, 'a' + i, (long) rs->sc_size); } } @@ -1835,3 +2010,328 @@ raidunlock(rs) wakeup(rs); } } + + +#define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */ +#define RF_COMPONENT_INFO_SIZE 1024 /* bytes */ + +int +raidmarkclean(dev_t dev, struct vnode *b_vp, int mod_counter) +{ + RF_ComponentLabel_t component_label; + raidread_component_label(dev, b_vp, &component_label); + component_label.mod_counter = mod_counter; + component_label.clean = RF_RAID_CLEAN; + raidwrite_component_label(dev, b_vp, &component_label); + return(0); +} + + +int +raidmarkdirty(dev_t dev, struct vnode *b_vp, int mod_counter) +{ + RF_ComponentLabel_t component_label; + raidread_component_label(dev, b_vp, &component_label); + component_label.mod_counter = mod_counter; + component_label.clean = RF_RAID_DIRTY; + raidwrite_component_label(dev, b_vp, &component_label); + return(0); +} + +/* ARGSUSED */ +int +raidread_component_label(dev, b_vp, component_label) + dev_t dev; + struct vnode *b_vp; + RF_ComponentLabel_t *component_label; +{ + struct buf *bp; + int error; + + /* XXX should probably ensure that we don't try to do this if + someone has changed rf_protected_sectors. */ + + /* get a block of the appropriate size... 
*/ + bp = geteblk((int)RF_COMPONENT_INFO_SIZE); + bp->b_dev = dev; + + /* get our ducks in a row for the read */ + bp->b_blkno = RF_COMPONENT_INFO_OFFSET / DEV_BSIZE; + bp->b_bcount = RF_COMPONENT_INFO_SIZE; + bp->b_flags = B_BUSY | B_READ; + bp->b_resid = RF_COMPONENT_INFO_SIZE / DEV_BSIZE; + + (*bdevsw[major(bp->b_dev)].d_strategy)(bp); + + error = biowait(bp); + + if (!error) { + memcpy(component_label, bp->b_un.b_addr, + sizeof(RF_ComponentLabel_t)); +#if 0 + printf("raidread_component_label: got component label:\n"); + printf("Version: %d\n",component_label->version); + printf("Serial Number: %d\n",component_label->serial_number); + printf("Mod counter: %d\n",component_label->mod_counter); + printf("Row: %d\n", component_label->row); + printf("Column: %d\n", component_label->column); + printf("Num Rows: %d\n", component_label->num_rows); + printf("Num Columns: %d\n", component_label->num_columns); + printf("Clean: %d\n", component_label->clean); + printf("Status: %d\n", component_label->status); +#endif + } else { + printf("Failed to read RAID component label!\n"); + } + + bp->b_flags = B_INVAL | B_AGE; + brelse(bp); + return(error); +} +/* ARGSUSED */ +int +raidwrite_component_label(dev, b_vp, component_label) + dev_t dev; + struct vnode *b_vp; + RF_ComponentLabel_t *component_label; +{ + struct buf *bp; + int error; + + /* get a block of the appropriate size... 
*/ + bp = geteblk((int)RF_COMPONENT_INFO_SIZE); + bp->b_dev = dev; + + /* get our ducks in a row for the write */ + bp->b_blkno = RF_COMPONENT_INFO_OFFSET / DEV_BSIZE; + bp->b_bcount = RF_COMPONENT_INFO_SIZE; + bp->b_flags = B_BUSY | B_WRITE; + bp->b_resid = RF_COMPONENT_INFO_SIZE / DEV_BSIZE; + + memset( bp->b_un.b_addr, 0, RF_COMPONENT_INFO_SIZE ); + + memcpy( bp->b_un.b_addr, component_label, sizeof(RF_ComponentLabel_t)); + + (*bdevsw[major(bp->b_dev)].d_strategy)(bp); + error = biowait(bp); + bp->b_flags = B_INVAL | B_AGE; + brelse(bp); + if (error) { + printf("Failed to write RAID component info!\n"); + } + + return(error); +} + +void +rf_markalldirty( raidPtr ) + RF_Raid_t *raidPtr; +{ + RF_ComponentLabel_t c_label; + int r,c; + + raidPtr->mod_counter++; + for (r = 0; r < raidPtr->numRow; r++) { + for (c = 0; c < raidPtr->numCol; c++) { + if (raidPtr->Disks[r][c].status != rf_ds_failed) { + raidread_component_label( + raidPtr->Disks[r][c].dev, + raidPtr->raid_cinfo[r][c].ci_vp, + &c_label); + if (c_label.status == rf_ds_spared) { + /* XXX do something special... + but whatever you do, don't + try to access it!! */ + } else { +#if 0 + c_label.status = + raidPtr->Disks[r][c].status; + raidwrite_component_label( + raidPtr->Disks[r][c].dev, + raidPtr->raid_cinfo[r][c].ci_vp, + &c_label); +#endif + raidmarkdirty( + raidPtr->Disks[r][c].dev, + raidPtr->raid_cinfo[r][c].ci_vp, + raidPtr->mod_counter); + } + } + } + } + /* printf("Component labels marked dirty.\n"); */ +#if 0 + for( c = 0; c < raidPtr->numSpare ; c++) { + sparecol = raidPtr->numCol + c; + if (raidPtr->Disks[r][sparecol].status == rf_ds_used_spare) { + /* + + XXX this is where we get fancy and map this spare + into it's correct spot in the array. + + */ + /* + + we claim this disk is "optimal" if it's + rf_ds_used_spare, as that means it should be + directly substitutable for the disk it replaced. + We note that too... 
+ + */ + + for(i=0;i<raidPtr->numRow;i++) { + for(j=0;j<raidPtr->numCol;j++) { + if ((raidPtr->Disks[i][j].spareRow == + r) && + (raidPtr->Disks[i][j].spareCol == + sparecol)) { + srow = r; + scol = sparecol; + break; + } + } + } + + raidread_component_label( + raidPtr->Disks[r][sparecol].dev, + raidPtr->raid_cinfo[r][sparecol].ci_vp, + &c_label); + /* make sure status is noted */ + c_label.version = RF_COMPONENT_LABEL_VERSION; + c_label.mod_counter = raidPtr->mod_counter; + c_label.serial_number = raidPtr->serial_number; + c_label.row = srow; + c_label.column = scol; + c_label.num_rows = raidPtr->numRow; + c_label.num_columns = raidPtr->numCol; + c_label.clean = RF_RAID_DIRTY; /* changed in a bit*/ + c_label.status = rf_ds_optimal; + raidwrite_component_label( + raidPtr->Disks[r][sparecol].dev, + raidPtr->raid_cinfo[r][sparecol].ci_vp, + &c_label); + raidmarkclean( raidPtr->Disks[r][sparecol].dev, + raidPtr->raid_cinfo[r][sparecol].ci_vp); + } + } + +#endif +} + + +void +rf_update_component_labels( raidPtr ) + RF_Raid_t *raidPtr; +{ + RF_ComponentLabel_t c_label; + int sparecol; + int r,c; + int i,j; + int srow, scol; + + srow = -1; + scol = -1; + + /* XXX should do extra checks to make sure things really are clean, + rather than blindly setting the clean bit... */ + + raidPtr->mod_counter++; + + for (r = 0; r < raidPtr->numRow; r++) { + for (c = 0; c < raidPtr->numCol; c++) { + if (raidPtr->Disks[r][c].status == rf_ds_optimal) { + raidread_component_label( + raidPtr->Disks[r][c].dev, + raidPtr->raid_cinfo[r][c].ci_vp, + &c_label); + /* make sure status is noted */ + c_label.status = rf_ds_optimal; + raidwrite_component_label( + raidPtr->Disks[r][c].dev, + raidPtr->raid_cinfo[r][c].ci_vp, + &c_label); + if (raidPtr->parity_good == RF_RAID_CLEAN) { + raidmarkclean( + raidPtr->Disks[r][c].dev, + raidPtr->raid_cinfo[r][c].ci_vp, + raidPtr->mod_counter); + } + } + /* else we don't touch it.. 
*/ +#if 0 + else if (raidPtr->Disks[r][c].status != + rf_ds_failed) { + raidread_component_label( + raidPtr->Disks[r][c].dev, + raidPtr->raid_cinfo[r][c].ci_vp, + &c_label); + /* make sure status is noted */ + c_label.status = + raidPtr->Disks[r][c].status; + raidwrite_component_label( + raidPtr->Disks[r][c].dev, + raidPtr->raid_cinfo[r][c].ci_vp, + &c_label); + if (raidPtr->parity_good == RF_RAID_CLEAN) { + raidmarkclean( + raidPtr->Disks[r][c].dev, + raidPtr->raid_cinfo[r][c].ci_vp, + raidPtr->mod_counter); + } + } +#endif + } + } + + for( c = 0; c < raidPtr->numSpare ; c++) { + sparecol = raidPtr->numCol + c; + if (raidPtr->Disks[0][sparecol].status == rf_ds_used_spare) { + /* + + we claim this disk is "optimal" if it's + rf_ds_used_spare, as that means it should be + directly substitutable for the disk it replaced. + We note that too... + + */ + + for(i=0;i<raidPtr->numRow;i++) { + for(j=0;j<raidPtr->numCol;j++) { + if ((raidPtr->Disks[i][j].spareRow == + 0) && + (raidPtr->Disks[i][j].spareCol == + sparecol)) { + srow = i; + scol = j; + break; + } + } + } + + raidread_component_label( + raidPtr->Disks[0][sparecol].dev, + raidPtr->raid_cinfo[0][sparecol].ci_vp, + &c_label); + /* make sure status is noted */ + c_label.version = RF_COMPONENT_LABEL_VERSION; + c_label.mod_counter = raidPtr->mod_counter; + c_label.serial_number = raidPtr->serial_number; + c_label.row = srow; + c_label.column = scol; + c_label.num_rows = raidPtr->numRow; + c_label.num_columns = raidPtr->numCol; + c_label.clean = RF_RAID_DIRTY; /* changed in a bit*/ + c_label.status = rf_ds_optimal; + raidwrite_component_label( + raidPtr->Disks[0][sparecol].dev, + raidPtr->raid_cinfo[0][sparecol].ci_vp, + &c_label); + if (raidPtr->parity_good == RF_RAID_CLEAN) { + raidmarkclean( raidPtr->Disks[0][sparecol].dev, + raidPtr->raid_cinfo[0][sparecol].ci_vp, + raidPtr->mod_counter); + } + } + } + /* printf("Component labels updated\n"); */ +} diff --git a/sys/dev/raidframe/rf_openbsd.h 
b/sys/dev/raidframe/rf_openbsd.h index 6909775613e..59feb987ed3 100644 --- a/sys/dev/raidframe/rf_openbsd.h +++ b/sys/dev/raidframe/rf_openbsd.h @@ -1,4 +1,4 @@ -/* $OpenBSD: rf_openbsd.h,v 1.2 1999/02/16 00:03:01 niklas Exp $ */ +/* $OpenBSD: rf_openbsd.h,v 1.3 1999/07/30 14:45:32 peter Exp $ */ /*- * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc. @@ -81,10 +81,49 @@ #include <sys/systm.h> #include <sys/namei.h> #include <sys/vnode.h> +#endif /* _KERNEL */ + +/* The per-component label information that the user can set */ +typedef struct RF_ComponentInfo_s { + int row; /* the row number of this component */ + int column; /* the column number of this component */ + int serial_number; /* a user-specified serial number for this + RAID set */ +} RF_ComponentInfo_t; +/* The per-component label information */ +typedef struct RF_ComponentLabel_s { + int version; /* The version of this label. */ + int serial_number; /* a user-specified serial number for this + RAID set */ + int mod_counter; /* modification counter. Changed (usually + by incrementing) every time the label + is changed */ + int row; /* the row number of this component */ + int column; /* the column number of this component */ + int num_rows; /* number of rows in this RAID set */ + int num_columns; /* number of columns in this RAID set */ + int clean; /* 1 when clean, 0 when dirty */ + int status; /* rf_ds_optimal, rf_ds_dist_spared, whatever. */ +} RF_ComponentLabel_t; + +typedef struct RF_SingleComponent_s { + int row; + int column; + char component_name[50]; /* name of the component */ +} RF_SingleComponent_t; + +#ifdef _KERNEL + +/* XXX this is *not* the place for these... 
*/ +int rf_add_hot_spare(RF_Raid_t *raidPtr, RF_SingleComponent_t *sparePtr); +int rf_remove_hot_spare(RF_Raid_t *raidPtr, RF_SingleComponent_t *sparePtr); + + struct raidcinfo { - struct vnode *ci_vp; /* device's vnode */ - dev_t ci_dev; /* XXX: device's dev_t */ + struct vnode *ci_vp; /* component device's vnode */ + dev_t ci_dev; /* component device's dev_t */ + RF_ComponentLabel_t ci_label; /* components RAIDframe label */ #if 0 size_t ci_size; /* size */ char *ci_path; /* path to component */ diff --git a/sys/dev/raidframe/rf_openbsdkintf.c b/sys/dev/raidframe/rf_openbsdkintf.c index 64dd4ed1759..7cd373ae592 100644 --- a/sys/dev/raidframe/rf_openbsdkintf.c +++ b/sys/dev/raidframe/rf_openbsdkintf.c @@ -1,4 +1,4 @@ -/* $OpenBSD: rf_openbsdkintf.c,v 1.2 1999/02/16 00:03:01 niklas Exp $ */ +/* $OpenBSD: rf_openbsdkintf.c,v 1.3 1999/07/30 14:45:32 peter Exp $ */ /*- * Copyright (c) 1996, 1997, 1998 The NetBSD Foundation, Inc. @@ -174,17 +174,13 @@ static int rf_kbooted = RFK_BOOT_NONE; static RF_Raid_t **raidPtrs; /* global raid device descriptors */ -static int rf_pending_testaccs; - RF_DECLARE_STATIC_MUTEX(rf_sparet_wait_mutex) -RF_DECLARE_STATIC_MUTEX(rf_async_done_q_mutex) /* requests to install a spare table */ static RF_SparetWait_t *rf_sparet_wait_queue; /* responses from installation process */ static RF_SparetWait_t *rf_sparet_resp_queue; -static struct rf_test_acc *rf_async_done_qh, *rf_async_done_qt; /* used to communicate reconstruction requests */ static struct rf_recon_req *recon_queue = NULL; @@ -200,10 +196,17 @@ void rf_InitBP __P((struct buf *, struct vnode *, unsigned, dev_t, RF_SectorNum_t, RF_SectorCount_t, caddr_t, void (*)(struct buf *), void *, int, struct proc *)); -/* this is so that we can compile under 2.0 as well as 3.2 */ -#ifndef proc_to_task -#define proc_to_task(x) ((x)->task) -#endif /* !proc_to_task */ +#define Dprintf0(s) if (rf_queueDebug) \ + rf_debug_printf(s,NULL,NULL,NULL,NULL,NULL,NULL,NULL,NULL) +#define Dprintf1(s,a) if 
(rf_queueDebug) \ + rf_debug_printf(s,a,NULL,NULL,NULL,NULL,NULL,NULL,NULL) +#define Dprintf2(s,a,b) if (rf_queueDebug) \ + rf_debug_printf(s,a,b,NULL,NULL,NULL,NULL,NULL,NULL) +#define Dprintf3(s,a,b,c) if (rf_queueDebug) \ + rf_debug_printf(s,a,b,c,NULL,NULL,NULL,NULL,NULL) + +int raidmarkclean(dev_t dev, struct vnode *b_vp, int); +int raidmarkdirty(dev_t dev, struct vnode *b_vp, int); void raidattach __P((int)); int raidsize __P((dev_t)); @@ -220,6 +223,10 @@ int raidread __P((dev_t, struct uio *, int)); void raidstrategy __P((struct buf *)); int raiddump __P((dev_t, daddr_t, caddr_t, size_t)); +int raidwrite_component_label(dev_t, struct vnode *, RF_ComponentLabel_t *); +int raidread_component_label(dev_t, struct vnode *, RF_ComponentLabel_t *); +void rf_update_component_labels( RF_Raid_t *); + /* * Pilfered from ccd.c */ @@ -238,7 +245,6 @@ struct raidbuf { * or if it should be used in conjunction with that... */ struct raid_softc { - int sc_unit; /* logical unit number */ int sc_flags; /* flags */ int sc_cflags; /* configuration flags */ size_t sc_size; /* size of the raid device */ @@ -257,6 +263,16 @@ struct raid_softc { #define raidunit(x) DISKUNIT(x) static int numraid = 0; +/* + * Allow RAIDOUTSTANDING number of simultaneous IO's to this RAID device. + * Be aware that large numbers can allow the driver to consume a lot of + * kernel memory, especially on writes... 
+ */ + +#ifndef RAIDOUTSTANDING +#define RAIDOUTSTANDING 10 +#endif + #define RAIDLABELDEV(dev) \ (MAKEDISKDEV(major((dev)), raidunit((dev)), RAW_PART)) @@ -272,12 +288,14 @@ int raidlock __P((struct raid_softc *)); void raidunlock __P((struct raid_softc *)); int raidlookup __P((char *, struct proc *p, struct vnode **)); +static void rf_markalldirty __P((RF_Raid_t *)); void raidattach(num) int num; { int raidID; + int i, rc; db1_printf(("raidattach: Asked for %d units\n", num)); @@ -296,10 +314,21 @@ raidattach(num) panic("raidPtrs is NULL!!\n"); } - rf_kbooted = rf_boot(); - if (rf_kbooted) { - panic("Serious error booting RAID!!\n"); - } + rc = rf_mutex_init(&rf_sparet_wait_mutex); + if (rc) { + RF_PANIC(); + } + + rf_sparet_wait_queue = rf_sparet_resp_queue = NULL; + recon_queue = NULL; + + for (i = 0; i < numraid; i++) + raidPtrs[i] = NULL; + rc = rf_BootRaidframe(); + if (rc == 0) + printf("Kernelized RAIDframe activated\n"); + else + panic("Serious error booting RAID!!\n"); rf_kbooted = RFK_BOOT_GOOD; @@ -384,25 +413,8 @@ raidopen(dev, flags, fmt, p) struct raid_softc *rs; struct disklabel *lp; int part,pmask; - unsigned int raidID; - int rc; int error = 0; - /* - * XXX This whole next chunk of code is somewhat suspect... Not sure - * it's needed here at all. 
- */ - if (rf_kbooted == RFK_BOOT_NONE) { - printf("Doing restart on raidopen.\n"); - rf_kbooted = RFK_BOOT_GOOD; - rc = rf_boot(); - if (rc) { - rf_kbooted = RFK_BOOT_BAD; - printf("Someone is unhappy...\n"); - return (rc); - } - } - if (unit >= numraid) return (ENXIO); rs = &raid_softc[unit]; @@ -411,13 +423,11 @@ raidopen(dev, flags, fmt, p) return (error); lp = rs->sc_dkdev.dk_label; - raidID = raidunit(dev); - part = DISKPART(dev); pmask = (1 << part); db1_printf( - ("Opening raid device number: %d partition: %d\n", raidID, part)); + ("Opening raid device number: %d partition: %d\n", unit, part)); if ((rs->sc_flags & RAIDF_INITED) && (rs->sc_dkdev.dk_openmask == 0)) @@ -447,6 +457,20 @@ raidopen(dev, flags, fmt, p) rs->sc_dkdev.dk_bopenmask |= pmask; break; } + + if ((rs->sc_dkdev.dk_openmask == 0) && + ((rs->sc_flags & RAIDF_INITED) != 0)) { + /* First one... mark things as dirty... Note that we *MUST* + have done a configure before this. I DO NOT WANT TO BE + SCRIBBLING TO RANDOM COMPONENTS UNTIL IT'S BEEN DETERMINED + THAT THEY BELONG TOGETHER!!!!! */ + /* XXX should check to see if we're only open for reading + here... If so, we needn't do this, but then need some + other way of keeping track of what's happened.. */ + + rf_markalldirty( raidPtrs[unit] ); + } + rs->sc_dkdev.dk_openmask = rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask; @@ -486,6 +510,15 @@ raidclose(dev, flags, fmt, p) rs->sc_dkdev.dk_bopenmask &= ~(1 << part); break; } + if ((rs->sc_dkdev.dk_openmask == 0) && + ((rs->sc_flags & RAIDF_INITED) != 0)) { + /* Last one... device is not unconfigured yet. + Device shutdown has taken care of setting the + clean bits if RAIDF_INITED is not set + mark things as clean... 
*/ + rf_update_component_labels( raidPtrs[unit] ); + } + rs->sc_dkdev.dk_openmask = rs->sc_dkdev.dk_copenmask | rs->sc_dkdev.dk_bopenmask; @@ -580,7 +613,6 @@ raidread(dev, uio, flags) { int unit = raidunit(dev); struct raid_softc *rs; - int result; int part; if (unit >= numraid) @@ -593,13 +625,7 @@ raidread(dev, uio, flags) db1_printf(("raidread: unit: %d partition: %d\n", unit, part)); -#if 0 return (physio(raidstrategy, NULL, dev, B_READ, minphys, uio)); -#endif - result = physio(raidstrategy, NULL, dev, B_READ, minphys, uio); - db1_printf(("raidread done. Result is %d %d\n", result, - uio->uio_resid)); - return (result); } /* ARGSUSED */ @@ -644,16 +670,16 @@ raidioctl(dev, cmd, data, flag, p) RF_Config_t *k_cfg, *u_cfg; u_char *specific_buf; int retcode = 0; - int row; + int column; + int s; struct rf_recon_req *rrcopy, *rr; -#if 0 - int nbytes, spl, rw, row; - struct rf_test_acc *ta; - struct buf *bp; - RF_SparetWait_t *waitreq; - struct rf_test_acc *ta_p, *ta_copy; -#endif + RF_ComponentLabel_t *component_label; + RF_ComponentLabel_t ci_label; + RF_ComponentLabel_t **c_label_ptr; + RF_SingleComponent_t *sparePtr,*componentPtr; + RF_SingleComponent_t hot_spare; + RF_SingleComponent_t component; if (unit >= numraid) return (ENXIO); @@ -688,6 +714,12 @@ raidioctl(dev, cmd, data, flag, p) case RAIDFRAME_FAIL_DISK: case RAIDFRAME_COPYBACK: case RAIDFRAME_CHECKRECON: + case RAIDFRAME_GET_COMPONENT_LABEL: + case RAIDFRAME_SET_COMPONENT_LABEL: + case RAIDFRAME_ADD_HOT_SPARE: + case RAIDFRAME_REMOVE_HOT_SPARE: + case RAIDFRAME_INIT_LABELS: + case RAIDFRAME_REBUILD_IN_PLACE: if ((rs->sc_flags & RAIDF_INITED) == 0) return (ENXIO); } @@ -762,13 +794,16 @@ raidioctl(dev, cmd, data, flag, p) raidPtrs[unit]->proc = p; /* configure the system */ - rf_pending_testaccs = 0; - raidPtrs[unit]->raidid = unit; + retcode = rf_Configure(raidPtrs[unit], k_cfg); + /* allow this many simultaneous IO's to this RAID device */ + raidPtrs[unit]->openings = RAIDOUTSTANDING; + if (retcode 
== 0) { retcode = raidinit(dev, raidPtrs[unit],unit); + rf_markalldirty( raidPtrs[unit] ); } /* Free the buffers. No return code here. */ @@ -779,6 +814,7 @@ raidioctl(dev, cmd, data, flag, p) db3_printf(("rf_ioctl: retcode=%d RAIDFRAME_CONFIGURE\n", retcode)); + return (retcode); case RAIDFRAME_SHUTDOWN: @@ -800,22 +836,12 @@ raidioctl(dev, cmd, data, flag, p) raidunlock(rs); return (EBUSY); } - - /* - * The intention here was to disallow shutdowns while - * raidframe is mounted, but it doesn't work because the - * shutdown ioctl calls rf_open. - */ - if (rf_pending_testaccs > 0) { - printf("RAIDFRAME: Can't shutdown because there are " - "%d pending test accs\n", - rf_pending_testaccs); - return (EINVAL); - } + if (rf_debugKernelAccess) { printf("call shutdown\n"); } raidPtrs[unit]->proc = p; /* XXX Necessary evil */ + retcode = rf_Shutdown(raidPtrs[unit]); db1_printf(("Done main shutdown\n")); @@ -830,18 +856,172 @@ raidioctl(dev, cmd, data, flag, p) return (retcode); + case RAIDFRAME_GET_COMPONENT_LABEL: + c_label_ptr = (RF_ComponentLabel_t **) data; + /* need to read the component label for the disk indicated + by row,column in component_label + XXX need to sanity check these values!!! 
+ */ + + /* For practice, let's get it directly fromdisk, rather + than from the in-core copy */ + RF_Malloc( component_label, sizeof( RF_ComponentLabel_t ), + (RF_ComponentLabel_t *)); + if (component_label == NULL) + return (ENOMEM); + + bzero((char *) component_label, sizeof(RF_ComponentLabel_t)); + + retcode = copyin( *c_label_ptr, component_label, + sizeof(RF_ComponentLabel_t)); + + if (retcode) { + return(retcode); + } + + row = component_label->row; + printf("Row: %d\n",row); + if (row > raidPtrs[unit]->numRow) { + row = 0; /* XXX */ + } + column = component_label->column; + printf("Column: %d\n",column); + if (column > raidPtrs[unit]->numCol) { + column = 0; /* XXX */ + } + + raidread_component_label( + raidPtrs[unit]->Disks[row][column].dev, + raidPtrs[unit]->raid_cinfo[row][column].ci_vp, + component_label ); + + retcode = copyout((caddr_t) component_label, + (caddr_t) *c_label_ptr, + sizeof(RF_ComponentLabel_t)); + RF_Free( component_label, sizeof(RF_ComponentLabel_t)); + return (retcode); + + case RAIDFRAME_SET_COMPONENT_LABEL: + component_label = (RF_ComponentLabel_t *) data; + + /* XXX check the label for valid stuff... */ + /* Note that some things *should not* get modified -- + the user should be re-initing the labels instead of + trying to patch things. 
+ */ + + printf("Got component label:\n"); + printf("Version: %d\n",component_label->version); + printf("Serial Number: %d\n",component_label->serial_number); + printf("Mod counter: %d\n",component_label->mod_counter); + printf("Row: %d\n", component_label->row); + printf("Column: %d\n", component_label->column); + printf("Num Rows: %d\n", component_label->num_rows); + printf("Num Columns: %d\n", component_label->num_columns); + printf("Clean: %d\n", component_label->clean); + printf("Status: %d\n", component_label->status); + + row = component_label->row; + column = component_label->column; + + if ((row < 0) || (row > raidPtrs[unit]->numRow) || + (column < 0) || (column > raidPtrs[unit]->numCol)) { + return(EINVAL); + } + + /* XXX this isn't allowed to do anything for now :-) */ +#if 0 + raidwrite_component_label( + raidPtrs[unit]->Disks[row][column].dev, + raidPtrs[unit]->raid_cinfo[row][column].ci_vp, + component_label ); +#endif + return (0); + + case RAIDFRAME_INIT_LABELS: + component_label = (RF_ComponentLabel_t *) data; + /* + we only want the serial number from + the above. We get all the rest of the information + from the config that was used to create this RAID + set. + */ + + raidPtrs[unit]->serial_number = component_label->serial_number; + /* current version number */ + ci_label.version = RF_COMPONENT_LABEL_VERSION; + ci_label.serial_number = component_label->serial_number; + ci_label.mod_counter = raidPtrs[unit]->mod_counter; + ci_label.num_rows = raidPtrs[unit]->numRow; + ci_label.num_columns = raidPtrs[unit]->numCol; + ci_label.clean = RF_RAID_DIRTY; /* not clean */ + ci_label.status = rf_ds_optimal; /* "It's good!" 
*/ + + for(row=0;row<raidPtrs[unit]->numRow;row++) { + ci_label.row = row; + for(column=0;column<raidPtrs[unit]->numCol;column++) { + ci_label.column = column; + raidwrite_component_label( + raidPtrs[unit]->Disks[row][column].dev, + raidPtrs[unit]->raid_cinfo[row][column].ci_vp, + &ci_label ); + } + } + + return (retcode); + case RAIDFRAME_REWRITEPARITY: - /* initialize all parity */ - if (raidPtrs[unit]->Layout.map->faultsTolerated == 0) - return (EINVAL); + if (raidPtrs[unit]->Layout.map->faultsTolerated == 0) { + /* Parity for RAID 0 is trivially correct */ + raidPtrs[unit]->parity_good = RF_RAID_CLEAN; + return(0); + } + /* borrow the thread of the requesting process */ raidPtrs[unit]->proc = p; /* Blah... :-p GO */ retcode = rf_RewriteParity(raidPtrs[unit]); /* return I/O Error if the parity rewrite fails */ - if (retcode) + if (retcode) { retcode = EIO; + } else { + /* set the clean bit! If we shutdown correctly, + the clean bit on each component label will get + set */ + raidPtrs[unit]->parity_good = RF_RAID_CLEAN; + } + return (retcode); + + + case RAIDFRAME_ADD_HOT_SPARE: + sparePtr = (RF_SingleComponent_t *) data; + memcpy( &hot_spare, sparePtr, sizeof(RF_SingleComponent_t)); + printf("Adding spare\n"); + raidPtrs[unit]->proc = p; /* Blah... :-p GO */ + retcode = rf_add_hot_spare(raidPtrs[unit], &hot_spare); + return(retcode); + + case RAIDFRAME_REMOVE_HOT_SPARE: + return(retcode); + + case RAIDFRAME_REBUILD_IN_PLACE: + componentPtr = (RF_SingleComponent_t *) data; + memcpy( &component, componentPtr, + sizeof(RF_SingleComponent_t)); + row = component.row; + column = component.column; + printf("Rebuild: %d %d\n",row, column); + if ((row < 0) || (row > raidPtrs[unit]->numRow) || + (column < 0) || (column > raidPtrs[unit]->numCol)) { + return(EINVAL); + } + printf("Attempting a rebuild in place\n"); + s = splbio(); + raidPtrs[unit]->proc = p; /* Blah... 
:-p GO */ + retcode = rf_ReconstructInPlace(raidPtrs[unit], row, column); + splx(s); + return (retcode); #if 0 /* XXX not supported yet (ever?) */ @@ -946,7 +1126,8 @@ raidioctl(dev, cmd, data, flag, p) rr->col < 0 || rr->col >= raidPtrs[unit]->numCol) return (EINVAL); - printf("Failing the disk: row: %d col: %d\n",rr->row,rr->col); + printf("raid%d: Failing the disk: row: %d col: %d\n", + unit, rr->row, rr->col); /* * Make a copy of the recon request so that we don't @@ -971,7 +1152,9 @@ raidioctl(dev, cmd, data, flag, p) case RAIDFRAME_COPYBACK: /* Borrow the current thread to get this done */ raidPtrs[unit]->proc = p; /* ICK.. but needed :-p GO */ - rf_CopybackReconstructedData(raidPtrs[unit]); + s = splbio(); + rf_CopybackReconstructedData(raidPtrs[unit]); + splx(s); return (0); /* Return the percentage completion of reconstruction */ @@ -1162,37 +1345,6 @@ raidinit(dev, raidPtr, unit) return (retcode); } -/********************************************************* - * - * initialization code called at boot time (startup.c) - * - ********************************************************/ -int -rf_boot() -{ - int i, rc; - - rc = rf_mutex_init(&rf_sparet_wait_mutex); - if (rc) { - RF_PANIC(); - } - rc = rf_mutex_init(&rf_async_done_q_mutex); - if (rc) { - RF_PANIC(); - } - rf_sparet_wait_queue = rf_sparet_resp_queue = NULL; - recon_queue = NULL; - rf_async_done_qh = rf_async_done_qt = NULL; - for (i = 0; i < numraid; i++) - raidPtrs[i] = NULL; - rc = rf_BootRaidframe(); - if (rc == 0) - printf("Kernelized RAIDframe activated\n"); - else - rf_kbooted = RFK_BOOT_BAD; - return (rc); -} - /* * This kernel thread never exits. It is created once, and persists * until the system reboots. 
@@ -1214,8 +1366,8 @@ rf_ReconKernelThread() LOCK_RECON_Q_MUTEX(); while (!recon_queue) { UNLOCK_RECON_Q_MUTEX(); - tsleep(&recon_queue, PRIBIO | PCATCH, - "raidframe recon", 0); + tsleep(&recon_queue, PRIBIO, + "raidframe recon", 0); LOCK_RECON_Q_MUTEX(); } req = recon_queue; @@ -1255,8 +1407,8 @@ rf_GetSpareTableFromDaemon(req) /* mpsleep unlocks the mutex */ while (!rf_sparet_resp_queue) { - tsleep(&rf_sparet_resp_queue, PRIBIO | PCATCH, - "raidframe getsparetable", 0); + tsleep(&rf_sparet_resp_queue, PRIBIO, + "raidframe getsparetable", 0); #if 0 mpsleep(&rf_sparet_resp_queue, PZERO, "sparet resp", 0, (void *)simple_lock_addr(rf_sparet_wait_mutex), @@ -1295,6 +1447,7 @@ rf_DoAccessKernel(raidPtr, bp, flags, cbFunc, cbArg) daddr_t blocknum; int unit; struct raid_softc *rs; + int do_async; /* XXX The dev_t used here should be for /dev/[r]raid* !!! */ @@ -1355,6 +1508,24 @@ rf_DoAccessKernel(raidPtr, bp, flags, cbFunc, cbArg) } db1_printf(("Calling DoAccess..\n")); + /* Put a throttle on the number of requests we handle simultanously */ + + RF_LOCK_MUTEX(raidPtr->mutex); + + while(raidPtr->openings <= 0) { + RF_UNLOCK_MUTEX(raidPtr->mutex); + (void)tsleep(&raidPtr->openings, PRIBIO, "rfdwait", 0); + RF_LOCK_MUTEX(raidPtr->mutex); + } + raidPtr->openings--; + + RF_UNLOCK_MUTEX(raidPtr->mutex); + + /* + * Everything is async. + */ + do_async = 1; + /* * Don't ever condition on bp->b_flags & B_WRITE. * always condition on B_READ instead. @@ -1403,6 +1574,7 @@ rf_DispatchKernelIO(queue, req) bp = req->bp; +#if 1 /* * XXX When there is a physical disk failure, someone is passing * us a buffer that contains old stuff!! 
Attempt to deal with @@ -1416,6 +1588,7 @@ rf_DispatchKernelIO(queue, req) if (bp->b_error!=0) { bp->b_error = 0; } +#endif raidbp = RAIDGETBUF(); @@ -1572,6 +1745,7 @@ rf_KernelWakeupFunc(vbp) rf_ds_failed; queue->raidPtr->status[queue->row] = rf_rs_degraded; queue->raidPtr->numFailures++; + /* XXX here we should bump the version number for each component, and write that data out */ } else { /* Disk is already dead... */ /* printf("Disk already marked as dead!\n"); */ @@ -1741,14 +1915,14 @@ raidgetdisklabel(dev) if (lp->d_secperunit != rs->sc_size) printf("WARNING: %s: " "total sector size in disklabel (%d) != " - "the size of raid (%d)\n", rs->sc_xname, - lp->d_secperunit, rs->sc_size); + "the size of raid (%ld)\n", rs->sc_xname, + lp->d_secperunit, (long) rs->sc_size); for (i = 0; i < lp->d_npartitions; i++) { pp = &lp->d_partitions[i]; if (pp->p_offset + pp->p_size > rs->sc_size) printf("WARNING: %s: end of partition `%c' " - "exceeds the size of raid (%d)\n", - rs->sc_xname, 'a' + i, rs->sc_size); + "exceeds the size of raid (%ld)\n", + rs->sc_xname, 'a' + i, (long) rs->sc_size); } } } @@ -1855,3 +2029,328 @@ raidunlock(rs) wakeup(rs); } } + + +#define RF_COMPONENT_INFO_OFFSET 16384 /* bytes */ +#define RF_COMPONENT_INFO_SIZE 1024 /* bytes */ + +int +raidmarkclean(dev_t dev, struct vnode *b_vp, int mod_counter) +{ + RF_ComponentLabel_t component_label; + raidread_component_label(dev, b_vp, &component_label); + component_label.mod_counter = mod_counter; + component_label.clean = RF_RAID_CLEAN; + raidwrite_component_label(dev, b_vp, &component_label); + return(0); +} + + +int +raidmarkdirty(dev_t dev, struct vnode *b_vp, int mod_counter) +{ + RF_ComponentLabel_t component_label; + raidread_component_label(dev, b_vp, &component_label); + component_label.mod_counter = mod_counter; + component_label.clean = RF_RAID_DIRTY; + raidwrite_component_label(dev, b_vp, &component_label); + return(0); +} + +/* ARGSUSED */ +int +raidread_component_label(dev, b_vp, 
component_label) + dev_t dev; + struct vnode *b_vp; + RF_ComponentLabel_t *component_label; +{ + struct buf *bp; + int error; + + /* XXX should probably ensure that we don't try to do this if + someone has changed rf_protected_sectors. */ + + /* get a block of the appropriate size... */ + bp = geteblk((int)RF_COMPONENT_INFO_SIZE); + bp->b_dev = dev; + + /* get our ducks in a row for the read */ + bp->b_blkno = RF_COMPONENT_INFO_OFFSET / DEV_BSIZE; + bp->b_bcount = RF_COMPONENT_INFO_SIZE; + bp->b_flags = B_BUSY | B_READ; + bp->b_resid = RF_COMPONENT_INFO_SIZE / DEV_BSIZE; + + (*bdevsw[major(bp->b_dev)].d_strategy)(bp); + + error = biowait(bp); + + if (!error) { + memcpy(component_label, bp->b_un.b_addr, + sizeof(RF_ComponentLabel_t)); +#if 0 + printf("raidread_component_label: got component label:\n"); + printf("Version: %d\n",component_label->version); + printf("Serial Number: %d\n",component_label->serial_number); + printf("Mod counter: %d\n",component_label->mod_counter); + printf("Row: %d\n", component_label->row); + printf("Column: %d\n", component_label->column); + printf("Num Rows: %d\n", component_label->num_rows); + printf("Num Columns: %d\n", component_label->num_columns); + printf("Clean: %d\n", component_label->clean); + printf("Status: %d\n", component_label->status); +#endif + } else { + printf("Failed to read RAID component label!\n"); + } + + bp->b_flags = B_INVAL | B_AGE; + brelse(bp); + return(error); +} +/* ARGSUSED */ +int +raidwrite_component_label(dev, b_vp, component_label) + dev_t dev; + struct vnode *b_vp; + RF_ComponentLabel_t *component_label; +{ + struct buf *bp; + int error; + + /* get a block of the appropriate size... 
*/ + bp = geteblk((int)RF_COMPONENT_INFO_SIZE); + bp->b_dev = dev; + + /* get our ducks in a row for the write */ + bp->b_blkno = RF_COMPONENT_INFO_OFFSET / DEV_BSIZE; + bp->b_bcount = RF_COMPONENT_INFO_SIZE; + bp->b_flags = B_BUSY | B_WRITE; + bp->b_resid = RF_COMPONENT_INFO_SIZE / DEV_BSIZE; + + memset( bp->b_un.b_addr, 0, RF_COMPONENT_INFO_SIZE ); + + memcpy( bp->b_un.b_addr, component_label, sizeof(RF_ComponentLabel_t)); + + (*bdevsw[major(bp->b_dev)].d_strategy)(bp); + error = biowait(bp); + bp->b_flags = B_INVAL | B_AGE; + brelse(bp); + if (error) { + printf("Failed to write RAID component info!\n"); + } + + return(error); +} + +void +rf_markalldirty( raidPtr ) + RF_Raid_t *raidPtr; +{ + RF_ComponentLabel_t c_label; + int r,c; + + raidPtr->mod_counter++; + for (r = 0; r < raidPtr->numRow; r++) { + for (c = 0; c < raidPtr->numCol; c++) { + if (raidPtr->Disks[r][c].status != rf_ds_failed) { + raidread_component_label( + raidPtr->Disks[r][c].dev, + raidPtr->raid_cinfo[r][c].ci_vp, + &c_label); + if (c_label.status == rf_ds_spared) { + /* XXX do something special... + but whatever you do, don't + try to access it!! */ + } else { +#if 0 + c_label.status = + raidPtr->Disks[r][c].status; + raidwrite_component_label( + raidPtr->Disks[r][c].dev, + raidPtr->raid_cinfo[r][c].ci_vp, + &c_label); +#endif + raidmarkdirty( + raidPtr->Disks[r][c].dev, + raidPtr->raid_cinfo[r][c].ci_vp, + raidPtr->mod_counter); + } + } + } + } + /* printf("Component labels marked dirty.\n"); */ +#if 0 + for( c = 0; c < raidPtr->numSpare ; c++) { + sparecol = raidPtr->numCol + c; + if (raidPtr->Disks[r][sparecol].status == rf_ds_used_spare) { + /* + + XXX this is where we get fancy and map this spare + into it's correct spot in the array. + + */ + /* + + we claim this disk is "optimal" if it's + rf_ds_used_spare, as that means it should be + directly substitutable for the disk it replaced. + We note that too... 
+ + */ + + for(i=0;i<raidPtr->numRow;i++) { + for(j=0;j<raidPtr->numCol;j++) { + if ((raidPtr->Disks[i][j].spareRow == + r) && + (raidPtr->Disks[i][j].spareCol == + sparecol)) { + srow = r; + scol = sparecol; + break; + } + } + } + + raidread_component_label( + raidPtr->Disks[r][sparecol].dev, + raidPtr->raid_cinfo[r][sparecol].ci_vp, + &c_label); + /* make sure status is noted */ + c_label.version = RF_COMPONENT_LABEL_VERSION; + c_label.mod_counter = raidPtr->mod_counter; + c_label.serial_number = raidPtr->serial_number; + c_label.row = srow; + c_label.column = scol; + c_label.num_rows = raidPtr->numRow; + c_label.num_columns = raidPtr->numCol; + c_label.clean = RF_RAID_DIRTY; /* changed in a bit*/ + c_label.status = rf_ds_optimal; + raidwrite_component_label( + raidPtr->Disks[r][sparecol].dev, + raidPtr->raid_cinfo[r][sparecol].ci_vp, + &c_label); + raidmarkclean( raidPtr->Disks[r][sparecol].dev, + raidPtr->raid_cinfo[r][sparecol].ci_vp); + } + } + +#endif +} + + +void +rf_update_component_labels( raidPtr ) + RF_Raid_t *raidPtr; +{ + RF_ComponentLabel_t c_label; + int sparecol; + int r,c; + int i,j; + int srow, scol; + + srow = -1; + scol = -1; + + /* XXX should do extra checks to make sure things really are clean, + rather than blindly setting the clean bit... */ + + raidPtr->mod_counter++; + + for (r = 0; r < raidPtr->numRow; r++) { + for (c = 0; c < raidPtr->numCol; c++) { + if (raidPtr->Disks[r][c].status == rf_ds_optimal) { + raidread_component_label( + raidPtr->Disks[r][c].dev, + raidPtr->raid_cinfo[r][c].ci_vp, + &c_label); + /* make sure status is noted */ + c_label.status = rf_ds_optimal; + raidwrite_component_label( + raidPtr->Disks[r][c].dev, + raidPtr->raid_cinfo[r][c].ci_vp, + &c_label); + if (raidPtr->parity_good == RF_RAID_CLEAN) { + raidmarkclean( + raidPtr->Disks[r][c].dev, + raidPtr->raid_cinfo[r][c].ci_vp, + raidPtr->mod_counter); + } + } + /* else we don't touch it.. 
*/ +#if 0 + else if (raidPtr->Disks[r][c].status != + rf_ds_failed) { + raidread_component_label( + raidPtr->Disks[r][c].dev, + raidPtr->raid_cinfo[r][c].ci_vp, + &c_label); + /* make sure status is noted */ + c_label.status = + raidPtr->Disks[r][c].status; + raidwrite_component_label( + raidPtr->Disks[r][c].dev, + raidPtr->raid_cinfo[r][c].ci_vp, + &c_label); + if (raidPtr->parity_good == RF_RAID_CLEAN) { + raidmarkclean( + raidPtr->Disks[r][c].dev, + raidPtr->raid_cinfo[r][c].ci_vp, + raidPtr->mod_counter); + } + } +#endif + } + } + + for( c = 0; c < raidPtr->numSpare ; c++) { + sparecol = raidPtr->numCol + c; + if (raidPtr->Disks[0][sparecol].status == rf_ds_used_spare) { + /* + + we claim this disk is "optimal" if it's + rf_ds_used_spare, as that means it should be + directly substitutable for the disk it replaced. + We note that too... + + */ + + for(i=0;i<raidPtr->numRow;i++) { + for(j=0;j<raidPtr->numCol;j++) { + if ((raidPtr->Disks[i][j].spareRow == + 0) && + (raidPtr->Disks[i][j].spareCol == + sparecol)) { + srow = i; + scol = j; + break; + } + } + } + + raidread_component_label( + raidPtr->Disks[0][sparecol].dev, + raidPtr->raid_cinfo[0][sparecol].ci_vp, + &c_label); + /* make sure status is noted */ + c_label.version = RF_COMPONENT_LABEL_VERSION; + c_label.mod_counter = raidPtr->mod_counter; + c_label.serial_number = raidPtr->serial_number; + c_label.row = srow; + c_label.column = scol; + c_label.num_rows = raidPtr->numRow; + c_label.num_columns = raidPtr->numCol; + c_label.clean = RF_RAID_DIRTY; /* changed in a bit*/ + c_label.status = rf_ds_optimal; + raidwrite_component_label( + raidPtr->Disks[0][sparecol].dev, + raidPtr->raid_cinfo[0][sparecol].ci_vp, + &c_label); + if (raidPtr->parity_good == RF_RAID_CLEAN) { + raidmarkclean( raidPtr->Disks[0][sparecol].dev, + raidPtr->raid_cinfo[0][sparecol].ci_vp, + raidPtr->mod_counter); + } + } + } + /* printf("Component labels updated\n"); */ +} diff --git a/sys/dev/raidframe/rf_parityscan.c 
b/sys/dev/raidframe/rf_parityscan.c index 4fce2cbcbf2..60a5bc3b3dc 100644 --- a/sys/dev/raidframe/rf_parityscan.c +++ b/sys/dev/raidframe/rf_parityscan.c @@ -1,5 +1,5 @@ -/* $OpenBSD: rf_parityscan.c,v 1.3 1999/03/02 21:53:50 niklas Exp $ */ -/* $NetBSD: rf_parityscan.c,v 1.3 1999/02/05 00:06:14 oster Exp $ */ +/* $OpenBSD: rf_parityscan.c,v 1.4 1999/07/30 14:45:33 peter Exp $ */ +/* $NetBSD: rf_parityscan.c,v 1.4 1999/03/14 22:10:46 oster Exp $ */ /* * Copyright (c) 1995 Carnegie-Mellon University. * All rights reserved. @@ -64,48 +64,42 @@ rf_RewriteParity(raidPtr) { RF_RaidLayout_t *layoutPtr = &raidPtr->Layout; RF_AccessStripeMapHeader_t *asm_h; - int old_pctg, new_pctg, rc; + int rc; RF_PhysDiskAddr_t pda; RF_SectorNum_t i; pda.startSector = 0; pda.numSector = raidPtr->Layout.sectorsPerStripeUnit; - old_pctg = -1; -/* rf_verifyParityDebug=1; */ - for (i = 0; i < raidPtr->totalSectors; i += layoutPtr->dataSectorsPerStripe) { - asm_h = rf_MapAccess(raidPtr, i, layoutPtr->dataSectorsPerStripe, NULL, RF_DONT_REMAP); + for (i = 0; i < raidPtr->totalSectors; + i += layoutPtr->dataSectorsPerStripe) { + asm_h = rf_MapAccess(raidPtr, i, + layoutPtr->dataSectorsPerStripe, + NULL, RF_DONT_REMAP); rc = rf_VerifyParity(raidPtr, asm_h->stripeMap, 1, 0); - /* printf("Parity verified: rc=%d\n",rc); */ switch (rc) { case RF_PARITY_OKAY: case RF_PARITY_CORRECTED: break; case RF_PARITY_BAD: printf("Parity bad during correction\n"); - return (1); + RF_PANIC(); break; case RF_PARITY_COULD_NOT_CORRECT: printf("Could not correct bad parity\n"); - return (1); + RF_PANIC(); break; case RF_PARITY_COULD_NOT_VERIFY: printf("Could not verify parity\n"); - return (1); + RF_PANIC(); break; default: printf("Bad rc=%d from VerifyParity in RewriteParity\n", rc); RF_PANIC(); } rf_FreeAccessStripeMap(asm_h); - new_pctg = i * 1000 / raidPtr->totalSectors; - if (new_pctg != old_pctg) { - } - old_pctg = new_pctg; } -#if 1 - return (0); /* XXX nothing was here.. 
GO */ -#endif + return (0); } /***************************************************************************************** * @@ -138,9 +132,12 @@ rf_VerifyParity(raidPtr, aasm, correct_it, flags) rc = RF_PARITY_OKAY; if (lp->VerifyParity) { for (doasm = aasm; doasm; doasm = doasm->next) { - for (parityPDA = doasm->parityInfo; parityPDA; parityPDA = parityPDA->next) { - lrc = lp->VerifyParity(raidPtr, doasm->raidAddress, parityPDA, - correct_it, flags); + for (parityPDA = doasm->parityInfo; parityPDA; + parityPDA = parityPDA->next) { + lrc = lp->VerifyParity(raidPtr, + doasm->raidAddress, + parityPDA, + correct_it, flags); if (lrc > rc) { /* see rf_parityscan.h for why this * works */ @@ -163,7 +160,8 @@ rf_VerifyParityBasic(raidPtr, raidAddr, parityPDA, correct_it, flags) RF_RaidAccessFlags_t flags; { RF_RaidLayout_t *layoutPtr = &(raidPtr->Layout); - RF_RaidAddr_t startAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, raidAddr); + RF_RaidAddr_t startAddr = rf_RaidAddressOfPrevStripeBoundary(layoutPtr, + raidAddr); RF_SectorCount_t numsector = parityPDA->numSector; int numbytes = rf_RaidAddressToByte(raidPtr, numsector); int bytesPerStripe = numbytes * layoutPtr->numDataCol; @@ -176,7 +174,9 @@ rf_VerifyParityBasic(raidPtr, raidAddr, parityPDA, correct_it, flags) char *pbuf, *buf, *end_p, *p; int i, retcode; RF_ReconUnitNum_t which_ru; - RF_StripeNum_t psID = rf_RaidAddressToParityStripeID(layoutPtr, raidAddr, &which_ru); + RF_StripeNum_t psID = rf_RaidAddressToParityStripeID(layoutPtr, + raidAddr, + &which_ru); int stripeWidth = layoutPtr->numDataCol + layoutPtr->numParityCol; RF_AccTraceEntry_t tracerec; RF_MCPair_t *mcpair; diff --git a/sys/dev/raidframe/rf_raid.h b/sys/dev/raidframe/rf_raid.h index 798dcdd1c8f..36b4e806b07 100644 --- a/sys/dev/raidframe/rf_raid.h +++ b/sys/dev/raidframe/rf_raid.h @@ -1,5 +1,5 @@ -/* $OpenBSD: rf_raid.h,v 1.2 1999/02/16 00:03:13 niklas Exp $ */ -/* $NetBSD: rf_raid.h,v 1.3 1999/02/05 00:06:15 oster Exp $ */ +/* $OpenBSD: 
rf_raid.h,v 1.3 1999/07/30 14:45:33 peter Exp $ */ +/* $NetBSD: rf_raid.h,v 1.6 1999/07/08 00:45:24 oster Exp $ */ /* * Copyright (c) 1995 Carnegie-Mellon University. * All rights reserved. @@ -64,6 +64,10 @@ #define RF_MAX_DISKS 128 /* max disks per array */ #define RF_DEV2RAIDID(_dev) (DISKUNIT(_dev)) +#define RF_COMPONENT_LABEL_VERSION 1 +#define RF_RAID_DIRTY 0 +#define RF_RAID_CLEAN 1 + /* * Each row in the array is a distinct parity group, so * each has it's own status, which is one of the following. @@ -98,7 +102,9 @@ struct RF_ThroughputStats_s { struct RF_Raid_s { /* This portion never changes, and can be accessed without locking */ /* an exception is Disks[][].status, which requires locking when it is - * changed */ + * changed. XXX this is no longer true. numSpare and friends can + * change now. + */ u_int numRow; /* number of rows of disks, typically == # of * ranks */ u_int numCol; /* number of columns of disks, typically == # @@ -130,6 +136,15 @@ struct RF_Raid_s { RF_LockTableEntry_t *quiesceLock; /* quiesnce table */ int numFailures; /* total number of failures in the array */ + int parity_good; /* !0 if parity is known to be correct */ + int serial_number; /* a "serial number" for this set */ + int mod_counter; /* modification counter for component labels */ + int clean; /* the clean bit for this array. 
*/ + + int openings; /* Number of IO's which can be scheduled + simultaneously (high-level - not a + per-component limit)*/ + /* * Cleanup stuff */ diff --git a/sys/dev/raidframe/rf_raidframe.h b/sys/dev/raidframe/rf_raidframe.h index 3c39f0c09d5..7a78cec080c 100644 --- a/sys/dev/raidframe/rf_raidframe.h +++ b/sys/dev/raidframe/rf_raidframe.h @@ -1,5 +1,5 @@ -/* $OpenBSD: rf_raidframe.h,v 1.2 1999/02/16 00:03:19 niklas Exp $ */ -/* $NetBSD: rf_raidframe.h,v 1.3 1999/02/05 00:06:16 oster Exp $ */ +/* $OpenBSD: rf_raidframe.h,v 1.3 1999/07/30 14:45:33 peter Exp $ */ +/* $NetBSD: rf_raidframe.h,v 1.5 1999/03/02 03:18:48 oster Exp $ */ /* * Copyright (c) 1995 Carnegie-Mellon University. * All rights reserved. @@ -137,5 +137,12 @@ typedef struct RF_DeviceConfig_s { * for device */ #define RAIDFRAME_KEEP_ACCTOTALS _IOW ('r', 18, int) /* turn AccTotals on or * off for device */ +#define RAIDFRAME_GET_COMPONENT_LABEL _IOWR ('r', 19, RF_ComponentLabel_t *) +#define RAIDFRAME_SET_COMPONENT_LABEL _IOW ('r', 20, RF_ComponentLabel_t) + +#define RAIDFRAME_INIT_LABELS _IOW ('r', 21, RF_ComponentLabel_t) +#define RAIDFRAME_ADD_HOT_SPARE _IOW ('r', 22, RF_SingleComponent_t) +#define RAIDFRAME_REMOVE_HOT_SPARE _IOW ('r', 23, RF_SingleComponent_t) +#define RAIDFRAME_REBUILD_IN_PLACE _IOW ('r', 24, RF_SingleComponent_t) #endif /* !_RF__RF_RAIDFRAME_H_ */ diff --git a/sys/dev/raidframe/rf_reconstruct.c b/sys/dev/raidframe/rf_reconstruct.c index a3f7085241f..75fd5d75a03 100644 --- a/sys/dev/raidframe/rf_reconstruct.c +++ b/sys/dev/raidframe/rf_reconstruct.c @@ -1,5 +1,5 @@ -/* $OpenBSD: rf_reconstruct.c,v 1.2 1999/02/16 00:03:22 niklas Exp $ */ -/* $NetBSD: rf_reconstruct.c,v 1.4 1999/02/05 00:06:16 oster Exp $ */ +/* $OpenBSD: rf_reconstruct.c,v 1.3 1999/07/30 14:45:33 peter Exp $ */ +/* $NetBSD: rf_reconstruct.c,v 1.5 1999/03/02 03:18:49 oster Exp $ */ /* * Copyright (c) 1995 Carnegie-Mellon University. * All rights reserved. 
@@ -37,6 +37,17 @@ #include <sys/time.h> #include <sys/buf.h> #include <sys/errno.h> + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/ioctl.h> +#include <sys/fcntl.h> +#if __NETBSD__ +#include <sys/vnode.h> +#endif + #include "rf_raid.h" #include "rf_reconutil.h" #include "rf_revent.h" @@ -121,6 +132,10 @@ static void ForceReconReadDoneProc(void *arg, int status); static void rf_ShutdownReconstruction(void *); +/* XXX these should be in a .h file somewhere */ +int raidlookup __P((char *, struct proc *, struct vnode **)); +int raidwrite_component_label(dev_t, struct vnode *, RF_ComponentLabel_t *); +int raidread_component_label(dev_t, struct vnode *, RF_ComponentLabel_t *); struct RF_ReconDoneProc_s { void (*proc) (RF_Raid_t *, void *); @@ -306,6 +321,7 @@ rf_ReconstructFailedDiskBasic(raidPtr, row, col) RF_RowCol_t row; RF_RowCol_t col; { + RF_ComponentLabel_t c_label; RF_RaidDisk_t *spareDiskPtr = NULL; RF_RaidReconDesc_t *reconDesc; RF_RowCol_t srow, scol; @@ -355,6 +371,256 @@ rf_ReconstructFailedDiskBasic(raidPtr, row, col) reconDesc->reconExecTicks = 0; reconDesc->maxReconExecTicks = 0; rc = rf_ContinueReconstructFailedDisk(reconDesc); + + if (!rc) { + /* fix up the component label */ + /* Don't actually need the read here.. */ + raidread_component_label( + raidPtr->raid_cinfo[srow][scol].ci_dev, + raidPtr->raid_cinfo[srow][scol].ci_vp, + &c_label); + + c_label.version = RF_COMPONENT_LABEL_VERSION; + c_label.mod_counter = raidPtr->mod_counter; + c_label.serial_number = raidPtr->serial_number; + c_label.row = row; + c_label.column = col; + c_label.num_rows = raidPtr->numRow; + c_label.num_columns = raidPtr->numCol; + c_label.clean = RF_RAID_DIRTY; + c_label.status = rf_ds_optimal; + + raidwrite_component_label( + raidPtr->raid_cinfo[srow][scol].ci_dev, + raidPtr->raid_cinfo[srow][scol].ci_vp, + &c_label); + + } + return (rc); +} + +/* + + Allow reconstructing a disk in-place -- i.e. 
component /dev/sd2e goes AWOL, + and you don't get a spare until the next Monday. With this function + (and hot-swappable drives) you can now put your new disk containing + /dev/sd2e on the bus, scsictl it alive, and then use raidctl(8) to + rebuild the data "on the spot". + +*/ + +int +rf_ReconstructInPlace(raidPtr, row, col) + RF_Raid_t *raidPtr; + RF_RowCol_t row; + RF_RowCol_t col; +{ + RF_RaidDisk_t *spareDiskPtr = NULL; + RF_RaidReconDesc_t *reconDesc; + RF_LayoutSW_t *lp; + RF_RaidDisk_t *badDisk; + RF_ComponentLabel_t c_label; + int numDisksDone = 0, rc; + struct partinfo dpart; + struct vnode *vp; + struct vattr va; + struct proc *proc; + int retcode; + + lp = raidPtr->Layout.map; + if (lp->SubmitReconBuffer) { + /* + * The current infrastructure only supports reconstructing one + * disk at a time for each array. + */ + RF_LOCK_MUTEX(raidPtr->mutex); + if ((raidPtr->Disks[row][col].status == rf_ds_optimal) && + (raidPtr->numFailures > 0)) { + /* XXX 0 above shouldn't be constant!!! */ + /* some component other than this has failed. + Let's not make things worse than they already + are... */ + printf("RAIDFRAME: Unable to reconstruct to disk at:\n"); + printf(" Row: %d Col: %d Too many failures.\n", + row, col); + RF_UNLOCK_MUTEX(raidPtr->mutex); + return (EINVAL); + } + if (raidPtr->Disks[row][col].status == rf_ds_reconstructing) { + printf("RAIDFRAME: Unable to reconstruct to disk at:\n"); + printf(" Row: %d Col: %d Reconstruction already occuring!\n", row, col); + + RF_UNLOCK_MUTEX(raidPtr->mutex); + return (EINVAL); + } + + + if (raidPtr->Disks[row][col].status != rf_ds_failed) { + /* "It's gone..." */ + raidPtr->numFailures++; + raidPtr->Disks[row][col].status = rf_ds_failed; + raidPtr->status[row] = rf_rs_degraded; + } + + while (raidPtr->reconInProgress) { + RF_WAIT_COND(raidPtr->waitForReconCond, raidPtr->mutex); + } + + + /* first look for a spare drive onto which to reconstruct + the data. spare disk descriptors are stored in row 0. 
+ This may have to change eventually */ + + /* Actually, we don't care if it's failed or not... + On a RAID set with correct parity, this function + should be callable on any component without ill affects. */ + /* RF_ASSERT(raidPtr->Disks[row][col].status == rf_ds_failed); + */ + + if (raidPtr->Layout.map->flags & RF_DISTRIBUTE_SPARE) { + RF_ERRORMSG2("Unable to reconstruct to disk at row %d col %d: operation not supported for RF_DISTRIBUTE_SPARE\n", row, col); + + RF_UNLOCK_MUTEX(raidPtr->mutex); + return (EINVAL); + } + + /* XXX need goop here to see if the disk is alive, + and, if not, make it so... */ + + + + badDisk = &raidPtr->Disks[row][col]; + + proc = raidPtr->proc; /* XXX Yes, this is not nice.. */ + + /* This device may have been opened successfully the + first time. Close it before trying to open it again.. */ + + if (raidPtr->raid_cinfo[row][col].ci_vp != NULL) { + printf("Closed the open device: %s\n", + raidPtr->Disks[row][col].devname); + VOP_UNLOCK(raidPtr->raid_cinfo[row][col].ci_vp, 0, proc); + (void) vn_close(raidPtr->raid_cinfo[row][col].ci_vp, + FREAD | FWRITE, proc->p_ucred, proc); + raidPtr->raid_cinfo[row][col].ci_vp = NULL; + } + printf("About to (re-)open the device for rebuilding: %s\n", + raidPtr->Disks[row][col].devname); + + retcode = raidlookup(raidPtr->Disks[row][col].devname, + proc, &vp); + + if (retcode) { + printf("raid%d: rebuilding: raidlookup on device: %s failed: %d!\n",raidPtr->raidid, + raidPtr->Disks[row][col].devname, retcode); + + /* XXX the component isn't responding properly... + must be + * still dead :-( */ + RF_UNLOCK_MUTEX(raidPtr->mutex); + return(retcode); + + } else { + + /* Ok, so we can at least do a lookup... + How about actually getting a vp for it? 
*/ + + if ((retcode = VOP_GETATTR(vp, &va, proc->p_ucred, + proc)) != 0) { + RF_UNLOCK_MUTEX(raidPtr->mutex); + return(retcode); + } + retcode = VOP_IOCTL(vp, DIOCGPART, (caddr_t) & dpart, + FREAD, proc->p_ucred, proc); + if (retcode) { + RF_UNLOCK_MUTEX(raidPtr->mutex); + return(retcode); + } + raidPtr->Disks[row][col].blockSize = + dpart.disklab->d_secsize; + + raidPtr->Disks[row][col].numBlocks = + dpart.part->p_size - rf_protectedSectors; + + raidPtr->raid_cinfo[row][col].ci_vp = vp; + raidPtr->raid_cinfo[row][col].ci_dev = va.va_rdev; + + raidPtr->Disks[row][col].dev = va.va_rdev; + + /* we allow the user to specify that only a + fraction of the disks should be used this is + just for debug: it speeds up + * the parity scan */ + raidPtr->Disks[row][col].numBlocks = + raidPtr->Disks[row][col].numBlocks * + rf_sizePercentage / 100; + } + + + + spareDiskPtr = &raidPtr->Disks[row][col]; + spareDiskPtr->status = rf_ds_used_spare; + + printf("RECON: initiating in-place reconstruction on\n"); + printf(" row %d col %d -> spare at row %d col %d\n", + row, col, row, col); + + raidPtr->reconInProgress++; + + RF_UNLOCK_MUTEX(raidPtr->mutex); + + reconDesc = AllocRaidReconDesc((void *) raidPtr, row, col, + spareDiskPtr, numDisksDone, + row, col); + raidPtr->reconDesc = (void *) reconDesc; +#if RF_RECON_STATS > 0 + reconDesc->hsStallCount = 0; + reconDesc->numReconExecDelays = 0; + reconDesc->numReconEventWaits = 0; +#endif /* RF_RECON_STATS > 0 */ + reconDesc->reconExecTimerRunning = 0; + reconDesc->reconExecTicks = 0; + reconDesc->maxReconExecTicks = 0; + rc = rf_ContinueReconstructFailedDisk(reconDesc); + } else { + RF_ERRORMSG1("RECON: no way to reconstruct failed disk for arch %c\n", + lp->parityConfig); + rc = EIO; + } + RF_LOCK_MUTEX(raidPtr->mutex); + raidPtr->reconInProgress--; + + if (!rc) { + /* Need to set these here, as at this point it'll be claiming + that the disk is in rf_ds_spared! 
But we know better :-) */ + + raidPtr->Disks[row][col].status = rf_ds_optimal; + raidPtr->status[row] = rf_rs_optimal; + + /* fix up the component label */ + /* Don't actually need the read here.. */ + raidread_component_label(raidPtr->raid_cinfo[row][col].ci_dev, + raidPtr->raid_cinfo[row][col].ci_vp, + &c_label); + + c_label.version = RF_COMPONENT_LABEL_VERSION; + c_label.mod_counter = raidPtr->mod_counter; + c_label.serial_number = raidPtr->serial_number; + c_label.row = row; + c_label.column = col; + c_label.num_rows = raidPtr->numRow; + c_label.num_columns = raidPtr->numCol; + c_label.clean = RF_RAID_DIRTY; + c_label.status = rf_ds_optimal; + + raidwrite_component_label(raidPtr->raid_cinfo[row][col].ci_dev, + raidPtr->raid_cinfo[row][col].ci_vp, + &c_label); + + } + RF_UNLOCK_MUTEX(raidPtr->mutex); + RF_SIGNAL_COND(raidPtr->waitForReconCond); + wakeup(&raidPtr->waitForReconCond); return (rc); } diff --git a/sys/dev/raidframe/rf_reconstruct.h b/sys/dev/raidframe/rf_reconstruct.h index c8bc680f0f6..8b087f514c0 100644 --- a/sys/dev/raidframe/rf_reconstruct.h +++ b/sys/dev/raidframe/rf_reconstruct.h @@ -1,5 +1,5 @@ -/* $OpenBSD: rf_reconstruct.h,v 1.2 1999/02/16 00:03:23 niklas Exp $ */ -/* $NetBSD: rf_reconstruct.h,v 1.3 1999/02/05 00:06:16 oster Exp $ */ +/* $OpenBSD: rf_reconstruct.h,v 1.3 1999/07/30 14:45:33 peter Exp $ */ +/* $NetBSD: rf_reconstruct.h,v 1.4 1999/03/02 03:18:48 oster Exp $ */ /* * Copyright (c) 1995 Carnegie-Mellon University. * All rights reserved. 
@@ -183,6 +183,9 @@ int rf_ReconstructFailedDiskBasic(RF_Raid_t * raidPtr, RF_RowCol_t row, RF_RowCol_t col); +int +rf_ReconstructInPlace(RF_Raid_t * raidPtr, RF_RowCol_t row, RF_RowCol_t col); + int rf_ContinueReconstructFailedDisk(RF_RaidReconDesc_t * reconDesc); int diff --git a/sys/dev/raidframe/rf_revent.c b/sys/dev/raidframe/rf_revent.c index e58736c7cd9..b6916c22a91 100644 --- a/sys/dev/raidframe/rf_revent.c +++ b/sys/dev/raidframe/rf_revent.c @@ -1,5 +1,5 @@ -/* $OpenBSD: rf_revent.c,v 1.2 1999/02/16 00:03:24 niklas Exp $ */ -/* $NetBSD: rf_revent.c,v 1.3 1999/02/05 00:06:17 oster Exp $ */ +/* $OpenBSD: rf_revent.c,v 1.3 1999/07/30 14:45:33 peter Exp $ */ +/* $NetBSD: rf_revent.c,v 1.4 1999/03/14 21:53:31 oster Exp $ */ /* * Copyright (c) 1995 Carnegie-Mellon University. * All rights reserved. @@ -51,7 +51,7 @@ static RF_FreeList_t *rf_revent_freelist; extern int hz; -#define DO_WAIT(_rc) tsleep(&(_rc)->eventQueue, PRIBIO | PCATCH, "raidframe eventq", 0) +#define DO_WAIT(_rc) tsleep(&(_rc)->eventQueue, PRIBIO, "raidframe eventq", 0) #define DO_SIGNAL(_rc) wakeup(&(_rc)->eventQueue) @@ -146,7 +146,7 @@ rf_GetNextReconEvent(reconDesc, row, continueFunc, continueArg) #if RF_RECON_STATS > 0 reconDesc->numReconExecDelays++; #endif /* RF_RECON_STATS > 0 */ - status = tsleep(&reconDesc->reconExecTicks, PRIBIO | PCATCH, "recon delay", RECON_TIMO); + status = tsleep(&reconDesc->reconExecTicks, PRIBIO, "recon delay", RECON_TIMO); RF_ASSERT(status == EWOULDBLOCK); reconDesc->reconExecTicks = 0; } diff --git a/sys/dev/raidframe/rf_states.c b/sys/dev/raidframe/rf_states.c index 6cb524a6f8c..4f8b39f07ad 100644 --- a/sys/dev/raidframe/rf_states.c +++ b/sys/dev/raidframe/rf_states.c @@ -1,5 +1,5 @@ -/* $OpenBSD: rf_states.c,v 1.2 1999/02/16 00:03:28 niklas Exp $ */ -/* $NetBSD: rf_states.c,v 1.6 1999/02/05 00:06:17 oster Exp $ */ +/* $OpenBSD: rf_states.c,v 1.3 1999/07/30 14:45:33 peter Exp $ */ +/* $NetBSD: rf_states.c,v 1.7 1999/07/08 00:45:24 oster Exp $ */ /* * 
Copyright (c) 1995 Carnegie-Mellon University. * All rights reserved. @@ -218,6 +218,15 @@ rf_State_LastState(RF_RaidAccessDesc_t * desc) if (desc->async_flag == 0) wakeup(desc->bp); + /* + * Wakeup any requests waiting to go. + */ + + RF_LOCK_MUTEX(((RF_Raid_t *) desc->raidPtr)->mutex); + ((RF_Raid_t *) desc->raidPtr)->openings++; + wakeup(&(((RF_Raid_t *) desc->raidPtr)->openings)); + RF_UNLOCK_MUTEX(((RF_Raid_t *) desc->raidPtr)->mutex); + /* printf("Calling biodone on 0x%x\n",desc->bp); */ biodone(desc->bp); /* access came through ioctl */ } diff --git a/sys/dev/raidframe/rf_threadstuff.h b/sys/dev/raidframe/rf_threadstuff.h index bc44ef348c4..028bef46cc3 100644 --- a/sys/dev/raidframe/rf_threadstuff.h +++ b/sys/dev/raidframe/rf_threadstuff.h @@ -1,5 +1,5 @@ -/* $OpenBSD: rf_threadstuff.h,v 1.2 1999/02/16 00:03:32 niklas Exp $ */ -/* $NetBSD: rf_threadstuff.h,v 1.3 1999/02/05 00:06:18 oster Exp $ */ +/* $OpenBSD: rf_threadstuff.h,v 1.3 1999/07/30 14:45:33 peter Exp $ */ +/* $NetBSD: rf_threadstuff.h,v 1.5 1999/07/06 21:51:22 thorpej Exp $ */ /* * Copyright (c) 1995 Carnegie-Mellon University. * All rights reserved. @@ -92,7 +92,7 @@ typedef void *RF_ThreadArg_t; */ #define RF_WAIT_COND(_c_,_m_) { \ RF_UNLOCK_MUTEX(_m_); \ - tsleep(&_c_, PRIBIO | PCATCH, "rfwcond", 0); \ + tsleep(&_c_, PRIBIO, "rfwcond", 0); \ RF_LOCK_MUTEX(_m_); \ } #define RF_SIGNAL_COND(_c_) wakeup(&(_c_)) |